diff --git a/.dockerignore b/.dockerignore
index 7f692fee5b7..221434ec369 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,4 +1,3 @@
 Dockerfile
-.git
 .dockerignore
 Brewfile
diff --git a/.github/ISSUE_TEMPLATE/report-a-bug.md b/.github/ISSUE_TEMPLATE/report-a-bug.md
new file mode 100644
index 00000000000..6d2fa48b004
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/report-a-bug.md
@@ -0,0 +1,43 @@
+---
+name: Report a Bug
+about: Report a bug that causes vg to crash or otherwise behave incorrectly
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+
+
+**1. What were you trying to do?**
+
+
+**2. What did you want to happen?**
+
+
+**3. What actually happened?**
+
+
+**4. If you got a line like `Stack trace path: /somewhere/on/your/computer/stacktrace.txt`, please copy-paste the contents of that file here:**
+
+```
+Place stacktrace here.
+```
+
+**5. What data and command can the vg dev team use to make the problem happen?**
+
+
+**6. What does running `vg version` say?**
+
+```
+Place vg version output here
+```
diff --git a/.github/ISSUE_TEMPLATE/support-request.md b/.github/ISSUE_TEMPLATE/support-request.md
new file mode 100644
index 00000000000..6b79d32e4a1
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/support-request.md
@@ -0,0 +1,14 @@
+---
+name: Support Request
+about: Get help installing or using vg, or get questions answered
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+**PLEASE DO NOT MAKE SUPPORT REQUESTS HERE**
+
+Please use the Biostars forum instead:
+
+https://www.biostars.org/new/post/?tag_val=vg
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
new file mode 100644
index 00000000000..f847026b238
--- /dev/null
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -0,0 +1,6 @@
+## Changelog Entry
+To be copied to the [draft changelog](https://github.com/vgteam/vg/wiki/Draft-Changelog) by merger:
+
+ * Whatsits now frobnicated
+
+## Description
diff --git a/.github/workflows/testmac.yml b/.github/workflows/testmac.yml
new file mode 100644
index 00000000000..77a2e8442c3
--- /dev/null
+++ b/.github/workflows/testmac.yml
@@ -0,0 +1,78 @@
+name: Test Mac
+
+# Run on our main branch and any PRs to it, and on release tags, but not every
+# commit in every branch.
+on:
+  push:
+    branches:
+      - master
+    tags:
+      - "*"
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  testmac:
+    name: Test on Mac
+    runs-on: macos-12
+
+    steps:
+      - name: Use cache
+        uses: actions/cache@v2
+        with:
+          path: |
+            deps
+            lib
+            include
+            bin
+          key: ${{ runner.os }}-12-${{ github.ref }}
+          # Restore keys are a "list", but really only a multiline string is
+          # accepted. Also we match by prefix. And the most recent cache is
+          # used, not the most specific.
+          # See: https://docs.github.com/en/actions/guides/caching-dependencies-to-speed-up-workflows#matching-a-cache-key
+          restore-keys: |
+            ${{ runner.os }}-12-${{ github.base_ref }}
+            ${{ runner.os }}-12
+
+      - name: Checkout code without submodules
+        uses: actions/checkout@v2
+
+      - name: Get or restore dependencies
+        run: scripts/restore-deps.sh
+
+      - name: Install packages
+        # We don't use artemnovichkov/action-homebrew because that's for Linux.
+        # We uninstall everything we don't need in order to prevent linking
+        # conflicts with existing/outdated packages, which we can't resolve
+        # because there's no way to tell Homebrew to force-link when installing
+        # from a Brewfile. We also update Protobuf to make sure we have 3.21.3+
+        # to avoid problems with ABI changes with/without -DNDEBUG.
+ # And we update libomp to make extra sure it will be picked up by the compiler. + # We pre-install a pinned txm to work around https://github.com/anko/txm/issues/8 + run: | + brew bundle cleanup --force && \ + brew bundle install && \ + brew update && \ + brew install protobuf && \ + brew install libomp && \ + npm install -g txm@7.4.5 && \ + brew config && \ + (brew doctor || echo "brew doctor is unhappy") + + - name: Run build and test + run: | + export VG_FULL_TRACEBACK=1 + echo "Build with $(nproc) threads" + set +e + make -j$(nproc) test + RETVAL=$? + set -e + # Whether vg testing succeeds or fails, see if we can get any Apple crash logs for it. + ls ~/Library/Logs/DiagnosticReports/ + for CRASH_FILE in $(ls ~/Library/Logs/DiagnosticReports/vg-* 2>/dev/null) ; do + echo "vg crash report found: ${CRASH_FILE}" + cat ${CRASH_FILE} + done + exit $RETVAL + shell: bash diff --git a/.gitignore b/.gitignore index dd90fa982ac..11c9ce3001f 100644 --- a/.gitignore +++ b/.gitignore @@ -23,6 +23,7 @@ test/*/*.lcp test/**/*.index/ trash src/*.gch +.vscode # Ignore a bunch of files people might dump in the root when testing /*.vg /*.gcsa @@ -42,3 +43,4 @@ src/*.gch /*.svg /*.fa /*.gfa +.vscode/* \ No newline at end of file diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 00000000000..73d82f85bbc --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,260 @@ +# The VG tests are going to use toil-vg. +# toil-vg needs to be able to mount paths it can see into Docker containers. +# There's no good way to do that when running Docker containers as siblings of a container containing toil-vg. +# So we either have to genuinely nest Docker inside another Docker, or we have to run the build on the real host. + +# Pull in an image that has our apt packages pre-installed, to save time installing them for every test. +# Make sure to use :latest so we re-check and re-pull if needed on every run. +image: quay.io/vgteam/vg_ci_prebake:latest + +before_script: + - sudo apt-get -q -y update + # Make sure we have some curl stuff for pycurl which we need for some Python stuff + # And the CI report upload needs uuidgen from uuid-runtime + - sudo apt-get -q -y install --no-upgrade docker.io python3-pip python3-virtualenv libcurl4-gnutls-dev python-dev npm nodejs node-gyp uuid-runtime libgnutls28-dev doxygen libzstd-dev + - which junit-merge || sudo npm install -g junit-merge + # Configure Docker to use a mirror for Docker Hub and restart the daemon + - | + if [[ ! 
-z "${DOCKER_HUB_MIRROR}" ]] ; then + if [[ "${DOCKER_HUB_MIRROR}" == https* ]] ; then + # Set up a secure mirror + echo "{\"registry-mirrors\": [\"${DOCKER_HUB_MIRROR}\"]}" | sudo tee /etc/docker/daemon.json + else + # Set up an insecure mirror + echo "{\"registry-mirrors\": [\"${DOCKER_HUB_MIRROR}\"], \"insecure-registries\": [\"${DOCKER_HUB_MIRROR##*://}\"]}" | sudo tee /etc/docker/daemon.json + fi + fi + # Restart or start the Docker daemon + - stopdocker || true + - startdocker || true + # Get buildx + - mkdir -p ~/.docker/cli-plugins/ ; curl -L https://github.com/docker/buildx/releases/download/v0.5.1/buildx-v0.5.1.linux-amd64 > ~/.docker/cli-plugins/docker-buildx ; chmod u+x ~/.docker/cli-plugins/docker-buildx + # Connect to the Kubernetes-based builder "buildkit" if appropriate + # See vgci/buildkit-deployment.yml + - if [[ "${CI_BUILDKIT_DRIVER}" == "kubernetes" ]] ; then docker buildx create --use --name=buildkit --platform=linux/amd64,linux/arm64 --node=buildkit-amd64 --driver=kubernetes --driver-opt="nodeselector=kubernetes.io/arch=amd64" ; else docker buildx create --use --name=container-builder --driver=docker-container ; fi + # Report on the builders, and make sure they exist. + - docker buildx inspect --bootstrap || (echo "Docker builder deployment can't be found! Are we on the right Gitlab runner?" && exit 1) + # Prune down build cache to make space. This will hang if the builder isn't findable. + - (echo "y" | docker buildx prune --keep-storage 80G) || true + # Connect so we can upload our images + - docker login -u "${CI_REGISTRY_USER}" -p "${CI_REGISTRY_PASSWORD}" "${CI_REGISTRY}" + - docker info + - mkdir -p ~/.aws && cp "$GITLAB_SECRET_FILE_AWS_CREDENTIALS" ~/.aws/credentials + +after_script: + - stopdocker || true + +# We have two pipeline stages: build to make a Docker, and test to run tests. +# TODO: make test stage parallel +stages: + - build + - test + - report + +# We define one job to do the out-of-container (re)build, and run the Bash +# tests. It uses a Gitlab-managed cache to prevent a full rebuild every time. It +# still takes longer than the Docker build, so we put it in the test stage +# alongside other longer jobs. +local-build-test-job: + stage: test + cache: + # Gitlab isn't clever enough to fill PR caches from the main branch, so we + # just use one megacache and hope the Makefile is smart enough to recover + # from the future + key: local-build-test-cache + paths: + - deps/ + - include/ + - lib/ + - bin/ + - obj/ + before_script: + - sudo apt-get -q -y update + # We need to make sure we get the right submodule files for this version + # and don't clobber sources with the cache. We want to have a dirty state + # with the correct source files. + - scripts/restore-deps.sh + # We need to make sure we have nvm for testing the tube map + - curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.3/install.sh | bash + - export NVM_DIR="$HOME/.nvm" && . 
"$NVM_DIR/nvm.sh" + script: + - nvm version + - python3 ./configure.py + - source ./source_me.sh + - make get-deps + - make -j8 + - echo Testing + - bin/vg test "Target to alignment extraction" + - echo Full Testing + - make test + - make static -j8 + # Also test as a backend for the tube map + - git clone https://github.com/vgteam/sequenceTubeMap.git + - cd sequenceTubeMap + # Tube map expects local IPv6 but Kubernetes won't let us have it + - 'sed -i "s/^}$/,\"serverBindAddress\": \"127.0.0.1\"}/" src/config.json' + # Tube map expects to have its specified version of Node + - nvm install + - nvm use + - npm ci + - CI=true npm run test + after_script: + - echo "Don't do anything" + variables: + VG_FULL_TRACEBACK: "1" + GIT_SUBMODULE_STRATEGY: none + + +# We define one job to do the Docker container build +build-job: + stage: build + script: + - CI_REPO=${CI_REGISTRY}/vgteam/vg + - CACHE_TAG="cache-$(echo ${CI_COMMIT_BRANCH}${CI_COMMIT_TAG} | tr '/' '-')" + - MAINLINE_CACHE_TAG="cache-master" + - PLATFORMS=linux/amd64 + - DOCKER_TAG=ci-${CI_PIPELINE_IID}-${CI_COMMIT_SHA} + - make include/vg_git_version.hpp + - cat include/vg_git_version.hpp + # Connect so we can upload our images + - docker login -u "${CI_REGISTRY_USER}" -p "${CI_REGISTRY_PASSWORD}" "${CI_REGISTRY}" + # Note that A LOCAL CACHE CAN ONLY HOLD ONE TAG/TARGET AT A TIME! + # And Quay can't use mode=max registry caching to cache the intermediate targets with the final image, just inline caching. + # So we have to do the Complicated Cache Shuffle. + # Build base image from branch and mainline base caches to local cache + - docker buildx build --cache-from type=registry,ref=${CI_REPO}:${MAINLINE_CACHE_TAG}-base --cache-from type=registry,ref=${CI_REPO}:${CACHE_TAG}-base --cache-to type=local,dest=${HOME}/docker-cache --platform="${PLATFORMS}" --build-arg THREADS=8 --target base -t ${CI_REPO}:${CACHE_TAG}-base -f Dockerfile . + # Push base image from local cache to registry cache for branch. + - docker buildx build --cache-from type=local,src=${HOME}/docker-cache --cache-to type=inline --platform="${PLATFORMS}" --build-arg THREADS=8 --target base -t ${CI_REPO}:${CACHE_TAG}-base -f Dockerfile --push . + # Build build image from local base cache and branch and mainline build caches to local cache + - docker buildx build --cache-from type=registry,ref=${CI_REPO}:${MAINLINE_CACHE_TAG}-build --cache-from type=registry,ref=${CI_REPO}:${CACHE_TAG}-build --cache-from type=local,src=${HOME}/docker-cache --cache-to type=local,dest=${HOME}/docker-cache --platform="${PLATFORMS}" --build-arg THREADS=8 --target build -t ${CI_REPO}:${CACHE_TAG}-build -f Dockerfile . + # Push build image to registry cache for branch + - docker buildx build --cache-from type=local,src=${HOME}/docker-cache --cache-to type=inline --platform="${PLATFORMS}" --build-arg THREADS=8 --target build -t ${CI_REPO}:${CACHE_TAG}-build -f Dockerfile --push . + # Build run image from local build cache and branch and mainline run caches to local cache + - docker buildx build --cache-from type=registry,ref=${CI_REPO}:${MAINLINE_CACHE_TAG}-run --cache-from type=registry,ref=${CI_REPO}:${CACHE_TAG}-run --cache-from type=local,src=${HOME}/docker-cache --cache-to type=local,dest=${HOME}/docker-cache --platform="${PLATFORMS}" --build-arg THREADS=8 --target run -t ${CI_REPO}:${CACHE_TAG}-run -f Dockerfile . 
+ # Push run image to registry cache for branch + - docker buildx build --cache-from type=local,src=${HOME}/docker-cache --cache-to type=inline --platform="${PLATFORMS}" --build-arg THREADS=8 --target run -t ${CI_REPO}:${CACHE_TAG}-run -f Dockerfile --push . + # Finally, push run image to where we actually want it. + - docker buildx build --cache-from type=local,src=${HOME}/docker-cache --cache-to type=inline --platform="${PLATFORMS}" --build-arg THREADS=8 --target run -t ${CI_REPO}:${DOCKER_TAG} -f Dockerfile --push . + variables: + GIT_SUBMODULE_STRATEGY: recursive + +# The arm container build takes like 90 minutes, so we don't want to run it +# before the main test phase where the other long tests live. +# To ship a final production Docker tag, we need the ARM and x86 builds +# happening in the same command so we can push one multiarch manifest. +production-build-job: + stage: test + only: + - /^arm/ + - master + - tags + script: + - CI_REPO=${CI_REGISTRY}/vgteam/vg + - CACHE_TAG="cache-$(echo ${CI_COMMIT_BRANCH}${CI_COMMIT_TAG} | tr '/' '-')" + - MAINLINE_CACHE_TAG="cache-master" + - PLATFORMS=linux/amd64,linux/arm64 + # Determine what we should be tagging vg Dockers as. If we're running on a Git tag we want to use that. Otherwise push over the tag we made already. + - if [[ ! -z "${CI_COMMIT_TAG}" ]]; then DOCKER_TAG="${CI_COMMIT_TAG}" ; else DOCKER_TAG="ci-${CI_PIPELINE_IID}-${CI_COMMIT_SHA}"; fi + - make include/vg_git_version.hpp + # Make sure ARM emulation is available. + - if [[ "${CI_BUILDKIT_DRIVER}" != "kubernetes" ]] ; then docker run --privileged --rm tonistiigi/binfmt --install all || true ; fi + # TODO: deduplicate this code with normal build above + # Note that A LOCAL CACHE CAN ONLY HOLD ONE TAG/TARGET AT A TIME! + # And Quay can't use mode=max registry caching to cache the intermediate targets with the final image, just inline caching. + # So we have to do the Complicated Cache Shuffle. + # Build base image from branch and mainline base caches to local cache + - docker buildx build --cache-from type=registry,ref=${CI_REPO}:${MAINLINE_CACHE_TAG}-base --cache-from type=registry,ref=${CI_REPO}:${CACHE_TAG}-base --cache-to type=local,dest=${HOME}/docker-cache --platform="${PLATFORMS}" --build-arg THREADS=8 --target base -t ${CI_REPO}:${CACHE_TAG}-base -f Dockerfile . + # Push base image from local cache to registry cache for branch. + - docker buildx build --cache-from type=local,src=${HOME}/docker-cache --cache-to type=inline --platform="${PLATFORMS}" --build-arg THREADS=8 --target base -t ${CI_REPO}:${CACHE_TAG}-base -f Dockerfile --push . + # Build build image from local base cache and branch and mainline build caches to local cache + - docker buildx build --cache-from type=registry,ref=${CI_REPO}:${MAINLINE_CACHE_TAG}-build --cache-from type=registry,ref=${CI_REPO}:${CACHE_TAG}-build --cache-from type=local,src=${HOME}/docker-cache --cache-to type=local,dest=${HOME}/docker-cache --platform="${PLATFORMS}" --build-arg THREADS=8 --target build -t ${CI_REPO}:${CACHE_TAG}-build -f Dockerfile . + # Push build image to registry cache for branch + - docker buildx build --cache-from type=local,src=${HOME}/docker-cache --cache-to type=inline --platform="${PLATFORMS}" --build-arg THREADS=8 --target build -t ${CI_REPO}:${CACHE_TAG}-build -f Dockerfile --push . 
+ # Build run image from local build cache and branch and mainline run caches to local cache + - docker buildx build --cache-from type=registry,ref=${CI_REPO}:${MAINLINE_CACHE_TAG}-run --cache-from type=registry,ref=${CI_REPO}:${CACHE_TAG}-run --cache-from type=local,src=${HOME}/docker-cache --cache-to type=local,dest=${HOME}/docker-cache --platform="${PLATFORMS}" --build-arg THREADS=8 --target run -t ${CI_REPO}:${CACHE_TAG}-run -f Dockerfile . + # Push run image to registry cache for branch + - docker buildx build --cache-from type=local,src=${HOME}/docker-cache --cache-to type=inline --platform="${PLATFORMS}" --build-arg THREADS=8 --target run -t ${CI_REPO}:${CACHE_TAG}-run -f Dockerfile --push . + # Finally, push run image to where we actually want it. + - docker buildx build --cache-from type=local,src=${HOME}/docker-cache --cache-to type=inline --platform="${PLATFORMS}" --build-arg THREADS=8 --target run -t ${CI_REPO}:${DOCKER_TAG} -f Dockerfile --push . + # Tag it latest if we pushed a real release tag + - if [[ ! -z "${CI_COMMIT_TAG}" ]]; then docker buildx build --cache-from type=local,src=${HOME}/docker-cache --cache-to type=inline --platform="${PLATFORMS}" --build-arg THREADS=8 --target run -t ${CI_REPO}:latest -f Dockerfile --push .; fi + # If we wanted to run the tests under ARM emulation, we could do: + # docker buildx build --platform=linux/arm64 --build-arg THREADS=8 --target test -f Dockerfile . + # But we don't, because they both don't actually pass yet on ARM and also + # manage to hit a 6 hour timeout on our extremely slow emulators. + variables: + GIT_SUBMODULE_STRATEGY: recursive + +# We also run the toil-vg/pytest-based tests +# Note that WE ONLY RUN TESTS LISTED IN vgci/test-list.txt +test-job: + stage: test + # Run in parallel, setting CI_NODE_INDEX and CI_NODE_TOTAL + # We will find our share of tests from vgci/test-list.txt and run them + # We ought to run one job per test, but we can wrap around. + parallel: 6 + cache: + key: docker-pull-cache + paths: + - /var/lib/docker + script: + - docker images + - docker pull "quay.io/vgteam/vg:ci-${CI_PIPELINE_IID}-${CI_COMMIT_SHA}" + - docker tag "quay.io/vgteam/vg:ci-${CI_PIPELINE_IID}-${CI_COMMIT_SHA}" vgci-docker-vg-local + - mkdir -p junit + # Drop secrets before we do any Toil; it might want to log the environment + - export GITLAB_SECRET_FILE_AWS_CREDENTIALS="" + - export GITLAB_SECRET_FILE_DOCS_SSH_KEY="" + - export CI_REGISTRY_PASSWORD="" + - export GH_TOKEN="" + # Make sure IO to Gitlab is in blocking mode so we can't swamp it and crash + - vgci/blockify.py bash vgci/vgci-parallel-wrapper.sh vgci/test-list.txt vgci-docker-vg-local ${CI_NODE_INDEX} ${CI_NODE_TOTAL} ./junit ./test_output + after_script: + - stopdocker || true + - rm -f /var/run/docker.sock + - startdocker || true + # Don't leave containers in the cache + - docker ps -q -a | xargs docker rm -f || true + # Don't leave each run's CI image laying around in the cache + - docker rmi "quay.io/vgteam/vg:ci-${CI_PIPELINE_IID}-${CI_COMMIT_SHA}" vgci-docker-vg-local + # Show what we are caching + - docker images + - stopdocker || true + + artifacts: + # Let Gitlab see the junit report + reports: + junit: junit/*.xml + paths: + - junit/*.xml + - test_output/* + # Make sure they get artifact'd even if (especially when) the tests fail + when: always + expire_in: 3 days + +# We have a final job in the last stage to compose an HTML report +report-job: + stage: report + # Run this even when the tests fail, because otherwise we won't hear about it. 
+ # Hopefully if the actual build failed we fail at the docker pull and we don't upload stuff for no version. + when: always + # All artifacts from previous stages are available + script: + # Get the Docker for version detection + - docker pull "quay.io/vgteam/vg:ci-${CI_PIPELINE_IID}-${CI_COMMIT_SHA}" + - docker tag "quay.io/vgteam/vg:ci-${CI_PIPELINE_IID}-${CI_COMMIT_SHA}" vgci-docker-vg-local + # Collect all the junit files from all the test jobs into one + - junit-merge -o junit.all.xml junit/*.xml + # All the test output folder artifacts should automatically merge. + # Make the report and post it. + # We still need the Docker for version detection. + # Make sure IO to Gitlab is in blocking mode so we can't swamp it and crash + - vgci/blockify.py bash vgci/vgci.sh -J junit.all.xml -T vgci-docker-vg-local -W test_output + +# We need a separate job to build the Doxygen docs +docs-job: + stage: build + script: + - doc/publish-docs.sh + + diff --git a/.gitmodules b/.gitmodules index a04fd8e6660..6b6337be74d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,12 +1,12 @@ [submodule "vcflib"] path = deps/vcflib - url = https://github.com/vgteam/vcflib.git + url = https://github.com/vcflib/vcflib.git [submodule "fastahack"] path = deps/fastahack url = https://github.com/vgteam/fastahack.git [submodule "gssw"] path = deps/gssw - url = https://github.com/ekg/gssw.git + url = https://github.com/vgteam/gssw.git [submodule "bash-tap"] path = deps/bash-tap url = https://github.com/illusori/bash-tap.git @@ -16,30 +16,21 @@ [submodule "lru_cache"] path = deps/lru_cache url = https://github.com/ekg/lru_cache.git -[submodule "htslib"] - path = deps/htslib - url = https://github.com/vgteam/htslib.git [submodule "sha1"] path = deps/sha1 url = https://github.com/vog/sha1.git -[submodule "protobuf"] - path = deps/protobuf - url = https://github.com/google/protobuf.git [submodule "gcsa2"] path = deps/gcsa2 url = https://github.com/jltsiren/gcsa2.git [submodule "sdsl-lite"] path = deps/sdsl-lite - url = https://github.com/simongog/sdsl-lite + url = https://github.com/vgteam/sdsl-lite.git [submodule "deps/libVCFH"] path = deps/libVCFH url = https://github.com/edawson/libVCFH.git [submodule "deps/sparsehash"] path = deps/sparsehash url = https://github.com/sparsehash/sparsehash.git -[submodule "gfakluge"] - path = deps/gfakluge - url = https://github.com/edawson/gfakluge.git [submodule "deps/DYNAMIC"] path = deps/DYNAMIC url = https://github.com/vgteam/DYNAMIC @@ -60,13 +51,7 @@ url = https://github.com/benedictpaten/sonLib.git [submodule "deps/fermi-lite"] path = deps/fermi-lite - url = https://github.com/edawson/fermi-lite.git -[submodule "deps/rocksdb"] - path = deps/rocksdb - url = https://github.com/facebook/rocksdb.git -[submodule "deps/gperftools"] - path = deps/gperftools - url = https://github.com/gperftools/gperftools.git + url = https://github.com/vgteam/fermi-lite.git [submodule "deps/gbwt"] path = deps/gbwt url = https://github.com/jltsiren/gbwt.git @@ -85,15 +70,66 @@ [submodule "deps/sparsepp"] path = deps/sparsepp url = https://github.com/greg7mdp/sparsepp.git -[submodule "deps/vowpal_wabbit"] - path = deps/vowpal_wabbit - url = https://github.com/JohnLangford/vowpal_wabbit.git -[submodule "deps/boost-subset"] - path = deps/boost-subset - url = https://github.com/vgteam/boost-subset.git [submodule "deps/libdeflate"] path = deps/libdeflate url = https://github.com/ebiggers/libdeflate.git [submodule "deps/dozeu"] path = deps/dozeu - url = https://github.com/ocxtal/dozeu.git + url = 
https://github.com/vgteam/dozeu.git +[submodule "deps/libhandlegraph"] + path = deps/libhandlegraph + url = https://github.com/vgteam/libhandlegraph.git +[submodule "deps/libvgio"] + path = deps/libvgio + url = https://github.com/vgteam/libvgio.git +[submodule "deps/jemalloc"] + path = deps/jemalloc + url = https://github.com/jemalloc/jemalloc.git +[submodule "deps/sglib"] + path = deps/sglib + url = https://github.com/vgteam/sglib.git + branch = master +[submodule "deps/FlameGraph"] + path = deps/FlameGraph + url = https://github.com/brendangregg/FlameGraph +[submodule "deps/libbdsg"] + path = deps/libbdsg + url = https://github.com/vgteam/libbdsg.git +[submodule "deps/xg"] + path = deps/xg + url = https://github.com/vgteam/xg.git +[submodule "deps/gbwtgraph"] + path = deps/gbwtgraph + url = https://github.com/jltsiren/gbwtgraph.git +[submodule "deps/ips4o"] + path = deps/ips4o + url = https://github.com/vgteam/ips4o.git +[submodule "deps/mmmultimap"] + path = deps/mmmultimap + url = https://github.com/ekg/mmmultimap.git +[submodule "vgteam_bbhash"] + path = deps/BBHash + url = https://github.com/vgteam/BBHash.git +[submodule "src/simde"] + path = src/simde + url = https://github.com/nemequ/simde-no-tests +[submodule "doc/wiki"] + path = doc/wiki + url = https://github.com/vgteam/vg.wiki.git +[submodule "deps/mio"] + path = deps/mio + url = https://github.com/mandreyel/mio.git +[submodule "deps/atomic_queue"] + path = deps/atomic_queue + url = https://github.com/max0x7ba/atomic_queue.git +[submodule "deps/htslib"] + path = deps/htslib + url = https://github.com/samtools/htslib.git + branch = master +[submodule "deps/tabixpp"] + path = deps/tabixpp + url = https://github.com/ekg/tabixpp.git + branch = master +[submodule "deps/kff-cpp-api"] + path = deps/kff-cpp-api + url = https://github.com/Kmer-File-Format/kff-cpp-api.git diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index ead1f26b4d8..00000000000 --- a/.travis.yml +++ /dev/null @@ -1,127 +0,0 @@ -# Control file for continuous integration testing at http://travis-ci.org/ - -language: cpp -compiler: gcc -sudo: required -dist: trusty -# We have some shenanigans to let us cache submodules, and update changed files -# without messing up mtimes and triggering rebuilds unnecessarily. Travis checks -# out our submodules and then restores the cache over them. We move the cached -# version out of the way, check out the versions we want, rsync over only the -# differing/updated files (updating only their mtimes), and then put the fixed -# version back. -before_install: - - if [[ -z "$BUILD_DOCS_ONLY" ]]; then if [[ -e deps ]]; then mv deps deps_cached; fi; fi - - if [[ -z "$BUILD_DOCS_ONLY" ]]; then git submodule update --init --recursive; fi - - if [[ -z "$BUILD_DOCS_ONLY" ]]; then rsync -rv --links --checksum deps/ deps_cached/; fi - - if [[ -z "$BUILD_DOCS_ONLY" ]]; then rm -Rf deps; fi - # Keep the cached deps if the right compiler version was used. 
- # Otherwise start fresh - - if [[ -z "$BUILD_DOCS_ONLY" && "$TRAVIS_OS_NAME" == "linux" && -e deps_cached/gcc6 ]]; then mv deps_cached deps; fi - - if [[ -z "$BUILD_DOCS_ONLY" && "$TRAVIS_OS_NAME" == "osx" && "$INSTALL_GCC" == "1" && -e deps_cached/gcc6 ]]; then mv deps_cached deps; fi - - if [[ -z "$BUILD_DOCS_ONLY" && "$TRAVIS_OS_NAME" == "osx" && "$INSTALL_GCC" == "0" && -e deps_cached/clang ]]; then mv deps_cached deps; fi - - (ls -lah deps/; ls -lah bin/; ls -lah lib/; ls -lah include/) || true - - if [[ -z "$BUILD_DOCS_ONLY" && "$TRAVIS_OS_NAME" == "linux" ]]; then ls /etc/apt/sources.list.d; fi - - if [[ -z "$BUILD_DOCS_ONLY" && "$TRAVIS_OS_NAME" == "linux" ]]; then sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-6 60 --slave /usr/bin/g++ g++ /usr/bin/g++-6; fi - - if [[ -z "$BUILD_DOCS_ONLY" && "$TRAVIS_OS_NAME" == "linux" ]]; then mkdir -p deps; touch deps/gcc6; fi - - if [[ -z "$BUILD_DOCS_ONLY" && "$TRAVIS_OS_NAME" == "osx" ]]; then brew bundle; fi # TODO: Check back after addon has been fixed. - - if [[ -z "$BUILD_DOCS_ONLY" && "$TRAVIS_OS_NAME" == "osx" ]]; then sudo softwareupdate -i "Command Line Tools (macOS High Sierra version 10.13) for Xcode-9.4"; fi - - if [[ -z "$BUILD_DOCS_ONLY" && "$TRAVIS_OS_NAME" == "osx" ]]; then export PKG_CONFIG_PATH="/usr/local/lib/pkgconfig:$PKG_CONFIG_PATH"; fi - - if [[ -z "$BUILD_DOCS_ONLY" && "$TRAVIS_OS_NAME" == "osx" && "$INSTALL_GCC" == "1" ]]; then brew install gcc6 || true; fi - - if [[ -z "$BUILD_DOCS_ONLY" && "$TRAVIS_OS_NAME" == "osx" && "$INSTALL_GCC" == "1" ]]; then brew link --overwrite gcc@6; fi - - if [[ -z "$BUILD_DOCS_ONLY" && "$TRAVIS_OS_NAME" == "osx" && "$INSTALL_GCC" == "1" ]]; then mkdir -p ./bin; fi - - if [[ -z "$BUILD_DOCS_ONLY" && "$TRAVIS_OS_NAME" == "osx" && "$INSTALL_GCC" == "1" ]]; then ln -sf `which g++-6` ./bin/g++; fi - - if [[ -z "$BUILD_DOCS_ONLY" && "$TRAVIS_OS_NAME" == "osx" && "$INSTALL_GCC" == "1" ]]; then ln -sf `which gcc-6` ./bin/gcc; fi - - if [[ -z "$BUILD_DOCS_ONLY" && "$TRAVIS_OS_NAME" == "osx" && "$INSTALL_GCC" == "1" ]]; then mkdir -p deps; touch deps/gcc6; fi - - if [[ -z "$BUILD_DOCS_ONLY" && "$TRAVIS_OS_NAME" == "osx" && "$INSTALL_GCC" == "0" ]]; then mkdir -p deps; touch deps/clang; fi - - if [[ -z "$BUILD_DOCS_ONLY" && "$TRAVIS_OS_NAME" == "osx" ]]; then brew link bison --force; fi - - if [[ -z "$BUILD_DOCS_ONLY" && "$TRAVIS_OS_NAME" == "osx" ]]; then export PATH="/usr/local/opt/coreutils/libexec/gnubin:/usr/local/bin:$PATH"; fi - - if [[ -z "$BUILD_DOCS_ONLY" && "$TRAVIS_OS_NAME" == "osx" ]]; then export LD_LIBRARY_PATH=/usr/local/lib/; fi - - if [[ -z "$BUILD_DOCS_ONLY" && "$TRAVIS_OS_NAME" == "osx" ]]; then export CFLAGS="-I/usr/local/include/"; fi - - if [[ -z "$BUILD_DOCS_ONLY" && "$TRAVIS_OS_NAME" == "osx" ]]; then export LIBRARY_PATH=$LD_LIBRARY_PATH; fi - - gcc --version && g++ --version - - if [[ -z "$BUILD_DOCS_ONLY" ]]; then python ./configure.py; fi - - if [[ -z "$BUILD_DOCS_ONLY" ]]; then source ./source_me.sh; fi -install: - - if [[ -z "$BUILD_DOCS_ONLY" && "$TRAVIS_OS_NAME" == "linux" ]]; then make get-deps; fi -script: - - if [[ -z "$BUILD_DOCS_ONLY" && "$TRAVIS_OS_NAME" == "linux" ]]; then git submodule update --recursive && make -j4 && echo Testing && make test && make static -j4; fi - - if [[ -z "$BUILD_DOCS_ONLY" && "$TRAVIS_OS_NAME" == "osx" ]]; then git submodule update --recursive && timeout 1800 make deps -j4 && make -j4 && echo Testing && make test; fi - - if [[ ! 
-z "$BUILD_DOCS_ONLY" ]]; then doc/publish-docs.sh; fi -# Cache all our dependency directories, and our lib and include -cache: - directories: - - deps - - lib - - include - - bin -before_cache: - - rm -f lib/libvg.* - - rm -f include/vg.pb.h include/vg_git_version.hpp - - rm -f bin/vg /bin/vg* bin/g++ bin/gcc - -addons: - homebrew: - # We should be able to list packages here, but it isn't quite working, so let's try a Brewfile - brewfile: true - apt: - sources: - - ubuntu-toolchain-r-test - update: true - packages: # Get all the current dependency packages in advance. We will still do make get-deps but it will do less work. - - gcc-6 - - g++-6 - - bc - - rs - - jq - - samtools - - cmake - - protobuf-compiler - - libprotoc-dev - - libjansson-dev - - libbz2-dev - - libncurses5-dev - - automake - - libtool - - curl - - unzip - - redland-utils - - librdf-dev - - pkg-config - - wget - - gtk-doc-tools - - raptor2-utils - - rasqal-utils - - bison - - flex - - gawk - - libgoogle-perftools-dev - - liblz4-dev - - liblzma-dev - - libcairo2-dev - - libpixman-1-dev - - libffi-dev - - doxygen - -os: - - linux - - osx -compiler: - - gcc -env: - global: - - DOCS_KEY_ENCRYPTION_LABEL=125272388526 - matrix: - - INSTALL_GCC=1 - - INSTALL_GCC=0 - -matrix: - exclude: - - os: linux - env: INSTALL_GCC=1 - include: - # We have a special entry to do the docs build - - os: linux - env: BUILD_DOCS_ONLY=1 - - diff --git a/Brewfile b/Brewfile index fc16f2b4400..f730aae8bee 100644 --- a/Brewfile +++ b/Brewfile @@ -1,15 +1,22 @@ +brew "coreutils" +brew "python" brew "jq" +brew "parallel" +brew "node" brew "jansson" -brew "md5sha1sum" +brew "protobuf" brew "samtools" -brew "rasqal" brew "bison" -brew "raptor" -brew "rasqal" brew "gperftools" brew "autogen" brew "lz4" brew "xz" +brew "zstd" brew "cairo" brew "expat" brew "libomp" +brew "automake" +brew "autoconf" +brew "cmake" +brew "boost" +brew "pybind11" diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000000..8b96a634a57 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,153 @@ +# Multi-container Dockerfile for build and run containers for vg + +# Use Google's non-rate-limited mirror of Docker Hub to get our base image. +# This helps automated Quay builds because Quay hasn't built a caching system +# and exposes pull rate limits to users. +FROM mirror.gcr.io/library/ubuntu:20.04 AS base +MAINTAINER vgteam + +RUN echo base > /stage.txt + +WORKDIR /vg + +# Prevent dpkg from trying to ask any questions, ever +ENV DEBIAN_FRONTEND noninteractive +ENV DEBCONF_NONINTERACTIVE_SEEN true + +FROM base AS build +ARG THREADS=8 +ARG TARGETARCH + +RUN echo build > /stage.txt + +RUN apt-get -qq -y update && \ + apt-get -qq -y upgrade && \ + apt-get -qq -y install sudo + +# Install all vg's dependencies. +# The Makefile will come parse the Dockerfile to get the correct dependencies; +# this is the One True Depencency List. +# We don't need to clean the package index since we don't ship this image and +# don't care about its size. +# We clip out everything between these begin and end markers, except the line +# that starts with RUN, or comments. And we pull out line continuation slashes. +# TODO: can we read them here and in the Makefile from the README instead? 
+###DEPS_BEGIN### +RUN apt-get -qq -y update && apt-get -qq -y upgrade && apt-get -qq -y install \ + make git build-essential protobuf-compiler libprotoc-dev libjansson-dev libbz2-dev \ + libncurses5-dev automake gettext autopoint libtool jq bsdmainutils bc rs parallel npm \ + samtools curl unzip redland-utils librdf-dev cmake pkg-config wget gtk-doc-tools \ + raptor2-utils rasqal-utils bison flex gawk libgoogle-perftools-dev liblz4-dev liblzma-dev \ + libcairo2-dev libpixman-1-dev libffi-dev libcairo-dev libprotobuf-dev libboost-all-dev \ + tabix bcftools libzstd-dev pybind11-dev python3-pybind11 +###DEPS_END### + +# Prepare to build submodule dependencies +COPY source_me.sh /vg/source_me.sh +COPY deps /vg/deps +# To increase portability of the docker image, when building for amd64, set the +# target CPU architecture to Nehalem (2008) rather than auto-detecting the +# build machine's CPU. This has no AVX1, AVX2, or PCLMUL, but it does have +# SSE4.2. UCSC has a Nehalem machine that we want to support. +RUN if [ -z "${TARGETARCH}" ] || [ "${TARGETARCH}" = "amd64" ] ; then sed -i s/march=native/march=nehalem/ deps/sdsl-lite/CMakeLists.txt; fi +# Clear any CMake caches in case we are building from someone's checkout +RUN find . -name CMakeCache.txt | xargs rm -f +# Build the dependencies +COPY Makefile /vg/Makefile +RUN . ./source_me.sh && CXXFLAGS="$(if [ -z "${TARGETARCH}" ] || [ "${TARGETARCH}" = "amd64" ] ; then echo " -march=nehalem "; fi)" CFLAGS="$(if [ -z "${TARGETARCH}" ] || [ "${TARGETARCH}" = "amd64" ] ; then echo " -march=nehalem "; fi)" make -j $((THREADS < $(nproc) ? THREADS : $(nproc))) deps + +# Bring in the sources, which we need in order to build +COPY src /vg/src + +# Build all the object files for vg, but don't link. +# Also pass the arch here +RUN . ./source_me.sh && CXXFLAGS="$(if [ -z "${TARGETARCH}" ] || [ "${TARGETARCH}" = "amd64" ] ; then echo " -march=nehalem "; fi)" make -j $((THREADS < $(nproc) ? THREADS : $(nproc))) objs + +# Bring in any includes we pre-made, like the git version, if present +COPY include /vg/include + +# Make sure version introspection is up to date +RUN rm -f obj/version.o && . ./source_me.sh && CXXFLAGS="$(if [ -z "${TARGETARCH}" ] || [ "${TARGETARCH}" = "amd64" ] ; then echo " -march=nehalem "; fi)" make -j $((THREADS < $(nproc) ? THREADS : $(nproc))) obj/version.o + +# Announce the version file, which must exist by now +RUN ls /vg/include && cat /vg/include/vg_git_version.hpp + +# Do the final build and link, knowing the version. Trim down the resulting binary but make sure to include enough debug info for profiling. +RUN . ./source_me.sh && CXXFLAGS="$(if [ -z "${TARGETARCH}" ] || [ "${TARGETARCH}" = "amd64" ] ; then echo " -march=nehalem "; fi)" make -j $((THREADS < $(nproc) ? THREADS : $(nproc))) static && strip -d bin/vg + +# Ship the scripts +COPY scripts /vg/scripts + +ENV PATH /vg/bin:$PATH + +############################################################################################ +FROM build AS test +ARG THREADS=8 + +RUN echo test > /stage.txt + +RUN curl -sL https://deb.nodesource.com/setup_16.x | bash - && apt-get -qq -y install nodejs && npm install -g txm@7.4.5 + +# Fail if any non-portable instructions were used +RUN /bin/bash -e -c 'if objdump -d /vg/bin/vg | grep vperm2i128 ; then exit 1 ; else exit 0 ; fi' + +# Bring in the tests and docs, which have doctests +COPY test /vg/test +COPY doc /vg/doc +# We test the README so bring it along. 
+COPY README.md /vg/ + +# Run tests in the middle so the final container that gets tagged is the run container. +# Tests may not actually be run by smart builders like buildkit. +RUN /bin/bash -e -c "export OMP_NUM_THREADS=$((THREADS < $(nproc) ? THREADS : $(nproc))); make test" + + +############################################################################################ +FROM base AS run + +RUN echo run > /stage.txt + +# Install packages which toil-vg needs to be available inside the image, for +# pipes and profiling, and good usability on Kubernetes. +# TODO: which of these can be removed? +# Make sure to clean so we don't ship old apt package indexes in our Docker. +RUN ls -lah /vg && \ + apt-get -qq -y update && \ + apt-get -qq -y upgrade && \ + apt-get -qq -y install --no-upgrade \ + curl \ + wget \ + pigz \ + dstat \ + pv \ + jq \ + samtools \ + tabix \ + parallel \ + fontconfig-config \ + awscli \ + binutils \ + libpython2.7 \ + libperl-dev \ + libelf1 \ + libdw1 \ + libslang2 \ + libnuma1 \ + numactl \ + bc \ + linux-tools-common \ + linux-tools-generic \ + perl \ + time \ + && apt-get -qq -y clean + +COPY --from=build /vg/bin/vg /vg/bin/ + +COPY --from=build /vg/scripts/* /vg/scripts/ +# Make sure we have the flame graph scripts so we can do self-profiling +COPY --from=build /vg/deps/FlameGraph /vg/deps/FlameGraph + +ENV PATH /vg/bin:$PATH + + + diff --git a/Dockerfile.static b/Dockerfile.static new file mode 100644 index 00000000000..3ede4c24458 --- /dev/null +++ b/Dockerfile.static @@ -0,0 +1,35 @@ +# Dockerfile for shipping just the vg binary you have +# Run with DOCKER_BUILDKIT=1 to avoid shipping the whole vg directory as context +FROM ubuntu:18.04 +MAINTAINER vgteam + +WORKDIR /vg + +ENV PATH /vg/bin:$PATH + +ENTRYPOINT /vg/bin/vg + +# Prevent dpkg from trying to ask any questions, ever +ENV DEBIAN_FRONTEND noninteractive +ENV DEBCONF_NONINTERACTIVE_SEEN true + +# Install dependencies for scripts +RUN apt-get -qq -y update && \ + apt-get -qq -y upgrade && \ + apt-get -qq -y install --no-upgrade \ + numactl \ + python3-matplotlib \ + python3-numpy \ + awscli \ + bwa \ + jq \ + bc \ + linux-tools-common \ + linux-tools-generic \ + binutils \ + perl \ + && apt-get -qq -y clean + +COPY deps/FlameGraph /vg/deps/FlameGraph +COPY scripts /vg/scripts +COPY bin/vg /vg/bin/vg diff --git a/Doxyfile b/Doxyfile index 5f171db1e97..b77457d032a 100644 --- a/Doxyfile +++ b/Doxyfile @@ -758,7 +758,7 @@ WARN_LOGFILE = # spaces. # Note: If this tag is empty the current directory is searched. -INPUT = src src/subcommand src/unittest/driver.cpp src/unittest/driver.hpp +INPUT = src src/algorithms src/io src/subcommand src/unittest/driver.cpp src/unittest/driver.hpp deps/xg/src/xg.cpp deps/xg/src/xg.hpp deps/libbdsg/src deps/libbdsg/include/bdsg deps/libhandlegraph/src deps/libhandlegraph/src/include/handlegraph deps/libvgio/src deps/libvgio/include/vg/io deps/libvgio/deps # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses @@ -863,7 +863,7 @@ IMAGE_PATH = # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. -INPUT_FILTER = "python contrib/proto2cpp/proto2cpp.py" +INPUT_FILTER = "python3 contrib/proto2cpp/proto2cpp.py" # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. 
Doxygen will compare the file name with each pattern and apply the @@ -1060,7 +1060,7 @@ HTML_HEADER = # that doxygen normally uses. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_FOOTER = +HTML_FOOTER = doc/footer_template.html # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style # sheet that is used by each HTML page. It can be used to fine-tune the look of diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md deleted file mode 100644 index 3feb16ffce5..00000000000 --- a/ISSUE_TEMPLATE.md +++ /dev/null @@ -1,10 +0,0 @@ -Please describe: - -1. What you were trying to do -2. What you wanted to happen -3. What actually happened -4. What data and command line to use to make the problem recur, if applicable - -``` -Format code blocks or terminal copy-pastes like this, between triple backticks. -``` diff --git a/LICENSE b/LICENSE index 6caac54e16e..149d71b25b7 100644 --- a/LICENSE +++ b/LICENSE @@ -22,7 +22,7 @@ COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -Catch.hpp is licensed under the Boost Software License. +src/unittest/catch.hpp is licensed under the Boost Software License. Boost Software License - Version 1.0 - August 17th, 2003 @@ -47,3 +47,11 @@ SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +src/stream/fdstream.hpp is licensed under the following license: + +(C) Copyright Nicolai M. Josuttis 2001. +Permission to copy, use, modify, sell and distribute this software +is granted provided this copyright notice appears in all copies. +This software is provided "as is" without express or implied +warranty, and with no claim as to its suitability for any purpose. 
diff --git a/Makefile b/Makefile index f8a2509372a..103900bfde0 100644 --- a/Makefile +++ b/Makefile @@ -1,94 +1,281 @@ DEP_DIR:=./deps SRC_DIR:=src ALGORITHMS_SRC_DIR:=$(SRC_DIR)/algorithms -UNITTEST_SRC_DIR:=$(SRC_DIR)/unittest +CONFIG_SRC_DIR:=$(SRC_DIR)/config +IO_SRC_DIR:=$(SRC_DIR)/io SUBCOMMAND_SRC_DIR:=$(SRC_DIR)/subcommand +UNITTEST_SRC_DIR:=$(SRC_DIR)/unittest +UNITTEST_SUPPORT_SRC_DIR:=$(SRC_DIR)/unittest/support BIN_DIR:=bin +UNITTEST_BIN_DIR:=$(BIN_DIR)/unittest OBJ_DIR:=obj +SHARED_OBJ_DIR:=obj/pic ALGORITHMS_OBJ_DIR:=$(OBJ_DIR)/algorithms -UNITTEST_OBJ_DIR:=$(OBJ_DIR)/unittest +ALGORITHMS_SHARED_OBJ_DIR:=$(SHARED_OBJ_DIR)/algorithms +CONFIG_OBJ_DIR:=$(OBJ_DIR)/config +IO_OBJ_DIR:=$(OBJ_DIR)/io +IO_SHARED_OBJ_DIR:=$(SHARED_OBJ_DIR)/io SUBCOMMAND_OBJ_DIR:=$(OBJ_DIR)/subcommand +UNITTEST_OBJ_DIR:=$(OBJ_DIR)/unittest +UNITTEST_SUPPORT_OBJ_DIR:=$(OBJ_DIR)/unittest/support LIB_DIR:=lib # INC_DIR must be a relative path INC_DIR:=include -CPP_DIR:=cpp CWD:=$(shell pwd) +CXX ?= g++ +PKG_CONFIG ?= pkg-config -EXE:=vg +SFX := +EXE:=vg$(SFX) all: $(BIN_DIR)/$(EXE) # Magic dependencies (see ) include $(wildcard $(OBJ_DIR)/*.d) +include $(wildcard $(SHARED_OBJ_DIR)/*.d) include $(wildcard $(ALGORITHMS_OBJ_DIR)/*.d) -include $(wildcard $(UNITTEST_OBJ_DIR)/*.d) +include $(wildcard $(ALGORITHMS_SHARED_OBJ_DIR)/*.d) +include $(wildcard $(CONFIG_OBJ_DIR)/*.d) +include $(wildcard $(IO_OBJ_DIR)/*.d) +include $(wildcard $(IO_SHARED_OBJ_DIR)/*.d) include $(wildcard $(SUBCOMMAND_OBJ_DIR)/*.d) +include $(wildcard $(UNITTEST_OBJ_DIR)/*.d) +include $(wildcard $(UNITTEST_BIN_DIR)/*.d) + +# What pkg-config-controlled system dependencies should we use compile and link flags from? +# Use PKG_CONFIG_PATH to point the build system at the right versions of these, if they aren't picked up automatically. +# We can't do this for our bundled, pkg-config-supporting dependencies (like htslib) because they won't be built yet. +PKG_CONFIG_DEPS := cairo libzstd +# These are like PKG_CONFIG_DEPS but we try to always link them statically, if possible. +# Note that we then must *always* link anything *else* that uses them statically. +# Jansson has to be in here because it has to come after libvgio, which is in the static deps. +PKG_CONFIG_STATIC_DEPS := protobuf jansson + +# We don't ask for -fopenmp here because how we get it can depend on the compiler. +# We don't ask for automatic Make dependency file (*.d) generation here because +# the options we pass can interfere with similar options in dependency project. +CXXFLAGS := -O3 -Werror=return-type -ggdb -g $(CXXFLAGS) +# Keep dependency generation flags for just our own sources +DEPGEN_FLAGS := -MMD -MP + +# Set include flags. All -I options need to go in here, so the first directory +# listed is genuinely searched first. +# We make our dependency install directory -isystem; this might not be +# necessary on all platforms and suppresses warnings. +# Also, pkg-config flags need to be made -isystem if our dependency install +# directory is, or they might put a system HTSlib before ours. +# Also, Protobuf produces an absurd number of these now, so we deduplicate them +# even though that's not *always* safe. See +# and +# +INCLUDE_FLAGS :=-I$(CWD)/$(INC_DIR) -isystem $(CWD)/$(INC_DIR) -I. 
-I$(CWD)/$(SRC_DIR) -I$(CWD)/$(UNITTEST_SRC_DIR) -I$(CWD)/$(UNITTEST_SUPPORT_SRC_DIR) -I$(CWD)/$(SUBCOMMAND_SRC_DIR) -I$(CWD)/$(INC_DIR)/dynamic $(shell $(PKG_CONFIG) --cflags $(PKG_CONFIG_DEPS) $(PKG_CONFIG_STATIC_DEPS) | tr ' ' '\n' | awk '!x[$$0]++' | tr '\n' ' ' | sed 's/ -I/ -isystem /g') + +# Define libraries to link vg against. +LD_LIB_DIR_FLAGS := -L$(CWD)/$(LIB_DIR) +LD_LIB_FLAGS := -lvcflib -ltabixpp -lgssw -lssw -lsublinearLS -lpthread -lncurses -lgcsa2 -lgbwtgraph -lgbwt -lkff -ldivsufsort -ldivsufsort64 -lvcfh -lraptor2 -lpinchesandcacti -l3edgeconnected -lsonlib -lfml -lstructures -lbdsg -lxg -lsdsl -lzstd -lhandlegraph +# We omit Boost Program Options for now; we find it in a platform-dependent way. +# By default it has no suffix +BOOST_SUFFIX="" +# We define some more libraries to link against at the end, in static linking mode if possible, so we can use faster non-PIC code. +LD_STATIC_LIB_FLAGS := -lvgio $(CWD)/$(LIB_DIR)/libtabixpp.a $(CWD)/$(LIB_DIR)/libhts.a $(CWD)/$(LIB_DIR)/libdeflate.a -lz -lbz2 -llzma +# Some of our static libraries depend on libraries that may not always be avilable in static form. +LD_STATIC_LIB_DEPS := -lpthread -lm +# Use pkg-config to find dependencies. +# Always use --static so that we have the -l flags for transitive dependencies, in case we're doing a full static build. +# But only force static linking of the dependencies we want to use non-PIC code for, for speed. +LD_LIB_FLAGS += $(shell $(PKG_CONFIG) --libs --static $(PKG_CONFIG_DEPS)) +LD_STATIC_LIB_FLAGS += $(shell $(PKG_CONFIG) --libs --static $(PKG_CONFIG_STATIC_DEPS)) +# We also use plain LDFLAGS to point at system library directories that we want +# to propagate through to dependencies' builds. + +# CMake builds that need to find OpenMP might not know about all the prefixes it could be installed into. +# So we make a list of prefixes to search for it. +OMP_PREFIXES:=/ + +# Travis needs -latomic for all builds *but* GCC on Mac +ifeq ($(strip $(shell $(CXX) -latomic /dev/null -o/dev/null 2>&1 | grep latomic | wc -l)), 0) + # Use -latomic if the compiler doesn't complain about it + LD_LIB_FLAGS += -latomic +endif -# We don't ask for -fopenmp here because how we get it can depend on the compiler -CXXFLAGS := -O3 -Werror=return-type -std=c++11 -ggdb -g -MMD -MP -msse4.2 $(CXXFLAGS) - -LD_INCLUDE_FLAGS:=-I$(CWD)/$(INC_DIR) -I. -I$(CWD)/$(SRC_DIR) -I$(CWD)/$(UNITTEST_SRC_DIR) -I$(CWD)/$(SUBCOMMAND_SRC_DIR) -I$(CWD)/$(CPP_DIR) -I$(CWD)/$(INC_DIR)/dynamic -I$(CWD)/$(INC_DIR)/sonLib $(shell pkg-config --cflags cairo) - -LD_LIB_FLAGS:= -L$(CWD)/$(LIB_DIR) -lvcflib -lgssw -lssw -lprotobuf -lsublinearLS -lhts -ldeflate -lpthread -ljansson -lncurses -lgcsa2 -lgbwt -ldivsufsort -ldivsufsort64 -lvcfh -lgfakluge -lraptor2 -lsdsl -lpinchesandcacti -l3edgeconnected -lsonlib -lfml -llz4 -lstructures -lvw -lboost_program_options -lallreduce -# Use pkg-config to find Cairo and all the libs it uses -LD_LIB_FLAGS += $(shell pkg-config --libs --static cairo) +COMPILER_ID=$(strip $(shell $(CXX) --version 2>&1)) +ifeq ($(shell uname -s),Darwin) + $(info OS is Mac) + # Don't try and set an rpath on any dependency utilities because that's not + # a thing and install names will work. + LD_UTIL_RPATH_FLAGS="" + # Homebrew installs a Protobuf that uses an Abseil that is built with C++17, so we need to build with at least C++17 + CXX_STANDARD=17 -ifeq ($(shell uname -s),Darwin) - # We may need libraries from Macports - # TODO: where does Homebrew keep libraries? 
+ # We may need libraries from Macports ifeq ($(shell if [ -d /opt/local/lib ];then echo 1;else echo 0;fi), 1) - # Use /opt/local/lib if present - LD_LIB_FLAGS += -L/opt/local/lib + # Use /opt/local/lib if present when building dependencies + LDFLAGS += -L/opt/local/lib endif - ifeq ($(shell if [ -d /usr/local/lib ];then echo 1;else echo 0;fi), 1) # Use /usr/local/lib if present. - LD_LIB_FLAGS += -L/usr/local/lib + LDFLAGS += -L/usr/local/lib + endif + ifeq ($(shell if [ -d /usr/local/include ];then echo 1;else echo 0;fi), 1) + # Use /usr/local/include to the end of the include search path. + # Make sure it is system level only so it comes after other -I paths. + INCLUDE_FLAGS += -isystem /usr/local/include + + ifeq ($(shell if [ -d /usr/local/include/cairo ];then echo 1;else echo 0;fi), 1) + # pkg-config is not always smart enough to find Cairo's include path for us. + # We make sure to grab its directory manually if we see it. + INCLUDE_FLAGS += -isystem /usr/local/include/cairo + LD_LIB_FLAGS += -lcairo + endif endif - # Our compiler might be clang that lacks -fopenmp support. - # Sniff that - ifeq ($(strip $(shell $(CXX) -fopenmp /dev/null -o/dev/null 2>&1 | grep fopenmp | wc -l)), 1) - # The compiler complained about fopenmp instead of its nonsense input file. + ifndef HOMEBREW_PREFIX + BREW_PATH=$(shell which brew 2>/dev/null) + ifneq ($(BREW_PATH),) + # Get prefix from Homebrew instead of environment + HOMEBREW_PREFIX=$(shell brew --prefix) + endif + endif + + ifdef HOMEBREW_PREFIX + # We need Bison from Homebrew instead of Apple's old Bison, and GNU coreutils + export PATH:=$(HOMEBREW_PREFIX)/opt/bison/bin:$(HOMEBREW_PREFIX)/opt/coreutils/libexec/gnubin:$(PATH) + # If we have homebrew, use Homebrew in general + CXXFLAGS += -I$(HOMEBREW_PREFIX)/include + LDFLAGS += -L$(HOMEBREW_PREFIX)/lib + endif + + # We need to find Boost Program Options. It is usually + # -lboost_program_options, except for on Macports installs of Boost where + # it is -lboost_program_options-mt. If we were a real build system we would + # try things until it worked. Instead, we guess. + ifeq ($(shell if [ -f /opt/local/lib/libboost_program_options-mt.dylib ];then echo 1;else echo 0;fi), 1) + # This is where Macports puts it, so use that name + BOOST_SUFFIX="-mt" + endif + + # Our compiler might be Apple clang, which doesn't have -fopenmp. + ifneq ($(strip $(shell echo "$(COMPILER_ID)" | grep -i clang | wc -l)), 0) + # This is Clang. + $(info Compiler $(CXX) is Clang) + # We need to use the hard way of getting OpenMP not bundled with the compiler. # The compiler only needs to do the preprocessing CXXFLAGS += -Xpreprocessor -fopenmp - ifeq ($(shell if [ -d /opt/local/lib/libomp ];then echo 1;else echo 0;fi), 1) - # Use /opt/local/lib/libomp if present, because Macports installs libomp there. - # Brew is supposed to put it somewhere the compiler can find it by default. - LD_LIB_FLAGS += -L/opt/local/lib/libomp - # And we need to find the includes. 
Homebrew puts them in the normal place - # but Macports hides them in "libomp" + ifeq ($(shell if [ -e $(HOMEBREW_PREFIX)/include/omp.h ]; then echo 1; else echo 0; fi), 1) + # libomp used to be globally installed in Homebrew + $(info OMP source is Homebrew libomp global install) + OMP_PREFIXES:=$(OMP_PREFIXES);$(HOMEBREW_PREFIX) + else ifeq ($(shell if [ -d $(HOMEBREW_PREFIX)/opt/libomp/include ]; then echo 1; else echo 0; fi), 1) + # libomp moved to these directories, recently, because it is now keg-only to not fight GCC + $(info OMP source is Homebrew libomop keg) + CXXFLAGS += -I$(HOMEBREW_PREFIX)/opt/libomp/include + LDFLAGS += -L$(HOMEBREW_PREFIX)/opt/libomp/lib + OMP_PREFIXES:=$(OMP_PREFIXES);$(HOMEBREW_PREFIX)/opt/libomp + else ifeq ($(shell if [ -d /opt/local/lib/libomp ]; then echo 1; else echo 0; fi), 1) + # Macports installs libomp to /opt/local/lib/libomp + $(info OMP source Macports) CXXFLAGS += -I/opt/local/include/libomp + LDFLAGS += -L/opt/local/lib/libomp + OMP_PREFIXES:=$(OMP_PREFIXES);/opt/local + else + $(error OMP is not available from either Homebrew or Macports) endif # We also need to link it LD_LIB_FLAGS += -lomp - # And we need to find the includes. Homebrew puts them in the normal place but macports hides them in "libomp" - CXXFLAGS += -I/opt/local/include/libomp else + $(info Compiler $(CXX) is GCC) + # The compiler is (probably?) GNU GCC + # On Mac, we need to make sure to configure it to use libc++ like + # Clang, and not GNU libstdc++. + # Otherwise, we won't be able to use any C++ system libraries from + # Homebrew or Macports, which will be built against libc++. + + # See https://stackoverflow.com/q/22228208 + CXXFLAGS += -fopenmp + + # Find includes using Clang + LIBCXX_INCLUDES := $(shell clang++ -print-search-dirs | perl -ne 's{^libraries: =(.*)}{$$1/../../../} && print') + # Use them and libc++ and not the normal standard library + CXXFLAGS := -isystem $(LIBCXX_INCLUDES)/include/c++/v1 -nostdinc++ -nodefaultlibs -lc -lc++ -lc++abi -lgcc_s.1 -Wl,-no_compact_unwind $(CXXFLAGS) + + # Make sure to use the right libgomp to go with libomp + LD_LIB_FLAGS += -lomp -lgomp.1 endif + # We care about building only for the current machine. If we do something + # more restrictive we can have trouble inlining parts of the standard + # library that were built for something less restrictive. However, + # Apple Clang does not recognize -march=native on ARM. + ifeq ($(shell uname -m), x86_64) + CXXFLAGS += -march=native + endif + + # Note shared libraries are dylibs + SHARED_SUFFIX = dylib + # Define options to start static linking of libraries. + # We don't actually do any static linking on Mac, so we leave this empty. + START_STATIC = + END_STATIC = else # We are not running on OS X - # We can also have a normal Unix rpath - LD_LIB_FLAGS += -Wl,-rpath,$(CWD)/$(LIB_DIR) - # Make sure to allow backtrace access to all our symbols, even those which are not exported. - # Absolutely no help in a static build. - LD_LIB_FLAGS += -rdynamic + $(info OS is Linux) + $(info Compiler $(CXX) is assumed to be GCC) + + # Linux can have some old compilers so we want to work back to C++14 + CXX_STANDARD=14 + + # Set an rpath for vg and dependency utils to find installed libraries + LD_UTIL_RPATH_FLAGS="-Wl,-rpath,$(CWD)/$(LIB_DIR)" + LD_LIB_FLAGS += $(LD_UTIL_RPATH_FLAGS) + # Make sure to allow backtrace access to all our symbols, even those which are not exported. + # Absolutely no help in a static build. 
+ LD_LIB_FLAGS += -rdynamic - # We want to link against the elfutils libraries - LD_LIB_FLAGS += -ldwfl -ldw -ldwelf -lelf -lebl + # We want to link against the elfutils libraries + LD_LIB_FLAGS += -ldwfl -ldw -ldwelf -lelf -lebl # We get OpenMP the normal way, using whatever the compiler knows about CXXFLAGS += -fopenmp + + ifeq ($(shell arch), x86_64) + # We care about building for SSE4.2 only and not AVX, to have vaguely portable binaries + CXXFLAGS += -msse4.2 + endif + + # Note shared libraries are so files + SHARED_SUFFIX = so + # Define options to start static linking of libraries on GNU ld. + START_STATIC = -Wl,-Bstatic + # Note that END_STATIC is only safe to use in a mostly-dynamic build, and has to appear or we will try to statically link secret trailing libraries. + END_STATIC = -Wl,-Bdynamic + + +endif + +# Set the C++ standard we are using +CXXFLAGS := -std=c++$(CXX_STANDARD) $(CXXFLAGS) + +# Propagate CXXFLAGS and LDFLAGS to child makes and other build processes +export CXXFLAGS +$(info CXXFLAGS are $(CXXFLAGS)) +export LDFLAGS +$(info LDFLAGS are $(LDFLAGS)) + +OMP_MISSING=$(strip $(shell echo \\\#include \ | $(CXX) $(CXXFLAGS) -x c++ -E /dev/stdin -o /dev/null 2>&1 | head -n1 | grep error | wc -l)) +ifeq ($(OMP_MISSING), 1) + $(warning OpenMP header omp.h is not available! vg will not be able to build!) endif +# Actually set the Boost library option, with the determined suffix +LD_LIB_FLAGS += "-lboost_program_options$(BOOST_SUFFIX)" + # These libs need to come after libdw if used, because libdw depends on them -LD_LIB_FLAGS += -ldl -llzma +LD_LIB_FLAGS += -ldl -llzma -lbz2 -lzstd # Sometimes we need to filter the assembler output. The assembler can run during # ./configure scripts, compiler calls, or $(MAKE) calls (other than $(MAKE) @@ -96,7 +283,7 @@ LD_LIB_FLAGS += -ldl -llzma ifeq ($(shell uname -s),Darwin) # We need to apply a filter to all our build command output. This discards # all the assembler warnings which can overwhelm Travis log storage. - FILTER=2>&1 | python $(CWD)/scripts/filter-noisy-assembler-warnings.py + FILTER=2>&1 | python3 $(CWD)/scripts/filter-noisy-assembler-warnings.py # For the filter to work and not just swallow errors we also need to turn on # pipefail in the shell SHELL=/bin/bash -o pipefail @@ -105,46 +292,56 @@ else FILTER= endif -ROCKSDB_PORTABLE=PORTABLE=1 # needed to build rocksdb without weird assembler options -# TODO: configure RPATH-equivalent on OS X for finding libraries without environment variables at runtime - -# RocksDB's dependecies depend on whether certain compression libraries -# happen to be installed on the build system. Define a lazy macro to -# detect these from its self-configuration. It has to be lazy because -# the configuration (make_config.mk) won't exist until after RocksDB -# is built by this Makefile. -LD_LIB_FLAGS += -lrocksdb -ROCKSDB_LDFLAGS = $(shell grep PLATFORM_LDFLAGS deps/rocksdb/make_config.mk | cut -d '=' -f2 | sed s/-ljemalloc// | sed s/-ltcmalloc// | sed s/-ltbb//) - # When building statically, we need to tell the linker not to bail if it sees multiple definitions. -# libc on e.g. our Jenkins host does not define malloc as weak, so tcmalloc can't override it in a static build. +# libc on e.g. our Jenkins host does not define malloc as weak, so other mallocs can't override it in a static build. # TODO: Why did this problem only begin to happen when libvw was added? 
-STATIC_FLAGS=-static -static-libstdc++ -static-libgcc -Wl,--allow-multiple-definition +STATIC_FLAGS=-static -static-libstdc++ -static-libgcc -Wl,--allow-multiple-definition -# These are put into libvg. Grab everything except main. +# These are put into libvg. Grab everything except main OBJ = $(filter-out $(OBJ_DIR)/main.o,$(patsubst $(SRC_DIR)/%.cpp,$(OBJ_DIR)/%.o,$(wildcard $(SRC_DIR)/*.cpp))) +SHARED_OBJ = $(patsubst $(OBJ_DIR)/%.o,$(SHARED_OBJ_DIR)/%.o,$(OBJ)) + # And all the algorithms ALGORITHMS_OBJ = $(patsubst $(ALGORITHMS_SRC_DIR)/%.cpp,$(ALGORITHMS_OBJ_DIR)/%.o,$(wildcard $(ALGORITHMS_SRC_DIR)/*.cpp)) +ALGORITHMS_SHARED_OBJ = $(patsubst $(ALGORITHMS_OBJ_DIR)/%.o,$(ALGORITHMS_SHARED_OBJ_DIR)/%.o,$(ALGORITHMS_OBJ)) -# These aren't put into libvg. But they do go into the main vg binary to power its self-test. -UNITTEST_OBJ = $(patsubst $(UNITTEST_SRC_DIR)/%.cpp,$(UNITTEST_OBJ_DIR)/%.o,$(wildcard $(UNITTEST_SRC_DIR)/*.cpp)) +# These aren't put into libvg. They are linked into vg itself to communicate +# things about the platform. +# Config objects are built individually and conditionally; that's the point. +CONFIG_OBJ = + +# But always build all the IO logic +IO_OBJ = $(patsubst $(IO_SRC_DIR)/%.cpp,$(IO_OBJ_DIR)/%.o,$(wildcard $(IO_SRC_DIR)/*.cpp)) +IO_SHARED_OBJ = $(patsubst $(IO_OBJ_DIR)/%.o,$(IO_SHARED_OBJ_DIR)/%.o,$(IO_OBJ)) # These aren't put into libvg, but they provide subcommand implementations for the vg binary SUBCOMMAND_OBJ = $(patsubst $(SUBCOMMAND_SRC_DIR)/%.cpp,$(SUBCOMMAND_OBJ_DIR)/%.o,$(wildcard $(SUBCOMMAND_SRC_DIR)/*.cpp)) +# These aren't put into libvg. But they do go into the main vg binary to power its self-test. +UNITTEST_OBJ = $(patsubst $(UNITTEST_SRC_DIR)/%.cpp,$(UNITTEST_OBJ_DIR)/%.o,$(wildcard $(UNITTEST_SRC_DIR)/*.cpp)) + +# These support the tests. Some should go into the main vg binary but some should only go into test-suite binaries.
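The object lists above all follow one idiom: glob the sources with $(wildcard), map them to object paths with $(patsubst), and $(filter-out) anything (like main.o) that should not land in the library. A reduced sketch with generic directory names, not the ones the patch uses:

SRC_DIR := src
OBJ_DIR := obj
SHARED_OBJ_DIR := obj/pic

SRCS := $(wildcard $(SRC_DIR)/*.cpp)
# Everything except main.o goes into the library; main.o is linked only into the binary.
LIB_OBJS := $(filter-out $(OBJ_DIR)/main.o,$(patsubst $(SRC_DIR)/%.cpp,$(OBJ_DIR)/%.o,$(SRCS)))
# A parallel -fPIC object list for the shared-library build.
SHARED_OBJS := $(patsubst $(OBJ_DIR)/%.o,$(SHARED_OBJ_DIR)/%.o,$(LIB_OBJS))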
+UNITTEST_SUPPORT_OBJ = $(patsubst $(UNITTEST_SUPPORT_SRC_DIR)/%.cpp,$(UNITTEST_SUPPORT_OBJ_DIR)/%.o,$(wildcard $(UNITTEST_SUPPORT_SRC_DIR)/*.cpp)) + +# These are per-test-suite binaries we can build faster +UNITTEST_EXE = $(patsubst $(UNITTEST_SRC_DIR)/%.cpp,$(UNITTEST_BIN_DIR)/%,$(wildcard $(UNITTEST_SRC_DIR)/*.cpp)) + + RAPTOR_DIR:=deps/raptor -PROTOBUF_DIR:=deps/protobuf -GPERF_DIR:=deps/gperftools +JEMALLOC_DIR:=deps/jemalloc +LOCKFREE_MALLOC_DIR:=deps/lockfree-malloc SDSL_DIR:=deps/sdsl-lite SNAPPY_DIR:=deps/snappy -ROCKSDB_DIR:=deps/rocksdb GCSA2_DIR:=deps/gcsa2 GBWT_DIR:=deps/gbwt +GBWTGRAPH_DIR=deps/gbwtgraph +KFF_DIR=deps/kff-cpp-api PROGRESS_BAR_DIR:=deps/progress_bar FASTAHACK_DIR:=deps/fastahack FERMI_DIR:=deps/fermi-lite -HTSLIB_DIR:=deps/htslib VCFLIB_DIR:=deps/vcflib +TABIXPP_DIR:=deps/tabixpp +HTSLIB_DIR:=deps/htslib GSSW_DIR:=deps/gssw SPARSEHASH_DIR:=deps/sparsehash SPARSEPP_DIR:=deps/sparsepp @@ -154,101 +351,178 @@ SSW_DIR:=deps/ssw/src LINLS_DIR:=deps/sublinear-Li-Stephens STRUCTURES_DIR:=deps/structures BACKWARD_CPP_DIR:=deps/backward-cpp +DOZEU_DIR:=deps/dozeu ELFUTILS_DIR:=deps/elfutils -BOOST_DIR:=deps/boost-subset -VOWPALWABBIT_DIR:=deps/vowpal_wabbit LIBDEFLATE_DIR:=deps/libdeflate +LIBVGIO_DIR:=deps/libvgio +LIBHANDLEGRAPH_DIR:=deps/libhandlegraph +LIBBDSG_DIR:=deps/libbdsg +XG_DIR:=deps/xg +MMMULTIMAP_DIR=deps/mmmultimap +IPS4O_DIR=deps/ips4o +BBHASH_DIR=deps/BBHash +MIO_DIR=deps/mio +ATOMIC_QUEUE_DIR=deps/atomic_queue # Dependencies that go into libvg's archive # These go in libvg but come from dependencies DEP_OBJ = -DEP_OBJ += $(OBJ_DIR)/vg.pb.o DEP_OBJ += $(OBJ_DIR)/progress_bar.o DEP_OBJ += $(OBJ_DIR)/sha1.o DEP_OBJ += $(OBJ_DIR)/Fasta.o - +DEP_SHARED_OBJ = $(patsubst $(OBJ_DIR)/%.o,$(SHARED_OBJ_DIR)/%.o,$(DEP_OBJ)) # These are libraries that we need to build before we link vg. # It would be nice to dump their contents into libvg to make it stand-alone. # But that requires fancy ar scripting. # If you just pass them to ar it puts the library *file* in libvg where nothing can read it. LIB_DEPS = -LIB_DEPS += $(LIB_DIR)/libprotobuf.a LIB_DEPS += $(LIB_DIR)/libsdsl.a LIB_DEPS += $(LIB_DIR)/libssw.a LIB_DEPS += $(LIB_DIR)/libsnappy.a -LIB_DEPS += $(LIB_DIR)/librocksdb.a LIB_DEPS += $(LIB_DIR)/libgcsa2.a LIB_DEPS += $(LIB_DIR)/libgbwt.a +LIB_DEPS += $(LIB_DIR)/libgbwtgraph.a +LIB_DEPS += $(LIB_DIR)/libkff.a LIB_DEPS += $(LIB_DIR)/libhts.a +LIB_DEPS += $(LIB_DIR)/libtabixpp.a LIB_DEPS += $(LIB_DIR)/libvcflib.a LIB_DEPS += $(LIB_DIR)/libgssw.a LIB_DEPS += $(LIB_DIR)/libvcfh.a -LIB_DEPS += $(LIB_DIR)/libgfakluge.a LIB_DEPS += $(LIB_DIR)/libsonlib.a LIB_DEPS += $(LIB_DIR)/libpinchesandcacti.a LIB_DEPS += $(LIB_DIR)/libraptor2.a LIB_DEPS += $(LIB_DIR)/libfml.a LIB_DEPS += $(LIB_DIR)/libsublinearLS.a LIB_DEPS += $(LIB_DIR)/libstructures.a -LIB_DEPS += $(LIB_DIR)/libvw.a -LIB_DEPS += $(LIB_DIR)/liballreduce.a -LIB_DEPS += $(LIB_DIR)/libboost_program_options.a LIB_DEPS += $(LIB_DIR)/libdeflate.a +LIB_DEPS += $(LIB_DIR)/libvgio.a +LIB_DEPS += $(LIB_DIR)/libhandlegraph.a +LIB_DEPS += $(LIB_DIR)/libbdsg.a +LIB_DEPS += $(LIB_DIR)/libxg.a ifneq ($(shell uname -s),Darwin) - # On non-Mac (i.e. Linux), where ELF binaries are used, pull in libdw which - # backward-cpp will use. - LIB_DEPS += $(LIB_DIR)/libdw.a - LIB_DEPS += $(LIB_DIR)/libdwfl.a - LIB_DEPS += $(LIB_DIR)/libdwelf.a - LIB_DEPS += $(LIB_DIR)/libebl.a - LIB_DEPS += $(LIB_DIR)/libelf.a + # On non-Mac (i.e. Linux), where ELF binaries are used, pull in libdw which + # backward-cpp will use. 
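The LIB_DEPS comment above notes that folding the dependency archives into libvg.a would take "fancy ar scripting", because passing .a files straight to ar just embeds the archive files themselves where nothing can read them. For reference only, the usual trick is a GNU ar MRI script; vg does not currently do this, the library names below are examples, and BSD ar on Mac does not support -M at all:

# Merge the members of several static archives into one combined archive (GNU ar only).
ar -M <<'MRI'
create lib/libvg_combined.a
addlib lib/libvg.a
addlib lib/libsdsl.a
addlib lib/libgcsa2.a
save
end
MRI
ranlib lib/libvg_combined.a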
+ LIB_DEPS += $(LIB_DIR)/libdw.a + LIB_DEPS += $(LIB_DIR)/libdwfl.a + LIB_DEPS += $(LIB_DIR)/libdwelf.a + LIB_DEPS += $(LIB_DIR)/libebl.a + LIB_DEPS += $(LIB_DIR)/libelf.a +endif + +# Control variable for allocator +# On the command line, you can `make jemalloc=off` if you definitely don't want jemalloc. +jemalloc = on +ifeq ($(shell uname -s),Darwin) + jemalloc = off +endif + +# Only depend on these files for the final linking stage. +# These libraries provide no headers to affect the vg build. +LINK_DEPS = + +ifeq ($(jemalloc),on) + # Use jemalloc at link time + LINK_DEPS += $(LIB_DIR)/libjemalloc.a + # We have to use it statically or we can't get at its secret symbols. + LD_LIB_FLAGS += $(LIB_DIR)/libjemalloc.a + # Use the config object for jemalloc + CONFIG_OBJ += $(CONFIG_OBJ_DIR)/allocator_config_jemalloc.o +else + # Use the config object for the normal allocator + CONFIG_OBJ += $(CONFIG_OBJ_DIR)/allocator_config_system.o endif # common dependencies to build before all vg src files DEPS = $(LIB_DEPS) -DEPS += $(CPP_DIR)/vg.pb.h DEPS += $(INC_DIR)/gcsa/gcsa.h DEPS += $(INC_DIR)/gbwt/dynamic_gbwt.h +DEPS += $(INC_DIR)/gbwtgraph/gbwtgraph.h +DEPS += $(INC_DIR)/kff_io.hpp DEPS += $(INC_DIR)/lru_cache.h -DEPS += $(INC_DIR)/dynamic.hpp +DEPS += $(INC_DIR)/dynamic/dynamic.hpp DEPS += $(INC_DIR)/sparsehash/sparse_hash_map DEPS += $(INC_DIR)/sparsepp/spp.h -DEPS += $(INC_DIR)/gfakluge.hpp DEPS += $(INC_DIR)/sha1.hpp DEPS += $(INC_DIR)/progress_bar.hpp DEPS += $(INC_DIR)/backward.hpp +DEPS += $(INC_DIR)/dozeu/dozeu.h +DEPS += $(INC_DIR)/mmmultimap.hpp +DEPS += $(INC_DIR)/ips4o.hpp +DEPS += $(INC_DIR)/raptor2/raptor2.h +DEPS += $(INC_DIR)/BooPHF.h +DEPS += $(INC_DIR)/mio/mmap.hpp +DEPS += $(INC_DIR)/atomic_queue.h -ifneq ($(shell uname -s),Darwin) - DEPS += $(LIB_DIR)/libtcmalloc_minimal.a - LD_LIB_FLAGS += -ltcmalloc_minimal -endif - -.PHONY: clean get-deps deps test set-path static docs .pre-build .check-environment +.PHONY: clean clean-tests get-deps deps test set-path objs static static-docker docs man .pre-build .check-environment .check-git .no-git -$(BIN_DIR)/vg: $(OBJ_DIR)/main.o $(LIB_DIR)/libvg.a $(UNITTEST_OBJ) $(SUBCOMMAND_OBJ) $(DEPS) - . 
./source_me.sh && $(CXX) $(CXXFLAGS) -o $(BIN_DIR)/vg $(OBJ_DIR)/main.o $(UNITTEST_OBJ) $(SUBCOMMAND_OBJ) -lvg $(LD_INCLUDE_FLAGS) $(LD_LIB_FLAGS) $(ROCKSDB_LDFLAGS) +# Aggregate all libvg deps, and exe deps other than libvg +LIBVG_DEPS = $(OBJ) $(ALGORITHMS_OBJ) $(IO_OBJ) $(DEP_OBJ) $(DEPS) +LIBVG_SHARED_DEPS = $(SHARED_OBJ) $(ALGORITHMS_SHARED_OBJ) $(IO_SHARED_OBJ) $(DEP_SHARED_OBJ) $(DEPS) +EXE_DEPS = $(OBJ_DIR)/main.o $(UNITTEST_OBJ) $(SUBCOMMAND_OBJ) $(CONFIG_OBJ) $(DEPS) $(LINK_DEPS) -static: $(OBJ_DIR)/main.o $(LIB_DIR)/libvg.a $(OBJ_DIR)/main.o $(UNITTEST_OBJ) $(SUBCOMMAND_OBJ) - $(CXX) $(CXXFLAGS) -o $(BIN_DIR)/vg $(OBJ_DIR)/main.o $(UNITTEST_OBJ) $(SUBCOMMAND_OBJ) -lvg $(STATIC_FLAGS) $(LD_INCLUDE_FLAGS) $(LD_LIB_FLAGS) $(ROCKSDB_LDFLAGS) +# We have a target we can build to do everything but link the library and executable +objs: $(LIBVG_DEPS) $(EXE_DEPS) -$(LIB_DIR)/libvg.a: $(OBJ) $(ALGORITHMS_OBJ) $(DEP_OBJ) $(DEPS) +$(LIB_DIR)/libvg.a: $(LIBVG_DEPS) rm -f $@ - ar rs $@ $(OBJ) $(ALGORITHMS_OBJ) $(DEP_OBJ) + ar rs $@ $(OBJ) $(ALGORITHMS_OBJ) $(IO_OBJ) $(DEP_OBJ) + +$(LIB_DIR)/libvg.$(SHARED_SUFFIX): $(LIBVG_SHARED_DEPS) + rm -f $@ + $(CXX) -shared -o $@ $(SHARED_OBJ) $(ALGORITHMS_SHARED_OBJ) $(IO_SHARED_OBJ) $(DEP_SHARED_OBJ) $(LDFLAGS) $(LD_LIB_DIR_FLAGS) $(LD_LIB_FLAGS) $(LD_STATIC_LIB_FLAGS) $(LD_STATIC_LIB_DEPS) + +# Each test set can have its own binary, and not link everything static +$(UNITTEST_EXE): $(UNITTEST_BIN_DIR)/%: $(UNITTEST_OBJ_DIR)/%.o $(UNITTEST_SUPPORT_OBJ) $(CONFIG_OBJ) $(LIB_DIR)/libvg.$(SHARED_SUFFIX) + . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) -o $@ $< $(UNITTEST_SUPPORT_OBJ) $(CONFIG_OBJ) $(LIB_DIR)/libvg.$(SHARED_SUFFIX) $(LDFLAGS) $(LD_LIB_DIR_FLAGS) $(LD_LIB_FLAGS) $(LD_STATIC_LIB_FLAGS) $(LD_STATIC_LIB_DEPS) + +# For a normal dynamic build we remove the static build marker +$(BIN_DIR)/$(EXE): $(LIB_DIR)/libvg.a $(EXE_DEPS) + -rm -f $(LIB_DIR)/vg_is_static + . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) -o $(BIN_DIR)/$(EXE) $(OBJ_DIR)/main.o $(UNITTEST_OBJ) $(SUBCOMMAND_OBJ) $(CONFIG_OBJ) $(LDFLAGS) $(LIB_DIR)/libvg.a $(LD_LIB_DIR_FLAGS) $(LD_LIB_FLAGS) $(START_STATIC) $(LD_STATIC_LIB_FLAGS) $(END_STATIC) $(LD_STATIC_LIB_DEPS) +# We keep a file that we touch on the last static build. +# If the vg linkables are newer than the last static build, we do a build +$(LIB_DIR)/vg_is_static: $(INC_DIR)/vg_environment_version.hpp $(OBJ_DIR)/main.o $(LIB_DIR)/libvg.a $(UNITTEST_OBJ) $(SUBCOMMAND_OBJ) $(CONFIG_OBJ) $(DEPS) $(LINK_DEPS) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) -o $(BIN_DIR)/$(EXE) $(OBJ_DIR)/main.o $(UNITTEST_OBJ) $(SUBCOMMAND_OBJ) $(CONFIG_OBJ) $(LDFLAGS) $(LIB_DIR)/libvg.a $(STATIC_FLAGS) $(LD_LIB_DIR_FLAGS) $(LD_LIB_FLAGS) $(LD_STATIC_LIB_FLAGS) $(LD_STATIC_LIB_DEPS) + -touch $(LIB_DIR)/vg_is_static + +# We don't want to always rebuild the static vg if no files have changed. +# But we do need to rebuild it if files have changed. +# TODO: is there a way to query the mtimes of all the files and rebuild if they changed *or* vg isn't static? +# For now we link dynamically and then link statically, if we actually need to rebuild anything. +static: $(LIB_DIR)/vg_is_static + +# Make sure to strip out the symbols that make the binary 300 MB, but leave the +# symbols perf needs for profiling. +static-docker: static scripts/* + strip -d $(BIN_DIR)/$(EXE) + DOCKER_BUILDKIT=1 docker build . 
-f Dockerfile.static -t vg # We have system-level deps to install +# We want the One True Place for them to be in the Dockerfile. get-deps: - sudo apt-get install -qq -y protobuf-compiler libprotoc-dev libjansson-dev libbz2-dev libncurses5-dev automake libtool jq samtools curl unzip redland-utils librdf-dev cmake pkg-config wget bc gtk-doc-tools raptor2-utils rasqal-utils bison flex gawk libgoogle-perftools-dev liblz4-dev liblzma-dev libcairo2-dev libpixman-1-dev libffi-dev + sudo apt-get install -qq -y --no-upgrade $(shell cat Dockerfile | sed -n '/^###DEPS_BEGIN###/,$${p;/^###DEPS_END###/q}' | grep -v '^ *#' | grep -v "^RUN" | tr '\n' ' ' | tr -d '\\') # And we have submodule deps to build deps: $(DEPS) -test: $(BIN_DIR)/vg $(LIB_DIR)/libvg.a test/build_graph $(BIN_DIR)/shuf $(VCFLIB_DIR)/bin/vcf2tsv $(FASTAHACK_DIR)/fastahack +test: $(BIN_DIR)/$(EXE) $(LIB_DIR)/libvg.a test/build_graph $(BIN_DIR)/shuf $(BIN_DIR)/vcf2tsv $(FASTAHACK_DIR)/fastahack $(BIN_DIR)/rapper . ./source_me.sh && cd test && prove -v t + . ./source_me.sh && doc/test-docs.sh -docs: $(SRC_DIR)/*.cpp $(SRC_DIR)/*.hpp $(SUBCOMMAND_SRC_DIR)/*.cpp $(SUBCOMMAND_SRC_DIR)/*.hpp $(UNITTEST_SRC_DIR)/*.cpp $(UNITTEST_SRC_DIR)/*.hpp $(CPP_DIR)/vg.pb.cc +# Somebody has been polluting the test directory with temporary files that are not deleted after the tests. +# To make git status more useful, we delete everything that looks like a temporary file. +clean-test: + cd test && rm -rf tmp && mkdir tmp && mv 2_2.mat build_graph.cpp default.mat tmp && rm -f *.* && mv tmp/* . && rmdir tmp + +docs: $(SRC_DIR)/*.cpp $(SRC_DIR)/*.hpp $(ALGORITHMS_SRC_DIR)/*.cpp $(ALGORITHMS_SRC_DIR)/*.hpp $(SUBCOMMAND_SRC_DIR)/*.cpp $(SUBCOMMAND_SRC_DIR)/*.hpp $(UNITTEST_SRC_DIR)/*.cpp $(UNITTEST_SRC_DIR)/*.hpp $(UNITTEST_SUPPORT_SRC_DIR)/*.cpp doxygen echo "View documentation at: file://$(PWD)/doc/doxygen/index.html" + +man: $(patsubst doc/asciidoc/man/%.adoc,doc/man/%.1,$(wildcard doc/asciidoc/man/*.adoc)) + +doc/man/%.1: doc/asciidoc/man/%.adoc + asciidoctor -b manpage -d manpage -o $@ $< # Hack to use gshuf or shuf as appropriate to the platform when testing $(BIN_DIR)/shuf: @@ -258,153 +532,229 @@ else ln -s `which shuf` $(BIN_DIR)/shuf endif -# Make sure we have protoc built, and the protobuf lib, both of which come from the same command using this fake intermediate -bin/protoc: .rebuild-protobuf -$(LIB_DIR)/libprotobuf.a: .rebuild-protobuf - # intermediate targets don't trigger a rebuild just because they're missing. -.INTERMEDIATE: .rebuild-protobuf - # Make sure to delete outdated libs and headers before rebuilding - # Outdated headers can get picked up during the build -.rebuild-protobuf: deps/protobuf/src/google/protobuf/*.cc - rm -rf $(LIB_DIR)/libprotobuf* $(LIB_DIR)/libprotoc* - rm -Rf include/google/protobuf/ - +. ./source_me.sh && cd $(PROTOBUF_DIR) && ./autogen.sh && export DIST_LANG=cpp && ./configure --prefix="$(CWD)" $(FILTER) && $(MAKE) $(FILTER) && $(MAKE) install && export PATH=$(CWD)/bin:$$PATH - -test/build_graph: test/build_graph.cpp $(LIB_DIR)/libvg.a $(CPP_DIR)/vg.pb.h $(SRC_DIR)/json2pb.h $(SRC_DIR)/vg.hpp - . 
./source_me.sh && $(CXX) $(CXXFLAGS) -o test/build_graph test/build_graph.cpp $(LD_INCLUDE_FLAGS) -lvg $(LD_LIB_FLAGS) $(ROCKSDB_LDFLAGS) $(FILTER) - -# remove annoying large alloc messages from tcmalloc -$(GPERF_DIR)/src/tcmalloc.cc.bak: - cp $(GPERF_DIR)/src/tcmalloc.cc $(GPERF_DIR)/src/tcmalloc.cc.bak - sed 's/printer.printf("tcmalloc: large alloc/return; printer.printf("tcmalloc: large alloc/' $(GPERF_DIR)/src/tcmalloc.cc.bak >$(GPERF_DIR)/src/tcmalloc.cc - -$(LIB_DIR)/libtcmalloc_minimal.a: $(GPERF_DIR)/src/tcmalloc.cc.bak - +. ./source_me.sh && cd $(GPERF_DIR) && ./autogen.sh && ./configure --prefix=`pwd` $(FILTER) && $(MAKE) $(FILTER) && $(MAKE) install && cp -r lib/* $(CWD)/$(LIB_DIR)/ && cp -r bin/* $(CWD)/$(BIN_DIR)/ && cp -r include/* $(CWD)/$(INC_DIR)/ - -$(LIB_DIR)/libsdsl.a: $(SDSL_DIR)/lib/*.cpp $(SDSL_DIR)/include/sdsl/*.hpp +test/build_graph: test/build_graph.cpp $(LIB_DIR)/libvg.a $(SRC_DIR)/vg.hpp + . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) -o test/build_graph test/build_graph.cpp $(LDFLAGS) $(LIB_DIR)/libvg.a $(LD_LIB_DIR_FLAGS) $(LD_LIB_FLAGS) $(START_STATIC) $(LD_STATIC_LIB_FLAGS) $(END_STATIC) $(FILTER) + +$(LIB_DIR)/libjemalloc.a: $(JEMALLOC_DIR)/src/*.c + +. ./source_me.sh && rm -Rf $(CWD)/$(INC_DIR)/jemalloc && cd $(JEMALLOC_DIR) && ./autogen.sh && ./configure --disable-libdl --prefix=`pwd` $(FILTER) && $(MAKE) clean && $(MAKE) $(FILTER) && cp -r lib/* $(CWD)/$(LIB_DIR)/ && cp -r include/* $(CWD)/$(INC_DIR)/ + +# Use fake patterns to tell Make that this rule generates all these files when run once. +# Here % should always match "lib" which is a common substring. +# See https://stackoverflow.com/a/19822767 +$(LIB_DIR)/%sdsl.a $(LIB_DIR)/%divsufsort.a $(LIB_DIR)/%divsufsort64.a : $(SDSL_DIR)/lib/*.cpp $(SDSL_DIR)/include/sdsl/*.hpp ifeq ($(shell uname -s),Darwin) - +. ./source_me.sh && cd $(SDSL_DIR) && AS_INTEGRATED_ASSEMBLER=1 BUILD_PORTABLE=1 ./install.sh $(CWD) $(FILTER) + +. ./source_me.sh && cd $(SDSL_DIR) && AS_INTEGRATED_ASSEMBLER=1 BUILD_PORTABLE=1 CXXFLAGS="$(CPPFLAGS) $(CXXFLAGS)" ./install.sh $(CWD) $(FILTER) else - +. ./source_me.sh && cd $(SDSL_DIR) && BUILD_PORTABLE=1 ./install.sh $(CWD) $(FILTER) -endif - + +. ./source_me.sh && cd $(SDSL_DIR) && BUILD_PORTABLE=1 CXXFLAGS="$(CPPFLAGS) $(CXXFLAGS)" ./install.sh $(CWD) $(FILTER) +endif -$(LIB_DIR)/libssw.a: $(SSW_DIR)/*.c $(SSW_DIR)/*.h - +. ./source_me.sh && cd $(SSW_DIR) && $(MAKE) $(FILTER) && ar rs $(CWD)/$(LIB_DIR)/libssw.a ssw.o ssw_cpp.o && cp ssw_cpp.h ssw.h $(CWD)/$(LIB_DIR) +$(LIB_DIR)/libssw.a: $(SSW_DIR)/*.c $(SSW_DIR)/*.cpp $(SSW_DIR)/*.h + +. ./source_me.sh && cd $(SSW_DIR) && $(MAKE) $(FILTER) && ar rs $(CWD)/$(LIB_DIR)/libssw.a ssw.o ssw_cpp.o && cp ssw_cpp.h ssw.h $(CWD)/$(INC_DIR) +# We need to hide -Xpreprocessor -fopenmp from Snappy, at least on Mac, because +# it will drop the -Xpreprocessor and keep the -fopenmp and upset Clang. $(LIB_DIR)/libsnappy.a: $(SNAPPY_DIR)/*.cc $(SNAPPY_DIR)/*.h - +. ./source_me.sh && cd $(SNAPPY_DIR) && ./autogen.sh && ./configure --prefix=$(CWD) $(FILTER) && $(MAKE) libsnappy.la $(FILTER) && cp .libs/libsnappy.a $(CWD)/lib/ && cp snappy-c.h snappy-sinksource.h snappy-stubs-public.h snappy.h $(CWD)/include/ - -$(LIB_DIR)/librocksdb.a: $(LIB_DIR)/libtcmalloc_minimal.a $(LIB_DIR)/libsnappy.a $(ROCKSDB_DIR)/db/*.cc $(ROCKSDB_DIR)/db/*.h - +. ./source_me.sh && cd $(ROCKSDB_DIR) && $(ROCKSDB_PORTABLE) DISABLE_JEMALLOC=1 $(MAKE) static_lib $(FILTER) && mv librocksdb.a $(CWD)/${LIB_DIR}/ && cp -r include/* $(CWD)/$(INC_DIR)/ + +. 
./source_me.sh && cd $(SNAPPY_DIR) && ./autogen.sh && CXXFLAGS="$(filter-out -Xpreprocessor -fopenmp,$(CXXFLAGS))" ./configure --prefix=$(CWD) $(FILTER) && CXXFLAGS="$(filter-out -Xpreprocessor -fopenmp,$(CXXFLAGS))" $(MAKE) libsnappy.la $(FILTER) && cp .libs/libsnappy.a $(CWD)/lib/ && cp snappy-c.h snappy-sinksource.h snappy-stubs-public.h snappy.h $(CWD)/include/ $(INC_DIR)/gcsa/gcsa.h: $(LIB_DIR)/libgcsa2.a -$(LIB_DIR)/libgcsa2.a: $(LIB_DIR)/libsdsl.a $(wildcard $(GCSA2_DIR)/*.cpp) $(wildcard $(GCSA2_DIR)/include/gcsa/*.h) +$(LIB_DIR)/libgcsa2.a: $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a $(wildcard $(GCSA2_DIR)/*.cpp) $(wildcard $(GCSA2_DIR)/include/gcsa/*.h) ifeq ($(shell uname -s),Darwin) - +. ./source_me.sh && cd $(GCSA2_DIR) && AS_INTEGRATED_ASSEMBLER=1 $(MAKE) libgcsa2.a $(FILTER) && mv libgcsa2.a $(CWD)/$(LIB_DIR) && cp -r include/gcsa $(CWD)/$(INC_DIR)/ + +. ./source_me.sh && cp -r $(GCSA2_DIR)/include/gcsa $(CWD)/$(INC_DIR)/ && cd $(GCSA2_DIR) && make directories && AS_INTEGRATED_ASSEMBLER=1 $(MAKE) lib/libgcsa2.a $(FILTER) && mv lib/libgcsa2.a $(CWD)/$(LIB_DIR) else - +. ./source_me.sh && cd $(GCSA2_DIR) && $(MAKE) libgcsa2.a $(FILTER) && mv libgcsa2.a $(CWD)/$(LIB_DIR) && cp -r include/gcsa $(CWD)/$(INC_DIR)/ + +. ./source_me.sh && cp -r $(GCSA2_DIR)/include/gcsa $(CWD)/$(INC_DIR)/ && cd $(GCSA2_DIR) && make directories && $(MAKE) lib/libgcsa2.a $(FILTER) && mv lib/libgcsa2.a $(CWD)/$(LIB_DIR) endif $(INC_DIR)/gbwt/dynamic_gbwt.h: $(LIB_DIR)/libgbwt.a -$(LIB_DIR)/libgbwt.a: $(LIB_DIR)/libsdsl.a $(wildcard $(GBWT_DIR)/*.cpp) $(wildcard $(GBWT_DIR)/include/gbwt/*.h) + +$(LIB_DIR)/libgbwt.a: $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a $(wildcard $(GBWT_DIR)/src/*.cpp) $(wildcard $(GBWT_DIR)/include/gbwt/*.h) ifeq ($(shell uname -s),Darwin) - +. ./source_me.sh && cd $(GBWT_DIR) && AS_INTEGRATED_ASSEMBLER=1 $(MAKE) libgbwt.a build_gbwt $(FILTER) && mv libgbwt.a $(CWD)/$(LIB_DIR) && cp -r include/gbwt $(CWD)/$(INC_DIR)/ + +. ./source_me.sh && cp -r $(GBWT_DIR)/include/gbwt $(CWD)/$(INC_DIR)/ && cd $(GBWT_DIR) && $(MAKE) clean && AS_INTEGRATED_ASSEMBLER=1 $(MAKE) $(FILTER) && mv lib/libgbwt.a $(CWD)/$(LIB_DIR) else - +. ./source_me.sh && cd $(GBWT_DIR) && $(MAKE) libgbwt.a build_gbwt $(FILTER) && mv libgbwt.a $(CWD)/$(LIB_DIR) && cp -r include/gbwt $(CWD)/$(INC_DIR)/ + +. ./source_me.sh && cp -r $(GBWT_DIR)/include/gbwt $(CWD)/$(INC_DIR)/ && cd $(GBWT_DIR) && $(MAKE) clean && $(MAKE) $(FILTER) && mv lib/libgbwt.a $(CWD)/$(LIB_DIR) endif -$(INC_DIR)/progress_bar.hpp: $(PROGRESS_BAR_DIR)/progress_bar.hpp - +cp $(PROGRESS_BAR_DIR)/progress_bar.hpp $(CWD)/$(INC_DIR) +$(INC_DIR)/gbwtgraph/gbwtgraph.h: $(LIB_DIR)/libgbwtgraph.a -$(OBJ_DIR)/progress_bar.o: $(PROGRESS_BAR_DIR)/*.hpp $(PROGRESS_BAR_DIR)/*.cpp - +cd $(PROGRESS_BAR_DIR) && $(MAKE) $(FILTER) && cp progress_bar.o $(CWD)/$(OBJ_DIR) +$(LIB_DIR)/libgbwtgraph.a: $(LIB_DIR)/libgbwt.a $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a $(LIB_DIR)/libhandlegraph.a $(wildcard $(GBWTGRAPH_DIR)/src/*.cpp) $(wildcard $(GBWTGRAPH_DIR)/include/gbwtgraph/*.h) +ifeq ($(shell uname -s),Darwin) + +. ./source_me.sh && cp -r $(GBWTGRAPH_DIR)/include/gbwtgraph $(CWD)/$(INC_DIR)/ && cd $(GBWTGRAPH_DIR) && $(MAKE) clean && AS_INTEGRATED_ASSEMBLER=1 $(MAKE) $(FILTER) && mv lib/libgbwtgraph.a $(CWD)/$(LIB_DIR) +else + +. 
./source_me.sh && cp -r $(GBWTGRAPH_DIR)/include/gbwtgraph $(CWD)/$(INC_DIR)/ && cd $(GBWTGRAPH_DIR) && $(MAKE) clean && $(MAKE) $(FILTER) && mv lib/libgbwtgraph.a $(CWD)/$(LIB_DIR) +endif -$(OBJ_DIR)/Fasta.o: $(FASTAHACK_DIR)/*.h $(FASTAHACK_DIR)/*.cpp - +cd $(FASTAHACK_DIR) && $(MAKE) $(FILTER) && mv Fasta.o $(CWD)/$(OBJ_DIR) && cp Fasta.h $(CWD)/$(INC_DIR) +$(INC_DIR)/kff_io.hpp: $(LIB_DIR)/libkff.a -$(LIB_DIR)/libdeflate.a: $(LIBDEFLATE_DIR)/*.h $(LIBDEFLATE_DIR)/lib/*.h $(LIBDEFLATE_DIR)/lib/*/*.h $(LIBDEFLATE_DIR)/lib/*.c $(LIBDEFLATE_DIR)/lib/*/*.c - +cd $(LIBDEFLATE_DIR) && $(MAKE) $(FILTER) && cp libdeflate.a $(CWD)/$(LIB_DIR) && cp libdeflate.h $(CWD)/$(INC_DIR) +$(LIB_DIR)/libkff.a: $(KFF_DIR)/kff_io.cpp $(KFF_DIR)/kff_io.hpp.in +ifeq ($(shell uname -s),Darwin) + +. ./source_me.sh && cd $(KFF_DIR) && rm -Rf build && mkdir build && cd build && cmake .. && AS_INTEGRATED_ASSEMBLER=1 $(MAKE) $(FILTER) && cp kff_io.hpp $(CWD)/$(INC_DIR) && mv libkff.a $(CWD)/$(LIB_DIR) +else + +. ./source_me.sh && cd $(KFF_DIR) && rm -Rf build && mkdir build && cd build && cmake .. && $(MAKE) $(FILTER) && cp kff_io.hpp $(CWD)/$(INC_DIR) && mv libkff.a $(CWD)/$(LIB_DIR) +endif -# We clear out the include/htslib/* headers because they may be cached and out of date and confuse the build. -# Also we build after libdeflate so it can be used -$(LIB_DIR)/libhts.a: $(LIB_DIR)/libdeflate.a $(HTSLIB_DIR)/*.c $(HTSLIB_DIR)/*.h $(HTSLIB_DIR)/htslib/*.h $(HTSLIB_DIR)/cram/*.c $(HTSLIB_DIR)/cram/*.h - +rm -Rf $(CWD)/$(INC_DIR)/htslib $(CWD)/$(LIB_DIR)/libhts.a - +cd $(HTSLIB_DIR) && autoheader && autoconf && CFLAGS="-I$(CWD)/$(INC_DIR)" LDFLAGS="-L$(CWD)/$(LIB_DIR)" ./configure --with-libdeflate --disable-s3 --disable-gcs --disable-libcurl --disable-plugins $(FILTER) && $(MAKE) lib-static $(FILTER) && cp libhts.a $(CWD)/$(LIB_DIR) && cp *.h $(CWD)/$(INC_DIR) && cp -r htslib $(CWD)/$(INC_DIR)/ +$(INC_DIR)/BooPHF.h: $(BBHASH_DIR)/BooPHF.h + +cp $(BBHASH_DIR)/BooPHF.h $(CWD)/$(INC_DIR) -# We tell the vcflib build to use our own htslib -$(LIB_DIR)/libvcflib.a: $(LIB_DIR)/libhts.a $(VCFLIB_DIR)/src/*.cpp $(VCFLIB_DIR)/src/*.hpp $(VCFLIB_DIR)/intervaltree/*.cpp $(VCFLIB_DIR)/intervaltree/*.h $(VCFLIB_DIR)/tabixpp/*.cpp $(VCFLIB_DIR)/tabixpp/*.hpp - +. ./source_me.sh && cd $(VCFLIB_DIR) && HTS_LIB="$(CWD)/$(LIB_DIR)/libhts.a" HTS_INCLUDES="-I$(CWD)/$(INC_DIR)" HTS_LDFLAGS="-L$(CWD)/$(LIB_DIR) -lhts -lpthread -lm -lbz2 -llzma -lz -ldeflate" $(MAKE) libvcflib.a $(FILTER) && cp lib/* $(CWD)/$(LIB_DIR)/ && cp include/* $(CWD)/$(INC_DIR)/ && cp intervaltree/*.h $(CWD)/$(INC_DIR)/ && cp src/*.h* $(CWD)/$(INC_DIR)/ - -$(VCFLIB_DIR)/bin/vcf2tsv: $(VCFLIB_DIR)/src/*.cpp $(VCFLIB_DIR)/src/*.h $(LIB_DIR)/libvcflib.a - +. ./source_me.sh && cd $(VCFLIB_DIR) && HTS_LIB="$(CWD)/$(LIB_DIR)/libhts.a" HTS_INCLUDES="-I$(CWD)/$(INC_DIR)" HTS_LDFLAGS="-L$(CWD)/$(LIB_DIR) -lhts -lpthread -lm -lbz2 -llzma -lz -ldeflate" $(MAKE) vcf2tsv $(FILTER) +$(INC_DIR)/progress_bar.hpp: $(PROGRESS_BAR_DIR)/progress_bar.hpp + +cp $(PROGRESS_BAR_DIR)/progress_bar.hpp $(CWD)/$(INC_DIR) + +$(OBJ_DIR)/progress_bar.o: $(PROGRESS_BAR_DIR)/progress_bar.cpp $(PROGRESS_BAR_DIR)/*.hpp + +. ./source_me.sh && $(CXX) -I$(FASTAHACK_DIR) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -c -o $@ $< +$(SHARED_OBJ_DIR)/progress_bar.o: $(PROGRESS_BAR_DIR)/progress_bar.cpp $(PROGRESS_BAR_DIR)/*.hpp + +. ./source_me.sh && $(CXX) -I$(FASTAHACK_DIR) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -fPIC -c -o $@ $< + +$(INC_DIR)/Fasta.h: $(FASTAHACK_DIR)/Fasta.h + +. 
./source_me.sh && cd $(FASTAHACK_DIR) && cp Fasta.h $(CWD)/$(INC_DIR) + +$(OBJ_DIR)/Fasta.o: $(FASTAHACK_DIR)/Fasta.cpp $(INC_DIR)/Fasta.h $(FASTAHACK_DIR)/fastahack + +. ./source_me.sh && $(CXX) -I$(FASTAHACK_DIR) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -c -o $@ $< $(FILTER) +$(SHARED_OBJ_DIR)/Fasta.o: $(FASTAHACK_DIR)/Fasta.cpp $(INC_DIR)/Fasta.h $(FASTAHACK_DIR)/fastahack + +. ./source_me.sh && $(CXX) -I$(FASTAHACK_DIR) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -fPIC -c -o $@ $< $(FILTER) + +# We have this target to clean up the old Protobuf we used to have. +# We can remove it after we no longer care about building properly on a dirty +# build from vg versions that shipped Protobuf themselves. +$(LIB_DIR)/cleaned_old_protobuf_v003: $(wildcard $(LIB_DIR)/libproto*) $(wildcard $(LIB_DIR)/pkgconfig/protobuf*) + +rm -f $(LIB_DIR)/cleaned_old_protobuf* + +rm -f $(LIB_DIR)/libproto* $(LIB_DIR)/pkgconfig/protobuf* $(BIN_DIR)/protoc + +rm -Rf $(INC_DIR)/google/protobuf deps/protobuf + +touch $(LIB_DIR)/cleaned_old_protobuf_v003 + +# We used to ship our own version of boost, but now we use the system version instead. +$(LIB_DIR)/cleaned_old_boost: $(wildcard $(LIB_DIR)/libboost_*) $(wildcard $(INC_DIR)/boost/*) + +rm -f $(LIB_DIR)/libboost_* + +rm -Rf $(INC_DIR)/boost + +touch $(LIB_DIR)/cleaned_old_boost + +# We used to build elfutils with libdebuginfod, but we now need to build +# without it. +$(LIB_DIR)/cleaned_old_elfutils: + +rm -f $(LIB_DIR)/libelf.a $(LIB_DIR)/libebl.a $(LIB_DIR)/libdwfl.a $(LIB_DIR)/libdwelf.a $(LIB_DIR)/libdw.a + +touch $(LIB_DIR)/cleaned_old_elfutils + +$(LIB_DIR)/libvgio.a: $(LIB_DIR)/libhts.a $(LIB_DIR)/pkgconfig/htslib.pc $(LIB_DIR)/cleaned_old_protobuf_v003 $(LIBVGIO_DIR)/CMakeLists.txt $(LIBVGIO_DIR)/src/*.cpp $(LIBVGIO_DIR)/include/vg/io/*.hpp $(LIBVGIO_DIR)/deps/vg.proto + +rm -f $(CWD)/$(INC_DIR)/vg.pb.h $(CWD)/$(INC_DIR)/vg/vg.pb.h + +rm -Rf $(CWD)/$(INC_DIR)/vg/io/ + +. ./source_me.sh && export CXXFLAGS="$(CPPFLAGS) $(CXXFLAGS)" && export LDFLAGS="$(LDFLAGS) $(LD_LIB_DIR_FLAGS)" && cd $(LIBVGIO_DIR) && rm -Rf CMakeCache.txt CMakeFiles *.cmake install_manifest.txt *.pb.cc *.pb.h *.a && rm -rf build-vg && mkdir build-vg && cd build-vg && PKG_CONFIG_PATH=$(CWD)/$(LIB_DIR)/pkgconfig:$(PKG_CONFIG_PATH) cmake -DCMAKE_CXX_STANDARD=$(CXX_STANDARD) -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_PREFIX_PATH=$(CWD) -DCMAKE_LIBRARY_PATH=$(CWD)/$(LIB_DIR) -DCMAKE_INSTALL_PREFIX=$(CWD) -DCMAKE_INSTALL_LIBDIR=lib .. $(FILTER) && $(MAKE) clean && VERBOSE=1 $(MAKE) $(FILTER) && $(MAKE) install + +$(LIB_DIR)/libhandlegraph.a: $(LIBHANDLEGRAPH_DIR)/src/include/handlegraph/*.hpp $(LIBHANDLEGRAPH_DIR)/src/*.cpp + +. ./source_me.sh && cd $(LIBHANDLEGRAPH_DIR) && rm -Rf build CMakeCache.txt CMakeFiles && mkdir build && cd build && CXXFLAGS="$(CXXFLAGS) $(CPPFLAGS)" cmake -DCMAKE_VERBOSE_MAKEFILE=ON .. && $(MAKE) $(FILTER) && cp libhandlegraph.a $(CWD)/$(LIB_DIR) && cp -r ../src/include/handlegraph $(CWD)/$(INC_DIR) + + +# On Linux, libdeflate builds a .so. +# On Mac, it *still* builds an so, which is just a dylib with .so extension. +# On Mac we need to make sure to set the install name. We do that by renaming to dylib. +# We don't just leave it as .so because we need to deal with outdated .so files with no paths set. 
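The cleaned_old_* targets above share a pattern: delete artifacts left behind by an older build layout, then touch a stamp file so the cleanup runs only once, and make the rules that would be confused by the stale files depend on the stamp. A generic sketch of the pattern, with hypothetical file names:

# Clean up artifacts from an older build layout exactly once.
# The $(wildcard) prerequisite makes the stamp go stale again if the old files reappear.
$(LIB_DIR)/cleaned_old_example: $(wildcard $(LIB_DIR)/libexample_old*)
	rm -f $(LIB_DIR)/libexample_old*
	rm -Rf $(INC_DIR)/example_old
	touch $(LIB_DIR)/cleaned_old_example

# Anything that must not see the stale files depends on the stamp.
$(LIB_DIR)/libexample.a: $(LIB_DIR)/cleaned_old_example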
+$(LIB_DIR)/libdeflate.$(SHARED_SUFFIX): $(LIB_DIR)/libdeflate.a + +cd $(LIBDEFLATE_DIR) && cp libdeflate.so $(CWD)/$(LIB_DIR) + +touch $(CWD)/$(LIB_DIR)/libdeflate.so +ifeq ($(shell uname -s),Darwin) + +mv $(LIB_DIR)/libdeflate.so $(LIB_DIR)/libdeflate.$(SHARED_SUFFIX) + +install_name_tool -id $(CWD)/$(LIB_DIR)/libdeflate.$(SHARED_SUFFIX) $(LIB_DIR)/libdeflate.$(SHARED_SUFFIX) +endif + +$(LIB_DIR)/libdeflate.a: $(LIBDEFLATE_DIR)/*.h $(LIBDEFLATE_DIR)/lib/*.h $(LIBDEFLATE_DIR)/lib/*/*.h $(LIBDEFLATE_DIR)/lib/*.c $(LIBDEFLATE_DIR)/lib/*/*.c + +. ./source_me.sh && cd $(LIBDEFLATE_DIR) && V=1 $(MAKE) $(FILTER) && cp libdeflate.a $(CWD)/$(LIB_DIR) && cp libdeflate.h $(CWD)/$(INC_DIR) + +# We build htslib after libdeflate so it can use libdeflate. +# We need to do some wizardry to get it to pick up the right build and target system types on modern autotools. +# We have to do a full build in order to install, to get the pkg-config file so libvgio can link against it. +# We also have to have the shared libdeflate or we will get complaints that the static one is not position independent. +# If we need either the library or the pkg-config file (which we didn't used to ship), run the whole build. +# We use a wildcard match to make sure make understands that both files come from one command run. +# See https://stackoverflow.com/a/3077254 +# We also need to make sure that htslib searches itself before system paths, as +# a system path, in case another htslib is installed on the system. Some HTSlib +# headers look for the current HTSlib with <>. +$(LIB_DIR)/libhts%a $(LIB_DIR)/pkgconfig/htslib%pc: $(LIB_DIR)/libdeflate.a $(LIB_DIR)/libdeflate.$(SHARED_SUFFIX) $(HTSLIB_DIR)/*.c $(HTSLIB_DIR)/*.h $(HTSLIB_DIR)/htslib/*.h $(HTSLIB_DIR)/cram/*.c $(HTSLIB_DIR)/cram/*.h + +. ./source_me.sh && cd $(HTSLIB_DIR) && rm -Rf $(CWD)/$(INC_DIR)/htslib $(CWD)/$(LIB_DIR)/libhts* && autoreconf -i && autoheader && autoconf || true + +. ./source_me.sh && cd $(HTSLIB_DIR) && (./configure -n 2>&1 || true) | grep "build system type" | rev | cut -f1 -d' ' | rev >systype.txt + +. ./source_me.sh && cd $(HTSLIB_DIR) && CFLAGS="-I$(CWD)/$(HTSLIB_DIR) -isystem $(CWD)/$(HTSLIB_DIR) -I$(CWD)/$(INC_DIR) $(CFLAGS)" LDFLAGS="$(LDFLAGS) -L$(CWD)/$(LIB_DIR) $(LD_UTIL_RPATH_FLAGS)" ./configure --with-libdeflate --disable-s3 --disable-gcs --disable-libcurl --disable-plugins --prefix=$(CWD) --host=$$(cat systype.txt) $(FILTER) && $(MAKE) clean && $(MAKE) $(FILTER) && $(MAKE) install + +# Build and install tabixpp for vcflib. +$(LIB_DIR)/libtabixpp.a: $(LIB_DIR)/libhts.a $(TABIXPP_DIR)/*.cpp $(TABIXPP_DIR)/*.hpp + +. ./source_me.sh && cd $(TABIXPP_DIR) && rm -f tabix.o libtabixpp.a && INCLUDES="-I$(CWD)/$(INC_DIR)" HTS_HEADERS="" $(MAKE) tabix.o $(FILTER) && ar rcs libtabixpp.a tabix.o + +cp $(TABIXPP_DIR)/libtabixpp.a $(LIB_DIR) && cp $(TABIXPP_DIR)/tabix.hpp $(INC_DIR) + +echo "Name: tabixpp" > $(LIB_DIR)/pkgconfig/tabixpp.pc + +echo "Description: Self-packaged tabixpp" >> $(LIB_DIR)/pkgconfig/tabixpp.pc + +echo "Version: 1.0" >> $(LIB_DIR)/pkgconfig/tabixpp.pc + +echo "Cflags: -I$(CWD)/$(INC_DIR)" >> $(LIB_DIR)/pkgconfig/tabixpp.pc + +echo "Libs: -L$(CWD)/$(LIB_DIR) -ltabixpp" >> $(LIB_DIR)/pkgconfig/tabixpp.pc + +# Build vcflib. Install the library and headers but not binaries or man pages. +# We need to build as RelWithDebInfo to avoid vcflib using its own +# -march=native, which would conflict with the -march that comes in through +# CXXFLAGS from the vg Dockerfile. 
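The sdsl/divsufsort and htslib rules above lean on the same trick their comments reference: when one command produces several files, writing the targets as a pattern rule with a shared stem tells Make that a single run of the recipe yields all of them, instead of running it once per target. A cut-down sketch with a made-up "foo" library; GNU Make 4.3+ can state the same thing directly with grouped "&:" targets:

# "lib/libfoo%a" and "lib/pkgconfig/foo%pc" share the stem ".", so Make treats
# both files as outputs of one run of this recipe.
lib/libfoo%a lib/pkgconfig/foo%pc: foo/configure foo/src/*.c
	cd foo && ./configure --prefix=$(CURDIR) && $(MAKE) && $(MAKE) install

# Equivalent grouped-target form on GNU Make 4.3 or newer:
# lib/libfoo.a lib/pkgconfig/foo.pc &: foo/configure foo/src/*.c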
+# We also need to use the magic path hint to let CMake find Mac OpenMP. +# We need to use /usr first for CMake search or Ubuntu 22.04 will decide pybind11 is installed in / when actually it is only fully installed in /usr. +$(LIB_DIR)/libvcflib.a: $(LIB_DIR)/libhts.a $(LIB_DIR)/libtabixpp.a $(VCFLIB_DIR)/src/*.cpp $(VCFLIB_DIR)/src/*.hpp $(VCFLIB_DIR)/contrib/*/*.cpp $(VCFLIB_DIR)/contrib/*/*.h + +rm -f $(VCFLIB_DIR)/contrib/WFA2-lib/VERSION + +. ./source_me.sh && cd $(VCFLIB_DIR) && rm -Rf build && mkdir build && cd build && PKG_CONFIG_PATH="$(CWD)/$(LIB_DIR)/pkgconfig:$(PKG_CONFIG_PATH)" cmake -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON -DZIG=OFF -DCMAKE_C_FLAGS="$(CFLAGS)" -DCMAKE_CXX_FLAGS="$(CXXFLAGS) ${CPPFLAGS}" -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_PREFIX_PATH="/usr;$(OMP_PREFIXES)" .. && cmake --build . + +cp $(VCFLIB_DIR)/contrib/filevercmp/*.h* $(INC_DIR) + +cp $(VCFLIB_DIR)/contrib/fastahack/*.h* $(INC_DIR) + +cp $(VCFLIB_DIR)/contrib/smithwaterman/*.h* $(INC_DIR) + +cp $(VCFLIB_DIR)/contrib/intervaltree/*.h* $(INC_DIR) + +cp $(VCFLIB_DIR)/contrib/multichoose/*.h* $(INC_DIR) + +cp $(VCFLIB_DIR)/src/*.h* $(INC_DIR) + +cp $(VCFLIB_DIR)/build/libvcflib.a $(LIB_DIR) + +# vcflib binaries are all automatically built. We need this one. +$(BIN_DIR)/vcf2tsv: $(VCFLIB_DIR)/src/*.cpp $(VCFLIB_DIR)/src/*.h $(LIB_DIR)/libvcflib.a + +cp $(VCFLIB_DIR)/build/vcf2tsv $(BIN_DIR) $(FASTAHACK_DIR)/fastahack: $(FASTAHACK_DIR)/*.c $(FASTAHACK_DIR)/*.h $(FASTAHACK_DIR)/*.cpp - +cd $(FASTAHACK_DIR) && $(MAKE) $(FILTER) + +. ./source_me.sh && cd $(FASTAHACK_DIR) && $(MAKE) $(FILTER) $(LIB_DIR)/libgssw.a: $(GSSW_DIR)/src/gssw.c $(GSSW_DIR)/src/gssw.h - +cd $(GSSW_DIR) && $(MAKE) $(FILTER) && cp lib/* $(CWD)/$(LIB_DIR)/ && cp obj/* $(CWD)/$(OBJ_DIR) && cp src/*.h $(CWD)/$(INC_DIR) + +. ./source_me.sh && cd $(GSSW_DIR) && $(MAKE) $(FILTER) && cp lib/libgssw.a $(CWD)/$(LIB_DIR)/ && cp src/gssw.h $(CWD)/$(INC_DIR)/ $(INC_DIR)/lru_cache.h: $(DEP_DIR)/lru_cache/*.h $(DEP_DIR)/lru_cache/*.cc - +cd $(DEP_DIR)/lru_cache && $(MAKE) $(FILTER) && cp *.h* $(CWD)/$(INC_DIR)/ + +cd $(DEP_DIR)/lru_cache && cp *.h* $(CWD)/$(INC_DIR)/ -$(INC_DIR)/dynamic.hpp: $(DYNAMIC_DIR)/include/*.hpp $(DYNAMIC_DIR)/include/internal/*.hpp - +cat $(DYNAMIC_DIR)/include/dynamic.hpp | sed 's%$(INC_DIR)/dynamic.hpp && cp -r $(CWD)/$(DYNAMIC_DIR)/include/internal $(CWD)/$(INC_DIR)/dynamic +# We moved the Dynamic headers so make sure to clean up the old ones. +$(INC_DIR)/dynamic/dynamic.hpp: $(DYNAMIC_DIR)/include/dynamic/*.hpp $(DYNAMIC_DIR)/include/dynamic/*/*.hpp + +rm -Rf $(INC_DIR)/dynamic.hpp $(INC_DIR)/dynamic + # annoyingly doesn't have an install option on the cmake, so we manually move their external dependency headers + +cd $(CWD)/$(DYNAMIC_DIR) && rm -Rf build && mkdir -p build && cd build && export CXXFLAGS="$(CPPFLAGS) $(CXXFLAGS)" && cmake -DCMAKE_VERBOSE_MAKEFILE=ON .. && make && cp -r $(CWD)/$(DYNAMIC_DIR)/deps/hopscotch_map/include/* $(CWD)/$(INC_DIR)/ + # Do the copy of the main file last so we can tell if this recipe failed and redo it. 
+ # Otherwise we get dynamic.hpp without its deps + +mkdir -p $(INC_DIR)/dynamic && cp -r $(CWD)/$(DYNAMIC_DIR)/include/dynamic/* $(INC_DIR)/dynamic/ -$(INC_DIR)/sparsehash/sparse_hash_map: $(wildcard $(SPARSEHASH_DIR)/**/*.cc) $(wildcard $(SPARSEHASH_DIR)/**/*.h) - +cd $(SPARSEHASH_DIR) && ./autogen.sh && LDFLAGS="-L/opt/local/lib" ./configure --prefix=$(CWD) $(FILTER) && $(MAKE) $(FILTER) && $(MAKE) install +$(INC_DIR)/sparsehash/sparse_hash_map: $(wildcard $(SPARSEHASH_DIR)/**/*.cc) $(wildcard $(SPARSEHASH_DIR)/**/*.h) + +. ./source_me.sh && cd $(SPARSEHASH_DIR) && ./autogen.sh && LDFLAGS="$(LDFLAGS) $(LD_LIB_DIR_FLAGS)" ./configure --prefix=$(CWD) $(FILTER) && $(MAKE) $(FILTER) && $(MAKE) install $(INC_DIR)/sparsepp/spp.h: $(wildcard $(SPARSEHASH_DIR)/sparsepp/*.h) +cp -r $(SPARSEPP_DIR)/sparsepp $(INC_DIR)/ #$(INC_DIR)/Variant.h -$(LIB_DIR)/libvcfh.a: $(DEP_DIR)/libVCFH/*.cpp $(DEP_DIR)/libVCFH/*.hpp - +cd $(DEP_DIR)/libVCFH && $(MAKE) $(FILTER) && cp libvcfh.a $(CWD)/$(LIB_DIR)/ && cp vcfheader.hpp $(CWD)/$(INC_DIR)/ - -$(LIB_DIR)/libgfakluge.a: $(INC_DIR)/gfakluge.hpp $(DEP_DIR)/gfakluge/src/*.hpp $(DEP_DIR)/gfakluge/src/*.cpp - +cd $(DEP_DIR)/gfakluge && $(MAKE) libgfakluge.a $(FILTER) && cp libgfakluge.a $(CWD)/$(LIB_DIR)/ - -$(INC_DIR)/gfakluge.hpp: $(DEP_DIR)/gfakluge/src/gfakluge.hpp - +cp $(DEP_DIR)/gfakluge/src/*.hpp $(CWD)/$(INC_DIR)/ && cp $(DEP_DIR)/gfakluge/src/tinyFA/*.hpp $(CWD)/$(INC_DIR)/ +$(LIB_DIR)/libvcfh.a: $(DEP_DIR)/libVCFH/*.cpp $(DEP_DIR)/libVCFH/*.hpp + +. ./source_me.sh && cd $(DEP_DIR)/libVCFH && $(MAKE) $(FILTER) && cp libvcfh.a $(CWD)/$(LIB_DIR)/ && cp vcfheader.hpp $(CWD)/$(INC_DIR)/ $(LIB_DIR)/libsonlib.a: $(CWD)/$(DEP_DIR)/sonLib/C/inc/*.h $(CWD)/$(DEP_DIR)/sonLib/C/impl/*.c - +cd $(DEP_DIR)/sonLib && kyotoTycoonLib="" $(MAKE) $(FILTER) && cp lib/sonLib.a $(CWD)/$(LIB_DIR)/libsonlib.a && mkdir -p $(CWD)/$(INC_DIR)/sonLib && cp lib/*.h $(CWD)/$(INC_DIR)/sonLib + +. ./source_me.sh && cd $(DEP_DIR)/sonLib && kyotoTycoonLib="" $(MAKE) $(FILTER) && cp lib/sonLib.a $(CWD)/$(LIB_DIR)/libsonlib.a && mkdir -p $(CWD)/$(INC_DIR)/sonLib && cp lib/*.h $(CWD)/$(INC_DIR)/sonLib $(LIB_DIR)/libpinchesandcacti.a: $(LIB_DIR)/libsonlib.a $(CWD)/$(DEP_DIR)/pinchesAndCacti/inc/*.h $(CWD)/$(DEP_DIR)/pinchesAndCacti/impl/*.c - +cd $(DEP_DIR)/pinchesAndCacti && $(MAKE) $(FILTER) && cd $(CWD)/$(DEP_DIR)/sonLib && cp lib/stPinchesAndCacti.a $(CWD)/$(LIB_DIR)/libpinchesandcacti.a && cp lib/3EdgeConnected.a $(CWD)/$(LIB_DIR)/lib3edgeconnected.a && mkdir -p $(CWD)/$(INC_DIR)/sonLib && cp lib/*.h $(CWD)/$(INC_DIR)/sonLib + +. ./source_me.sh && cd $(DEP_DIR)/pinchesAndCacti && $(MAKE) $(FILTER) && cd $(CWD)/$(DEP_DIR)/sonLib && cp lib/stPinchesAndCacti.a $(CWD)/$(LIB_DIR)/libpinchesandcacti.a && cp lib/3EdgeConnected.a $(CWD)/$(LIB_DIR)/lib3edgeconnected.a && mkdir -p $(CWD)/$(INC_DIR)/sonLib && cp lib/*.h $(CWD)/$(INC_DIR)/sonLib # When building raptor we need to make sure to pre-generate and fix up the lexer -$(LIB_DIR)/libraptor2.a: $(RAPTOR_DIR)/src/*.c $(RAPTOR_DIR)/src/*.h - +cd $(RAPTOR_DIR)/build && cmake .. && rm -f src/turtle_parser.c && rm -f src/turtle_lexer.c && make turtle_lexer_tgt && make -f src/CMakeFiles/raptor2.dir/build.make src/turtle_lexer.c && sed -i.bak '/yycleanup/d' src/turtle_lexer.c && $(MAKE) $(FILTER) && cp src/libraptor2.a $(CWD)/$(LIB_DIR) && mkdir -p $(CWD)/$(INC_DIR)/raptor2 && cp src/*.h $(CWD)/$(INC_DIR)/raptor2 - -$(LIB_DIR)/libstructures.a: $(STRUCTURES_DIR)/src/include/structures/*.hpp $(STRUCTURES_DIR)/src/*.cpp - +. 
./source_me.sh && cd $(STRUCTURES_DIR) && $(MAKE) lib/libstructures.a $(FILTER) && cp lib/libstructures.a $(CWD)/$(LIB_DIR)/ && cp -r src/include/structures $(CWD)/$(INC_DIR)/ - -# To build libvw we need to point it at our Boost, but then configure decides -# it needs to build vwdll, which depends on codecvt, which isn't actually -# shipped in the GCC 4.9 STL. So we hack vwdll AKA libvw_c_wrapper out of the -# build. -# Also, autogen.sh looks for Boost in the system, and who knows what it will do -# if it doesn't find it, so let it fail. -$(LIB_DIR)/libvw.a: $(LIB_DIR)/libboost_program_options.a $(VOWPALWABBIT_DIR)/* $(VOWPALWABBIT_DIR)/vowpalwabbit/* - +. ./source_me.sh && cd $(VOWPALWABBIT_DIR) && sed -i -e 's/libvw_c_wrapper\.pc//g' Makefile.am - +. ./source_me.sh && cd $(VOWPALWABBIT_DIR) && sed -i -e 's/libvw_c_wrapper\.la//g' vowpalwabbit/Makefile.am - +. ./source_me.sh && cd $(VOWPALWABBIT_DIR) && sed -i -e '/libvw_c_wrapper\.pc/d' configure.ac - +. ./source_me.sh && cd $(VOWPALWABBIT_DIR) && sed -i -e '/vwdll/d' Makefile.am - +. ./source_me.sh && cd $(VOWPALWABBIT_DIR) && sed -i -e '/libvw_c_wrapper/d' vowpalwabbit/Makefile.am - +. ./source_me.sh && cd $(VOWPALWABBIT_DIR) && (./autogen.sh || true) - +. ./source_me.sh && cd $(VOWPALWABBIT_DIR) && ./configure --with-boost=$(CWD) - +. ./source_me.sh && cd $(VOWPALWABBIT_DIR) && $(MAKE) $(FILTER) - +. ./source_me.sh && cd $(VOWPALWABBIT_DIR) && cp vowpalwabbit/.libs/libvw.a vowpalwabbit/.libs/liballreduce.a $(CWD)/$(LIB_DIR)/ - +. ./source_me.sh && cd $(VOWPALWABBIT_DIR) && mkdir -p $(CWD)/$(INC_DIR)/vowpalwabbit - +. ./source_me.sh && cd $(VOWPALWABBIT_DIR) && cp vowpalwabbit/*.h $(CWD)/$(INC_DIR)/vowpalwabbit/ - -$(LIB_DIR)/liballreduce.a: $(LIB_DIR)/libvw.a - -$(LIB_DIR)/libboost_program_options.a: $(BOOST_DIR)/libs/program_options/src/* $(BOOST_DIR)/boost/program_options/* - +. ./source_me.sh && cd $(BOOST_DIR) && ./bootstrap.sh --with-libraries=program_options --libdir=$(CWD)/$(LIB_DIR) --includedir=$(CWD)/$(INC_DIR) $(FILTER) && ./b2 --ignore-site-config --link=static install $(FILTER) - +. ./source_me.sh && if [[ $(shell uname -s) == "Darwin" ]]; then install_name_tool -id $(CWD)/$(LIB_DIR)/libboost_program_options.dylib $(CWD)/$(LIB_DIR)/libboost_program_options.dylib; fi +# We also need to clear out its cmake stuff in case it found a wrong Bison and cached it. +$(LIB_DIR)/libraptor2.a: $(RAPTOR_DIR)/src/* $(wildcard $(RAPTOR_DIR)/build/*) + which bison + +. ./source_me.sh && cd $(RAPTOR_DIR)/build && rm -Rf CMakeCache.txt CMakeFiles CTestTestfile.cmake Makefile cmake_install.cmake src tests utils && cmake .. && rm -f src/turtle_parser.c && rm -f src/turtle_lexer.c && make turtle_lexer_tgt && make -f src/CMakeFiles/raptor2.dir/build.make src/turtle_lexer.c && sed -i.bak '/yycleanup/d' src/turtle_lexer.c && $(MAKE) $(FILTER) && cp src/libraptor2.a $(CWD)/$(LIB_DIR) + +touch $(LIB_DIR)/libraptor2.a + +# We need rapper from Raptor for the tests +$(BIN_DIR)/rapper: $(LIB_DIR)/libraptor2.a + +cp $(RAPTOR_DIR)/build/utils/rapper $(BIN_DIR)/ + +# The Raptor header needs to be newer than the library. +# Mac Travis managed to get an old header with a new binary. +$(INC_DIR)/raptor2/raptor2.h: $(LIB_DIR)/libraptor2.a $(RAPTOR_DIR)/build/* + +cd $(RAPTOR_DIR)/build && mkdir -p $(CWD)/$(INC_DIR)/raptor2 && cp src/*.h $(CWD)/$(INC_DIR)/raptor2 + +touch $(INC_DIR)/raptor2/raptor2.h + +$(LIB_DIR)/libstructures.a: $(STRUCTURES_DIR)/src/include/structures/*.hpp $(STRUCTURES_DIR)/src/*.cpp $(STRUCTURES_DIR)/Makefile + +. 
./source_me.sh && cd $(STRUCTURES_DIR) && $(MAKE) clean && $(MAKE) lib/libstructures.a $(FILTER) && cp lib/libstructures.a $(CWD)/$(LIB_DIR)/ && cp -r src/include/structures $(CWD)/$(INC_DIR)/ $(INC_DIR)/sha1.hpp: $(SHA1_DIR)/sha1.hpp +cp $(SHA1_DIR)/*.h* $(CWD)/$(INC_DIR)/ @@ -412,6 +762,12 @@ $(INC_DIR)/sha1.hpp: $(SHA1_DIR)/sha1.hpp $(INC_DIR)/backward.hpp: $(BACKWARD_CPP_DIR)/backward.hpp +cp $(BACKWARD_CPP_DIR)/backward.hpp $(CWD)/$(INC_DIR)/ +$(INC_DIR)/simde/x86/sse4.1.h: $(DOZEU_DIR)/simde/*.h $(DOZEU_DIR)/simde/x86/*.h + +cp -r $(DOZEU_DIR)/simde $(INC_DIR) + +$(INC_DIR)/dozeu/dozeu.h: $(DOZEU_DIR)/*.h $(INC_DIR)/simde/x86/sse4.1.h + +mkdir -p $(CWD)/$(INC_DIR)/dozeu && cp $(DOZEU_DIR)/*.h $(CWD)/$(INC_DIR)/dozeu/ + $(LIB_DIR)/libebl.a: $(LIB_DIR)/libelf.a $(LIB_DIR)/libdw.a: $(LIB_DIR)/libelf.a @@ -423,34 +779,93 @@ $(LIB_DIR)/libdwfl.a: $(LIB_DIR)/libelf.a # We can't build elfutils from Git without "maintainer mode". # There are some release-only headers or something that it complains it can't find otherwise. # We also don't do a normal make and make install here because we don't want to build and install all the elfutils binaries and libasm. -$(LIB_DIR)/libelf.a: $(ELFUTILS_DIR)/libebl/* $(ELFUTILS_DIR)/libdw/* $(ELFUTILS_DIR)/libelf/* $(ELFUTILS_DIR)/src/* - +cd $(ELFUTILS_DIR) && autoreconf -i -f && ./configure --enable-maintainer-mode --prefix=$(CWD) $(FILTER) - +cd $(ELFUTILS_DIR)/libelf && $(MAKE) libelf.a $(FILTER) - +cd $(ELFUTILS_DIR)/libebl && $(MAKE) libebl.a $(FILTER) - +cd $(ELFUTILS_DIR)/libdw && $(MAKE) libdw.a known-dwarf.h $(FILTER) - +cd $(ELFUTILS_DIR)/libdwfl && $(MAKE) libdwfl.a $(FILTER) - +cd $(ELFUTILS_DIR)/libdwelf && $(MAKE) libdwelf.a $(FILTER) +# We need to disable libdebuginfod or the static binary will try and load it at +# runtime and pull in incompatible libs it depends on on whatever system it's +# running on. +$(LIB_DIR)/libelf.a: $(ELFUTILS_DIR)/libebl/*.c $(ELFUTILS_DIR)/libebl/*.h $(ELFUTILS_DIR)/libdw/*.c $(ELFUTILS_DIR)/libdw/*.h $(ELFUTILS_DIR)/libelf/*.c $(ELFUTILS_DIR)/libelf/*.h $(ELFUTILS_DIR)/src/*.c $(ELFUTILS_DIR)/src/*.h $(LIB_DIR)/cleaned_old_elfutils + +cd $(CWD)/$(INC_DIR)/ && rm -Rf elfutils gelf.h libelf.h dwarf.h libdwflP.h libdwfl.h libebl.h libelf.h + +. ./source_me.sh && cd $(ELFUTILS_DIR) && autoreconf -i -f && ./configure --enable-maintainer-mode --disable-libdebuginfod --disable-debuginfod --prefix=$(CWD) $(FILTER) + +. ./source_me.sh && cd $(ELFUTILS_DIR)/libelf && $(MAKE) clean && $(MAKE) libelf.a $(FILTER) + +. ./source_me.sh && cd $(ELFUTILS_DIR)/libebl && $(MAKE) clean && $(MAKE) libebl.a $(FILTER) + +. ./source_me.sh && cd $(ELFUTILS_DIR)/libdwfl && $(MAKE) clean && $(MAKE) libdwfl.a $(FILTER) + +. ./source_me.sh && cd $(ELFUTILS_DIR)/libdwelf && $(MAKE) clean && $(MAKE) libdwelf.a $(FILTER) + +. ./source_me.sh && cd $(ELFUTILS_DIR)/lib && $(MAKE) clean && $(MAKE) libeu.a $(FILTER) + +. ./source_me.sh && cd $(ELFUTILS_DIR)/libcpu && $(MAKE) clean && $(MAKE) libcpu.a $(FILTER) + +. ./source_me.sh && cd $(ELFUTILS_DIR)/backends && $(MAKE) clean && $(MAKE) libebl_backends.a $(FILTER) + +. 
./source_me.sh && cd $(ELFUTILS_DIR)/libdw && $(MAKE) clean && $(MAKE) libdw.a known-dwarf.h $(FILTER) +cd $(ELFUTILS_DIR) && mkdir -p $(CWD)/$(INC_DIR)/elfutils && cp libdw/known-dwarf.h libdw/libdw.h libebl/libebl.h libelf/elf-knowledge.h version.h libdwfl/libdwfl.h libdwelf/libdwelf.h $(CWD)/$(INC_DIR)/elfutils && cp libelf/gelf.h libelf/libelf.h libdw/dwarf.h $(CWD)/$(INC_DIR) && cp libebl/libebl.a libdw/libdw.a libdwfl/libdwfl.a libdwelf/libdwelf.a libelf/libelf.a $(CWD)/$(LIB_DIR)/ $(OBJ_DIR)/sha1.o: $(SHA1_DIR)/sha1.cpp $(SHA1_DIR)/sha1.hpp - +$(CXX) $(CXXFLAGS) -c -o $@ $< $(LD_INCLUDE_FLAGS) $(FILTER) + +$(CXX) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -c -o $@ $< $(FILTER) +$(SHARED_OBJ_DIR)/sha1.o: $(SHA1_DIR)/sha1.cpp $(SHA1_DIR)/sha1.hpp + +$(CXX) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -fPIC -c -o $@ $< $(FILTER) $(LIB_DIR)/libfml.a: $(FERMI_DIR)/*.h $(FERMI_DIR)/*.c - cd $(FERMI_DIR) && $(MAKE) $(FILTER) && cp *.h $(CWD)/$(INC_DIR)/ && cp libfml.a $(CWD)/$(LIB_DIR)/ + . ./source_me.sh && cd $(FERMI_DIR) && $(MAKE) $(FILTER) && cp *.h $(CWD)/$(INC_DIR)/ && cp libfml.a $(CWD)/$(LIB_DIR)/ # We don't need to hack the build to point at our htslib because sublinearLS gets its htslib from the include flags we set $(LIB_DIR)/libsublinearLS.a: $(LINLS_DIR)/src/*.cpp $(LINLS_DIR)/src/*.hpp $(LIB_DIR)/libhts.a - cd $(LINLS_DIR) && INCLUDE_FLAGS="-I$(CWD)/$(INC_DIR)" $(MAKE) libs $(FILTER) && cp lib/libsublinearLS.a $(CWD)/$(LIB_DIR)/ && mkdir -p $(CWD)/$(INC_DIR)/sublinearLS && cp src/*.hpp $(CWD)/$(INC_DIR)/sublinearLS/ + . ./source_me.sh && cd $(LINLS_DIR) && $(MAKE) clean && INCLUDE_FLAGS="-I$(CWD)/$(INC_DIR)" $(MAKE) libs $(FILTER) && cp lib/libsublinearLS.a $(CWD)/$(LIB_DIR)/ && mkdir -p $(CWD)/$(INC_DIR)/sublinearLS && cp src/*.hpp $(CWD)/$(INC_DIR)/sublinearLS/ + +$(LIB_DIR)/libbdsg.a: $(INC_DIR)/BooPHF.h $(LIBBDSG_DIR)/Makefile $(LIBBDSG_DIR)/bdsg/src/*.cpp $(LIBBDSG_DIR)/bdsg/include/bdsg/*.hpp $(LIBBDSG_DIR)/bdsg/include/bdsg/internal/*.hpp $(LIBBDSG_DIR)/bdsg/include/bdsg/overlays/*.hpp $(LIB_DIR)/libhandlegraph.a $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a $(INC_DIR)/sparsepp/spp.h $(INC_DIR)/dynamic/dynamic.hpp $(INC_DIR)/mio/mmap.hpp + +. ./source_me.sh && rm -Rf $(CWD)/$(INC_DIR)/bdsg && cd $(LIBBDSG_DIR) && $(MAKE) clean && CPLUS_INCLUDE_PATH=$(CWD)/$(INC_DIR):$(CWD)/$(INC_DIR)/dynamic:$(CPLUS_INCLUDE_PATH) CXXFLAGS="$(INCLUDE_FLAGS) $(CXXFLAGS)" $(MAKE) $(FILTER) && cp lib/libbdsg.a $(CWD)/$(LIB_DIR) && cp -r bdsg/include/* $(CWD)/$(INC_DIR) + +$(INC_DIR)/mio/mmap.hpp: $(MIO_DIR)/include/mio/* + +. ./source_me.sh && cp -r $(MIO_DIR)/include/mio $(CWD)/$(INC_DIR)/ + +# It would be better to copy the atomic_queue directory rather than its contents, but to avoid re-writing mmmultimap... +$(INC_DIR)/atomic_queue.h: $(ATOMIC_QUEUE_DIR)/include/* + +. ./source_me.sh && cp -r $(ATOMIC_QUEUE_DIR)/include/atomic_queue/* $(CWD)/$(INC_DIR)/ + +$(INC_DIR)/mmmultiset.hpp: $(MMMULTIMAP_DIR)/src/mmmultiset.hpp $(INC_DIR)/mmmultimap.hpp +$(INC_DIR)/mmmultimap.hpp: $(MMMULTIMAP_DIR)/src/mmmultimap.hpp $(MMMULTIMAP_DIR)/src/mmmultiset.hpp $(INC_DIR)/mio/mmap.hpp $(INC_DIR)/atomic_queue.h + +. ./source_me.sh && cp $(MMMULTIMAP_DIR)/src/mmmultimap.hpp $(MMMULTIMAP_DIR)/src/mmmultiset.hpp $(CWD)/$(INC_DIR)/ + +$(INC_DIR)/ips4o.hpp: $(IPS4O_DIR)/ips4o.hpp $(IPS4O_DIR)/ips4o/* + +. 
./source_me.sh && cp -r $(IPS4O_DIR)/ips4o* $(CWD)/$(INC_DIR)/ + +# The xg repo has a cmake build system based all around external projects, and +# we need it to use our installed versions of everything instead. +# We also need to not build against GFAKluge +$(LIB_DIR)/libxg.a: $(XG_DIR)/src/*.hpp $(XG_DIR)/src/*.cpp $(INC_DIR)/mmmultimap.hpp $(INC_DIR)/ips4o.hpp $(LIB_DIR)/libhandlegraph.a $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a $(INC_DIR)/mio/mmap.hpp $(INC_DIR)/atomic_queue.h + +rm -f $@ + +cp -r $(XG_DIR)/src/*.hpp $(CWD)/$(INC_DIR) + +. ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -DNO_GFAKLUGE -c -o $(XG_DIR)/xg.o $(XG_DIR)/src/xg.cpp $(FILTER) + +ar rs $@ $(XG_DIR)/xg.o # Auto-git-versioning -$(INC_DIR)/vg_git_version.hpp: .git - @echo "#define VG_GIT_VERSION \"$(shell git describe --always --tags || echo unknown)\"" > $@ +# We need to scope this variable here +GIT_VERSION_FILE_DEPS = +# Decide if .git exists and needs to be watched +ifeq ($(shell if [ -d .git ]; then echo present; else echo absent; fi),present) + # If so, try and make a git version file + GIT_VERSION_FILE_DEPS = .check-git +else + # Just use the version file we have, if any + GIT_VERSION_FILE_DEPS = .no-git +endif + +# Build a real git version file. +# If it's not the same as the old one, replace the old one. +# If it is the same, do nothing and don't rebuild dependent targets. +.check-git: + @echo "#define VG_GIT_VERSION \"$(shell git describe --always --tags 2>/dev/null || echo git-error)\"" > $(INC_DIR)/vg_git_version.hpp.tmp + @diff $(INC_DIR)/vg_git_version.hpp.tmp $(INC_DIR)/vg_git_version.hpp >/dev/null 2>/dev/null || cp $(INC_DIR)/vg_git_version.hpp.tmp $(INC_DIR)/vg_git_version.hpp + @rm -f $(INC_DIR)/vg_git_version.hpp.tmp + +# Make sure the version file exists, if we weren't given one in our tarball +.no-git: + @if [ ! -e $(INC_DIR)/vg_git_version.hpp ]; then \ + touch $(INC_DIR)/vg_git_version.hpp; \ + fi; + +$(INC_DIR)/vg_git_version.hpp: $(GIT_VERSION_FILE_DEPS) # Build an environment version file with this phony target. # If it's not the same as the old one, replace the old one. # If it is the same, do nothing and don't rebuild dependent targets. .check-environment: - @echo "#define VG_COMPILER_VERSION \"$(shell $(CXX) --version | head -n 1)\"" > $(INC_DIR)/vg_environment_version.hpp.tmp + @echo "#define VG_COMPILER_VERSION \"$(shell $(CXX) --version 2>/dev/null | head -n 1)\"" > $(INC_DIR)/vg_environment_version.hpp.tmp @echo "#define VG_OS \"$(shell uname)\"" >> $(INC_DIR)/vg_environment_version.hpp.tmp @echo "#define VG_BUILD_USER \"$(shell whoami)\"" >> $(INC_DIR)/vg_environment_version.hpp.tmp @echo "#define VG_BUILD_HOST \"$(shell hostname)\"" >> $(INC_DIR)/vg_environment_version.hpp.tmp @@ -459,29 +874,11 @@ $(INC_DIR)/vg_git_version.hpp: .git # The way to get the actual file is to maybe replace it. $(INC_DIR)/vg_environment_version.hpp: .check-environment - - -# Not important if .git isn't real -.git: ################################### ## VG source code compilation begins here #################################### -include/stream.hpp: src/stream.hpp - cp src/stream.hpp include/stream.hpp - -$(OBJ_DIR)/vg.pb.o: $(CPP_DIR)/vg.pb.o - cp $(CPP_DIR)/vg.pb.o $(OBJ_DIR)/vg.pb.o - -$(CPP_DIR)/vg.pb.o: $(CPP_DIR)/vg.pb.cc - -$(CPP_DIR)/vg.pb.cc: $(CPP_DIR)/vg.pb.h - -$(CPP_DIR)/vg.pb.h: $(LIB_DIR)/libprotobuf.a bin/protoc $(SRC_DIR)/vg.proto - +. 
./source_me.sh && ./bin/protoc $(SRC_DIR)/vg.proto --proto_path=$(SRC_DIR) --cpp_out=cpp - +cp $@ $(INC_DIR) - $(OBJ_DIR)/version.o: $(SRC_DIR)/version.cpp $(SRC_DIR)/version.hpp $(INC_DIR)/vg_git_version.hpp $(INC_DIR)/vg_environment_version.hpp ######################## @@ -490,29 +887,55 @@ $(OBJ_DIR)/version.o: $(SRC_DIR)/version.cpp $(SRC_DIR)/version.hpp $(INC_DIR)/v # Define a default rule for building objects from CPP files # Depend on the .d file so we rebuild if dependency info is missing/deleted +# Make sure to touch the .o file after the compiler finishes so it is always newer than the .d file # Use static pattern rules so the dependency files will not be ignored if the output exists # See $(OBJ) $(OBJ_DIR)/main.o: $(OBJ_DIR)/%.o : $(SRC_DIR)/%.cpp $(OBJ_DIR)/%.d $(DEPS) - . ./source_me.sh && $(CXX) $(CXXFLAGS) -c -o $@ $< $(LD_INCLUDE_FLAGS) $(FILTER) + . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) + @touch $@ +$(SHARED_OBJ): $(SHARED_OBJ_DIR)/%.o : $(SRC_DIR)/%.cpp $(SHARED_OBJ_DIR)/%.d $(DEPS) + . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -fPIC -c -o $@ $< $(FILTER) + @touch $@ $(ALGORITHMS_OBJ): $(ALGORITHMS_OBJ_DIR)/%.o : $(ALGORITHMS_SRC_DIR)/%.cpp $(ALGORITHMS_OBJ_DIR)/%.d $(DEPS) - . ./source_me.sh && $(CXX) $(CXXFLAGS) -c -o $@ $< $(LD_INCLUDE_FLAGS) $(FILTER) + . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) + @touch $@ +$(ALGORITHMS_SHARED_OBJ): $(ALGORITHMS_SHARED_OBJ_DIR)/%.o : $(ALGORITHMS_SRC_DIR)/%.cpp $(ALGORITHMS_SHARED_OBJ_DIR)/%.d $(DEPS) + . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -fPIC -c -o $@ $< $(FILTER) + @touch $@ +$(IO_OBJ): $(IO_OBJ_DIR)/%.o : $(IO_SRC_DIR)/%.cpp $(IO_OBJ_DIR)/%.d $(DEPS) + . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) + @touch $@ +$(IO_SHARED_OBJ): $(IO_SHARED_OBJ_DIR)/%.o : $(IO_SRC_DIR)/%.cpp $(IO_SHARED_OBJ_DIR)/%.d $(DEPS) + . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -fPIC -c -o $@ $< $(FILTER) + @touch $@ $(SUBCOMMAND_OBJ): $(SUBCOMMAND_OBJ_DIR)/%.o : $(SUBCOMMAND_SRC_DIR)/%.cpp $(SUBCOMMAND_OBJ_DIR)/%.d $(DEPS) - . ./source_me.sh && $(CXX) $(CXXFLAGS) -c -o $@ $< $(LD_INCLUDE_FLAGS) $(FILTER) + . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) + @touch $@ $(UNITTEST_OBJ): $(UNITTEST_OBJ_DIR)/%.o : $(UNITTEST_SRC_DIR)/%.cpp $(UNITTEST_OBJ_DIR)/%.d $(DEPS) - . ./source_me.sh && $(CXX) $(CXXFLAGS) -c -o $@ $< $(LD_INCLUDE_FLAGS) $(FILTER) - -# Protobuf stuff builds into its same directory -$(CPP_DIR)/%.o : $(CPP_DIR)/%.cc $(DEPS) - . ./source_me.sh && $(CXX) $(CXXFLAGS) -c -o $@ $< $(LD_INCLUDE_FLAGS) $(FILTER) + . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) + @touch $@ +$(UNITTEST_SUPPORT_OBJ): $(UNITTEST_SUPPORT_OBJ_DIR)/%.o : $(UNITTEST_SUPPORT_SRC_DIR)/%.cpp $(UNITTEST_SUPPORT_OBJ_DIR)/%.d $(DEPS) + . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) + @touch $@ + +# Config objects get individual rules +$(CONFIG_OBJ_DIR)/allocator_config_jemalloc.o: $(CONFIG_SRC_DIR)/allocator_config_jemalloc.cpp $(CONFIG_OBJ_DIR)/allocator_config_jemalloc.d $(DEPS) $(LIB_DIR)/libjemalloc.a + . 
./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) + @touch $@ +$(CONFIG_OBJ_DIR)/allocator_config_system.o: $(CONFIG_SRC_DIR)/allocator_config_system.cpp $(CONFIG_OBJ_DIR)/allocator_config_system.d $(DEPS) + . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) + @touch $@ # Use a fake rule to build .d files, so we don't complain if they don't exist. $(OBJ_DIR)/%.d: ; $(ALGORITHMS_OBJ_DIR)/%.d: ; +$(CONFIG_OBJ_DIR)/%.d: ; +$(IO_OBJ_DIR)/%.d: ; $(SUBCOMMAND_OBJ_DIR)/%.d: ; $(UNITTEST_OBJ_DIR)/%.d: ; # Don't delete them. -.PRECIOUS: $(OBJ_DIR)/%.d $(ALGORITHMS_OBJ_DIR)/%.d $(SUBCOMMAND_OBJ_DIR)/%.d $(UNITTEST_OBJ_DIR)/%.d +.PRECIOUS: $(OBJ_DIR)/%.d $(ALGORITHMS_OBJ_DIR)/%.d $(CONFIG_OBJ_DIR)/%.d $(IO_OBJ_DIR)/%.d $(SUBCOMMAND_OBJ_DIR)/%.d $(UNITTEST_OBJ_DIR)/%.d # Use no implicit rules .SUFFIXES: @@ -522,80 +945,108 @@ $(UNITTEST_OBJ_DIR)/%.d: ; #################################### - +# Make directories before quitting target due to missing protoc. +# If we run the rest of the build without these, lib and include can become files. +# TODO: quitting if no protoc doesn't reliably stop the build. .pre-build: @if [ ! -d $(BIN_DIR) ]; then mkdir -p $(BIN_DIR); fi + @if [ ! -d $(UNITTEST_BIN_DIR) ]; then mkdir -p $(UNITTEST_BIN_DIR); fi @if [ ! -d $(LIB_DIR) ]; then mkdir -p $(LIB_DIR); fi @if [ ! -d $(OBJ_DIR) ]; then mkdir -p $(OBJ_DIR); fi + @if [ ! -d $(SHARED_OBJ_DIR) ]; then mkdir -p $(SHARED_OBJ_DIR); fi @if [ ! -d $(ALGORITHMS_OBJ_DIR) ]; then mkdir -p $(ALGORITHMS_OBJ_DIR); fi - @if [ ! -d $(UNITTEST_OBJ_DIR) ]; then mkdir -p $(UNITTEST_OBJ_DIR); fi + @if [ ! -d $(ALGORITHMS_SHARED_OBJ_DIR) ]; then mkdir -p $(ALGORITHMS_SHARED_OBJ_DIR); fi + @if [ ! -d $(CONFIG_OBJ_DIR) ]; then mkdir -p $(CONFIG_OBJ_DIR); fi + @if [ ! -d $(IO_OBJ_DIR) ]; then mkdir -p $(IO_OBJ_DIR); fi + @if [ ! -d $(IO_SHARED_OBJ_DIR) ]; then mkdir -p $(IO_SHARED_OBJ_DIR); fi @if [ ! -d $(SUBCOMMAND_OBJ_DIR) ]; then mkdir -p $(SUBCOMMAND_OBJ_DIR); fi + @if [ ! -d $(UNITTEST_OBJ_DIR) ]; then mkdir -p $(UNITTEST_OBJ_DIR); fi + @if [ ! -d $(UNITTEST_SUPPORT_OBJ_DIR) ]; then mkdir -p $(UNITTEST_SUPPORT_OBJ_DIR); fi @if [ ! -d $(INC_DIR) ]; then mkdir -p $(INC_DIR); fi - @if [ ! -d $(CPP_DIR) ]; then mkdir -p $(CPP_DIR); fi - + @protoc --version >/dev/null 2>/dev/null || (echo "Error: protobuf compiler (protoc) not available!" ; exit 1) + @if [ -e $(INC_DIR)/vg/vg.pb.h ] ; then \ + HEADER_VER=$$(cat $(INC_DIR)/vg/vg.pb.h | grep GOOGLE_PROTOBUF_VERSION | sed 's/[^0-9]*\([0-9]*\)[^0-9]*/\1/' | head -n1); \ + WORKDIR=$$(pwd); \ + TESTDIR=$$(mktemp -d); \ + echo 'syntax = "proto3";' > $${TESTDIR}/empty.proto; \ + protoc $${TESTDIR}/empty.proto --proto_path=$${TESTDIR} --cpp_out=$${TESTDIR}; \ + PROTOC_VER=$$(cat $${TESTDIR}/empty.pb.h | grep GOOGLE_PROTOBUF_VERSION | sed 's/[^0-9]*\([0-9]*\)[^0-9]*/\1/' | head -n1); \ + if [ "$${HEADER_VER}" != "$${PROTOC_VER}" ] ; then \ + echo "Protobuf version has changed!"; \ + echo "Headers are for $${HEADER_VER} but we make headers for $${PROTOC_VER}"; \ + echo "Need to rebuild libvgio"; \ + rm -f $(LIB_DIR)/libvgio.a; \ + rm -f $(INC_DIR)/vg/vg.pb.h; \ + fi; \ + rm $${TESTDIR}/empty.proto $${TESTDIR}/empty.pb.h $${TESTDIR}/empty.pb.cc; \ + rmdir $${TESTDIR}; \ + fi; + +# A note about Protobuf: +# We have a lot of logic here to make sure that the protoc we have henerates headers with exactly the same +# version requirements as the headers we already have. +# If not, we regenerate them. 
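The .pre-build logic above decides whether libvgio's generated headers still match the installed protoc by generating a throwaway empty.proto and comparing the GOOGLE_PROTOBUF_VERSION stamps. With the Make escaping removed, the probe is approximately the following shell sketch; the include/vg/vg.pb.h path is a placeholder for wherever the real header lives:

#!/usr/bin/env bash
set -e
# Version stamp baked into the headers we already have.
header_ver=$(grep GOOGLE_PROTOBUF_VERSION include/vg/vg.pb.h | sed 's/[^0-9]*\([0-9]*\)[^0-9]*/\1/' | head -n1)

# Version stamp the installed protoc would generate today.
testdir=$(mktemp -d)
echo 'syntax = "proto3";' > "$testdir/empty.proto"
protoc "$testdir/empty.proto" --proto_path="$testdir" --cpp_out="$testdir"
protoc_ver=$(grep GOOGLE_PROTOBUF_VERSION "$testdir/empty.pb.h" | sed 's/[^0-9]*\([0-9]*\)[^0-9]*/\1/' | head -n1)
rm -r "$testdir"

if [ "$header_ver" != "$protoc_ver" ]; then
    echo "Protobuf version changed ($header_ver vs $protoc_ver); libvgio must be rebuilt" >&2
fi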
+# Doesn't handle Protobuf 3.12.3 weirdness; just make clean if you change flavors of Protobuf 3.12.3. + + + # run .pre-build before we make anything at all. -include .pre-build # for rebuilding just vg clean-vg: - $(RM) -r $(BIN_DIR)/vg - $(RM) -r $(UNITTEST_OBJ_DIR)/*.o $(UNITTEST_OBJ_DIR)/*.d - $(RM) -r $(SUBCOMMAND_OBJ_DIR)/*.o $(SUBCOMMAND_OBJ_DIR)/*.d - $(RM) -r $(OBJ_DIR)/*.o $(OBJ_DIR)/*.d - $(RM) -r $(CPP_DIR)/*.o $(CPP_DIR)/*.d $(CPP_DIR)/*.cc $(CPP_DIR)/*.h + $(RM) -f $(BIN_DIR)/$(EXE) + $(RM) -f $(UNITTEST_SUPPORT_OBJ_DIR)/*.o $(UNITTEST_SUPPORT_OBJ_DIR)/*.d + $(RM) -f $(UNITTEST_OBJ_DIR)/*.o $(UNITTEST_OBJ_DIR)/*.d + $(RM) -f $(SUBCOMMAND_OBJ_DIR)/*.o $(SUBCOMMAND_OBJ_DIR)/*.d + $(RM) -f $(OBJ_DIR)/*.o $(OBJ_DIR)/*.d + $(RM) -f $(SHARED_OBJ_DIR)/*.o $(SHARED_OBJ_DIR)/*.d + $(RM) -f $(ALGORITHMS_OBJ_DIR)/*.o $(ALGORITHMS_OBJ_DIR)/*.d + $(RM) -f $(ALGORITHMS_SHARED_OBJ_DIR)/*.o $(ALGORITHMS_SHARED_OBJ_DIR)/*.d + $(RM) -f $(IO_OBJ_DIR)/*.o $(IO_OBJ_DIR)/*.d + $(RM) -f $(IO_SHARED_OBJ_DIR)/*.o $(IO_SHARED_OBJ_DIR)/*.d $(RM) -f $(INC_DIR)/vg_git_version.hpp $(INC_DIR)/vg_system_version.hpp -clean: clean-rocksdb clean-protobuf clean-vcflib +clean: clean-vcflib + $(RM) -r $(UNITTEST_BIN_DIR) $(RM) -r $(BIN_DIR) $(RM) -r $(LIB_DIR) + $(RM) -r $(UNITTEST_SUPPORT_OBJ_DIR) $(RM) -r $(UNITTEST_OBJ_DIR) $(RM) -r $(SUBCOMMAND_OBJ_DIR) + $(RM) -r $(IO_SHARED_OBJ_DIR) + $(RM) -r $(IO_OBJ_DIR) + $(RM) -r $(ALGORITHMS_SHARED_OBJ_DIR) $(RM) -r $(ALGORITHMS_OBJ_DIR) + $(RM) -r $(CONFIG_OBJ_DIR) + $(RM) -r $(SHARED_OBJ_DIR) $(RM) -r $(OBJ_DIR) $(RM) -r $(INC_DIR) - $(RM) -r $(CPP_DIR) $(RM) -r share/ - cd $(DEP_DIR) && cd sonLib && $(MAKE) clean - cd $(DEP_DIR) && cd sparsehash && $(MAKE) clean cd $(DEP_DIR) && cd htslib && $(MAKE) clean + cd $(DEP_DIR) && cd tabixpp && rm -f tabix.o libtabixpp.a + cd $(DEP_DIR) && cd sonLib && $(MAKE) clean + cd $(DEP_DIR) && cd sparsehash && $(MAKE) clean || true cd $(DEP_DIR) && cd fastahack && $(MAKE) clean cd $(DEP_DIR) && cd gcsa2 && $(MAKE) clean cd $(DEP_DIR) && cd gbwt && $(MAKE) clean + cd $(DEP_DIR) && cd gbwtgraph && $(MAKE) clean + cd $(DEP_DIR) && cd kff-cpp-api && rm -Rf build cd $(DEP_DIR) && cd gssw && $(MAKE) clean cd $(DEP_DIR) && cd ssw && cd src && $(MAKE) clean cd $(DEP_DIR) && cd progress_bar && $(MAKE) clean cd $(DEP_DIR) && cd sdsl-lite && ./uninstall.sh || true cd $(DEP_DIR) && cd libVCFH && $(MAKE) clean cd $(DEP_DIR) && cd vcflib && $(MAKE) clean - cd $(DEP_DIR) && cd vcflib && cd fastahack && $(MAKE) clean - cd $(DEP_DIR) && cd vcflib && cd fsom && $(MAKE) clean - cd $(DEP_DIR) && cd vcflib && cd libVCFH && $(MAKE) clean - cd $(DEP_DIR) && cd vcflib && cd smithwaterman && $(MAKE) clean - cd $(DEP_DIR) && cd vcflib && cd test && $(MAKE) clean - cd $(DEP_DIR) && cd vcflib && cd filevercmp && $(MAKE) clean - cd $(DEP_DIR) && cd vcflib && cd intervaltree && $(MAKE) clean - cd $(DEP_DIR) && cd vcflib && cd multichoose && $(MAKE) clean - cd $(DEP_DIR) && cd vcflib && cd tabixpp && $(MAKE) clean - cd $(DEP_DIR) && cd gfakluge && $(MAKE) clean cd $(DEP_DIR) && cd sha1 && $(MAKE) clean cd $(DEP_DIR) && cd structures && $(MAKE) clean - cd $(DEP_DIR) && cd gperftools && $(MAKE) clean - cd $(DEP_DIR) && cd vowpal_wabbit && $(MAKE) clean + cd $(DEP_DIR) && cd jemalloc && $(MAKE) clean || true cd $(DEP_DIR) && cd sublinear-Li-Stephens && $(MAKE) clean - rm -Rf $(RAPTOR_DIR)/build/* - ## TODO vg source code - ## TODO LRU_CACHE - ## TODO bash-tap - -clean-rocksdb: - cd $(DEP_DIR) && cd rocksdb && $(MAKE) clean - rm -f $(LIB_DIR)/librocksdb.a - rm -rf 
$(INC_DIR)/rocksdb/ - -clean-protobuf: - cd $(DEP_DIR) && cd protobuf && $(MAKE) clean - rm -f $(LIB_DIR)/libprotobuf.a - rm -rf $(INC_DIR)/google/protobuf/ + cd $(DEP_DIR) && cd libhandlegraph && rm -Rf build CMakeCache.txt CMakeFiles + cd $(DEP_DIR) && cd libvgio && rm -Rf build CMakeCache.txt CMakeFiles + cd $(DEP_DIR) && cd raptor && cd build && find . -not \( -name '.gitignore' -or -name 'pkg.m4' \) -delete + # lru_cache is never built because it is header-only + # bash-tap is never built either clean-vcflib: cd $(DEP_DIR) && cd vcflib && $(MAKE) clean diff --git a/README.md b/README.md index 8f508399690..3ce235cd8b6 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,12 @@ + # vg -[![Join the chat at https://gitter.im/vgteam/vg](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/vgteam/vg?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) [![Build Status](https://travis-ci.org/vgteam/vg.svg?branch=master)](https://travis-ci.org/vgteam/vg) [![Performance Report](https://img.shields.io/badge/performance-report-brightgreen.svg)](http://cgl-pipeline-inputs.s3.amazonaws.com/vg_cgl/vg_ci/jenkins_reports/branch/master/index.html) [![Stories in Ready](https://badge.waffle.io/vgteam/vg.png?label=ready&title=Ready)](https://waffle.io/vgteam/vg) +[![Join the chat at https://gitter.im/vgteam/vg](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/vgteam/vg?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) [![Latest Release](https://img.shields.io/github/release/vgteam/vg.svg)](https://github.com/vgteam/vg/releases/latest) [![Doxygen API Documentation](https://img.shields.io/badge/doxygen-docs-brightgreen.svg)](https://vgteam.github.io/vg/) ## variation graph data structures, interchange formats, alignment, genotyping, and variant calling methods -![Variation graph](https://raw.githubusercontent.com/vgteam/vg/master/doc/figures/vg_logo.png) +![Variation graph](https://raw.githubusercontent.com/vgteam/vg/master/doc/figures/vg_logo_small.png) _Variation graphs_ provide a succinct encoding of the sequences of many genomes. A variation graph (in particular as implemented in vg) is composed of: @@ -13,37 +14,70 @@ _Variation graphs_ provide a succinct encoding of the sequences of many genomes. * _edges_, which connect two nodes via either of their respective ends * _paths_, describe genomes, sequence alignments, and annotations (such as gene models and transcripts) as walks through nodes connected by edges -This model is similar to a number of sequence graphs that have been used in assembly and multiple sequence alignment. Paths provide coordinate systems relative to genomes encoded in the graph, allowing stable mappings to be produced even if the structure of the graph is changed. +This model is similar to sequence graphs that have been used in assembly and multiple sequence alignment. + +Paths provide coordinate systems relative to genomes encoded in the graph, allowing stable mappings to be produced even if the structure of the graph is changed. +The variation graph model makes this embedding explicit and essential. +Tools in vg maintain paths as immutable during transformations of the graph. +They use paths to project graph-relative data into reference-relative coordinate spaces. +Paths provide stable coordinates for graphs built in different ways from the same input sequences. 
![example variation graph](https://raw.githubusercontent.com/vgteam/vg/master/doc/figures/smallgraph.png) -## Usage +## Support + +We maintain a support forum on biostars: https://www.biostars.org/tag/vg/ + +## Installation + +### Download Releases + +The easiest way to get vg is to download one of our release builds for Linux. We have a 6-week release cadence, so our builds are never too far out of date. + +**[![Download Button](doc/figures/download-linux.png)](https://github.com/vgteam/vg/releases/latest)** +**[Download the latest vg release for Linux](https://github.com/vgteam/vg/releases/latest)** + +**For MacOS**, see [Building on MacOS](#building-on-macos). ### Building on Linux +If you don't want to or can't use a pre-built release of vg, or if you want to become a vg developer, you can build it from source instead. + First, obtain the repo and its submodules: git clone --recursive https://github.com/vgteam/vg.git cd vg -Then, install VG's dependencies. You'll need the protobuf and jansson development libraries installed, and to run the tests you will need `jq`, `bc` and `rs`. On Ubuntu, you should be able to do: +Then, install VG's dependencies. You'll need the protobuf and jansson development libraries installed, and to run the tests you will need: + * `jq`, `bc`, `rs`, and `parallel` + * `hexdump` and `column` from `bsdmainutils` + * [`npm` for testing documentation examples](https://github.com/anko/txm). +On Ubuntu, you should be able to do: make get-deps On other distros, you will need to perform the equivalent of: sudo apt-get install build-essential git cmake pkg-config libncurses-dev libbz2-dev \ - protobuf-compiler libprotoc-dev libjansson-dev automake libtool \ - jq bc rs curl unzip redland-utils librdf-dev bison flex gawk \ - lzma-dev liblzma-dev liblz4-dev libffi-dev + protobuf-compiler libprotoc-dev libprotobuf-dev libjansson-dev \ + automake gettext autopoint libtool jq bsdmainutils bc rs parallel \ + npm curl unzip redland-utils librdf-dev bison flex gawk lzma-dev \ + liblzma-dev liblz4-dev libffi-dev libcairo-dev libboost-all-dev \ + libzstd-dev pybind11-dev python3-pybind11 + +Note that **Ubuntu 16.04** does not ship a sufficiently new Protobuf; vg requires **Protobuf 3** which will have to be manually installed. -At present, you will need GCC version 4.9 or greater to compile vg. (Check your version with `gcc --version`.) +At present, you will need GCC version 4.9 or greater, with support for C++14, to compile vg. (Check your version with `gcc --version`.) GCC up to 11.2.0 is supported. Other libraries may be required. Please report any build difficulties. -Note that a 64-bit OS is required. Ubuntu 16.04 should work. You will also need a CPU that supports SSE 4.2 to run VG; you can check this with `cat /proc/cpuinfo | grep sse4_2`. +Note that a 64-bit OS is required. Ubuntu 20.04 should work. -When you are ready, build with `. ./source_me.sh && make static`, and run with `./bin/vg`. +When you are ready, build with `. ./source_me.sh && make`, and run with `./bin/vg`. + +Note that vg can take anywhere from 10 minutes to more than an hour to compile depending on your machine and the number of threads used. + +You can also produce a static binary with `make static`, assuming you have static versions of all the dependencies installed on your system.
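As a concrete sketch of the above (the `-j` flag and `nproc` are ordinary GNU make and coreutils features, not anything vg-specific, and the job count is just an illustration), a parallel build might look like this:

```sh
# Build vg with one job per CPU core, then sanity-check the resulting binary.
. ./source_me.sh && make -j$(nproc)
./bin/vg version

# Or, if static versions of all dependencies are installed, build a static binary instead.
. ./source_me.sh && make -j$(nproc) static
```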
### Building on MacOS @@ -62,7 +96,7 @@ VG depends on a number of packages being installed on the system where it is bei You can use MacPorts to install VG's dependencies: - sudo port install libtool jansson jq cmake pkgconfig autoconf automake libtool coreutils samtools redland bison gperftools md5sha1sum rasqal gmake autogen cairo libomp + sudo port install libtool protobuf3-cpp jansson jq cmake pkgconfig autoconf automake libtool coreutils samtools redland bison gperftools md5sha1sum rasqal gmake autogen cairo libomp boost zstd pybind11 ##### Using Homebrew @@ -72,65 +106,94 @@ Homebrew provides another package management solution for OSX, and may be prefer # Install all the dependencies in the Brewfile brew bundle - # Use GNU versions of coreutils over Apple versions - export PATH="/usr/local/opt/coreutils/libexec/gnubin:/usr/local/bin:$PATH" - - # Force use of new version of bison - brew link bison --force - - # Use glibtool/ize - export LIBTOOL=glibtool - export LIBTOOLIZE=glibtoolize - - # Use installed libraries - export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH; - export LIBRARY_PATH=$LD_LIBRARY_PATH; +#### Build -#### (Optional) Install GNU GCC +With dependencies installed, VG can now be built: -While Apple's `clang` can build VG, the C++ standard library it uses doesn't support some parallel extensions, so a Clang-built VG will be slower. Better results can be achieved by building with GNU GCC >= 4.9 and its `libstdc++` standard library. + . ./source_me.sh && make + +**Note that static binaries cannot yet be built for Mac.** -With **MacPorts**, you can install GNU GCC like this: +Our team has successfully built vg on Mac with GCC versions 4.9, 5.3, 6, 7, and 7.3, as well as Clang 9.0. - sudo port install gcc7 clang-3.8 +#### Migrating to ARM Macs -To make GCC 7 the default compiler, run (use `none` instead of `mp-gcc7` to revert back): +The Mac platform is moving to ARM, with Apple's M1, M1 Pro, M1 Max, and subsequent chip designs. The vg codebase supports ARM on Mac as well as on Linux. **The normal installation instructions work on a factory-fresh ARM Mac**. - sudo port select gcc mp-gcc7 +However, it is easy to run into problems when **migrating a working vg build environment** or **migrating Macports or Homebrew** from x86_64 to ARM. The ARM machine can successfully run x86_64 tools installed via Macports or Homebrew on the old machine, but vg can only build properly on ARM if you are using ARM versions of the build tools, like `make` and CMake. -Some OSX users also need to have the MacPorts Clang assembler for building VG's dependencies (use `none` instead of `mp-clang-3.8` to revert back): +So, after migrating to an ARM Mac using e.g. Apple's migration tools: - sudo port select clang mp-clang-3.8 +1. Uninstall Macports and its packages, if they were migrated from the old machine. Only an ARM Macports install can be used to provide dependencies for vg on ARM. +2. Uninstall Homebrew and its packages, if they were migrated. Similarly, only an ARM Homebrew install will work. +3. Reinstall one of Macports or Homebrew. Make sure to use the M1 or ARM version. +4. Use the package manager you installed to install system dependencies of vg, such as CMake, [as documented above](#install-dependencies). +5. Clean vg with `make clean`. This *should* remove all build artefacts. +6. Build vg again with `make`. 
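To double-check the migration before rebuilding, a quick sanity check along these lines can help (this uses only plain `uname`, `file`, and `command -v`, nothing vg-specific; treat it as a sketch rather than an official procedure):

```sh
# All of these should report arm64; x86_64 output means a migrated toolchain is still on your PATH.
uname -m
file "$(command -v make)" "$(command -v cmake)" "$(command -v g++)"

# Then rebuild vg from a clean state.
make clean && make
```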
-With **Homebrew**, you can install GNU GCC for VG like this: - brew install gcc6 - # Manually create symlinks to make Homebrew GCC 6 the default gcc and g++ - ln -s gcc-6 /usr/local/bin/gcc - ln -s g++-6 /usr/local/bin/g++ - -#### Build +If you still experience build problems after this, delete the whole checkout and check out the code again; `make clean` is not under CI test and is not always up to date with the rest of the build system. -With dependencies and compilers installed, VG can now be built: +Whether or not that helps, please then [open an issue](https://github.com/vgteam/vg/issues/new) so we can help fix the build or fix `make clean`. - . ./source_me.sh && make - -**Note that static binaries cannot yet be built for Mac.** +## Usage -Our team has successfully built vg on Mac with GCC versions 4.9, 5.3, 6, 7, and 7.3, as well as Clang 9.0. +### Variation graph construction -### Variation graph construction +#### From VCF -The simplest thing to do with `vg` is to build a graph and align to it. At present, you'll want to use a reference and VCF file to do so. If you're working in the `test/` directory: +> **Note** +> See the `vg autoindex` examples below for how to use that tool in place of `vg construct` to build and index graphs in a single step. + +One way to build a graph with `vg` is to `construct` it from variant calls using a reference FASTA file and VCF file. If you're working in vg's `test/` directory: + ```sh vg construct -r small/x.fa -v small/x.vcf.gz >x.vg ``` -### Viewing, conversion +Note that to build a graph, an index of the VCF file is required. The VCF index file can be generated using the `tabix` command provided by SAMtools (e.g. `tabix -p vcf x.vcf.gz` on the command line). + +#### From Assemblies + +You can also build a graph (and indexes for mapping with vg) from a set of genome assemblies (FASTA), as opposed to variant calls as described above, using [Minigraph-Cactus](https://github.com/ComparativeGenomicsToolkit/cactus/blob/master/doc/pangenome.md). + +### Importing and exporting different graph formats + +`vg` supports [many formats](https://github.com/vgteam/vg/wiki/File-Formats); the three most important are: + +* `PackedGraph (.vg)` : This is `vg`'s native format. It supports edits of all kinds (to topology and paths), but can be inefficient at large scales, especially with many paths. +* `GFA (.gfa)` : [GFA](https://github.com/GFA-spec/GFA-spec) is a standard text-based format and usually the best way to exchange graphs between `vg` and other pangenome tools. `vg` can also operate on (**uncompressed**) GFA files directly, by way of using a `PackedGraph` representation in memory (and therefore shares that format's scaling concerns and edit-ability). +* `GBZ (.gbz)` : [GBZ](https://github.com/jltsiren/gbwtgraph/blob/master/SERIALIZATION.md) is a highly-compressed format that uses much less space to store paths than the above formats, but at the cost of not allowing general edits to the graph. + +You can query the format of any graph using `vg stats -F`. + +#### Importing + +In general, you will build and index `vg` graphs using `vg autoindex` (from GFA or VCF) or `Minigraph-Cactus` (FASTAs). You can also import `GFA` files from other tools such as [ODGI](https://github.com/pangenome/odgi) and [PGGB](https://github.com/pangenome/pggb) using `vg convert -g`. + +#### Exporting + +You can convert any graph to `GFA` using `vg convert -f`.
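As a quick illustration of the import and export paths just described (the file names here are placeholders), the same `vg convert` and `vg stats` flags can round-trip a graph between GFA and vg's native format:

```sh
# Import a GFA file (for example from ODGI or PGGB) into vg's native format.
vg convert -g graph.gfa > graph.vg

# Check which format a graph file is in.
vg stats -F graph.vg

# Export the graph back out as GFA.
vg convert -f graph.vg > graph.out.gfa
```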
By default, `vg` uses [GFA v1.1](https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md#w-walk-line-since-v11) where paths are represented as W-lines. To use P-lines instead (GFA v1.0), use `vg convert -fW`. + +#### Path Types + +The `GBZ` format makes the distinction between `REFERENCE` and `HAPLOTYPE` paths. `REFERENCE` paths can be used as coordinate systems but are more expensive to store. `HAPLOTYPE` paths are highly compressed but cannot be used for position lookups. In the [HPRC](https://github.com/human-pangenomics/hpp_pangenome_resources/) graphs for example, contigs from `GRCh38` and `CHM13(T2T)` are `REFERENCE` paths and paths from all other samples are `HAPLOTYPE` paths. + +The distinction between `REFERENCE` and `HAPLOTYPE` paths is carried over into the other formats such as `.vg` and `.gfa` to facilitate conversion and inter-operation. In `.gfa`, `REFERENCE` paths are P-Lines, or W-lines whose sample names are flagged in the header. W-lines whose names are not flagged in the header are `HAPLOTYPE` paths. In `.vg` they are denoted using a naming convention. + +See the [Path Metadata WIKI](https://github.com/vgteam/vg/wiki/Path-Metadata-Model) for more details. + +> **Warning** +> `GBZ` is the only format that supports efficiently loading large numbers of `HAPLOTYPE` paths in `vg`. You may run into issues trying to load whole-genome graphs with thousands of `HAPLOTYPE` paths from `.vg` or `.gfa` files. `vg convert -H` can be used to drop `HAPLOTYPE` paths, allowing the graph to be more easily loaded in other formats. + +### Viewing + +> **Note** +> It is best to use the newer `vg convert` tool (described above) for GFA conversion. `vg view` provides a way to convert the graph into various formats: + ```sh # GFA output vg view x.vg >x.gfa @@ -138,101 +201,269 @@ vg view x.vg >x.gfa # dot output suitable for graphviz vg view -d x.vg >x.dot +# And if you have a GAM file +cp small/x-s1337-n1.gam x.gam + # json version of binary alignments vg view -a x.gam >x.json ``` -### Alignment +### Mapping -As this is a small graph, you could align to it using a full-length partial order alignment: +If you have more than one sequence, or you are working on a large graph, you will want to map rather than merely align. -```sh -vg align -s CTACTGACAGCAGAAGTTTGCTGTGAAGATTAAATTAGGTGATGCTTG x.vg -``` +There are multiple read mappers in `vg`: -Note that you don't have to store the graph on disk at all, you can simply pipe it into the local aligner: +* `vg giraffe` is designed to be fast for highly accurate short reads, against graphs with haplotype information. +* `vg map` is a general-purpose read mapper. +* `vg mpmap` does "multi-path" mapping, to allow describing local alignment uncertainty. [This is useful for transcriptomics.](#Transcriptomic-analysis) +#### Mapping with `vg giraffe` + +To use `vg giraffe` to map reads, you will first need to prepare indexes. This is best done using `vg autoindex`. In order to get `vg autoindex` to use haplotype information from a VCF file, you can give it the VCF and the associated linear reference directly.
+ + ```sh +# construct the graph and indexes (paths below assume running from `vg/test` directory) +vg autoindex --workflow giraffe -r small/x.fa -v small/x.vcf.gz -p x + +# simulate a bunch of 150bp reads from the graph, into a GAM file of reads aligned to a graph +vg sim -n 1000 -l 150 -x x.giraffe.gbz -a > x.sim.gam +# now re-map these reads against the graph, and get BAM output in linear space +# FASTQ input uses -f instead of -G. +vg giraffe -Z x.giraffe.gbz -G x.sim.gam -o BAM > aln.bam ``` +[More information on using `vg giraffe` can be found on the `vg` wiki.](https://github.com/vgteam/vg/wiki/Mapping-short-reads-with-Giraffe) -### Mapping +#### Mapping with `vg map` If your graph is large, you want to use `vg index` to store the graph and `vg map` to align reads. `vg map` implements a kmer-based seed-and-extend alignment model that is similar to that used in aligners like novoalign or MOSAIK. First an on-disk index is built with `vg index` which includes the graph itself and kmers of a particular size. When mapping, any kmer size shorter than that used in the index can be employed, and by default the mapper will decrease the kmer size to increase sensitivity when alignment at a particular _k_ fails. + ```sh -# construct the graph -vg construct -r small/x.fa -v small/x.vcf.gz >x.vg +# construct the graph (paths below assume running from `vg/test` directory) +vg construct -r small/x.fa -v small/x.vcf.gz > x.vg # store the graph in the xg/gcsa index pair vg index -x x.xg -g x.gcsa -k 16 x.vg # align a read to the indexed version of the graph # note that the graph file is not opened, but x.vg.index is assumed -vg map -s CTACTGACAGCAGAAGTTTGCTGTGAAGATTAAATTAGGTGATGCTTG -x x.xg -g x.gcsa >read.gam +vg map -s CTACTGACAGCAGAAGTTTGCTGTGAAGATTAAATTAGGTGATGCTTG -x x.xg -g x.gcsa > read.gam -# simulate a bunch of 150bp reads from the graph and map them -vg map -r <(vg sim -n 1000 -l 150 -x x.xg ) -x x.xg -g x.gcsa >aln.gam +# simulate a bunch of 150bp reads from the graph, one per line +vg sim -n 1000 -l 150 -x x.xg > x.sim.txt +# now map these reads against the graph to get a GAM +vg map -T x.sim.txt -x x.xg -g x.gcsa > aln.gam # surject the alignments back into the reference space of sequence "x", yielding a BAM file -vg surject -x x.xg -b aln.gam >aln.bam +vg surject -x x.xg -b aln.gam > aln.bam # or alternatively, surject them to BAM in the call to map -vg map -r <(vg sim -n 1000 -l 150 -x x.xg ) -x x.xg -g x.gcsa --surject-to bam >aln.bam +vg sim -n 1000 -l 150 -x x.xg > x.sim.txt +vg map -T x.sim.txt -x x.xg -g x.gcsa --surject-to bam > aln.bam ``` + +### Augmentation + +Variation from alignments can be embedded back into the graph. This process is called augmentation and can be used for *de novo* variant calling, for example (see below). + +> **Warning** +> Using `vg augment` for variant calling remains very experimental. It is not at all recommended for structural variant calling, and even for small variants, you will often get much more accurate results (at least on human) by projecting your alignment to `BAM` and running a linear variant caller such as DeepVariant. + + +```sh +# augment the graph with all variation from the GAM except that implied by soft clips, saving to aug.vg.
aug.gam contains the same reads as aln.gam but mapped to aug.vg +vg augment x.vg aln.gam -A aug.gam > aug.vg + +# augment the graph with all variation from the GAM, saving each mapping as a path in the graph. +# softclips of alignment paths are preserved (`-S`). +# Note, this can be much less efficient than the above example if there are many alignments in the GAM +vg augment x.vg aln.gam -i -S > aug_with_paths.vg +``` + ### Variant Calling -The following example shows how to construct a VCF file from a read alignment and graph. Input must be split into chunks (see vg chunk) in order to run on whole genome. +> **Note** +> More information can be found in the [WIKI](https://github.com/vgteam/vg/wiki/SV-Genotyping-and-variant-calling). + +#### Calling variants using read support + +The following examples show how to generate a VCF with vg using read support. They depend on output from the Mapping and Augmentation examples above. Small variants and SVs can be called using the same approach. **Currently, it is more accurate for SVs**. + +Call only variants that are present in the graph: + +```sh +# Compute the read support from the gam +# -Q 5: ignore mapping and base quality < 5 +vg pack -x x.xg -g aln.gam -Q 5 -o aln.pack + +# Generate a VCF from the support. +vg call x.xg -k aln.pack > graph_calls.vcf +``` + +By default, `vg call` omits `0/0` variants and tries to normalize alleles to make the VCF more compact. Both of these steps can make it difficult to compare the outputs from different samples as the VCFs will have different coordinates even though they were created using the same graph. The `-a` option addresses this by calling every snarl using the same coordinates and including reference calls. Outputs for different samples can be combined with `bcftools merge -m all`. + +``` +vg call x.xg -k aln.pack -a > snarl_genotypes.vcf +``` + +In order to also consider *novel* variants from the reads, use the augmented graph and gam (as created in the "Augmentation" example using `vg augment -A`): + +> **Warning** +> Using `vg augment` for variant calling remains very experimental. It is not at all recommended for structural variant calling, and even for small variants, you will often get much more accurate results (at least on human) by projecting your alignment to `BAM` and running a linear variant caller such as DeepVariant. + + +```sh +# Index our augmented graph +vg index aug.vg -x aug.xg + +# Compute the read support from the augmented gam (ignoring quality < 5, and 1st and last 5bp of each read) +vg pack -x aug.xg -g aug.gam -Q 5 -s 5 -o aln_aug.pack + +# Generate a VCF from the support +vg call aug.xg -k aln_aug.pack > calls.vcf +``` + +A similar process can be used to *genotype* known variants from a VCF.
To do this, the graph must be constructed from the VCF with `vg construct -a` (graphs from other sources such as `vg autoindex` and `Minigraph-Cactus` cannot be used): + + +```sh +# Re-construct the same graph as before but with `-a` +vg construct -r small/x.fa -v small/x.vcf.gz -a > xa.vg + +# Index the graph with `-L` to preserve alt paths in the xg +vg index xa.vg -x xa.xg -L + +# Compute the support (we could also reuse aln.pack from above) +vg pack -x xa.xg -g aln.gam -o aln.pack + +# Genotype the VCF (use -v) +vg call xa.xg -k aln.pack -v small/x.vcf.gz > genotypes.vcf +``` + +Pre-filtering the GAM before computing support can improve precision of SNP calling: + + ```sh # filter secondary and ambiguous read mappings out of the gam -vg filter alignment.gam -r 0.90 -fu -s 2 -o 0 -D 999 -x graph.xg > filtered.gam +vg filter aln.gam -r 0.90 -fu -m 1 -q 15 -D 999 -x x.xg > aln.filtered.gam + +# then compute the support from aln.filtered.gam instead of aln.gam in above etc. +vg pack -x xa.xg -g aln.filtered.gam -o aln.pack +vg call xa.xg -k aln.pack -v small/x.vcf.gz > genotypes.vcf +``` + +For larger graphs, it is recommended to compute snarls separately: + + +```sh +vg snarls x.xg > x.snarls + +# load snarls from a file instead of computing on the fly +vg call x.xg -k aln.pack -r x.snarls > calls.vcf +``` + +Note: `vg augment`, `vg pack`, `vg call` and `vg snarls` can now all be run directly on any graph format (e.g. `.gbz`, `.gfa`, `.vg`, `.xg` (except `augment`) or anything output by `vg convert`). Operating on `.vg` or `.gfa` uses the most memory and is not recommended for large graphs. The output of `vg pack` can only be read in conjunction with the same graph used to create it, so `vg pack x.vg -g aln.gam -o x.pack` then `vg call x.xg -k x.pack` will not work. + +#### Calling variants from paths in the graph + +Infer variants from alignments implied by paths in the graph. This can be used, for example, to call SVs directly from a variation graph that was constructed from a multiple alignment of different assemblies: + + +```sh +# create a graph from a multiple alignment of HLA haplotypes (from vg/test directory) +vg msga -f GRCh38_alts/FASTA/HLA/V-352962.fa -t 1 -k 16 | vg mod -U 10 - | vg mod -c - > hla.vg + +# index it +vg index hla.vg -x hla.xg + +# generate a VCF using gi|568815592:29791752-29792749 as the reference contig. The other paths will be considered as haploid samples +vg deconstruct hla.xg -e -p "gi|568815592:29791752-29792749" > hla_variants.vcf +``` + +Variants can also be inferred strictly from topology by not using `-e`, though unlike the above example, cycles are not supported. "Deconstruct" the VCF variants that were used to construct the graph. The output will be similar but not identical to `small/x.vcf.gz` as `vg construct` can add edges between adjacent alts and/or do some normalization: + + +```sh +# using the same graph from the `map` example
vg deconstruct x.xg -p x > x.vcf +``` + +Haplotype paths from `.gbz` or `.gbwt` index input can be considered using `-z` and `-g`, respectively. + +As with `vg call`, it is best to compute snarls separately and pass them in with `-r` when working with large graphs.
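For example (reusing the `x` graph from the `map` example above, and assuming `-r` takes a pre-computed snarls file for `vg deconstruct` just as it does for `vg call`), that looks like:

```sh
# Pre-compute snarls once, then reuse them when deconstructing variants.
vg snarls x.xg > x.snarls
vg deconstruct x.xg -p x -r x.snarls > x.vcf
```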
+ +### Transcriptomic analysis + +`vg` has a number of tools to support transcriptomic analyses with spliced graphs (i.e. graphs that have annotated splice junctions added as edges into the graph). These edges can be added into an existing graph using `vg rna`. We can then perform splice-aware mapping to these graphs using `vg mpmap`. `vg` developers have also made a tool for haplotype-aware transcript quantification based on these tools in [`rpvg`](https://github.com/jonassibbesen/rpvg). The easiest way to start this pipeline is to use the `vg autoindex` subcommand to make indexes for `vg mpmap`. `vg autoindex` creates indexes for mapping from common interchange formats like FASTA, VCF, and GTF. + +More information is available in the [wiki page on transcriptomics](https://github.com/vgteam/vg/wiki/Transcriptomic-analyses). + +Working from the `test/` directory the following example shows how to create a spliced pangenome graph and indexes using `vg autoindex` with 4 threads: + + +```sh +# Create spliced pangenome graph and indexes for vg mpmap +vg autoindex --workflow mpmap -t 4 --prefix vg_rna --ref-fasta small/x.fa --vcf small/x.vcf.gz --tx-gff small/x.gtf +``` + +RNA-seq reads can be mapped to the spliced pangenome graph using `vg mpmap` with 4 threads: + + +```sh +# Map simulated RNA-seq reads using vg mpmap +vg mpmap -n rna -t 4 -x vg_rna.spliced.xg -g vg_rna.spliced.gcsa -d vg_rna.spliced.dist -f small/x_rna_1.fq -f small/x_rna_2.fq > mpmap.gamp +``` -# Make calls by thresholding based on read support for graph path SEQ -vg call aug_graph.vg -b graph.vg -s aug_graph.support -z aug_graph.trans -r SEQ > calls.vcf +This will produce alignments in the multipath format. For more information on the multipath alignment format and `vg mpmap` see [wiki page on mpmap](https://github.com/vgteam/vg/wiki/Multipath-alignments-and-vg-mpmap). Running the two commands on the small example data using 4 threads should on most machines take less than a minute. +### Alignment -# Or Make calls using a Freebayes-like genotyping algorithm for graph path SEQ -vg genotype graph.vg -G alignment.gam -E -v -r SEQ > calls.vcf +If you have a small graph, you can align a sequence to the whole graph, using a full-length partial order alignment: -# for comparison purposes, it's very useful to normalize the vcf output, especially for more complex graphs which can make large variant blocks that contain a lot of reference bases (Note: requires [vt](http://genome.sph.umich.edu/wiki/Vt)): -vt decompose_blocksub -a calls.vcf | vt normalize -r FASTA_FILE - > calls.clean.vcf + +```sh +vg align -s CTACTGACAGCAGAAGTTTGCTGTGAAGATTAAATTAGGTGATGCTTG x.vg +``` +Note that you don't have to store the graph on disk at all, you can simply pipe it into the local aligner: + + +```sh +vg construct -r small/x.fa -v small/x.vcf.gz | vg align -s CTACTGACAGCAGAAGTTTGCTGTGAAGATTAAATTAGGTGATGCTTG - ``` -To produce a VCF file for a whole chromosome, the graph must be cut up along the reference genome and called in chunks. `scripts/chunked_call` wraps this functionality to produce chromosome-sized VCFs in a single command line (from a GAM file and XG index) +Most commands allow the streaming of graphs into and out of `vg`. 
### Command line interface A variety of commands are available: +- *autoindex*: construct graphs and indexes for other tools from common interchange file formats - *construct*: graph construction -- *view*: conversion (dot/protobuf/json/GFA) - *index*: index features of the graph in a disk-backed key/value store -- *find*: use an index to find nodes, edges, kmers, or positions -- *paths*: traverse paths in the graph -- *align*: local alignment -- *map*: global alignment (kmer-driven) -- *stats*: metrics describing graph properties -- *join*: combine graphs (parallel) -- *concat*: combine graphs (serial) -- *ids*: id manipulation -- *kmers*: generate kmers from a graph +- *map*: map reads to a graph +- *giraffe*: fast, haplotype-based mapping of reads to a graph +- *mpmap*: short read mapping and multipath alignment (optionally spliced) +- *surject*: project graph alignments onto a linear reference +- *augment*: adds variation from aligned reads into the graph +- *call*: call variants from an augmented graph +- *rna*: construct splicing graphs and pantranscriptomes +- *convert*: convert graph and alignment formats +- *combine*: combine graphs +- *chunk*: extract or break into subgraphs +- *ids*: node ID manipulation - *sim*: simulate reads by walking paths in the graph -- *mod*: various transformations of the graph -- *surject*: force graph alignments into a linear reference space -- *msga*: construct a graph from an assembly of multiple sequences -- *validate*: determine if graph is valid - *filter*: filter reads out of an alignment -- *augment*: adds variation from aligned reads into the graph -- *call/genotype*: call variants from an augmented graph +- *prune*: prune graphs to restrict their path complexity +- *snarls*: find bubble-like motifs in a graph +- *mod*: various graph transformations - *filter*: filter reads out of an alignment +- *deconstruct*: create a VCF from variation in the graph +- *paths*: traverse paths in the graph +- *stats*: metrics describing graph properties ## Implementation notes @@ -241,3 +472,4 @@ A variety of commands are available: ## License MIT + diff --git a/contrib/proto2cpp/proto2cpp.py b/contrib/proto2cpp/proto2cpp.py index 137818c789f..e41a2d443df 100755 --- a/contrib/proto2cpp/proto2cpp.py +++ b/contrib/proto2cpp/proto2cpp.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python3 ## # Doxygen filter for Google Protocol Buffers .proto files. # This script converts .proto files into C++ style ones @@ -16,7 +16,7 @@ # 4. In the Doxygen configuration file, find EXTENSION_MAPPING and add proto=C # EXTENSION_MAPPING = proto=C # 5. In the Doxygen configuration file, find INPUT_FILTER and add this script -# INPUT_FILTER = "python proto2cpp.py" +# INPUT_FILTER = "python3 proto2cpp.py" # 6.
Run Doxygen with the modified configuration # doxygen doxyfile # diff --git a/deps/BBHash b/deps/BBHash new file mode 160000 index 00000000000..36e4fe3eaee --- /dev/null +++ b/deps/BBHash @@ -0,0 +1 @@ +Subproject commit 36e4fe3eaeef762c831c49cdc01f1a3a2c7a97a4 diff --git a/deps/DYNAMIC b/deps/DYNAMIC index 2a82bbfda15..1ca3150af76 160000 --- a/deps/DYNAMIC +++ b/deps/DYNAMIC @@ -1 +1 @@ -Subproject commit 2a82bbfda156ea38c1abc0078820277f492ef615 +Subproject commit 1ca3150af76ce499959c1eef67faee5fab457af3 diff --git a/deps/FlameGraph b/deps/FlameGraph new file mode 160000 index 00000000000..1b1c6deede9 --- /dev/null +++ b/deps/FlameGraph @@ -0,0 +1 @@ +Subproject commit 1b1c6deede9c33c5134c920bdb7a44cc5528e9a7 diff --git a/deps/atomic_queue b/deps/atomic_queue new file mode 160000 index 00000000000..d9d66b6d20d --- /dev/null +++ b/deps/atomic_queue @@ -0,0 +1 @@ +Subproject commit d9d66b6d20d74042da481ed5504fa81c0d79c8ae diff --git a/deps/boost-subset b/deps/boost-subset deleted file mode 160000 index 53ee8d82dbd..00000000000 --- a/deps/boost-subset +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 53ee8d82dbd2ae54825b49606d8a24d9d45bb142 diff --git a/deps/dozeu b/deps/dozeu index 9589a1e7612..17e38965380 160000 --- a/deps/dozeu +++ b/deps/dozeu @@ -1 +1 @@ -Subproject commit 9589a1e76125a299ea13675908630160221d0f64 +Subproject commit 17e3896538058d64a3528e2684267007afeef32e diff --git a/deps/elfutils b/deps/elfutils index 4407903a743..983e86fd89e 160000 --- a/deps/elfutils +++ b/deps/elfutils @@ -1 +1 @@ -Subproject commit 4407903a7433327dbeda74519d036fc82ef64de3 +Subproject commit 983e86fd89e8bf02f2d27ba5dce5bf078af4ceda diff --git a/deps/fastahack b/deps/fastahack index b5d42afcb13..75f12d25df9 160000 --- a/deps/fastahack +++ b/deps/fastahack @@ -1 +1 @@ -Subproject commit b5d42afcb1356e0ee15ef601614184928f5ed65e +Subproject commit 75f12d25df9416b9d49b84c70dcc58406afce11a diff --git a/deps/fermi-lite b/deps/fermi-lite index cde7871d5a6..243c9330fdb 160000 --- a/deps/fermi-lite +++ b/deps/fermi-lite @@ -1 +1 @@ -Subproject commit cde7871d5a6658be61a7c3995a56ac21fbacf4f4 +Subproject commit 243c9330fdb2dc5e0bbb5e441ba3335678ec7136 diff --git a/deps/gbwt b/deps/gbwt index 59c22e2cdcb..15ca8a18bbe 160000 --- a/deps/gbwt +++ b/deps/gbwt @@ -1 +1 @@ -Subproject commit 59c22e2cdcb7196f4816138f30a2401af0833df7 +Subproject commit 15ca8a18bbe00cbc51bc80842daac8ecd3fe07a7 diff --git a/deps/gbwtgraph b/deps/gbwtgraph new file mode 160000 index 00000000000..eff446aeb2d --- /dev/null +++ b/deps/gbwtgraph @@ -0,0 +1 @@ +Subproject commit eff446aeb2da0fed1ae1bd2ef01e577354c27fca diff --git a/deps/gcsa2 b/deps/gcsa2 index 5c41acf5b80..d246cfd5dbe 160000 --- a/deps/gcsa2 +++ b/deps/gcsa2 @@ -1 +1 @@ -Subproject commit 5c41acf5b805273b96f0c61404db8074e18a3e89 +Subproject commit d246cfd5dbe0d45546f28213c0245aa308bdb462 diff --git a/deps/gfakluge b/deps/gfakluge deleted file mode 160000 index 4c2fd088911..00000000000 --- a/deps/gfakluge +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 4c2fd0889117a5767ca39bb129be268bf511504c diff --git a/deps/gperftools b/deps/gperftools deleted file mode 160000 index 855b3800064..00000000000 --- a/deps/gperftools +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 855b3800064db49af823b85a54be269923eb6f4d diff --git a/deps/gssw b/deps/gssw index b3e029fadad..14b4d43736b 160000 --- a/deps/gssw +++ b/deps/gssw @@ -1 +1 @@ -Subproject commit b3e029fadadef52e667f998032a0bf8d45ca6af6 +Subproject commit 14b4d43736bb606c3fc97c4724d1959d13550d37 diff --git a/deps/htslib b/deps/htslib index 
379d44acbb0..bd133acf514 160000 --- a/deps/htslib +++ b/deps/htslib @@ -1 +1 @@ -Subproject commit 379d44acbb02b4a1c1950740d219b0b63aa4d7b8 +Subproject commit bd133acf51498431a5c7dfd8aa06ce17ec6d3b96 diff --git a/deps/ips4o b/deps/ips4o new file mode 160000 index 00000000000..22069381cc1 --- /dev/null +++ b/deps/ips4o @@ -0,0 +1 @@ +Subproject commit 22069381cc1bf2df07ee1ff47f6b6073fcfb4508 diff --git a/deps/jemalloc b/deps/jemalloc new file mode 160000 index 00000000000..041145c2727 --- /dev/null +++ b/deps/jemalloc @@ -0,0 +1 @@ +Subproject commit 041145c272711b55f91aa42128b108674a12fd91 diff --git a/deps/kff-cpp-api b/deps/kff-cpp-api new file mode 160000 index 00000000000..d12fe041838 --- /dev/null +++ b/deps/kff-cpp-api @@ -0,0 +1 @@ +Subproject commit d12fe0418387933bcc928805e18ea00125577525 diff --git a/deps/libVCFH b/deps/libVCFH index 7ee7a250ce0..615a06b5de2 160000 --- a/deps/libVCFH +++ b/deps/libVCFH @@ -1 +1 @@ -Subproject commit 7ee7a250ce009d29f20aca7a2e2ef24bb7825a0f +Subproject commit 615a06b5de21f59294d7d806e9f505162ae8c2c1 diff --git a/deps/libbdsg b/deps/libbdsg new file mode 160000 index 00000000000..8e47864fbf6 --- /dev/null +++ b/deps/libbdsg @@ -0,0 +1 @@ +Subproject commit 8e47864fbf6513d54810b1d16df38b1b4cffcdfe diff --git a/deps/libhandlegraph b/deps/libhandlegraph new file mode 160000 index 00000000000..0b519b72bec --- /dev/null +++ b/deps/libhandlegraph @@ -0,0 +1 @@ +Subproject commit 0b519b72becbeb8f56f0e3478a1aef54fa241106 diff --git a/deps/libvgio b/deps/libvgio new file mode 160000 index 00000000000..1bc5ce5fc9b --- /dev/null +++ b/deps/libvgio @@ -0,0 +1 @@ +Subproject commit 1bc5ce5fc9b1c2938ad937b29962f1e4220fdb9b diff --git a/deps/lru_cache b/deps/lru_cache index e5fb7f3b15e..98c5eacfb2b 160000 --- a/deps/lru_cache +++ b/deps/lru_cache @@ -1 +1 @@ -Subproject commit e5fb7f3b15ec49edc3fc6293797b44cc7ce420cc +Subproject commit 98c5eacfb2bf60debd2f9c3a3faf814e8821458c diff --git a/deps/mio b/deps/mio new file mode 160000 index 00000000000..3f86a95c078 --- /dev/null +++ b/deps/mio @@ -0,0 +1 @@ +Subproject commit 3f86a95c0784d73ce6815237ec33ed25f233b643 diff --git a/deps/mmmultimap b/deps/mmmultimap new file mode 160000 index 00000000000..d8fb6c995d3 --- /dev/null +++ b/deps/mmmultimap @@ -0,0 +1 @@ +Subproject commit d8fb6c995d3d9dc5e5844c5f72d1e7d32163fd0a diff --git a/deps/pinchesAndCacti b/deps/pinchesAndCacti index 87b93048b4d..52df64cc1e2 160000 --- a/deps/pinchesAndCacti +++ b/deps/pinchesAndCacti @@ -1 +1 @@ -Subproject commit 87b93048b4deb0c77bdf0a36c95dff70b557f81e +Subproject commit 52df64cc1e2ad2633cc9192381c5a8c9021e772a diff --git a/deps/protobuf b/deps/protobuf deleted file mode 160000 index 174c82d8cef..00000000000 --- a/deps/protobuf +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 174c82d8cef27be5cb9d8491dd1e26d27898870b diff --git a/deps/raptor b/deps/raptor index 453ff1676b6..17fe084ee96 160000 --- a/deps/raptor +++ b/deps/raptor @@ -1 +1 @@ -Subproject commit 453ff1676b6749c890997e59026b88abc0b8010d +Subproject commit 17fe084ee96def426beb62bac9c6fb4d8f741c92 diff --git a/deps/rocksdb b/deps/rocksdb deleted file mode 160000 index acef93ed65f..00000000000 --- a/deps/rocksdb +++ /dev/null @@ -1 +0,0 @@ -Subproject commit acef93ed65f0cab701565a5d21edc1dea894130c diff --git a/deps/sdsl-lite b/deps/sdsl-lite index 8f5245ff17f..863d0118cf3 160000 --- a/deps/sdsl-lite +++ b/deps/sdsl-lite @@ -1 +1 @@ -Subproject commit 8f5245ff17f0a8799f4bc6c650a0db527403e4c1 +Subproject commit 863d0118cf303f9c9c55576e0f6b2f70ecd9689a diff --git a/deps/sha1 
b/deps/sha1 index 6474be99be6..011accb73af 160000 --- a/deps/sha1 +++ b/deps/sha1 @@ -1 +1 @@ -Subproject commit 6474be99be64bc9d930e9f30d89d71119f78f699 +Subproject commit 011accb73af23c2703bd6801cd4285728c264d94 diff --git a/deps/sonLib b/deps/sonLib index dc477fe27f7..63b0301bce3 160000 --- a/deps/sonLib +++ b/deps/sonLib @@ -1 +1 @@ -Subproject commit dc477fe27f7524c10c40d2a8e713624f1b30baa6 +Subproject commit 63b0301bce3861f247684c804fd204c452b8faf2 diff --git a/deps/ssw b/deps/ssw index d27667cbb93..f53d459841b 160000 --- a/deps/ssw +++ b/deps/ssw @@ -1 +1 @@ -Subproject commit d27667cbb93bab73de3c31bf0a265fd8e8632e72 +Subproject commit f53d459841b632ac6ab0e59e1a0ef6d056034631 diff --git a/deps/structures b/deps/structures index b490e84b7ca..d9e184da874 160000 --- a/deps/structures +++ b/deps/structures @@ -1 +1 @@ -Subproject commit b490e84b7ca291592cac3c7ee8a994898ce9d113 +Subproject commit d9e184da87492535c5e69d60706ee02a7eb2e82e diff --git a/deps/tabixpp b/deps/tabixpp new file mode 160000 index 00000000000..ae5cdf846af --- /dev/null +++ b/deps/tabixpp @@ -0,0 +1 @@ +Subproject commit ae5cdf846af85bd1d0e310c05e5c67b037f51a25 diff --git a/deps/vcflib b/deps/vcflib index 1ba62e5ee8f..b5287d1c2db 160000 --- a/deps/vcflib +++ b/deps/vcflib @@ -1 +1 @@ -Subproject commit 1ba62e5ee8fdd3346a097c187c2235798767c452 +Subproject commit b5287d1c2dbbfe53fc7af8f88ff4aa1208d5dd49 diff --git a/deps/vowpal_wabbit b/deps/vowpal_wabbit deleted file mode 160000 index 0105bbc11ba..00000000000 --- a/deps/vowpal_wabbit +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 0105bbc11baf24db25490b0a9a808b18a8a846e3 diff --git a/deps/xg b/deps/xg new file mode 160000 index 00000000000..f2e565d9c5b --- /dev/null +++ b/deps/xg @@ -0,0 +1 @@ +Subproject commit f2e565d9c5bd8a0bde3da69e365b8fc29d2bf7b7 diff --git a/doc/asciidoc/man/vg-giraffe.adoc b/doc/asciidoc/man/vg-giraffe.adoc new file mode 100644 index 00000000000..05d538c3192 --- /dev/null +++ b/doc/asciidoc/man/vg-giraffe.adoc @@ -0,0 +1,203 @@ += vg-giraffe(1) +vgteam contributors +v1.20.0 +:doctype: manpage +:manmanual: vg +:mansource: vg +:man-linkstyle: pass:[blue R < >] + +== Name + +vg-giraffe - map unpaired short reads using minimizers and gapless extension + +== Synopsis + +*vg giraffe* [_OPTION_]... [_FASTA_ [_VCF_]] > output.gam + +== Arguments + +_FASTA_:: + Specify a FASTA file to build the graph from. Must have an extension *.fa*, *.fna*, or *.fasta*, with optional *.gz*. The name without extension is used as the _basename_ under which to look for index files with their own extensions, if *-x*/*--xg-name* is not specified. If omitted, *-x*/*--xg-name* is required. + +_VCF_:: + Variant Call Format file containing phased haplotypes, used to build the graph and haplotype database (GBWT) if those are not themselves provided. Must have a *.vcf.gz* extension, and an associated *.vcf.gz.tbi* index file. If omitted, a graph and GBWT must already exist and must be provided, either explicitly with *-x*/*--xg-name* and *-H*/*--gbwt-name*, or via the _FASTA_ or *-x*/*--xg-name* derived _basename_. + +_TAG_:: + Specify a collection of tests to run, via []-enclosed tag. Tag may need to be quoted to avoid being interpreted as a shell wildcard character class. + +== Options + +*-x*:: +*--xg-name*=_FILE_:: + Use this xg index or graph. The file name without extension is also used as the _basename_ for finding indexes, overriding any FASTA-derived _basename_. If omitted, _FASTA_ is required. 
If not specified, will load _basename.vg_ and create that file if not present. + +*-g*:: +*--graph-name*=_FILE_:: + Load this GBWTGraph. If not specified, will load _basename.gg_ and create that file if not present. + +*-H*:: +*--gbwt-name*=_FILE_:: + Use this GBWT index. If not specified, will load _basename.gbwt_ and create that file if not present. + +*-m*:: +*--minimizer-name*=_FILE_:: + Use this minimizer index. If not specified, will load _basename.min_ and create that file if not present. + +*-d*:: +*--dist-name*=_FILE_:: + Cluster using this distance index. If not specified, will load _basename.dist_ and create that file if not present. + +*-p*:: +*--progress*:: + Show progress + +*-G*:: +*--gam-in*=_FILE_:: + Read and realign GAM-format reads from FILE (may repeat) + +*-f*:: +*--fastq-in*=_FILE_:: + Read and align FASTQ-format reads from FILE (may repeat) + +*-i*:: +*--interleaved*:: + GAM/FASTQ input is interleaved pairs, for paired-end alignment + +*-M*:: +*--max-multimap*=_INT_:: + Produce up to INT alignments for each read [1] + +*-N*:: +*--sample*=_NAME_:: + Add this sample name + +*-R*:: +*--read-group*=_NAME_:: + Add this read group + +*-n*:: +*--discard*:: + Discard all output alignments (for profiling) + +*--output-basename*=_NAME_:: + Write output to a GAM file beginning with the given prefix for each setting combination + +*--report-name*=_NAME_:: + Write a TSV of output file and mapping speed to the given file + +*-c*:: +*--hit-cap*=_INT_:: + Use all minimizers with at most INT hits [10] + +*-C*:: +*--hard-hit-cap*=_INT_:: + Use all minimizers with at most INT hits [10] + +*-F*:: +*--score-fraction*=_FLOAT_:: + Select minimizers between hit caps until score is FLOAT of total [0.6] + +*-D*:: +*--distance-limit*=_INT_:: + Cluster using this distance limit [200] + +*-e*:: +*--max-extensions*=_INT_:: + Extend up to INT clusters [48] + +*-a*:: +*--max-alignments*=_INT_:: + Align up to INT clusters [8] + +*-s*:: +*--cluster-score*=_INT_:: + Only extend clusters if they are within INT of the best score [50] + +*-u*:: +*--cluster-coverage*=_FLOAT_:: + Only extend clusters if they are within INT of the best read coverage [0.4] + +*-v*:: +*--extension-score*=_INT_:: + Only align extensions if their score is within INT of the best score [1] + +*-w*:: +*--extension-set*=_INT_:: + Only align extension sets if their score is within extension-set of the best score [20] + +*-O*:: +*--no-dp*:: + Disable all gapped alignment + +*--track-provenance*:: + Track how internal intermediate alignment candidates were arrived at + +*--track-correctness*:: + Track if internal intermediate alignment candidates are correct (implies --track-provenance) + +*-t*:: +*--threads*=_INT_:: + Number of compute threads to use + + +== Description + +*vg gaffe* is a fast (experimental) algorithm to map reads to a graph. +It is specialized for low-error-rate short reads. +Giraffe uses minimizers of the graph's haplotypes and gapless extension to map the reads. +Because the graph is expected to contain a relatively complete inventory of a certain type of variation, gapless alignment is sufficient to align most reads and a more expensive gapped alignment step is required for only a minority of cases. + +*vg gaffe* requires four input files to define the reference: A graph or GBWTGraph, a GBWT index, a minimizer index, and a distance index. +Each can also be automatically produced by *vg gaffe*, given the requisite input files. 
+The graph and indexes can be produced automatically if _FASTA_ and _VCF_ are specified. +The _basename_ is a file path derived from the graph file (specified by *-x*/*--xg-name*), or from the _FASTA_ argument if no graph file is specified. It is combined with an extension for each index type to produce the filename from which that index will be loaded, or to which it will be saved if it is constructed. + +Because indexing is resource-intensive, the graph and indexes can be manually constructed in advance. +The graph can be built with *vg construct*. +Indexes can be manually built with *vg index* and *vg minimizer*, as well as *vg snarls* to provide the snarls file needed for the distance index. +If desired, the GBWTGraph can also be pre-generated with *vg gbwt*. + +When building the graph with *vg construct* for use with *vg gaffe*, it is important to provide the *-a* option in order to embed the variant information necessary to later build the GBWT. + +When building snarls with *vg snarls*, it is important to provide the *-T*/*--include-trivial* option to include trivial snarls, which are required when building the distance index. + +== Examples + +To map reads to an indexed graph and write the alignment to a gam file: + +---- +$ vg gaffe -x reference.xg -H reference.gbwt -m reference.min -d reference.dist -G reads.gam > mapped.gam +---- + +Same as above, but implicitly finding other indexes using the graph's filename: + +---- +$ vg gaffe -x reference.xg -G reads.gam > mapped.gam +---- + +To map reads building all indexes dynamically, if not found, from a FASTA and indexed VCF: + +---- +$ vg gaffe reference.fa phased_haplotypes.vcf.gz -G reads.gam > mapped.gam +---- + +Same as above, but manually pre-building the graph and all indexes, and providing the graph to define _basename_: + +---- +$ vg construct -a -r reference.fa -v phased_haplotypes.vcf.gz >reference.vg +$ vg index -G reference.gbwt -v phased_haplotypes.vcf.gz reference.vg +$ vg snarls --include-trivial reference.vg > reference.snarls +$ vg index -s reference.snarls -j reference.dist reference.vg +$ vg minimizer -k 29 -w 11 -g reference.gbwt -i reference.min reference.vg +$ vg gbwt -g reference.gg -x reference.vg reference.gbwt +$ vg gaffe -x reference.vg -G reads.gam > mapped.gam +---- + +== See Also +*vg*(1) + +== Copyright + +Copyright (C) 2020 {author}. + +Free use of this documentation is granted under the terms of the MIT License. diff --git a/doc/asciidoc/man/vg-test.adoc b/doc/asciidoc/man/vg-test.adoc new file mode 100644 index 00000000000..1c6a6d6a05b --- /dev/null +++ b/doc/asciidoc/man/vg-test.adoc @@ -0,0 +1,154 @@ += vg-test(1) +vgteam contributors +v1.20.0 +:doctype: manpage +:manmanual: vg +:mansource: vg +:man-linkstyle: pass:[blue R < >] + +== Name + +vg-test - run internal vg unit tests + +== Synopsis + +*vg test* [_TESTNAME_ | _PATTERN_ | _TAG_]... [_OPTION_]... + +== Arguments + +_TESTNAME_:: + Specify a test to run, by full name. + +_PATTERN_:: + Specify a collection of tests to run, via regular expression match. + +_TAG_:: + Specify a collection of tests to run, via []-enclosed tag. Tag may need to be quoted to avoid being interpreted as a shell wildcard character class.
+ +== Options + +*-?*:: +*-h*:: +*--help*:: + display usage information + +*-l*:: +*--list-tests*:: + list all/matching test cases + +*-t*:: +*--list-tags*:: + list all/matching tags + +*-s*:: +*--success*:: + include successful tests in output + +*-b*:: +*--break*:: + break into debugger on failure + +*-e*:: +*--nothrow*:: + skip exception tests + +*-i*:: +*--invisibles*:: + show invisibles (tabs, newlines) + +*-o*:: +*--out*=_FILENAME_:: + output filename + +*-r*:: +*--reporter*=_NAME_:: + reporter to use (defaults to console) + +*-n*:: +*--name*=_NAME_:: + suite name + +*-a*:: +*--abort*:: + abort at first failure + +*-x*:: +*--abortx*=_NUM_:: + abort after _NUM_ failures + +*-w*:: +*--warn*=_NAME_:: + enable warnings + +*-d*:: +*--durations*=[_yes_|_no_]:: + show test durations + +*-f*:: +*--input-file*=_FILE_:: + load test names to run from a file + +*-#*:: +*--filenames-as-tags*:: + adds a tag for the filename + +*-c*:: +*--section*=_NAME_:: + specify section to run + +*--list-test-names-only*:: + list all/matching test cases names only + +*--list-reporters*:: + list all reporters + +*--order*=[_decl_|_lex_|_rand_]:: + test case order (defaults to _decl_) + +*--rng-seed*=[_time_|_NUM_]:: + set a specific seed for random numbers + +*--force-colour*:: + force colourised output (deprecated) + +*--use-colour*=[_yes_|_no_]:: + should output be colourised + +== Description + +When run without options or arguments, *vg test* runs all unit tests compiled into the *vg* binary. + +Particular tests can be selected by name, by pattern match on name, or by tag (between _[_ and _]_), by specifying the selectors as arguments. If multiple selectors are specified, only tests matching all of the selectors will be run. + +The tool supports all options provided by the Catch 1.x testing framework. + +See https://github.com/catchorg/Catch2/blob/Catch1.x/docs/command-line.md for more information on Catch's available options. + +== Examples + +To run all tests: + +---- +vg test +---- + +To see all available test tags: + +---- +vg test --list-tags +---- + +To run only tests tagged with _[a-star]_: + +---- +vg test [a-star] +---- + +== See Also +*vg*(1) + +== Copyright + +Copyright (C) 2020 {author}. + +Free use of this documentation is granted under the terms of the MIT License. diff --git a/doc/asciidoc/man/vg-version.adoc b/doc/asciidoc/man/vg-version.adoc new file mode 100644 index 00000000000..50032b2406a --- /dev/null +++ b/doc/asciidoc/man/vg-version.adoc @@ -0,0 +1,67 @@ += vg-version(1) +vgteam contributors +v1.20.0 +:doctype: manpage +:manmanual: vg +:mansource: vg +:man-linkstyle: pass:[blue R < >] + +== Name + +vg-version - get version and build information about vg + +== Synopsis + +*vg version* [_OPTION_]... + +== Options + +*-s*:: +*--slug*:: + Print only the one-line, whitespace-free version string (e.g. _v1.20.0-70-g472e24c9c_), for use in scripts. + +*-h*:: +*--help*:: + Print help about the *vg version* command and its supported options. 
+ +== Description + +When run without options, *vg version* outputs information about the version of *vg* that is running, including: + +* The most recent released version on which your *vg* is based +* The number of commits since that version (if not itself a released version) +* The Git commit hash (if not itself a released version) +* The compiler that was used to build *vg* +* The OS that was used to build *vg* +* The C++ standard library that *vg* was linked against +* The user name and host name that built *vg* + +When run with the *-s* option, *vg version* prints just the release and Git commit information. + +== Examples + +To print all version information (human-readable): + +---- +$ vg version +vg version v1.20.0-70-g472e24c9c "Ginestra" +Compiled with g++ (GCC) 8.1.0 on Linux +Linked against libstd++ 20180502 +Built by anovak@courtyard +---- + +To print just the short "`version slug`": + +---- +$ vg version -s +v1.20.0-70-g472e24c9c +---- + +== See Also +*vg*(1), *git*(1) + +== Copyright + +Copyright (C) 2019 {author}. + +Free use of this documentation is granted under the terms of the MIT License. diff --git a/doc/conf.py b/doc/conf.py index 6edca265de6..f2fe79563b1 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -26,8 +26,8 @@ source_suffix = '.rst' master_doc = 'index' -project = u'vg' -copyright = u'vgteam' +project = 'vg' +copyright = 'vgteam' exclude_patterns = ['_build'] pygments_style = 'sphinx' diff --git a/doc/deploy_key.enc b/doc/deploy_key.enc deleted file mode 100644 index 891dc6e60e5..00000000000 Binary files a/doc/deploy_key.enc and /dev/null differ diff --git a/doc/deploy_key.pub b/doc/deploy_key.pub deleted file mode 100644 index 12ee99a3b31..00000000000 --- a/doc/deploy_key.pub +++ /dev/null @@ -1 +0,0 @@ -ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDNjZ2QzbRQuVCyzrmF4ZR3eVSuNm1Ln5Im53CcrjAssitiT6V8aSX3IeroB3fXnqzVtGb6/RReDaRhU7wKmEREvRwfllWzWw0RNqtg/8BT4iup9zSwOpnmhWNaKd1XTi6StwCvQr1LRMXVhrVnW7eX7LX1oTbaTmWMKS4g8+X8LZG4NtMcsFqkjAmeAKECDRMK04MZ02J//u93vc12nzUwi+azUA3IrGLBlXp58CXpDfpKW3A20w84J6nxZziAtzXwd2I8o4Kg1Y2ncqEg+pAv0VXBcxxs3vXDjevLXVV3nAqyWhSO7pnJBY7E9N1WMRgyvWr2qYazItN22M7CLofJ travis@travis diff --git a/doc/figures/download-linux.png b/doc/figures/download-linux.png new file mode 100644 index 00000000000..bd055522430 Binary files /dev/null and b/doc/figures/download-linux.png differ diff --git a/doc/figures/download-linux.svg b/doc/figures/download-linux.svg new file mode 100644 index 00000000000..64462957f17 --- /dev/null +++ b/doc/figures/download-linux.svg @@ -0,0 +1,975 @@ [SVG source not preserved in this extract: a "Download for Linux" button graphic (Openclipart-derived, image/svg+xml); see doc/figures/download-linux.svg in the repository for the full markup.] diff --git a/doc/figures/multipath.png b/doc/figures/multipath.png new file mode 100644 index 00000000000..fffe736df2e Binary files /dev/null and b/doc/figures/multipath.png differ diff --git a/doc/figures/pipeline_flowchart.odg b/doc/figures/pipeline_flowchart.odg new file mode 100644 index 00000000000..eb3c330be60 Binary files /dev/null and b/doc/figures/pipeline_flowchart.odg differ diff --git a/doc/figures/pipeline_flowchart.png b/doc/figures/pipeline_flowchart.png new file mode 100644 index 00000000000..92a018170ec Binary files
/dev/null and b/doc/figures/pipeline_flowchart.png differ diff --git a/doc/figures/pipeline_flowchart.svg b/doc/figures/pipeline_flowchart.svg new file mode 100644 index 00000000000..73416d19bc3 --- /dev/null +++ b/doc/figures/pipeline_flowchart.svg @@ -0,0 +1,850 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Graph Indexes + + + + + + + + + + + + + + Input Files(.fai/.tbi omitted) + + + + + + + + + + + + + + Augmented Graph and Pileup + + + + + + + + + + + + + + Alignments + + + + + + + + + + + + + + Graph Reference + + + + + + + + + + + + + + Sample Genotypes + + + + + + + + + + + + + + + Variant Calling + + + + + + + + + + + + + + + Graph Augmentation + + + + + + + + + + + + + + + Read Alignment + + + + + + + + + + + + + + + Indexing + + + + + + + + + + + + + + + Graph Construction + + + + + + + + + vg construct + + + + + + + + graph.vg + + + + + + + + vg index + + + + + + + + graph.xg + + + + + + + + graph.gcsa + + + + + + + + graph.gcsa.lcp + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + graph.gbwt + + + + + + + + variants.vcf.gz + + + + + + + + reference.fa + + + + + + + + reads.fq + + + + + + + + vg map + + + + + + + + mapped.gam + + + + + + + + vg augment + + + + + + + + samp.trans + + + + + + + + samp.support + + + + + + + + samp.aug.vg + + + + + + + + vg call + + + + + + + + calls.vcf + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The vg Graph Genomics Pipeline + + + + + + + \ No newline at end of file diff --git a/doc/footer_template.html b/doc/footer_template.html new file mode 100644 index 00000000000..32366a05cd6 --- /dev/null +++ b/doc/footer_template.html @@ -0,0 +1,21 @@ + + + + + + + + + + diff --git a/doc/publish-docs.sh b/doc/publish-docs.sh index 75393b733bb..2fcc07102b1 100755 --- a/doc/publish-docs.sh +++ b/doc/publish-docs.sh @@ -16,96 +16,91 @@ SOURCE_DIR="doc/doxygen/html/" # Also probably needs a trailing slash DEST_DIR="./" # Who should be seen as making the commits? -COMMIT_AUTHOR_NAME="Travis Doc Bot" -COMMIT_AUTHOR_EMAIL="anovak+travisdocbot@soe.ucsc.edu" -# What SSH key, relative to this repo's root, should we decrypt and use for doc deployment? -ENCRYPTED_SSH_KEY_FILE="doc/deploy_key.enc" +COMMIT_AUTHOR_NAME="VG Doc Bot" +COMMIT_AUTHOR_EMAIL="anovak+vgdocbot@soe.ucsc.edu" -# We expect DOCS_KEY_ENCRYPTION_LABEL to come in from the environment, specifying the ID -# of the encrypted deploy key we will use to get at the docs repo. +# We expect GITLAB_SECRET_FILE_DOCS_SSH_KEY to come in from the environment, +# specifying the private deploy key we will use to get at the docs repo. 
+ +# Find all the submodules that Doxygen wants to look at and make sure we have +# those. +cat Doxyfile | grep "^INPUT *=" | cut -f2 -d'=' | tr ' ' '\n' | grep "^ *deps" | sed 's_ *\(deps/[^/]*\).*_\1_' | sort | uniq | xargs -n 1 git submodule update --init --recursive # Build the documentation. # Assumes we are running in the repo root. make docs -if [[ ! -z "${TRAVIS_PULL_REQUEST_SLUG}" && "${TRAVIS_PULL_REQUEST_SLUG}" != "${TRAVIS_REPO_SLUG}" ]]; then - # This is an external PR. We have no access to the encryption keys for the encrypted deploy SSH key. - # We want to check out the dest repo with that key because it's much simpler than hacking the remote from https to ssh. - # So we won't even test copying the docs over to the destination repo. - echo "Not testing deploy; no encryption keys available for external PRs." - exit 0 -fi - # Get ready to deploy the docs -# Make a scratch directory -mkdir -p ./tmp - -# Get our encryption key and IV variable names -ENCRYPTION_KEY_VAR="encrypted_${DOCS_KEY_ENCRYPTION_LABEL}_key" -ENCRYPTION_IV_VAR="encrypted_${DOCS_KEY_ENCRYPTION_LABEL}_iv" +# Make a scratch directory *outside* our normal git repo +SCRATCH_DIR="$(mktemp -d)" +# And clean it up when we stop +function cleanup { + rm -Rf ${SCRATCH_DIR} +} +trap cleanup EXIT -echo "Want to decrypt ${ENCRYPTED_SSH_KEY_FILE} using key from variable ${ENCRYPTION_KEY_VAR} and IV from variable ${ENCRYPTION_IV_VAR}" -if [[ -z "${!ENCRYPTION_KEY_VAR}" ]]; then - echo "Encryption key not found!" - exit 1 -fi -if [[ -z "${!ENCRYPTION_IV_VAR}" ]]; then - echo "Encryption IV not found!" - exit 1 -fi +# Set up our SSH key +touch "${SCRATCH_DIR}/deploy_key" -# Decrypt the encrypted deploy SSH key -# Get the key and IV from the variables we have the names of. -openssl aes-256-cbc -K "${!ENCRYPTION_KEY_VAR}" -iv "${!ENCRYPTION_IV_VAR}" -in "${ENCRYPTED_SSH_KEY_FILE}" -out ./tmp/deploy_key -d # Protect it so the agent is happy -chmod 600 ./tmp/deploy_key +chmod 600 "${SCRATCH_DIR}/deploy_key" + +# Fill it in with NO COMMAND ECHO +set +x +echo "${GITLAB_SECRET_FILE_DOCS_SSH_KEY}" > ${SCRATCH_DIR}/deploy_key + +# Turn on echo so we can see what we're doing. +# This MUST happen only AFTER we are done touching the encryption stuff. +set -x -# Start an agent and add the key -eval "$(ssh-agent -s)" -ssh-add ./tmp/deploy_key +# Make sure we have an known_hosts +mkdir -p ~/.ssh +touch ~/.ssh/known_hosts +cat ~/.ssh/known_hosts -# Check out the dest repo, now that we can authenticate, shallow-ly to avoid getting all history -git clone "${DEST_REPO}" ./tmp/dest +# Clone the dest repo, now that we can authenticate. +# Don't check it out, so we can get just the branch we want or start a new branch with a clean working copy. +git -c "core.sshCommand=ssh -i ${SCRATCH_DIR}/deploy_key -o 'UserKnownHostsFile=/dev/null' -o 'StrictHostKeyChecking=no'" clone --no-checkout "${DEST_REPO}" "${SCRATCH_DIR}/dest" # Go in and get/make the destination branch -cd ./tmp/dest +pushd "${SCRATCH_DIR}/dest" git checkout "${DEST_BRANCH}" || git checkout --orphan "${DEST_BRANCH}" -cd ../.. +popd # Drop the files in -# See https://explainshell.com/explain?cmd=rsync+-aqr+--delete+--filter +# See https://explainshell.com/explain?cmd=rsync+-aqr+--delete+--exclude # We need to not clobber any .git in the destination. 
-rsync -aqr "${SOURCE_DIR}" "tmp/dest/${DEST_DIR}" --delete --filter='protect .git/**/*' +rsync -avr "${SOURCE_DIR}" "${SCRATCH_DIR}/dest/${DEST_DIR}" --delete --exclude .git -cd ./tmp/dest +# Go back in to make the commit +pushd "${SCRATCH_DIR}/dest" -if git diff --quiet; then - # Don't commit nothing - echo "No changes to commit" - exit 0 -fi +# Disable Jeckyll processing for Github Pages since we did it already +touch .nojekyll +git add .nojekyll + +# Add all the files here (except hidden ones) and add deletions +git add -A # Become the user we want to be git config user.name "${COMMIT_AUTHOR_NAME}" git config user.email "${COMMIT_AUTHOR_EMAIL}" -# Make the commit -git commit -am "Commit new auto-generated docs" +# Make the commit. Tolerate failure because this fails when there is nothing to commit. +git commit -m "Commit new auto-generated docs" || true -if [[ "${TRAVIS_PULL_REQUEST}" != "false" || "${TRAVIS_BRANCH}" != "master" ]]; then - # If we're not a real master commit, we just make sure the docs build. - # Also, unless we're a branch in the main vgteam/vg repo, we don't have access to the encryption keys anyway. - # So we can't even try to deploy. - echo "Documentation should not be deployed because this is not a mainline master build" +if [[ -z "${CI_COMMIT_BRANCH}" || "${CI_COMMIT_BRANCH}" != "${CI_DEFAULT_BRANCH}" ]]; then + # If we're not a real mainline commit, we just make sure the docs build. + echo "Documentation should not be deployed because this is not a mainline build" exit 0 fi # If we are on the right branch, actually push the commit. -# Push the commit -git push origin "${DEST_BRANCH}" +# Push the commit. This does not fail if there is no commit. +git -c "core.sshCommand=ssh -i ${SCRATCH_DIR}/deploy_key -o 'UserKnownHostsFile=/dev/null' -o 'StrictHostKeyChecking=no'" push origin "${DEST_BRANCH}" diff --git a/doc/test-docs.sh b/doc/test-docs.sh new file mode 100755 index 00000000000..246e7c07aa0 --- /dev/null +++ b/doc/test-docs.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +# test-docs.sh: Test the examples in the vg documentation and wiki with https://github.com/anko/txm +set -e + +# Work out where we are. 
+# See https://stackoverflow.com/a/246128 +HERE="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" + +# Make sure we have the tester +which txm || npm install -g txm + +# Go to the test directory, where the tests expect to run +cd "${HERE}/../test" + +# Test the readme +echo txm --jobs 1 "${HERE}/../README.md" +txm --jobs 1 "${HERE}/../README.md" + +# Run all the wiki tests +find "${HERE}/wiki" -name "*.md" | xargs -n 1 -t txm --jobs 1 + + diff --git a/doc/wiki b/doc/wiki new file mode 160000 index 00000000000..acd5d146b08 --- /dev/null +++ b/doc/wiki @@ -0,0 +1 @@ +Subproject commit acd5d146b08c629dd0ccda2117bb31243f5b1875 diff --git a/jenkins/Dockerfile.jenkins b/jenkins/Dockerfile.jenkins deleted file mode 100644 index fb6b37b703b..00000000000 --- a/jenkins/Dockerfile.jenkins +++ /dev/null @@ -1,37 +0,0 @@ -# Dockerfile for a full vg build from source -# (derived from vgteam/vg_docker) - -FROM ubuntu:16.04 -MAINTAINER vgteam -ARG vg_git_revision=master - -# Make sure the en_US.UTF-8 locale exists, since we need it for tests -#RUN locale-gen en_US en_US.UTF-8 && DEBIAN_FRONTEND=noninteractive dpkg-reconfigure locales - -# install basic apt dependencies -# note: most vg apt dependencies are installed by "make get-deps" below -RUN apt-get -qq update && apt-get -qq install -y \ - sudo \ - pv \ - pigz \ - bsdmainutils \ - build-essential \ - make \ - git \ - zlib1g-dev \ - rs \ - libffi-dev - -ADD http://mirrors.kernel.org/ubuntu/pool/universe/b/bwa/bwa_0.7.15-5_amd64.deb /tmp/bwa.deb -RUN dpkg -i /tmp/bwa.deb - -# copy over current directory to docker -ADD . /vg - -# set our working directory -WORKDIR /vg - -# Build -RUN . ./source_me.sh && make get-deps && make -j$(nproc) && make static - -ENV PATH /vg/bin:$PATH diff --git a/jenkins/jenkins.sh b/jenkins/jenkins.sh deleted file mode 100755 index 1bf09ed3c7c..00000000000 --- a/jenkins/jenkins.sh +++ /dev/null @@ -1,301 +0,0 @@ -#!/bin/bash - -# Run some CI tests on vg using toil-vg - -# https://github.com/BD2KGenomics/toil-vg - -# This script is hooked into - -# http://jenkins.cgcloud.info - -# Most of the setup here is cribbed from other cgcloud jenkins projects such as toil-vg -# itself - -# Note: we assume we run this in vg/ ie inside the vg directory we want to test - -#!/bin/bash - -# Don't stop on errors, so we can post a report no matter what -set +e - -# Should we build and run locally, or should we use Docker? -LOCAL_BUILD=0 -# Should we re-use and keep around the same virtualenv? -REUSE_VENV=0 -# Should we keep our test output around after uploading the new baseline? -KEEP_OUTPUT=0 -# Should we keep all intermediate output (ie --force_outstore in toil-vg)? -KEEP_INTERMEDIATE_FILES=0 -# Should we show stdout and stderr from tests? If so, set to "-s". -SHOW_OPT="" -# What toil-vg should we install? -TOIL_VG_PACKAGE="git+https://github.com/adamnovak/toil-vg.git@f04a21197cef57dc42a09e4f4d142baabd7c92bd" -# What toil should we install? -# Could be something like "toil[aws,mesos]==3.13.0" -# or "git+https://github.com/adamnovak/toil.git@2b696bec34fa1381afdcf187456571d2b41f3842#egg=toil[aws,mesos]" -TOIL_PACKAGE="toil[aws,mesos]==3.13.0" -# What tests should we run? 
-# Should be something like "jenkins/vgci.py::VGCITest::test_sim_brca2_snp1kg" -PYTEST_TEST_SPEC="jenkins/vgci.py" - -usage() { - # Print usage to stderr - exec 1>&2 - printf "Usage: $0 [Options] \n" - printf "Options:\n\n" - printf "\t-l\t\tBuild vg locally (instead of in Docker) and don't use Docker at all.\n" - printf "\t\t\tNon-Python dependencies must be installed.\n" - printf "\t-r\t\tRe-use virtualenvs across script invocations. \n" - printf "\t-k\t\tKeep on-disk output. \n" - printf "\t-i\t\tKeep intermediate on-disk output. \n" - printf "\t-s\t\tShow test output and error streams (pass -s to pytest). \n" - printf "\t-p PACKAGE\tUse the given Python package specifier to install toil-vg.\n" - printf "\t-t TESTSPEC\tUse the given PyTest test specifier to select tests to run.\n" - exit 1 -} - -while getopts "lrkisp:t:" o; do - case "${o}" in - l) - LOCAL_BUILD=1 - ;; - r) - REUSE_VENV=1 - ;; - k) - KEEP_OUTPUT=1 - ;; - i) - KEEP_INTERMEDIATE_FILES=1 - ;; - s) - SHOW_OPT="-s" - ;; - p) - TOIL_VG_PACKAGE="${OPTARG}" - ;; - t) - PYTEST_TEST_SPEC="${OPTARG}" - ;; - *) - usage - ;; - esac -done - -shift $((OPTIND-1)) - -if [ ! -e ~/.aws/credentials ]; then - >&2 echo "WARNING: No AWS credentials at ~/.aws/credentials; test data may not be able to be downloaded!" -fi - -# Maximum number of minutes that can have passed since new vg docker image built -PLATFORM=`uname -s` -if [ $PLATFORM == "Darwin" ]; then - NUM_CORES=`sysctl -n hw.ncpu` -else - NUM_CORES=`cat /proc/cpuinfo | grep "^processor" | wc -l` -fi - -if [ "${NUM_CORES}" == "0" ]; then - echo "could not determine NUM_CORES, using 2" - NUM_CORES=2 -fi - -# Create Toil venv -if [ ! "${REUSE_VENV}" == "1" ]; then - rm -rf .env -fi -if [ ! -e .env ]; then - virtualenv .env -fi -. .env/bin/activate - -# Prepare directory for temp files (assuming cgcloud file structure) -# Sometimes the instances have un-deletable files in tmp, so we continue through errors -if [ -d "/mnt/ephemeral" ] -then - TMPDIR=/mnt/ephemeral/tmp - rm -rf $TMPDIR - mkdir -p $TMPDIR - export TMPDIR -fi - -# Upgrade pip so that it can use the wheels for numpy & scipy, so that they -# don't try to build from source -pip install --upgrade pip - -# Create s3am venv -if [ ! "${REUSE_VENV}" == "1" ]; then - rm -rf s3am -fi -if [ ! -e s3am ]; then - virtualenv --never-download s3am && s3am/bin/pip install s3am==2.0 -fi -mkdir -p bin -# Expose binaries to the PATH -ln -snf ${PWD}/s3am/bin/s3am bin/ -export PATH=$PATH:${PWD}/bin - -# Create awscli venv -if [ ! "${REUSE_VENV}" == "1" ]; then - rm -rf awscli -fi -if [ ! -e awscli ]; then - virtualenv --never-download awscli && awscli/bin/pip install awscli -fi -# Expose binaries to the PATH -ln -snf ${PWD}/awscli/bin/aws bin/ -export PATH=$PATH:${PWD}/bin - -# Dependencies for running tests. Need numpy, scipy and sklearn -# for running toil-vg mapeval, and dateutils and reqests for ./mins_since_last_build.py -pip install numpy -pip install scipy==1.0.0rc2 -pip install sklearn -pip install dateutils -pip install requests -pip install timeout_decorator -pip install pytest -pip install pygithub - -# Install Toil -echo "Installing toil from ${TOIL_PACKAGE}" -pip install --upgrade "${TOIL_PACKAGE}" -if [ "$?" -ne 0 ] -then - echo "pip install toil fail" - exit 1 -fi - -# Don't manually install boto since toil just installs its preferred version - -# Install toil-vg itself -echo "Installing toil-vg from ${TOIL_VG_PACKAGE}" -pip install --upgrade "${TOIL_VG_PACKAGE}" -if [ "$?" 
-ne 0 ] -then - echo "pip install toil-vg fail" - exit 1 -fi - -# Make sure we have submodules -git submodule update --init --recursive - -# we pass some parameters through pytest by way of our config file -# in particular, we set the vg version and cores, and specify -# that we want to keep all the results in vgci-work/ -printf "cores ${NUM_CORES}\n" > vgci_cfg.tsv -printf "teardown False\n" >> vgci_cfg.tsv -printf "workdir ./vgci-work\n" >> vgci_cfg.tsv -if [ "${KEEP_INTERMEDIATE_FILES}" == "0" ]; then - printf "force_outstore False\n" >> vgci_cfg.tsv -else - printf "force_outstore True\n" >> vgci_cfg.tsv -fi -#printf "verify False\n" >> vgci_cfg.tsv -#printf "baseline ./vgci-baseline\n" >> vgci_cfg.tsv - -rm -rf vgci-work -mkdir vgci-work -BUILD_FAIL=0 -if [ "${LOCAL_BUILD}" == "1" ] -then - # Just build vg here - . ./source_me.sh - make -j ${NUM_CORES} - - if [ "$?" -ne 0 ] - then - echo "vg local build fail" - BUILD_FAIL=1 - fi - VG_VERSION=`vg version -s` - printf "vg-docker-version None\n" >> vgci_cfg.tsv - printf "container None\n" >> vgci_cfg.tsv -else - # Build a docker image locally. Can be useful when don't - # have priveleges to easily install dependencies - - # we actually want to throw git in our local image so we can get - # a proper version - rm -f .dockerignore - - docker pull ubuntu:16.04 - DOCKER_TAG="jenkins-docker-vg-local" - docker build --no-cache -t "jenkins-docker-vg-local" -f jenkins/Dockerfile.jenkins . - if [ "$?" -ne 0 ] - then - echo "vg docker build fail" - BUILD_FAIL=1 - else - # Pull down the docker images, so time costs (and instability) of doing so doesn't affect - # individual test results (looking at you, rocker/tidyverse:3.4.2) - # Allow two tries - for img in $(toil-vg generate-config | grep docker: | grep -v vg | awk '{print $2}' | sed "s/^\([\"']\)\(.*\)\1\$/\2/g"); do docker pull $img ; done - for img in $(toil-vg generate-config | grep docker: | grep -v vg | awk '{print $2}' | sed "s/^\([\"']\)\(.*\)\1\$/\2/g"); do docker pull $img ; done - fi - VG_VERSION=`docker run jenkins-docker-vg-local vg version -s` - printf "vg-docker-version jenkins-docker-vg-local\n" >> vgci_cfg.tsv -fi - -# run the tests, output the junit report for Jenkins -rm -f test-report.xml -PYRET=1 -if [ ${BUILD_FAIL} -ne 1 ] -then - pytest -vv "${PYTEST_TEST_SPEC}" --junitxml=test-report.xml ${SHOW_OPT} - PYRET="$?" -fi - -# Generate a report in two files: HTML full output, and a Markdown summary. -# Takes as input the Jenkins test result XML and the work directory with the -# test output files. -jenkins/mine-logs.py test-report.xml vgci-work/ report-html/ summary.md - -# Put the report on Github for the current pull request or commit. -jenkins/post-report report-html summary.md - - -if [ ! -z "${BUILD_NUMBER}" ] -then - # We are running on Jenkins (and not manually running the Jenkins tests), so - # we probably have AWS credentials and can upload stuff to S3. 
- - # we publish the results to the archive - tar czf "${VG_VERSION}_output.tar.gz" vgci-work test-report.xml jenkins/vgci.py jenkins/jenkins.sh vgci_cfg.tsv - aws s3 cp --only-show-errors --acl public-read "${VG_VERSION}_output.tar.gz" s3://cgl-pipeline-inputs/vg_cgl/vg_ci/jenkins_output_archives/ - - # if we're merging the PR (and not just testing it), we publish results to the baseline - if [ -z ${ghprbActualCommit} ] - then - echo "Updating baseline" - aws s3 sync --acl public-read ./vgci-work/ s3://cgl-pipeline-inputs/vg_cgl/vg_ci/jenkins_regression_baseline - printf "${VG_VERSION}\n" > vg_version_${VG_VERSION}.txt - printf "${ghprbActualCommitAuthor}\n${ghprbPullTitle}\n${ghprbPullLink}\n" >> vg_version_${VG_VERSION}.txt - aws s3 cp --only-show-errors --acl public-read vg_version_${VG_VERSION}.txt s3://cgl-pipeline-inputs/vg_cgl/vg_ci/jenkins_regression_baseline/ - fi -fi - - -# clean up changes to bin -# Don't disturb bin/protoc or vg will want to rebuild protobuf needlessly -rm bin/aws bin/s3am - -if [ ! "${REUSE_VENV}" == "1" ]; then - rm -rf awscli s3am -fi - -if ([ "${LOCAL_BUILD}" == "0" ] || [ "${PYRET}" == 0 ]) && [ ! "${KEEP_OUTPUT}" == "1" ]; then - # On anything other than a failed local run, and if we haven't been told not to, clean up the test output. - rm -rf vgci-work -fi -if [ ! "${REUSE_VENV}" == "1" ]; then - # If we aren't re-using the virtualenv, clean it up - rm -rf .env -fi - -if [ -d "/mnt/ephemeral" ] -then - rm -rf $TMPDIR -fi diff --git a/ontology/README.md b/ontology/README.md index 8ca51fccccb..e038a3b728c 100644 --- a/ontology/README.md +++ b/ontology/README.md @@ -3,7 +3,7 @@ ## Conceptual model `Node`s, `Path`s and `Step`s, are the three core parts of any VG graph in RDF. -A `Node` in the VG RDF corersponds directly to the Node concept in the VG protobuf serialization. +A `Node` in the VG RDF corresponds directly to the Node concept in the VG protobuf serialization. `Paths` are a number of `Step`s that represent a sequence of Node visits that generate a linear biological sequence. Each `Step` connects a `Node` into a `Path` @@ -46,9 +46,49 @@ me:example:some_gene rdfs:seeAlso ENSEMBL:ESG00000XXXX . #and then pick up the a ## Examples of using VG RDF -[2 ecoli genomes, with ensembl and uniprot annotation](/vgteam/vg/wiki/VG-RDF,-the-Ensembl-bacteria-E.-coli-genome-hack-attack) +[2 ecoli genomes, with ensembl and uniprot annotation](https://github.com/vgteam/vg/wiki/VG-RDF,-the-Ensembl-bacteria-E.-coli-genome-hack-attack) ## VG RDF limitations At this moment VG RDF wants a fully embedded variation graph. e.g. all positions in vg json have a single edit which covers a whole node. This is to enable easy SPARQL queries where substring operations are rarely used. + +## Annotations on Pantograph + +On top of VG RDF, we can describe the same path information on Pantograph format as well. + +```ttl + a vg:ZoomLevel ; + vg:components , ; + vg:zoomFactor 10 . + a vg:ZoomLevel ; + vg:components , ; + vg:zoomFactor 1000 . // zoomFactor is binWidth here. + a vg:Component ; + vg:componentRank 1 ; # The order of component is inferred by rank. + vg:forwardComponentEdge ; + vg:bins , . + a vg:Bin ; + vg:forwardBinEdge ; + vg:binRank 1 ; + vg:cells , . + a vg:Bin ; + vg:reverseBinEdge ; + vg:forwardBinEdge ; + vg:binRank 2 ; + vg:cells , . + a vg:Cell ; + vg:positionPercent 0.04 ; + vg:inversionPercent 0.98 ; + vg:cellRegion . # To infer firstNucleotide and last Nucleotide. faldo:begin of stepRegion is the first position. faldo:end of cellRegion is the last position. 
+ a vg:Link ; # This is a non-linear connection between Bins. + vg:arrival ; + vg:departure ; + vg:forwardLinkEdge ; + vg:linkRank 1 ; + vg:linkPaths ; # Participants of the link + vg:linkZoomLevel . + a faldo:Region ; + faldo:begin ; + faldo:end . +``` diff --git a/ontology/owl2xhtml.xsl b/ontology/owl2xhtml.xsl index 4eeeac5fe0d..a4f66a5fab3 100644 --- a/ontology/owl2xhtml.xsl +++ b/ontology/owl2xhtml.xsl @@ -198,7 +198,8 @@ by Masahide Kanzaki, and from the OWL2HTML stylesheet (2), by Li Ding. We are ve
  • NIH under R24OD011883
  • DNA Databank of Japan (DDBJ)
  • Sanger centre
- • University of California, Santa Cruz, David Haussler lab
+ • The University of Tokyo
+ • University of California, Santa Cruz, David Haussler lab
  • @@ -610,7 +611,7 @@ by Masahide Kanzaki, and from the OWL2HTML stylesheet (2), by Li Ding. We are ve - + diff --git a/ontology/vg.html b/ontology/vg.html index fb25b33b0ce..982fc7a668a 100644 --- a/ontology/vg.html +++ b/ontology/vg.html @@ -116,28 +116,78 @@ Classes ( - 3 + 16 ) - + + + + + + + + + + + vg:Step + + + Properties ( - 7 + 30 ) Object properties ( - 6 + 23 ) - + + + + + + + + + + + + @@ -147,17 +197,39 @@ vg:linksReverseToReverse + + + + + Datatype properties ( - 1 + 7 ) - + @@ -207,8 +279,8 @@

    owl:versionInfo - "Created at the DBCLS RDF Summit 2, Sendai Japan 20" - + "Created at the DBCLS RDF Summit 2, Sendai Japan and COVID-19 Virtual Biohackathon" + xsd:string @@ -220,9 +292,9 @@

    - - + @@ -242,8 +314,8 @@

    @@ -256,9 +328,9 @@

    - vg:Node +
    + vg:Bin (rdf:type owl:Class @@ -233,8 +305,8 @@

    rdfs:comment - "A node in the variant graph, representing a sequence section." - + "A bin in the output of odgi bin, representing a sequence section." + xsd:string
    rdfs:label - "Node" - + "Bin" + xsd:string
    - - + @@ -278,8 +350,8 @@

    @@ -292,9 +364,9 @@

    - vg:Path +
    + vg:Cell (rdf:type owl:Class @@ -269,8 +341,8 @@

    rdfs:comment - "A Path is a collection of steps from path to path that represent an asserdfs:labelmbled sequence integrated into the variant grap." - + "A cell along a path of a specific group (component) of a specific zoom. A cell to a :Component and a :Path with supplemental informations." + xsd:string
    rdfs:label - "Path" - + "Cell" + xsd:string
    - - + @@ -314,8 +386,8 @@

    @@ -327,147 +399,135 @@

    - vg:Step +
    + vg:Component (rdf:type owl:Class @@ -305,8 +377,8 @@

    rdfs:comment - "A step along a path in the variant graph. A series of steps along a path represent an assembled sequence that was originally inserted into the the variant graph. A step points to a :Node or the reverse complement of a node and has a rank (step number)." - + "A region between Links structuring all bins and their present individuals into one component." + xsd:string
    rdfs:label - "Step" - + "Component" + xsd:string
    -
    - -
    -

    - Properties -

    -
    -

    Object properties

    - - + - + - - + + - + + + +
    - vg:linksForwardToForward +
    + vg:GAMP_Format (rdf:type - - owl:ObjectProperty + + owl:Class )
    rdfs:comment - "This links a node from the forward (5' to 3') strand on the subject node to the forward (5' to 3') strand on the predicate node." - - xsd:string -Contains a DAG of subalignments to different paths through the reference graph
    - rdfs:domain - vg:Node + rdfs:isDefinedByhttps://raw.githubusercontent.com/vgteam/libvgio/master/deps/vg.proto
    rdfs:label - "++" - - xsd:string -MultipathAlignment
    + + + + + + + + + + - + - - + +
    + vg:GAM_Format + (rdf:type + + owl:Class + ) + +
    + rdfs:commentContains an alignment to a graph
    + rdfs:isDefinedByhttps://raw.githubusercontent.com/vgteam/libvgio/master/deps/vg.proto
    rdfs:label - "linksForwardToForward" - - xsd:string -Alignment
    - rdfs:range - vg:Node + rdfs:subClassOfhttp://edamontology.org/format_2055
    - - + - - - - - + - - + + - - - - - +
    - vg:linksForwardToReverse +
    + vg:GBWT_Format (rdf:type - - owl:ObjectProperty + + owl:Class )
    rdfs:comment - "This links a node from the forward (5' to 3') strand on the subject node to the reverse (3' to 5') strand on the predicate node." - - xsd:string -
    - rdfs:domain - vg:NodeGraph Burrows-Wheeler Transform, a succinct collection of paths through a genome graph, published by Jouni Siren. Usually represents haplotypes.
    - rdfs:label - "+-" - - xsd:string - + rdfs:isDefinedByhttps://raw.githubusercontent.com/vgteam/libvgio/master/deps/vg.proto
    rdfs:label - "linksForwardToReverse" - - xsd:string -
    - rdfs:range - vg:NodeGBWT
    - - + - + - - + + + + + +
    - vg:linksReverseToForward +
    + vg:GCSA2_Format (rdf:type - - owl:ObjectProperty + + owl:Class )
    rdfs:comment - "This links a node from the reverse (3' to 5') strand on the subject node to the forward (5' to 3') strand on the predicate node." - - xsd:string -Generalized compressed suffix array 2, an exact match index published by Jouni Siren
    - rdfs:domain - vg:Node + rdfs:isDefinedByhttps://raw.githubusercontent.com/vgteam/libvgio/master/deps/vg.proto
    rdfs:labelGCSA2
    + + + + + @@ -475,26 +535,26 @@

    Object properties

    - - + +
    + vg:Link + (rdf:type + + owl:Class + ) + +
    + rdfs:comment - "-+" - + "A link marks a graph traversal along a nonlinear connection." + xsd:string
    rdfs:label - "linksReverseToForward" - + "Link" + xsd:string
    - rdfs:range - vg:Node + rdfs:subClassOf + owl:Thing
    - - + @@ -502,23 +562,8 @@

    Object properties

    - - - - - - - - @@ -526,26 +571,26 @@

    Object properties

    - - + +
    - vg:linksReverseToReverse +
    + vg:Node (rdf:type - - owl:ObjectProperty + + owl:Class )
    rdfs:comment - "This links a node from the reverse (3' to 5') strand on the subject node to the reverse (3' to 5') strand on the predicate node." - - xsd:string -
    - rdfs:domain - vg:Node
    - rdfs:label - "--" - + "A node in the variant graph, representing a sequence section." + xsd:string
    rdfs:label - "linksReverseToReverse" - + "Node" + xsd:string
    - rdfs:range - vg:Node + rdfs:subClassOf + owl:Thing
    - - + @@ -553,84 +598,89 @@

    Object properties

    - - - - - - + +
    - vg:node +
    + vg:Path (rdf:type - - owl:ObjectProperty + + owl:Class )
    rdfs:comment - "This means that this step occurs on the forward strand of the sequence attaced to the node (i.e. it is on the explicit encoded forward (5' to 3') strand) of the predicate node." - + "A Path is a collection of steps from path to path that represent an asserdfs:labelmbled sequence integrated into the variant graph." + xsd:string
    - rdfs:domain - vg:Step
    rdfs:label - "node" - + "Path" + xsd:string
    - rdfs:range - vg:Step + rdfs:subClassOf + owl:Thing
    - - + - + - - + + - + + + +
    - vg:reverseOfNode +
    + vg:Snarl_Format (rdf:type - - owl:ObjectProperty + + owl:Class )
    rdfs:comment - "This means this step occurs on the revese complement of the sequence attaced to the node (i.e. it is on the implicit reverse (3' to 5') strand) of the predicate node." - - xsd:string - A series of Snarl objects, each of which contain: The boundaries of a snarl (a bubble-like motif in a graph), the snarl (if any) that contains this one, classifications of the snarl based on the reachability of the two boundary nodes from each other and the cyclic-ness of the subcomponent.
    - rdfs:domain - vg:Step + rdfs:isDefinedByhttps://raw.githubusercontent.com/vgteam/libvgio/master/deps/vg.proto
    rdfs:label - "reverseOfNode" - - xsd:string -Snarl
    + + + + + + - - + + + + + +
    + vg:Snarl_Traversal_Format + (rdf:type + + owl:Class + ) + +
    + rdfs:comment A series of SnarlTraversal objects, each of which contain: A walk in the graph through a Snarl, and optionally a name.
    - rdfs:range - vg:Node + rdfs:isDefinedByhttps://raw.githubusercontent.com/vgteam/libvgio/master/deps/vg.proto
    + rdfs:labelSnarlTraversal
    -

    Datatype properties

    - - + @@ -638,13 +688,1411 @@

    Datatype properties

    - + + + + + + + +
    - vg:rank +
    + vg:Step (rdf:type - - owl:DatatypeProperty + + owl:Class )
    rdfs:comment - "The rank records the step place along its path." - + "A step along a path in the variant graph. A series of steps along a path represent an assembled sequence that was originally inserted into the the variant graph. A step points to a :Node or the reverse complement of a node and has a rank (step number)." + xsd:string
    + + rdfs:label + "Step" + + xsd:string +
    + rdfs:subClassOf + owl:Thing
    + + + + + + + + + + + + + + + + + + + + + + + + +
    + vg:VG_Format + (rdf:type + + owl:Class + ) + +
    + rdfs:commentConsists of nodes, edges, and paths. Nodes each have a unique ID and a DNA sequence, edges consist of two node sides (so the graph is bidirected), and paths are a sequence of nodes and orientations. Paths also have names.
    + rdfs:isDefinedByhttps://raw.githubusercontent.com/vgteam/libvgio/master/deps/vg.proto
    + rdfs:labelGraph
    + rdfs:subClassOfhttp://edamontology.org/format_2055
    + rdfs:subClassOfhttp://edamontology.org/format_2921
    + + + + + + + + + + + + + + + + + + + + +
    + vg:XG_Format + (rdf:type + + owl:Class + ) + +
    + rdfs:commentAn immutable, succinct representation of a graph. Contains the same info as the .vg.
    + rdfs:isDefinedByhttps://raw.githubusercontent.com/vgteam/libvgio/master/deps/vg.proto
    + rdfs:labelXG
    + rdfs:seeAlso + vg:VG_Format
    + + + + + + + + + + + + + + + + +
    + vg:ZoomLevel + (rdf:type + + owl:Class + ) + +
    + rdfs:comment + "A zoom level of Pangenome." + + xsd:string +
    + rdfs:label + "ZoomLevel" + + xsd:string +
    + rdfs:subClassOf + owl:Thing
    +
    +
    +
    +

    + Properties +

    +
    +

    Object properties

    + + + + + + + + + + + + + + + + + + + + + + + + +
    + vg:arrival + (rdf:type + + owl:ObjectProperty + ) + +
    + rdfs:comment + "An end bin of nonlinear link. Incoming edge of the bin." + + xsd:string +
    + rdfs:domain + vg:Link
    + rdfs:label + "arrival" + + xsd:string +
    + rdfs:range + vg:Bin
    + owl:inverseOf + vg:departure
    + + + + + + + + + + + + + + + + + + + + +
    + vg:bins + (rdf:type + + owl:ObjectProperty + ) + +
    + rdfs:comment + "A Component has one or more Bins." + + xsd:string +
    + rdfs:domain + vg:Component
    + rdfs:label + "bins" + + xsd:string +
    + rdfs:range + vg:Bin
    + + + + + + + + + + + + + + + + + + + + +
    + vg:cellRegions + (rdf:type + + owl:ObjectProperty + ) + +
    + rdfs:comment + "A cell is composed of the specific subsequence of paths." + + xsd:string +
    + rdfs:domain + vg:Cell
    + rdfs:label + "cellRegions" + + xsd:string +
    + rdfs:rangehttp://biohackathon.org/resource/faldo#Region
    + + + + + + + + + + + + + + + + + + + + +
    + vg:cells + (rdf:type + + owl:ObjectProperty + ) + +
    + rdfs:comment + "A Bin has one or more paths of a specific group (component) of a specific zoom. That is represented as cells on the matrix." + + xsd:string +
    + rdfs:domain + vg:Bin
    + rdfs:label + "cells" + + xsd:string +
    + rdfs:range + vg:Cell
    + + + + + + + + + + + + + + + + + + + + +
    + vg:components + (rdf:type + + owl:ObjectProperty + ) + +
    + rdfs:comment + "A zoom level has one or more components." + + xsd:string +
    + rdfs:domain + vg:ZoomLevel
    + rdfs:label + "components" + + xsd:string +
    + rdfs:range + vg:Component
    + + + + + + + + + + + + + + + + + + + + + + + + +
    + vg:departure + (rdf:type + + owl:ObjectProperty + ) + +
    + rdfs:comment + "A start bin of nonlinear link. Outgoing edge of the bin." + + xsd:string +
    + rdfs:domain + vg:Link
    + rdfs:label + "departure" + + xsd:string +
    + rdfs:range + vg:Bin
    + owl:inverseOf + vg:arrival
    + + + + + + + + + + + + + + + + + + + + +
    + vg:forwardBinEdge + (rdf:type + + owl:ObjectProperty + ) + +
    + rdfs:comment + "The super property that says two bins are linked in forward orientation on the pangenome sequence." + + xsd:string +
    + rdfs:domain + vg:Bin
    + rdfs:label + "forwardBinEdge" + + xsd:string +
    + owl:inverseOf + vg:reverseBinEdge
    + + + + + + + + + + + + + + + + + + + + +
    + vg:forwardComponentEdge + (rdf:type + + owl:ObjectProperty + ) + +
    + rdfs:comment + "The super property that says two components are linked in forward orientation on the pangenome sequence." + + xsd:string +
    + rdfs:domain + vg:Component
    + rdfs:label + "forwardComponentEdge" + + xsd:string +
    + rdfs:range + vg:Component
    + + + + + + + + + + + + + + + + + + + + +
    + vg:forwardLinkEdge + (rdf:type + + owl:ObjectProperty + ) + +
    + rdfs:comment + "The super property that says two links are linked in forward orientation on the link column." + + xsd:string +
    + rdfs:domain + vg:Link
    + rdfs:label + "forwardLinkEdge" + + xsd:string +
    + rdfs:range + vg:Link
    + + + + + + + + + + + + + + + + + + + + +
    + vg:linkPaths + (rdf:type + + owl:ObjectProperty + ) + +
    + rdfs:comment + "A list of paths that follow the same non-linear link between two bins." + + xsd:string +
    + rdfs:domain + vg:Link
    + rdfs:label + "linkPaths" + + xsd:string +
    + rdfs:range + vg:Path
    + + + + + + + + + + + + + + + + + + + + +
    + vg:linkZoomLevel + (rdf:type + + owl:ObjectProperty + ) + +
    + rdfs:comment + "A link is related to each zoom level." + + xsd:string +
    + rdfs:domain + vg:Link
    + rdfs:label + "linkZoomLevel" + + xsd:string +
    + rdfs:range + vg:ZoomLevel
    + + + + + + + + + + + + + + + + + + + + +
    + vg:links + (rdf:type + + owl:ObjectProperty + ) + +
    + rdfs:comment + "The super property that says two nodes are linked and does not allow one to infer which side to side it goes" + + xsd:string +
    + rdfs:domain + vg:Node
    + rdfs:label + "links" + + xsd:string +
    + rdfs:range + vg:Node
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + vg:linksForwardToForward + (rdf:type + + owl:ObjectProperty + ) + +
    + rdfs:comment + "This links a node from the forward (5' to 3') strand on the subject node to the forward (5' to 3') strand on the predicate node." + + xsd:string +
    + rdfs:domain + vg:Node
    + rdfs:label + "++" + + xsd:string +
    + rdfs:label + "linksForwardToForward" + + xsd:string +
    + rdfs:range + vg:Node
    + rdfs:subPropertyOf + vg:links
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + vg:linksForwardToReverse + (rdf:type + + owl:ObjectProperty + ) + +
    + rdfs:comment + "This links a node from the forward (5' to 3') strand on the subject node to the reverse (3' to 5') strand on the predicate node." + + xsd:string +
    + rdfs:domain + vg:Node
    + rdfs:label + "+-" + + xsd:string +
    + rdfs:label + "linksForwardToReverse" + + xsd:string +
    + rdfs:range + vg:Node
    + rdfs:subPropertyOf + vg:links
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + vg:linksReverseToForward + (rdf:type + + owl:ObjectProperty + ) + +
    + rdfs:comment + "This links a node from the reverse (3' to 5') strand on the subject node to the forward (5' to 3') strand on the predicate node." + + xsd:string +
    + rdfs:domain + vg:Node
    + rdfs:label + "-+" + + xsd:string +
    + rdfs:label + "linksReverseToForward" + + xsd:string +
    + rdfs:range + vg:Node
    + rdfs:subPropertyOf + vg:links
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + vg:linksReverseToReverse + (rdf:type + + owl:ObjectProperty + ) + +
    + rdfs:comment + "This links a node from the reverse (3' to 5') strand on the subject node to the reverse (3' to 5') strand on the predicate node." + + xsd:string +
    + rdfs:domain + vg:Node
    + rdfs:label + "--" + + xsd:string +
    + rdfs:label + "linksReverseToReverse" + + xsd:string +
    + rdfs:range + vg:Node
    + rdfs:subPropertyOf + vg:links
    + + + + + + + + + + + + + + + + + + + + +
    + vg:node + (rdf:type + + owl:ObjectProperty + ) + +
    + rdfs:comment + "This means that this step occurs on the forward strand of the sequence attaced to the node (i.e. it is on the explicit encoded forward (5' to 3') strand) of the predicate node." + + xsd:string +
    + rdfs:domain + vg:Step
    + rdfs:label + "node" + + xsd:string +
    + rdfs:range + vg:Node
    + + + + + + + + + + + + + + + + + + + + +
    + vg:path + (rdf:type + + owl:ObjectProperty + ) + +
    + rdfs:comment + "This means that this step occurs on the path that is the object of this statment" + + xsd:string +
    + rdfs:domain + vg:Step
    + rdfs:label + "path" + + xsd:string +
    + rdfs:range + vg:Path
    + + + + + + + + + + + + + + + + + + + + +
    + vg:position + (rdf:type + + owl:ObjectProperty + ) + +
    + rdfs:comment + "This is the position on the reference path at which this step starts." + + xsd:string +
    + rdfs:domain + vg:Step
    + rdfs:label + "position" + + xsd:string +
    + rdfs:range + xsd:positiveInteger
    + + + + + + + + + + + + + + + + + + + + +
    + vg:reverseBinEdge + (rdf:type + + owl:ObjectProperty + ) + +
    + rdfs:comment + "The super property that says two bins are linked in reverse orientation on the pangenome sequence." + + xsd:string +
    + rdfs:domain + vg:Bin
    + rdfs:label + "reverseBinEdge" + + xsd:string +
    + owl:inverseOf + vg:forwardBinEdge
    + + + + + + + + + + + + + + + + + + + + +
    + vg:reverseComponentEdge + (rdf:type + + owl:ObjectProperty + ) + +
    + rdfs:comment + "The super property that says two components are linked in reverse orientation on the pangenome sequence." + + xsd:string +
    + rdfs:domain + vg:Component
    + rdfs:label + "reverseComponentEdge" + + xsd:string +
    + owl:inverseOf + vg:forwardComponentEdge
    + + + + + + + + + + + + + + + + + + + + +
    + vg:reverseLinkEdge + (rdf:type + + owl:ObjectProperty + ) + +
    + rdfs:comment + "The super property that says two links are linked in reverse orientation on the link column." + + xsd:string +
    + rdfs:domain + vg:Link
    + rdfs:label + "reverseLinkEdge" + + xsd:string +
    + owl:inverseOf + vg:forwardLinkEdge
    + + + + + + + + + + + + + + + + + + + + +
    + vg:reverseOfNode + (rdf:type + + owl:ObjectProperty + ) + +
    + rdfs:comment + "This means this step occurs on the revese complement of the sequence attaced to the node (i.e. it is on the implicit reverse (3' to 5') strand) of the predicate node." + + xsd:string +
    + rdfs:domain + vg:Step
    + rdfs:label + "reverseOfNode" + + xsd:string +
    + rdfs:range + vg:Node
    +

    Datatype properties

    + + + + + + + + + + + + + + + + + + + + +
    + vg:binRank + (rdf:type + + owl:DatatypeProperty + ) + +
    + rdfs:comment + "The rank records that step place along its pangenome sequence." + + xsd:string +
    + rdfs:domain + vg:Bin
    + rdfs:label + "binRank" + + xsd:string +
    + rdfs:range + xsd:positiveInteger
    + + + + + + + + + + + + + + + + + + + + +
    + vg:componentRank + (rdf:type + + owl:DatatypeProperty + ) + +
    + rdfs:comment + "The rank records that step place along its pangenome sequence." + + xsd:string +
    + rdfs:domain + vg:Component
    + rdfs:label + "componentRank" + + xsd:string +
    + rdfs:range + xsd:positiveInteger
    + + + + + + + + + + + + + + + + + + + + +
    + vg:inversionPercent + (rdf:type + + owl:DatatypeProperty + ) + +
    + rdfs:comment + "The inversion percent of the path in the component." + + xsd:string +
    + rdfs:domain + vg:Cell
    + rdfs:label + "inversionPercent" + + xsd:string +
    + rdfs:range + xsd:float
    + + + + + + + + + + + + + + + + + + + + +
    + vg:linkRank + (rdf:type + + owl:DatatypeProperty + ) + +
    + rdfs:comment + "The rank records that step place along a link column." + + xsd:string +
    + rdfs:domain + vg:Link
    + rdfs:label + "linkRank" + + xsd:string +
    + rdfs:range + xsd:positiveInteger
    + + + + + + + + + + + + + + + + + + + + +
    + vg:positionPercent + (rdf:type + + owl:DatatypeProperty + ) + +
    + rdfs:comment + "The position coverage percent of the path in the component." + + xsd:string +
    + rdfs:domain + vg:Cell
    + rdfs:label + "positionPercent" + + xsd:string +
    + rdfs:range + xsd:float
    + + + + + + + + + @@ -654,7 +2102,49 @@

    Datatype properties

    rdfs:label + + + + + + +
    + vg:rank + (rdf:type + + owl:DatatypeProperty + ) + +
    + rdfs:comment + "The rank records the step place along its path." + + xsd:string +
    rdfs:domain vg:Step "rank" - + + xsd:string +
    + rdfs:range + xsd:positiveInteger
    + + + + + + + + + + + + + + @@ -687,6 +2177,7 @@

    Datatype properties

  • NIH under R24OD011883
  • DNA Databank of Japan (DDBJ)
  • Sanger centre
+ • The University of Tokyo
  • University of California, Santa Cruz, David Haussler lab
  • diff --git a/ontology/vg.ttl b/ontology/vg.ttl index 54a2b24bdc4..056e421fa01 100644 --- a/ontology/vg.ttl +++ b/ontology/vg.ttl @@ -6,6 +6,8 @@ @prefix rdfs: . @prefix spin: . @prefix xsd: . +@prefix edam: . +@prefix faldo: . rdf:type owl:Ontology ; @@ -13,7 +15,7 @@ spin:imports ; spin:imports ; spin:imports ; - owl:versionInfo "Created at the DBCLS RDF Summit 2, Sendai Japan 20"^^xsd:string ; + owl:versionInfo "Created at the DBCLS RDF Summit 2, Sendai Japan and COVID-19 Virtual Biohackathon"^^xsd:string ; . :Node rdf:type owl:Class ; @@ -23,7 +25,7 @@ . :Path rdf:type owl:Class ; - rdfs:comment "A Path is a collection of steps from path to path that represent an asserdfs:labelmbled sequence integrated into the variant grap."^^xsd:string ; + rdfs:comment "A Path is a collection of steps from path to path that represent an asserdfs:labelmbled sequence integrated into the variant graph."^^xsd:string ; rdfs:label "Path"^^xsd:string ; rdfs:subClassOf owl:Thing ; . @@ -33,8 +35,16 @@ rdfs:label "Step"^^xsd:string ; rdfs:subClassOf owl:Thing ; . +:links + rdf:type owl:ObjectProperty ; + rdfs:comment "The super property that says two nodes are linked and does not allow one to infer which side to side it goes"^^xsd:string ; + rdfs:domain :Node ; + rdfs:label "links"^^xsd:string ; + rdfs:range :Node ; +. :linksForwardToForward rdf:type owl:ObjectProperty ; + rdfs:subPropertyOf :links ; rdfs:comment "This links a node from the forward (5' to 3') strand on the subject node to the forward (5' to 3') strand on the predicate node."^^xsd:string ; rdfs:domain :Node ; rdfs:label "++"^^xsd:string ; @@ -43,6 +53,7 @@ . :linksForwardToReverse rdf:type owl:ObjectProperty ; + rdfs:subPropertyOf :links ; rdfs:comment "This links a node from the forward (5' to 3') strand on the subject node to the reverse (3' to 5') strand on the predicate node."^^xsd:string ; rdfs:domain :Node ; rdfs:label "+-"^^xsd:string ; @@ -51,6 +62,7 @@ . :linksReverseToForward rdf:type owl:ObjectProperty ; + rdfs:subPropertyOf :links ; rdfs:comment "This links a node from the reverse (3' to 5') strand on the subject node to the forward (5' to 3') strand on the predicate node."^^xsd:string ; rdfs:domain :Node ; rdfs:label "-+"^^xsd:string ; @@ -59,6 +71,7 @@ . :linksReverseToReverse rdf:type owl:ObjectProperty ; + rdfs:subPropertyOf :links ; rdfs:comment "This links a node from the reverse (3' to 5') strand on the subject node to the reverse (3' to 5') strand on the predicate node."^^xsd:string ; rdfs:domain :Node ; rdfs:label "--"^^xsd:string ; @@ -70,8 +83,23 @@ rdfs:comment "This means that this step occurs on the forward strand of the sequence attaced to the node (i.e. it is on the explicit encoded forward (5' to 3') strand) of the predicate node."^^xsd:string ; rdfs:domain :Step ; rdfs:label "node"^^xsd:string ; - rdfs:range :Step ; + rdfs:range :Node ; . +:path + rdf:type owl:ObjectProperty ; + rdfs:comment "This means that this step occurs on the path that is the object of this statment"^^xsd:string ; + rdfs:domain :Step ; + rdfs:label "path"^^xsd:string ; + rdfs:range :Path ; +. + +:position + rdf:type owl:ObjectProperty ; + rdfs:comment "This is the position on the reference path at which this step starts."^^xsd:string ; + rdfs:domain :Step ; + rdfs:label "position"^^xsd:string ; + rdfs:range xsd:positiveInteger . + :rank rdf:type owl:DatatypeProperty ; rdfs:comment "The rank records the step place along its path."^^xsd:string ; @@ -86,3 +114,230 @@ rdfs:label "reverseOfNode"^^xsd:string ; rdfs:range :Node ; . 
+:ZoomLevel + rdf:type owl:Class ; + rdfs:comment "A zoom level of Pangenome."^^xsd:string ; + rdfs:label "ZoomLevel"^^xsd:string ; + rdfs:subClassOf owl:Thing ; +. +:zoomFactor + rdf:type owl:DatatypeProperty ; + rdfs:comment "The zoom factor of pangenome, which is defined as bin width, generally."^^xsd:string ; + rdfs:domain :ZoomLevel ; + rdfs:label "zoomFactor"^^xsd:string ; + rdfs:range xsd:positiveInteger ; +. +:components + rdf:type owl:ObjectProperty ; + rdfs:comment "A zoom level has one or more components."^^xsd:string ; + rdfs:domain :ZoomLevel ; + rdfs:label "components"^^xsd:string ; + rdfs:range :Component ; +. +:Component + rdf:type owl:Class ; + rdfs:comment "A region between Links structuring all bins and their present individuals into one component."^^xsd:string ; + rdfs:label "Component"^^xsd:string ; + rdfs:subClassOf owl:Thing ; +. +:forwardComponentEdge + rdf:type owl:ObjectProperty ; + rdfs:comment "The super property that says two components are linked in forward orientation on the pangenome sequence."^^xsd:string ; + rdfs:domain :Component ; + rdfs:label "forwardComponentEdge"^^xsd:string ; + rdfs:range :Component ; +. +:reverseComponentEdge + rdf:type owl:ObjectProperty ; + owl:inverseOf :forwardComponentEdge ; + rdfs:comment "The super property that says two components are linked in reverse orientation on the pangenome sequence."^^xsd:string ; + rdfs:domain :Component ; + rdfs:label "reverseComponentEdge"^^xsd:string ; + rdfs:domain :Component ; +. +:componentRank + rdf:type owl:DatatypeProperty ; + rdfs:comment "The rank records that step place along its pangenome sequence."^^xsd:string ; + rdfs:domain :Component ; + rdfs:label "componentRank"^^xsd:string ; + rdfs:range xsd:positiveInteger ; +. +:bins + rdf:type owl:ObjectProperty ; + rdfs:comment "A Component has one or more Bins."^^xsd:string ; + rdfs:domain :Component ; + rdfs:label "bins"^^xsd:string ; + rdfs:range :Bin ; +. +:Bin + rdf:type owl:Class ; + rdfs:comment "A bin in the output of odgi bin, representing a sequence section."^^xsd:string ; + rdfs:label "Bin"^^xsd:string ; + rdfs:subClassOf owl:Thing ; +. +:binRank + rdf:type owl:DatatypeProperty ; + rdfs:comment "The rank records that step place along its pangenome sequence."^^xsd:string ; + rdfs:domain :Bin ; + rdfs:label "binRank"^^xsd:string ; + rdfs:range xsd:positiveInteger ; +. +:forwardBinEdge + rdf:type owl:ObjectProperty ; + owl:inverseOf :reverseBinEdge ; + rdfs:comment "The super property that says two bins are linked in forward orientation on the pangenome sequence."^^xsd:string ; + rdfs:domain :Bin ; + rdfs:label "forwardBinEdge"^^xsd:string ; + rdfs:domain :Bin ; +. +:reverseBinEdge + rdf:type owl:ObjectProperty ; + owl:inverseOf :forwardBinEdge ; + rdfs:comment "The super property that says two bins are linked in reverse orientation on the pangenome sequence."^^xsd:string ; + rdfs:domain :Bin ; + rdfs:label "reverseBinEdge"^^xsd:string ; + rdfs:domain :Bin ; +. +:Link + rdf:type owl:Class ; + rdfs:comment "A link marks a graph traversal along a nonlinear connection."^^xsd:string ; + rdfs:label "Link"^^xsd:string ; + rdfs:subClassOf owl:Thing ; +. +:arrival + rdf:type owl:ObjectProperty ; + rdfs:comment "An end bin of nonlinear link. Incoming edge of the bin."^^xsd:string ; + owl:inverseOf :departure; + rdfs:domain :Link ; + rdfs:label "arrival"^^xsd:string ; + rdfs:range :Bin ; +. +:departure + rdf:type owl:ObjectProperty ; + rdfs:comment "A start bin of nonlinear link. 
Outgoing edge of the bin."^^xsd:string ; + owl:inverseOf :arrival; + rdfs:domain :Link ; + rdfs:label "departure"^^xsd:string ; + rdfs:range :Bin ; +. +:linkPaths + rdf:type owl:ObjectProperty ; + rdfs:comment "A list of paths that follow the same non-linear link between two bins."^^xsd:string ; + rdfs:domain :Link ; + rdfs:label "linkPaths"^^xsd:string ; + rdfs:range :Path ; +. +:forwardLinkEdge + rdf:type owl:ObjectProperty ; + rdfs:comment "The super property that says two links are linked in forward orientation on the link column."^^xsd:string ; + rdfs:domain :Link ; + rdfs:label "forwardLinkEdge"^^xsd:string ; + rdfs:range :Link ; +. +:reverseLinkEdge + rdf:type owl:ObjectProperty ; + owl:inverseOf :forwardLinkEdge ; + rdfs:comment "The super property that says two links are linked in reverse orientation on the link column."^^xsd:string ; + rdfs:domain :Link ; + rdfs:label "reverseLinkEdge"^^xsd:string ; + rdfs:domain :Link ; +. +:linkRank + rdf:type owl:DatatypeProperty ; + rdfs:comment "The rank records that step place along a link column."^^xsd:string ; + rdfs:domain :Link ; + rdfs:label "linkRank"^^xsd:string ; + rdfs:range xsd:positiveInteger ; +. +:linkZoomLevel + rdf:type owl:ObjectProperty ; + rdfs:comment "A link is related to each zoom level."^^xsd:string ; + rdfs:domain :Link ; + rdfs:label "linkZoomLevel"^^xsd:string ; + rdfs:range :ZoomLevel ; +. +:cells + rdf:type owl:ObjectProperty ; + rdfs:comment "A Bin has one or more paths of a specific group (component) of a specific zoom. That is represented as cells on the matrix."^^xsd:string ; + rdfs:domain :Bin ; + rdfs:label "cells"^^xsd:string ; + rdfs:range :Cell ; +. +:Cell + rdf:type owl:Class ; + rdfs:comment "A cell along a path of a specific group (component) of a specific zoom. A cell to a :Component and a :Path with supplemental informations."^^xsd:string ; + rdfs:label "Cell"^^xsd:string ; + rdfs:subClassOf owl:Thing ; +. +:cellRegions + rdf:type owl:ObjectProperty ; + rdfs:comment "A cell is composed of the specific subsequence of paths."^^xsd:string ; + rdfs:domain :Cell ; + rdfs:label "cellRegions"^^xsd:string ; + rdfs:range faldo:Region ; +. +:positionPercent + rdf:type owl:DatatypeProperty ; + rdfs:comment "The position coverage percent of the path in the component."^^xsd:string ; + rdfs:domain :Cell ; + rdfs:label "positionPercent"^^xsd:string ; + rdfs:range xsd:float ; +. +:inversionPercent + rdf:type owl:DatatypeProperty ; + rdfs:comment "The inversion percent of the path in the component."^^xsd:string ; + rdfs:domain :Cell ; + rdfs:label "inversionPercent"^^xsd:string ; + rdfs:range xsd:float ; +. +:VG_Format + a owl:Class ; + rdfs:subClassOf edam:format_2921, edam:format_2055 ; + rdfs:label "Graph" ; + rdfs:comment "Consists of nodes, edges, and paths. Nodes each have a unique ID and a DNA sequence, edges consist of two node sides (so the graph is bidirected), and paths are a sequence of nodes and orientations. Paths also have names." ; + rdfs:isDefinedBy ; +. +:GAM_Format + a owl:Class ; + rdfs:label "Alignment" ; + rdfs:subClassOf edam:format_2055 ; + rdfs:comment "Contains an alignment to a graph" ; + rdfs:isDefinedBy ; +. +:GAMP_Format + a owl:Class ; + rdfs:label "MultipathAlignment" ; + rdfs:comment "Contains a DAG of subalignments to different paths through the reference graph" ; + rdfs:isDefinedBy ; +. 
+:Snarl_Format + a owl:Class ; + rdfs:label "Snarl" ; + rdfs:comment " A series of Snarl objects, each of which contain: The boundaries of a snarl (a bubble-like motif in a graph), the snarl (if any) that contains this one, classifications of the snarl based on the reachability of the two boundary nodes from each other and the cyclic-ness of the subcomponent." ; + rdfs:isDefinedBy ; +. +:Snarl_Traversal_Format + a owl:Class ; + rdfs:label "SnarlTraversal" ; + rdfs:comment " A series of SnarlTraversal objects, each of which contain: A walk in the graph through a Snarl, and optionally a name." ; + rdfs:isDefinedBy ; +. +:XG_Format + a owl:Class ; + rdfs:label "XG" ; + rdfs:comment "An immutable, succinct representation of a graph. Contains the same info as the .vg." ; + rdfs:seeAlso :VG_Format ; + rdfs:isDefinedBy ; +. +:GCSA2_Format + a owl:Class ; + rdfs:label "GCSA2" ; + rdfs:comment "Generalized compressed suffix array 2, an exact match index published by Jouni Siren" ; + rdfs:isDefinedBy ; +. +:GBWT_Format + a owl:Class ; + rdfs:label "GBWT" ; + rdfs:comment "Graph Burrows-Wheeler Transform, a succinct collection of paths through a genome graph, published by Jouni Siren. Usually represents haplotypes." ; + rdfs:isDefinedBy ; +. diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 00000000000..dd8e64d19b8 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +junit_logging = all diff --git a/scripts/analyze_indels.py b/scripts/analyze_indels.py new file mode 100755 index 00000000000..f112d6faa5d --- /dev/null +++ b/scripts/analyze_indels.py @@ -0,0 +1,130 @@ +#!/usr/bin/python3 + +""" +analyze_indels.py: take GAM JSON as input and produce tables of non-softclip +indel positions in the read and lengths +""" + +import argparse +import os +import sys +import time +import subprocess +import collections +import json + + +def parse_args(args): + """ + Takes in the command-line arguments list (args), and returns a nice argparse + result with fields for all the options. + + Borrows heavily from the argparse documentation examples: + + """ + + # Construct the parser (which is stored in parser) + # Module docstring lives in __doc__ + # See http://python-forum.com/pythonforum/viewtopic.php?f=3&t=36847 + # And a formatter class so our examples in the docstring look good. Isn't it + # convenient how we already wrapped it to 80 characters? + # See http://docs.python.org/library/argparse.html#formatter-class + parser = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + + parser.add_argument("--input", type=argparse.FileType('r'), default=sys.stdin, + help="JSON GAM to process") + parser.add_argument("--positions", required=True, type=argparse.FileType('w'), + help="TSV of positions and counts to write") + parser.add_argument("--lengths", required=True, type=argparse.FileType('w'), + help="TSV of lengths and counts to write") + parser.add_argument("--mapqs", required=True, type=argparse.FileType('w'), + help="TSV of indel/gapless categories, MAPQs and counts to write") + + # The command line arguments start with the program name, which we don't + # want to treat as an argument for argparse. So we remove it. + args = args[1:] + + return parser.parse_args(args) + +def main(args): + """ + Parses command line arguments and do the work of the program. + "args" specifies the program arguments, with args[0] being the executable + name. The return value should be used as the program's exit code. 
+ """ + + options = parse_args(args) # This holds the nicely-parsed options object + + # These will track stats over all the indels. + + # Where does each indel start in read space? + indel_starts = collections.Counter() + # How long is each indel? + indel_lengths = collections.Counter() + # For reads with and without indels, what are your MAPQs? + read_mapqs = {'indel': collections.Counter(), 'gapless': collections.Counter()} + + for line in options.input: + # Load each line + alignment = json.loads(line) + + # Parse the alignment using these counters + read_offset = 0 + ref_offset = 0 + + # Determine if the read is 'indel' or 'gapless' + status = 'gapless' + + mappings = alignment.get("path", {}).get("mapping", []) + + for i, mapping in enumerate(mappings): + edits = mapping.get("edit", []) + for j, edit in enumerate(edits): + # Parse each edit + from_length = edit.get("from_length", 0) + to_length = edit.get("to_length", 0) + + # If we're the first or last edit, we're never an indel; we're a softclip instead + is_border = (i == 0 and j == 0) or (i == len(mappings) - 1 and j == len(edits) - 1) + + # If the lengths differ and we aren't on the end of the read we're an indel. + is_indel = from_length != to_length and not is_border + + if is_indel: + # Record this indel + indel_starts[read_offset] += 1 + indel_lengths[to_length - from_length] += 1 + # Mark that the read has an indel + status = 'indel' + + # Advance the position counters + read_offset += to_length + ref_offset += from_length + + # Record the read's MAPQ + read_mapqs[status][alignment.get("mapping_quality", 0)] += 1 + + # Now dump tables for plotting + for value, count in indel_starts.items(): + options.positions.write("{}\t{}\n".format(value, count)) + + for value, count in indel_lengths.items(): + options.lengths.write("{}\t{}\n".format(value, count)) + + for status, counts in read_mapqs.items(): + for value, count in counts.items(): + options.mapqs.write("{}\t{}\t{}\n".format(status, value, count)) + + +def entrypoint(): + """ + 0-argument entry point for setuptools to call. + """ + + # Provide main with its arguments and handle exit codes + sys.exit(main(sys.argv)) + +if __name__ == "__main__" : + entrypoint() + diff --git a/scripts/calc_insert.py b/scripts/calc_insert.py index cbb774222cf..c942d51ad5e 100755 --- a/scripts/calc_insert.py +++ b/scripts/calc_insert.py @@ -26,5 +26,5 @@ def calc_sd(vals, mean): if len(vals) == 1000: mean = calc_mean(vals) sd = calc_sd(vals, mean) - print "Mean: ", mean, ", SD: ", sd + print("Mean: ", mean, ", SD: ", sd) del vals[:] diff --git a/scripts/chunked_call b/scripts/chunked_call index 9b0995546c1..675245828c0 100755 --- a/scripts/chunked_call +++ b/scripts/chunked_call @@ -1,4 +1,4 @@ -#!/usr/bin/env python2.7 +#!/usr/bin/python3 """ Generate a VCF from a GAM and XG by splitting into GAM/VG chunks. Chunks are then called in series, and the VCFs stitched together. 
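
For the new scripts/analyze_indels.py added above, a usage sketch (assuming `vg view -a` to convert a GAM to per-line JSON; the file names here are illustrative):

```sh
# Tabulate non-softclip indel positions, lengths, and MAPQ distributions from a GAM.
vg view -a mapped.gam | scripts/analyze_indels.py \
    --positions indel_positions.tsv \
    --lengths indel_lengths.tsv \
    --mapqs indel_mapqs.tsv
```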
@@ -83,7 +83,7 @@ def run(cmd, proc_stdout = sys.stdout, proc_stderr = sys.stderr, check = True): """ run command in shell and throw exception if it doesn't work """ - print cmd + print(cmd) proc = subprocess.Popen(cmd, shell=True, bufsize=-1, stdout=proc_stdout, stderr=proc_stderr) output, errors = proc.communicate() diff --git a/scripts/compare-graphs.sh b/scripts/compare-graphs.sh deleted file mode 100755 index 4616b9eff3e..00000000000 --- a/scripts/compare-graphs.sh +++ /dev/null @@ -1,281 +0,0 @@ -#!/usr/bin/env bash -# compare-graphs.sh: compare a set of graph against each other using toil-vg mapeval on AWS - -set -ex - -# What toil-vg should we install? -TOIL_VG_PACKAGE="git+https://github.com/vgteam/toil-vg.git@bf1006b4932ce48d1bd742691619808285582c4c" - -# What Toil should we use? -TOIL_APPLIANCE_SELF=quay.io/ucsc_cgl/toil:3.16.0a1.dev2281-c7d77b028064a739e897f7b1eb158c902b530475 - -# What vg should we use? -VG_DOCKER_OPTS=() - -# How many nodes should we use at most? -MAX_NODES=6 - -# What's our unique run ID? Should be lower-case and start with a letter for maximum compatibility. -# See -RUN_ID="run$(cat /dev/urandom | LC_CTYPE=C tr -dc 'a-z0-9' | fold -w 32 | head -n 1)" - -# What cluster should we use? -CLUSTER_NAME="${RUN_ID}" -MANAGE_CLUSTER=1 - -# Should we delete the job store when we exit? -# We do by default, and if the run finishes successfully. -# We don't if we're asked to keep it and Toil errors out. -REMOVE_JOBSTORE=1 - -# Should we delete the outstore at the end of the script, if we're deleting the -# jobstore? -REMOVE_OUTSTORE=1 - -# What input reads and position truth set should we use? -READ_STEM="comparison" - -# Should we look for .bam reads (instead of .gam)? -USE_BAM_READS=0 - -# Should we look for .fq.gz reads (instead of .gam)? -USE_FQ_READS=0 - -# Should we add --ignore_quals -IGNORE_QUALS="" - -# Should we restart? -RESTART_ARG="" - -usage() { - # Print usage to stderr - exec 1>&2 - printf "Usage: $0 [Options] OUTPUT_PATH KEYPAIR_NAME REGION_NAME GRAPH [GRAPH [GRAPH ...]] \n" - printf "Options:\n\n" - printf "\t-p PACKAGE\tUse the given Python package specifier to install toil-vg.\n" - printf "\t-t CONTAINER\tUse the given Toil container in the cluster (default: ${TOIL_APPLIANCE_SELF}).\n" - printf "\t-c CLUSTER\tUse the given existing Toil cluster.\n" - printf "\t-v DOCKER\tUse the given Docker image specifier for vg.\n" - printf "\t-r READS\tUse the given read set stem (default: ${READ_STEM}).\n" - printf "\t-b BAM-READS\tUse BAM input reads (in READ_STEM).\n" - printf "\t-f FASTQ-READS\tUse .fq.gz input reads (in READ_STEM). Much faster than GAM/BAM.\n" - printf "\t-R RUN_ID\tUse or restart the given run ID.\n" - printf "\t-k \tKeep the out store and job store in case of error.\n" - printf "\t-s \tRestart a failed run.\n" - printf "\t-3 \tUse S3 instead of HTTP (much faster)\n" - printf "\t-i \tIgnore base qualities (needed if not running on trained sim or bam data)\n" - exit 1 -} - -while getopts "hp:t:c:v:r:bfR:ks3i" o; do - case "${o}" in - p) - TOIL_VG_PACKAGE="${OPTARG}" - ;; - t) - TOIL_APPLIANCE_SELF="${OPTARG}" - ;; - c) - CLUSTER_NAME="${OPTARG}" - MANAGE_CLUSTER=0 - ;; - v) - VG_DOCKER_OPTS="--vg_docker ${OPTARG}" - ;; - r) - READ_STEM="${OPTARG}" - ;; - b) - USE_BAM_READS=1 - ;; - f) - USE_FQ_READS=1 - ;; - R) - # This doesn't change the cluster name, which will still be the old run ID if not manually set. - # That's probably fine. 
- RUN_ID="${OPTARG}" - ;; - k) - REMOVE_JOBSTORE=0 - ;; - s) - RESTART_ARG="--restart" - ;; - 3) - USE_S3=1 - ;; - i) - IGNORE_QUALS="--ignore-quals" - ;; - *) - usage - ;; - esac -done - -shift $((OPTIND-1)) - -if [[ "$#" -lt "4" ]]; then - # Too few arguments - usage -fi - -OUTPUT_PATH="${1}" -shift -KEYPAIR_NAME="${1}" -shift -REGION_NAME="${1}" -shift - -GRAPH_NAMES=( ) -while [[ "$#" -gt "0" ]]; do - # Put all the args as graph names - GRAPH_NAMES+=("$1") - shift -done - -# Where do we keep our input files -if [ "$REGION_NAME" == "B37" ] || [ "$REGION_NAME" == "HS37D5" ]; then - STORE_TAG="$REGION_NAME" -else - STORE_TAG="bakeoff" -fi -if [[ "${USE_S3}" -eq "1" ]]; then - INPUT_STORE="s3://cgl-pipeline-inputs/vg_cgl/${STORE_TAG}" -else - INPUT_STORE="https://cgl-pipeline-inputs.s3.amazonaws.com/vg_cgl/${STORE_TAG}" -fi - -# Where do we save our results from the various jobs responsible for writing them? -OUTPUT_STORE="aws:us-west-2:cgl-pipeline-inputs/vg_cgl/comparison-script/runs/${RUN_ID}" -OUTPUT_STORE_URL="s3://cgl-pipeline-inputs/vg_cgl/comparison-script/runs/${RUN_ID}" - -# Where do we store our jobs? -JOB_TREE="aws:us-west-2:${RUN_ID}" - -# Put this in front of commands to do or not do them -PREFIX="" - -echo "Running run ${RUN_ID} as ${KEYPAIR_NAME} to compare ${GRAPH_NAMES[*]} on ${REGION_NAME} into ${OUTPUT_PATH}" - -function get_input_url() { - # Prints the input URL to download for the given file name - local BASE_FILENAME="${1}" - shift - echo "${INPUT_STORE}/${BASE_FILENAME}" -} - -function get_graph_url() { - # Prints the base URL for the given graph - local BASE_GRAPHNAME="${1}" - shift - get_input_url "${BASE_GRAPHNAME}-${REGION_NAME}" -} - -# Make sure we don't leave the cluster running or data laying around on exit. -function clean_up() { - set +e - if [[ "${REMOVE_JOBSTORE}" == "1" ]]; then - # Delete the Toil intermediates we could have used to restart the job - $PREFIX toil clean "${JOB_TREE}" - - if [[ "${REMOVE_OUTSTORE}" == "1" ]]; then - # Toil is happily done and we downloaded things OK - # Delete the outputs - $PREFIX aws s3 rm --recursive "${OUTPUT_STORE_URL}" - fi - fi - if [[ "${MANAGE_CLUSTER}" == "1" ]]; then - # Destroy the cluster - $PREFIX toil destroy-cluster "${CLUSTER_NAME}" -z us-west-2a - fi -} -trap clean_up EXIT - -# Convert just graph stems to full base urls -GRAPH_URLS=() -for GRAPH_STEM in "${GRAPH_NAMES[@]}"; do - GRAPH_URLS+=(`get_graph_url "${GRAPH_STEM}"`) -done - -if [[ "${MANAGE_CLUSTER}" == "1" ]]; then - TOIL_APPLIANCE_SELF="${TOIL_APPLIANCE_SELF}" $PREFIX toil launch-cluster "${CLUSTER_NAME}" --leaderNodeType=t2.medium -z us-west-2a "--keyPairName=${KEYPAIR_NAME}" -fi - -# We need to manually install git to make pip + git work... 
-$PREFIX toil ssh-cluster --insecure --zone=us-west-2a "${CLUSTER_NAME}" apt update -$PREFIX toil ssh-cluster --insecure --zone=us-west-2a "${CLUSTER_NAME}" apt install git -y - -# Ignore the old virtualenv if re-using a cluster - -# For hot deployment to work, toil-vg needs to be in a virtualenv that can see the system Toil -$PREFIX toil ssh-cluster --insecure --zone=us-west-2a "${CLUSTER_NAME}" virtualenv --system-site-packages venv - -$PREFIX toil ssh-cluster --insecure --zone=us-west-2a "${CLUSTER_NAME}" venv/bin/pip install pyyaml -$PREFIX toil ssh-cluster --insecure --zone=us-west-2a "${CLUSTER_NAME}" venv/bin/pip install aws -$PREFIX toil ssh-cluster --insecure --zone=us-west-2a "${CLUSTER_NAME}" venv/bin/pip install numpy -$PREFIX toil ssh-cluster --insecure --zone=us-west-2a "${CLUSTER_NAME}" venv/bin/pip install scipy -$PREFIX toil ssh-cluster --insecure --zone=us-west-2a "${CLUSTER_NAME}" venv/bin/pip install scikit-learn -$PREFIX toil ssh-cluster --insecure --zone=us-west-2a "${CLUSTER_NAME}" venv/bin/pip install "${TOIL_VG_PACKAGE}" - -# We need the master's IP to make Mesos go -MASTER_IP="$($PREFIX toil ssh-cluster --insecure --zone=us-west-2a --logOff "${CLUSTER_NAME}" hostname -i)" - -# Strip some garbage from MASTER_IP -MASTER_IP="${MASTER_IP//[$'\t\r\n ']}" - -# Make sure we download the outstore whether we break now or not -set +e - -# What truth/read set should we use? -READ_SET="${READ_STEM}-${REGION_NAME}" - -# Toggle BAM Reads -if [[ "${USE_BAM_READS}" -eq "1" ]]; then - INPUT_OPTS="--bam_input_reads $(get_input_url ${READ_SET}.bam)" -elif [[ "${USE_FQ_READS}" -eq "1" ]]; then - INPUT_OPTS="--truth $(get_input_url ${READ_SET}.pos) --fastq $(get_input_url ${READ_SET}.fq.gz)" -else - INPUT_OPTS="--truth $(get_input_url ${READ_SET}.pos) --gam_input_reads $(get_input_url ${READ_SET}.gam)" -fi - -$PREFIX toil ssh-cluster --insecure --zone=us-west-2a "${CLUSTER_NAME}" venv/bin/toil-vg mapeval \ - --whole_genome_config \ - ${RESTART_ARG} \ - ${VG_DOCKER_OPTS} \ - ${INPUT_OPTS} \ - --fasta `get_input_url "${REGION_NAME}.fa"` \ - --index-bases "${GRAPH_URLS[@]}" \ - --gam-names "${GRAPH_NAMES[@]}" \ - --bwa \ - --multipath ${IGNORE_QUALS} \ - --mapeval-threshold 200 \ - --realTimeLogging --logInfo \ - "${JOB_TREE}" \ - "${OUTPUT_STORE}" \ - --batchSystem mesos --provisioner=aws "--mesosMaster=${MASTER_IP}:5050" \ - --nodeTypes=r3.8xlarge:0.70 --defaultPreemptable --maxNodes=${MAX_NODES} --retryCount=3 -TOIL_ERROR="$?" - -# Make sure the output is public -$PREFIX toil ssh-cluster --insecure --zone=us-west-2a "${CLUSTER_NAME}" venv/bin/aws s3 sync --acl public-read "${OUTPUT_STORE_URL}" "${OUTPUT_STORE_URL}" - -mkdir -p "${OUTPUT_PATH}" -aws s3 sync "${OUTPUT_STORE_URL}" "${OUTPUT_PATH}" -DOWNLOAD_ERROR="$?" - -if [[ "${TOIL_ERROR}" == "0" ]]; then - # Toil completed successfully. - # We will delete the job store - REMOVE_JOBSTORE=1 -fi - -if [[ ! 
"${DOWNLOAD_ERROR}" == "0" ]]; then - # Download failed - # We will keep the out store - # (we also keep if if Toil fails and we're keeping the jobstore) - REMOVE_OUTSTORE=0 -fi - -# Cluster, tree, and output will get cleaned up by the exit trap diff --git a/scripts/filter-noisy-assembler-warnings.py b/scripts/filter-noisy-assembler-warnings.py index 49f8f5e90da..85d9c291e18 100755 --- a/scripts/filter-noisy-assembler-warnings.py +++ b/scripts/filter-noisy-assembler-warnings.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python3 # # filter-noisy-assembler-warnings.py # Author: Stuart Berg diff --git a/scripts/filter_unpaired_mappings.py b/scripts/filter_unpaired_mappings.py index 6674e858b7d..bfbbbdea0b3 100755 --- a/scripts/filter_unpaired_mappings.py +++ b/scripts/filter_unpaired_mappings.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python3 # filter_unpaired_mappings.py: Filter alignments so that every pair of lines represents a valid read pair """ @@ -112,8 +112,8 @@ def filter_json_gam(options): first_record = gam elif 'fragment_prev' in gam and first_record and gam['fragment_prev']['name'] == first_record['name']: assert first_record['fragment_next']['name'] == gam['name'] - print json.dumps(first_record) - print json.dumps(gam) + print(json.dumps(first_record)) + print(json.dumps(gam)) num_written += 2 elif 'fragment_next' not in gam and 'fragment_prev' not in gam: raise RuntimeError('fragment_prev/next not set for record: {}'.format(json.dumps(gam))) diff --git a/scripts/filter_variants_on_repeats.py b/scripts/filter_variants_on_repeats.py index 0451795bdfc..8b020de6f6a 100755 --- a/scripts/filter_variants_on_repeats.py +++ b/scripts/filter_variants_on_repeats.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python3 # -*- coding: utf-8 -*- # filter_variants_on_repeats.py: Drop low-frequency variants on repeats """ diff --git a/scripts/format_reads.py b/scripts/format_reads.py index de8fc62ea2d..b585cbc168c 100644 --- a/scripts/format_reads.py +++ b/scripts/format_reads.py @@ -23,7 +23,7 @@ def make_fastq(record, name, fake_qual, seq="", anno=""): if record.name == "": record.name = name - record.qual = "".join([fake_qual for i in xrange(0, len(record.seq))]) + record.qual = "".join([fake_qual for i in range(0, len(record.seq))]) return record @@ -49,5 +49,5 @@ def parse_args(): record.seq = line.strip() record.name = name_base + "_" + str(count) record = make_fastq(record, "", fake_qual) - print record.string() + print(record.string()) count += 1 diff --git a/scripts/giraffe-facts.py b/scripts/giraffe-facts.py new file mode 100755 index 00000000000..ffb0392aa79 --- /dev/null +++ b/scripts/giraffe-facts.py @@ -0,0 +1,958 @@ +#!/usr/bin/python3 + + + +""" +giraffe-facts.py: process a GAM file from the new minimizer-based mapper (vg giraffe) and report runtime statistics by filter. +""" + +import argparse +import os +import sys +import time +import subprocess +import collections +import io +import itertools +import json +import random +import math + +# Force output to UTF-8. Eventually we can use reconfigure() if we drop 3.6 +# and earlier. +# We need to do this before these streams get picked as any argument default +# values. +sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8') +sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf8') + +# We depend on our local histogram.py +import histogram + +FACTS = ["Giraffes are the tallest living terrestrial animal.", + "There are nine subspecies of giraffe, each occupying separate regions of Africa. 
Some researchers consider some subspecies to be separate species entirely.",
+         "Giraffes' horn-like structures are called 'ossicones'. They consist mostly of ossified cartilage.",
+         "Male giraffes compete for dominance in fights in which they strike each other with their necks.",
+         "There are more than 1600 giraffes in captivity worldwide.",
+         "The name 'giraffe' has roots in the Arabic 'zarafah', meaning 'fast-walker'.",
+         "Before the name 'giraffe' came into standard use, giraffes were commonly called 'camelopards'.",
+         "There are 10 known extinct species of giraffe.",
+         "The closest living relative to the giraffe is the okapi, an endangered hoofed mammal from the Congo.",
+         "Full grown giraffes are usually between 14 and 18 feet tall.",
+         "The tallest recorded giraffe was 19.3 feet tall.",
+         "Adult male giraffes weigh an average of 2628 lbs., whereas females weigh 1825 lbs.",
+         "Giraffes have the ability to close their nostrils to protect against sandstorms and ants.",
+         "Giraffes have 18-inch-long prehensile tongues, which they use for grasping foliage and for grooming.",
+         "Male giraffes' spots grow darker as they age.",
+         "Under their fur coat, giraffes have grey skin.",
+         "Female giraffes have hair on their ossicones, whereas males' ossicones are bald.",
+         "Giraffes use the weight of their head to maintain their balance when they gallop.",
+         "Giraffes can run at 37 miles per hour for short distances, and 31 miles per hour for several miles.",
+         "Giraffes sleep for about half an hour a day.",
+         "Giraffes have the same number of vertebrae as most mammals. The length of their neck comes from longer vertebrae (over 10 inches each).",
+         "Giraffes' neck is fairly short at birth, probably to make birthing easier for mothers.",
+         "A giraffe's heart can weigh more than 25 lbs.",
+         "Giraffes have structures like check valves in their necks' veins to prevent blood from rushing to their head when they bend down to drink.",
+         "Giraffes have a four-chambered stomach similar to cattle.",
+         "An adult giraffe can eat 75 lbs. of foliage per day.",
+         "While generally herbivorous, giraffes have been observed eating meat and bone from carcasses.",
+         "The giraffe's gestation period is 14 months.",
+         "Newborn giraffes are about 6 feet tall.",
+         "Giraffes are lions' most common prey.",
+         "Most of giraffes' mounting behavior is between two males, often after a fight for dominance.",
+         "Giraffes allow red-billed oxpeckers (a bird species) to perch on them to feed on ticks.",
+         "Egyptian hieroglyphs use the giraffe as a character, pronounced 'sr'.",
+         "Designers of suits for fighter pilots studied giraffe skin, since fighter pilots are also at risk of passing out when blood rushes to the legs.",
+         "The Humr people of Sudan use giraffe liver to create a putatively hallucinogenic drink called 'umm nyolokh'. The drink's psychoactive properties may come from the giraffe's diet of acacia plants.",
+         "The giraffe is the national animal of Tanzania.",
+         "There are around 100,000 giraffes in the wild as of 2016.",
+         "Giraffes only need to drink every few days. Most of their water comes from the vegetation they eat.",
+         "Giraffes give birth standing up, so newborn giraffes fall over 5 feet upon being born.",
+         "Giraffes usually sleep standing upright.",
+         "Male giraffes detect oestrus in females by tasting their urine.",
+         "June 21 is World Giraffe Day.",
+         "Toys R' Us has used Geoffrey the Giraffe as its mascot since 1965, although earlier advertisements in the 1950's used another giraffe: Dr. G. 
Raffe.", + "Giraffe hooves are 1 foot in diameter.", + "About 50% of giraffe calves die in their first year, mostly due to predation.", + "Kirahvi sanoo öri öri öri öri öri öri.", + "The giraffe's average walking speed is 10 miles per hour.", + "The giraffe's tongue is colored dark blue.", + "Some of giraffes' vocalizations are too low to be heard by human ears.", + "Giraffes have never been observed swimming.", + "Mozambique requires power lines to be 39 feet high so giraffes can safely pass underneath."] + +def parse_args(args): + """ + Takes in the command-line arguments list (args), and returns a nice argparse + result with fields for all the options. + + Borrows heavily from the argparse documentation examples: + + """ + + # Construct the parser (which is stored in parser) + # Module docstring lives in __doc__ + # See http://python-forum.com/pythonforum/viewtopic.php?f=3&t=36847 + # And a formatter class so our examples in the docstring look good. Isn't it + # convenient how we already wrapped it to 80 characters? + # See http://docs.python.org/library/argparse.html#formatter-class + parser = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + + parser.add_argument("--input", type=argparse.FileType('r'), default=sys.stdin, + help="line-oriented JSON GAM to process") + parser.add_argument("outdir", + help="directory to place output in") + + # The command line arguments start with the program name, which we don't + # want to treat as an argument for argparse. So we remove it. + args = args[1:] + + return parser.parse_args(args) + +def sniff_params(read): + """ + Given a read dict parsed from JSON, compute a mapping parameter dict for the read. + + The read will have param_XXX annotations. Turn those into a dict from XXX to value. + + These should be the same for every read. + """ + + # This is the annotation dict from the read + annot = read.get('annotation', {}) + + # This is the params dict to fill in + params = {} + + for annot_name in annot.keys(): + if annot_name.startswith('param_'): + # Split the param annotations on underscore + (_, param_name) = annot_name.split('_') + + # Save the values under the name + params[param_name] = annot[annot_name] + + return params + +# Stats under NO_FILTER are not associated with a filter +NO_FILTER = "__none__" + +def make_stats(read): + """ + Given a read dict parsed from JSON, compute a stats OrderedDict for the read. + + Run on an empty dict, makes a zero-value stats dict. + + A stats dict maps from filter name to a Counter of filter stats. + The filter stats include: + + - 'passed_count_total' which is the count of results passing the + filter. + - 'failed_count_total' which is the count of results failing the + filter. + - 'passed_count_correct' which is the count of correct results passing + the filter. + - 'failed_count_correct' which is the count of correct results failing + the filter. + + Additionally, each of these '_count_' stats has a '_size_' version, + describing the total size of all items meeting the specified criteria (as + opposed to the number of items). + + For the 'seed' stage, correctness information is not yet available, so only + the '_total' values will be defined. '_correct' values will be set to None + (instead of 0). + + The Counter for a filter also has sub-Counters embedded in it for + expressing distributions of filter statistic values, to assist in filter + design. 
+ + - 'statistic_distribution_correct': statistic value counts for items + deemed correct + - 'statistic_distribution_noncorrect': statistic value counts for items + not deemed correct + + NaN values of the statistics are filtered out. + + Filters appear in the OrderedDict in an order corresponding to their filter + number in the GAM. + + The stats dict may also have an entry for NO_FILTER, with stats not + associated with a filter. + """ + + # This is the annotation dict from the read + annot = read.get('annotation', {}) + + # This will map from filter number int to filter name + filters_by_index = {} + + # This will map from filter name to Counter of filter stats + filter_stats = collections.defaultdict(collections.Counter) + + for annot_name in annot.keys(): + # For each annotation + if annot_name.startswith('filter_'): + # If it is an individual filter info item + + # Names look like 'filter_2_cluster-score-threshold_cluster_passed_size_correct' + + # Break into components on underscores + (_, filter_num, filter_name, filter_stage, filter_status, filter_accounting, filter_metric) = annot_name.split('_') + + # Collect integers + filter_num = int(filter_num) + filter_stat_value = annot[annot_name] + + # Record the filter being at this index if not known already + filters_by_index[filter_num] = filter_name + + if filter_stage == 'minimizer': + # Wer are filtering items produced by the minimizer stage. + # At the minimizer stage, correct and incorrect are not defined yet. + if filter_metric == 'correct': + # Make sure we didn't get any counts + assert filter_stat_value == 0 + # None out the correct stat so we can detect this when making the table + filter_stat_value = None + + # Record the stat value + filter_stats[filter_name]['{}_{}_{}'.format(filter_status, filter_accounting, filter_metric)] = filter_stat_value + + elif annot_name.startswith('filterstats_'): + # It is a whole collection of correct or not-necessarily-correct filter statistic distribution values, for plotting. 
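+            # Names look like 'filterstats_2_cluster-score-threshold_cluster_correct':
+            # the same number/name/stage fields as the 'filter_' annotations above,
+            # ending in 'correct' or 'noncorrect', with a list of raw statistic values.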
+ + # Break into components on underscores (correctness will be 'correct' or 'noncorrect' + (_, filter_num, filter_name, filter_stage, filter_correctness) = annot_name.split('_') + + distribution = collections.Counter() + + for item in annot[annot_name]: + # Parse all the statistic vlues + item = float(item) + + if math.isnan(item): + # Discard NANs + continue + + # Count all instances of the same value + distribution[item] += 1 + + # Save the statistic distribution + filter_stats[filter_name]['statistic_distribution_{}'.format(filter_correctness)] = distribution + + elif annot_name.startswith('last_correct_stage'): + stage = annot[annot_name] + if stage == 'none': + filter_stats['hard-hit-cap']['last_correct_stage'] = 1 + elif stage == 'cluster': + filter_stats['cluster-coverage']['last_correct_stage'] = 1 + elif stage == 'extend': + filter_stats['extension-set']['last_correct_stage'] = 1 + elif stage == 'align': + filter_stats['max-alignments']['last_correct_stage'] = 1 + + # Now put them all in this OrderedDict in order + ordered_stats = collections.OrderedDict() + for filter_index in sorted(filters_by_index.keys()): + filter_name = filters_by_index[filter_index] + ordered_stats[filter_name] = filter_stats[filter_name] + + # Add in special non-filter stats + ordered_stats[NO_FILTER] = collections.Counter() + for k in ['time_used']: + if k in read: + ordered_stats[NO_FILTER][k] = read[k] + + return ordered_stats + +def add_in_stats(destination, addend): + """ + Add the addend stats dict into the destination stats dict. + Implements += for stats dicts. + """ + + for k, v in addend.items(): + if v is None: + # None will replace anything and propagate through + destination[k] = None + elif isinstance(v, dict): + # Recurse into dict + if k in destination: + add_in_stats(destination[k], v) + else: + # Use real += and hope it works + destination[k] += v + +def read_line_oriented_json(lines): + """ + For each line in the given stream, yield it as a parsed JSON object. + """ + + for line in lines: + yield json.loads(line) + +class Table(object): + """ + Format a table of output nicely in fixed-width text. + """ + + # Interface + + def __init__(self, widths, out=sys.stdout): + """ + Start a table with the given column widths (a list of integers) in + characters, printing to the given stream. + """ + + # Remember the base widths + self.widths = widths + + # Remember the out stream + self.out = out + + # Remember the previous actual column widths used, if any. + # None if no wor has been produced. + self.last_widths = None + + # Remember if we need a dividing line + self.need_line = False + + def line(self): + """ + Say to divide the previous row from the next row. + """ + + self.need_line = True + + def row(self, values, justify='l', merge=None, line_top=False, line_bottom=False): + """ + Given a list of values, one per column, for up to the number of columns + in the table, draw a table row. + + Justify can be 'l', 'r', 'c' or a list/string of those per value. + + If merge is given, it must be a list of the number of cells to merge + horizontally for each value. + + Different merge values without a line_top separator will look bad. + If line_top is set, divide from the previous row. + If line_bottom is set, divide from the next row. + """ + + # Compute new merged widths + merged_widths = self.compute_merges(merge) + + # Start or continue the table + if self.last_widths is None: + # Start the table + self.start(merged_widths) + elif self.need_line or line_top: + # Divide from the previous row. 
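+            # sep() needs both the previous row's widths and this row's widths so
+            # that the junction characters line up with both sets of cell borders.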
+ self.sep(self.last_widths, merged_widths) + + # Print the actual row + self.cells(values, justify, merged_widths) + + # Remember this row's widths for next time. + self.last_widths = merged_widths + + # Remember if we need a line + self.need_line = line_bottom + + def full_row(self, left_value, right_value): + """ + Draw a full-width row in the table with a left-justified and a + right-justified value. + """ + + full_value = left_value + right_value.rjust(self.inner_width() - len(left_value)) + self.row([full_value], merge=[len(self.widths)]) + + + def close(self): + """ + Close off the table at the bottom. + """ + + if self.last_widths is None: + self.last_widths = self.widths + + self.end(self.last_widths) + + self.last_widths = None + + def inner_width(self): + """ + Get the total width of the table across all columns, between the outer edges. + """ + + return sum(self.widths) + len(self.widths) - 1 + + # Internal methods + + def box(self, part): + """ + Return the box-drawing character to draw the given part of a box. + Parts are {(t)op, (m)iddle, (b)ottom} crossed with {(l)eft, (m)iddle, + (r)ight} as two-character strings, plus (v)ertical and (h)orizontal as one-character strings. + """ + + skin = { + 'tl': '┌', + 'tm': '┬', + 'tr': 'â”', + 'bl': 'â””', + 'bm': 'â”´', + 'br': '┘', + 'ml': '├', + 'mm': '┼', + 'mr': '┤', + 'v': '│', + 'h': '─' + } + + return skin[part] + + def horizontal(self, left, junction, right, column, widths=None): + """ + Print a line across (either top, middle, or bottom). + + Takes the leftmost, between-column, rightmost, and in-column characters + as box() character ID strings. + + Can use a specified widths list, usually self.widths. + """ + + if widths is None: + widths = self.widths + + # Start edge + self.out.write(self.box(left)) + + for i, width in enumerate(widths): + # For each column + # Do its top line + self.out.write(self.box(column) * width) + if i + 1 != len(widths): + # Do the separator + self.out.write(self.box(junction)) + + # End edge + self.out.write(self.box(right)) + + self.out.write('\n') + + def start(self, widths_after): + """ + Print an opening line at the top of the table. + Needs to know the widths of the cells on the next table line. + """ + + self.horizontal('tl', 'tm', 'tr', 'h', widths_after) + + def end(self, widths_before): + """ + Print a closing line at the bottom of the table. + Needs to know the widths of the cells on the previous table line. + """ + + self.horizontal('bl', 'bm', 'br', 'h', widths_before) + + def sep(self, widths_before, widths_after): + """ + Print a middle separator line across the table. + Needs to know the widths of the cells on the previous and next table lines. + Both sets of widths must describe a table of the same total width. + """ + + # Start edge + self.out.write(self.box('ml')) + + # Compute total width (cells and separators), not counting final border + total_width = sum(widths_before) + len(widths_before) - 1 + + # Track what cell we are in on top + before_cursor = 0 + # And what column its trailing border is at + before_border = widths_before[before_cursor] + # Track what cell we are in on the bottom + after_cursor = 0 + # And what column its trailing border is at + after_border = widths_after[after_cursor] + # Track what column of internal table width we are in. 
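+        # Walk across every character column of the row; emit a junction wherever a
+        # cell border from the row above and/or the row below falls, and a plain
+        # horizontal bar everywhere else.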
+ col = 0 + + while col < total_width: + if col == before_border: + if col == after_border: + # Junction on both sides + char = self.box('mm') + + # Advance after + after_cursor += 1 + after_border += widths_after[after_cursor] + 1 + else: + # Junction on top only + char = self.box('bm') + + # Advance before + before_cursor += 1 + before_border += widths_before[before_cursor] + 1 + elif col == after_border: + # Junction on bottom only + char = self.box('tm') + + # Advance after + after_cursor += 1 + after_border += widths_after[after_cursor] + 1 + else: + # No junction + char = self.box('h') + + # Print the character + self.out.write(char) + + # Go to the next column + col += 1 + + + # End edge + self.out.write(self.box('mr')) + + self.out.write('\n') + + def compute_merges(self, merges=None): + """ + Given a list of cell counts to merge horizontally, compute new widths from self.widths. + + If merges is None, use self.widths. + """ + + widths = self.widths + + if merges is not None: + new_widths = [] + width_cursor = 0 + for merge in merges: + # Compute a new column by merging the given number of old columns. + merged_width = 0 + for i in range(merge): + # Take the widths of all cells + merged_width += widths[width_cursor] + width_cursor += 1 + # Take the separating columns between cells + merged_width += merge - 1 + new_widths.append(merged_width) + while width_cursor < len(widths): + # Copy any unmerged columns + new_widths.append(widths[i]) + + widths = new_widths + + return widths + + def cells(self, values, justify, widths): + """ + Given a list of values, one per column, for up to the number of columns + in the table, draw a table row. + + Justify can be 'l', 'r', 'c', or a list/string of those per value. + + Column count/widths must be passed. + """ + + # Start the row + self.out.write(self.box('v')) + + for i, (value, width) in enumerate(itertools.zip_longest(values, widths)): + # For each item and its column and width... + if width is None: + # Too many items + raise RuntimeError("Ran out of table width values ({}) for {} columns".format( + len(widths), len(values))) + + # Compute the item string + item_string = str(value) if value is not None else '' + + # Decide on justification for this item + if justify == 'l': + item_just = 'l' + elif justify == 'r': + item_just = 'r' + if justify == 'c': + item_just = 'c' + elif i < len(justify): + item_just = justify[i] + else: + item_just = 'l' + + # Actually justify it in a field of the necessary width + if item_just == 'l': + justified_item = item_string.ljust(width) + elif item_just == 'r': + justified_item = item_string.rjust(width) + elif item_just == 'c': + justified_item = item_string.center(width) + else: + raise RuntimeError('Invalid justification: {}'.format(item_just)) + + # Output the content + self.out.write(justified_item) + + if (i + 1 != len(widths)): + # This isn't the last item. Do a separator. + self.out.write(self.box('v')) + + + # End the row + # TODO: Same as the separator + self.out.write(self.box('v')) + + self.out.write('\n') + +def print_table(read_count, stats_total, params=None, out=sys.stdout): + """ + Take the read count, the accumulated total stats dict, and an optional dict + of mapping parameters corresponding to values for filters. + + Print a nicely formatted table to the given stream. 
+ + """ + + if stats_total is None: + # Handle the empty case + assert(read_count == 0) + out.write('No reads.\n') + return + + # Now do a table + + # First header line for each column + headers = [] + # Second header line for wach column + headers2 = [] + # Column min widths from headers + header_widths = [] + + # Find all the filters + filters = [k for k in stats_total.keys() if k != NO_FILTER] + + # Compute filter row headings + filter_headings = list(filters) + + if params is not None: + # Annotate each filter with its parameter value + annotated_headings = [] + for heading in filter_headings: + # For each filter + # It may be a compound thing||thing filter + parts = heading.split('||') + + # We will fill this with all the relevant filter cutoff values + filter_values = [] + for part in parts: + if part in params: + filter_values.append(params[part]) + + if len(filter_values) == 0: + # No parameters + annotated_headings.append(heading) + else: + # Annotate with the parameters + annotated_headings.append('{} ({})'.format(heading, ', '.join((str(x) for x in filter_values)))) + + filter_headings = annotated_headings + + # How long is the longest filter name + filter_width = max(itertools.chain((len(x) for x in filter_headings), [0])) + # Leave room for the header + filter_header = "Filter" + filter_width = max(filter_width, len(filter_header)) + # And for the "Overall" entry + filter_overall = "Overall" + filter_width = max(filter_width, len(filter_overall)) + + headers.append(filter_header) + headers2.append('') + header_widths.append(filter_width) + + # And the passing count columns (average) + passing_header = "Passing" + passing_header2 = "(/Read)" + passing_width = max(len(passing_header), len(passing_header2)) + + headers.append(passing_header) + headers2.append(passing_header2) + header_widths.append(passing_width) + + # And the failing count columns (average) + failing_header = "Failing" + failing_header2 = "(/Read)" + failing_width = max(len(failing_header), len(failing_header2)) + + headers.append(failing_header) + headers2.append(failing_header2) + header_widths.append(failing_width) + + # And the number of correct reads lost at each stage + lost_stage_header = "Lost" + lost_stage_header2 = "reads" + lost_stage_reads = [x for x in (stats_total[filter_name].get('last_correct_stage', 0) for filter_name in filters) if x is not None] + max_stage = max(itertools.chain(lost_stage_reads, [0])) + overall_lost_stage = sum(lost_stage_reads) + lost_stage_width = max(len(lost_stage_header), len(lost_stage_header2), len(str(max_stage)), len(str(overall_lost_stage))) + + headers.append(lost_stage_header) + headers2.append(lost_stage_header2) + header_widths.append(lost_stage_width) + + # And the correct result lost count header + lost_header = "Lost" + lost_header2 = "" + # How big a number will we need to hold? + # Look at the reads lost at all filters + # Account for None values for stages that don't have correctness defined yet. + lost_reads = [x for x in (stats_total[filter_name]['failed_count_correct'] for filter_name in filters) if x is not None] + max_filter_stop = max(itertools.chain(lost_reads, [0])) + # How many correct reads are lost overall by filters? 
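+    # (None placeholders for stages without correctness defined were dropped when
+    # building lost_reads above, so this sum is well-defined.)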
+ overall_lost = sum(lost_reads) + lost_width = max(len(lost_header), len(lost_header2), len(str(max_filter_stop)), len(str(overall_lost))) + + headers.append(lost_header) + headers2.append(lost_header2) + header_widths.append(lost_width) + + # And the total rejected count header + rejected_header = "Cut" + rejected_header2 = "" + # How big a number will we need to hold? + # Look at the reads rejected at all filters + rejected_reads = [stats_total[filter_name]['failed_count_total'] for filter_name in filters] + max_filter_stop = max(itertools.chain(rejected_reads, [0])) + # How many incorrect reads are rejected overall by filters? + overall_rejected = sum(rejected_reads) + rejected_width = max(len(rejected_header), len(rejected_header2), len(str(max_filter_stop)), len(str(overall_rejected))) + + headers.append(rejected_header) + headers2.append(rejected_header2) + header_widths.append(rejected_width) + + # Now do precision and recall + # How should we format them? + pr_format = '{:.2f}' + precision_header = "P" + precision_header2 = "" + precision_width = max(len(precision_header), len(precision_header2), len(pr_format.format(1.0)), len('N/A')) + headers.append(precision_header) + headers2.append(precision_header2) + header_widths.append(precision_width) + recall_header = "R" + recall_header2 = "" + recall_width = max(len(recall_header), len(recall_header2), len(pr_format.format(1.0)), len('N/A')) + headers.append(recall_header) + headers2.append(recall_header2) + header_widths.append(recall_width) + + + # Start the table + table = Table(header_widths) + + table.row(["Giraffe Facts"], 'c', merge=[len(header_widths)]) + table.line() + table.full_row('Reads', str(read_count)) + if 'time_used' in stats_total[NO_FILTER] and stats_total[NO_FILTER]['time_used'] != 0: + table.full_row('Mapping speed', '{:0.2f} RPS'.format(read_count / stats_total[NO_FILTER]['time_used'])) + table.line() + table.row(headers, 'c') + table.row(headers2, 'c') + table.line() + + + for i, filter_name in enumerate(filters): + # Grab average results passing this filter per read + total_passing = stats_total[filter_name]['passed_count_total'] + average_passing = total_passing / read_count if read_count != 0 else float('NaN') + + # Grab average results failing this filter per read + total_failing = stats_total[filter_name]['failed_count_total'] + average_failing = total_failing / read_count if read_count != 0 else float('NaN') + + # Grab reads that are lost. + # No reads are lost at the final stage. + lost = stats_total[filter_name]['failed_count_correct'] + + lost_stage = stats_total[filter_name]['last_correct_stage'] + + # And reads that are rejected at all + rejected = stats_total[filter_name]['failed_count_total'] + + if lost is None: + # Correctness is not defined yet. + # TODO: have a way to see if the correct mapping never shows up. 
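+            # Show 'N/A' instead of a number for stages where correctness is not
+            # defined yet.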
+ lost = 'N/A' + + # Compute precision + try: + precision = pr_format.format(stats_total[filter_name]['passed_count_correct'] / + stats_total[filter_name]['passed_count_total']) + except: + precision = 'N/A' + + # Compute recall + try: + recall = pr_format.format(stats_total[filter_name]['passed_count_correct'] / + (stats_total[filter_name]['passed_count_correct'] + + stats_total[filter_name]['failed_count_correct'])) + except: + recall = 'N/A' + + row = [filter_headings[i]] + align = 'c' + # Add the provenance columns + row += ['{:.2f}'.format(average_passing), '{:.2f}'.format(average_failing), lost_stage, lost, rejected, + precision, recall] + align += 'rrrrrr' + + # Output the finished row + table.row(row, align) + + table.line() + + # Compose the overall row + row = [filter_overall] + align = 'c' + # Add the provenance columns + row += ['', '', overall_lost_stage, overall_lost, overall_rejected, '', ''] + align += 'rr' + + table.row(row, align) + + # Close off table + table.close() + +def plot_filter_statistic_histograms(out_dir, stats_total): + """ + For each filter in the stats dict, see if it has nonempty + 'statistic_distribution_correct' and/or 'statistic_distribution_noncorrect' + Counters. Then if so, plot a histogram comparing correct and noncorrect + distributions, or just the noncorrect distribution if that is the only one + available (because correctness isn't known). + + Store histograms in out_dir. + """ + + for filter_name in stats_total.keys(): + correct_counter = stats_total[filter_name]['statistic_distribution_correct'] + noncorrect_counter = stats_total[filter_name]['statistic_distribution_noncorrect'] + + if not ((isinstance(correct_counter, dict) and len(correct_counter) > 0) or + (isinstance(noncorrect_counter, dict) and len(noncorrect_counter) > 0)): + + # No stats to plot + continue + + # Open a TSV file to draw a histogram from + tsv_path = os.path.join(out_dir, 'stat_{}.tsv'.format(filter_name)) + tsv = open(tsv_path, 'w') + + # Some stages don't have correctness annotation. So we track if we saw + # correct and noncorrect things to identify them. + have_correct = False + have_noncorrect = False + + if isinstance(correct_counter, dict) and len(correct_counter) > 0: + # We have correct item stats. + have_correct = True + for value, count in correct_counter.items(): + # Output format: label, value, repeats + tsv.write('correct\t{}\t{}\n'.format(value, count)) + + if isinstance(noncorrect_counter, dict) and len(noncorrect_counter) > 0: + # We have noncorrect item stats. + have_noncorrect = True + for value, count in noncorrect_counter.items(): + # Output format: label, value, repeats + tsv.write('noncorrect\t{}\t{}\n'.format(value, count)) + + tsv.close() + + # Now make the plot + svg_path = os.path.join(out_dir, 'stat_{}.svg'.format(filter_name)) + + args = ['histogram.py', tsv_path, '--save', svg_path, + '--title', '{} Statistic Histogram'.format(filter_name), + '--x_label', 'Statistic Value', + '--bins', '20', + '--y_label', 'Frequency'] + if have_correct and have_noncorrect: + args.append('--legend_overlay') + args.append('best') + args.append('--categories') + args.append('correct') + args.append('noncorrect') + histogram.main(args) + + + + +def main(args): + """ + Parses command line arguments and do the work of the program. + "args" specifies the program arguments, with args[0] being the executable + name. The return value should be used as the program's exit code. 
+ """ + + print(random.choice(FACTS), file = sys.stderr) + + options = parse_args(args) # This holds the nicely-parsed options object + + # Make the output directory if it doesn't exist + os.makedirs(options.outdir, exist_ok=True) + + # Make a place to total up all the stats + stats_total = None + + # Count all the reads + read_count = 0 + + # Record mapping parameters from at least one read + params = None + + for read in read_line_oriented_json(options.input): + + if params is None: + # Go get the mapping parameters + params = sniff_params(read) + + # For the stats dict for each read + stats = make_stats(read) + if stats_total is None: + stats_total = stats + else: + # Sum up all the stats + add_in_stats(stats_total, stats) + + # Count the read + read_count += 1 + + # After processing all the reads + + # Print the table now in case plotting fails + print_table(read_count, stats_total, params) + + # Make filter statistic histograms + plot_filter_statistic_histograms(options.outdir, stats_total) + +def entrypoint(): + """ + 0-argument entry point for setuptools to call. + """ + + # Provide main with its arguments and handle exit codes + sys.exit(main(sys.argv)) + +if __name__ == "__main__" : + entrypoint() + + diff --git a/scripts/giraffe-speed.sh b/scripts/giraffe-speed.sh new file mode 100755 index 00000000000..51068fa98f8 --- /dev/null +++ b/scripts/giraffe-speed.sh @@ -0,0 +1,76 @@ +#!/usr/bin/env bash +# +# giraffe-speed.sh: evaluate Giraffe mapping speed. +# +# Although CPU instruction counting doesn't work in this configuration for some +# reason, you can run on an AWS r5.4xlarge, with 100 GB disk, the +# "vg-data-user" IAM role and Ubuntu 20.04: +# +# git clone https://github.com/vgteam/vg.git +# cd vg +# git checkout +# git submodule update --init --recursive +# sudo apt update +# sudo apt install -y build-essential awscli +# make get-deps +# make -j16 +# ./scripts/giraffe-speed.sh +# +# In this configuration, a mapping speed of 3520.61 reads per second per +# thread, or 3527.4 reads per CPU-second (including output), is typical. +# Significant reductions may indicate a performance regression. +# +# Or, run in Docker or on Kubernetes using the Docker build of the version of +# vg you want to evaluate. Allocate at least 50 GB memory, 16 hyperthreads, and +# 100 GB of disk. The script dependencies should already be available. +# +# In this configuration, speed will vary with the machine type, and the load on +# the host, but a mapping speed of 3836.3 reads per CPU-second (including +# output) has been observed in a heavily loaded Kubernetes pod, and 5333.43 +# reads per CPU-second (including output) in a lightly loaded one, on the GI +# Kubernetes cluster. In all cases, 0.918424 M instructions per read is a +# typical value for GI Kubernetes's CPUs; significant changes from that +# indicate that more or less work is now required to map each read. +# +# We expect to be root, or to be able to sudo, so Giraffe can evaluate its +# instruction execution speed. +# +# Partially based on +# https://github.com/vgteam/giraffe-sv-paper/blob/566573b708878d8854acb088a0a8f7c920b120eb/scripts/giraffe/speed_giraffe.sh +set -e + +THREAD_COUNT=16 + +READS=s3://vg-k8s/profiling/reads/real/NA19239/novaseq6000-ERR3239454-shuffled-1m.fq.gz + +# The HGSVC graph is smaller and faster to load so we use that here. 
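+# GRAPH_BASE points at a shared set of HGSVC indexes; the .xg, .dist, .gbwt, .gg
+# and .min files fetched below are the indexes vg giraffe needs.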
+GRAPH=hgsvc +GRAPH_BASE=s3://vg-k8s/profiling/graphs/v2/for-NA19240/hgsvc/hs38d1/HGSVC_hs38d1 +# sampled.64 isn't much bigger than the base GBWT for HGSVC +GBWT="sampled.64" + +function fetch() { + # Download a URL to a file, if it isn't there already + if [ ! -e "${2}" ] ; then + # For now we only do S3 URLs + aws s3 cp "${1}" "${2}" + fi +} + +fetch "${READS}" "./reads.fq.gz" +fetch "${GRAPH_BASE}.xg" "./${GRAPH}.xg" +fetch "${GRAPH_BASE}.dist" "./${GRAPH}.dist" +fetch "${GRAPH_BASE}.${GBWT}.gbwt" "./${GRAPH}.${GBWT}.gbwt" +fetch "${GRAPH_BASE}.${GBWT}.gg" "./${GRAPH}.${GBWT}.gg" +fetch "${GRAPH_BASE}.${GBWT}.min" "./${GRAPH}.${GBWT}.min" + +# Build a bigger reads file (10m) so we can run for more than like 16 seconds +cat ./reads.fq.gz ./reads.fq.gz ./reads.fq.gz ./reads.fq.gz ./reads.fq.gz ./reads.fq.gz ./reads.fq.gz ./reads.fq.gz ./reads.fq.gz ./reads.fq.gz >./many-reads.fq.gz + +SUDO=$(which sudo 2>/dev/null || true) +VG=$(which vg 2>/dev/null || echo "bin/vg") + +timeout -k1 1h bash -c "${SUDO} ${VG} giraffe -x ${GRAPH}.xg -H ${GRAPH}.${GBWT}.gbwt -g ${GRAPH}.${GBWT}.gg -m ${GRAPH}.${GBWT}.min -d ${GRAPH}.dist -f ./many-reads.fq.gz -i -t ${THREAD_COUNT} -p 2>log.txt >mapped.gam" || true + +cat log.txt + diff --git a/scripts/giraffe-wrangler.sh b/scripts/giraffe-wrangler.sh new file mode 100755 index 00000000000..0f6d8dda15c --- /dev/null +++ b/scripts/giraffe-wrangler.sh @@ -0,0 +1,296 @@ +#!/usr/bin/env bash + +# giraffe-wrangler.sh: Run and profile vg giraffe and analyze the results. + +set -e + +usage() { + # Print usage to stderr + exec 1>&2 + printf "Usage: $0 [Options] FASTA XG_INDEX GCSA_INDEX GBWT_INDEX MINIMIZER_INDEX DISTANCE_INDEX SIM_GAM REAL_FASTQ\n" + printf "\n" + printf "Inputs may be files or S3 URLs.\n" + printf "\n" + printf "Arguments:\n" + printf " FASTA FASTA reference to run bwa-mem against; may be \"\"\n" + printf " XG_INDEX XG to annotate reads with positions, with corresponding .gg GBWTGraph\n" + printf " GCSA_INDEX GCSA (with LCP) for running vg map\n" + printf " GBWT_INDEX Haplotypes for mapping with Giraffe\n" + printf " MINIMIZER_INDEX Minimizers for mapping with Giraffe\n" + printf " DISTANCE_INDEX Distances for mapping with Giraffe\n" + printf " SIM_GAM Simulated reads for measuring mapping accuracy; may be \"\"\n" + printf " REAL_FASTQ Real reads for measuring mapping performance; may be \"\"\n" + printf "\n" + printf "Options:\n" + printf " -s DEST Save alignments and other internal files to DEST (directory or S3 url)\n" + printf " -t N Use N threads\n" + printf " -i Map reads as paired\n" + printf "\n" + exit 1 +} + +# Define where we should save our output +OUTPUT_DEST="" + +# Define the thread count for everyone. Can be changed with -t. +# Should fit on a NUMA node +THREAD_COUNT=24 + +# Define if we should pair reads +PAIR_READS=0 + +while getopts ":s:t:i" o; do + case "${o}" in + s) + OUTPUT_DEST="${OPTARG}" + ;; + t) + THREAD_COUNT="${OPTARG}" + ;; + i) + PAIR_READS=1 + ;; + ?) + usage + ;; + esac +done +shift $((OPTIND-1)) +if [[ "$#" -lt "8" ]]; then + # Too few arguments + usage +fi + +echo "Using ${THREAD_COUNT} threads" + +fetch_input() { + # Download the specified file, if not empty and not a file already. + # Dumps all files into the current directory as their basenames + # Output the new filename + if [[ ! -z "${1}" && "${1}" == s3://* ]] ; then + aws s3 --quiet cp "${1}" "$(basename "${1}")" + basename "${1}" + else + echo "${1}" + fi +} + +FASTA="$(fetch_input "${1}")" +if [[ ! 
-z ${FASTA} ]] ; then + # We have a FASTA + for EXT in amb ann bwt fai pac sa ; do + # Make sure we have all the indexes adjacent to the FASTA + fetch_input "${1}.${EXT}" >/dev/null + done +fi +shift +XG_INDEX="$(fetch_input "${1}")" +# Make sure we have the GBWTGraph pre-made +GBWT_GRAPH="$(fetch_input "${1%.xg}.gg")" +shift +GCSA_INDEX="$(fetch_input "${1}")" +LCP_INDEX="$(fetch_input "${1}.lcp")" +shift +GBWT_INDEX="$(fetch_input "${1}")" +shift +MINIMIZER_INDEX="$(fetch_input "${1}")" +shift +DISTANCE_INDEX="$(fetch_input "${1}")" +shift +SIM_GAM="$(fetch_input "${1}")" +shift +REAL_FASTQ="$(fetch_input "${1}")" +shift + +if [ -f "$GBWT_GRAPH" ]; then + GIRAFFE_GRAPH=(-g "${GBWT_GRAPH}") +else + GIRAFFE_GRAPH=(-x "${XG_INDEX}") +fi + +echo "Indexes:" +echo "${XG_INDEX}" +echo "${GBWT_GRAPH}" +echo "${GCSA_INDEX}" +echo "${LCP_INDEX}" +echo "${GBWT_INDEX}" +echo "${MINIMIZER_INDEX}" +echo "${DISTANCE_INDEX}" +echo "${SIM_GAM}" +echo "${REAL_FASTQ}" + +# Define the Giraffe parameters +GIRAFFE_OPTS=() + +# And the map parameters +MAP_OPTS=() + +# And the bwa mem parameters +BWA_OPTS=() + +if [[ "${PAIR_READS}" == 1 ]] ; then + # Turn on paired mapping + GIRAFFE_OPTS+=(-i) + MAP_OPTS+=(-i) + BWA_OPTS+=(-p) +fi + + + +# Define a work directory +# TODO: this requires GNU mptemp +WORK="$(mktemp -d)" + +# Check for NUMA. If we have NUMA and no numactl results may be unreliable +NUMA_COUNT="$(lscpu | grep "NUMA node(s)" | cut -f3- -d' ' | tr -d ' ')" +NUMA_PREFIX="" +NUMA_WARNING=0 + +if [[ "${NUMA_COUNT}" -gt "1" ]] ; then + if which numactl >/dev/null 2>&1 ; then + # Run everything on one NUMA node + NUMA_PREFIX="numactl --cpunodebind=0 --membind=0" + else + # We should warn in the report that NUMA may confound the results + NUMA_WARNING=1 + fi +fi + +if [[ ! -z "${REAL_FASTQ}" ]] ; then + if which perf >/dev/null 2>&1 ; then + # Record profile. + # Do this first because perf is likely to be misconfigured and we want to fail fast. + + # If we don't strip bin/vg to make it small, the addr2line calls that perf + # script makes take forever because the binary is huge + strip -d bin/vg + + ${NUMA_PREFIX} perf record -F 100 --call-graph dwarf -o "${WORK}/perf.data" vg giraffe "${GIRAFFE_GRAPH[@]}" -m "${MINIMIZER_INDEX}" -H "${GBWT_INDEX}" -d "${DISTANCE_INDEX}" -f "${REAL_FASTQ}" -t "${THREAD_COUNT}" "${GIRAFFE_OPTS[@]}" >"${WORK}/perf.gam" + perf script -i "${WORK}/perf.data" >"${WORK}/out.perf" + deps/FlameGraph/stackcollapse-perf.pl "${WORK}/out.perf" >"${WORK}/out.folded" + deps/FlameGraph/flamegraph.pl "${WORK}/out.folded" > "${WORK}/profile.svg" + fi +fi + +if [[ ! -z "${SIM_GAM}" ]] ; then + # Do simulated reads + + # Run simulated reads, with stats + ${NUMA_PREFIX} vg giraffe --track-correctness -x "${XG_INDEX}" "${GIRAFFE_GRAPH[@]}" -m "${MINIMIZER_INDEX}" -H "${GBWT_INDEX}" -d "${DISTANCE_INDEX}" -G "${SIM_GAM}" -t "${THREAD_COUNT}" "${GIRAFFE_OPTS[@]}" >"${WORK}/mapped.gam" + + # And map to compare with them + ${NUMA_PREFIX} vg map -x "${XG_INDEX}" -g "${GCSA_INDEX}" -G "${SIM_GAM}" -t "${THREAD_COUNT}" "${MAP_OPTS[@]}" >"${WORK}/mapped-map.gam" + + # Annotate and compare against truth + vg annotate -p -x "${XG_INDEX}" -a "${WORK}/mapped.gam" >"${WORK}/annotated.gam" + vg annotate -p -x "${XG_INDEX}" -a "${WORK}/mapped-map.gam" >"${WORK}/annotated-map.gam" + + # GAM compare against truth. Use gamcompare to count correct reads to save a JSON scan. 
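+    # gamcompare reports the number of correctly mapped reads on stderr; swap the
+    # streams so we capture that count and discard the GAM it writes to stdout.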
+ CORRECT_COUNT="$(vg gamcompare -r 100 "${WORK}/annotated.gam" "${SIM_GAM}" 2>&1 >/dev/null | sed 's/[^0-9]//g')" + CORRECT_COUNT_MAP="$(vg gamcompare -r 100 "${WORK}/annotated-map.gam" "${SIM_GAM}" 2>&1 >/dev/null | sed 's/[^0-9]//g')" + + # Compute identity of mapped reads + MEAN_IDENTITY="$(vg view -aj "${WORK}/mapped.gam" | jq -c 'select(.path) | .identity' | awk '{x+=$1} END {print x/NR}')" + MEAN_IDENTITY_MAP="$(vg view -aj "${WORK}/mapped-map.gam" | jq -c 'select(.path) | .identity' | awk '{x+=$1} END {print x/NR}')" + + # Compute loss stages + # Let giraffe facts errors out + vg view -aj "${WORK}/mapped.gam" | scripts/giraffe-facts.py "${WORK}/facts" >"${WORK}/facts.txt" +fi + +if [[ ! -z "${REAL_FASTQ}" ]] ; then + # Now do the real reads + + # Count them + if [[ "${REAL_FASTQ}" == *.gz ]] ; then + REAL_READ_COUNT="$(zcat "${REAL_FASTQ}" | wc -l)" + else + REAL_READ_COUNT="$(cat "${REAL_FASTQ}" | wc -l)" + fi + ((REAL_READ_COUNT /= 4)) + + # Get RPS for Giraffe + ${NUMA_PREFIX} vg giraffe -p "${GIRAFFE_GRAPH[@]}" -m "${MINIMIZER_INDEX}" -H "${GBWT_INDEX}" -d "${DISTANCE_INDEX}" -f "${REAL_FASTQ}" -t "${THREAD_COUNT}" "${GIRAFFE_OPTS[@]}" >"${WORK}/real.gam" 2>"${WORK}/log.txt" + + GIRAFFE_RPS="$(cat "${WORK}/log.txt" | grep "reads per second" | sed 's/[^0-9.]//g')" + + if [[ ! -z "${FASTA}" ]] ; then + # Get RPS for bwa-mem + + ${NUMA_PREFIX} bwa mem -t "${THREAD_COUNT}" "${FASTA}" "${REAL_FASTQ}" "${BWA_OPTS[@]}" >"${WORK}/mapped.bam" 2>"${WORK}/bwa-log.txt" + + # Now we get all the batch times from BWA and use those to compute RPS values. + # This is optimistic but hopefully consistent. + BWA_RPS_ALL_THREADS="$(cat "${WORK}/bwa-log.txt" | grep "Processed" | sed 's/[^0-9]*\([0-9]*\) reads in .* CPU sec, \([0-9]*\.[0-9]*\) real sec/\1 \2/g' | tr ' ' '\t' | awk '{sum1+=$1; sum2+=$2} END {print sum1/sum2}')" + + BWA_RPS="$(echo "${BWA_RPS_ALL_THREADS} / ${THREAD_COUNT}" | bc -l)" + + fi + + # Align the real reads with map, ignoring speed + ${NUMA_PREFIX} vg map -x "${XG_INDEX}" -g "${GCSA_INDEX}" -f "${REAL_FASTQ}" -t "${THREAD_COUNT}" "${MAP_OPTS[@]}" >"${WORK}/real-map.gam" + + # Compute stats for giraffe and map on real reads + echo "Real read stats:" >"${WORK}/real-stats.txt" + echo "Giraffe:" >>"${WORK}/real-stats.txt" + vg stats -a "${WORK}/real.gam" >>"${WORK}/real-stats.txt" 2>&1 + echo "Map:" >>"${WORK}/real-stats.txt" + vg stats -a "${WORK}/real-map.gam" >>"${WORK}/real-stats.txt" 2>&1 +fi + + +echo "==== Giraffe Wrangler Report for vg $(vg version -s) ====" + +if [[ "${NUMA_WARNING}" == "1" ]] ; then + echo "WARNING! Unable to restrict to a single NUMA node! Results may have high variance!" +fi + +if [[ ! -z "${REAL_FASTQ}" ]] ; then + if which perf >/dev/null 2>&1 ; then + # Output perf stuff + mv "${WORK}/perf.data" ./perf.data + mv "${WORK}/profile.svg" ./profile.svg + echo "Profiling information saved as ./perf.data" + echo "Interactive flame graph (for browsers) saved as ./profile.svg" + fi +fi + +# Print the report +if [[ ! -z "${SIM_GAM}" ]] ; then + # Include simulated reads + echo "Giraffe got ${CORRECT_COUNT} simulated reads correct with ${MEAN_IDENTITY} average identity per mapped base" + echo "Map got ${CORRECT_COUNT_MAP} simulated reads correct with ${MEAN_IDENTITY_MAP} average identity per mapped base" +fi +if [[ ! -z "${REAL_FASTQ}" ]] ; then + # Include real reads + echo "Giraffe aligned real reads at ${GIRAFFE_RPS} reads/second on ${THREAD_COUNT} threads" + if [[ ! 
-z "${FASTA}" ]] ; then + echo "bwa-mem aligned real reads at ${BWA_RPS} reads/second on ${THREAD_COUNT} threads" + fi +fi + +if [[ ! -z "${SIM_GAM}" ]] ; then + # Print Giraffe Facts for simulated reads + cat "${WORK}/facts.txt" +fi + +if [[ ! -z "${REAL_FASTQ}" ]] ; then + # Print real read stats + cat "${WORK}/real-stats.txt" +fi + +if [[ ! -z "${OUTPUT_DEST}" ]] ; then + if [[ "${OUTPUT_DEST}" == s3://* ]] ; then + # Save our intermediates to S3 + aws s3 cp --recursive "${WORK}" "${OUTPUT_DEST}" + else + # Save our intermediates to disk + cp -R "${WORK}" "${OUTPUT_DEST}" + fi +fi + +rm -Rf "${WORK}" + + + + diff --git a/scripts/histogram.py b/scripts/histogram.py new file mode 100755 index 00000000000..55aad143a42 --- /dev/null +++ b/scripts/histogram.py @@ -0,0 +1,677 @@ +#!/usr/bin/python3 +""" +histogram: plot a histogram of a file of numbers. Numbers can be floats, one per +line. Lines with two numbers are interpreted as pre-counted, with the number of +repeats of the first being given by the second. + +Multiple instances of the same value in a category will be merged by adding +weights. + +Re-uses sample code and documentation from + +""" + +import argparse, sys, os, itertools, math, numpy, collections +import matplotlib, matplotlib.ticker + +def intify(x): + """ + Turn an integral float into an int, if applicable. + """ + + if isinstance(x, float) and x.is_integer(): + return int(x) + return x + +def draw_labels(bin_counts, bar_patches, size=None): + """ + Put the given count labels on the given bar patches, on the current axes. + Takes an optional font size. + + """ + + from matplotlib import pyplot + + # Grab the axes + axes = pyplot.gca() + + for bin_count, bar_patch in zip(bin_counts, bar_patches): + + if(bin_count.is_integer()): + # Intify if applicable + bin_count = int(bin_count) + + # Label each bar + if bin_count == 0: + # Except those for empty bins + continue + + # Find the center of the bar + bar_center_x = bar_patch.get_x() + bar_patch.get_width() / float(2) + # And its height + bar_height = bar_patch.get_height() + + # Label the bar + axes.annotate("{:,}".format(bin_count), (bar_center_x, bar_height), + ha="center", va="bottom", rotation=45, xytext=(0, 5), + textcoords="offset points", size=size) + +def parse_args(args): + """ + Takes in the command-line arguments list (args), and returns a nice argparse + result with fields for all the options. + Borrows heavily from the argparse documentation examples: + + """ + + # The command line arguments start with the program name, which we don't + # want to treat as an argument for argparse. So we remove it. + args = args[1:] + + # Construct the parser (which is stored in parser) + # Module docstring lives in __doc__ + # See http://python-forum.com/pythonforum/viewtopic.php?f=3&t=36847 + # And a formatter class so our examples in the docstring look good. Isn't it + # convenient how we already wrapped it to 80 characters? 
+ # See http://docs.python.org/library/argparse.html#formatter-class + parser = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + + # Now add all the options to it + parser.add_argument("data", nargs="+", + help="the file to read") + parser.add_argument("--redPortion", type=float, action="append", default=[], + help="portion of each bin to color red") + parser.add_argument("--redWeight", type=float, action="append", default=[], + help="value to plot in red in each bin") + parser.add_argument("--title", default="Histogram", + help="the plot title") + parser.add_argument("--x_label", default="Value", + help="the plot title") + parser.add_argument("--y_label", default="Number of Items (count)", + help="the plot title") + parser.add_argument("--bins", type=int, default=10, + help="the number of histogram bins") + parser.add_argument("--x_min", "--min", type=float, default=None, + help="minimum value allowed") + parser.add_argument("--x_max", "--max", type=float, default=None, + help="maximum value allowed") + parser.add_argument("--y_min", type=float, default=None, + help="minimum count on plot") + parser.add_argument("--y_max", type=float, default=None, + help="maximum count on plot") + parser.add_argument("--cutoff", type=float, default=None, + help="note portion above and below a value, and draw a vertical line") + parser.add_argument("--font_size", type=int, default=12, + help="the font size for text") + parser.add_argument("--categories", nargs="+", default=None, + help="categories to plot, in order") + parser.add_argument("--category_labels", "--labels", nargs="+", + default=[], + help="labels for all categories or data files, in order") + parser.add_argument("--colors", nargs="+", default=[], + help="use the specified Matplotlib colors per category or file") + parser.add_argument("--styles", nargs="+", default=[], + help="use the specified line styles per category or file") + parser.add_argument("--cumulative", action="store_true", + help="plot cumulatively") + parser.add_argument("--log", action="store_true", + help="take the base-10 logarithm of values before plotting histogram") + parser.add_argument("--log_counts", "--logCounts", action="store_true", + help="take the logarithm of counts before plotting histogram") + parser.add_argument("--fake_zero", action="store_true", + help="split lines where points would be 0") + parser.add_argument("--split_at_zero", action="store_true", + help="split lines between positive and negative") + parser.add_argument("--stats", action="store_true", + help="print data stats") + parser.add_argument("--save", + help="save figure to the given filename instead of showing it") + parser.add_argument("--dpi", type=int, default=300, + help="save the figure with the specified DPI, if applicable") + parser.add_argument("--sparse_ticks", action="store_true", + help="use sparse tick marks on both axes") + parser.add_argument("--sparse_x", action="store_true", + help="use sparse tick marks on X axis") + parser.add_argument("--sparse_y", action="store_true", + help="use sparse tick marks on Y axis") + parser.add_argument("--ticks", nargs="+", default=None, + help="use particular X tick locations") + parser.add_argument("--scientific_x", action="store_true", + help="use scientific notation on the X axis") + parser.add_argument("--scientific_y", action="store_true", + help="use scientific notation on the Y axis") + parser.add_argument("--label", action="store_true", + help="label bins with counts") + 
parser.add_argument("--label_size", type=float, + help="bin count label font size") + parser.add_argument("--no_n", dest="show_n", action="store_false", + help="don't add n value to title") + parser.add_argument("--normalize", action="store_true", + help="normalize to total weight of 1") + parser.add_argument("--line", action="store_true", + help="draw a line instead of a barchart") + parser.add_argument("--no_zero_ends", dest="zero_ends", default=True, + action="store_false", + help="don't force line ends to zero") + parser.add_argument("--legend_overlay", default=None, + help="display the legend overlayed on the graph at this location") + parser.add_argument("--no_legend", action="store_true", + help="don't display a legend when one would otherwise be dispalyed") + parser.add_argument("--points", action="store_true", + help="draw points instead of a barchart") + parser.add_argument("--width", type=float, default=8, + help="plot width in inches") + parser.add_argument("--height", type=float, default=6, + help="plot height in inches") + + + return parser.parse_args(args) + +def filter2(criterion, key_list, other_list): + """ + Filter two lists of corresponding items based on some function of the first + list. + + """ + + # Make the output lists + out1 = [] + out2 = [] + + for key_val, other_val in zip(key_list, other_list): + # Pair up the items + if criterion(key_val): + # The key passed the filter, so take both. + out1.append(key_val) + out2.append(other_val) + + return out1, out2 + +def filter_n(*args): + """ + Filter any number of lists of corresponding items based on some function of + the first list. + + """ + + filter_function = args[0] + to_filter = args[1:] + + to_return = [list() for _ in to_filter] + + for i in range(len(to_filter[0])): + # For each run of entries + if filter_function(to_filter[0][i]): + # If the key passes the filter + for j in range(len(to_filter)): + # Keep the whole row + if i < len(to_filter[j]): + to_return[j].append(to_filter[j][i]) + + + # return all the lists as a tuple, which unpacks as multiple return values + return tuple(to_return) + +def main(args): + """ + Parses command line arguments, and plots a histogram. + "args" specifies the program arguments, with args[0] being the executable + name. The return value should be used as the program's exit code. + """ + + options = parse_args(args) # This holds the nicely-parsed options object + + if options.save is not None: + # Set up plot for use in headless mode if we just want to save. See + # . We need to do this before + # we grab pyplot. + matplotlib.use('Agg') + + from matplotlib import pyplot + + # Make the figure with the appropriate size and DPI. + pyplot.figure(figsize=(options.width, options.height), dpi=options.dpi) + + # This will hold a dict of dicts from data value to weight, by category or + # file name. Later gets converted to a dict of lists of (value, weight) + # pairs, aggregated by value. + all_data = collections.defaultdict(lambda: collections.defaultdict(float)) + + for data_filename in options.data: + + for line_number, line in enumerate(open(data_filename)): + # Split each line + parts = line.split() + + if len(parts) == 1: + # This is one instance of a value + + all_data[data_filename][float(parts[0])] += 1.0 + elif len(parts) == 2: + if len(options.data) > 1: + # This is multiple instances of a value, and we are doing + # categories by filename. 
+ all_data[data_filename][float(parts[0])] += float(parts[1]) + else: + try: + value = float(parts[0]) + # If the first column is a number, this is value, weight + # data. + all_data[data_filename][value] += float(parts[1]) + except ValueError: + # This is category, instance data, since first column + # isn't a number. + all_data[parts[0]][float(parts[1])] += 1.0 + elif len(parts) == 3: + # This is category, instance, weight data + all_data[parts[0]][float(parts[1])] += float(parts[2]) + else: + raise Exception("Wrong number of fields on {} line {}".format( + data_filename, line_number + 1)) + + for category in list(all_data.keys()): + # Strip NaNs and Infs and weight-0 entries, and convert to a dict of + # lists of tuples. + all_data[category] = [(value, weight) for (value, weight) + in list(all_data[category].items()) if + value < float("+inf") and value > float("-inf") and weight > 0] + + + + # Calculate our own bins, over all the data. First we need the largest and + # smallest observed values. The fors in the comprehension have to be in + # normal for loop order and not the other order. + # Make sure to filter out 0s from bounds determination if using log space. + bin_min = options.x_min if options.x_min is not None else min((pair[0] + for pair_list in list(all_data.values()) for pair in pair_list if (not options.log or pair[0] != 0))) + bin_max = options.x_max if options.x_max is not None else max((pair[0] + for pair_list in list(all_data.values()) for pair in pair_list if (not options.log or pair[0] != 0))) + + if options.log: + # Do our bins in log space, so they look evenly spaced on the plot. + bin_max = math.log10(bin_max) + bin_min = math.log10(bin_min) + + # Work out what step we should use between bin edges + bin_step = (bin_max - bin_min) / float(options.bins) + # Work out where the bin edges should be + bins = [bin_min + bin_step * i for i in range(options.bins + 1)] + # Work out where the bin centers should be + bin_centers = [left_edge + bin_step / 2.0 for left_edge in bins[:-1]] + + if options.log: + # Bring bins back into data space + bins = [math.pow(10, x) for x in bins] + bin_centers = [math.pow(10, x) for x in bin_centers] + + if options.categories is not None: + # Order data by category order + ordered_data = [(category, all_data[category]) for category in + options.categories] + elif len(options.data) > 1: + # Order data by file order + ordered_data = [(filename, all_data[filename]) for filename in + options.data] + else: + # Order arbitrarily + ordered_data = list(all_data.items()) + + if options.categories is not None and options.category_labels == []: + # Label categories with their internal names + category_labels = options.categories + else: + # Label categories exactly as asked + category_labels = options.category_labels + + for (category, data_and_weights), label, color, line_style, marker in \ + zip(ordered_data, + itertools.chain(category_labels, itertools.repeat(None)), + itertools.chain(options.colors, itertools.cycle( + ['b', 'g', 'r', 'c', 'm', 'y', 'k'])), + itertools.chain(options.styles, itertools.cycle( + ['-', '--', ':', '-.'])), + itertools.cycle( + ['o', 'v', '^', '<', '>', 's', '+', 'x', 'D', '|', '_'])): + # For every category and its display properties... 
+ + if len(data_and_weights) == 0: + # Skip categories with no data + continue + + # Split out the data and the weights for this category/file + data = [pair[0] for pair in data_and_weights] + weights = [pair[1] for pair in data_and_weights] + + # For each set of data and weights that we want to plot, and the label + # it needs (or None)... + + # We may want to normalize by total weight + # We need a float here so we don't get int division later. + total_weight_overall = float(0) + + for value, weight in zip(data, weights): + # Sum up the weights overall + total_weight_overall += weight + + if options.normalize and total_weight_overall > 0: + # Normalize all the weight to 1.0 total weight. + weights = [w / total_weight_overall for w in weights] + + # Apply the limits after normalization + if options.x_min is not None: + data, weights = filter2(lambda x: x >= options.x_min, data, weights) + if options.x_max is not None: + data, weights = filter2(lambda x: x <= options.x_max, data, weights) + + # Work out how many samples there are left within the chart area + samples = intify(sum(weights)) + + if options.stats: + # Compute and report some stats + data_min = numpy.min(data) + data_min_count = weights[numpy.argmin(data)] + data_max = numpy.max(data) + data_max_count = weights[numpy.argmax(data)] + # The mode is the data item with maximal count + data_mode = data[numpy.argmax(weights)] + data_mode_count = numpy.max(weights) + + # Intify floats pretending to be ints + data_min = intify(data_min) + data_min_count = intify(data_min_count) + data_max = intify(data_max) + data_max_count = intify(data_max_count) + data_mode = intify(data_mode) + data_mode_count = intify(data_mode_count) + + # TODO: median, mean + + print(("Min: {} occurs {} times".format(data_min, data_min_count))) + print(("Mode: {} occurs {} times".format(data_mode, data_mode_count))) + print(("Max: {} occurs {} times".format(data_max, data_max_count))) + + if options.cutoff is not None: + # Work out how much weight is above and below the cutoff + above = 0 + below = 0 + + for value, weight in zip(data, weights): + if value > options.cutoff: + above += weight + else: + below += weight + + # Report the results wrt the cutoff. + print(("{} above {}, {} below".format( + above / total_weight_overall, options.cutoff, + below / total_weight_overall))) + + if options.line or options.points: + # Do histogram binning manually + + # Do the binning + bin_values, _ = numpy.histogram(data, bins=bins, weights=weights) + + if options.cumulative: + # Calculate cumulative weights for each data point + bin_values = numpy.cumsum(bin_values) + + if options.zero_ends: + if options.cumulative: + # Pin things to 0 on the low end and max on the high + all_bin_centers = [bins[0]] + list(bin_centers) + [bins[-1]] + all_bin_values = [0] + list(bin_values) + [sum(weights)] + else: + # Pin things to 0 on the end + all_bin_centers = [bins[0]] + list(bin_centers) + [bins[-1]] + all_bin_values = [0] + list(bin_values) + [0] + else: + all_bin_centers = bin_centers + all_bin_values = bin_values + + # Now we make a bunch of deries for each line, potentially. This + # holds pairs of (centers, values) lists. + series = [] + + if options.fake_zero or options.split_at_zero: + # We need to split into multiple series, potentially. + # This holds the series we are working on. + this_series = ([], []) + + # What was the last bin we saw? 
+ last_bin = 0 + + for center, value in zip(all_bin_centers, + all_bin_values): + # For every point on the line, see if we need to break here + # because it's zero. + + # This logic gets complicated so we do some flags. + # Do we keep this point? + includeSample = True + # Do we split the line? + breakSeries = False + + if options.fake_zero and value == 0: + # We don't want this sample, and we need to break the + # series + includeSample = False + breakSeries = True + + if options.split_at_zero and last_bin < 0 and center > 0: + # We crossed the y axis, or we went down to the x axis. + # We can maybe keep the sample, and we need to break the + # series + breakSeries = True + + if breakSeries and len(this_series[0]) > 0: + # Finish the series and start another + series.append(this_series) + this_series = ([], []) + + if includeSample: + # Stick this point in the series + this_series[0].append(center) + this_series[1].append(value) + + last_bin = center + + if len(this_series[0]) > 0: + # Finish the last series + series.append(this_series) + + else: + # Just do one series + series.append((all_bin_centers, all_bin_values)) + + # We only want to label the first series in the legend, so we'll + # none this out after we use it. + label_to_use = label + + for series_centers, series_values in series: + # Plot every series + + if options.line and options.points: + # Do the plots as lines with points + pyplot.plot(series_centers, series_values, + label=label_to_use, linestyle=line_style, color=color, + marker=marker) + label_to_use = None + elif options.line: + # Do the plots as lines only + pyplot.plot(series_centers, series_values, + label=label_to_use, linestyle=line_style, color=color) + label_to_use= None + elif options.points: + # Do the plot as points. + pyplot.scatter(series_centers, series_values, + label=label_to_use, color=color, marker=marker) + label_to_use = None + + if options.log_counts: + # Log the Y axis + pyplot.yscale('log') + + if options.split_at_zero: + # Put a big vertical line. + pyplot.axvline(linewidth=2, color="k") + + else: + # Do the plot. Do cumulative, or logarithmic Y axis, optionally. + # Keep the bin total counts and the bar patches. + bin_counts, _, bar_patches = pyplot.hist(data, bins, + cumulative=options.cumulative, log=options.log_counts, + weights=weights, alpha=0.5 if len(ordered_data) > 1 else 1.0, + label=label) + + if options.cutoff is not None: + # Put a vertical line at the cutoff. + pyplot.axvline(x=options.cutoff, color="r") + + + if len(options.redPortion) > 0: + # Plot a red histogram over that one, modified by redPortion. + + red_data = [] + red_weights = [] + + for item, weight in zip(data, weights): + # For each item, what bin is it in? + bin_number = int(item / bin_step) + + if bin_number < len(options.redPortion): + # We have a potentially nonzero scaling factor. Apply that. + weight *= options.redPortion[bin_number] + + # Keep this item. + red_data.append(item) + red_weights.append(weight) + + # Plot the re-weighted data with the same bins, in red + red_counts, _, red_patches = pyplot.hist(red_data, bins, + cumulative=options.cumulative, log=options.log_counts, + weights=red_weights, color='#FF9696', hatch='/'*6) + + if options.label: + # Label all the red portion-based bars + draw_labels(red_counts, red_patches, size=options.label_size) + + + + if len(options.redWeight) > 0: + # Plot a red histogram over that one, modified by redPortion. 
+ + # Grab an item in each bin + items = bins[0:len(options.redWeight)] + + # Plot the re-weighted data with the same bins, in red + red_counts, _, red_patches = pyplot.hist(items, bins, + cumulative=options.cumulative, log=options.log_counts, + weights=options.redWeight, color='#FF9696', hatch='/'*6) + + if options.label: + # Label all the red weight-based bars + draw_labels(red_counts, red_patches, size=options.label_size) + + + + # StackOverflow provides us with font sizing. See + # + matplotlib.rcParams.update({"font.size": options.font_size}) + if options.show_n: + # Add an n value to the title + options.title += " (n = {:,})".format(samples) + pyplot.title(options.title) + pyplot.xlabel(options.x_label) + pyplot.ylabel(options.y_label) + + if options.log: + # Set the X axis to log mode + pyplot.xscale('log') + + if options.x_min is not None: + # Set only the lower x limit + pyplot.xlim((options.x_min, pyplot.xlim()[1])) + if options.x_max is not None: + # Set only the upper x limit + pyplot.xlim((pyplot.xlim()[0], options.x_max)) + + if options.y_min is not None: + # Set only the lower y limit + pyplot.ylim((options.y_min, pyplot.ylim()[1])) + elif options.log_counts: + # Make sure the default lower Y limit is 1 on log plots. + pyplot.ylim((1, pyplot.ylim()[1])) + if options.y_max is not None: + # Set only the upper y limit + pyplot.ylim((pyplot.ylim()[0], options.y_max)) + + if options.sparse_ticks or options.sparse_x: + # Set up X tickmarks to have only 2 per axis, at the ends + pyplot.gca().xaxis.set_major_locator( + matplotlib.ticker.FixedLocator(pyplot.xlim())) + if options.sparse_ticks or options.sparse_y: + # Same for the Y axis + pyplot.gca().yaxis.set_major_locator( + matplotlib.ticker.FixedLocator(pyplot.ylim())) + + if options.ticks is not None: + # Use these particular X ticks instead + pyplot.gca().xaxis.set_major_locator( + matplotlib.ticker.FixedLocator( + [float(pos) for pos in options.ticks])) + + # Make sure tick labels don't overlap. See + # + pyplot.gca().tick_params(axis="x", pad=0.5 * options.font_size) + + # Make our own scientific notation formatter since set_scientific is not + # working + sci_formatter = matplotlib.ticker.FormatStrFormatter("%1.2e") + if options.scientific_x: + # Force scientific notation on X axis + pyplot.gca().xaxis.set_major_formatter(sci_formatter) + if options.scientific_y: + # Force scientific notation on Y axis + pyplot.gca().yaxis.set_major_formatter(sci_formatter) + + if options.label: + # Label all the normal bars + draw_labels(bin_counts, bar_patches, size=options.label_size) + + # Make everything fit + pyplot.tight_layout() + + if len(category_labels) > 0 and not options.no_legend: + # We need a legend + + if options.legend_overlay is None: + # We want the default legend, off to the right of the plot. + + # First shrink the plot to make room for it. + # TODO: automatically actually work out how big it will be. + bounds = pyplot.gca().get_position() + pyplot.gca().set_position([bounds.x0, bounds.y0, + bounds.width * 0.5, bounds.height]) + + # Make the legend + pyplot.legend(loc="center left", bbox_to_anchor=(1.05, 0.5)) + + else: + # We want the legend on top of the plot at the user-specified + # location, and we want the plot to be full width. 
+ pyplot.legend(loc=options.legend_overlay) + + if options.save is not None: + # Save the figure to a file + pyplot.savefig(options.save, dpi=options.dpi) + else: + # Show the figure to the user + pyplot.show() + + return 0 + +if __name__ == "__main__" : + sys.exit(main(sys.argv)) + diff --git a/scripts/intron_length_distribution.py b/scripts/intron_length_distribution.py new file mode 100755 index 00000000000..4fe4566ce2e --- /dev/null +++ b/scripts/intron_length_distribution.py @@ -0,0 +1,603 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import argparse +import sys +import math +import numpy as np +import scipy.optimize as opt +import scipy.stats as stats +import scipy.special as special +from collections import Counter +import matplotlib.pyplot as plt +from mpl_toolkits.mplot3d import Axes3D + +def parse_intron_length_distr(filepath, label, unique = False): + + observed_introns = set() + lengths = Counter() + with open(filepath) as f: + curr_tx = None + prev_end = None + for line in f: + if line.startswith("#"): + continue + tokens = line.strip().split("\t") + if tokens[2] != "exon": + continue + chrom = tokens[0] + start = None + end = None + strand = (tokens[6] == "+") + if strand: + start = int(tokens[3]) + end = int(tokens[4]) + else: + start = int(tokens[4]) + end = int(tokens[3]) + annotations = tokens[8].strip().split(";") + tx_id = None + for annotation in annotations: + annotation = annotation.replace("\"", "").strip() + if annotation.startswith(label): + tx_id = annotation.split()[1] + break + assert(tx_id is not None) + intron = (chrom, strand, prev_end, start) + if tx_id == curr_tx and (not unique or intron not in observed_introns): + # inclusive indexing on interval + lengths[abs(start - prev_end) - 1] += 1 + observed_introns.add(intron) + else: + curr_tx = tx_id + prev_end = end + return lengths + +def frechet_log_likelihood(x, a, s, m): + if x < m: + return -sys.float_info.max * np.ones_like(x) + z = (x - m) / s + return np.log(a / s) - (a + 1.0) * np.log(z) - np.power(z, -a) + +def frechet_pdf_raw(x, a, s, m): + f1 = np.log(a / s) + f2 = (-a - 1) * np.log((x - m) / s) + f3 = -np.power((x - m) / s, -a) + return np.exp(f1 + f2 + f3) + +def frechet_pdf(x, a, s, m): + if np.isscalar(x): + if x > m: + return frechet_pdf_raw(x, a, s, m) + else: + return 0.0 + else: + p = np.zeros_like(x) + mask = x > m + x_pos = x[mask] + p[mask] = frechet_pdf_raw(x_pos, a, s, m) + return p + +def frechet_dshape(x, a, s, m): + z = (x - m) / s + return 1.0 / a + np.log(z) * (np.power(z, -a) - 1.0) + +def frechet_dscale(x, a, s, m): + return (a / s) * (1.0 - np.power((x - m) / s, -a)) + +def frechet_dlocation(x, a, s, m): + return (1.0 + a - np.power((x - m) / s, -a)) / (x - m) + +def frechet_mean(a, s, m): + return m + s * special.gamma(1.0 - 1.0 / a) + +def frechet_variance(a, s): + k1 = special.gamma(1.0 - 1.0 / a) + return s * s * (special.gamma(1.0 - 2.0 / a) - k1 * k1) + +def frechet_skewness(a): + assert(a > 3.0) + k1 = special.gamma(1.0 - 1.0 / a) + k2 = special.gamma(1.0 - 2.0 / a) + k3 = special.gamma(1.0 - 3.0 / a) + return (k3 - 3.0 * k1 * k2 + 2.0 * k1 * k1 * k1) / np.power(k2 - k1 * k1, 1.5) + +# exponential search for method of moments +def find_shape(skewness, tol = 1e-8): + assert(skewness > 0.0) + lo = 3.0 # skewness is not defined for shape <= 3 + step = 1.0 + while frechet_skewness(lo + step) > skewness: + lo += step + step *= 2 + hi = lo + step + while hi - lo > tol: + mid = 0.5 * (hi + lo) + if frechet_skewness(mid) > skewness: + lo = mid + else: + hi = mid + 
return 0.5 * (hi + lo) + +def normal_pdf(x, s, m): + z = (x - m) / s + return np.exp(-z * z / 2.0) / (s * 2.5066282746310002) # root(2pi) + + +# fit frechet distribution with method of moments +def frechet_mom(mean, variance, skewness): + # skewness depends only on shape + shape = find_shape(skewness) + # variance depends only on shape and scale + k1 = math.gamma(1.0 - 1.0 / shape) + k2 = math.gamma(1.0 - 2.0 / shape) + scale = (variance / (k2 - k1 * k1))**0.5 + # location depends on all three parameters + location = mean - scale * k1 + return shape, scale, location + + +# returns tuple of (frechet params, mixture weights) +def frechet_M_step(vals, counts, Z, params): + + assert(len(vals) == len(counts)) + assert(len(vals) == Z.shape[0]) + assert(Z.shape[1] * 3 == len(params)) + + + # mixture weights have an easy closed form solution + m0 = np.sum(Z * counts[:,np.newaxis], 0) + mix_weights = m0 / np.sum(m0) + + # # get the cumulants + # k1 = np.sum(Z * (vals * counts)[:,np.newaxis], 0) / m0 + # k2 = np.sum(Z * (np.power(vals, 2) * counts)[:,np.newaxis], 0) / m0 + # k3 = np.sum(Z * (np.power(vals, 3) * counts)[:,np.newaxis], 0) / m0 + # # compute central moments from the cumulants + # mean = k1 + # var = k2 - np.power(k1, 2) + # skew = (k3 - 3 * mean * var - np.power(mean, 3)) / np.power(var, 1.5) + # # choose initial values for maximization + # init_params = np.zeros_like(params) + # for j in range(Z.shape[1]): + # if skew[j] > 0.0: + # # use method of moments to choose good initial values + # init_params[3*j:3*j+3] = np.array(frechet_mom(mean[j], var[j], skew[j])) + # else: + # # it can happen that sample skew is negative by random + # # sampling. if so, the method of moments is undefined, so + # # we just use previous value as the starting position + # init_params[3*j:3*j+3] = params[3*j:3*j+3] + init_params = params + + # function to compute negative log likelihood + def neg_expected_log_likelihood(par): + ll = 0.0 + for j in range(Z.shape[1]): + a, s, m = par[3*j:3*j+3] + for i in range(Z.shape[0]): + v = vals[i] + if v <= m: + continue + lx = frechet_log_likelihood(v, a, s, m) + ll += counts[i] * Z[i,j] * lx + return -ll + + # function to compute negative gradient + def neg_gradient(par): + grad = np.zeros_like(par) + for j in range(Z.shape[1]): + a, s, m = par[3*j:3*j+3] + for i in range(Z.shape[0]): + v = vals[i] + if v <= m: + continue + C = Z[i,j] * counts[i] + grad[3*j] += C * frechet_dshape(v, a, s, m) + grad[3*j+1] += C * frechet_dscale(v, a, s, m) + grad[3*j+2] += C * frechet_dlocation(v, a, s, m) + return -grad + + + bounds = [] + for k in range(len(params)): + if k % 3 != 2: + # make sure the scale and shape are positive + bounds.append((1e-12, None)) + else: + # keep m from leaving the support + j = k // 3 + max_m = min(vals[i] for i in range(len(vals)) if Z[i,j] != 0.0) - 1e-12 + bounds.append((None, max_m)) + + # TODO: the documentation in scipy says that both nelder-mead and full + # memory BGFS should be able to take bounds, but i can only do it with + # L-BGFS-B + + # # improve the initial guess with nelder-mead, which supposedly is less + # # likely to be fooled by shallow regions + # res = opt.minimize(fun = neg_expected_log_likelihood, + # x0 = init_params, + # bounds = bounds, + # method = "Nelder-Mead", + # options = {"maxiter":25}) + + # use gradient-based quasi-newton method to refine optimum + res = opt.minimize(fun = neg_expected_log_likelihood, + x0 = init_params, + jac = neg_gradient, + bounds = bounds, + method = "L-BFGS-B") + return res.x, mix_weights + 
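+# (Illustrative addition, not part of the original pipeline and never called by
+# it; the function name and the parameter values are assumptions made for the
+# demo.) A minimal sketch of how the moment helpers above fit together: push a
+# known Frechet parameter set through frechet_mean/variance/skewness, then let
+# frechet_mom recover approximately the same shape, scale, and location.
+def _demo_frechet_moment_roundtrip(shape = 4.0, scale = 2.0, location = 1.0):
+    # Skewness is only defined for shape > 3, which find_shape() relies on.
+    mean = frechet_mean(shape, scale, location)
+    variance = frechet_variance(shape, scale)
+    skewness = frechet_skewness(shape)
+    # Method of moments should approximately invert the three moments.
+    return frechet_mom(mean, variance, skewness)
+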
+def frechet_E_step(vals, Z, params, mix_weights, trunc): + for i in range(Z.shape[0]): + row = np.array([frechet_pdf(vals[i], params[3*j], params[3*j+1], params[3*j+2]) for j in range(Z.shape[1])]) + Z[i,:] = row / np.sum(row) + +def init_Z_matrix(num_vals, num_comps, asymm): + + # randomly initialize the assignment matrix + Z = np.ones((num_vals, num_comps)) + if num_comps > 1: + for i in range(Z.shape[0]): + # make a dirichlet shape that biases smaller lengths to be assigned to + # earlier components + # TODO: this is pretty hacky, but it should break symmetry... + dir_shape = np.ones(num_comps) + if Z.shape[0] > 1: + for j in range(num_comps): + i_frac = i / (Z.shape[0] - 1.0) + j_frac = j / (num_comps - 1.0) + if j_frac == 0.0: + dir_shape[j] += asymm * (1.0 - i_frac) + elif j_frac == 1.0: + dir_shape[j] += asymm * i_frac + elif i_frac <= j_frac: + dir_shape[j] += asymm * (i_frac / j_frac) + else: + dir_shape[j] += asymm * ((1.0 - i_frac) / (1.0 - j_frac)) + Z[i,:] = stats.dirichlet.rvs(dir_shape)[0] + return Z + +def truncate_Z_matrix(Z, trunc_ratio): + # clip very unlikely assignments to 0 to release the location parameter + # to give them 0 probability + for i in range(Z.shape[0]): + M = np.max(Z[i,:]) + Z[i, Z[i,:] < M * trunc_ratio] = 0.0 + Z[i,:] /= np.sum(Z[i,:]) + +def fit_frechet_mixture(counter, num_comps, max_iters = 100, asymm = 2.0, tol = 1e-3, verbose = False): + + vals = sorted(counter) + counts = np.array([float(counter[v]) for v in vals]) + vals = np.array([float(v) for v in vals]) + + # randomly initialize the assignment matrix + + + mix_weights = np.ones(num_comps) / num_comps + params = np.zeros(3 * num_comps) + for i in range(num_comps): + # shape and scale should be positive + params[3 * i] = 1.0 + params[3 * i + 1] = 1.0 + + # fit a log-normal model initialize the Z matrix (it is much more stable + # and easy to fit) + if verbose: + print("initial lognormal EM", file = sys.stderr) + Z = fit_log_normal_mixture(counter, num_comps, max_iters, tol, asymm, return_Z = True, verbose = verbose) + + if verbose: + print("main frechet EM", file = sys.stderr) + for it in range(max_iters): + if verbose and (it + 1) % 100 == 0: + print("EM iter {}".format(it + 1), file = sys.stderr) + truncate_Z_matrix(Z, 1e-5) + prev_params = params + prev_mix_weights = mix_weights + + # estimated the parameters based on the conditional likelihood matrix + params, mix_weights = frechet_M_step(vals, counts, Z, params) + + + + # check for convergence + converged = True + err = 0.0 + for j in range(num_comps): + err1 = abs(prev_mix_weights[j] - mix_weights[j]) + err2 = abs(params[3*i] - prev_params[3*i]) / prev_params[3*i] + err3 = abs(params[3*i+1] - prev_params[3*i+1]) / prev_params[3*i+1] + err4 = abs(params[3*i+2] - prev_params[3*i+2]) + err = max(err, err1, err2, err3, err4) + if (err1 > tol or err2 > tol or err3 > tol or err4 > tol): + converged = False + if verbose and (it + 1) % 100 == 0: + print("\tmax param velocity {}".format(err), file=sys.stderr) + if converged: + break + + # recompute the conditional likelihood matrix with new params + frechet_E_step(vals, Z, params, mix_weights, 1e-10) + + return params, mix_weights + + +def normal_E_step(vals, Z, params, mix_weights): + assert(len(mix_weights) == Z.shape[1]) + assert(len(mix_weights) * 2 == len(params)) + assert(len(vals) == Z.shape[0]) + + for i in range(Z.shape[0]): + row = np.array([mix_weights[j] * normal_pdf(vals[i], params[2*j], params[2*j+1]) for j in range(Z.shape[1])]) + Z[i,:] = row / np.sum(row) + +def 
normal_M_step(vals, counts, Z): + + m0 = np.sum(Z * counts[:,np.newaxis], 0) + m1 = np.sum(Z * (vals * counts)[:,np.newaxis], 0) / m0 + m2 = np.sqrt(np.sum(((Z * np.power(np.ones_like(m0) * vals[:,np.newaxis] - m1, 2)) * counts[:,np.newaxis]), 0) / m0) + + mix_weights = m0 / np.sum(m0) + params = np.array([m2[i//2] if i % 2 == 0 else m1[i//2] for i in range(2*Z.shape[1])]) + + return params, mix_weights + + + +def fit_log_normal_mixture(counter, num_comps, max_iters = 100, tol = 1e-3, asymm = 2.0, return_Z = False, verbose = False): + assert(0 not in counter) + + vals = sorted(counter) + counts = np.array([float(counter[v]) for v in vals]) + vals = np.log(np.array(vals)) + + Z = init_Z_matrix(len(vals), num_comps, asymm) + + params = np.array([1.0 if i % 2 == 0 else 0.0 for i in range(2 * num_comps)]) + mix_weights = np.ones(num_comps) / num_comps + + for it in range(max_iters): + if verbose and (it + 1) % 100 == 0: + print("EM iter {}".format(it + 1), file = sys.stderr) + # print(Z) + prev_params = params + prev_mix_weights = mix_weights + + # estimated the parameters based on the conditional likelihood matrix + params, mix_weights = normal_M_step(vals, counts, Z) + + # check for convergence + converged = True + err = 0.0 + for j in range(num_comps): + err1 = abs(prev_mix_weights[j] - mix_weights[j]) + err2 = abs(params[2*j] - prev_params[2*j]) / prev_params[2*j] + err3 = abs(params[2*j+1] - prev_params[2*j+1]) + err = max(err, err1, err2, err3) + if (err1 > tol or err2 > tol or err3 > tol): + converged = False + if verbose and (it + 1) % 100 == 0: + print("\tmax param velocity {}".format(err), file = sys.stderr) + if converged: + break + + # recompute the conditional likelihood matrix with new params + normal_E_step(vals, Z, params, mix_weights) + + if return_Z: + return Z + else: + return params, mix_weights + + +def frechet_mixture_pdf(x, params, mix_weights): + d = 0.0 + for j in range(len(mix_weights)): + d += mix_weights[j] * frechet_pdf(x, params[3*j], params[3*j+1], params[3*j+2]) + return d + +def normal_mixture_pdf(x, params, mix_weights): + d = 0.0 + for j in range(len(mix_weights)): + d += mix_weights[j] * normal_pdf(x, params[2*j], params[2*j+1]) + return d + +def plot_frechet_log_likelihood(counter, x_min, x_max, y_min, y_max, grid, a, s, m): + vals = sorted(counter) + counts = np.array([float(counter[v]) for v in vals]) + vals = np.array([float(v) for v in vals]) + + X = np.array([np.linspace(x_min, x_max, grid) for i in range(grid)]) + Y = np.array([[y for j in range(grid)] for y in np.linspace(y_min, y_max, grid)]) + Z = [] + for i in range(grid): + Z.append([]) + for j in range(grid): + sh, sc, l = None, None, None + if a == "x": + sh = X[i,j] + elif a == "y": + sh = Y[i,j] + else: + sh = a + if s == "x": + sc = X[i,j] + elif s == "y": + sc = Y[i,j] + else: + sc = s + if m == "x": + l = X[i,j] + elif m == "y": + l = Y[i,j] + else: + l = m + Z[-1].append(sum(c * frechet_log_likelihood(v, sh, sc, l) for v,c in zip(vals, counts))) + + Z = np.array(Z) + + elev = 75 + azim = 225 + + fig = plt.figure() + ax = Axes3D(fig) + ax.view_init(elev=elev, azim=azim) + ax.plot_wireframe(X, Y, Z) + plt.show() + + fig = plt.contour(X, Y, Z) + plt.show() + +def log_flatten(counter): + x = np.zeros(sum(counter.values()) - counter[0]) + i = 0 + for l in counter: + if l == 0: + continue + c = counter[l] + x[i:i+c] = np.log10(l) + i += c + return x + +def plot_length_distribution(counter): + + x = log_flatten(counter) + + plt.hist(x, bins = 100) + +def plot_log_normal_mixture(params, 
mix_weights, log_density = False, num_sigmas = 4.0, log_x = True, xlim = None): + + log_xmin, log_xmax = None, None + for i in range(len(mix_weights)): + s = params[2 * i] + m = params[2 * i + 1] + + lo = m - num_sigmas * s + hi = m + num_sigmas * s + if log_xmin is None or lo < log_xmin: + log_xmin = lo + if log_xmax is None or hi > log_xmax: + log_xmax = hi + + + logx = np.linspace(log_xmin, log_xmax, 500) + x = np.exp(logx) + log10x = logx / np.log(10.) + + y = np.zeros_like(logx) + for i in range(len(logx)): + d = normal_mixture_pdf(logx[i], params, mix_weights) / x[i] + if log_density: + y[i] = np.log(d) + else: + y[i] = d + + if log_density: + y -= np.max(y) + + if log_x: + plt.plot(log10x, y) + ticks, labels = plt.xticks() + plt.xticks(ticks, [str(int(round(t))) for t in 10**np.array(ticks)]) + else: + plt.plot(x, y) + + if xlim is not None: + plt.xlim(xlim) + +def plot_frechet_mixture(params, mix_weights): + + log_xmin = 0 + log_xmax = 1 + for i in range(len(mix_weights)): + a = params[3*i] + s = params[3*i+1] + m = params[3*i+2] + if a > 2: + ev = m + s * np.gamma(1.0 - 1.0 / a) + sd = s * np.sqrt(np.gamma(1.0 - 2.0 / a) - np.power(np.gamma(1.0 - 1.0 / a), 2)) + hi = ev + 8 * sd + else: + mode = m + s * pow(a / (1.0 + a), 1.0 / a) + hi = m + 10 * (mode - m) + + log_xmax = max(log_xmax, hi) + + logx = np.linspace(log_xmin, log_xmax, 500) + log10x = logx / np.log(10.) + y = np.zeros_like(logx) + for i in range(len(logx)): + y[i] = frechet_mixture_pdf(np.exp(logx[i]), params, mix_weights) + + plt.plot(log10x, y) + + +def log_normal_BIC(counter, params, mix_weights): + vals = sorted(counter) + counts = np.array([float(counter[v]) for v in vals]) + vals = np.array([float(v) for v in vals]) + log_vals = np.log(vals) + # one parameter isn't free in the weights + p = len(params) + len(mix_weights) - 1 + N = np.sum(counts) + + log_likelihood = 0.0 + for i in range(len(vals)): + likelihood = 0.0 + for j in range(len(mix_weights)): + likelihood += mix_weights[j] * normal_pdf(log_vals[i], params[2*j], params[2*j+1]) / vals[i] + log_likelihood += counts[i] * np.log(likelihood) + + return float(p) * np.log(N) - 2.0 * log_likelihood + + +def main(): + parser = argparse.ArgumentParser(description='Estimate the intron length distribution from transcript annotations') + + parser.add_argument('-g', '--gtf', required = True, type = str, metavar = "FILE", + help = 'GTF/GFF file containing transcript annotations') + parser.add_argument('-o', '--out', required = True, type = str, metavar = "FILE", + help = 'where to save distribution') + parser.add_argument('-l', '--label', required = False, type = str, metavar = "STR", + default = 'transcript_id', + help = 'label of transcripts in GTF (default: transcript_id)') + + args = parser.parse_args() + print("parsing transcript annotations from {}...".format(args.gtf), file = sys.stderr) + lengths = parse_intron_length_distr(args.gtf, args.label) + + training_results = {} + for num_comps in range(1, 6): + print("training model with {} components...".format(num_comps), file = sys.stderr) + params, weights = fit_log_normal_mixture(lengths, + num_comps, + max_iters = 500 * num_comps, + asymm = 5 * num_comps, + tol = 1e-4) + training_results[num_comps] = params, weights + + + best_BIC = None + best_num_comps = None + for num_comps in sorted(training_results): + params, weights = training_results[num_comps] + BIC = log_normal_BIC(lengths, params, weights) + print("{} component model achieves BIC {}".format(num_comps, BIC), file = sys.stderr) + if best_BIC is None or 
BIC < best_BIC: + best_BIC = BIC + best_num_comps = num_comps + + print("outputting {} component model to {}".format(best_num_comps, args.out), file = sys.stderr) + params, weights = training_results[best_num_comps] + with open(args.out, 'w') as f: + print(str(best_num_comps), file = f) + for param in list(weights) + list(params): + print(str(param), file = f) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/lr_benchmark.sh b/scripts/lr_benchmark.sh new file mode 100755 index 00000000000..3cfff027af1 --- /dev/null +++ b/scripts/lr_benchmark.sh @@ -0,0 +1,131 @@ +#!/usr/bin/env bash +# lr_benchmark.sh: Run a benchmark for vg long read mapping +# Meant to be run on UCSC Courtyard/Plaza + +set -ex +set -o pipefail + +# Here we use : and := to set variables to default values if not present in the environment. +# You can set these in the environment to override them and I don't have to write a CLI option parser. +# See https://stackoverflow.com/a/28085062 + +# Where should output go? +: "${OUT_DIR:="./lr_benchmark"}" +echo "Writing to ${OUT_DIR}" +mkdir -p "${OUT_DIR}" + +# Adam Novak's simulated reads, loosely following Stephen Hwang's method. +# Annotated with GRCh38.chr1 style path names. +# Also available in 100 and 10000 read versions +# On GRCh38, sample HG00741: +# /public/groups/cgl/graph-genomes/anovak/data/hprc-lrgiraffe/reads/sim/r9.5-acc0.95/HG00741/HG00741-sim-r9.5-acc0.95-1000.gam +# /public/groups/cgl/graph-genomes/anovak/data/hprc-lrgiraffe/reads/sim/hifi/HG00741/HG00741-sim-hifi-1000.gam +# On CHM13, sample HG002: +# /public/groups/cgl/graph-genomes/anovak/data/hprc-lrgiraffe/reads/sim/r9.5-acc0.95/HG002/HG002-sim-r9.5-acc0.95-1000.gam +# /public/groups/cgl/graph-genomes/anovak/data/hprc-lrgiraffe/reads/sim/hifi/HG002/HG002-sim-hifi-1000.gam +# Note that not all of these reads have true positions on CHM13 due to a lack of alignment for some HG002 regions! +: "${INPUT_READ_PATH:=/public/groups/cgl/graph-genomes/anovak/data/hprc-lrgiraffe/reads/sim/r9.5-acc0.95/HG002/HG002-sim-r9.5-acc0.95-1000.gam}" + +# For GRCh38 mapping: +# An HPRC graph, linked to /public/groups/cgl/graph-genomes/xhchang/hprc_graph/GRCh38-f1g-90-mc-aug11-clip.d9.m1000.D10M.m1000.giraffe.gbz +# /public/groups/cgl/graph-genomes/anovak/data/hprc-lrgiraffe/graphs/GRCh38-f1g-90-mc-aug11-clip.d9.m1000.D10M.m1000.giraffe.gbz +# Its indexes +# /public/groups/cgl/graph-genomes/anovak/data/hprc-lrgiraffe/graphs/GRCh38-f1g-90-mc-aug11-clip.d9.m1000.D10M.m1000.dist +# /public/groups/cgl/graph-genomes/anovak/data/hprc-lrgiraffe/graphs/GRCh38-f1g-90-mc-aug11-clip.d9.m1000.D10M.m1000.min +# /public/groups/cgl/graph-genomes/anovak/data/hprc-lrgiraffe/graphs/GRCh38-f1g-90-mc-aug11-clip.d9.m1000.D10M.m1000.xg + +# For CHM13 mapping: +# An HPRC graph, with .min, .dist, .gbwt, and .xg files at least +: "${INPUT_GRAPH_BASE_URL:=https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/minigraph-cactus/hprc-v1.0-mc-chm13}" +# Where it is/will be stored on disk locally, along with a .gbz +: "${INPUT_GRAPH_BASE_PATH:=/public/groups/cgl/graph-genomes/anovak/data/hprc-lrgiraffe/graphs/hprc-v1.0-mc-chm13}" + +# Find the indexes +: "${INPUT_DIST_PATH:=${INPUT_GRAPH_BASE_PATH}.dist}" +: "${INPUT_MIN_PATH:=${INPUT_GRAPH_BASE_PATH}.min}" +: "${INPUT_XG_PATH:=${INPUT_GRAPH_BASE_PATH}.xg}" +: "${INPUT_GBZ_PATH:=${INPUT_GRAPH_BASE_PATH}.gbz}" + +# Download the files that are probably there. +if [[ ! 
-e "${INPUT_DIST_PATH}" ]] ; then + wget "${INPUT_GRAPH_BASE_URL}.dist" -O "${INPUT_DIST_PATH}.tmp" + mv "${INPUT_DIST_PATH}.tmp" "${INPUT_DIST_PATH}" +fi +if [[ ! -e "${INPUT_MIN_PATH}" ]] ; then + wget "${INPUT_GRAPH_BASE_URL}.min" -O "${INPUT_MIN_PATH}.tmp" + mv "${INPUT_MIN_PATH}.tmp" "${INPUT_MIN_PATH}" +fi +if [[ ! -e "${INPUT_XG_PATH}" ]] ; then + wget "${INPUT_GRAPH_BASE_URL}.xg" -O "${INPUT_XG_PATH}.tmp" + mv "${INPUT_XG_PATH}.tmp" "${INPUT_XG_PATH}" +fi + +if [[ ! -e "${INPUT_GBZ_PATH}" ]] ; then + # Make GBZ from other files if not there + + # For which we need the GBWT + : "${INPUT_GBWT_PATH:=${INPUT_GRAPH_BASE_PATH}.gbwt}" + if [[ ! -e "${INPUT_GBWT_PATH}" ]] ; then + wget "${INPUT_GRAPH_BASE_URL}.gbwt" -O "${INPUT_GBWT_PATH}.tmp" + mv "${INPUT_GBWT_PATH}.tmp" "${INPUT_GBWT_PATH}" + fi + + time vg gbwt -x "${INPUT_XG_PATH}" "${INPUT_GBWT_PATH}" --gbz-format -g "${INPUT_GBZ_PATH}.tmp" + mv "${INPUT_GBZ_PATH}.tmp" "${INPUT_GBZ_PATH}" +fi + +if [[ "${WORK_DIR}" == "" ]] ; then + # Make a work directory + WORK_DIR="$(mktemp -d)" + CLEAN_WORK_DIR=1 +else + # Let the user send one in in the environment. + mkdir -p "${WORK_DIR}" + CLEAN_WORK_DIR=0 +fi + +echo "Working in ${WORK_DIR}" + +if [[ ! -e "${WORK_DIR}/possible.txt" ]] ; then + # Get the list of all reads with ref positions set, and which are thus possible to get right + vg view -aj "${INPUT_READ_PATH}" | jq -r 'select(.refpos) | .name' > "${WORK_DIR}/possible.txt.tmp" + mv "${WORK_DIR}/possible.txt.tmp" "${WORK_DIR}/possible.txt" +fi + +if [[ ! -e "${WORK_DIR}/annotated.gam" ]] ; then + # Map reads using correctness tracking. + # Make sure to apply multi-position annotation which Giraffe won't do. + /usr/bin/time -v vg giraffe -G "${INPUT_READ_PATH}" -t 16 -B 8 --align-from-chains -Z "${INPUT_GBZ_PATH}" -d "${INPUT_DIST_PATH}" -m "${INPUT_MIN_PATH}" -x "${INPUT_XG_PATH}" --track-provenance --track-correctness --progress | /usr/bin/time -v vg annotate -x "${INPUT_XG_PATH}" -a - --multi-position -l 100 >"${WORK_DIR}/annotated.gam.tmp" + mv "${WORK_DIR}/annotated.gam.tmp" "${WORK_DIR}/annotated.gam" +fi + +# Compute general stats +vg stats -a "${WORK_DIR}/annotated.gam" >"${OUT_DIR}/stats.txt" + +if [[ ! 
-e "${WORK_DIR}/benchmark.tsv" ]] ; then + # See if reads get close enough to be correct + # TODO: vg gamcompare announces a correctness count, which we should save + vg gamcompare -r 200 "${WORK_DIR}/annotated.gam" "${INPUT_READ_PATH}" --aligner lrgiraffe --tsv >"${WORK_DIR}/benchmark.tsv.tmp" + mv "${WORK_DIR}/benchmark.tsv.tmp" "${WORK_DIR}/benchmark.tsv" +fi + +# Make a QQ plot +scripts/plot-qq.R "${WORK_DIR}/benchmark.tsv" "${OUT_DIR}/qq.png" + +# Compute a correctness rate +TOTAL_READS="$(cat "${WORK_DIR}/benchmark.tsv" | tail -n +2 | wc -l)" +POSSIBLE_READS="$(cat "${WORK_DIR}/possible.txt" | wc -l)" +CORRECT_READS="$(cat "${WORK_DIR}/benchmark.tsv" | tail -n +2 | grep "^1" | wc -l)" +CORRECT_FRACTION_TOTAL="$(echo "${CORRECT_READS}/${TOTAL_READS}" | bc -l)" +CORRECT_FRACTION_POSSIBLE="$(echo "${CORRECT_READS}/${POSSIBLE_READS}" | bc -l)" +echo "Correct reads: ${CORRECT_READS}" >"${OUT_DIR}/results.txt" +echo "Total reads: ${TOTAL_READS}" >>"${OUT_DIR}/results.txt" +echo "Reads with truth positions: ${POSSIBLE_READS}" >>"${OUT_DIR}/results.txt" +echo "Correct fraction: ${CORRECT_FRACTION_TOTAL} of all reads, ${CORRECT_FRACTION_POSSIBLE} of reads with truth positions" >>"${OUT_DIR}/results.txt" +cat "${OUT_DIR}/results.txt" + +if [[ "${CLEAN_WORK_DIR}" == "1" ]] ; then + # Clean up the work directory + rm -Rf "${WORK_DIR}" +fi + diff --git a/scripts/make_pbsim_reads.sh b/scripts/make_pbsim_reads.sh new file mode 100755 index 00000000000..b71a449dbf4 --- /dev/null +++ b/scripts/make_pbsim_reads.sh @@ -0,0 +1,233 @@ +#!/usr/bin/env bash +# make_pbsim_reads.sh: script to simulate reads with pbsim2. +# Mostly theoretical; records commands that would have worked better than what was actually run +# Intended to run on UCSC Courtyard/Plaza systems +# You may also need to CFLAGS=-fPIC pip3 install --user bioconvert + +set -ex + +# Here we use : and := to set variables to default values if not present in the environment. +# You can set these in the environment to override them and I don't have to write a CLI option parser. +# See https://stackoverflow.com/a/28085062 + +# Graph to simulate from. Can be S3 URLs or local file paths. +: "${GRAPH_XG_URL:=s3://human-pangenomics/pangenomes/freeze/freeze1/minigraph-cactus/hprc-v1.0-mc-grch38.xg}" +: "${GRAPH_GBWT_URL:=s3://human-pangenomics/pangenomes/freeze/freeze1/minigraph-cactus/hprc-v1.0-mc-grch38.gbwt}" +# Name to use for graph when downloaded +: "${GRAPH_NAME:=hprc-v1.0-mc-grch38}" +# Sample to simulate from +: "${SAMPLE_NAME:=HG00741}" +# Technology name to use in output filenames +: "${TECH_NAME:=hifi}" +# FASTQ to use as a template, or "/dev/null" +: "${SAMPLE_FASTQ:=/public/groups/vg/sjhwang/data/reads/real_HiFi/tmp/HiFi_reads_100k_real.fq}" +# HMM model to use instead of a FASTQ, or "/dev/null" +: "${PBSIM_HMM:=/dev/null}" +# This needs to be the pbsim2 command, which isn't assumed to be in $PATH +: "${PBSIM:=/public/groups/vg/sjhwang/tools/bin/pbsim}" +# Parameters to use with pbsim for simulating reads for each contig. Parameters are space-separated and internal spaces must be escaped. +: "${PBSIM_PARAMS:=--depth 1 --accuracy-min 0.00 --length-min 10000 --difference-ratio 6:50:54}" +# This needs to be a command line which can execute Stephen's script that adds qualities from a FASTQ back into a SAM that is missing them. +# Arguments are space-separated and internal spaces must be escaped. 
+# This script is at https://gist.github.com/adamnovak/45ae4f500a8ec63ce12ace4ca77afc21
+: "${ADD_QUALITIES:=python3 /public/groups/vg/sjhwang/vg_scripts/bin/readers/sam_reader.py}"
+# Directory to save results in
+: "${OUT_DIR:=./reads/sim/${TECH_NAME}/${SAMPLE_NAME}}"
+# Number of MAFs to convert at once
+: "${MAX_JOBS:=10}"
+
+if [[ "${WORK_DIR}" == "" ]] ; then
+    # Make a work directory
+    WORK_DIR="$(mktemp -d)"
+    CLEAN_WORK_DIR=1
+else
+    # Let the user send one in via the environment.
+    CLEAN_WORK_DIR=0
+fi
+
+
+# Make sure scratch directory exists
+mkdir -p "${WORK_DIR}"
+
+# Fetch graph
+if [[ ! -e "${WORK_DIR}/${GRAPH_NAME}.xg" ]] ; then
+    # This comparison requires Bash 3 or later. See
+    if [[ ${GRAPH_XG_URL} =~ ^s3:.* ]]; then
+        # Download from S3
+        aws s3 cp "${GRAPH_XG_URL}" "${WORK_DIR}/${GRAPH_NAME}.xg.tmp"
+        mv "${WORK_DIR}/${GRAPH_NAME}.xg.tmp" "${WORK_DIR}/${GRAPH_NAME}.xg"
+    else
+        # Use local symlink
+        ln -s "$(realpath "${GRAPH_XG_URL}")" "${WORK_DIR}/${GRAPH_NAME}.xg"
+    fi
+fi
+if [[ ! -e "${WORK_DIR}/${GRAPH_NAME}.gbwt" ]] ; then
+    if [[ ${GRAPH_GBWT_URL} =~ ^s3:.* ]]; then
+        # Download from S3
+        aws s3 cp "${GRAPH_GBWT_URL}" "${WORK_DIR}/${GRAPH_NAME}.gbwt.tmp"
+        mv "${WORK_DIR}/${GRAPH_NAME}.gbwt.tmp" "${WORK_DIR}/${GRAPH_NAME}.gbwt"
+    else
+        # Use local symlink
+        ln -s "$(realpath "${GRAPH_GBWT_URL}")" "${WORK_DIR}/${GRAPH_NAME}.gbwt"
+    fi
+fi
+
+if [[ ! -e "${WORK_DIR}/${GRAPH_NAME}.gbz" ]] ; then
+    # Make it one file
+    time vg gbwt -x "${WORK_DIR}/${GRAPH_NAME}.xg" "${WORK_DIR}/${GRAPH_NAME}.gbwt" --gbz-format -g "${WORK_DIR}/${GRAPH_NAME}.gbz.tmp"
+    mv "${WORK_DIR}/${GRAPH_NAME}.gbz.tmp" "${WORK_DIR}/${GRAPH_NAME}.gbz"
+fi
+
+if [[ ! -e "${WORK_DIR}/${GRAPH_NAME}-${SAMPLE_NAME}-as-ref.gbz" ]] ; then
+    # Make it have our sample as the reference
+    vg gbwt -Z "${WORK_DIR}/${GRAPH_NAME}.gbz" --set-tag "reference_samples=${SAMPLE_NAME}" --gbz-format -g "${WORK_DIR}/${GRAPH_NAME}-${SAMPLE_NAME}-as-ref.gbz.tmp"
+    mv "${WORK_DIR}/${GRAPH_NAME}-${SAMPLE_NAME}-as-ref.gbz.tmp" "${WORK_DIR}/${GRAPH_NAME}-${SAMPLE_NAME}-as-ref.gbz"
+fi
+
+if [[ ! -e "${WORK_DIR}/${SAMPLE_NAME}.fa" ]] ; then
+    # Extract sample assembly FASTA from graph where sample is the *reference*. If
+    # we do it from the one where the sample is haplotypes, we get different path
+    # name strings and we can't inject without hacking them up. We leave the code
+    # to hack them up anyway though, for reference later.
+    vg paths -x "${WORK_DIR}/${GRAPH_NAME}-${SAMPLE_NAME}-as-ref.gbz" \
+        --sample "${SAMPLE_NAME}" \
+        --extract-fasta \
+        > "${WORK_DIR}/${SAMPLE_NAME}.fa.tmp"
+    mv "${WORK_DIR}/${SAMPLE_NAME}.fa.tmp" "${WORK_DIR}/${SAMPLE_NAME}.fa"
+fi
+
+if [[ -d "${WORK_DIR}/${SAMPLE_NAME}-reads" && "$(ls "${WORK_DIR}/${SAMPLE_NAME}-reads/"sim_*.maf | wc -l)" == "0" ]] ; then
+    # Sim directory exists but has no MAFs. Shouldn't have any files at all.
+    rmdir "${WORK_DIR}/${SAMPLE_NAME}-reads"
+fi
+
+if [[ ! -d "${WORK_DIR}/${SAMPLE_NAME}-reads" ]] ; then
+    rm -Rf "${WORK_DIR}/${SAMPLE_NAME}-reads.tmp"
+    mkdir "${WORK_DIR}/${SAMPLE_NAME}-reads.tmp"
+
+    if [[ "${PBSIM_HMM}" != "/dev/null" ]] ; then
+        if [[ "${SAMPLE_FASTQ}" != "/dev/null" ]] ; then
+            echo "Can't use both a PBSIM_HMM and a SAMPLE_FASTQ"
+            exit 1
+        fi
+        # Using an HMM to make qualities.
+        QUAL_SOURCE_ARGS=(--hmm_model "${PBSIM_HMM}")
+    else
+        # Using a FASTQ to make qualities.
+        # No read may be over 1 megabase or pbsim2 will crash.
+ QUAL_SOURCE_ARGS=(--sample-fastq "${SAMPLE_FASTQ}") + fi + + # Simulate reads + time "${PBSIM}" \ + ${PBSIM_PARAMS} \ + "${QUAL_SOURCE_ARGS[@]}" \ + --prefix "${WORK_DIR}/${SAMPLE_NAME}-reads.tmp/sim" \ + "${WORK_DIR}/${SAMPLE_NAME}.fa" + + mv "${WORK_DIR}/${SAMPLE_NAME}-reads.tmp" "${WORK_DIR}/${SAMPLE_NAME}-reads" +fi + +function do_job() { + # Run this file in a job + set -e + + SAM_NAME="${MAF_NAME%.maf}.sam" + FASTQ_NAME="${MAF_NAME%.maf}.fastq" + REF_NAME="${MAF_NAME%.maf}.ref" + RENAMED_BAM_NAME="${MAF_NAME%.maf}.renamed.bam" + # Get the contig name in the format it would be as a reference sense path. + # It may already be a reference sense path. + # Can't run under pipefail because some of these may not match. + CONTIG_NAME="$(cat "${REF_NAME}" | head -n1 | sed 's/^>//' | sed 's/ .*//' | sed 's/#\([0-9]*\)$/[\1]/')" + # Haplotype paths can end in a 0 offset/fragment but reference paths don't include that in the name. + CONTIG_NAME="${CONTIG_NAME%\[0\]}" + if [[ ! -e "${RENAMED_BAM_NAME}" ]] ; then + echo "Making ${RENAMED_BAM_NAME}..." + if [[ ! -e "${SAM_NAME}" ]] ; then + echo "Making SAM ${SAM_NAME}..." + /usr/bin/time -v bioconvert maf2sam --force "${MAF_NAME}" "${SAM_NAME}.tmp" + mv "${SAM_NAME}.tmp" "${SAM_NAME}" + fi + set -o pipefail + ${ADD_QUALITIES} -s "${SAM_NAME}" -f "${FASTQ_NAME}" | sed "s/ref/${CONTIG_NAME}/g" | samtools view -b - > "${RENAMED_BAM_NAME}.tmp" + set +o pipefail + mv "${RENAMED_BAM_NAME}.tmp" "${RENAMED_BAM_NAME}" + else + echo "Already have ${RENAMED_BAM_NAME}..." + fi +} + + +# Convert all the reads to BAM in the space of the sample as a primary reference +for MAF_NAME in "${WORK_DIR}/${SAMPLE_NAME}-reads/"sim_*.maf ; do + if [[ "${MAX_JOBS}" == "1" ]] ; then + # Serial mode + do_job + else + # Parallel mode + while [[ "$(jobs -p | wc -l)" -ge "${MAX_JOBS}" ]] ; do + # Don't do too much in parallel + # Fake wait on any job without wait -n + sleep 0.5 + done + ( + do_job + ) & + ((RUNNING_JOBS += 1)) + fi +done +# Wait on all jobs +wait + +if [[ "$(ls "${WORK_DIR}/${SAMPLE_NAME}-reads"/sim_*.tmp 2>/dev/null | wc -l)" != "0" ]] ; then + # Make sure all the per-file temp files got moved + echo "Loose temp files; failure detected." + exit 1 +fi + +if [[ ! -e "${WORK_DIR}/${SAMPLE_NAME}-reads/merged.bam" ]] ; then + # Combine all the BAM files + time samtools merge -n "${WORK_DIR}/${SAMPLE_NAME}-reads"/sim_*.renamed.bam -o "${WORK_DIR}/${SAMPLE_NAME}-reads/merged.bam.tmp" --threads 14 + mv "${WORK_DIR}/${SAMPLE_NAME}-reads/merged.bam.tmp" "${WORK_DIR}/${SAMPLE_NAME}-reads/merged.bam" +fi + +if [[ ! -e "${WORK_DIR}/${SAMPLE_NAME}-reads/injected.gam" ]] ; then + # Move reads into graph space + time vg inject -x "${WORK_DIR}/${GRAPH_NAME}-${SAMPLE_NAME}-as-ref.gbz" "${WORK_DIR}/${SAMPLE_NAME}-reads/merged.bam" -t 16 >"${WORK_DIR}/${SAMPLE_NAME}-reads/injected.gam.tmp" + mv "${WORK_DIR}/${SAMPLE_NAME}-reads/injected.gam.tmp" "${WORK_DIR}/${SAMPLE_NAME}-reads/injected.gam" +fi + +if [[ ! 
-e "${WORK_DIR}/${SAMPLE_NAME}-reads/${SAMPLE_NAME}-sim-${TECH_NAME}.gam" ]] ; then + # Annotate reads with linear reference positions + time vg annotate -x "${WORK_DIR}/${GRAPH_NAME}.gbz" -a "${WORK_DIR}/${SAMPLE_NAME}-reads/injected.gam" --multi-position -l 100 -t 16 >"${WORK_DIR}/${SAMPLE_NAME}-reads/${SAMPLE_NAME}-sim-${TECH_NAME}.gam.tmp" + mv "${WORK_DIR}/${SAMPLE_NAME}-reads/${SAMPLE_NAME}-sim-${TECH_NAME}.gam.tmp" "${WORK_DIR}/${SAMPLE_NAME}-reads/${SAMPLE_NAME}-sim-${TECH_NAME}.gam" +fi + +# Work out howe many reads there are +TOTAL_READS="$(vg stats -a "${WORK_DIR}/${SAMPLE_NAME}-reads/${SAMPLE_NAME}-sim-${TECH_NAME}.gam" | grep "^Total alignments:" | cut -f2 -d':' | tr -d ' ')" + +if [[ "${TOTAL_READS}" -lt 10500 ]] ; then + echo "Only ${TOTAL_READS} reads were simulated. Cannot subset to 10000 reads with buffer!" + exit 1 +fi +echo "Simulated ${TOTAL_READS} reads overall" + +SUBSAMPLE_SEED=1 +for READ_COUNT in 100 1000 10000 ; do + # Subset to manageable sizes (always) + # Get the fraction of reads to keep, overestimated, with no leading 0, to paste onto subsample seed. + FRACTION="$(echo "(${READ_COUNT} + 500)/${TOTAL_READS}" | bc -l | sed 's/^[0-9]*//g')" + # Can't use pipefail here because head will cut off the pipe and fail the previous command + vg filter -d "${SUBSAMPLE_SEED}${FRACTION}" "${WORK_DIR}/${SAMPLE_NAME}-reads/${SAMPLE_NAME}-sim-${TECH_NAME}.gam" | vg view -aj - | shuf | head -n"${READ_COUNT}" | vg view -JGa - > "${WORK_DIR}/${SAMPLE_NAME}-reads/${SAMPLE_NAME}-sim-${TECH_NAME}-${READ_COUNT}.gam" + ((SUBSAMPLE_SEED+=1)) +done + +# Output them +mkdir -p "${OUT_DIR}" +cp "${WORK_DIR}/${SAMPLE_NAME}-reads/${SAMPLE_NAME}-sim-${TECH_NAME}.gam" "${WORK_DIR}/${SAMPLE_NAME}-reads/${SAMPLE_NAME}-sim-${TECH_NAME}-"*".gam" "${OUT_DIR}/" + +if [[ "${CLEAN_WORK_DIR}" == "1" ]] ; then + # Clean up the work directory + rm -Rf "${WORK_DIR}" +fi diff --git a/scripts/mcmc_Makefile b/scripts/mcmc_Makefile new file mode 100644 index 00000000000..bbaa6968ccc --- /dev/null +++ b/scripts/mcmc_Makefile @@ -0,0 +1,94 @@ + +WORKDIR ?= . 
+ +# Directory for Toil's temporary files +TOIL_JS="$(WORKDIR)/my-jobstore" + +# All output will be written here +#TOIL_OS="$(WORKDIR)/my-output" +TOIL_OS="$(WORKDIR)/my-output-small" + + +MCMC_ITERATIONS = 1000 +BURN_IN = 500 +GAMMA_FREQUENCY = 100 +READS= 300 + +#FASTA = CHR21.fa +#TBI = 1kg_hg19-CHR21.vcf.gz.tbi +#VCF = 1kg_hg19-CHR21.vcf.gz +#BASENAME = CHR21 +#READS= 7050000 + + +FASTA = test/small/x.fa +TBI = test/small/x.vcf.gz.tbi +VCF = test/small/x.vcf.gz +BASENAME = x_small + + + +SAMP=1 +HAPLO_0 = 0 +HAPLO_1 = 1 + +all: $(TOIL_OS)/$(BASENAME).svg + +$(TOIL_OS)/$(BASENAME).vg: $(FASTA) $(VCF) $(TBI) + toil-vg construct $(TOIL_JS) $(TOIL_OS) --container None --pangenome --gcsa_opts '-k 16' --gbwt_prune --vcf_phasing $(VCF) --fasta_regions --max_node_size 1000 --alt_paths --fasta $(FASTA) --all_index --vcf $(VCF) --out_name $(BASENAME) + +$(TOIL_OS)/$(BASENAME).xg: $(TOIL_OS)/$(BASENAME).vg +$(TOIL_OS)/$(BASENAME).gbwt : $(TOIL_OS)/$(BASENAME).xg +$(TOIL_OS)/$(BASENAME).gcsa: $(TOIL_OS)/$(BASENAME).gbwt +$(TOIL_OS)/$(BASENAME).snarls: $(TOIL_OS)/$(BASENAME).vg + +$(TOIL_OS)/$(BASENAME)_$(HAPLO_0)_thread.merge.vg: $(TOIL_OS)/$(BASENAME).vg $(TOIL_OS)/$(BASENAME).gbwt $(TOIL_OS)/$(BASENAME).xg + vg paths -d -v $(TOIL_OS)/$(BASENAME).vg > $(TOIL_OS)/$(BASENAME)_$(HAPLO_0)_thread.merge.vg + vg paths --gbwt $(TOIL_OS)/$(BASENAME).gbwt --extract-vg -x $(TOIL_OS)/$(BASENAME).xg -Q _thread_$(SAMP)_x_$(HAPLO_0) >> $(TOIL_OS)/$(BASENAME)_$(HAPLO_0)_thread.merge.vg + +$(TOIL_OS)/$(BASENAME)_thread_$(SAMP)_$(HAPLO_0).vg: $(TOIL_OS)/$(BASENAME)_$(HAPLO_0)_thread.merge.vg + vg mod -N $(TOIL_OS)/$(BASENAME)_$(HAPLO_0)_thread.merge.vg > $(TOIL_OS)/$(BASENAME)_thread_$(SAMP)_$(HAPLO_0).vg + +$(TOIL_OS)/$(BASENAME)_thread_$(SAMP)_$(HAPLO_0).xg: $(TOIL_OS)/$(BASENAME)_thread_$(SAMP)_$(HAPLO_0).vg + vg index -x $(TOIL_OS)/$(BASENAME)_thread_$(SAMP)_$(HAPLO_0).xg $(TOIL_OS)/$(BASENAME)_thread_$(SAMP)_$(HAPLO_0).vg + +$(TOIL_OS)/$(BASENAME)_$(HAPLO_0).gam: $(TOIL_OS)/$(BASENAME).xg $(TOIL_OS)/$(BASENAME).gbwt + vg sim -x $(TOIL_OS)/$(BASENAME).xg -g $(TOIL_OS)/$(BASENAME).gbwt -m $(SAMP) -n 300 --sub-rate 0.05 --indel-rate 0.05 --read-length 100 -a > $(TOIL_OS)/$(BASENAME)_$(HAPLO_0).gam + +$(TOIL_OS)/$(BASENAME)_$(HAPLO_1)_thread.merge.vg: $(TOIL_OS)/$(BASENAME).vg $(TOIL_OS)/$(BASENAME).gbwt $(TOIL_OS)/$(BASENAME).xg + vg paths -d -v $(TOIL_OS)/$(BASENAME).vg > $(TOIL_OS)/$(BASENAME)_$(HAPLO_1)_thread.merge.vg + vg paths --gbwt $(TOIL_OS)/$(BASENAME).gbwt --extract-vg -x $(TOIL_OS)/$(BASENAME).xg -Q _thread_$(SAMP)_x_$(HAPLO_1) >> $(TOIL_OS)/$(BASENAME)_$(HAPLO_1)_thread.merge.vg + +$(TOIL_OS)/$(BASENAME)_thread_$(SAMP)_$(HAPLO_1).vg: $(TOIL_OS)/$(BASENAME)_$(HAPLO_1)_thread.merge.vg + vg mod -N $(TOIL_OS)/$(BASENAME)_$(HAPLO_1)_thread.merge.vg > $(TOIL_OS)/$(BASENAME)_thread_$(SAMP)_$(HAPLO_1).vg + +$(TOIL_OS)/$(BASENAME)_thread_$(SAMP)_$(HAPLO_1).xg: $(TOIL_OS)/$(BASENAME)_thread_$(SAMP)_$(HAPLO_1).vg + vg index -x $(TOIL_OS)/$(BASENAME)_thread_$(SAMP)_$(HAPLO_1).xg $(TOIL_OS)/$(BASENAME)_thread_$(SAMP)_$(HAPLO_1).vg + +$(TOIL_OS)/$(BASENAME)_$(HAPLO_1).gam: $(TOIL_OS)/$(BASENAME).xg $(TOIL_OS)/$(BASENAME).gbwt + vg sim -x $(TOIL_OS)/$(BASENAME).xg -g $(TOIL_OS)/$(BASENAME).gbwt -m $(SAMP) -n 300 --sub-rate 0.05 --indel-rate 0.05 --read-length 100 -a > $(TOIL_OS)/$(BASENAME)_$(HAPLO_1).gam + +$(TOIL_OS)/$(BASENAME)_merged.gam: $(TOIL_OS)/$(BASENAME)_$(HAPLO_0).gam $(TOIL_OS)/$(BASENAME)_$(HAPLO_1).gam + cat $(TOIL_OS)/$(BASENAME)_$(HAPLO_0).gam $(TOIL_OS)/$(BASENAME)_$(HAPLO_1).gam > 
$(TOIL_OS)/$(BASENAME)_merged.gam + +$(TOIL_OS)/$(BASENAME).mgam: $(TOIL_OS)/$(BASENAME).xg $(TOIL_OS)/$(BASENAME).gcsa $(TOIL_OS)/$(BASENAME)_merged.gam + vg mpmap -A -x $(TOIL_OS)/$(BASENAME).xg -g $(TOIL_OS)/$(BASENAME).gcsa -t 1 -G $(TOIL_OS)/$(BASENAME)_merged.gam > $(TOIL_OS)/$(BASENAME).mgam + + +$(TOIL_OS)/$(BASENAME)_paths.vg: $(TOIL_OS)/$(BASENAME).mgam $(TOIL_OS)/$(BASENAME).snarls $(TOIL_OS)/$(BASENAME).vg + vg mcmc -i $(MCMC_ITERATIONS) -b $(BURN_IN) -g $(GAMMA_FREQUENCY) --vcf-out $(TOIL_OS)/$(BASENAME).vcf $(TOIL_OS)/$(BASENAME).mgam $(TOIL_OS)/$(BASENAME).vg $(TOIL_OS)/$(BASENAME).snarls > $(TOIL_OS)/$(BASENAME)_paths.vg + +$(TOIL_OS)/$(BASENAME).svg: $(TOIL_OS)/$(BASENAME)_paths.vg + vg view -d -n $(TOIL_OS)/$(BASENAME)_paths.vg | dot -Tsvg -o $(TOIL_OS)/$(BASENAME).svg + +clean: + rm -r $(TOIL_OS) + +CHR21.fa: + wget https://courtyard.gi.ucsc.edu/~anovak/vg-data/bakeoff/CHR21.fa + +1kg_hg19-CHR21.vcf.gz: + wget https://courtyard.gi.ucsc.edu/~anovak/vg-data/bakeoff/1kg_hg19-CHR21.vcf.gz + +1kg_hg19-CHR21.vcf.gz.tbi: + wget https://courtyard.gi.ucsc.edu/~anovak/vg-data/bakeoff/1kg_hg19-CHR21.vcf.gz.tbi diff --git a/scripts/mcmc_dist.sh b/scripts/mcmc_dist.sh new file mode 100755 index 00000000000..610260b2a26 --- /dev/null +++ b/scripts/mcmc_dist.sh @@ -0,0 +1,10 @@ +#!/bin/bash +date +for burn in 500 999999999 +do + for i in {1..1} + do + (vg mcmc -i 1000 -b $burn -g 100 --vcf-out ./my-output-small/x_small.vcf my-output-small/x_small.mgam ./my-output-small/x_small.vg ./my-output-small/x_small.snarls > ./my-output-small/x_small_paths.vg) 2>&1 | cut -f 1 >> my-output-small/likelihood_${i}_burn${burn}.csv + done +done +date \ No newline at end of file diff --git a/scripts/plot-roc.R b/scripts/plot-roc.R index a94fe39b42e..3353f0c4d6b 100755 --- a/scripts/plot-roc.R +++ b/scripts/plot-roc.R @@ -45,7 +45,7 @@ if (length(commandArgs(TRUE)) > 3) { } # Determine the order of aligners, based on sorting in a dash-separated tag aware manner -aligner.names <- levels(dat$aligner) +aligner.names <- levels(factor(dat$aligner)) name.lists <- aligner.names %>% (function(name) map(name, (function(x) as.list(unlist(strsplit(x, "-")))))) # Transpose name fragments into a list of vectors for each position, with NAs when tag lists end early max.parts <- max(sapply(name.lists, length)) diff --git a/scripts/restore-deps.sh b/scripts/restore-deps.sh new file mode 100755 index 00000000000..8deb595c7de --- /dev/null +++ b/scripts/restore-deps.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash + +# On CI, we want to cache deps/ and avoid rebuilding unmodified submodules, but +# we also want to make sure every build has the right source code for all the +# submodules. +# +# This script will start with the version of deps/ fetched from the cache, and +# put in place only the differing source files. Differing files will be newer +# than the files from the cache. +# +# This script is meant to run in the root of the repository. +# +# This script will not, on its own, tolerate moving between compiler versions. 
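To make the behaviour of restore-deps.sh concrete (a hypothetical scenario, not taken from a real CI run): suppose the cache restored a fully built deps/ tree, but one submodule, say deps/gbwt, has since moved to a newer commit. The script sets the cached tree aside, lets git check out the true submodule sources into a fresh deps/, and then uses rsync with --checksum to copy into the cached tree only the files whose contents actually differ. Those files end up with fresh modification times while every unchanged source keeps its older, cached timestamp, so a subsequent make rebuilds only the submodule that really changed.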
+ +set -ex + +if [[ -e deps ]]; then + # Get the cached deps out of the way + mv deps deps_cached +fi +mkdir -p deps_cached + +# Get the correct code +git submodule update --init --recursive + +# Clobber any differing files +rsync -r --links --checksum deps/ deps_cached/ + +rm -Rf deps + +# And move the built dependencies into place +mv deps_cached deps + diff --git a/scripts/test-gbwt.sh b/scripts/test-gbwt.sh deleted file mode 100755 index f595aef5a32..00000000000 --- a/scripts/test-gbwt.sh +++ /dev/null @@ -1,700 +0,0 @@ -#!/usr/bin/env bash -# test-gbwt.sh: Plot the effect of haplotype information on mapping performance - -# We are going to compare 5 mapping regimes: -# 1. The snp1kg graph without the variants in the sample (negative control) -# 2. The full snp1kg graph (Heng Effect positive control) -# 3. The full snp1kg graph with GBWT haplotype information (under test) -# 4. The frequency-filtered minaf snp1kg graph (current best) -# 5. The primary path graph (Heng Effect negative control) - -# We want to know if the GBWT reduces the Heng Effect (poor mapping due to -# spurious alignments to rare variants) - -# OK so on to the actual code. -set -ex - -# Define constants - -# Define a region name to process. This sets the name that the graphs and -# indexes will be saved/looked for under. -REGION_NAME="CHR21" -# Define the VCF and FASTA basenames. We assume the VCF has a TBI. -VCF_BASENAME="1kg_hg19-CHR21.vcf.gz" -FASTA_BASENAME="CHR21.fa" -# Define where to get them -SOURCE_BASE_URL="s3://cgl-pipeline-inputs/vg_cgl/bakeoff" -# Define the contig we are using -GRAPH_CONTIG="21" -# Define the region to build the graph on, as contig[:start-end] -GRAPH_REGION="${GRAPH_CONTIG}" - - -# Set a FASTQ to model reads after -TRAINING_FASTQ="ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/data/NA12878/NIST_NA12878_HG001_HiSeq_300x/131219_D00360_005_BH814YADXX/Project_RM8398/Sample_U5a/U5a_AGTCAA_L002_R1_007.fastq.gz" -# And a read simulation seed -READ_SEED="90" -# And a read count -READ_COUNT="10000000" -# Chunks to simulate in (which affects results) -READ_CHUNKS="32" - -MODE="21" -if [[ "${MODE}" == "mhc" ]]; then - # Actually do a smaller test - READ_COUNT="100000" - REGION_NAME="MHC" - GRAPH_CONTIG="6" - GRAPH_REGION="${GRAPH_CONTIG}:28510119-33480577" - FASTA_BASENAME="chr6.fa.gz" - VCF_BASENAME="1kg_hg38-MHC.vcf.gz" -elif [[ "${MODE}" == "tiny" ]]; then - # Do just 20 kb of MHC and a very few reads - READ_COUNT="1000" - REGION_NAME="MHC" - GRAPH_CONTIG="6" - GRAPH_REGION="${GRAPH_CONTIG}:28510119-28520119" - FASTA_BASENAME="chr6.fa.gz" - VCF_BASENAME="1kg_hg38-MHC.vcf.gz" -fi - -# Define the sample to use for synthesizing reads -SAMPLE_NAME="HG00096" - -# What min allele frequency limit do we use? -MIN_AF="0.0335570469" - -# Do we actually want to run the mapeval jobs? Or just do plotting? 
-RUN_JOBS="1" - - -# Now we need to parse our arguments -usage() { - # Print usage to stderr - exec 1>&2 - printf "Usage: $0 [Options] TREE_PATH GRAPHS_PATH OUTPUT_PATH\n" - printf "\tTREE_PATH\ta Toil job tree location\n" - printf "\tGRAPHS_PATH\ta directory of graphs (which, if extant, allows graph construction to be skipped)\n" - printf "\tOUTPUT_PATH\ta results directory\n" - exit 1 -} - -while getopts "" o; do - case "${o}" in - *) - usage - ;; - esac -done - -shift $((OPTIND-1)) - -if [[ "$#" -lt "3" ]]; then - # Too few arguments - usage -fi - -TREE_PATH="${1}" -shift -GRAPHS_PATH="${1}" -shift -OUTPUT_PATH="${1}" -shift - -if [[ -e "${TREE_PATH}" ]]; then - # Make sure we don't clobber the first arg on accident - echo "ERROR: Tree path ${TREE_PATH} already exists and needs to be removed" 1>&2 - exit 1 -fi - -mkdir -p "${TREE_PATH}" - -# Generate a toil-vg config -toil-vg generate-config > "${TREE_PATH}/toil-vg.conf" -sed -i "s/alignment-cores:.*/alignment-cores: 64/" "${TREE_PATH}/toil-vg.conf" -sed -i "s/alignment-disk:.*/alignment-disk: '20G'/" "${TREE_PATH}/toil-vg.conf" - -# Now we need to make sure our graphs exist and are downloaded - -if [[ ! -d "${GRAPHS_PATH}" ]]; then - # Graphs need to be gotten - - if [[ -e "${GRAPHS_PATH}" ]]; then - # It needs to not exist at all - echo "ERROR: Graph path ${GRAPHS_PATH} is not a directory" 1>&2 - exit 1 - fi - - # Make the directory - mkdir -p "${GRAPHS_PATH}" - - # Construct the graphs - # Hardcoded constants here go with the snp1kg URLs above. - toil-vg construct "${TREE_PATH}/construct" "${GRAPHS_PATH}" \ - --config "${TREE_PATH}/toil-vg.conf" \ - --vcf "${SOURCE_BASE_URL}/${VCF_BASENAME}" \ - --fasta "${SOURCE_BASE_URL}/${FASTA_BASENAME}" \ - --out_name "snp1kg-${REGION_NAME}" \ - --alt_paths \ - --realTimeLogging \ - --control_sample "${SAMPLE_NAME}" \ - --haplo_sample "${SAMPLE_NAME}" \ - --filter_samples "${SAMPLE_NAME}" \ - --regions "${GRAPH_REGION}" \ - --min_af "${MIN_AF}" \ - --primary \ - --gcsa_index \ - --xg_index \ - --gbwt_index \ - --snarls_index -fi - -READS_DIR="${GRAPHS_PATH}/sim-${READ_SEED}-${READ_COUNT}-${READ_CHUNKS}" - -if [[ ! -e "${READS_DIR}" ]]; then - # Now we need to simulate reads from the two haplotypes - # This will make a "sim.gam" - toil-vg sim "${TREE_PATH}/sim" \ - "${GRAPHS_PATH}/snp1kg-${REGION_NAME}_${SAMPLE_NAME}_haplo_thread_0.xg" \ - "${GRAPHS_PATH}/snp1kg-${REGION_NAME}_${SAMPLE_NAME}_haplo_thread_1.xg" \ - "${READ_COUNT}" \ - "${READS_DIR}" \ - --config "${TREE_PATH}/toil-vg.conf" \ - --annotate_xg "${GRAPHS_PATH}/snp1kg-${REGION_NAME}_${SAMPLE_NAME}_haplo.xg" \ - --gam \ - --fastq_out \ - --seed "${READ_SEED}" \ - --sim_chunks "${READ_CHUNKS}" \ - --fastq "${TRAINING_FASTQ}" -fi - -# Make sure we have the SLLS linear index file -# We will work on the filtered VCF -FILTERED_VCF_BASENAME="${VCF_BASENAME%.vcf.gz}_filter.vcf.gz" -SLLS_INDEX="${GRAPHS_PATH}/slls/${FILTERED_VCF_BASENAME}.slls" -if [[ ! -e "${SLLS_INDEX}" ]]; then - # We need to make the SLLS index - mkdir "${GRAPHS_PATH}/slls" - cp "${GRAPHS_PATH}/${FILTERED_VCF_BASENAME}" "${GRAPHS_PATH}/slls/${FILTERED_VCF_BASENAME}" - cd deps/sublinear-Li-Stephens && make && cd ../.. - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:`pwd`/deps/sublinear-Li-Stephens/deps/htslib/ ./deps/sublinear-Li-Stephens/bin/serializer "${GRAPHS_PATH}/slls/${FILTERED_VCF_BASENAME}" -fi - - # Now we do a bunch of stuff in parallel -JOB_ARRAY=() - -# What do they return? 
-JOB_RETURNS=() - -# What condition names have we run -CONDITIONS=() - -# How many jobs should we let run at once -MAX_JOBS=2 - -# We have a function to wait for the parallel jobs to finish, if MAX_JOBS or -# more are running -function wait_on_jobs() { - # How many jobs are running? - CURRENT_JOBS="${#JOB_ARRAY[@]}" - - if [[ "${CURRENT_JOBS}" -lt "${MAX_JOBS}" ]]; then - # If we haven't hit the cap, don't do anything. - return - fi - - # Otherwise we have to collect some jobs - COLLECTED_JOBS=0 - - # Now wait for all the jobs and fail if any failed - for JOB in "${JOB_ARRAY[@]}"; do - if [[ -z "${JOB}" ]]; then - # Drop empty strings that get in here. This happens if we forget - # the trailing & on something intended to be in parallel. - continue - fi - wait "${JOB}" - RETURN_CODE="$?" - - if [[ "${RETURN_CODE}" != "0" ]]; then - # A job has failed. - # Collect up all the jobs now, actually. - echo "Job PID ${JOB} failed with return code ${RETURN_CODE}; flushing queue" 1>&2 - MAX_JOBS=0 - fi - - JOB_RETURNS+=("${RETURN_CODE}") - ((COLLECTED_JOBS+=1)) - - if [[ "$((CURRENT_JOBS-COLLECTED_JOBS))" -lt "${MAX_JOBS}" ]]; then - # No need to clean up any more jobs - break - fi - done - - JOB_NUMBER=1 - for JOB_RETURN in "${JOB_RETURNS[@]}"; do - echo "Job ${JOB_NUMBER} exit status: ${JOB_RETURN}" - ((JOB_NUMBER+=1)) - if [[ "${JOB_RETURN}" != "0" ]]; then - echo "Job failed!" 1>&2 - exit 1 - fi - done - - # Pop off the finished jobs - JOB_ARRAY=("${JOB_ARRAY[@]:${COLLECTED_JOBS}}") - # Delete all the return codes - JOB_RETURNS=() -} - -if [[ "${RUN_JOBS}" == "1" ]]; then - # We actually want to run the toil-vg jobs - - #CONDITIONS+=("snp1kg-mp") - #if [[ ! -e "${OUTPUT_PATH}/snp1kg-mp" ]]; then - # # Do the full snp1kg graph multipath - # toil-vg mapeval "${TREE_PATH}/snp1kg-mp" "${OUTPUT_PATH}/snp1kg-mp" \ - # --single_reads_chunk \ - # --config "${TREE_PATH}/toil-vg.conf" \ - # --maxDisk 100G \ - # --multipath-only \ - # --fastq "${READS_DIR}/sim.fq.gz" \ - # --truth "${READS_DIR}/true.pos" \ - # --index-bases "${GRAPHS_PATH}/snp1kg-${REGION_NAME}_filter" \ - # --gam-names snp1kg 2>&1 & - # JOB_ARRAY+=("$!") - #fi - #wait_on_jobs - - CONDITIONS+=("snp1kg-mp-snarlcut") - if [[ ! -e "${OUTPUT_PATH}/snp1kg-mp-snarlcut" ]]; then - # Do the full snp1kg graph multipath - toil-vg mapeval "${TREE_PATH}/snp1kg-mp-snarlcut" "${OUTPUT_PATH}/snp1kg-mp-snarlcut" \ - --single_reads_chunk \ - --config "${TREE_PATH}/toil-vg.conf" \ - --maxDisk 100G \ - --multipath-only \ - --use-snarls \ - --fastq "${READS_DIR}/sim.fq.gz" \ - --truth "${READS_DIR}/true.pos" \ - --index-bases "${GRAPHS_PATH}/snp1kg-${REGION_NAME}_filter" \ - --gam-names snp1kg-snarlcut 2>&1 & - JOB_ARRAY+=("$!") - fi - wait_on_jobs - - #CONDITIONS+=("snp1kg-mp-gbwt") - #if [[ ! -e "${OUTPUT_PATH}/snp1kg-mp-gbwt" ]]; then - # # Do the full snp1kg graph multipath with gbwt - # toil-vg mapeval "${TREE_PATH}/snp1kg-mp-gbwt" "${OUTPUT_PATH}/snp1kg-mp-gbwt" \ - # --single_reads_chunk \ - # --config "${TREE_PATH}/toil-vg.conf" \ - # --maxDisk 100G \ - # --multipath-only \ - # --use-gbwt \ - # --fastq "${READS_DIR}/sim.fq.gz" \ - # --truth "${READS_DIR}/true.pos" \ - # --index-bases "${GRAPHS_PATH}/snp1kg-${REGION_NAME}_filter" \ - # --gam-names snp1kg-gbwt 2>&1 & - # JOB_ARRAY+=("$!") - #fi - #wait_on_jobs - - #CONDITIONS+=("snp1kg-mp-gbwt-traceback") - #if [[ ! 
-e "${OUTPUT_PATH}/snp1kg-mp-gbwt-traceback" ]]; then - # # Do the full snp1kg graph multipath with gbwt - # toil-vg mapeval "${TREE_PATH}/snp1kg-mp-gbwt-traceback" "${OUTPUT_PATH}/snp1kg-mp-gbwt-traceback" \ - # --single_reads_chunk \ - # --config "${TREE_PATH}/toil-vg.conf" \ - # --maxDisk 100G \ - # --multipath-only \ - # --use-gbwt \ - # --mpmap_opts "--max-paths 10" \ - # --fastq "${READS_DIR}/sim.fq.gz" \ - # --truth "${READS_DIR}/true.pos" \ - # --index-bases "${GRAPHS_PATH}/snp1kg-${REGION_NAME}_filter" \ - # --gam-names snp1kg-gbwt-traceback 2>&1 & - # JOB_ARRAY+=("$!") - #fi - #wait_on_jobs - - CONDITIONS+=("snp1kg-mp-gbwt-snarlcut") - if [[ ! -e "${OUTPUT_PATH}/snp1kg-mp-gbwt-snarlcut" ]]; then - # Do the full snp1kg graph multipath with snarl cutting and gbwt but single path traceback - toil-vg mapeval "${TREE_PATH}/snp1kg-mp-gbwt-snarlcut" "${OUTPUT_PATH}/snp1kg-mp-gbwt-snarlcut" \ - --single_reads_chunk \ - --config "${TREE_PATH}/toil-vg.conf" \ - --maxDisk 100G \ - --multipath-only \ - --use-gbwt \ - --use-snarls \ - --fastq "${READS_DIR}/sim.fq.gz" \ - --truth "${READS_DIR}/true.pos" \ - --index-bases "${GRAPHS_PATH}/snp1kg-${REGION_NAME}_filter" \ - --gam-names snp1kg-gbwt-snarlcut 2>&1 & - JOB_ARRAY+=("$!") - fi - wait_on_jobs - - CONDITIONS+=("snp1kg-mp-gbwt-traceback-snarlcut") - if [[ ! -e "${OUTPUT_PATH}/snp1kg-mp-gbwt-traceback-snarlcut" ]]; then - # Do the full snp1kg graph multipath with gbwt - toil-vg mapeval "${TREE_PATH}/snp1kg-mp-gbwt-traceback-snarlcut" "${OUTPUT_PATH}/snp1kg-mp-gbwt-traceback-snarlcut" \ - --single_reads_chunk \ - --config "${TREE_PATH}/toil-vg.conf" \ - --maxDisk 100G \ - --multipath-only \ - --use-gbwt \ - --use-snarls \ - --mpmap_opts "--max-paths 10" \ - --fastq "${READS_DIR}/sim.fq.gz" \ - --truth "${READS_DIR}/true.pos" \ - --index-bases "${GRAPHS_PATH}/snp1kg-${REGION_NAME}_filter" \ - --gam-names snp1kg-gbwt-traceback-snarlcut 2>&1 & - JOB_ARRAY+=("$!") - fi - wait_on_jobs - - # This should replicate our best known performance... - CONDITIONS+=("snp1kg-mp-gbwt-traceback-snarlcut-unfiltered") - if [[ ! -e "${OUTPUT_PATH}/snp1kg-mp-gbwt-traceback-snarlcut-unfiltered" ]]; then - # Do the full snp1kg graph multipath with gbwt - toil-vg mapeval "${TREE_PATH}/snp1kg-mp-gbwt-traceback-snarlcut-unfiltered" "${OUTPUT_PATH}/snp1kg-mp-gbwt-traceback-snarlcut-unfiltered" \ - --single_reads_chunk \ - --config "${TREE_PATH}/toil-vg.conf" \ - --maxDisk 100G \ - --multipath-only \ - --use-gbwt \ - --use-snarls \ - --mpmap_opts "--max-paths 10" \ - --fastq "${READS_DIR}/sim.fq.gz" \ - --truth "${READS_DIR}/true.pos" \ - --index-bases "${GRAPHS_PATH}/snp1kg-${REGION_NAME}" \ - --gam-names snp1kg-gbwt-traceback-snarlcut-unfiltered 2>&1 & - JOB_ARRAY+=("$!") - fi - wait_on_jobs - - CONDITIONS+=("snp1kg-mp-gbwt-traceback") - if [[ ! -e "${OUTPUT_PATH}/snp1kg-mp-gbwt-traceback" ]]; then - # Do the full snp1kg graph multipath with gbwt - toil-vg mapeval "${TREE_PATH}/snp1kg-mp-gbwt-traceback" "${OUTPUT_PATH}/snp1kg-mp-gbwt-traceback" \ - --single_reads_chunk \ - --config "${TREE_PATH}/toil-vg.conf" \ - --maxDisk 100G \ - --multipath-only \ - --use-gbwt \ - --use-snarls \ - --mpmap_opts "--max-paths 10" \ - --fastq "${READS_DIR}/sim.fq.gz" \ - --truth "${READS_DIR}/true.pos" \ - --index-bases "${GRAPHS_PATH}/snp1kg-${REGION_NAME}_filter" \ - --gam-names snp1kg-gbwt-traceback 2>&1 & - JOB_ARRAY+=("$!") - fi - wait_on_jobs - - CONDITIONS+=("snp1kg-mp-minaf-snarlcut") - if [[ ! 
-e "${OUTPUT_PATH}/snp1kg-mp-minaf-snarlcut" ]]; then - # And with the min allele frequency - toil-vg mapeval "${TREE_PATH}/snp1kg-mp-minaf-snarlcut" "${OUTPUT_PATH}/snp1kg-mp-minaf-snarlcut" \ - --single_reads_chunk \ - --config "${TREE_PATH}/toil-vg.conf" \ - --maxDisk 100G \ - --multipath-only \ - --use-snarls \ - --fastq "${READS_DIR}/sim.fq.gz" \ - --truth "${READS_DIR}/true.pos" \ - --index-bases "${GRAPHS_PATH}/snp1kg-${REGION_NAME}_minaf_${MIN_AF}" \ - --gam-names snp1kg-minaf-snarlcut 2>&1 & - JOB_ARRAY+=("$!") - fi - wait_on_jobs - - CONDITIONS+=("snp1kg-mp-minaf") - if [[ ! -e "${OUTPUT_PATH}/snp1kg-mp-minaf" ]]; then - # And with the min allele frequency - toil-vg mapeval "${TREE_PATH}/snp1kg-mp-minaf" "${OUTPUT_PATH}/snp1kg-mp-minaf" \ - --single_reads_chunk \ - --config "${TREE_PATH}/toil-vg.conf" \ - --maxDisk 100G \ - --multipath-only \ - --use-snarls \ - --fastq "${READS_DIR}/sim.fq.gz" \ - --truth "${READS_DIR}/true.pos" \ - --index-bases "${GRAPHS_PATH}/snp1kg-${REGION_NAME}_minaf_${MIN_AF}" \ - --gam-names snp1kg-minaf 2>&1 & - JOB_ARRAY+=("$!") - fi - wait_on_jobs - - CONDITIONS+=("snp1kg-mp-positive-snarlcut") - if [[ ! -e "${OUTPUT_PATH}/snp1kg-mp-positive-snarlcut" ]]; then - # And the positive control with only real variants - toil-vg mapeval "${TREE_PATH}/snp1kg-mp-positive-snarlcut" "${OUTPUT_PATH}/snp1kg-mp-positive-snarlcut" \ - --single_reads_chunk \ - --config "${TREE_PATH}/toil-vg.conf" \ - --maxDisk 100G \ - --multipath-only \ - --use-snarls \ - --fastq "${READS_DIR}/sim.fq.gz" \ - --truth "${READS_DIR}/true.pos" \ - --index-bases "${GRAPHS_PATH}/snp1kg-${REGION_NAME}_${SAMPLE_NAME}" \ - --gam-names snp1kg-positive-snarlcut 2>&1 & - JOB_ARRAY+=("$!") - fi - wait_on_jobs - - CONDITIONS+=("snp1kg-mp-positive") - if [[ ! -e "${OUTPUT_PATH}/snp1kg-mp-positive" ]]; then - # And the positive control with only real variants - toil-vg mapeval "${TREE_PATH}/snp1kg-mp-positive" "${OUTPUT_PATH}/snp1kg-mp-positive" \ - --single_reads_chunk \ - --config "${TREE_PATH}/toil-vg.conf" \ - --maxDisk 100G \ - --multipath-only \ - --use-snarls \ - --fastq "${READS_DIR}/sim.fq.gz" \ - --truth "${READS_DIR}/true.pos" \ - --index-bases "${GRAPHS_PATH}/snp1kg-${REGION_NAME}_${SAMPLE_NAME}" \ - --gam-names snp1kg-positive 2>&1 & - JOB_ARRAY+=("$!") - fi - wait_on_jobs - - CONDITIONS+=("primary-mp-snarlcut") - if [[ ! -e "${OUTPUT_PATH}/primary-mp-snarlcut" ]]; then - # And the primary path only - toil-vg mapeval "${TREE_PATH}/primary-mp-snarlcut" "${OUTPUT_PATH}/primary-mp-snarlcut" \ - --single_reads_chunk \ - --config "${TREE_PATH}/toil-vg.conf" \ - --maxDisk 100G \ - --multipath-only \ - --use-snarls \ - --fastq "${READS_DIR}/sim.fq.gz" \ - --truth "${READS_DIR}/true.pos" \ - --index-bases "${GRAPHS_PATH}/primary" \ - --gam-names primary-snarlcut 2>&1 & - JOB_ARRAY+=("$!") - fi - wait_on_jobs - - CONDITIONS+=("primary-mp") - if [[ ! -e "${OUTPUT_PATH}/primary-mp" ]]; then - # And the primary path only - toil-vg mapeval "${TREE_PATH}/primary-mp" "${OUTPUT_PATH}/primary-mp" \ - --single_reads_chunk \ - --config "${TREE_PATH}/toil-vg.conf" \ - --maxDisk 100G \ - --multipath-only \ - --use-snarls \ - --fastq "${READS_DIR}/sim.fq.gz" \ - --truth "${READS_DIR}/true.pos" \ - --index-bases "${GRAPHS_PATH}/primary" \ - --gam-names primary 2>&1 & - JOB_ARRAY+=("$!") - fi - wait_on_jobs - - #CONDITIONS+=("snp1kg-mp-slls") - #if [[ ! -e "${OUTPUT_PATH}/snp1kg-mp-slls/position.results.tsv" ]]; then - # # This one's a bit different since we need to manually do the mapping - # if [[ ! 
-e "${OUTPUT_PATH}/snp1kg-mp-slls/aligned-snp1kg-slls-pe_default.gam" ]]; then - # mkdir -p "${OUTPUT_PATH}/snp1kg-mp-slls" - # vg mpmap --linear-index "${SLLS_INDEX}" \ - # --linear-path "${GRAPH_CONTIG}" \ - # -x "${GRAPHS_PATH}/snp1kg-${REGION_NAME}_filter.xg" \ - # -g "${GRAPHS_PATH}/snp1kg-${REGION_NAME}_filter.gcsa" \ - # --fastq "${READS_DIR}/sim.fq.gz" \ - # -i \ - # -S \ - # -t 32 \ - # >"${OUTPUT_PATH}/snp1kg-mp-slls/aligned-snp1kg-slls-pe_default.gam" - # fi - # # Then do the mapeval - # toil-vg mapeval "${TREE_PATH}/snp1kg-mp-slls" "${OUTPUT_PATH}/snp1kg-mp-slls" \ - # --gams "${OUTPUT_PATH}/snp1kg-mp-slls/aligned-snp1kg-slls-pe_default.gam" \ - # --config "${TREE_PATH}/toil-vg.conf" \ - # --maxDisk 100G \ - # --truth "${READS_DIR}/true.pos" \ - # --index-bases "${GRAPHS_PATH}/snp1kg-${REGION_NAME}_filter" \ - # --gam-names snp1kg-slls-mp-pe 2>&1 & - # JOB_ARRAY+=("$!") - #fi - #wait_on_jobs - - #CONDITIONS+=("snp1kg-mp-slls-snarlcut") - #if [[ ! -e "${OUTPUT_PATH}/snp1kg-mp-slls-snarlcut/position.results.tsv" ]]; then - # # This one's a bit different since we need to manually do the mapping - # if [[ ! -e "${OUTPUT_PATH}/snp1kg-mp-slls-snarlcut/aligned-snp1kg-slls-snarlcut-pe_default.gam" ]]; then - # mkdir -p "${OUTPUT_PATH}/snp1kg-mp-slls-snarlcut" - # vg mpmap --linear-index "${SLLS_INDEX}" \ - # --linear-path "${GRAPH_CONTIG}" \ - # -x "${GRAPHS_PATH}/snp1kg-${REGION_NAME}_filter.xg" \ - # -g "${GRAPHS_PATH}/snp1kg-${REGION_NAME}_filter.gcsa" \ - # --snarls "${GRAPHS_PATH}/snp1kg-${REGION_NAME}_filter.snarls" \ - # --fastq "${READS_DIR}/sim.fq.gz" \ - # -i \ - # -S \ - # -t 32 \ - # >"${OUTPUT_PATH}/snp1kg-mp-slls-snarlcut/aligned-snp1kg-slls-snarlcut-pe_default.gam" - # fi - # # Then do the mapeval - # toil-vg mapeval "${TREE_PATH}/snp1kg-mp-slls-snarlcut" "${OUTPUT_PATH}/snp1kg-mp-slls-snarlcut" \ - # --gams "${OUTPUT_PATH}/snp1kg-mp-slls-snarlcut/aligned-snp1kg-slls-snarlcut-pe_default.gam" \ - # --config "${TREE_PATH}/toil-vg.conf" \ - # --maxDisk 100G \ - # --truth "${READS_DIR}/true.pos" \ - # --index-bases "${GRAPHS_PATH}/snp1kg-${REGION_NAME}_filter" \ - # --gam-names snp1kg-slls-snarlcut-mp-pe 2>&1 & - # JOB_ARRAY+=("$!") - #fi - #wait_on_jobs - -fi - -# Make all the jobs finish -MAX_JOBS=0 -wait_on_jobs - -if [[ ! -e "${OUTPUT_PATH}/position.results.tsv" ]]; then - # Concatenate all the conditions' position results files - - FIRST_CONDITION=1 - for CONDITION in "${CONDITIONS[@]}"; do - if [[ "${FIRST_CONDITION}" == "1" ]]; then - # Keep the header on the first condition we process - cat "${OUTPUT_PATH}/${CONDITION}/position.results.tsv" > "${OUTPUT_PATH}/position.results.tsv" - else - # Drop the header - cat "${OUTPUT_PATH}/${CONDITION}/position.results.tsv" | sed 1d >> "${OUTPUT_PATH}/position.results.tsv" - fi - FIRST_CONDITION=0 - done -fi - -if [[ ! -e "${OUTPUT_PATH}/qq" ]]; then - # Collect all the qq plots to one directory to page through easily - mkdir "${OUTPUT_PATH}/qq" - - CONDITION_NUMBER=0 - for CONDITION in "${CONDITIONS[@]}"; do - cp "${OUTPUT_PATH}/${CONDITION}/plot-qq.svg" "${OUTPUT_PATH}/qq/qq-${CONDITION_NUMBER}-${CONDITION}.svg" - ((CONDITION_NUMBER+=1)) - done - -fi - -if [[ ! -e "${OUTPUT_PATH}/table.tsv" ]]; then - # Make a table of wrong reads - - # First we need a baseline for comparing against - BASELINE_CONDITION="snp1kg-mp-snarlcut" - - # Make header for table of wrong read counts - printf "Condition\tWrong reads total\tAt MAPQ 60\tAt MAPQ 0\tAt MAPQ >0\tNew vs. ${BASELINE_CONDITION}\tFixed vs. ${BASELINE_CONDITION}\tAvg. 
Correct MAPQ\tCorrect MAPQ 0\n" > "${OUTPUT_PATH}/table.tsv" - - # Pull out the baseline wrong reads - cat "${OUTPUT_PATH}/${BASELINE_CONDITION}/position.results.tsv" | sed 1d | grep -- "-pe" | grep -v "^1" | cut -f4 | sort > "${OUTPUT_PATH}/baseline-wrong-names.tsv" - - for CONDITION in "${CONDITIONS[@]}"; do - - # We want a table like - # Condition Wrong reads total At MAPQ 60 At MAPQ 0 At MAPQ >0 New vs. baseline Fixed vs. baseline - printf "${CONDITION}\t" >> "${OUTPUT_PATH}/table.tsv" - - - # Get the wrong reads for this condition - cat "${OUTPUT_PATH}/${CONDITION}/position.results.tsv" | sed 1d | grep -- "-pe" | grep -v "^1" > "${OUTPUT_PATH}/${CONDITION}/wrong.tsv" - - # Wrong total - cat "${OUTPUT_PATH}/${CONDITION}/wrong.tsv" | wc -l | tr -d '\n' >> "${OUTPUT_PATH}/table.tsv" - printf "\t" >> "${OUTPUT_PATH}/table.tsv" - - # Wrong at MAPQ 60 - cat "${OUTPUT_PATH}/${CONDITION}/wrong.tsv" | grep -P "\t60\t" | wc -l | tr -d '\n' >> "${OUTPUT_PATH}/table.tsv" - printf "\t" >> "${OUTPUT_PATH}/table.tsv" - - # Wrong at MAPQ 0 - cat "${OUTPUT_PATH}/${CONDITION}/wrong.tsv" | grep -P "\t0\t" | wc -l | tr -d '\n' >> "${OUTPUT_PATH}/table.tsv" - printf "\t" >> "${OUTPUT_PATH}/table.tsv" - - # Wrong at MAPQ >0 - cat "${OUTPUT_PATH}/${CONDITION}/wrong.tsv" | grep -v -P "\t0\t" | wc -l | tr -d '\n' >> "${OUTPUT_PATH}/table.tsv" - printf "\t" >> "${OUTPUT_PATH}/table.tsv" - - # Get the wrong read names for this condition - cat "${OUTPUT_PATH}/${CONDITION}/wrong.tsv" | cut -f4 | sort > "${OUTPUT_PATH}/${CONDITION}/wrong-names.tsv" - - # Count newly wrong names (not in file 1 or both) - comm -1 -3 "${OUTPUT_PATH}/baseline-wrong-names.tsv" "${OUTPUT_PATH}/${CONDITION}/wrong-names.tsv" | wc -l | tr -d '\n' >> "${OUTPUT_PATH}/table.tsv" - printf "\t" >> "${OUTPUT_PATH}/table.tsv" - - # Count newly right names (not in file 2 or both) - comm -2 -3 "${OUTPUT_PATH}/baseline-wrong-names.tsv" "${OUTPUT_PATH}/${CONDITION}/wrong-names.tsv" | wc -l | tr -d '\n' >> "${OUTPUT_PATH}/table.tsv" - printf "\t" >> "${OUTPUT_PATH}/table.tsv" - - # Get the right reads - cat "${OUTPUT_PATH}/${CONDITION}/position.results.tsv" | sed 1d | grep -- "-pe" | grep "^1" > "${OUTPUT_PATH}/${CONDITION}/right.tsv" - - # Compute average MAPQ for correct reads - # See - cat "${OUTPUT_PATH}/${CONDITION}/right.tsv" | cut -f2 | awk '{total += $1} END {if (NR > 0) {print total / NR} else {print "N/A"}}' | tr -d '\n' >> "${OUTPUT_PATH}/table.tsv" - printf "\t" >> "${OUTPUT_PATH}/table.tsv" - - # Count correct reads at MAPQ 0 - cat "${OUTPUT_PATH}/${CONDITION}/right.tsv" | grep -P "\t0\t" | wc -l | tr -d '\n' >> "${OUTPUT_PATH}/table.tsv" - printf "\n" >> "${OUTPUT_PATH}/table.tsv" - done - - rm "${OUTPUT_PATH}/baseline-wrong-names.tsv" - -fi - -# Determine our source directory, where the ROC plotting script also lives -# See -SCRIPT_DIRECTORY="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -# Do the R plots - -if [[ ! -e "${OUTPUT_PATH}/roc.svg" ]]; then - Rscript "${SCRIPT_DIRECTORY}/plot-roc.R" "${OUTPUT_PATH}/position.results.tsv" "${OUTPUT_PATH}/roc.svg" -fi -if [[ ! 
-e "${OUTPUT_PATH}/pr.svg" ]]; then - Rscript "${SCRIPT_DIRECTORY}/plot-pr.R" "${OUTPUT_PATH}/position.results.tsv" "${OUTPUT_PATH}/pr.svg" -fi - -rm "${TREE_PATH}/toil-vg.conf" -rmdir "${TREE_PATH}" -echo "Results available as plots ${OUTPUT_PATH}/roc.svg and ${OUTPUT_PATH}/pr.svg and table ${OUTPUT_PATH}/table.tsv" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/scripts/vg_sim_pos_compare.py b/scripts/vg_sim_pos_compare.py index 5c91dcfefa8..4b2e128cd90 100755 --- a/scripts/vg_sim_pos_compare.py +++ b/scripts/vg_sim_pos_compare.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 ### positional comparison script useful for tagging alignments with their true position @@ -13,7 +13,7 @@ # every input has a true position, and if it has less than the expected number of fields we assume alignment failed aln_name = fields[0] if len(fields) != 13: - print aln_name, 0, 0, 0, 0, 0, 0, 0, 0, 0 + print(aln_name, 0, 0, 0, 0, 0, 0, 0, 0, 0) continue aln_chr = fields[1] aln_pos = int(fields[2]) @@ -30,4 +30,4 @@ novel_nodes = int(fields[11]) novel_bp = int(fields[12]) aln_correct = 1 if aln_chr == true_chr and abs(true_pos - aln_pos) < threshold else 0 - print aln_name, aln_correct, aln_mapq, aln_score, length, unaligned, known_nodes, known_bp, novel_nodes, novel_bp + print(aln_name, aln_correct, aln_mapq, aln_score, length, unaligned, known_nodes, known_bp, novel_nodes, novel_bp) diff --git a/source_me.sh b/source_me.sh index c21c371deca..0bb948c9280 100755 --- a/source_me.sh +++ b/source_me.sh @@ -2,10 +2,13 @@ export LIBRARY_PATH=`pwd`/lib:$LIBRARY_PATH export LD_LIBRARY_PATH=`pwd`/lib:$LD_LIBRARY_PATH export DYLD_LIBRARY_PATH=`pwd`/lib:$DYLD_LIBRARY_PATH export LD_INCLUDE_PATH=`pwd`/include:$LD_INCLUDE_PATH -export C_INCLUDE_PATH=`pwd`/include:$C_INCLUDE_PATH -export CPLUS_INCLUDE_PATH=`pwd`/include:$CPLUS_INCLUDE_PATH -export INCLUDE_PATH=`pwd`/include:$INCLUDE_PATH -export PATH=`pwd`/bin:`pwd`/scripts:$PATH +# Setting include directories via C_INCLUDE_PATH/CPLUS_INCLUDE_PATH will +# automatically get them demoted to the end of the search list even if a -I +# option is passed to try and bump them up earlier, before other -I options. +# We leave the Makefile in charge of finding all the include directories. +export CFLAGS="-I $(pwd)/include ${CFLAGS}" +export CXXFLAGS="-I $(pwd)/include -I$(pwd)/include/dynamic ${CXXFLAGS}" +export PATH=`pwd`/bin:`pwd`/scripts:"$PATH" export CC=$(which gcc) export CXX=$(which g++) diff --git a/src/algorithms/a_star.hpp b/src/algorithms/a_star.hpp new file mode 100644 index 00000000000..8639c5cf55a --- /dev/null +++ b/src/algorithms/a_star.hpp @@ -0,0 +1,215 @@ +#ifndef VG_ALGORITHMS_A_STAR_HPP_INCLUDED +#define VG_ALGORITHMS_A_STAR_HPP_INCLUDED + +/** + * \file a_star.hpp + * + * Defines an implementation of the A* heuristic-guided search algorithm. + */ + +#include "../handle.hpp" + +#include +#include + +//#define debug_a_star + +namespace vg { +namespace algorithms { + +using namespace std; + + /// Implements the A* heuristic-guided search algorithm. Returns the path from pos_1 to + /// pos_2 that is either minimal or maximal length, according to the parameters. Allows + /// an extremal distance beyond which the algorithm will cease looking for paths (this + /// should be a large value when looking for minimal paths and a small value when looking + /// for maximum paths). If there is no path between the positions, or none within the + /// extremal length, an empty vector will be returned. 
+ template + vector a_star(const HandleGraph* graph, + const pos_t& pos_1, const pos_t& pos_2, + const DistHeuristic& dist_heuristic, + bool find_min = true, + int64_t extremal_distance = numeric_limits::max(), + bool monotonic_heuristic = true); + + // TODO: non-monotonic heuristics in cyclic graphs cannot identify unreachable positions + + + + + + /* + * Implementation of template functions + */ + + template + vector a_star(const HandleGraph* graph, + const pos_t& pos_1, const pos_t& pos_2, + const DistHeuristic& dist_heuristic, + bool find_min, + int64_t extremal_distance, + bool monotonic_heuristic) { + +#ifdef debug_a_star + cerr << "doing A* search from " << pos_1 << " to " << pos_2 << " looking for " << (find_min ? "min" : "max") << " distance path with breakout distance " << extremal_distance << endl; +#endif + + /* + * The last node in a search path, pointing back to the previous node for traceback purposes + */ + struct AStarPathEnd { + AStarPathEnd(handle_t handle, int64_t predecessor_idx) + : handle(handle), predecessor_idx(predecessor_idx) { } + + handle_t handle; // current node + int64_t predecessor_idx; // the index of the predecessor node in the search history or -1 if none + }; + + /* + * A step in the search, including the node, the traveled distance, and the heuristic estimate of + * the length of the whole path + */ + struct AStarSearchRecord { + + AStarSearchRecord(const AStarPathEnd& path_end, int64_t distance, int64_t heuristic_estimate) + : path_end(path_end), distance(distance), heuristic_estimate(heuristic_estimate) { } + + AStarPathEnd path_end; // the end of the current path + int64_t distance; // distance up to this point in traversed path + int64_t heuristic_estimate; // estimate of the total distance to the target + }; + + /* + * A stateful comparator that prioritizes search steps based on their heuristic distance + */ + struct AStarCompare { + AStarCompare(bool do_min) : do_min(do_min) { } + + // reverse the ordering for a min to match the behavior of a priority queue (which selects + // the max value) + inline bool operator()(const AStarSearchRecord& a, const AStarSearchRecord& b) const { + return ((do_min && a.heuristic_estimate > b.heuristic_estimate) + || (!do_min && a.heuristic_estimate < b.heuristic_estimate)); + } + + bool do_min; + }; + + + // TODO: this function won't handle cycles as written + // it needs to be able to move past the end node and circle back around to the start + // node without breaking the predecessor (i.e. have multiple predecessors) + + // TODO: handle a sentinel for being unreachable + + // handle the same node reachable case as an edge case + if (id(pos_1) == id(pos_2) && is_rev(pos_1) == is_rev(pos_2) && offset(pos_1) <= offset(pos_2) + && (find_min ? 
offset(pos_2) - offset(pos_1) <= extremal_distance : offset(pos_2) - offset(pos_1) >= extremal_distance)) { + return vector(1, graph->get_handle(id(pos_1), is_rev(pos_1))); + + } + + // the paths we've walked so far + vector path_search_history; + + // the nodes we've decided we don't need to revisit + unordered_set closed_nodes; + + handle_t start = graph->get_handle(id(pos_1), is_rev(pos_1)); + handle_t end = graph->get_handle(id(pos_2), is_rev(pos_2)); + + // init the priority queue, ordered by either min or max + AStarCompare compare(find_min); + priority_queue, AStarCompare> queue(compare); + + // set negative distance so the search starts from the beginning of the node + queue.emplace(AStarPathEnd(start, -1), + -offset(pos_1), + dist_heuristic(make_pos_t(id(pos_1), is_rev(pos_1), 0), pos_2)); + + while (!queue.empty()) { + auto search_record = queue.top(); + queue.pop(); + + // have we marked this node as one we don't need to traverse again? + if (closed_nodes.count(search_record.path_end.handle)) { + continue; + } + + // create a record of this search step in our history + int64_t current_idx = path_search_history.size(); + path_search_history.emplace_back(search_record.path_end); + +#ifdef debug_a_star + cerr << "at node " << graph->get_id(search_record.path_end.handle) << (graph->get_is_reverse(search_record.path_end.handle) ? "-" : "+") << " with heuristic distance " << search_record.heuristic_estimate << " and prefix distance " << search_record.distance << " from predecessor " << graph->get_id(path_search_history[search_record.path_end.predecessor_idx].handle) << (graph->get_is_reverse(path_search_history[search_record.path_end.predecessor_idx].handle) ? "-" : "+") << endl; +#endif + + if ((find_min && search_record.distance + int64_t(offset(pos_2)) > extremal_distance) + || (!find_min && search_record.distance + int64_t(offset(pos_2)) < extremal_distance)) { + // we've crossed over the distance beyond which we're not interested + +#ifdef debug_a_star + cerr << "\twe've crossed the extremal distance of " << (find_min ? extremal_distance - offset(pos_2) : extremal_distance + offset(pos_2)) << endl; +#endif + break; + } + + // we never need to return here if we're using a monotonic heuristic + if (monotonic_heuristic) { + closed_nodes.insert(search_record.path_end.handle); + } + + if (path_search_history.size() > 1 && // in first step, we only got to this loop if offsets are unreachable + search_record.path_end.handle == end) { + // we've found the end, reconstruct the path and return it + +#ifdef debug_a_star + cerr << "\tfound target" << endl; + cerr << "path search history:" << endl; + for (size_t i = 0; i < path_search_history.size(); i++) { + cerr << "\t" << i << ": " << graph->get_id(path_search_history[i].handle) << (graph->get_is_reverse(path_search_history[i].handle) ? 
"-" : "+") << " -> " << path_search_history[i].predecessor_idx << endl; + } +#endif + + vector path; + // walk the path backwards through the searh history + int64_t idx = path_search_history.size() - 1; + while (idx >= 0) { + auto& path_step = path_search_history[idx]; + path.emplace_back(path_step.handle); + idx = path_step.predecessor_idx; + } + // put the path in forward order + reverse(path.begin(), path.end()); + return path; + } + + int64_t distance = graph->get_length(search_record.path_end.handle) + search_record.distance; + +#ifdef debug_a_star + cerr << "\tdistance through node is " << distance << endl; +#endif + + auto enqueue_next = [&](const handle_t& next) { + + int64_t remaining = dist_heuristic(make_pos_t(graph->get_id(next), graph->get_is_reverse(next), 0), + pos_2); +#ifdef debug_a_star + cerr << "\ttraversing to " << graph->get_id(next) << (graph->get_is_reverse(next) ? "-" : "+") << " with guess for remaining distance at " << remaining << endl; +#endif + + queue.emplace(AStarPathEnd(next, current_idx), distance, distance + remaining); + }; + + // enqueue walking forward + graph->follow_edges(search_record.path_end.handle, false, enqueue_next); + } + + // we made it through the loop without finding a path + return vector(); + } +} +} + +#endif diff --git a/src/algorithms/alignment_path_offsets.cpp b/src/algorithms/alignment_path_offsets.cpp new file mode 100644 index 00000000000..d50f9100818 --- /dev/null +++ b/src/algorithms/alignment_path_offsets.cpp @@ -0,0 +1,229 @@ +#include "alignment_path_offsets.hpp" + +//#define debug_mpaln_offsets + +namespace vg { +namespace algorithms { + +unordered_map > > +alignment_path_offsets(const PathPositionHandleGraph& graph, + const Alignment& aln, + bool just_min, + bool nearby, + size_t search_limit, + const std::function* path_filter) { + if (nearby && search_limit == 0) { + // Fill in the search limit + search_limit = aln.sequence().size(); + } + unordered_map > > offsets; + if (graph.get_path_count() == 0) return offsets; + for (auto& mapping : aln.path().mapping()) { + // How many bases does this Mapping cover over? + size_t mapping_width = mapping_from_length(mapping); + if (mapping_width == 0 && !nearby) { + // Just skip over this mapping; it touches no bases. + continue; + } + // We may have to consider both the starts and ends of mappings + vector end = {false}; + if (just_min && !nearby) { + // We want the min actually touched position along each path. It + // could come from the Mapping start or the Mapping end. + end.push_back(true); + } + // Find the position of this end of this mapping + pos_t mapping_pos = make_pos_t(mapping.position()); + // Find the positions for this end of this Mapping + auto pos_offs = algorithms::nearest_offsets_in_paths(&graph, mapping_pos, nearby ? search_limit : -1, path_filter); + for (auto look_at_end : end) { + // For the start and the end of the Mapping, as needed + for (auto& p : pos_offs) { + // For each path, splice the list of path positions for this Mapping + // onto the end of the list of positions we found in that path + auto& v = offsets[p.first]; + for (pair& y : p.second) { + v.emplace_back(y.second ? 
y.first - mapping_width : y.first, + y.second); + } + } + } + } + if (!nearby && offsets.empty()) { + // find the nearest if we couldn't find any before + return alignment_path_offsets(graph, aln, just_min, true, search_limit, path_filter); + } + if (just_min) { + // We need the minimum position for each path + for (auto& p : offsets) { + auto& v = p.second; + auto m = *min_element(v.begin(), v.end(), + [](const pair& a, + const pair& b) + { return a.first < b.first; }); + v.clear(); + v.push_back(m); + } + } + return offsets; +} + +unordered_map > > +multipath_alignment_path_offsets(const PathPositionHandleGraph& graph, + const multipath_alignment_t& mp_aln, + const std::function* path_filter) { + + using path_positions_t = unordered_map>>; + + // collect the search results for each mapping on each subpath + vector> search_results(mp_aln.subpath_size()); + for (size_t i = 0; i < mp_aln.subpath_size(); ++i) { + const subpath_t& subpath = mp_aln.subpath(i); + auto& subpath_search_results = search_results[i]; + subpath_search_results.resize(subpath.path().mapping_size()); + for (size_t j = 0; j < subpath.path().mapping_size(); ++j) { + // get the positions on paths that this mapping touches + pos_t mapping_pos = make_pos_t(subpath.path().mapping(j).position()); + subpath_search_results[j] = nearest_offsets_in_paths(&graph, mapping_pos, 0, path_filter); + // make sure that offsets are stored in increasing order + for (pair>>& search_record : subpath_search_results[j]) { + sort(search_record.second.begin(), search_record.second.end()); + } +#ifdef debug_mpaln_offsets + cerr << "subpath " << i << ", mapping " << j << " path locations" << endl; + for (const auto& pps : subpath_search_results[j]) { + cerr << graph.get_path_name(pps.first) << endl; + for (const auto& pp : pps.second) { + cerr << "\t" << pp.first << " " << pp.second << endl; + } + } +#endif + } + } + + path_positions_t return_val; + + // to keep track of whether we've already chosen a position on each path + // earlier in the multipath alignment in either the forward or reverse pass + vector> covered_fwd(mp_aln.subpath_size()); + vector> covered_rev(mp_aln.subpath_size()); + + // forward pass looking for positions on the forward strand of paths + for (size_t i = 0; i < mp_aln.subpath_size(); ++i) { + const auto& subpath_search_results = search_results[i]; + for (size_t j = 0; j < subpath_search_results.size(); ++j) { + for (const auto& path_pos : subpath_search_results[j]) { + if (!covered_fwd[i].count(path_pos.first)) { + // we haven't already covered this path at an earlier position on the alignment + for (const auto& path_offset : path_pos.second) { + if (!path_offset.second) { + // there's a position on the forward strand of this path + return_val[path_pos.first].emplace_back(path_offset); + + // we're now covering this path for future search results + covered_fwd[i].insert(path_pos.first); + +#ifdef debug_mpaln_offsets + cerr << "found fwd pass pos, subpath " << i << ", mapping " << j << ", path " << graph.get_path_name(path_pos.first) << ", pos " << path_offset.first << " " << path_offset.second << endl; +#endif + + break; + } + } + } + } + } + + // the following subpaths will be covered for any path that this + // one is covered for + for (auto n : mp_aln.subpath(i).next()) { + auto& next_coverings = covered_fwd[n]; + for (auto path_handle : covered_fwd[i]) { + next_coverings.insert(path_handle); + } + } + for (const auto& c : mp_aln.subpath(i).connection()) { + auto& next_coverings = covered_fwd[c.next()]; + for (auto 
path_handle : covered_fwd[i]) { + next_coverings.insert(path_handle); + } + } + } + + // now do a backward pass for the reverse strand of paths + for (int64_t i = mp_aln.subpath_size() - 1; i >= 0; --i) { + // find which paths are already covered in the reverse + for (auto n : mp_aln.subpath(i).next()) { + for (auto path_handle : covered_rev[n]) { + covered_rev[i].insert(path_handle); + } + } + for (const auto& c : mp_aln.subpath(i).connection()) { + for (auto path_handle : covered_rev[c.next()]) { + covered_rev[i].insert(path_handle); + } + } + + const auto& subpath_search_results = search_results[i]; + for (int64_t j = subpath_search_results.size() - 1; j >= 0; --j) { + for (const auto& path_pos : subpath_search_results[j]) { + if (!covered_rev[i].count(path_pos.first)) { + // we haven't already covered this path at an earlier position on the alignment + for (const auto& path_offset : path_pos.second) { + if (path_offset.second) { + // there's a position on the reverse strand of this path + auto mapping_len = mapping_from_length(mp_aln.subpath(i).path().mapping(j)); + return_val[path_pos.first].emplace_back(path_offset.first - mapping_len, + path_offset.second); + +#ifdef debug_mpaln_offsets + cerr << "found rev pass pos, subpath " << i << ", mapping " << j << ", path " << graph.get_path_name(path_pos.first) << ", pos " << path_offset.first - mapping_len << " " << path_offset.second << endl; +#endif + // we're now covering this path for future search results + covered_rev[i].insert(path_pos.first); + + break; + } + } + } + } + } + } + + return return_val; +} + +void annotate_with_initial_path_positions(const PathPositionHandleGraph& graph, Alignment& aln, size_t search_limit, const std::function* path_filter) { + annotate_with_path_positions(graph, aln, true, search_limit, path_filter); +} + +void annotate_with_node_path_positions(const PathPositionHandleGraph& graph, Alignment& aln, size_t search_limit, const std::function* path_filter) { + annotate_with_path_positions(graph, aln, false, search_limit, path_filter); +} + +void annotate_with_path_positions(const PathPositionHandleGraph& graph, Alignment& aln, bool just_min, size_t search_limit, const std::function* path_filter) { + if (!aln.refpos_size()) { + // Get requested path positions + unordered_map > > positions = alignment_path_offsets(graph, aln, just_min, false, search_limit, path_filter); + // emit them in order of the path handle + vector ordered; + for (auto& path : positions) { ordered.push_back(path.first); } + std::sort(ordered.begin(), ordered.end(), [](const path_handle_t& a, const path_handle_t& b) { return as_integer(a) < as_integer(b); }); + for (auto& path : ordered) { + for (auto& p : positions[path]) { + // Add each determined refpos + Position* refpos = aln.add_refpos(); + refpos->set_name(graph.get_path_name(path)); + refpos->set_offset(p.first); + refpos->set_is_reverse(p.second); + } + } + } +} + +void annotate_with_initial_path_positions(const PathPositionHandleGraph& graph, vector& alns, size_t search_limit, const std::function* path_filter) { + for (auto& aln : alns) annotate_with_initial_path_positions(graph, aln, search_limit, path_filter); +} + +} +} diff --git a/src/algorithms/alignment_path_offsets.hpp b/src/algorithms/alignment_path_offsets.hpp new file mode 100644 index 00000000000..4c601404d85 --- /dev/null +++ b/src/algorithms/alignment_path_offsets.hpp @@ -0,0 +1,95 @@ +#pragma once + +#include "../handle.hpp" +#include "../position.hpp" +#include "../path.hpp" +#include 
"../multipath_alignment.hpp" +#include +#include +#include "nearest_offsets_in_paths.hpp" + +namespace vg { +namespace algorithms { + +using namespace std; + +/// Gives the path positions of the alignment. If just_min is set, gives the +/// minimum position on each path. Else, gives all Mapping start positions on +/// each path. If nearby is set, will search for a nearby path. Will recurse +/// with nearby set if it is not set on initial call and no positions are +/// found. Respects search_limit in bp in that case. If search_limit is 0, read +/// length is used. +/// +/// If path_filter is set, and it returns false for a path, that path is not +/// used to annotate the read. +unordered_map > > +alignment_path_offsets(const PathPositionHandleGraph& graph, + const Alignment& aln, + bool just_min, + bool nearby, + size_t search_limit = 0, + const std::function* path_filter = nullptr); + +/// Find the position of a multipath alignment on paths. Returns the lowest offset +/// position on a path for each contiguous stretch of the multipath alignment, but +/// multiple positions on the same path may be returned if the multipath alignment +/// is disconnected or fans out toward the sources or sinks. +/// +/// If path_filter is set, and it returns false for a path, that path is not +/// used to annotate the read. +unordered_map > > +multipath_alignment_path_offsets(const PathPositionHandleGraph& graph, + const multipath_alignment_t& mp_aln, + const std::function* path_filter = nullptr); + +/// Use the graph to annotate an Alignment with the first +/// position it touches on each reference path. Thread safe. +/// +/// search_limit gives the maximum distance to search for a path if the +/// alignment does not actually touch any paths. If 0, the alignment's +/// sequence length is used. +/// +/// If path_filter is set, and it returns false for a path, that path is not +/// used to annotate the read. +void annotate_with_initial_path_positions(const PathPositionHandleGraph& graph, Alignment& aln, size_t search_limit = 0, const std::function* path_filter = nullptr); + +/// Use the graph to annotate an Alignment with the first +/// position it touches on each node it visits in each reference path. Thread +/// safe. If no nodes on any path are visited, searches for a nearby path +/// position to use. +/// +/// search_limit gives the maximum distance to search for a path if the +/// alignment does not actually touch any paths. If 0, the alignment's +/// sequence length is used. +/// +/// If path_filter is set, and it returns false for a path, that path is not +/// used to annotate the read. +void annotate_with_node_path_positions(const PathPositionHandleGraph& graph, Alignment& aln, size_t search_limit = 0, const std::function* path_filter = nullptr); + +/// Use the graph to annotate an Alignment with positions on each reference +/// path. Thread safe. +/// +/// If just_min is set, gives the minimum position on each path. Else, gives +/// all Mapping start positions on each path. If no positions on the path are +/// found, looks for nearby path positions in graph space. Respects +/// search_limit in bp in that case. If search_limit is 0, read length is used. +/// +/// If path_filter is set, and it returns false for a path, that path is not +/// used to annotate the read. 
+void annotate_with_path_positions(const PathPositionHandleGraph& graph, Alignment& aln, bool just_min, size_t search_limit = 0, const std::function* path_filter = nullptr); + +/// Use the graph to annotate Alignments with the first position +/// they touch on each reference path. Thread safe. +/// +/// search_limit gives the maximum distance to search for a path if the +/// alignment does not actually touch any paths. If 0, the alignment's +/// sequence length is used. +/// +/// If path_filter is set, and it returns false for a path, that path is not +/// used to annotate the read. +void annotate_with_initial_path_positions(const PathPositionHandleGraph& graph, vector& aln, size_t search_limit = 0, const std::function* path_filter = nullptr); + + +} + +} diff --git a/src/algorithms/apply_bulk_modifications.cpp b/src/algorithms/apply_bulk_modifications.cpp deleted file mode 100644 index c4c31fb91f3..00000000000 --- a/src/algorithms/apply_bulk_modifications.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#include "apply_bulk_modifications.hpp" - -namespace vg { -namespace algorithms { - -using namespace std; - - unordered_set apply_orientations(MutableHandleGraph* graph, const vector& orientations) { - - // Track what we flip - unordered_set flipped; - for (const auto& handle : orientations) { - if (graph->get_is_reverse(handle)) { - // This needs to be flipped - flipped.insert(graph->get_id(handle)); - // Flip it - graph->apply_orientation(handle); - } - } - return flipped; - } - - void apply_ordering(MutableHandleGraph* graph, const vector& ordering) { - - if (graph->node_size() != ordering.size()) { - cerr << "error:[algorithms] attempting to sort a graph with an incomplete ordering" << endl; - exit(1); - } - - // TODO: we don't check that all nodes are present only once, which might be nice to do - - size_t index = 0; - graph->for_each_handle([&](const handle_t& at_index) { - // For each handle in the graph, along with its index - - // Swap the handle we observe at this index with the handle that we know belongs at this index. - // The loop invariant is that all the handles before index are the correct sorted handles in the right order. - // Note that this ignores orientation - graph->swap_handles(at_index, ordering.at(index)); - - // Now we've written the sorted handles through one more space. - index++; - }); - } -} -} diff --git a/src/algorithms/apply_bulk_modifications.hpp b/src/algorithms/apply_bulk_modifications.hpp deleted file mode 100644 index 10cc08e18b2..00000000000 --- a/src/algorithms/apply_bulk_modifications.hpp +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef VG_ALGORITHMS_APPLY_BULK_MODIFICATIONS_HPP_INCLUDED -#define VG_ALGORITHMS_APPLY_BULK_MODIFICATIONS_HPP_INCLUDED - -/** - * \file apply_bulk_modifications.hpp - * - * Defines utility algorithms for applying mutable graph operations in bulk. - */ - -#include "../handle.hpp" - -#include -#include - -namespace vg { -namespace algorithms { - -using namespace std; - - /// Modifies underlying graph so that any node whose handle is given in the reverse orientation - /// is flipped so that all locally forward orientations match the orientation of the provided handles. - /// Returns a set of IDs for nodes that were flipped. Invalid if vector contains multiple handles to - /// the same node. May change the ordering of the underlying graph. - unordered_set apply_orientations(MutableHandleGraph* graph, const vector& orientations); - - /// Modifies underlying graph so that nodes occur in the same order as in the provided vector.
Vector - /// must contain exactly one handle for each node. - void apply_ordering(MutableHandleGraph* graph, const vector& ordering); - -} -} - -#endif diff --git a/src/algorithms/approx_path_distance.cpp b/src/algorithms/approx_path_distance.cpp new file mode 100644 index 00000000000..6d60a1b505a --- /dev/null +++ b/src/algorithms/approx_path_distance.cpp @@ -0,0 +1,28 @@ +#include "approx_path_distance.hpp" + +namespace vg { +namespace algorithms { + +using namespace std; + +size_t min_approx_path_distance(const PathPositionHandleGraph* graph, const pos_t& pos1, const pos_t& pos2, uint64_t max_search) { + auto nearest1 = nearest_offsets_in_paths(graph, pos1, max_search); + auto nearest2 = nearest_offsets_in_paths(graph, pos2, max_search); + uint64_t min_distance = numeric_limits::max(); + for (auto& p : nearest1) { + auto q = nearest2.find(p.first); + if (q != nearest2.end()) { + // note, doesn't respect orientation + for (auto& o1 : p.second) { + for (auto& o2 : q->second) { + uint64_t x = (o1.first > o2.first ? o1.first - o2.first : o2.first - o1.first); + min_distance = std::min(min_distance, x); + } + } + } + } + return (size_t)min_distance; +} + +} +} diff --git a/src/algorithms/approx_path_distance.hpp b/src/algorithms/approx_path_distance.hpp new file mode 100644 index 00000000000..f4f8f78a882 --- /dev/null +++ b/src/algorithms/approx_path_distance.hpp @@ -0,0 +1,19 @@ +#pragma once + +#include "../handle.hpp" +#include +#include +#include +#include "nearest_offsets_in_paths.hpp" + +namespace vg { +namespace algorithms { + +using namespace std; + +/// use the embedded paths to get an estimated minimum distance between the positions +size_t min_approx_path_distance(const PathPositionHandleGraph* graph, const pos_t& pos1, const pos_t& pos2, uint64_t max_search); + +} + +} diff --git a/src/algorithms/back_translate.cpp b/src/algorithms/back_translate.cpp new file mode 100644 index 00000000000..ef07863d255 --- /dev/null +++ b/src/algorithms/back_translate.cpp @@ -0,0 +1,212 @@ +/** + * \file back_translate.cpp + */ + +#include "back_translate.hpp" +#include "../path.hpp" +#include "../snarls.hpp" + +namespace vg { +namespace algorithms { + +using namespace std; + +/** + * Erase the items at the given indices in the given Protobuf repeated pointer field. + */ +template +static void erase_at(RepeatedPtrField* field, const vector& sorted_indices_to_remove) { + if (sorted_indices_to_remove.empty()) { + // We don't need to do anything. + return; + } + + // This is how many slots ahead we are reading to write the current slot. + // It's also (1+) the cursor in sorted_indices_to_remove + size_t slots_ahead = 1; + for (size_t i = sorted_indices_to_remove[slots_ahead - 1]; i + sorted_indices_to_remove.size() < field->size(); i++) { + // We're at a slot that will need to be filled in the output. + while (slots_ahead < sorted_indices_to_remove.size() && i + slots_ahead == sorted_indices_to_remove[slots_ahead]) { + // Slide copy source ahead until it stops being the next thing to skip. + slots_ahead++; + } + // Overwrite the item here with the item that is supposed to be here. + // We use a swap so we don't have to actually copy any item data, just pointers. + field->SwapElements(i, i + slots_ahead); + } + // Now we've bumped all the unused items to the end so delete them. 
+ field->DeleteSubrange(field->size() - sorted_indices_to_remove.size(), sorted_indices_to_remove.size()); +} + +void back_translate_in_place(const NamedNodeBackTranslation* translation, Path& path) { + // For combining mappings that can combine, we need to track where the previous mapping ended. + nid_t prev_segment_number = numeric_limits::max(); + bool prev_segment_is_reverse = false; + size_t prev_segment_offset = numeric_limits::max(); + // And also what mapping it is or has been glommed into + Mapping* prev_mapping = nullptr; + + // When we glom mappings into other mappings we steal their edits and put + // their indices in this list. + vector mapping_indices_to_remove; + + for (size_t i = 0; i < path.mapping_size(); i++) { + // For each Mapping + Mapping* mapping = path.mutable_mapping(i); + + // Determine its range + oriented_node_range_t source_range(mapping->position().node_id(), mapping->position().is_reverse(), + mapping->position().offset(), from_length(*mapping)); + + // Translate it + vector translated = translation->translate_back(source_range); + + if (translated.size() != 1) { + // TODO: Implement translations that split graph nodes into multiple segments. + throw std::runtime_error("Translated range on node " + to_string(get<0>(source_range)) + + " to " + to_string(translated.size()) + + " named segment ranges, but complex translations like this are not yet implemented"); + } + + auto& translated_range = translated[0]; + if (get<1>(translated_range) != get<1>(source_range)) { + // TODO: Implement translations that flip orientations. + throw std::runtime_error("Translated range on node " + to_string(get<0>(source_range)) + + " ended up on the opposite strand; complex translations like this are not yet implemented"); + } + + // Change over to the named sequence and the offset there. + mapping->mutable_position()->clear_node_id(); + mapping->mutable_position()->set_name(translation->get_back_graph_node_name(get<0>(translated_range))); + mapping->mutable_position()->set_offset(get<2>(translated_range)); + + if (i == 0 || get<0>(translated_range) != prev_segment_number || get<1>(translated_range) != prev_segment_is_reverse || get<2>(translated_range) != prev_segment_offset) { + // We have done a transition that isn't just abutting in a + // segment. We assume anything we want to represent as a + // deletion is already represented as a deletion, so we + // preserve all jumps as jumps. + + // Just move to this part of this segment, and then advance by + // the length of the piece we translated. + prev_segment_number = get<0>(translated_range); + prev_segment_is_reverse = get<1>(translated_range); + prev_segment_offset = get<2>(translated_range) + get<3>(translated_range); + + // We are now the prev mapping to glom into + prev_mapping = mapping; + } else { + // We abut the previous Mapping. So we should be able to glom + // all our edits into it, and then stop existing. + for (size_t j = 0; j < mapping->edit_size(); j++) { + // Glom each edit into the previous mapping + + if (j == 0 && prev_mapping->edit_size() > 0 && edits_are_compatible(prev_mapping->edit(prev_mapping->edit_size() - 1), mapping->edit(j))) { + // We assume our edits are all merged up if they can be. So + // we only have to consider merging the first of our edits + // into the last edit of the previous mapping. Turns out + // they can merge. 
+ merge_edits_in_place(*prev_mapping->mutable_edit(prev_mapping->edit_size() - 1), mapping->edit(j)); + } else { + // This edit has to become a new edit in the previous mapping. + *prev_mapping->add_edit() = std::move(*mapping->mutable_edit(j)); + } + } + + // Now we need to get rid of this mapping. + // Leave prev_mapping pointing where it is. + // And remember we don't want this mapping anymore. + mapping_indices_to_remove.push_back(i); + + // Advance along the segment + prev_segment_offset += get<3>(translated_range); + } + } + + // We need to batch-erase all these indices. + erase_at(path.mutable_mapping(), mapping_indices_to_remove); +} + + + +void back_translate_in_place(const NamedNodeBackTranslation* translation, Snarl& snarl) { + // To translate a snarl, you translate its bounding visits + back_translate_in_place(translation, *snarl.mutable_start()); + back_translate_in_place(translation, *snarl.mutable_end()); +} + +void back_translate_in_place(const NamedNodeBackTranslation* translation, SnarlTraversal& traversal) { + vector visit_indices_to_remove; + // We need to keep track of the last visit actually kept, for coalescing + // over dropped snarls that are the artifacts of segment splitting. + size_t last_kept_visit = numeric_limits::max(); + for (size_t i = 0; i < traversal.visit_size(); i++) { + // Translate every visit + Visit& here = *(traversal.mutable_visit(i)); + back_translate_in_place(translation, here); + if (here.has_snarl()) { + // We're a visit to a snarl, so we should be elided if we're a + // trivial snarl made by breaking a segment. + // TODO: We don't account for edits to the graph to insert in the + // middle of segments. + if (here.snarl().start() == here.snarl().end()) { + // This visit must be to a trivial snarl created by breaking the segment, so elide it. + visit_indices_to_remove.push_back(i); + continue; + } + } else { + // We're a visit to a segment, so we should be elided if we're just + // the same visit to a segment as the last thing that isn't getting + // dropped. + if (last_kept_visit != numeric_limits::max()) { + const Visit& prev = traversal.visit(last_kept_visit); + if (here == prev) { + // These visits can coalesce because they are to the same + // segment and orientation. + visit_indices_to_remove.push_back(i); + continue; + } + } + } + // If we aren't elided, we are a kept visit. + last_kept_visit = i; + } + + // Get rid of visits that coalesced away + erase_at(traversal.mutable_visit(), visit_indices_to_remove); +} + +void back_translate_in_place(const NamedNodeBackTranslation* translation, Visit& visit) { + if (visit.has_snarl()) { + // A visit with a snarl is translated by translating its snarl + back_translate_in_place(translation, *visit.mutable_snarl()); + } else { + // Otherwise, translate its node ID to a segment name. Use made-up boundaries on the node. + // TODO: Can we have an easy way to say "whole node"? + oriented_node_range_t source_range(visit.node_id(), visit.backward(), 0, 1); + + // Translate it + vector translated = translation->translate_back(source_range); + + if (translated.size() != 1) { + // TODO: Implement translations that split graph nodes into multiple segments. 
+ throw std::runtime_error("Translated range on node " + to_string(get<0>(source_range)) + + " to " + to_string(translated.size()) + + " named segment ranges, but complex translations like this are not yet implemented"); + } + + auto& translated_range = translated[0]; + if (get<1>(translated_range) != get<1>(source_range)) { + // TODO: Implement translations that flip orientations. + throw std::runtime_error("Translated range on node " + to_string(get<0>(source_range)) + + " ended up on the opposite strand; complex translations like this are not yet implemented"); + } + + // Save the change to the visit. + visit.clear_node_id(); + visit.set_name(translation->get_back_graph_node_name(get<0>(translated_range))); + // Ignore the offset and length info. + } +} + +} +} diff --git a/src/algorithms/back_translate.hpp b/src/algorithms/back_translate.hpp new file mode 100644 index 00000000000..ce03c51213a --- /dev/null +++ b/src/algorithms/back_translate.hpp @@ -0,0 +1,47 @@ +#ifndef VG_ALGORITHMS_BACK_TRANSLATE_HPP_INCLUDED +#define VG_ALGORITHMS_BACK_TRANSLATE_HPP_INCLUDED + +/** + * \file back_translate.hpp + * + * Defines an algorithm for translating an Alignment from node ID space to named segment space. + */ + +#include "../handle.hpp" +#include + +namespace vg { +namespace algorithms { +using namespace std; + +/** + * Translate the given Path in place from node ID space to named segment space. + */ +void back_translate_in_place(const NamedNodeBackTranslation* translation, Path& path); + +/** + * Translate the given Snarl in place from node ID space to named segment space. + */ +void back_translate_in_place(const NamedNodeBackTranslation* translation, Snarl& snarl); + +/** + * Translate the given SnarlTraversal in place from node ID space to named + * segment space. Visits that end up being to snarls where both boundaries are + * from the same orientation of the same segment will be removed. Multiple + * visits in a row to the same orientation of the same segment will be elided. + * + * TODO: Work out a way to preserve traversals around cycles while not + * repeating ourselves for visits to diffrent pieces of the same chopped + * segment. + */ +void back_translate_in_place(const NamedNodeBackTranslation* translation, SnarlTraversal& traversal); + +/** + * Translate the given Visit in place from node ID space to named segment space. + */ +void back_translate_in_place(const NamedNodeBackTranslation* translation, Visit& visit); + +} +} + +#endif diff --git a/src/algorithms/chain_items.cpp b/src/algorithms/chain_items.cpp new file mode 100644 index 00000000000..fd1f90b8d96 --- /dev/null +++ b/src/algorithms/chain_items.cpp @@ -0,0 +1,481 @@ +/** + * \file chain_items.cpp + * Non-template function implementations for chaining pieces of a read-to-graph alignment. 
+ */ + + +#include "chain_items.hpp" + +#include + +//#define debug_chaining + +namespace vg { +namespace algorithms { + +using namespace std; + +ostream& operator<<(ostream& out, const Anchor& anchor) { + return out << "{R:" << anchor.read_start() << "=G:" << anchor.graph_start() << "*" << anchor.length() << "}"; +} + +ostream& operator<<(ostream& out, const TracedScore& value) { + if (value.source == TracedScore::nowhere()) { + return out << value.score << " from nowhere"; + } + return out << value.score << " from #" << value.source; +} + + +void TracedScore::max_in(const vector& options, size_t option_number) { + auto& option = options[option_number]; + if (option.score > this->score || this->source == nowhere()) { + // This is the new winner. + this->score = option.score; + this->source = option_number; + } +} + +TracedScore TracedScore::score_from(const vector& options, size_t option_number) { + TracedScore got = options[option_number]; + got.source = option_number; + return got; +} + +TracedScore TracedScore::add_points(int adjustment) const { + return {this->score + adjustment, this->source}; +} + +void sort_and_shadow(const std::vector& items, std::vector& indexes) { + + // Sort the indexes by read start ascending, and read end descending + std::sort(indexes.begin(), indexes.end(), [&](const size_t& a, const size_t& b) { + auto& a_item = items[a]; + auto& b_item = items[b]; + auto a_start = a_item.read_start(); + auto b_start = b_item.read_start(); + // a should be first if it starts earlier, or starts atthe same place and ends later. + return (a_start < b_start || (a_start == b_start && a_item.read_end() > b_item.read_end())); + }); + + // Keep a collection of the diagonals that are already represented, + // and the read end position of the latest-ending item on those pairs that + // we have taken. A diagonal is defined as a graph node ID, a graph strand, + // and the difference between the graph offset and the read position. So we + // can represent them with pos_t, and subtract the read position out of the + // stored offset to make them. + std::unordered_map diagonal_progress; + + // Scan through and make a new collection of indexes, keeping the first on + // any pair of diagonals, which will thus be the one with the earliest + // start, and within those the latest end. Since we need to keep items + // which partially overlap but don't contain each other, we also keep an + // item if it is the new latest-ending thing we've seen for a pair of + // diagonals. + std::vector kept_indexes; + kept_indexes.reserve(indexes.size()); + for (auto i : indexes) { + // For each item we might keep + auto& item = items[i]; + + // Prepare the key of the diagonals it visits + pos_t diagonal = item.graph_start(); + // Make the offsets store a difference between graph and read offset so + // they really represent diagonals. + get_offset(diagonal) -= item.read_start(); + + auto& furthest_read_end = diagonal_progress[diagonal]; + if (furthest_read_end < item.read_end()) { + // This is the first, or latest-ending, item seen on this diagonal. + // If there was an earlier-ending item taken, we know it started before this one, because of iteration order. + // So take this item. 
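            // (For example, two anchors on the same diagonal spanning R0-R10 and
            // R2-R8: the first is kept and sets furthest_read_end to 10, so the
            // second is shadowed and dropped; an anchor spanning R2-R12 would be
            // kept instead, since it pushes furthest_read_end further.)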
+ kept_indexes.push_back(i); + // And record that we got out this far + furthest_read_end = item.read_end(); +#ifdef debug_chaining + std::cerr << "Keep " << item << " which gets us to R" << furthest_read_end << " on diagonal " << diagonal << std::endl; +#endif + } else { +#ifdef debug_chaining + std::cerr << "Discard " << item << " as shadowed because we already got to R" << furthest_read_end << " on diagonal " << diagonal << std::endl; +#endif + } + } + + // Replace the indexes with the sorted and deduplicated ones. + indexes = std::move(kept_indexes); +} + +void sort_and_shadow(std::vector& items) { + // Use the index-based implementation and then apply those indexes + std::vector indexes = range_vector(items.size()); + sort_and_shadow(items, indexes); + std::vector kept_items; + kept_items.reserve(indexes.size()); + for (auto& index : indexes) { + kept_items.emplace_back(std::move(items[index])); + } + items = std::move(kept_items); +} + +TracedScore chain_items_dp(vector& best_chain_score, + const VectorView& to_chain, + const SnarlDistanceIndex& distance_index, + const HandleGraph& graph, + int gap_open, + int gap_extension, + size_t max_lookback_bases, + size_t min_lookback_items, + size_t lookback_item_hard_cap, + size_t initial_lookback_threshold, + double lookback_scale_factor, + double min_good_transition_score_per_base, + int item_bonus, + size_t max_indel_bases) { + + DiagramExplainer diagram; + diagram.add_globals({{"rankdir", "LR"}}); + +#ifdef debug_chaining + cerr << "Chaining group of " << to_chain.size() << " items" << endl; +#endif + + // We want to consider all the important transitions in the graph of what + // items can come before what other items. We aren't allowing any + // transitions between items that overlap in the read. We're going through + // the destination items in order by read start, so we should also keep a + // list of them in order by read end, and sweep a cursor over that, so we + // always know the fisrt item that overlaps with or passes the current + // destination item, in the read. Then when we look for possible + // predecessors of the destination item, we can start just before there and + // look left. + vector read_end_order = sort_permutation(to_chain.begin(), to_chain.end(), [&](const Anchor& a, const Anchor& b) { + return a.read_end() < b.read_end(); + }); + // We use first overlapping instead of last non-overlapping because we can + // just initialize first overlapping at the beginning and be right. + auto first_overlapping_it = read_end_order.begin(); + + // Make our DP table big enough + best_chain_score.resize(to_chain.size(), TracedScore::unset()); + + // What's the winner so far? + TracedScore best_score = TracedScore::unset(); + + for (size_t i = 0; i < to_chain.size(); i++) { + // For each item + auto& here = to_chain[i]; + + while (to_chain[*first_overlapping_it].read_end() <= here.read_start()) { + // Scan ahead through non-overlapping items that past-end too soon, + // to the first overlapping item that ends earliest. + // Ordering physics *should* constrain the iterator to not run off the end. + ++first_overlapping_it; + assert(first_overlapping_it != read_end_order.end()); + } + + // How many points is it worth to collect? + auto item_points = here.score() + item_bonus; + + std::string here_gvnode = "i" + std::to_string(i); + + // If we come from nowhere, we get those points. 
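        // In other words the recurrence is
        //   best_chain_score[i] = max(item_points,
        //                             best_chain_score[j] + jump_points + item_points)
        // over all allowed predecessors j, where item_points already includes the
        // per-item bonus. The "start a new chain here" case is handled by the max
        // just below, and the predecessor transitions are evaluated in the
        // lookback loop that follows.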
+ best_chain_score[i] = std::max(best_chain_score[i], {item_points, TracedScore::nowhere()}); + +#ifdef debug_chaining + cerr << "Look at transitions to #" << i + << " at " << here; + cerr << endl; +#endif + +#ifdef debug_chaining + cerr << "\tFirst item overlapping #" << i << " beginning at " << here.read_start() << " is #" << *first_overlapping_it << " past-ending at " << to_chain[*first_overlapping_it].read_end() << " so start before there." << std::endl; +#endif + + // Set up lookback control algorithm. + // Until we have looked at a certain number of items, we keep going + // even if we meet other stopping conditions. + size_t items_considered = 0; + // If we are looking back further than this + size_t lookback_threshold = initial_lookback_threshold; + // And a gooid score has been found, stop + bool good_score_found = false; + // A good score will be positive and have a transition component that + // looks good relative to how far we are looking back. The further we + // look back the lower our transition score standards get, so remember + // the best one we have seen so far in case the standard goes below it. + int best_transition_found = std::numeric_limits::min(); + + // Start considering predecessors for this item. + auto predecessor_index_it = first_overlapping_it; + while (predecessor_index_it != read_end_order.begin()) { + --predecessor_index_it; + + // How many items have we considered before this one? + size_t item_number = items_considered++; + + // For each source that ended before here started, in reverse order by end position... + auto& source = to_chain[*predecessor_index_it]; + +#ifdef debug_chaining + cerr << "\tConsider transition from #" << *predecessor_index_it << ": " << source << endl; +#endif + + // How far do we go in the read? + size_t read_distance = get_read_distance(source, here); + + if (item_number > lookback_item_hard_cap) { + // This would be too many +#ifdef debug_chaining + cerr << "\t\tDisregard due to hitting lookback item hard cap" << endl; +#endif + break; + } + if (item_number >= min_lookback_items) { + // We have looked at enough predecessors that we might consider stopping. + // See if we should look back this far. + if (read_distance > max_lookback_bases) { + // This is further in the read than the real hard limit. + break; + } else if (read_distance > lookback_threshold && good_score_found) { + // We already found something good enough. + break; + } + } + if (read_distance > lookback_threshold && !good_score_found) { + // We still haven't found anything good, so raise the threshold. + lookback_threshold *= lookback_scale_factor; + } + + // Now it's safe to make a distance query +#ifdef debug_chaining + cerr << "\t\tCome from score " << best_chain_score[*predecessor_index_it] + << " across " << source << " to " << here << endl; +#endif + + // We will actually evaluate the source. + + // How far do we go in the graph? + size_t graph_distance = get_graph_distance(source, here, distance_index, graph); + + // How much does it pay (+) or cost (-) to make the jump from there + // to here? + // Don't allow the transition if it seems like we're going the long + // way around an inversion and needing a huge indel. + int jump_points; + + if (read_distance == numeric_limits::max()) { + // Overlap in read, so not allowed. 
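                // (A jump_points of min() marks the transition as forbidden; it is
                // checked against that sentinel below before any score is accumulated.)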
+ jump_points = std::numeric_limits::min(); + } else if (graph_distance == numeric_limits::max()) { + // No graph connection + jump_points = std::numeric_limits::min(); + } else { + // Decide how much length changed + size_t indel_length = (read_distance > graph_distance) ? read_distance - graph_distance : graph_distance - read_distance; + + if (indel_length > max_indel_bases) { + // Don't allow an indel this long + jump_points = std::numeric_limits::min(); + } else { + // Then charge for that indel + jump_points = score_gap(indel_length, gap_open, gap_extension); + } + } + + // And how much do we end up with overall coming from there. + int achieved_score; + + if (jump_points != numeric_limits::min()) { + // Get the score we are coming from + TracedScore source_score = TracedScore::score_from(best_chain_score, *predecessor_index_it); + + // And the score with the transition and the points from the item + TracedScore from_source_score = source_score.add_points(jump_points + item_points); + + // Remember that we could make this jump + best_chain_score[i] = std::max(best_chain_score[i], + from_source_score); + +#ifdef debug_chaining + cerr << "\t\tWe can reach #" << i << " with " << source_score << " + " << jump_points << " from transition + " << item_points << " from item = " << from_source_score << endl; +#endif + if (from_source_score.score > 0) { + // Only explain edges that were actual candidates since we + // won't let local score go negative + + std::string source_gvnode = "i" + std::to_string(*predecessor_index_it); + // Suggest that we have an edge, where the edges that are the best routes here are the most likely to actually show up. + diagram.suggest_edge(source_gvnode, here_gvnode, here_gvnode, from_source_score.score, { + {"label", std::to_string(jump_points)}, + {"weight", std::to_string(std::max(1, from_source_score.score))} + }); + } + + achieved_score = from_source_score.score; + } else { +#ifdef debug_chaining + cerr << "\t\tTransition is impossible." << endl; +#endif + achieved_score = std::numeric_limits::min(); + } + + // Note that we checked out this transition and saw the observed scores and distances. + best_transition_found = std::max(best_transition_found, jump_points); + if (achieved_score > 0 && best_transition_found >= min_good_transition_score_per_base * std::max(read_distance, graph_distance)) { + // We found a jump that looks plausible given how far we have searched, so we can stop searching way past here. + good_score_found = true; + } + } + +#ifdef debug_chaining + cerr << "\tBest way to reach #" << i << " is " << best_chain_score[i] << endl; +#endif + + std::stringstream label_stream; + label_stream << "#" << i << " " << here << " = " << item_points << "/" << best_chain_score[i].score; + diagram.add_node(here_gvnode, { + {"label", label_stream.str()} + }); + auto graph_start = here.graph_start(); + std::string graph_gvnode = "n" + std::to_string(id(graph_start)) + (is_rev(graph_start) ? "r" : "f"); + diagram.ensure_node(graph_gvnode, { + {"label", std::to_string(id(graph_start)) + (is_rev(graph_start) ? "-" : "+")}, + {"shape", "box"} + }); + // Show the item as connected to its source graph node + diagram.add_edge(here_gvnode, graph_gvnode, {{"color", "gray"}}); + // Make the next graph node along the same strand + std::string graph_gvnode2 = "n" + std::to_string(id(graph_start) + (is_rev(graph_start) ? -1 : 1)) + (is_rev(graph_start) ? "r" : "f"); + diagram.ensure_node(graph_gvnode2, { + {"label", std::to_string(id(graph_start) + (is_rev(graph_start) ? 
-1 : 1)) + (is_rev(graph_start) ? "-" : "+")}, + {"shape", "box"} + }); + // And show them as connected. + diagram.ensure_edge(graph_gvnode, graph_gvnode2, {{"color", "gray"}}); + + // See if this is the best overall + best_score.max_in(best_chain_score, i); + +#ifdef debug_chaining + cerr << "\tBest chain end so far: " << best_score << endl; +#endif + + } + + return best_score; +} + +vector chain_items_traceback(const vector& best_chain_score, + const VectorView& to_chain, + const TracedScore& best_past_ending_score_ever) { + + // Now we need to trace back. + vector traceback; + size_t here = best_past_ending_score_ever.source; + if (here != TracedScore::nowhere()) { +#ifdef debug_chaining + cerr << "Chain ends at #" << here << " " << to_chain[here] + << " with score " << best_past_ending_score_ever << endl; +#endif + while(here != TracedScore::nowhere()) { + traceback.push_back(here); +#ifdef debug_chaining + cerr << "Which gets score " << best_chain_score[here] << endl; +#endif + here = best_chain_score[here].source; +#ifdef debug_chaining + if (here != TracedScore::nowhere()) { + cerr << "And comes after #" << here + << " " << to_chain[here] << endl; + } else { + cerr << "And is first" << endl; + } +#endif + } + // Flip it around front-ways + std::reverse(traceback.begin(), traceback.end()); + } + +#ifdef debug_chaining + cerr << "Best score of chain overall: " << best_past_ending_score_ever << endl; +#endif + + return traceback; +} + +pair> find_best_chain(const VectorView& to_chain, + const SnarlDistanceIndex& distance_index, + const HandleGraph& graph, + int gap_open, + int gap_extension, + size_t max_lookback_bases, + size_t min_lookback_items, + size_t lookback_item_hard_cap, + size_t initial_lookback_threshold, + double lookback_scale_factor, + double min_good_transition_score_per_base, + int item_bonus, + size_t max_indel_bases) { + + if (to_chain.empty()) { + return std::make_pair(0, vector()); + } else { + + // We actually need to do DP + vector best_chain_score; + TracedScore best_past_ending_score_ever = chain_items_dp(best_chain_score, + to_chain, + distance_index, + graph, + gap_open, + gap_extension, + max_lookback_bases, + min_lookback_items, + lookback_item_hard_cap, + initial_lookback_threshold, + lookback_scale_factor, + min_good_transition_score_per_base, + item_bonus, + max_indel_bases); + // Then do the traceback and pair it up with the score. + return std::make_pair( + best_past_ending_score_ever.score, + chain_items_traceback(best_chain_score, to_chain, best_past_ending_score_ever)); + } +} + +int score_best_chain(const VectorView& to_chain, const SnarlDistanceIndex& distance_index, const HandleGraph& graph, int gap_open, int gap_extension) { + + if (to_chain.empty()) { + return 0; + } else { + // Do the DP but without the traceback. + vector best_chain_score; + TracedScore winner = algorithms::chain_items_dp(best_chain_score, to_chain, distance_index, graph, gap_open, gap_extension); + return winner.score; + } +} + +size_t get_graph_distance(const Anchor& from, const Anchor& to, const SnarlDistanceIndex& distance_index, const HandleGraph& graph) { + // TODO: hide something in the Anchors so we can use the minimizer cache information + // For now just measure between the graph positions. 
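    // The distance index reports the minimum distance from just past the end of
    // the `from` anchor's match to the start of the `to` anchor's match, and
    // yields std::numeric_limits<size_t>::max() when `to` is unreachable, which
    // the chaining DP treats as an impossible transition.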
+ + auto from_pos = from.graph_end(); + auto& to_pos = to.graph_start(); + + return distance_index.minimum_distance( + id(from_pos), is_rev(from_pos), offset(from_pos), + id(to_pos), is_rev(to_pos), offset(to_pos), + false, &graph); +} + +size_t get_read_distance(const Anchor& from, const Anchor& to) { + if (to.read_start() < from.read_end()) { + return std::numeric_limits::max(); + } + return to.read_start() - from.read_end(); +} + +} +} diff --git a/src/algorithms/chain_items.hpp b/src/algorithms/chain_items.hpp new file mode 100644 index 00000000000..54db7528338 --- /dev/null +++ b/src/algorithms/chain_items.hpp @@ -0,0 +1,280 @@ +#ifndef VG_ALGORITHMS_CHAIN_ITEMS_HPP_INCLUDED +#define VG_ALGORITHMS_CHAIN_ITEMS_HPP_INCLUDED + +/** + * \file + * Algorithms for chaining subalignments into larger alignments. + * + * To use these algorithms, decide on the type (Anchor) you want to chain up. + * + * Then, make a ChainingSpace, or a ChainingSpace if your + * Items need to be interpreted in the context of some source object (like a + * seed hit needs to be interpreted in the context of its source minimizer). + * + * Then, make a dynamic programming table: vector. + * + * Then, call chain_items_dp() to fill in the dynamic programming table and get + * the score of the best chain. + * + * You can use chain_items_traceback() to get a traceback of the chain's items + * in order. + * + * Helper entry points are find_best_chain() and score_best_chain() which set + * up the DP for you and do the traceback if appropriate. + */ + +#include "extract_containing_graph.hpp" + +#include "../gbwt_extender.hpp" +#include "../snarl_seed_clusterer.hpp" +#include "../handle.hpp" +#include "../explainer.hpp" +#include "../utility.hpp" + +#include + +namespace vg { +namespace algorithms { + +using namespace std; + +// Make sure all of vg's print operators are available. +using vg::operator<<; + +//#define debug_chaining + +/** + * Represents a piece fo a graph node matching to a piece of a read. Can be + * chained together. + */ +class Anchor { +public: + // Set up with accessors in case we want to stop copying stuff so much later. + + // Base API: + + /// Get the start position in the read of this anchor's match. + inline size_t read_start() const { + return start; + } + /// Get the start position in the graph of this anchor's match + inline const pos_t& graph_start() const { + return pos; + } + /// Get the length of this anchor's match + inline size_t length() const { + return size; + } + /// Get the alignment score of the anchor + inline int score() const { + return points; + } + + // Other API implemented on top of this + + /// Get the end position in the read of this anchor's match + inline size_t read_end() const { + return read_start() + length(); + } + + /// Get the end position in the graph of this anchor's match + inline pos_t graph_end() const { + pos_t p = graph_start(); + get_offset(p) += length(); + return p; + } + + // Construction + + /// Compose a read start position, graph start position, and match length into an Anchor + inline Anchor(size_t read_start, const pos_t& graph_start, size_t length, int score) : start(read_start), size(length), pos(graph_start), points(score) { + // Nothing to do! 
+ } + + // Act like data + Anchor() = default; + Anchor(const Anchor& other) = default; + Anchor& operator=(const Anchor& other) = default; + Anchor(Anchor&& other) = default; + Anchor& operator=(Anchor&& other) = default; + +protected: + size_t start; + size_t size; + pos_t pos; + int points; +}; + +/// Explain an Anchor to the given stream +ostream& operator<<(ostream& out, const Anchor& anchor); + +// For doing scores with backtracing, we use this type, which is a +// score and a number for the place we came from to get it. +class TracedScore { +public: + /// What is the sentinel for an empty provenance? + /// Use a function instead of a constant because that's easier when we're just a header. + inline static size_t nowhere() { + return numeric_limits::max(); + } + + /// What's the default value for an empty table cell? + /// Use a function instead of a constant because that's easier when we're just a header. + inline static TracedScore unset() { + return {0, nowhere()}; + } + + /// Max in a score from a DP table. If it wins, record provenance. + void max_in(const vector& options, size_t option_number); + + /// Get a score from a table and record provenance in it. + static TracedScore score_from(const vector& options, size_t option_number); + + /// Add (or remove) points along a route to somewhere. Return a modified copy. + TracedScore add_points(int adjustment) const; + + /// Compare for equality + inline bool operator==(const TracedScore& other) const { + return score == other.score && source == other.source; + } + + /// Compare for inequality + inline bool operator!=(const TracedScore& other) const { + return !(*this == other); + } + + /// Compare for less-than + inline bool operator<(const TracedScore& other) const { + return score < other.score || (score == other.score && source < other.source); + } + + /// Compare for greater-than + inline bool operator>(const TracedScore& other) const { + return score > other.score || (score == other.score && source > other.source); + } + + // Number of points + int score; + // Index of source score among possibilities/traceback pointer + size_t source; +}; + +} + +} + +namespace std { + /// Allow maxing TracedScore + inline vg::algorithms::TracedScore max(const vg::algorithms::TracedScore& a, const vg::algorithms::TracedScore& b) { + return a > b ? a : b; + } +} + +namespace vg { + +namespace algorithms { + +using namespace std; + +// Make sure all of vg's print operators are available. +using vg::operator<<; + +/// Print operator +ostream& operator<<(ostream& out, const TracedScore& value); + +/** + * Get rid of items that are shadowed or contained by (or are identical to) others. + * + * Erases items that didn't survive from indexes, and sorts them by read start + * position. + */ +void sort_and_shadow(const std::vector& items, std::vector& indexes); + +/** + * Get rid of items that are shadowed or contained by (or are identical to) others. + * + * Erases items that didn't survive from items, and sorts them by read start + * position. + */ +void sort_and_shadow(std::vector& items); + +/** + * Fill in the given DP table for the best chain score ending with each + * item. Returns the best observed score overall from that table, + * with provenance to its location in the table, if tracked in the type. + * Assumes some items exist. + * + * Input items must be sorted by start position in the read. + * + * Takes the given per-item bonus for each item collected. 
+ * + * Uses a finite lookback in items and in read bases when checking where we can + * come from to reach an item. Also, once a given number of good-looking + * predecessor items have been found, stop looking back. + * + * Limits transitions to those involving indels of the given size or less, to + * avoid very bad transitions. + */ +TracedScore chain_items_dp(vector& best_chain_score, + const VectorView& to_chain, + const SnarlDistanceIndex& distance_index, + const HandleGraph& graph, + int gap_open, + int gap_extension, + size_t max_lookback_bases = 150, + size_t min_lookback_items = 0, + size_t lookback_item_hard_cap = 100, + size_t initial_lookback_threshold = 10, + double lookback_scale_factor = 2.0, + double min_good_transition_score_per_base = -0.1, + int item_bonus = 0, + size_t max_indel_bases = 100); + +/** + * Trace back through in the given DP table from the best chain score. + */ +vector chain_items_traceback(const vector& best_chain_score, + const VectorView& to_chain, + const TracedScore& best_past_ending_score_ever); + +/** + * Chain up the given group of items. Determines the best score and + * traceback that can be obtained by chaining items together. + * + * Input items must be sorted by start position in the read. + * + * Returns the score and the list of indexes of items visited to achieve + * that score, in order. + */ +pair> find_best_chain(const VectorView& to_chain, + const SnarlDistanceIndex& distance_index, + const HandleGraph& graph, + int gap_open, + int gap_extension, + size_t max_lookback_bases = 150, + size_t min_lookback_items = 0, + size_t lookback_item_hard_cap = 100, + size_t initial_lookback_threshold = 10, + double lookback_scale_factor = 2.0, + double min_good_transition_score_per_base = -0.1, + int item_bonus = 0, + size_t max_indel_bases = 100); + +/** + * Score the given group of items. Determines the best score that can be + * obtained by chaining items together. + * + * Input items must be sorted by start position in the read. + */ +int score_best_chain(const VectorView& to_chain, const SnarlDistanceIndex& distance_index, const HandleGraph& graph, int gap_open, int gap_extension); + +/// Get distance in the graph, or std::numeric_limits::max() if unreachable. +size_t get_graph_distance(const Anchor& from, const Anchor& to, const SnarlDistanceIndex& distance_index, const HandleGraph& graph); + +/// Get distance in the read, or std::numeric_limits::max() if unreachable. 
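/// Two anchors are unreachable in the read when the second starts before the
/// first ends, i.e. when they overlap in the read.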
+size_t get_read_distance(const Anchor& from, const Anchor& to); + +} +} + +#endif diff --git a/src/algorithms/component.cpp b/src/algorithms/component.cpp new file mode 100644 index 00000000000..97dba3e6638 --- /dev/null +++ b/src/algorithms/component.cpp @@ -0,0 +1,520 @@ +/** \file + * Implements per-component algorithms + */ + +#include +#include +#include +#include +#include + +#include "../cluster.hpp" +#include "component.hpp" +#include "utility.hpp" +#include "sdsl/bit_vectors.hpp" +#include "sdsl/int_vector.hpp" + +//#define debug_component_paths +//#define debug_parallel_component_paths + +namespace vg { +namespace algorithms { + +using namespace std; + +// internal generic BFS implementation +void traverse_components(const HandleGraph& graph, + function& on_new_comp, + function& on_node) { + // we will use this as a bit-flag for when a node id has already been + // traverseed + id_t min_id = graph.min_node_id(); + sdsl::bit_vector enqueued(graph.max_node_id() - min_id + 1, 0); + + graph.for_each_handle([&](const handle_t& handle) { + + if (enqueued[graph.get_id(handle) - min_id]) { + // we've already traversed this node + return; + } + +#ifdef debug_component_paths + cerr << "starting new component on node " << graph.get_id(handle) << endl; +#endif + + // a node that hasn't been traversed means a new component + on_new_comp(); + + // init a BFS queue + deque queue; + + // function to call on each subsequent handle we navigate to + function on_node_and_enqueue = [&](const handle_t& here) { + +#ifdef debug_component_paths + cerr << "traverse to handle on node " << graph.get_id(here) << endl; +#endif + + // don't queue up the same node twice + if (!enqueued[graph.get_id(here) - min_id]) { + + // add the paths of the new node + on_node(here); + + // and add it to the queue + queue.push_back(here); + enqueued[graph.get_id(here) - min_id] = 1; + } + return true; + }; + + // queue up the first node + on_node_and_enqueue(handle); + + // do the BFS traversal + while (!queue.empty()) { + handle_t handle = queue.front(); + queue.pop_front(); + + // traverse in both directions + graph.follow_edges(handle, false, on_node_and_enqueue); + graph.follow_edges(handle, true, on_node_and_enqueue); + } + }); +} + +size_t num_components(const HandleGraph& graph) { + size_t num_comps = 0; + function on_new_comp = [&]() { + ++num_comps; + }; + function on_node = [&](const handle_t here) { }; + traverse_components(graph, on_new_comp, on_node); + return num_comps; +} + + +vector component_sizes(const HandleGraph& graph) { + + vector comp_sizes; + + // add a new size for a new component + function on_new_comp = [&]() { + comp_sizes.push_back(0); + }; + + function on_node = [&](const handle_t here) { + comp_sizes.back()++; + }; + + traverse_components(graph, on_new_comp, on_node); + + return comp_sizes; +} + + +vector> component_paths(const PathHandleGraph& graph) { + + vector> component_path_sets; + + // add a new set for a new component + function on_new_comp = [&]() { + component_path_sets.emplace_back(); + }; + + // add the paths of the new node + function on_node = [&](const handle_t here) { + graph.for_each_step_on_handle(here, [&](const step_handle_t& step) { + component_path_sets.back().insert(graph.get_path_handle_of_step(step)); + +#ifdef debug_component_paths + cerr << "found path " << graph.get_path_name(graph.get_path_handle_of_step(step)) << endl; +#endif + }); + }; + + traverse_components(graph, on_new_comp, on_node); + + return component_path_sets; +} + +template +void 
reallocate_atomic_int_vector(vector>*& vec1, vector>*& vec2) { + + if (!vec1) { + // the vector has already been reallocated + return; + } + + // allocate the second vector + vec2 = new vector>(vec1->size()); + + // TODO: these threads will actually be fighting for processor time + // with the spin-locks holding the main threads busy while they wait... + + // parallel copy in contiguous blocks + int thread_count = get_thread_count(); + static const int64_t block_size = 4096; + atomic next(0); + vector workers; + for (int i = 0; i < thread_count; ++i) { + workers.emplace_back([&]() { + while (true) { + int64_t begin = block_size * (next++); + if (begin >= vec2->size()) { + // we're past the end of the vector + break; + } + for (int64_t j = begin, n = min(begin + block_size, vec2->size()); j < n; ++j) { + (*vec2)[j].store((*vec1)[j].load()); + } + } + }); + } + + // barrier sync + for (auto& worker : workers) { + worker.join(); + } + + // free the first vector + delete vec1; + vec1 = nullptr; +}; + +vector> component_paths_parallel(const PathHandleGraph& graph) { + +#ifdef debug_parallel_component_paths + cerr << "computing component paths in parallel" << endl; +#endif + + // get all paths + vector paths; + paths.reserve(graph.get_path_count()); + graph.for_each_path_handle([&](const path_handle_t& path) { + paths.emplace_back(path); + }); + + // sort in descending order by step count + stable_sort(paths.begin(), paths.end(), [&](path_handle_t a, path_handle_t b) { + return graph.get_step_count(a) > graph.get_step_count(b); + }); + + int thread_count = get_thread_count(); + + // the number of threads that haven't exited + atomic threads_active(thread_count); + + // a system that lets one thread freeze the others at checkpoints while it does + // some job + atomic frozen(0); + atomic num_frozen(0); + + // checkpoint to wait if any other thread wants us to freeze + auto check_freeze = [&]() { + if (frozen.load()) { + ++num_frozen; + while (frozen.load()) { + // spin lock + } + --num_frozen; + } + }; + // wait until all other threads have reached a freeze check point + // and then execute a function + auto freeze_and_execute = [&](const function& exec) { + // continue trying to freeze until we actually get to be the thread + // the freezes the other threads + bool was_frozen = true; + while (was_frozen) { + was_frozen = frozen.fetch_or(1); + if (was_frozen) { + // we weren't the thread who switched the freeze on, freeze + // ourselves and wait + check_freeze(); + } + else { + while (num_frozen.load() < threads_active.load() - 1) { + // spin lock waiting for the other threads to reach a checkpoint + } + // execute the function + exec(); + // unfreeze the rest of the threads + frozen.fetch_and(0); + // leave the loop + } + } + }; + + // we'll be using the ID space as vector indices, calculat evalues we'll use for that + nid_t min_id = graph.min_node_id(); + nid_t id_range = graph.max_node_id() - min_id + 1; + + // we'll try to accomplish the job with the minimum int size possible to + // keep the memory use down + // note: this has to be done on the heap because the deleted copy assignment + // and copy construction for atomic ints means vectors can never be resized + vector>* id_vec_8 = new vector>(id_range); + vector>* id_vec_16 = nullptr; + vector>* id_vec_32 = nullptr; + // in parallel initialize with sentinels, which will be replaced by search IDs + static const size_t block_size = 4096; + atomic block_idx(0); + vector initializers; + for (int i = 0; i < thread_count; ++i) { + 
initializers.emplace_back([&]() { + while (true) { + size_t begin = block_size * (block_idx++); + if (begin >= id_vec_8->size()) { + break; + } + for (size_t j = begin, end = min(begin + block_size, id_vec_8->size()); j < end; ++j) { + (*id_vec_8)[j].store(0); + } + } + }); + } + + // barrier sync + for (auto& initializer : initializers) { + initializer.join(); + } + + // this value keeps track of which one of these we're actually using, taking + // the values 0, 1, or 2 + uint8_t which_vec = 0; + // the last search ID we can accommodate for each vector + const uint32_t max_search_id[3] = { + numeric_limits::max(), + numeric_limits::max(), + numeric_limits::max() + }; + + + // for each search ID, the other search IDs it encountered adjacent to its search + vector> neighbors(max_search_id[0] + 1); + // for each search ID, the path handles it fouund while traversing + vector> search_path_sets(max_search_id[0] + 1); + + // define accessors that hide the ugliness of checking which vector we're using: + + // perform atomic load on whichever vector we're currently using + auto load = [&](int64_t i) { + uint32_t loaded; + switch (which_vec) { + case 0: + loaded = (*id_vec_8)[i].load(); + break; + case 1: + loaded = (*id_vec_16)[i].load(); + break; + default: + loaded = (*id_vec_32)[i].load(); + break; + } + return loaded; + }; + // perform atomic compare-exchange on whichever vector we're currently using + auto compare_exchange = [&](int64_t i, uint32_t& expected, uint32_t desired) { + bool exchanged; + switch (which_vec) { + case 0: + { + uint8_t expected_8 = expected; + uint8_t desired_8 = desired; + exchanged = (*id_vec_8)[i].compare_exchange_strong(expected_8, desired_8); + if (!exchanged) { + expected = expected_8; + } + break; + } + case 1: + { + uint16_t expected_16 = expected; + uint16_t desired_16 = desired; + exchanged = (*id_vec_16)[i].compare_exchange_strong(expected_16, desired_16); + if (!exchanged) { + expected = expected_16; + } + break; + } + default: + { + exchanged = (*id_vec_32)[i].compare_exchange_strong(expected, desired); + break; + } + } + return exchanged; + }; + + + // to keep track of the index of the path we will use to seed a BFS next + atomic next_path(0); + // to keep track of the ID of the next seeded search + atomic next_search_id(1); + // initialize the swarm of workers + vector workers; + for (int i = 0; i < thread_count; ++i) { + workers.emplace_back([&,i]() { + while (true) { + + int64_t path_idx = next_path++; + + if (path_idx >= paths.size()) { + // all of the paths have been explored, we can exit + break; + } + +#ifdef debug_parallel_component_paths + cerr << ("worker " + to_string(i) + " got path idx " + to_string(path_idx) + ": " + graph.get_path_name(paths[path_idx]) + " with step count " + to_string(graph.get_step_count(paths[path_idx])) + "\n"); +#endif + + path_handle_t path = paths[path_idx]; + if (graph.get_step_count(path) == 0) { + // skip an empty path + check_freeze(); + continue; + } + + // seed a BFS search off of the first node in this path + handle_t seed = graph.get_handle_of_step(graph.path_begin(path)); + + if (load(graph.get_id(seed) - min_id) != 0) { + // another thread has already traversed over this node, no need + // to start a search here +#ifdef debug_parallel_component_paths + cerr << ("worker " + to_string(i) + " skipping seed " + to_string(graph.get_id(seed)) + ", which was previously visited by " + to_string(load(graph.get_id(seed) - min_id)) + "\n"); +#endif + check_freeze(); + continue; + } + + + + // we're going to initiate 
a BFS from the seed, assign a new search ID + uint32_t search_id = next_search_id++; + + // TODO: add finer-grain reallocations so that neighbors and + // search_path_sets don't need to get so large to guarantee that + // we don't index past them with a search ID + if (search_id > max_search_id[which_vec]) { + // we need to move up to the next int size in order to acommodate + // this search ID, so demand that the other threads stop writing so + // we can switch to a larger bit width + freeze_and_execute([&]() { + // check to make sure another thread didn't already move us over + // to the next vector while we were waiting to freeze + if (search_id <= max_search_id[which_vec]) { + return; + } + + ++which_vec; + neighbors.resize(max_search_id[which_vec] + 1); + search_path_sets.resize(max_search_id[which_vec] + 1); + if (which_vec == 1) { + reallocate_atomic_int_vector(id_vec_8, id_vec_16); + } + else if (which_vec == 2) { + reallocate_atomic_int_vector(id_vec_16, id_vec_32); + } + else { + cerr << "error: parallel component paths algorithm ran out of 32-bit search IDs\n"; + exit(1); + } + }); + } + +#ifdef debug_parallel_component_paths + cerr << ("worker " + to_string(i) + " starting search on seed " + to_string(graph.get_id(seed)) + " with search ID " + to_string(search_id) + "\n"); +#endif + + // FIFO queue for BFS + deque queue; + + // function to call on each subsequent handle we navigate to + function record_paths_and_enqueue = [&](const handle_t& here) { + int64_t idx = graph.get_id(here) - min_id; + uint32_t visit_id = 0; + bool exchanged = compare_exchange(idx, visit_id, search_id); + if (exchanged) { + // we found the unvisited sentinel and replaced it with our search ID + + // add the paths of the new node + graph.for_each_step_on_handle(here, [&](const step_handle_t& step) { + search_path_sets[search_id].insert(graph.get_path_handle_of_step(step)); + }); + + // and add it to the queue + queue.push_back(here); + } + else if (visit_id != search_id) { + // we are adjacent to nodes explored by a different search, record the + // neighbor + neighbors[search_id].insert(visit_id); + } + return true; + }; + + // init the queue + record_paths_and_enqueue(seed); + + while (!queue.empty()) { + // set a checkpoint in case a thread is trying to reallocate + check_freeze(); + + // de-queue a node + handle_t handle = queue.front(); + queue.pop_front(); + + // traverse in both directions + graph.follow_edges(handle, false, record_paths_and_enqueue); + graph.follow_edges(handle, true, record_paths_and_enqueue); + } + + } + // keep track of the fact this thread is exiting + --threads_active; +#ifdef debug_parallel_component_paths + cerr << ("worker " + to_string(i) + " exiting\n"); +#endif + }); + } + + // barrier sync + for (auto& worker : workers) { + worker.join(); + } + + // find equivalence classes of search IDs on the same component + size_t num_search_ids = next_search_id.load(); + structures::UnionFind union_find(num_search_ids, false); + for (size_t i = 0; i < num_search_ids; ++i) { + for (auto j : neighbors[i]) { + union_find.union_groups(i, j); + } + } + // agglomerate the sets of paths ecountered by all of the searches + // in each equivalence class + vector> return_val; + for (const auto& component_search_ids : union_find.all_groups()) { + bool added_new_set = false; + for (size_t i : component_search_ids) { + for (path_handle_t path : search_path_sets[i]) { + if (!added_new_set) { + return_val.emplace_back(); + added_new_set = true; + } + return_val.back().insert(path); + } + } + } 
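As a compact illustration of the merging step above, here is a minimal, self-contained sketch. It uses a small hand-rolled union-find in place of structures::UnionFind, and made-up search IDs and path names, so it only shows the shape of the computation: searches that recorded each other as neighbors are unioned, and the path sets of each resulting group are pooled.

```cpp
#include <cstddef>
#include <iostream>
#include <numeric>
#include <set>
#include <string>
#include <vector>

// Tiny union-find standing in for structures::UnionFind (an assumption of
// this sketch; the real code uses that library type).
struct DSU {
    std::vector<size_t> parent;
    explicit DSU(size_t n) : parent(n) { std::iota(parent.begin(), parent.end(), 0); }
    size_t find(size_t x) { return parent[x] == x ? x : (parent[x] = find(parent[x])); }
    void unite(size_t a, size_t b) { parent[find(a)] = find(b); }
};

int main() {
    // Hypothetical results of three searches: searches 1 and 2 ran into each
    // other while flood-filling, search 3 stayed isolated. Index 0 is unused,
    // mirroring the unvisited sentinel in the real algorithm.
    std::vector<std::set<size_t>> neighbors = {{}, {2}, {1}, {}};
    std::vector<std::set<std::string>> search_path_sets = {
        {}, {"pathA"}, {"pathB"}, {"pathC"}
    };

    // Union searches that saw each other as neighbors.
    DSU dsu(neighbors.size());
    for (size_t i = 0; i < neighbors.size(); ++i) {
        for (size_t j : neighbors[i]) {
            dsu.unite(i, j);
        }
    }

    // Pool the path sets of all searches that landed in the same component.
    std::vector<std::set<std::string>> pooled(neighbors.size());
    for (size_t i = 0; i < neighbors.size(); ++i) {
        pooled[dsu.find(i)].insert(search_path_sets[i].begin(), search_path_sets[i].end());
    }
    for (const auto& component : pooled) {
        if (component.empty()) continue;
        for (const auto& path : component) std::cout << path << ' ';
        std::cout << '\n';  // prints "pathA pathB" then "pathC"
    }
    return 0;
}
```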
+ + delete id_vec_8; + delete id_vec_16; + delete id_vec_32; + + return return_val; +} + +} +} diff --git a/src/algorithms/component.hpp b/src/algorithms/component.hpp new file mode 100644 index 00000000000..d5f3027fe5e --- /dev/null +++ b/src/algorithms/component.hpp @@ -0,0 +1,34 @@ +/** \file + * Contains algorithms that report per-component info + */ + +#ifndef VG_ALGORITHMS_COMPONENT_HPP_INCLUDED +#define VG_ALGORITHMS_COMPONENT_HPP_INCLUDED + +#include + +#include "handle.hpp" + +namespace vg { +namespace algorithms { + +using namespace std; + +// returns the number of weakly connected components +size_t num_components(const HandleGraph& graph); + +// returns the size in number of nodes of each component +vector component_sizes(const HandleGraph& graph); + +// returns sets of path handles, one set for each component (unless the +// component doesn't have any paths) +vector> component_paths(const PathHandleGraph& graph); + +// the same semantics as the previous, but multithreaded and more +// memory intensive +vector> component_paths_parallel(const PathHandleGraph& graph); +} + +} + +#endif // VG_ALGORITHMS_COMPONENT_HPP_INCLUDED diff --git a/src/algorithms/count_covered.hpp b/src/algorithms/count_covered.hpp new file mode 100644 index 00000000000..bfbe340241c --- /dev/null +++ b/src/algorithms/count_covered.hpp @@ -0,0 +1,57 @@ +#ifndef VG_ALGORITHMS_COUNT_COVERED_HPP_INCLUDED +#define VG_ALGORITHMS_COUNT_COVERED_HPP_INCLUDED + +/** + * \file count_covered.hpp + * + * Sweep-line algorithm to count the number of positions covered by a set of + * intervals. + */ + +namespace vg { +namespace algorithms { + +using namespace std; + +/** + * Count, from begin to end, the number of positions covered by ranges in the + * given collection. The collection will be sorted in place. + * + * The collection must be a have a begin(), end(), and random [] access (like a + * vector). + * + * No boundaries are needed because no positions can be covered without + * segments representing them. 
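 * For example, the segments (1,5), (3,8), and (10,12) cover 9 positions in
 * total: the first two merge into the range (1,8), contributing 7, and
 * (10,12) contributes the remaining 2.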
+ */ +template +size_t count_covered(Collection& segments) { + + if (segments.empty()) { + // Protect against no segments + return 0; + } + + std::sort(segments.begin(), segments.end()); + auto curr_begin = segments[0].first; + auto curr_end = segments[0].second; + + size_t total = 0; + for (size_t i = 1; i < segments.size(); i++) { + if (segments[i].first >= curr_end) { + total += (curr_end - curr_begin); + curr_begin = segments[i].first; + curr_end = segments[i].second; + } + else if (segments[i].second > curr_end) { + curr_end = segments[i].second; + } + } + total += (curr_end - curr_begin); + return total; +} + +} +} + +#endif + diff --git a/src/algorithms/count_walks.cpp b/src/algorithms/count_walks.cpp deleted file mode 100644 index 7aba84837ad..00000000000 --- a/src/algorithms/count_walks.cpp +++ /dev/null @@ -1,63 +0,0 @@ -#include "count_walks.hpp" - -namespace vg { -namespace algorithms { - -using namespace std; - - size_t count_walks(const HandleGraph* graph) { - - vector sinks; - unordered_map count; - count.reserve(graph->node_size()); - - // identify sources and sinks - graph->for_each_handle([&](const handle_t& handle) { - bool is_source = true, is_sink = true; - graph->follow_edges(handle, true, [&](const handle_t& prev) { - is_source = false; - return false; - }); - graph->follow_edges(handle, false, [&](const handle_t& next) { - is_sink = false; - return false; - }); - - // base case for dynamic programming - if (is_source) { - count[handle] = 1; - } - if (is_sink) { - sinks.emplace_back(handle); - } - }); - - // count walks by dynamic programming - bool overflowed = false; - for (const handle_t& handle : lazier_topological_order(graph)) { - size_t count_here = count[handle]; - graph->follow_edges(handle, false, [&](const handle_t& next) { - size_t& count_next = count[next]; - if (numeric_limits::max() - count_here < count_next) { - overflowed = true; - } - else { - count_next += count_here; - } - }); - - if (overflowed) { - return numeric_limits::max(); - } - } - - // total up the walks at the sinks - size_t total_count = 0; - for (handle_t& sink : sinks) { - total_count += count[sink]; - } - - return total_count; - } -} -} diff --git a/src/algorithms/count_walks.hpp b/src/algorithms/count_walks.hpp deleted file mode 100644 index 245eeceaac7..00000000000 --- a/src/algorithms/count_walks.hpp +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef VG_ALGORITHMS_COUNT_WALKS_HPP_INCLUDED -#define VG_ALGORITHMS_COUNT_WALKS_HPP_INCLUDED - -/** - * \file count_walks.hpp - * - * Defines algorithm for counting the number of distinct walks through a DAG. - */ - -#include "../handle.hpp" -#include "topological_sort.hpp" - -#include -#include - -namespace vg { -namespace algorithms { - -using namespace std; - - /// Returns the number of source-to-sink walks through the graph. Assumes that - /// the graph is a single-stranded DAG. Consider checking these properties with - /// algorithms::is_single_stranded and algorithms::is_directed_acyclic for safety. - /// Returns numeric_limits::max() if the actual number of walks is larger - /// than this. 
- size_t count_walks(const HandleGraph* graph); - -} -} - -#endif diff --git a/src/algorithms/coverage_depth.cpp b/src/algorithms/coverage_depth.cpp new file mode 100644 index 00000000000..c92471348da --- /dev/null +++ b/src/algorithms/coverage_depth.cpp @@ -0,0 +1,432 @@ +#include "coverage_depth.hpp" +#include +#include "algorithms/subgraph.hpp" +#include +#include "../path.hpp" + +namespace vg { +namespace algorithms { + +void packed_depths(const Packer& packer, const string& path_name, size_t min_coverage, ostream& out_stream) { + const PathHandleGraph& graph = dynamic_cast(*packer.get_graph()); + path_handle_t path_handle = graph.get_path_handle(path_name); + step_handle_t start_step = graph.path_begin(path_handle); + step_handle_t end_step = graph.path_end(path_handle); + Position cur_pos; + subrange_t subrange; + string base_name = Paths::strip_subrange(path_name, &subrange); + size_t path_offset = subrange == PathMetadata::NO_SUBRANGE ? 1 : 1 + subrange.first; + + for (step_handle_t cur_step = start_step; cur_step != end_step; cur_step = graph.get_next_step(cur_step)) { + handle_t cur_handle = graph.get_handle_of_step(cur_step); + nid_t cur_id = graph.get_id(cur_handle); + size_t cur_len = graph.get_length(cur_handle); + cur_pos.set_node_id(cur_id); + cur_pos.set_is_reverse(graph.get_is_reverse(cur_handle)); + for (size_t i = 0; i < cur_len; ++i) { + cur_pos.set_offset(i); + size_t pos_coverage = packer.coverage_at_position(packer.position_in_basis(cur_pos)); + if (pos_coverage >= min_coverage) { + out_stream << base_name << "\t" << path_offset << "\t" << pos_coverage << "\n"; + } + ++path_offset; + } + } +} + +pair packed_depth_of_bin(const Packer& packer, + step_handle_t start_step, step_handle_t end_plus_one_step, + size_t min_coverage, bool include_deletions) { + + const PathHandleGraph& graph = dynamic_cast(*packer.get_graph()); + + // coverage of each node via deletion (that's contained in the bin) + unordered_map deletion_coverages; + if (include_deletions) { + const VectorizableHandleGraph* vec_graph = dynamic_cast(packer.get_graph()); + unordered_map deletion_candidates; + handle_t prev_handle; + for (step_handle_t cur_step = start_step; cur_step != end_plus_one_step; cur_step = graph.get_next_step(cur_step)) { + handle_t cur_handle = graph.get_handle_of_step(cur_step); + graph.follow_edges(cur_handle, true, [&] (handle_t other) { + if (!deletion_candidates.empty() && other!= prev_handle && deletion_candidates.count(other)) { + edge_t edge = graph.edge_handle(other, cur_handle); + size_t edge_pos = vec_graph->edge_index(edge); + size_t deletion_coverage = packer.edge_coverage(edge_pos); + // quadratic alert. 
if this is too slow, can use interval tree or something + for (step_handle_t del_step = graph.get_next_step(deletion_candidates[other]); + del_step != cur_step; + del_step = graph.get_next_step(del_step)) { + handle_t del_handle = graph.get_handle_of_step(del_step); + nid_t del_id = graph.get_id(del_handle); + if (!deletion_coverages.count(del_id)) { + deletion_coverages[del_id] = deletion_coverage; + } else { + deletion_coverages[del_id] += deletion_coverage; + } + } + } + }); + prev_handle = cur_handle; + deletion_candidates[cur_handle] = cur_step; + } + } + + // compute the mean and variance of our base coverage across the bin + size_t bin_length = 0; + double mean = 0.0; + double M2 = 0.0; + + for (step_handle_t cur_step = start_step; cur_step != end_plus_one_step; cur_step = graph.get_next_step(cur_step)) { + handle_t cur_handle = graph.get_handle_of_step(cur_step); + nid_t cur_id = graph.get_id(cur_handle); + size_t cur_len = graph.get_length(cur_handle); + size_t del_coverage = !include_deletions or !deletion_coverages.count(cur_id) ? 0 : deletion_coverages[cur_id]; + Position cur_pos; + cur_pos.set_node_id(cur_id); + cur_pos.set_is_reverse(graph.get_is_reverse(cur_handle)); + for (size_t i = 0; i < cur_len; ++i) { + cur_pos.set_offset(i); + size_t pos_coverage = packer.coverage_at_position(packer.position_in_basis(cur_pos)) + del_coverage; + if (pos_coverage >= min_coverage) { + wellford_update(bin_length, mean, M2, pos_coverage); + } + } + } + return wellford_mean_var(bin_length, mean, M2); +} + +vector> binned_packed_depth(const Packer& packer, const string& path_name, size_t bin_size, + size_t min_coverage, bool include_deletions) { + + const PathHandleGraph& graph = dynamic_cast(*packer.get_graph()); + path_handle_t path_handle = graph.get_path_handle(path_name); + + // one scan of our path to collect the bins + step_handle_t start_step = graph.path_begin(path_handle); + step_handle_t end_step = graph.path_end(path_handle); + vector> bins; // start offset / start step of each bin + size_t offset = 0; + size_t cur_bin_size = bin_size; + for (step_handle_t cur_step = start_step; cur_step != end_step; cur_step = graph.get_next_step(cur_step)) { + if (cur_bin_size >= bin_size) { + bins.push_back(make_pair(offset, cur_step)); + cur_bin_size = 0; + } + size_t node_len = graph.get_length(graph.get_handle_of_step(cur_step)); + offset += node_len; + cur_bin_size += node_len; + } + + // parallel scan to compute the coverages + vector> binned_depths(bins.size()); +#pragma omp parallel for + for (size_t i = 0; i < bins.size(); ++i) { + step_handle_t bin_start_step = bins[i].second; + step_handle_t bin_end_step = i < bins.size() - 1 ? bins[i+1].second : end_step; + size_t bin_start = bins[i].first; + size_t bin_end = i < bins.size() - 1 ? 
bins[i+1].first : offset; + pair coverage = packed_depth_of_bin(packer, bin_start_step, bin_end_step, min_coverage, include_deletions); + binned_depths[i] = make_tuple(bin_start, bin_end, coverage.first, coverage.second); + } + + return binned_depths; +} + +BinnedDepthIndex binned_packed_depth_index(const Packer& packer, + const vector& path_names, + size_t min_bin_size, + size_t max_bin_size, + double exp_growth_factor, + size_t min_coverage, + bool include_deletions, + bool std_err) { + const PathHandleGraph& graph = dynamic_cast(*packer.get_graph()); + + BinnedDepthIndex depth_index; + for (const string& path_name : path_names) { + size_t path_max_bin = 0; + graph.for_each_step_in_path(graph.get_path_handle(path_name), [&] (step_handle_t step_handle) { + path_max_bin += graph.get_length(graph.get_handle_of_step(step_handle)); + return path_max_bin < max_bin_size; + }); + path_max_bin = std::min(max_bin_size, path_max_bin); + + map>>& scaled_depth_map = depth_index[path_name]; + size_t prev_bin_size = 0; + for (size_t bin_size = min_bin_size; bin_size != prev_bin_size;) { + + map>& depth_map = scaled_depth_map[bin_size]; + vector> binned_depths = binned_packed_depth(packer, path_name, bin_size, + min_coverage, include_deletions); + // todo: probably more efficent to just leave in sorted vector + for (auto& binned_depth : binned_depths) { + double var = get<3>(binned_depth); + // optionally convert variance to standard error + if (std_err) { + var = sqrt(var / (double)(get<1>(binned_depth) - get<0>(binned_depth))); + } + depth_map[get<0>(binned_depth)] = make_pair(get<2>(binned_depth), var); + } + + prev_bin_size = bin_size; + // todo: trim out useless last bins that are only a bit bigger than prev + bin_size = std::min(path_max_bin, (size_t)pow(bin_size, exp_growth_factor)); + } + } + return depth_index; +} + + +pair get_depth_from_index(const BinnedDepthIndex& depth_index, const string& path_name, size_t start_offset, size_t end_offset) { + + // accept backward ranges + if (end_offset < start_offset) { + swap(start_offset, end_offset); + } + size_t bin_size = 1 + end_offset - start_offset; + // pad it out + bin_size *= 2; + + auto ub1 = depth_index.at(path_name).upper_bound(bin_size); + if (ub1 == depth_index.at(path_name).end()) { + --ub1; + } + auto ub = ub1->second.upper_bound(start_offset); + --ub; + auto ub_end = ub1->second.upper_bound(end_offset); + size_t count = 0; + pair total = make_pair(0, 0); + for (auto cur = ub; cur != ub_end; ++cur, ++count) { + total.first += cur->second.first; + total.second += cur->second.second; + } + // todo: better way of combining? + total.first /= (double)count; + total.second /= (double)count; + return total; +} + +// draw (roughly) max_nodes nodes from the graph using the random seed +static unordered_map sample_nodes(const HandleGraph& graph, size_t max_nodes, size_t random_seed) { + default_random_engine generator(random_seed); + uniform_real_distribution distribution(0, 1); + double cutoff = std::min((double)1.0, (double)max_nodes / (double)graph.get_node_count()); + unordered_map sampled_nodes; + graph.for_each_handle([&](handle_t handle) { + if (cutoff == 1. || cutoff <= distribution(generator)) { + sampled_nodes[graph.get_id(handle)] = 0; + } + }); + return sampled_nodes; +} + +// update the coverage from an alignment. 
only count nodes that are in the map already +static void update_sample_gam_depth(const Alignment& aln, unordered_map& node_coverage) { + const Path& path = aln.path(); + for (int i = 0; i < path.mapping_size(); ++i) { + const Mapping& mapping = path.mapping(i); + nid_t node_id = mapping.position().node_id(); + if (node_coverage.count(node_id)) { + // we add the number of bases covered + node_coverage[node_id] += mapping_from_length(mapping); + } + } +} + +// sum up the results from the different threads and return the average. +// if a min_coverage is given, nodes with less coverage are ignored +static pair combine_and_average_node_coverages(const HandleGraph& graph, vector>& node_coverages, size_t min_coverage) { + for (int i = 1; i < node_coverages.size(); ++i) { + for (const auto& node_cov : node_coverages[i]) { + node_coverages[0][node_cov.first] += node_cov.second; + } + } + size_t count = 0; + double mean = 0.; + double M2 = 0.; + for (const auto & node_cov : node_coverages[0]) { + if (node_cov.second >= min_coverage) { + // we normalize the bases covered by the node length as we sum + double node_len = graph.get_length(graph.get_handle(node_cov.first)); + wellford_update(count, mean, M2, (double)node_cov.second / node_len); + } + } + + return wellford_mean_var(count, mean, M2); +} + + +pair sample_mapping_depth(const HandleGraph& graph, const string& input_filename, size_t max_nodes, size_t random_seed, size_t min_coverage, size_t min_mapq, const string& format) { + // one node counter per thread + vector> node_coverages(get_thread_count(), sample_nodes(graph, max_nodes, random_seed)); + + function aln_callback = [&](Alignment& aln) { + if (aln.mapping_quality() >= min_mapq) { + update_sample_gam_depth(aln, node_coverages[omp_get_thread_num()]); + } + }; + if (format == "GAM") { + get_input_file(input_filename, [&] (istream& gam_stream) { + vg::io::for_each_parallel(gam_stream, aln_callback); + }); + } else if (format == "GAF") { + vg::io::gaf_unpaired_for_each_parallel(graph, input_filename, aln_callback); + } else { + throw runtime_error("vg::aglorithms::coverage_depth: Invalid format specified for sample_mapping_depth(): " + + format + ". Valid options are GAM and GAF."); + } + + return combine_and_average_node_coverages(graph, node_coverages, min_coverage); +} + + + +pair sample_gam_depth(const HandleGraph& graph, const vector& alignments, size_t max_nodes, size_t random_seed, size_t min_coverage, size_t min_mapq) { + // one node counter per thread + vector> node_coverages(get_thread_count(), sample_nodes(graph, max_nodes, random_seed)); + +#pragma omp parallel for + for (size_t i = 0; i < alignments.size(); ++i) { + if (alignments[i].mapping_quality() >= min_mapq) { + update_sample_gam_depth(alignments[i], node_coverages[omp_get_thread_num()]); + } + } + return combine_and_average_node_coverages(graph, node_coverages, min_coverage); +} + +void path_depths(const PathHandleGraph& graph, const string& path_name, size_t min_coverage, bool count_cycles, ostream& out_stream) { + assert(graph.has_path(path_name)); + + path_handle_t path_handle = graph.get_path_handle(path_name); + // big speedup + unordered_map path_to_name; + + subrange_t subrange; + string base_name = Paths::strip_subrange(path_name, &subrange); + size_t offset = subrange == PathMetadata::NO_SUBRANGE ? 
1 : 1 + subrange.first; + + graph.for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { + unordered_set path_set; + size_t step_count = 0; + handle_t handle = graph.get_handle_of_step(step_handle); + graph.for_each_step_on_handle(handle, [&](step_handle_t step_handle_2) { + if (count_cycles) { + ++step_count; + } else { + path_handle_t step_path_handle = graph.get_path_handle_of_step(step_handle_2); + auto it = path_to_name.find(step_path_handle); + if (it == path_to_name.end()) { + string step_path_name = graph.get_path_name(step_path_handle); + // disregard subpath tags when counting + it = path_to_name.insert(make_pair(step_path_handle, Paths::strip_subrange(step_path_name))).first; + } + path_set.insert(it->second); + } + }); + size_t coverage = (count_cycles ? step_count : path_set.size()) - 1; + size_t node_len = graph.get_length(handle); + if (coverage >= min_coverage) { + for (size_t i = 0; i < node_len; ++i) { + if (coverage >= min_coverage) { + out_stream << base_name << "\t" << (offset + i) << "\t" << coverage << "\n"; + } + } + } + offset += node_len; + }); +} + +pair path_depth_of_bin(const PathHandleGraph& graph, + step_handle_t start_step, step_handle_t end_plus_one_step, + size_t min_coverage, bool count_cycles) { + + // compute the mean and variance of our base coverage across the bin + size_t bin_length = 0; + double mean = 0.0; + double M2 = 0.0; + + // big speedup + unordered_map path_to_name; + + for (step_handle_t cur_step = start_step; cur_step != end_plus_one_step; cur_step = graph.get_next_step(cur_step)) { + handle_t cur_handle = graph.get_handle_of_step(cur_step); + nid_t cur_id = graph.get_id(cur_handle); + size_t cur_len = graph.get_length(cur_handle); + + unordered_set path_set; + size_t step_count = 0; + graph.for_each_step_on_handle(cur_handle, [&](step_handle_t step_handle) { + if (count_cycles) { + ++step_count; + } else { + path_handle_t step_path_handle = graph.get_path_handle_of_step(step_handle); + auto it = path_to_name.find(step_path_handle); + if (it == path_to_name.end()) { + string step_path_name = graph.get_path_name(step_path_handle); + // disregard subpath tags when counting + it = path_to_name.insert(make_pair(step_path_handle, Paths::strip_subrange(step_path_name))).first; + } + path_set.insert(it->second); + } + }); + size_t coverage = (count_cycles ? 
step_count : path_set.size()) - 1; + + if (coverage >= min_coverage) { + // todo: iteration here copied from packer implementation, not necessary + for (size_t i = 0; i < cur_len; ++i) { + wellford_update(bin_length, mean, M2, coverage); + } + } + } + return wellford_mean_var(bin_length, mean, M2); +} + +vector> binned_path_depth(const PathHandleGraph& graph, + const string& path_name, + size_t bin_size, + size_t min_coverage, + bool count_cycles) { + + path_handle_t path_handle = graph.get_path_handle(path_name); + + // one scan of our path to collect the bins + step_handle_t start_step = graph.path_begin(path_handle); + step_handle_t end_step = graph.path_end(path_handle); + vector> bins; // start offset / start step of each bin + size_t offset = 0; + size_t cur_bin_size = bin_size; + for (step_handle_t cur_step = start_step; cur_step != end_step; cur_step = graph.get_next_step(cur_step)) { + if (cur_bin_size >= bin_size) { + bins.push_back(make_pair(offset, cur_step)); + cur_bin_size = 0; + } + size_t node_len = graph.get_length(graph.get_handle_of_step(cur_step)); + offset += node_len; + cur_bin_size += node_len; + } + + // parallel scan to compute the coverages + vector> binned_depths(bins.size()); +#pragma omp parallel for + for (size_t i = 0; i < bins.size(); ++i) { + step_handle_t bin_start_step = bins[i].second; + step_handle_t bin_end_step = i < bins.size() - 1 ? bins[i+1].second : end_step; + size_t bin_start = bins[i].first; + size_t bin_end = i < bins.size() - 1 ? bins[i+1].first : offset; + pair coverage = path_depth_of_bin(graph, bin_start_step, bin_end_step, min_coverage, count_cycles); + binned_depths[i] = make_tuple(bin_start, bin_end, coverage.first, coverage.second); + } + + return binned_depths; +} + + + +} + + + + +} + diff --git a/src/algorithms/coverage_depth.hpp b/src/algorithms/coverage_depth.hpp new file mode 100644 index 00000000000..021cc57fa52 --- /dev/null +++ b/src/algorithms/coverage_depth.hpp @@ -0,0 +1,82 @@ +#ifndef VG_DEPTH_HPP_INCLUDED +#define VG_DEPTH_HPP_INCLUDED + +#include +#include +#include +#include +#include +#include +#include +#include "handle.hpp" +#include "statistics.hpp" +#include "packer.hpp" + +namespace vg { +namespace algorithms { + +using namespace std; + +/// print path-name offset base-coverage for every base on a path (just like samtools depth) +/// ignoring things below min_coverage. offsets are 1-based in output stream +void packed_depths(const Packer& packer, const string& path_name, size_t min_coverage, ostream& out_stream); + +/// Estimate the coverage along a given reference path interval [start_step, end_plus_one_step) +/// Coverage is obtained only from positions along the path, and variation is not counted +/// Except if "include_deletions" is true, then reference path positions covered by a deletion edge +/// (which is contained in the bin) will get the deletion edge's coverage counted. +/// Other types of events (such as SNPs) can throw off coverage in similar ways but deletions tend to be bigger +/// (and easier to find), so we hope that counting them is enough. 
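+/// (Concretely, a deletion edge's coverage is added to every reference base the deletion spans within the bin.)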
+/// If one wants to infer deletions from the coverage, obviously this should be false, but if looking for +/// a background coverage for genotyping, then setting it to true may be helpful +pair packed_depth_of_bin(const Packer& packer, step_handle_t start_step, step_handle_t end_plus_one_step, + size_t min_coverage, bool include_deletions); + +/// Use all available threads to estimate the binned packed coverage of a path using above fucntion +/// Each element is a bin's 0-based open-ended interval in the path, and its coverage mean,variance. +vector> binned_packed_depth(const Packer& packer, const string& path_name, size_t bin_size, + size_t min_coverage, bool include_deletions); + +/// Use the above function to retrieve the binned depths of a list of paths, and store them indexed by start +/// coordinate. If std_err is true, store instead of +/// For each path, a series of indexes is computed, for bin sizes from min_bin_size, min_bin_size^(exp_growth_factor), etc. +using BinnedDepthIndex = unordered_map>>>; +BinnedDepthIndex binned_packed_depth_index(const Packer& packer, + const vector& path_names, + size_t min_bin_size, + size_t max_bin_size, + double exp_growth_factor, + size_t min_coverage, + bool include_deletions, + bool std_err); + +/// Query index created above +pair get_depth_from_index(const BinnedDepthIndex& depth_index, const string& path_name, size_t start_offset, size_t end_offset); + +/// Return the mean and variance of coverage of randomly sampled nodes from a mappings file +/// Nodes with less than min_coverage are ignored +/// The input_filename can be - for stdin +/// The stream is scanned in parallel with all threads +/// max_nodes is used to keep memory down +/// valid formats are "GAM" and "GAF" +pair sample_mapping_depth(const HandleGraph& graph, const string& input_filename, size_t max_nodes, size_t random_seed, size_t min_coverage, size_t min_mapq, const string& format="GAM"); + +/// As above, but read a vector instead of a stream +pair sample_mapping_depth(const HandleGraph& graph, const vector& alignments, size_t max_nodes, size_t random_seed, size_t min_coverage, size_t min_mapq); + +/// print path-name offset base-coverage for every base on a path (just like samtools depth) +/// ignoring things below min_coverage. 
offsets are 1-based in output stream +/// coverage here is the number of steps from (unique) other paths +void path_depths(const PathHandleGraph& graph, const string& path_name, size_t min_coverage, bool count_cycles, ostream& out_stream); + +/// like packed_depth_of_bin (above), but use paths (as in path_depths) for measuring coverage +pair path_depth_of_bin(const PathHandleGraph& graph, step_handle_t start_step, step_handle_t end_plus_one_step, + size_t min_coverage, bool count_cycles); + +vector> binned_path_depth(const PathHandleGraph& graph, const string& path_name, size_t bin_size, + size_t min_coverage, bool count_cycles); + +} +} + +#endif diff --git a/src/algorithms/dfs.cpp b/src/algorithms/dfs.cpp index c73d2671831..b6aecc269c6 100644 --- a/src/algorithms/dfs.cpp +++ b/src/algorithms/dfs.cpp @@ -1,5 +1,7 @@ #include "dfs.hpp" +//#define debug + namespace vg { namespace algorithms { diff --git a/src/algorithms/disjoint_components.cpp b/src/algorithms/disjoint_components.cpp new file mode 100644 index 00000000000..aa2fa6a2375 --- /dev/null +++ b/src/algorithms/disjoint_components.cpp @@ -0,0 +1,36 @@ +#include "disjoint_components.hpp" + +namespace vg { +namespace algorithms { + +list disjoint_components(const HandleGraph& graph) { + + vector> weak_comps = handlealgs::weakly_connected_components(&graph); + + list comps; + for (const auto& weak_comp : weak_comps) { + comps.emplace_back(); + auto& comp = comps.back(); + for (auto node_id : weak_comp) { + comp.create_handle(graph.get_sequence(graph.get_handle(node_id)), node_id); + } + comp.for_each_handle([&](const handle_t& handle) { + handle_t original_handle = graph.get_handle(comp.get_id(handle)); + // TODO: this will create duplicate edges if we ever decide + // to switch back to not deduplicating on the fly + graph.follow_edges(handle, true, [&](const handle_t& prev) { + comp.create_edge(comp.get_handle(graph.get_id(prev), graph.get_is_reverse(prev)), + handle); + }); + graph.follow_edges(handle, false, [&](const handle_t& next) { + comp.create_edge(handle, + comp.get_handle(graph.get_id(next), graph.get_is_reverse(next))); + }); + }); + } + + return comps; +} + +} +} diff --git a/src/algorithms/disjoint_components.hpp b/src/algorithms/disjoint_components.hpp new file mode 100644 index 00000000000..efb886619e4 --- /dev/null +++ b/src/algorithms/disjoint_components.hpp @@ -0,0 +1,22 @@ +#ifndef VG_DISJOINT_COMPONENTS_HPP_INCLUDED +#define VG_DISJOINT_COMPONENTS_HPP_INCLUDED + +#include + +#include + +#include "../handle.hpp" + +namespace vg { +namespace algorithms { + +using namespace std; + +/// Return a list of graphs, one for each connected component in the original graph. +/// Node IDs are preserved from the original graph. +list disjoint_components(const HandleGraph& graph); + +} +} + +#endif diff --git a/src/algorithms/distance_to_head.hpp b/src/algorithms/distance_to_head.hpp index 2b95ea9126e..b71c12b2081 100644 --- a/src/algorithms/distance_to_head.hpp +++ b/src/algorithms/distance_to_head.hpp @@ -3,9 +3,9 @@ #include +#include + #include "../position.hpp" -#include "../cached_position.hpp" -#include "../vg.pb.h" #include "../hash_map.hpp" #include "../handle.hpp" @@ -14,8 +14,6 @@ namespace algorithms { using namespace std; -/// Find all of the nodes with no edges on their left sides. 
-vector head_nodes(const HandleGraph* g); int32_t distance_to_head(handle_t h, int32_t limit, const HandleGraph* graph); /// Get the distance in bases from start of node to start of closest head node of graph, or -1 if that distance exceeds the limit. /// dist increases by the number of bases of each previous node until you reach the head node @@ -25,4 +23,4 @@ int32_t distance_to_head(handle_t h, int32_t limit, int32_t dist, unordered_set< } } -#endif \ No newline at end of file +#endif diff --git a/src/algorithms/distance_to_tail.hpp b/src/algorithms/distance_to_tail.hpp index 0459b92e65f..c634251c60f 100644 --- a/src/algorithms/distance_to_tail.hpp +++ b/src/algorithms/distance_to_tail.hpp @@ -3,9 +3,9 @@ #include +#include + #include "../position.hpp" -#include "../cached_position.hpp" -#include "../vg.pb.h" #include "../hash_map.hpp" #include "../handle.hpp" @@ -14,8 +14,6 @@ namespace algorithms { using namespace std; -/// Find all of the nodes with no edges on their left sides. -vector tail_nodes(const HandleGraph* g); int32_t distance_to_tail(handle_t h, int32_t limit, const HandleGraph* graph); /// Get the distance in bases from end of node to end of closest tail node of graph, or -1 if that distance exceeds the limit. /// dist increases by the number of bases of each previous node until you reach the head node @@ -25,4 +23,4 @@ int32_t distance_to_tail(handle_t h, int32_t limit, int32_t dist, unordered_set< } } -#endif \ No newline at end of file +#endif diff --git a/src/algorithms/expand_context.cpp b/src/algorithms/expand_context.cpp new file mode 100644 index 00000000000..1cf82c227e1 --- /dev/null +++ b/src/algorithms/expand_context.cpp @@ -0,0 +1,252 @@ +#include "expand_context.hpp" + +namespace vg { +namespace algorithms { + +using namespace std; + + void expand_context_by_steps(const HandleGraph* source, MutableHandleGraph* subgraph, + int64_t dist, bool expand_forward, bool expand_backward) { + + // let's make an O(1) lookup for the current edges in case the implementation doesn't provide + // one (avoids O(N^2) behavior using has_edge) + unordered_set already_included_edges; + subgraph->for_each_edge([&](const edge_t& edge) { + already_included_edges.insert(edge); + }); + + // a fifo queue for BFS + queue> bfs_queue; + // all the edges we encounter along the way + unordered_set seen_edges; + + // initialize the queue with the current subgraph + subgraph->for_each_handle([&](const handle_t& handle) { + bfs_queue.emplace(source->get_handle(subgraph->get_id(handle)), 0); + }); + + // BFS outward + while (!bfs_queue.empty()) { + pair here = bfs_queue.front(); + bfs_queue.pop(); + + int64_t dist_thru = here.second + 1; + + if (dist_thru <= dist) { + if (expand_forward) { + source->follow_edges(here.first, false, [&](const handle_t& next) { + seen_edges.insert(source->edge_handle(here.first, next)); + if (!subgraph->has_node(source->get_id(next))) { + subgraph->create_handle(source->get_sequence(source->forward(next)), + source->get_id(next)); + bfs_queue.emplace(next, dist_thru); + } + }); + } + + if (expand_backward) { + source->follow_edges(here.first, true, [&](const handle_t& prev) { + seen_edges.insert(source->edge_handle(prev, here.first)); + if (!subgraph->has_node(source->get_id(prev))) { + subgraph->create_handle(source->get_sequence(source->forward(prev)), + source->get_id(prev)); + bfs_queue.emplace(prev, dist_thru); + } + }); + } + } + } + + // add in the edges we saw + for (const edge_t& edge : seen_edges) { + edge_t insertable = 
subgraph->edge_handle(subgraph->get_handle(source->get_id(edge.first), + source->get_is_reverse(edge.first)), + subgraph->get_handle(source->get_id(edge.second), + source->get_is_reverse(edge.second))); + if (!already_included_edges.count(insertable)) { + subgraph->create_edge(insertable); + } + } + } + + void expand_context_by_length(const HandleGraph* source, MutableHandleGraph* subgraph, + int64_t dist, bool expand_forward, bool expand_backward) { + + // let's make an O(1) lookup for the current edges in case the implementation doesn't provide + // one (avoids O(N^2) behavior using has_edge) + unordered_set already_included_edges; + subgraph->for_each_edge([&](const edge_t& edge) { + already_included_edges.insert(edge); + }); + + // extra bool indicates whether the position indicated is at the + // beginning of the node (beginning = true, end = false) + structures::RankPairingHeap, int64_t, greater> dijk_queue; + // all the edges we encounter along the way + unordered_set seen_edges; + + // initialize the queue at the ends pointing out of the node in the direction + // of our search + subgraph->for_each_handle([&](const handle_t& handle) { + handle_t src_handle = source->get_handle(subgraph->get_id(handle)); + if (expand_forward) { + dijk_queue.push_or_reprioritize(make_pair(src_handle, false), 0); + } + if (expand_backward) { + dijk_queue.push_or_reprioritize(make_pair(src_handle, true), 0); + } + }); + + + while (!dijk_queue.empty()) { + // the next closest untraversed node end + pair, int64_t> here = dijk_queue.top(); + dijk_queue.pop(); + + if (here.second > dist) { + break; + } + + if ((here.first.second && expand_forward) + || (!here.first.second && expand_backward)) { + // cross the node (traverse the sequence length) + int64_t dist_across = here.second + source->get_length(here.first.first); + if (dist_across <= dist) { + dijk_queue.push_or_reprioritize(make_pair(here.first.first, !here.first.second), dist_across); + } + } + if ((here.first.second && expand_backward) + || (!here.first.second && expand_forward)) { + // cross an edge (no added distance) + source->follow_edges(here.first.first, here.first.second, [&](const handle_t& next) { + dijk_queue.push_or_reprioritize(make_pair(next, !here.first.second), here.second); + + // the edge handle will be in a different order depending on whether we're + // going left or right + if (here.first.second) { + seen_edges.insert(source->edge_handle(next, here.first.first)); + } + else { + seen_edges.insert(source->edge_handle(here.first.first, next)); + } + + if (!subgraph->has_node(source->get_id(next))) { + subgraph->create_handle(source->get_sequence(source->forward(next)), + source->get_id(next)); + } + }); + } + } + + // add in the edges we saw + for (const edge_t& edge : seen_edges) { + edge_t insertable = subgraph->edge_handle(subgraph->get_handle(source->get_id(edge.first), + source->get_is_reverse(edge.first)), + subgraph->get_handle(source->get_id(edge.second), + source->get_is_reverse(edge.second))); + if (!already_included_edges.count(insertable)) { + subgraph->create_edge(insertable); + } + } + } + + void expand_context(const HandleGraph* source, MutableHandleGraph* subgraph, + int64_t dist, bool use_steps, bool expand_forward, + bool expand_backward) { + + if (use_steps) { + expand_context_by_steps(source, subgraph, dist, expand_forward, expand_backward); + } + else { + expand_context_by_length(source, subgraph, dist, expand_forward, expand_backward); + } + } + + void expand_context_with_paths(const PathHandleGraph* source, 
+ MutablePathMutableHandleGraph* subgraph, + int64_t dist, bool use_steps, bool expand_forward, + bool expand_backward) { + + // get the topology of the subgraph we want + expand_context(source, subgraph, dist, use_steps, expand_forward, expand_backward); + + // find all steps on all nodes + unordered_set seen_steps; + subgraph->for_each_handle([&](const handle_t& handle) { + handle_t src_handle = source->get_handle(subgraph->get_id(handle)); + source->for_each_step_on_handle(src_handle, [&](const step_handle_t& step) { + seen_steps.insert(step); + }); + }); + + // keep track of how many segments we've added for each path + unordered_map segment_count; + + while (!seen_steps.empty()) { + // choose a segment arbitrarily + step_handle_t segment_seed = *seen_steps.begin(); + path_handle_t path_handle = source->get_path_handle_of_step(segment_seed); + + + // walk backwards until we're out of the subgraph or we loop around to + // the same step + auto step = segment_seed; + step_handle_t prev; + bool first_iter = true; + while (seen_steps.count(step) && (first_iter || step != segment_seed)) { + prev = step; + step = source->get_previous_step(step); + first_iter = false; + } + + if (step == segment_seed) { + // we walked all the way around a circular path + assert(source->get_is_circular(path_handle)); + + // copy the whole path, making the same step the "first" + path_handle_t segment_handle = subgraph->create_path_handle(source->get_path_name(path_handle), true); + source->for_each_step_in_path(path_handle, [&](const step_handle_t& step) { + handle_t handle = source->get_handle_of_step(step); + subgraph->append_step(segment_handle, + subgraph->get_handle(source->get_id(handle), + source->get_is_reverse(handle))); + seen_steps.erase(step); + }); + } + else { + // we walked off the subgraph, so this is the start of a segment + + // get the path name (and possibly disambiguate segments from the same path) + string path_name = source->get_path_name(path_handle); + if (segment_count[path_handle]) { + path_name += "-" + to_string(segment_count[path_handle]); + } + + if (subgraph->has_path(path_name)) { + cerr << "error: failed to create a unique path name for segment of " << source->get_path_name(path_handle) << ", there is already path named " << path_name << endl; + exit(1); + } + + // make a new path for this segment + path_handle_t segment_handle = subgraph->create_path_handle(path_name); + + for (step = prev; // start back on the subgraph + seen_steps.count(step); // stop once we leave the subgraph + step = source->get_next_step(step)) { + + handle_t src_handle = source->get_handle_of_step(step); + subgraph->append_step(segment_handle, + subgraph->get_handle(source->get_id(src_handle), + source->get_is_reverse(src_handle))); + + // keep track of which steps we've already added + seen_steps.erase(step); + } + } + + // make note of the fact that we've made a segment from this source path + segment_count[path_handle]++; + } + } +} +} diff --git a/src/algorithms/expand_context.hpp b/src/algorithms/expand_context.hpp new file mode 100644 index 00000000000..4029970a64d --- /dev/null +++ b/src/algorithms/expand_context.hpp @@ -0,0 +1,43 @@ +#ifndef VG_ALGORITHMS_EXPAND_CONTEXT_HPP_INCLUDED +#define VG_ALGORITHMS_EXPAND_CONTEXT_HPP_INCLUDED + +/** + * \file expand_context.hpp + * + * Defines algorithm for adding graph material from the context around a subgraph + * into the subgraph + */ + +#include "../handle.hpp" + +#include "structures/rank_pairing_heap.hpp" + +#include +#include + +namespace vg { 
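+// Illustrative sketch of a typical call pattern (a minimal example only; it assumes a
+// bdsg::HashGraph as the mutable subgraph, which is one possible choice, and `source`
+// and `seed_id` are placeholder names):
+//
+//   bdsg::HashGraph sub;
+//   handle_t seed = source->get_handle(seed_id);
+//   sub.create_handle(source->get_sequence(seed), source->get_id(seed));
+//   // pull in everything within 3 steps of the seed, in both directions
+//   vg::algorithms::expand_context(source, &sub, 3, true);
+//   // or measure the expansion in bases of sequence instead of steps
+//   vg::algorithms::expand_context(source, &sub, 100, false);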
+namespace algorithms { + +using namespace std; + + // basic method to query regions of the graph. subgraph must have the same + // ID and sequence space as source. + // use_steps flag toggles whether dist refers to steps or length in base pairs + // note: neighboring nodes are considered to be length 0 away from each other, + // so to do a null expansion by length, use dist < 0. + void expand_context(const HandleGraph* source, MutableHandleGraph* subgraph, + int64_t dist, bool use_steps = true, bool expand_forward = true, + bool expand_backward = true); + + // same as above but with addition of paths + // if the subgraph contains disconnected regions of a path, it will attempt to + // add both regions as separate paths with a disambiguated name + void expand_context_with_paths(const PathHandleGraph* source, + MutablePathMutableHandleGraph* subgraph, + int64_t dist, bool use_steps = true, bool expand_forward = true, + bool expand_backward = true); + +} +} + +#endif diff --git a/src/algorithms/extract_connecting_graph.cpp b/src/algorithms/extract_connecting_graph.cpp index 0fea70e3410..4858c5716d3 100644 --- a/src/algorithms/extract_connecting_graph.cpp +++ b/src/algorithms/extract_connecting_graph.cpp @@ -15,17 +15,15 @@ namespace algorithms { using namespace structures; unordered_map extract_connecting_graph(const HandleGraph* source, - MutableHandleGraph* into, + DeletableHandleGraph* into, int64_t max_len, pos_t pos_1, pos_t pos_2, - bool detect_terminal_cycles, - bool only_paths, bool strict_max_len) { #ifdef debug_vg_algorithms cerr << "[extract_connecting_graph] max len: " << max_len << ", pos 1: " << pos_1 << ", pos 2: " << pos_2 << endl; #endif - if (into->node_size()) { + if (into->get_node_count()) { cerr << "error:[extract_connecting_graph] must extract into an empty graph" << endl; exit(1); } @@ -97,12 +95,8 @@ unordered_map extract_connecting_graph(const HandleGraph* source, // keep track of whether we find a path or not bool found_target = false; - unordered_set skip_handles{source_handle_1}; - // mark final position for skipping so that we won't look for additional traversals unless that's - // the only way to find terminal cycles - if (!(colocation == SharedNodeReverse && detect_terminal_cycles)) { - skip_handles.insert(source_handle_2); - } + // mark final position for skipping so that we won't look for additional traversals + unordered_set skip_handles{source_handle_1, source_handle_2}; // initialize the queue UpdateablePriorityQueue queue([](const Traversal& item) { @@ -198,81 +192,10 @@ unordered_map extract_connecting_graph(const HandleGraph* source, return id_trans; } - // STEP 2: BACKWARD SEARCH (TO EXTRACT CYCLES ON THE FINAL NODE) - // the forward search doesn't traverse through the second position, so we need to traverse - // backwards from last position too if we're detecting cycles - // also we cannot find any new nodes/edges that will pass future distance filters if - // both forward and backward traversals are starting along the same edges, or if all paths - // are already cyclical, so we exclude those cases to simplify some case checking in the loop - if (detect_terminal_cycles && - (colocation == SeparateNodes || colocation == SharedNodeReachable)) { - - -#ifdef debug_vg_algorithms - cerr << "BACKWARD SEARCH: beginning search with backward max len " << backward_max_len << " and last traversal length " << last_traversal_length << endl; -#endif - - // initialize the queue going backward from the last position if it's reachable - queue.clear(); - if 
(last_traversal_length <= backward_max_len) { - queue.emplace(source->flip(source_handle_2), last_traversal_length); - } - - // reset the traversal list to skip and add the two reverse traversals - skip_handles.clear(); - skip_handles.insert(source->flip(source_handle_1)); - skip_handles.insert(source->flip(source_handle_2)); - - // search along a Dijkstra tree - while (!queue.empty()) { - // get the next closest node to the starting position - Traversal trav = queue.top(); - queue.pop(); - -#ifdef debug_vg_algorithms - cerr << "BACKWARD SEARCH: traversing node " << source->get_id(trav.handle) - << " in " << (source->get_is_reverse(trav.handle) ? "reverse" : "forward") - << " orientation at distance " << trav.dist << endl; -#endif - - source->follow_edges(trav.handle, false, [&](const handle_t& next) { - // get the orientation and id of the other side of the edge - id_t next_id = source->get_id(next); - bool next_rev = source->get_is_reverse(next); - -#ifdef debug_vg_algorithms - cerr << "BACKWARD SEARCH: got edge " - << source->get_id(trav.handle) << " " << source->get_is_reverse(trav.handle) - << " -> " << next_id << " " << next_rev << endl; -#endif - - max_id = max(max_id, next_id); - - // make sure the node is in the graph - if (!id_trans.count(next_id)) { - into->create_handle(source->get_sequence(source->forward(next)), next_id); - id_trans[next_id] = next_id; - } - - // distance to the end of this node - int64_t dist_thru = trav.dist + source->get_length(next); - if (!skip_handles.count(next) && dist_thru <= forward_max_len) { - // we can add more nodes along same path without going over the max length - // and we have not reached the target node yet - queue.emplace(next, dist_thru); -#ifdef debug_vg_algorithms - cerr << "BACKWARD SEARCH: distance " << dist_thru << " is under maximum, adding to queue" << endl; -#endif - } - - observed_edges.insert(source->edge_handle(trav.handle, next)); - }); - } - } - // add the edges we saw for (const edge_t& edge : observed_edges) { - into->create_edge(edge); + into->create_edge(into->get_handle(source->get_id(edge.first), source->get_is_reverse(edge.first)), + into->get_handle(source->get_id(edge.second), source->get_is_reverse(edge.second))); } #ifdef debug_vg_algorithms @@ -290,7 +213,7 @@ unordered_map extract_connecting_graph(const HandleGraph* source, }); #endif - // STEP 3: DUPLICATING NODES + // STEP 2: DUPLICATING NODES // if we're trying to detect terminal cycles, duplicate out the node so that the cyclic paths // survive the node cutting step @@ -341,99 +264,7 @@ unordered_map extract_connecting_graph(const HandleGraph* source, return dup_handle; }; - id_t duplicate_node_1 = 0, duplicate_node_2 = 0; - - if (detect_terminal_cycles) { - // if there are edges traversed in both directions from the boundary position's nodes, then - // they might be in a cycle - bool has_left_edges_1 = false, has_left_edges_2 = false, has_right_edges_1 = false, has_right_edges_2 = false; - into->follow_edges(into_handle_1, true, [&](const handle_t& ignored) { - has_left_edges_1 = true; - return false; - }); - into->follow_edges(into_handle_1, false, [&](const handle_t& ignored) { - has_right_edges_1 = true; - return false; - }); - into->follow_edges(into_handle_2, true, [&](const handle_t& ignored) { - has_left_edges_2 = true; - return false; - }); - into->follow_edges(into_handle_2, false, [&](const handle_t& ignored) { - has_right_edges_2 = true; - return false; - }); - - bool possibly_in_cycle_1 = has_left_edges_1 && has_right_edges_1; - bool 
possibly_in_cycle_2 = has_left_edges_2 && has_right_edges_2; - - // logic changes depending on colocation of positions on same node - switch (colocation) { - case SeparateNodes: - { - // the two positions are on separate nodes, so we can duplicate cycles independently - - if (possibly_in_cycle_1) { - duplicate_node(into_handle_1, true, true); - } - - if (possibly_in_cycle_2) { - duplicate_node(into_handle_2, true, true); - } - break; - } - case SharedNodeReachable: - { - // one position is reachable from the next within the same node - - if (possibly_in_cycle_1) { - - // later, we're going to trim this node to it's middle portion between the two positions - // so now that we want to preserve cycles, we need to make two new nodes that will hold - // the prefix and suffix of the node so that the edges have somewhere to attach to - - - duplicate_node_1 = into->get_id(duplicate_node(into_handle_1, false, true)); - duplicate_node_2 = into->get_id(duplicate_node(into_handle_2, true, false)); - - // we also need a simple duplicate node in case there are any cycles that pass all the way - // through the node - - duplicate_node(into_handle_2, true, true); - } - break; - } - case SharedNodeUnreachable: - case SharedNodeReverse: - { - // all paths between these positions are cyclical, but we still duplicate the node - // so that any cycles that pass all the way through the node are there to be accepted - // or rejected by the distance filter even after we cut the node up - - if (possibly_in_cycle_1) { - duplicate_node(into_handle_1, true, true); - } - break; - } - } - } - -#ifdef debug_vg_algorithms - cerr << "state of graph after duplicating nodes to preserve cycles:" << endl; - into->for_each_handle([&](const handle_t& handle) { - cerr << "node " << into->get_id(handle) << " " << into->get_sequence(handle) << endl; - cerr << "\tleft" << endl; - into->follow_edges(handle, true, [&](const handle_t& next) { - cerr << "\t\t" << into->get_id(next) << (into->get_is_reverse(next) ? "-" : "+") << endl; - }); - cerr << "\tright" << endl; - into->follow_edges(handle, false, [&](const handle_t& next) { - cerr << "\t\t" << into->get_id(next) << (into->get_is_reverse(next) ? 
"-" : "+") << endl; - }); - }); -#endif - - // STEP 4: CUTTING NODES + // STEP 3: CUTTING NODES // now cut the two end nodes at the designated positions and remove the edges on the cut side // to make the end positions tips in the graph @@ -443,52 +274,24 @@ unordered_map extract_connecting_graph(const HandleGraph* source, case SeparateNodes: { // split the node, update the IDs, and clean up the other side - auto halves_1 = into->divide_handle(into_handle_1, offset(pos_1)); + cut_handle_1 = into->truncate_handle(into_handle_1, true, offset(pos_1)); id_trans.erase(id(pos_1)); - id_trans[into->get_id(halves_1.second)] = id(pos_1); - into->destroy_handle(halves_1.first); - cut_handle_1 = halves_1.second; + id_trans[into->get_id(cut_handle_1)] = id(pos_1); // repeat for the second position - auto halves_2 = into->divide_handle(into_handle_2, offset(pos_2)); + cut_handle_2 = into->truncate_handle(into_handle_2, false, offset(pos_2)); id_trans.erase(id(pos_2)); - id_trans[into->get_id(halves_2.first)] = id(pos_2); - into->destroy_handle(halves_2.second); - cut_handle_2 = halves_2.first; + id_trans[into->get_id(cut_handle_2)] = id(pos_2); break; } case SharedNodeReachable: { // split the node, update the IDs, and clean up the two ends - auto thirds = into->divide_handle(into_handle_2, vector{offset(pos_1), offset(pos_2)}); + cut_handle_1 = into->truncate_handle(into->truncate_handle(into_handle_2, false, offset(pos_2)), true, offset(pos_1)); id_trans.erase(id(pos_1)); - id_trans[into->get_id(thirds[1])] = id(pos_1); - into->destroy_handle(thirds.front()); - into->destroy_handle(thirds.back()); - cut_handle_1 = thirds[1]; - cut_handle_2 = thirds[1]; - - // if we created duplicate nodes to hold the right and left side edges in cycles, cut - // those as well, update the IDs, and then clean up the other side - if (duplicate_node_1) { - handle_t dup_handle = into->get_handle(duplicate_node_1, is_rev(pos_1)); - auto halves = into->divide_handle(dup_handle, offset(pos_1)); - id_trans.erase(duplicate_node_1); - id_trans[into->get_id(halves.second)] = id(pos_1); - duplicate_node_1 = into->get_id(halves.second); - into->destroy_handle(halves.first); - } - - if (duplicate_node_2) { - handle_t dup_handle = into->get_handle(duplicate_node_2, is_rev(pos_2)); - auto halves = into->divide_handle(dup_handle, offset(pos_2)); - id_trans.erase(duplicate_node_2); - id_trans[into->get_id(halves.first)] = id(pos_2); - duplicate_node_2 = into->get_id(halves.first); - into->destroy_handle(halves.second); - } - + id_trans[into->get_id(cut_handle_1)] = id(pos_1); + cut_handle_2 = cut_handle_1; break; } case SharedNodeUnreachable: @@ -496,17 +299,13 @@ unordered_map extract_connecting_graph(const HandleGraph* source, { // make a new node that will preserve the edges on the righthand side handle_t dup_node = duplicate_node(into_handle_1, false, true); - auto halves_1 = into->divide_handle(dup_node, offset(pos_1)); - id_trans[into->get_id(halves_1.second)] = id(pos_1); - into->destroy_handle(halves_1.first); - cut_handle_1 = halves_1.second; + cut_handle_1 = into->truncate_handle(dup_node, true, offset(pos_1)); + id_trans[into->get_id(cut_handle_1)] = id(pos_1); // cut the original node and preserve its lefthand side edges - auto halves_2 = into->divide_handle(into_handle_2, offset(pos_2)); + cut_handle_2 = into->truncate_handle(into_handle_2, false, offset(pos_2)); id_trans.erase(id(pos_2)); - id_trans[into->get_id(halves_2.first)] = id(pos_2); - into->destroy_handle(halves_2.second); - cut_handle_2 = halves_2.first; + 
id_trans[into->get_id(cut_handle_2)] = id(pos_2); break; } @@ -527,7 +326,7 @@ unordered_map extract_connecting_graph(const HandleGraph* source, }); #endif - // STEP 5: PRUNING + // STEP 4: PRUNING // the graph now contains all the paths we've indicated and the end positions are tips, we now // provide three options for pruning away any unnecessary nodes and edges we've added in the // process of searching for the subgraph that has this guarantee @@ -542,36 +341,8 @@ unordered_map extract_connecting_graph(const HandleGraph* source, // above the maximum distance, so we do a forward-backward distance search to check // compute the minimum distance from the two start point s - unordered_map forward_dist = find_shortest_paths(into, cut_handle_1, false); - unordered_map reverse_dist = find_shortest_paths(into, cut_handle_2, true); - - // also consider minimum distances from alternate start points if we have them - if (duplicate_node_1) { - auto alt_forward_dist = find_shortest_paths(into, into->get_handle(duplicate_node_1, is_rev(pos_1)), false); - for (const auto& alt_dist : alt_forward_dist) { - auto iter = forward_dist.find(alt_dist.first); - if (iter != forward_dist.end()) { - iter->second = min(iter->second, alt_dist.second); - } - else { - forward_dist.insert(alt_dist); - } - } - } - - // also consider minimum distances from alternate start points if we have them - if (duplicate_node_2) { - auto alt_reverse_dist = find_shortest_paths(into, into->get_handle(duplicate_node_2, is_rev(pos_2)), true); - for (const auto& alt_dist : alt_reverse_dist) { - auto iter = reverse_dist.find(alt_dist.first); - if (iter != reverse_dist.end()) { - iter->second = min(iter->second, alt_dist.second); - } - else { - reverse_dist.insert(alt_dist); - } - } - } + unordered_map forward_dist = handlealgs::find_shortest_paths(into, cut_handle_1, false); + unordered_map reverse_dist = handlealgs::find_shortest_paths(into, cut_handle_2, true); // now we have the lengths of the shortest path remaining in graph to and from each node // with these, we can compute the shortest path that uses each node and edge to see if it @@ -652,101 +423,47 @@ unordered_map extract_connecting_graph(const HandleGraph* source, } }); } - else if (only_paths) { + else { // OPTION 2: PRUNE TO PATHS - // some nodes in the current graph may not be on paths, so we do a forward-backward - // reachability search to check + // some nodes in the current graph may not be on paths between the cut points, + // so we do a reverse traversal (the initial traversal establishes forward reachability) + + unordered_set reverse_reachable{cut_handle_2}; + reverse_reachable.reserve(into->get_node_count()); + vector stack(1, cut_handle_2); - auto identify_reachable = [&](const vector& starts, bool search_leftward) { - - unordered_set reachable; - for (handle_t handle : starts){ - reachable.insert(handle); - } - auto stack = starts; - #ifdef debug_vg_algorithms - cerr << "REACHABILILTY PRUNE: beginning reachability test " << (search_leftward ? "backward" : "forward") << " from "; - for (auto handle : stack) { - cerr << into->get_id(handle) << (into->get_is_reverse(handle) ? "-" : "+") << " "; - } - cerr << endl; + cerr << "REACHABILILTY PRUNE: beginning reachability test backward from "; + for (auto handle : stack) { + cerr << into->get_id(handle) << (into->get_is_reverse(handle) ? 
"-" : "+") << " "; + } + cerr << endl; #endif + + while (!stack.empty()) { + handle_t handle = stack.back(); + stack.pop_back(); - while (!stack.empty()) { - handle_t handle = stack.back(); - stack.pop_back(); - #ifdef debug_vg_algorithms - cerr << "REACHABILILTY PRUNE: traversing node " << into->get_id(handle) << " in " << (into->get_is_reverse(handle) ? "reverse" : "forward") << endl; + cerr << "REACHABILILTY PRUNE: traversing node " << into->get_id(handle) << " in " << (into->get_is_reverse(handle) ? "reverse" : "forward") << endl; #endif - - into->follow_edges(handle, search_leftward, [&](const handle_t& next) { - if (!reachable.count(next)) { - stack.emplace_back(next); - reachable.insert(next); - }; - }); - } - return reachable; - }; - - vector forward_starts, reverse_starts; - - // identify the search origins - forward_starts.emplace_back(cut_handle_1); - reverse_starts.emplace_back(cut_handle_2); - - // if we duplicated the start nodes, add those too - if (duplicate_node_1) { - forward_starts.emplace_back(into->get_handle(duplicate_node_1, into->get_is_reverse(cut_handle_1))); - } - if (duplicate_node_2) { - reverse_starts.emplace_back(into->get_handle(duplicate_node_2, into->get_is_reverse(cut_handle_2))); + into->follow_edges(handle, true, [&](const handle_t& next) { + if (!reverse_reachable.count(next)) { + stack.emplace_back(next); + reverse_reachable.insert(next); + }; + }); } - unordered_set forward_reachable = identify_reachable(forward_starts, false); - unordered_set reverse_reachable = identify_reachable(reverse_starts, true); - // now we know which nodes are reachable from both ends, to be on a path between the end positions, // a node or edge must be reachable from both directions - // is node reachable from both positions? - auto should_remove_node = [&](const handle_t& handle) { - handle_t flipped = into->flip(handle); - return !bool((forward_reachable.count(handle) && reverse_reachable.count(handle)) - || (forward_reachable.count(flipped) && reverse_reachable.count(flipped))); - }; - - // is edge reachable from both positions? 
- auto should_remove_edge = [&](const handle_t& prev, const handle_t& next) { - handle_t flipped_prev = into->flip(next); - handle_t flipped_next = into->flip(prev); - - return !bool((forward_reachable.count(prev) && reverse_reachable.count(next)) - || (forward_reachable.count(flipped_prev) && reverse_reachable.count(flipped_next))); - }; - - // apply the tests to each node/edge and collect the results + // apply the tests to each node and collect the results into->for_each_handle([&](const handle_t& handle) { - - if (should_remove_node(handle)) { + if (!reverse_reachable.count(handle) && !reverse_reachable.count(into->flip(handle))) { nodes_to_erase.insert(handle); } - else { - into->follow_edges(handle, false, [&](const handle_t& next) { - if (should_remove_edge(handle, next)) { - edges_to_erase.insert(into->edge_handle(handle, next)); - } - }); - - into->follow_edges(handle, true, [&](const handle_t& prev) { - if (should_remove_edge(prev, handle)) { - edges_to_erase.insert(into->edge_handle(prev, handle)); - } - }); - } }); } @@ -757,8 +474,10 @@ unordered_map extract_connecting_graph(const HandleGraph* source, // and the edges for (const edge_t& edge : edges_to_erase) { - if (!nodes_to_erase.count(into->forward(edge.first)) - && !nodes_to_erase.count(into->forward(edge.second))) { + if (!nodes_to_erase.count(edge.first) + && !nodes_to_erase.count(into->flip(edge.first)) + && !nodes_to_erase.count(edge.second) + && !nodes_to_erase.count(into->flip(edge.second))) { into->destroy_edge(edge); } } diff --git a/src/algorithms/extract_connecting_graph.hpp b/src/algorithms/extract_connecting_graph.hpp index f3de203fd79..7f8892cd745 100644 --- a/src/algorithms/extract_connecting_graph.hpp +++ b/src/algorithms/extract_connecting_graph.hpp @@ -8,29 +8,21 @@ */ #include +#include #include "../position.hpp" -#include "../cached_position.hpp" #include "../handle.hpp" -#include "../vg.pb.h" #include "../hash_map.hpp" -#include "find_shortest_paths.hpp" - namespace vg { namespace algorithms { - /// Fills a MutableHandleGraph with the subgraph of a HandleGraph that connects two positions. The nodes that - /// contain the two positions will be 'cut' at the position and will be tips in the returned graph. By default, - /// the algorithm provides only one guarantee: - /// - 'into' contains all walks between pos_1 and pos_2 under the maximum length except walks that include - /// a cycle involving either position - /// Cutting the nodes containing the two positions breaks cycles containing those nodes, so these nodes may - /// optinally be duplicated so that cycles involving the two positions are maintained. No other nodes will be - /// duplicated. The algorithm optionally provides additional guarantees at the expense of increased computational - /// cost, but no increase in asymptotic complexity (the guarantees are described below). If no walk between the - /// two positions under the maximum length exists, 'into' will be left empty. An error is thrown if 'into' is - /// not empty when passed to function. + /// Fills a DeletableHandleGraph with the subgraph of a HandleGraph that connects two positions. The nodes + /// that contain the two positions will be 'cut' at the position and will be tips in the returned graph. The + /// algorithm guarantees that 'into' contains all walks between pos_1 and pos_2 under the maximum length + /// except walks that include a cycle involving either position. If no walk between the two positions under + /// the maximum length exists, 'into' will be left empty. 
An error is thrown if 'into' is not empty when + /// passed to function. /// /// Args: /// source graph to extract subgraph from @@ -38,16 +30,14 @@ namespace algorithms { /// max_len guarantee finding walks along which pos_1 and pos_2 are this distance apart /// pos_1 start position, subgraph walks begin from here in same orientation /// pos_2 end position, subgraph walks end here in the same orientation - /// detect_terminal_cycles also find walks that include cycles involving pos_1 and/or pos_2 - /// only_walks only extract nodes and edges if they fall on some walk between pos_1 and pos_2 /// strict_max_len only extract nodes and edges if they fall on some walk between pos_1 and pos_2 /// that is under the maximum length (implies only_walks = true) + /// + /// Returns: a map from node ids in the extracted graph to the node ids in the original graph unordered_map extract_connecting_graph(const HandleGraph* source, - MutableHandleGraph* into, + DeletableHandleGraph* into, int64_t max_len, pos_t pos_1, pos_t pos_2, - bool detect_terminal_cycles = false, - bool only_walks = false, bool strict_max_len = false); } diff --git a/src/algorithms/extract_containing_graph.cpp b/src/algorithms/extract_containing_graph.cpp index cfa4d2f326d..8bd17c49c87 100644 --- a/src/algorithms/extract_containing_graph.cpp +++ b/src/algorithms/extract_containing_graph.cpp @@ -5,7 +5,6 @@ */ #include "extract_containing_graph.hpp" -#include //#define debug_vg_algorithms @@ -18,17 +17,18 @@ void extract_containing_graph(const HandleGraph* source, MutableHandleGraph* into, const vector& positions, const vector& forward_search_lengths, - const vector& backward_search_lengths) { + const vector& backward_search_lengths, + size_t reversing_walk_length) { if (forward_search_lengths.size() != backward_search_lengths.size() || forward_search_lengths.size() != positions.size()) { cerr << "error:[extract_containing_graph] subgraph extraction search lengths do not match seed positions" << endl; - assert(false); + exit(1); } - if (into->node_size()) { + if (into->get_node_count()) { cerr << "error:[extract_containing_graph] must extract into an empty graph" << endl; - assert(false); + exit(1); } #ifdef debug_vg_algorithms @@ -38,28 +38,27 @@ void extract_containing_graph(const HandleGraph* source, } #endif - // TODO: struct is duplicative with extract_connecting_graph - // a local struct that packages a handle with its distance from the first position - struct Traversal { - Traversal(handle_t handle, int64_t dist) : handle(handle), dist(dist) {} - int64_t dist; // distance from pos to the right side of this node - handle_t handle; // Oriented node traversal - inline bool operator<(const Traversal& other) const { - return dist > other.dist; // opposite order so priority queue selects minimum - } - }; + // computing search distances relative to this maximum allows us to keep the searches + // from all of the seed nodes in the same priority queue so that we only need to do + // one Dijkstra traversal + int64_t max_search_length = max(*std::max_element(forward_search_lengths.begin(), forward_search_lengths.end()), + *std::max_element(backward_search_lengths.begin(), backward_search_lengths.end())); - size_t max_search_length = max(*std::max_element(forward_search_lengths.begin(), forward_search_lengths.end()), - *std::max_element(backward_search_lengths.begin(), backward_search_lengths.end())); - unordered_set observed_ids; - unordered_set observed_edges; +#ifdef debug_vg_algorithms + cerr << "[extract_containing_graph] artificial 
offset calculated to be " << max_search_length << endl; +#endif + // for keeping track of all the edges we cross to add later + // + // we use spp because the order of the edges affects some tie-breaking behavior + // later on that we want to remain system independent (i.e. no dependence on the + // system stdlib) + spp::sparse_hash_set observed_edges; - // initialize the queue - UpdateablePriorityQueue queue([](const Traversal& item) { - return item.handle; - }); + // initialize the queue, opposite order so priority queue selects minimum + // priority represent distance from starting pos to the left side of this node + RankPairingHeap> queue; for (size_t i = 0; i < positions.size(); i++) { @@ -67,71 +66,102 @@ void extract_containing_graph(const HandleGraph* source, handle_t source_handle = source->get_handle(id(pos), false); // add all of the initial nodes to the graph - if (!observed_ids.count(id(pos))) { + if (!into->has_node(id(pos))) { into->create_handle(source->get_sequence(source_handle), id(pos)); - observed_ids.insert(id(pos)); } - // adding this extra distance allows us to keep the searches from all of the seed nodes in - // the same priority queue so that we only need to do one Dijkstra traversal + // compute the modified search lengths + int64_t dist_forward = -offset(pos) + max_search_length - forward_search_lengths[i]; + int64_t dist_backward = offset(pos) - source->get_length(source_handle) + max_search_length - backward_search_lengths[i]; // add a traversal for each direction - size_t dist_forward = source->get_length(source_handle) - offset(pos) + max_search_length - forward_search_lengths[i]; - size_t dist_backward = offset(pos) + max_search_length - backward_search_lengths[i]; - if (dist_forward < max_search_length) { - queue.emplace(is_rev(pos) ? source->flip(source_handle) : source_handle, dist_forward); - } - if (dist_backward < max_search_length) { - queue.emplace(is_rev(pos) ? source_handle : source->flip(source_handle), dist_backward); - } + queue.push_or_reprioritize(is_rev(pos) ? source->flip(source_handle) : source_handle, dist_forward); + queue.push_or_reprioritize(is_rev(pos) ? source_handle : source->flip(source_handle), dist_backward); +#ifdef debug_vg_algorithms + cerr << "[extract_containing_graph] init enqueue " << id(pos) << " " << is_rev(pos) << ": " << dist_forward << endl; + cerr << "[extract_containing_graph] init enqueue " << id(pos) << " " << !is_rev(pos) << ": " << dist_backward << endl; +#endif } while (!queue.empty()) { // get the next shortest distance traversal from either the init - Traversal trav = queue.top(); + pair trav = queue.top(); queue.pop(); - source->follow_edges(trav.handle, false, [&](const handle_t& next) { - // Look locally right from this position - - // Get the ID of where we're going. 
- id_t next_id = source->get_id(next); + +#ifdef debug_vg_algorithms + cerr << "[extract_containing_graph] dequeue " << source->get_id(trav.first) << " " << source->get_is_reverse(trav.first) << ": " << trav.second << endl; +#endif + + // make sure the node is in the graph + if (!into->has_node(source->get_id(trav.first))) { +#ifdef debug_vg_algorithms + cerr << "[extract_containing_graph] adding node to subgraph" << endl; +#endif + into->create_handle(source->get_sequence(source->forward(trav.first)), source->get_id(trav.first)); + } + + int64_t dist_thru = trav.second + source->get_length(trav.first); + + if (dist_thru < max_search_length) { + // we can add more nodes along same path without going over the max length - // record the edge - observed_edges.insert(source->edge_handle(trav.handle, next)); + // look locally right from this position + source->follow_edges(trav.first, false, [&](const handle_t& next) { + // record the edge + observed_edges.insert(source->edge_handle(trav.first, next)); + // add it to the queue + queue.push_or_reprioritize(next, dist_thru); +#ifdef debug_vg_algorithms + cerr << "[extract_containing_graph] traverse and (possibly) enqueue " << source->get_id(next) << " " << source->get_is_reverse(next) << ": " << dist_thru << endl; +#endif + }); + } + + // if we're allowing reversing walks and this isn't one of the starting positions... + if (reversing_walk_length > 0 && trav.second > 0) { - // make sure the node is in the graph - if (!observed_ids.count(next_id)) { - into->create_handle(source->get_sequence(source->forward(next)), next_id); - observed_ids.insert(next_id); - } + // choose a distance that will let the reversing walk only as far as the minimum of the max + // search length or the reverse length + int64_t synthetic_dist = max_search_length > reversing_walk_length ? 
max_search_length - reversing_walk_length : 0; - // distance to the end of this node - int64_t dist_thru = trav.dist + source->get_length(next); - if (dist_thru < max_search_length) { - // we can add more nodes along same path without going over the max length - queue.emplace(next, dist_thru); - } - }); + handle_t flipped = source->flip(trav.first); + // look locally right from this position + source->follow_edges(flipped, false, [&](const handle_t& next) { + // record the edge + observed_edges.insert(source->edge_handle(flipped, next)); + + // add it to the queue + queue.push_or_reprioritize(next, max_search_length - reversing_walk_length); +#ifdef debug_vg_algorithms + cerr << "[extract_containing_graph] reverse walk and (possibly) enqueue " << source->get_id(next) << " " << source->get_is_reverse(next) << ": " << max_search_length - reversing_walk_length << endl; +#endif + }); + } } // add the edges to the graph for (const edge_t& edge : observed_edges) { +#ifdef debug_vg_algorithms + cerr << "[extract_containing_graph] adding edge " << source->get_id(edge.first) << " " << source->get_is_reverse(edge.first) << " -> " << source->get_id(edge.second) << " " << source->get_is_reverse(edge.second) << endl; +#endif into->create_edge(into->get_handle(source->get_id(edge.first), source->get_is_reverse(edge.first)), into->get_handle(source->get_id(edge.second), source->get_is_reverse(edge.second))); } } void extract_containing_graph(const HandleGraph* source, MutableHandleGraph* into, const vector& positions, - size_t max_dist) { + size_t max_dist, size_t reversing_walk_length) { - return extract_containing_graph(source, into, positions, vector(positions.size(), max_dist)); + extract_containing_graph(source, into, positions, vector(positions.size(), max_dist), + reversing_walk_length); } void extract_containing_graph(const HandleGraph* source, MutableHandleGraph* into, const vector& positions, - const vector& position_max_dist) { + const vector& position_max_dist, size_t reversing_walk_length) { - return extract_containing_graph(source, into, positions, position_max_dist, position_max_dist); + extract_containing_graph(source, into, positions, position_max_dist, position_max_dist, + reversing_walk_length); } } diff --git a/src/algorithms/extract_containing_graph.hpp b/src/algorithms/extract_containing_graph.hpp index 732cb921d54..c3480c998a8 100644 --- a/src/algorithms/extract_containing_graph.hpp +++ b/src/algorithms/extract_containing_graph.hpp @@ -10,32 +10,37 @@ #include #include "../position.hpp" -#include "../xg.hpp" -#include "../vg.hpp" -#include "../vg.pb.h" #include "../handle.hpp" -#include "../hash_map.hpp" + +#include +#include namespace vg { namespace algorithms { /// Fills graph 'into' with the subgraph of the handle graph 'source' that contains all of the /// positions in the positions vector and all other nodes and edges that can be reached within - /// a maximum distance from any of these positions. Node IDs in the subgraph are retained from - /// the source graph. + /// a maximum distance from any of these positions. Optionally also finds nodes and edges that + /// can be reached within some distance from the previously mentioned nodes, except along non- + /// proper bidirected walks. Node IDs in the subgraph are retained from the source graph. 
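+ /// (A reversing walk flips onto the opposite strand of a node the search has already reached and continues for up to reversing_walk_length further bases, bounded by the overall search distance.)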
/// /// Args: - /// source graph to extract subgraph from - /// into graph to extract into - /// positions search outward from these positions - /// max_dist include all nodes and edges that can be reached in at most this distance - void extract_containing_graph(const HandleGraph* source, MutableHandleGraph* into, const vector& positions, size_t max_dist); + /// source graph to extract subgraph from + /// into graph to extract into + /// positions search outward from these positions + /// max_dist include all nodes and edges that can be reached in at most this distance + /// reversing_walk_length also find graph material that can be reached + void extract_containing_graph(const HandleGraph* source, MutableHandleGraph* into, + const vector& positions, size_t max_dist, + size_t reversing_walk_length = 0); /// Same semantics as previous except that there is a separate maximum distance for different /// positions in the graph. Each distance is associated with the position with the same index. Throws /// an error if the position and distance vectors are not the same length. - void extract_containing_graph(const HandleGraph* source, MutableHandleGraph* into, const vector& positions, - const vector& position_max_dist); + void extract_containing_graph(const HandleGraph* source, MutableHandleGraph* into, + const vector& positions, + const vector& position_max_dist, + size_t reversing_walk_length = 0); /// Same semantics as previous except that there is a separate maximum distance for different @@ -43,9 +48,11 @@ namespace algorithms { /// position with the same index. The forward distance is in the same orientation as the position, /// and the backward distance is in the reverse orientation of the position. Throws an error if /// the position and distance vectors are not the same length. - void extract_containing_graph(const HandleGraph* source, MutableHandleGraph* into, const vector& positions, + void extract_containing_graph(const HandleGraph* source, MutableHandleGraph* into, + const vector& positions, const vector& position_forward_max_dist, - const vector& position_backward_max_dist); + const vector& position_backward_max_dist, + size_t reversing_walk_length = 0); } } diff --git a/src/algorithms/extract_extending_graph.cpp b/src/algorithms/extract_extending_graph.cpp index 0484f654802..80a04a302ba 100644 --- a/src/algorithms/extract_extending_graph.cpp +++ b/src/algorithms/extract_extending_graph.cpp @@ -14,10 +14,10 @@ namespace algorithms { using namespace structures; -unordered_map extract_extending_graph(const HandleGraph* source, MutableHandleGraph* into, int64_t max_dist, pos_t pos, +unordered_map extract_extending_graph(const HandleGraph* source, DeletableHandleGraph* into, int64_t max_dist, pos_t pos, bool backward, bool preserve_cycles_on_src_node) { - if (into->node_size()) { + if (into->get_node_count()) { cerr << "error:[extract_extending_graph] must extract into an empty graph" << endl; assert(false); } @@ -38,7 +38,8 @@ unordered_map extract_extending_graph(const HandleGraph* source, Mut } }; - // a map from node ids in the extracted graph to the node ids in the original graph + // A map from node ids in the extracted graph to the node ids in the original graph. + // The IDs are always the same except for when we cut/duplicate the source node. 
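As a hypothetical usage sketch (not part of this change), the translation map described above would be consumed roughly like this; the graph type and distances are placeholders, and the return type is taken on faith via auto:

```
#include "algorithms/extract_extending_graph.hpp"

#include <bdsg/hash_graph.hpp>

// Walk up to 200 bp rightward from 'from' into an empty graph, without
// duplicating the start node to preserve cycles on it.
void extend_right(const vg::HandleGraph* source, vg::pos_t from) {
    bdsg::HashGraph extension;
    auto id_translation = vg::algorithms::extract_extending_graph(source, &extension, 200, from,
                                                                  false, false);
    // id_translation maps node IDs in 'extension' back to node IDs in 'source'.
    // The node containing 'from' may have been cut, so positions on that node
    // need the special handling described in the header documentation.
}
```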
unordered_map id_trans; // a graph index that we will maintain as we extract the subgraph @@ -159,7 +160,7 @@ unordered_map extract_extending_graph(const HandleGraph* source, Mut } if (add_edge) { - into->create_edge(edge.first, edge.second); + into->create_edge(into->get_handle(source->get_id(edge.first), source->get_is_reverse(edge.first)), into->get_handle(source->get_id(edge.second), source->get_is_reverse(edge.second))); } } diff --git a/src/algorithms/extract_extending_graph.hpp b/src/algorithms/extract_extending_graph.hpp index 98e9bc1f4c1..474012b7aa0 100644 --- a/src/algorithms/extract_extending_graph.hpp +++ b/src/algorithms/extract_extending_graph.hpp @@ -8,10 +8,9 @@ */ #include +#include #include "../position.hpp" -#include "../cached_position.hpp" -#include "../vg.pb.h" #include "../hash_map.hpp" #include "../handle.hpp" @@ -21,9 +20,11 @@ namespace algorithms { /// Fills graph 'into' with the subgraph of the handle graph 'source' that extends in one direction from /// a given position, up to a maximum distance. The node containing the position will be "cut" so that only /// the portion that is forward in the search direction remains. Node IDs may be changed in the extracted - /// graph, but they can be translated back to node IDs in the original graph with the returned map. The - /// node containing the source node may optionally be duplicated to preserve cycles on it after its cut, - /// but no other nodes will will duplicated. + /// graph, but they can be translated back to node IDs in the original graph with the returned map, although + /// that translation procedure MUST handle the node that pos is on specially, as it may be cut. + /// translate_node_ids from path.hpp can do this as long as you pass along what part of the node was removed. + /// The node containing the source position may optionally be duplicated to preserve cycles on it after its + /// cut, but no other nodes will will duplicated. /// /// Args: /// source graph to extract subgraph from @@ -32,7 +33,7 @@ namespace algorithms { /// pos extend from this position /// backward extend in this direction /// preserve_cycles_on_src if necessary, duplicate starting node to preserve cycles after cutting it - unordered_map extract_extending_graph(const HandleGraph* source, MutableHandleGraph* into, int64_t max_dist, pos_t pos, + unordered_map extract_extending_graph(const HandleGraph* source, DeletableHandleGraph* into, int64_t max_dist, pos_t pos, bool backward, bool preserve_cycles_on_src_node); } diff --git a/src/algorithms/find_gbwt.cpp b/src/algorithms/find_gbwt.cpp new file mode 100644 index 00000000000..3d5f1b4e18d --- /dev/null +++ b/src/algorithms/find_gbwt.cpp @@ -0,0 +1,39 @@ +/** + * \file find_gbwt.cpp + */ + +#include "find_gbwt.hpp" +#include "find_gbwtgraph.hpp" +#include + +#include "../gbzgraph.hpp" + +namespace vg { +namespace algorithms { + +const gbwt::GBWT* find_gbwt(const HandleGraph* graph) { + const gbwtgraph::GBWTGraph* typed_graph = find_gbwtgraph(graph); + if (!typed_graph) { + return nullptr; + } + return typed_graph->index; +} + +const gbwt::GBWT* find_gbwt(const HandleGraph* graph, std::unique_ptr& holder, const std::string& filename) { + if (!filename.empty()) { + holder = vg::io::VPKG::load_one(filename); + + if (holder.get() == nullptr) { + // Complain if we couldn't get it but were supposed to. 
+ cerr << "error:[vg::algorithms::find_gbwt] unable to load gbwt index file " << filename << endl; + exit(1); + } + + return holder.get(); + } + // If we don't need to load it, try and get it from the graph. + return find_gbwt(graph); +} + +} +} diff --git a/src/algorithms/find_gbwt.hpp b/src/algorithms/find_gbwt.hpp new file mode 100644 index 00000000000..0ac8d959099 --- /dev/null +++ b/src/algorithms/find_gbwt.hpp @@ -0,0 +1,33 @@ +#ifndef VG_ALGORITHMS_FIND_GBWT_HPP_INCLUDED +#define VG_ALGORITHMS_FIND_GBWT_HPP_INCLUDED + +/** + * \file find_gbwtgraph.hpp + * + * Defines an algorithm for finding the GBWTGraph associated with a handle graph, if any. + */ + +#include "../handle.hpp" +#include + +namespace vg { +namespace algorithms { +using namespace std; + +/** + * Find the GBWT that is part of the given handle graph, if any exists. + * Works on GBWTGraphs and GBZGraphs. + * Returns null if no such GBWT exists. + */ +const gbwt::GBWT* find_gbwt(const HandleGraph* graph); + +/** + * Find a GBWT either by getting it from the given graph or loading it from the + * given filename into the given unique_ptr. + */ +const gbwt::GBWT* find_gbwt(const HandleGraph* graph, std::unique_ptr& holder, const std::string& filename); + +} +} + +#endif diff --git a/src/algorithms/find_gbwtgraph.cpp b/src/algorithms/find_gbwtgraph.cpp new file mode 100644 index 00000000000..b7ae4008348 --- /dev/null +++ b/src/algorithms/find_gbwtgraph.cpp @@ -0,0 +1,30 @@ +/** + * \file find_gbwtgraph.cpp + */ + +#include "find_gbwtgraph.hpp" + +#include "../gbzgraph.hpp" + +namespace vg { +namespace algorithms { + +const gbwtgraph::GBWTGraph* find_gbwtgraph(const HandleGraph* graph) { + if (!graph) { + // No graph means no translation. + return nullptr; + } + if (dynamic_cast(graph)) { + // If it already is one, return it + return dynamic_cast(graph); + } + if (dynamic_cast(graph)) { + // If it's a GBZGraph, go get the GBWTGraph and return that. + return &dynamic_cast(graph)->gbz.graph; + } + // Otherwise there's no applicable GBWTGraph + return nullptr; +} + +} +} diff --git a/src/algorithms/find_gbwtgraph.hpp b/src/algorithms/find_gbwtgraph.hpp new file mode 100644 index 00000000000..c81f6785341 --- /dev/null +++ b/src/algorithms/find_gbwtgraph.hpp @@ -0,0 +1,27 @@ +#ifndef VG_ALGORITHMS_FIND_GBWTGRAPH_HPP_INCLUDED +#define VG_ALGORITHMS_FIND_GBWTGRAPH_HPP_INCLUDED + +/** + * \file find_gbwtgraph.hpp + * + * Defines an algorithm for finding the GBWTGraph associated with a handle graph, if any. + */ + +#include "../handle.hpp" +#include + +namespace vg { +namespace algorithms { +using namespace std; + +/** + * Find the GBWTGraph that is part of the given handle graph, if any exists. + * Works on GBWTGraphs and GBZGraphs. + * Returns null if no such GBWTGraph exists. + */ +const gbwtgraph::GBWTGraph* find_gbwtgraph(const HandleGraph* graph); + +} +} + +#endif diff --git a/src/algorithms/find_shortest_paths.cpp b/src/algorithms/find_shortest_paths.cpp deleted file mode 100644 index c7db2b2f85b..00000000000 --- a/src/algorithms/find_shortest_paths.cpp +++ /dev/null @@ -1,85 +0,0 @@ -/** - * \file find_shortest_paths.cpp - * - * Implementation for the find_shortest_paths algorithm. 
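As an illustrative aside (not part of this change), the two find_gbwt overloads are meant to be used along these lines; the helper name and messages are made up:

```
#include "algorithms/find_gbwt.hpp"

#include <iostream>
#include <memory>
#include <string>

// Get a GBWT from a separate index file if one was named, otherwise from the
// GBWTGraph or GBZGraph itself (or report that none is available).
void describe_gbwt(const vg::HandleGraph* graph, const std::string& gbwt_filename) {
    std::unique_ptr<gbwt::GBWT> loaded;
    const gbwt::GBWT* index = vg::algorithms::find_gbwt(graph, loaded, gbwt_filename);
    if (index == nullptr) {
        std::cerr << "No GBWT is available for this graph" << std::endl;
    } else {
        std::cerr << "Found a GBWT" << (gbwt_filename.empty() ? " attached to the graph" : " on disk") << std::endl;
    }
}
```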
- */ - -#include "find_shortest_paths.hpp" -#include - -namespace vg { -namespace algorithms { - -using namespace structures; - -unordered_map find_shortest_paths(const HandleGraph* g, handle_t start, - bool traverse_leftward) { - - // This is the minimum distance to each handle - unordered_map distances; - - // We keep a priority queue so we can visit the handle with the shortest - // distance next. We put handles in here whenever we see them with shorter - // distances (since STL priority queue can't update), so we also need to - // make sure nodes coming out haven't been visited already. - using Record = pair; - - // We need a custom ordering for the queue - struct IsFirstGreater { - inline bool operator()(const Record& a, const Record& b) { - return a.first > b.first; - } - }; - - // We use a filtered priority queue for auto-Dijkstra - UpdateablePriorityQueue, IsFirstGreater> queue([](const Record& item) { - return item.second; - }); - - // We keep a current handle - handle_t current = start; - size_t distance = 0; - queue.push(make_pair(distance, start)); - - while (!queue.empty()) { - // While there are things in the queue, get the first. - tie(distance, current) = queue.top(); - queue.pop(); - -#ifdef debug_vg_algorithms - cerr << "Visit " << g->get_id(current) << " " << g->get_is_reverse(current) << " at distance " << distance << endl; -#endif - - // Record handle's distance - distances[current] = distance; - - if (current != start) { - // Up the distance with the node's length. We don't do this for the - // start handle because we want to count distance from the *end* of - // the start handle unless directed otherwise. - distance += g->get_length(current); - } - - g->follow_edges(current, traverse_leftward, [&](const handle_t& next) { - // For each handle to the right of here - - if (!distances.count(next)) { - // New shortest distance. Will never happen after the handle comes out of the queue because of Dijkstra. - queue.push(make_pair(distance, next)); - -#ifdef debug_vg_algorithms - cerr << "\tNew best path to " << g->get_id(next) << " " << g->get_is_reverse(next) - << " at distance " << distance << endl; -#endif - - } - }); - } - - return distances; - -} - - -} -} diff --git a/src/algorithms/find_shortest_paths.hpp b/src/algorithms/find_shortest_paths.hpp deleted file mode 100644 index 89f7ae95d19..00000000000 --- a/src/algorithms/find_shortest_paths.hpp +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef VG_ALGORITHMS_FIND_SHORTEST_PATHS_HPP_INCLUDED -#define VG_ALGORITHMS_FIND_SHORTEST_PATHS_HPP_INCLUDED - -/** - * \file find_shortest_paths.hpp - * - * Definitions for the find_shortest_paths algorithm. - */ - -#include - -#include "../position.hpp" -#include "../vg.pb.h" -#include "../hash_map.hpp" -#include "../handle.hpp" - -namespace vg { -namespace algorithms { - - /// Finds the length of the shortest oriented path from the given handle - /// in a given direction to all reachable oriented nodes on a directed walk. - /// Uses Dijkstra's Algorithm. Distances are measured between the outgoing - /// side of the start node and the incoming side of the target. 
- unordered_map find_shortest_paths(const HandleGraph* g, handle_t start, - bool traverse_leftward = false); - -} -} - -#endif diff --git a/src/algorithms/find_translation.cpp b/src/algorithms/find_translation.cpp new file mode 100644 index 00000000000..9e44edfd904 --- /dev/null +++ b/src/algorithms/find_translation.cpp @@ -0,0 +1,41 @@ +/** + * \file find_translation.cpp + */ + +#include "find_translation.hpp" + +#include "../io/save_handle_graph.hpp" +#include "../gbzgraph.hpp" + +namespace vg { +namespace algorithms { + +const NamedNodeBackTranslation* find_translation(const HandleGraph* graph) { + if (!graph) { + // No graph means no translation. + return nullptr; + } + if (const gbwtgraph::GBWTGraph* gg = dynamic_cast(graph)) { + // A GBWTGraph is a NamedNodeBackTranslation, but the translation may not be present. + if (gg->has_segment_names()) { return gg; } + else { return nullptr; } + } + if (const GBZGraph* gg = dynamic_cast(graph)) { + // The same goes for a GBWTGraph contained in a GBZ graph. + if (gg->gbz.graph.has_segment_names()) { return &(gg->gbz.graph); } + else { return nullptr; } + } + if (dynamic_cast(graph)) { + // Some graph implementations just are a NamedNodeBackTranslation already. + return dynamic_cast(graph); + } + if (dynamic_cast(graph)) { + // If we loaded the graph from a GFA we would have attached this translation. + return &dynamic_cast(graph)->gfa_id_space; + } + // Otherwise there's no applicable translation + return nullptr; +} + +} +} diff --git a/src/algorithms/find_translation.hpp b/src/algorithms/find_translation.hpp new file mode 100644 index 00000000000..47ac35e03ef --- /dev/null +++ b/src/algorithms/find_translation.hpp @@ -0,0 +1,25 @@ +#ifndef VG_ALGORITHMS_FIND_TRANSLATION_HPP_INCLUDED +#define VG_ALGORITHMS_FIND_TRANSLATION_HPP_INCLUDED + +/** + * \file find_translation.hpp + * + * Defines an algorithm for finding the NamedNodeBackTranslation associated with a handle graph, if any. + */ + +#include "../handle.hpp" + +namespace vg { +namespace algorithms { +using namespace std; + +/** + * Find the NamedNodeBackTranslation defining e.g. GFA segment space for the given handle graph, if any exists. + * Works on GFAs that have been loaded and finds their path space information. + */ +const NamedNodeBackTranslation* find_translation(const HandleGraph* graph); + +} +} + +#endif diff --git a/src/algorithms/gfa_to_handle.cpp b/src/algorithms/gfa_to_handle.cpp new file mode 100644 index 00000000000..65f19320676 --- /dev/null +++ b/src/algorithms/gfa_to_handle.cpp @@ -0,0 +1,1405 @@ +#include "gfa_to_handle.hpp" +#include "../path.hpp" + +#include + +namespace vg { +namespace algorithms { + +void GFAIDMapInfo::invert_translation() { + if (!numeric_mode) { + // Make the mapping + id_to_name.reset(new unordered_map()); + // And then populate it + for (auto mapping = name_to_id->begin(); mapping != name_to_id->end(); ++mapping) { + id_to_name->emplace(mapping->second, &mapping->first); + } + } +} + +std::vector GFAIDMapInfo::translate_back(const oriented_node_range_t& range) const { + // Nodes haven't been split. + return {range}; +} + +std::string GFAIDMapInfo::get_back_graph_node_name(const nid_t& back_node_id) const { + if (numeric_mode) { + // Just use string version of number + return std::to_string(back_node_id); + } + // We must have filled in the relevant mapping otherwise. + assert(id_to_name); + // Go look up and dereference the name string. 
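As a hypothetical sketch (not part of this change), this is the sort of lookup find_translation enables; the helper name is made up:

```
#include "algorithms/find_translation.hpp"

#include <string>

// Show a node in GFA segment-name space when a translation is attached to the
// graph, and fall back to its numeric ID otherwise.
std::string node_display_name(const vg::HandleGraph* graph, vg::nid_t node_id) {
    // Returns a const NamedNodeBackTranslation*, or nullptr if none applies.
    const auto* translation = vg::algorithms::find_translation(graph);
    if (translation != nullptr) {
        return translation->get_back_graph_node_name(node_id);
    }
    return std::to_string(node_id);
}
```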
+ return *id_to_name->at(back_node_id); +} + +static void write_gfa_translation(const GFAIDMapInfo& id_map_info, const string& translation_filename) { + // don't write anything unless we have both an output file and at least one non-trivial mapping + if (!translation_filename.empty() && !id_map_info.numeric_mode) { + ofstream trans_file(translation_filename); + if (!trans_file) { + throw runtime_error("error:[gfa_to_handle_graph] Unable to open output translation file: " + translation_filename); + } + for (const auto& mapping : *id_map_info.name_to_id) { + trans_file << "T\t" << mapping.first << "\t" << mapping.second << "\n"; + } + } +} + +/// Add listeners which let a GFA parser fill in a handle graph with nodes and edges. +static void add_graph_listeners(GFAParser& parser, MutableHandleGraph* graph) { + parser.node_listeners.push_back([&parser, graph](nid_t id, const GFAParser::chars_t& sequence, const GFAParser::tag_list_t& tags) { + graph->create_handle(GFAParser::extract(sequence), id); + }); + parser.edge_listeners.push_back([&parser, graph](nid_t from, bool from_is_reverse, nid_t to, bool to_is_reverse, const GFAParser::chars_t& overlap, const GFAParser::tag_list_t& tags) { + static const string not_blunt = ("error:[gfa_to_handle_graph] Can only load blunt-ended GFAs. " + "Try \"bluntifying\" your graph with a tool like , or " + "transitively merge overlaps with a pipeline of and " + "."); + if (GFAParser::length(overlap) > 0) { + string overlap_text = GFAParser::extract(overlap); + if (overlap_text != "0M" && overlap_text != "*") { + // This isn't an allowed overlap value. + throw GFAFormatError(not_blunt + " Found edge with a non-null alignment '" + overlap_text + "'."); + } + } + + graph->create_edge(graph->get_handle(from, from_is_reverse), + graph->get_handle(to, to_is_reverse)); + + }); +} + +/// Add listeners which let a GFA parser fill in a path handle graph with paths. +static void add_path_listeners(GFAParser& parser, MutablePathMutableHandleGraph* graph, + unordered_set* ignore_sense) { + + // For rGFA we need to have some state. Use a smart pointer to smuggle it + // into the closure. + // For each path name, we remember handle and expected next starting + // position. If we aren't at the expected next starting position, there's a + // gap and we need to make a new path. + // TODO: This duplicates some work with the parser, which also caches rGFA path expected offsets. + // TODO: Come up with a better listener interface that announces breaks and lets you keep the path handy? + using rgfa_cache_t = unordered_map>; + std::shared_ptr rgfa_cache = std::make_shared(); + + // We also need some shared state for making reference sample (RS) tags on the header apply to P and W lines later + std::shared_ptr> reference_samples = std::make_shared>(); + + parser.header_listeners.push_back([&parser, reference_samples](const GFAParser::tag_list_t& tags) { + for (const std::string& tag : tags) { + if (tag.size() >= 5 && + std::equal(gbwtgraph::REFERENCE_SAMPLE_LIST_GFA_TAG.begin(), gbwtgraph::REFERENCE_SAMPLE_LIST_GFA_TAG.end(), tag.begin()) && + tag[2] == ':' && + tag[3] == 'Z' && + tag[4] == ':') { + + // This is a reference samples tag like GBWTGraph's GFA parser knows how to parse. 
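As an illustrative aside (not part of this change), the translation file written above can be requested through the filename-based overload; the file names and graph type below are placeholders:

```
#include "algorithms/gfa_to_handle.hpp"

#include <bdsg/hash_graph.hpp>

// Load a GFA and, if its segment names are not already plain numeric IDs,
// record the name-to-ID mapping as "T\t<segment name>\t<node id>" lines.
void load_and_record_translation() {
    bdsg::HashGraph graph;
    vg::algorithms::gfa_to_handle_graph("input.gfa", &graph, "input.translation");
}
```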
+ // Parse the tag's value + *reference_samples = gbwtgraph::parse_reference_samples_tag(tag.substr(5)); + } + } + }); + + parser.path_listeners.push_back([&parser, graph, reference_samples, ignore_sense](const string& name, + const GFAParser::chars_t& visits, + const GFAParser::chars_t& overlaps, + const GFAParser::tag_list_t& tags) { + // For P lines, we add the path. + + // Parse out the path name's metadata + PathSense sense; + string sample; + string locus; + size_t haplotype; + size_t phase_block; + subrange_t subrange; + PathMetadata::parse_path_name(name, + sense, + sample, + locus, + haplotype, + phase_block, + subrange); + + if (sense == PathSense::HAPLOTYPE && reference_samples->count(sample)) { + // This P line is about a sample that looks like a haplotype but + // actually wants to be a reference. + sense = PathSense::REFERENCE; + } else if (sense == PathSense::REFERENCE && haplotype != PathMetadata::NO_HAPLOTYPE && !reference_samples->count(sample)) { + // Mimic the GBWTGraph behavior of parsing full PanSN names + // (sample, haplotype number, contig) as haplotypes by default, + // even though we use PanSN names in vg to indicate reference + // sense. + // TODO: This is super ugly, can we just change the way the + // metadata name format works, or use a dedicated PanSN parser here + // instead? + // TODO: Can we use GBWTGraph's regex priority system? + sense = PathSense::HAPLOTYPE; + if (phase_block == PathMetadata::NO_PHASE_BLOCK) { + // Assign a phase block if none is specified, since haplotypes need one. + phase_block = 0; + } + } + + if (ignore_sense && ignore_sense->count(sense)) { + return; + } + + // Compose what we think the path ought to be named. + // TODO: When we get a has_path that takes fully specified metadata, use that instead. + string implied_path_name = PathMetadata::create_path_name(sense, + sample, + locus, + haplotype, + phase_block, + subrange); + if (graph->has_path(implied_path_name)) { + // This is a duplicate. + throw GFADuplicatePathError(implied_path_name); + } + + // Create the path. + auto path_handle = graph->create_path(sense, + sample, + locus, + haplotype, + phase_block, + subrange); + + // Overlaps are pre-checked in scan_p + // TODO: Do it in a better place. + + GFAParser::scan_p_visits(visits, [&](int64_t step_rank, + const GFAParser::chars_t& step_name, + bool step_is_reverse) { + if (step_rank >= 0) { + // Not an empty path sentinel. + // Find the node ID to visit. + nid_t n = GFAParser::find_existing_sequence_id(GFAParser::extract(step_name), parser.id_map()); + // And add the step. + graph->append_step(path_handle, graph->get_handle(n, step_is_reverse)); + } + // Don't stop. + return true; + }); + }); + + parser.walk_listeners.push_back([&parser, graph, reference_samples](const string& sample_name, + int64_t haplotype, + const string& contig_name, + const subrange_t& subrange, + const GFAParser::chars_t& visits, + const GFAParser::tag_list_t& tags) { + // For W lines, we add the path with a bit more metadata. + + // By default this is interpreted as a haplotype + PathSense sense; + + // We need to determine a phase block + size_t phase_block; + // And a haplotype. + size_t assigned_haplotype = (size_t) haplotype; + + string assigned_sample_name; + if (sample_name == "*") { + // The sample name is elided from the walk. + // This walk must be a generic path. + sense = PathSense::GENERIC; + // We don't send a sample name. 
+ assigned_sample_name = PathMetadata::NO_SAMPLE_NAME; + if (assigned_haplotype != 0) { + // We can't have multiple haplotypes for a generic path + throw GFAFormatError("Generic path on omitted (*) sample has nonzero haplotype"); + } + assigned_haplotype = PathMetadata::NO_HAPLOTYPE; + phase_block = PathMetadata::NO_PHASE_BLOCK; + } else { + // This is probably a sample name we can use + + if (reference_samples->count(sample_name)) { + // This sample is supposed to be reference. + sense = PathSense::REFERENCE; + phase_block = PathMetadata::NO_PHASE_BLOCK; + } else { + // We're a haplotype + sense = PathSense::HAPLOTYPE; + // GFA doesn't really encode phase blocks. Always use the 0th one. + phase_block = 0; + } + + // Keep the sample name + assigned_sample_name = sample_name; + } + + // Drop the subrange completely if it starts at 0. + // TODO: Detect if there are going to be multiple walks describing + // different subranges, and keep the subrange on the first one even if + // it starts at 0, because then we know it's really a partial walk. + subrange_t assigned_subrange = (subrange.first == 0) ? PathMetadata::NO_SUBRANGE : subrange; + + // Compose what we think the path ought to be named. + // TODO: When we get a has_path that takes fully specified metadata, use that instead. + string implied_path_name = PathMetadata::create_path_name(sense, + assigned_sample_name, + contig_name, + assigned_haplotype, + phase_block, + assigned_subrange); + if (graph->has_path(implied_path_name)) { + // This is a duplicate. + throw GFADuplicatePathError(implied_path_name); + } + + // Create the path. + auto path_handle = graph->create_path(sense, + assigned_sample_name, + contig_name, + assigned_haplotype, + phase_block, + assigned_subrange); + + GFAParser::scan_w_visits(visits, [&](int64_t step_rank, + const GFAParser::chars_t& step_name, + bool step_is_reverse) { + if (step_rank >= 0) { + // Not an empty path sentinel. + // Find the node ID to visit. + nid_t n = GFAParser::find_existing_sequence_id(GFAParser::extract(step_name), parser.id_map()); + // And add the step. + graph->append_step(path_handle, graph->get_handle(n, step_is_reverse)); + } + // Don't stop. + return true; + }); + }); + + + + parser.rgfa_listeners.push_back([&parser, graph, rgfa_cache](nid_t id, + int64_t offset, + size_t length, + const string& path_name, + int64_t path_rank) { + auto found = rgfa_cache->find(path_name); + if (found != rgfa_cache->end() && found->second.second != offset) { + // This path already exists, but there's a gap. We need to drop it + // from the cache and make a new one with the right subpath info. + rgfa_cache->erase(found); + found = rgfa_cache->end(); + } + if (found == rgfa_cache->end()) { + // Need to make a new path, possibly with subrange start info. + + std::pair subrange; + if (offset == 0) { + // Don't send a subrange + subrange = PathMetadata::NO_SUBRANGE; + } else { + // Start later than 0 + subrange = std::pair(offset, PathMetadata::NO_END_POSITION); + } + + // TODO: See if we can split up the path name into a sample/haplotype/etc. to give it a ref sense. + path_handle_t path = graph->create_path(PathSense::GENERIC, + PathMetadata::NO_SAMPLE_NAME, + path_name, + PathMetadata::NO_HAPLOTYPE, + PathMetadata::NO_PHASE_BLOCK, + subrange); + // Then cache it + found = rgfa_cache->emplace_hint(found, path_name, std::make_pair(path, offset)); + } + + // Add the step to the path + auto& path = found->second.first; + // rGFA paths always visit sequences forward. 
+ handle_t step = graph->get_handle(id, false); + graph->append_step(path, step); + + // Increment the expected next offset + auto& next_offset = found->second.second; + next_offset += length; + }); +} + +void gfa_to_handle_graph(const string& filename, MutableHandleGraph* graph, + GFAIDMapInfo* translation) { + + get_input_file(filename, [&](istream& in) { + gfa_to_handle_graph(in, graph, translation); + }); +} + +void gfa_to_handle_graph(const string& filename, MutableHandleGraph* graph, + const string& translation_filename) { + + + GFAIDMapInfo id_map_info; + gfa_to_handle_graph(filename, graph, &id_map_info); + write_gfa_translation(id_map_info, translation_filename); +} + +void gfa_to_handle_graph(istream& in, MutableHandleGraph* graph, + GFAIDMapInfo* translation) { + + GFAParser parser; + if (translation) { + // Use the given external translation so the caller can keep it around. + parser.external_id_map = translation; + } + add_graph_listeners(parser, graph); + + parser.parse(in); +} + + +void gfa_to_path_handle_graph(const string& filename, MutablePathMutableHandleGraph* graph, + GFAIDMapInfo* translation, int64_t max_rgfa_rank, + unordered_set* ignore_sense) { + + get_input_file(filename, [&](istream& in) { + gfa_to_path_handle_graph(in, graph, translation, max_rgfa_rank); + }); +} + +void gfa_to_path_handle_graph(const string& filename, MutablePathMutableHandleGraph* graph, + int64_t max_rgfa_rank, const string& translation_filename, + unordered_set* ignore_sense) { + + GFAIDMapInfo id_map_info; + gfa_to_path_handle_graph(filename, graph, &id_map_info, max_rgfa_rank); + write_gfa_translation(id_map_info, translation_filename); + +} + +void gfa_to_path_handle_graph(istream& in, + MutablePathMutableHandleGraph* graph, + GFAIDMapInfo* translation, + int64_t max_rgfa_rank, + unordered_set* ignore_sense) { + + // TODO: Deduplicate this setup code with gfa_to_handle_graph more. + GFAParser parser; + if (translation) { + // Use the given external translation so the caller can keep it around. + parser.external_id_map = translation; + } + add_graph_listeners(parser, graph); + + // Set up for path input + parser.max_rgfa_rank = max_rgfa_rank; + add_path_listeners(parser, graph, ignore_sense); + + parser.parse(in); +} + +/// Read a range, stopping before any end character in the given null-terminated string, +/// or at the end of the input. +/// Throws if the range would be empty. +static GFAParser::chars_t take_range_until_optional(GFAParser::cursor_t& cursor, const GFAParser::cursor_t& end, const char* end_chars, const char* parsing_state = nullptr) { + auto start = cursor; + while (cursor != end) { + for (const char* stop_char = end_chars; *stop_char; ++stop_char) { + if (*cursor == *stop_char) { + // We found a stop character + if (cursor == start) { + throw GFAFormatError("Expected nonempty value", cursor, parsing_state); + } + return GFAParser::chars_t(start, cursor); + } + } + ++cursor; + } + return GFAParser::chars_t(start, cursor); +} + +/// Read a range, stopping before any end character in the given null-terminated string. +/// Throws if the range would be empty or none of the characters are encountered. 
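As a hypothetical end-to-end sketch (not part of this change) of the path-aware entry point just defined; the graph type and file name are stand-ins, and ignore_sense is passed explicitly as null:

```
#include "algorithms/gfa_to_handle.hpp"

#include <bdsg/hash_graph.hpp>

// Load a GFA with its P lines, W lines, and rank-0 rGFA paths into a path
// handle graph, keeping the segment-name translation for later lookups.
void load_paths() {
    bdsg::HashGraph graph;
    vg::algorithms::GFAIDMapInfo translation;
    vg::algorithms::gfa_to_path_handle_graph("input.gfa", &graph, &translation, 0, nullptr);
    // Prepare ID-to-name lookups now that name_to_id is complete.
    translation.invert_translation();
}
```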
+static GFAParser::chars_t take_range_until(GFAParser::cursor_t& cursor, const GFAParser::cursor_t& end, const char* end_chars, const char* parsing_state = nullptr) { + GFAParser::chars_t range = take_range_until_optional(cursor, end, end_chars, parsing_state); + if (cursor == end) { + // We didn't find any of the terminators to stop before + throw GFAFormatError("Expected terminator in " + std::string(end_chars), cursor, parsing_state); + } + return range; +} + +/// Read a range, stopping at tab or end of line. +static GFAParser::chars_t take_optional_range(GFAParser::cursor_t& cursor, const GFAParser::cursor_t& end, const char* parsing_state = nullptr) { + auto start = cursor; + while (cursor != end && *cursor != '\t') { + ++cursor; + } + return GFAParser::chars_t(start, cursor); +} + +/// Read a range, stopping at tab or end of line. +/// Throw if it is empty. +static GFAParser::chars_t take_range(GFAParser::cursor_t& cursor, const GFAParser::cursor_t& end, const char* parsing_state = nullptr) { + GFAParser::chars_t value = take_optional_range(cursor, end); + if (GFAParser::empty(value)) { + throw GFAFormatError("Expected nonempty value", cursor, parsing_state); + } + return value; +} + +/// Read a string, stopping at tab or end of line. +static string take_optional_string(GFAParser::cursor_t& cursor, const GFAParser::cursor_t& end, const char* parsing_state = nullptr) { + string value; + while (cursor != end && *cursor != '\t') { + value.push_back(*cursor); + ++cursor; + } + return value; +} + +/// Read a string, stopping at tab or end of line. +/// Throw if it is empty. +static string take_string(GFAParser::cursor_t& cursor, const GFAParser::cursor_t& end, const char* parsing_state = nullptr) { + string value = take_optional_string(cursor, end); + if (value.empty()) { + throw GFAFormatError("Expected nonempty value", cursor, parsing_state); + } + return value; +} + +/// Read a non-negative integer, stopping at tab or end of line. +/// Throw if it is empty. If it is '*', return the given default value. +static int64_t take_number(GFAParser::cursor_t& cursor, const GFAParser::cursor_t& end, int64_t default_value, const char* parsing_state = nullptr) { + int64_t value = 0; + if (cursor == end || !((*cursor >= '0' && *cursor <= '9') || *cursor == '*')) { + // Number is empty and not properly elided + throw GFAFormatError("Expected natural number", cursor, parsing_state); + } + if (*cursor == '*') { + // Take the * and use the default value + ++cursor; + value = default_value; + } else { + while (cursor != end && *cursor >= '0' && *cursor <= '9') { + // Read the base 10 number digit by digit + value *= 10; + value += (*cursor - '0'); + ++cursor; + } + } + if (cursor != end && *cursor != '\t' && *cursor != '\n') { + throw GFAFormatError("Unexpected data at end of number", cursor, parsing_state); + } + return value; +} + +/// Advance past a tab character. If it's not there, return false. If something +/// else is there, return GFAFormatError. +static bool take_optional_tab(GFAParser::cursor_t& cursor, const GFAParser::cursor_t& end, const char* parsing_state = nullptr) { + if (cursor == end) { + return false; + } + if (*cursor != '\t') { + throw GFAFormatError("Expected tab", cursor, parsing_state); + } + ++cursor; + return true; +} + +/// Take the given character. Throw an error if it isn't there. 
+static void take_character(GFAParser::cursor_t& cursor, const GFAParser::cursor_t& end, char value, const char* parsing_state = nullptr) { + if (cursor == end || *cursor != value) { + throw GFAFormatError("Expected " + value, cursor, parsing_state); + } + ++cursor; +} + +/// Take one character of two options. Return true if it is the first one, +/// false if it is the second, and throw an error if it is absent or something +/// else. +static bool take_flag_character(GFAParser::cursor_t& cursor, const GFAParser::cursor_t& end, char true_value, char false_value, const char* parsing_state = nullptr) { + if (cursor != end) { + if (*cursor == true_value) { + ++cursor; + return true; + } + if (*cursor == false_value) { + ++cursor; + return false; + } + } + // Composing the error is tricky because of the bare characters. + stringstream ss; + ss << "Expected " << true_value << " or " << false_value; + throw GFAFormatError(ss.str(), cursor, parsing_state); +} + +/// Advance past a tab character that must exist. +static void take_tab(GFAParser::cursor_t& cursor, const GFAParser::cursor_t& end, const char* parsing_state = nullptr) { + if (!take_optional_tab(cursor, end)) { + throw GFAFormatError("Expected tab", cursor, parsing_state); + } +} + +GFAParser::tag_list_t GFAParser::parse_tags(const chars_t& tag_range) { + tag_list_t tags; + auto cursor = tag_range.first; + auto& end = tag_range.second; + + while (cursor != end) { + // Scan out a tag of non-tab characters + string tag = take_string(cursor, end, "parsing tags"); + if (!tag.empty()) { + // We found a tag. Save it. + tags.emplace_back(std::move(tag)); + } + take_optional_tab(cursor, end, "parsing tags"); + } + + return tags; +} + +tuple GFAParser::parse_h(const string& h_line) { + auto cursor = h_line.begin(); + auto end = h_line.end(); + + // Make sure we start with H + take_character(cursor, end, 'H', "parsing H line start"); + take_tab(cursor, end, "parsing H line"); + + // Now we're either at the end or at the tab before the tags. Parse the tags. + auto tags = GFAParser::parse_tags(chars_t(cursor, end)); + + return make_tuple(std::move(tags)); +} + +tuple GFAParser::parse_s(const string& s_line) { + auto cursor = s_line.begin(); + auto end = s_line.end(); + + // Make sure we start with S + take_character(cursor, end, 'S', "parsing S line start"); + take_tab(cursor, end, "parsing S line"); + + // Parse out the name + string name = take_string(cursor, end, "parsing sequence name"); + take_tab(cursor, end, "parsing end of sequence name"); + + // Parse out the sequence + chars_t sequence = take_range(cursor, end, "parsing sequence"); + take_optional_tab(cursor, end, "parsing end of sequence"); + + // Now we're either at the end or at the tab before the tags. Parse the tags. 
+ auto tags = GFAParser::parse_tags(chars_t(cursor, end)); + + return make_tuple(std::move(name), std::move(sequence), std::move(tags)); +} + +tuple GFAParser::parse_l(const string& l_line) { + auto cursor = l_line.begin(); + auto end = l_line.end(); + + // Make sure we start with L + take_character(cursor, end, 'L', "parsing L line start"); + take_tab(cursor, end, "parsing L line"); + + // Parse out the first node name + string n1 = take_string(cursor, end, "parsing first node name"); + take_tab(cursor, end, "parsing end of first node name"); + + // Parse the first orientation + bool n1_reverse = take_flag_character(cursor, end, '-', '+', "parsing first node orientation"); + take_tab(cursor, end, "parsing end of first node orientation"); + + // Parse out the second node name + string n2 = take_string(cursor, end, "parsing second node name"); + take_tab(cursor, end, "parsing end of sencod node name"); + + // Parse the second orientation + bool n2_reverse = take_flag_character(cursor, end, '-', '+', "parsing second node orientation"); + take_tab(cursor, end, "parsing end of second node orientation"); + + // Parse out the overlaps + chars_t overlaps = take_range(cursor, end, "parsing overlaps"); + take_optional_tab(cursor, end, "parsing end of overlaps"); + + // Now we're either at the end or at the tab before the tags. Parse the tags. + auto tags = GFAParser::parse_tags(chars_t(cursor, end)); + + return make_tuple(std::move(n1), n1_reverse, std::move(n2), n2_reverse, std::move(overlaps), std::move(tags)); +} + +tuple GFAParser::parse_p(const string& p_line) { + auto cursor = p_line.begin(); + auto end = p_line.end(); + + // Make sure we start with P + take_character(cursor, end, 'P', "parsing P line start"); + take_tab(cursor, end, "parsing P line"); + + // Grab the path name + string path_name = take_string(cursor, end, "parsing path name"); + take_tab(cursor, end, "parsing end of path name"); + + // Parse out the visits + chars_t visits = take_range(cursor, end, "parsing path visits"); + take_tab(cursor, end, "parsing end of path visits"); + + // Parse out the overlaps + chars_t overlaps = take_range(cursor, end, "parsing overlaps"); + take_optional_tab(cursor, end, "parsing end of overlaps"); + + // Now we're either at the end or at the tab before the tags. Parse the tags. 
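To make the field layout concrete, here is a small hypothetical snippet (not part of this change) that feeds a single S line through the parser. It assumes parse_s and extract are publicly accessible, like the other static helpers used from the listener code; the line content is made up:

```
#include "algorithms/gfa_to_handle.hpp"

#include <iostream>
#include <string>
#include <tuple>

void parse_one_s_line() {
    // An S line carrying the three rGFA tags.
    std::string line = "S\ts1\tGATTACA\tSN:Z:chr1\tSO:i:0\tSR:i:0";
    auto parsed = vg::algorithms::GFAParser::parse_s(line);
    std::cerr << "name=" << std::get<0>(parsed)
              << " sequence=" << vg::algorithms::GFAParser::extract(std::get<1>(parsed))
              << " tag count=" << std::get<2>(parsed).size() << std::endl;
}
```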
+ auto tags = GFAParser::parse_tags(chars_t(cursor, end)); + + return make_tuple(std::move(path_name), std::move(visits), std::move(overlaps), std::move(tags)); +} + +tuple, GFAParser::chars_t, GFAParser::tag_list_t> GFAParser::parse_w(const string& w_line) { + auto cursor = w_line.begin(); + auto end = w_line.end(); + + // Make sure we start with W + take_character(cursor, end, 'W', "parsing W line start"); + take_tab(cursor, end, "parsing W line"); + + // Grab the sample name + string sample_name = take_string(cursor, end, "parsing sample name"); + take_tab(cursor, end, "parsing end of sample name"); + + // Grab the haplotype number + int64_t haplotype_number = take_number(cursor, end, -1, "parsing haplotype number"); + if (haplotype_number == -1) { + // This field is required + throw GFAFormatError("Missing haplotype number in W line", cursor); + } + take_tab(cursor, end, "parsing end of haplotype number"); + + // Grab the sequence/contig/locus name + string sequence_name = take_string(cursor, end, "parsing sequence name"); + take_tab(cursor, end, "parsing end of sequence name"); + + // Grab the start and end positions + int64_t range_start = take_number(cursor, end, -1, "parsing subrange start"); + take_tab(cursor, end, "parsing end of subrange start"); + int64_t range_end = take_number(cursor, end, -1, "parsing subrange end"); + take_tab(cursor, end, "parsing end of subrange end"); + + // Parse out the visits + chars_t visits = take_range(cursor, end, "parsing walk visits"); + take_optional_tab(cursor, end, "parsing end of walk visits"); + + // Now we're either at the end or at the tab before the tags. Parse the tags. + auto tags = GFAParser::parse_tags(chars_t(cursor, end)); + + // Process the path subrange a bit. Compose it into the sort of subrange + // PathMetadata uses. 
+ pair range = PathMetadata::NO_SUBRANGE; + if (range_start != -1) { + range.first = range_start; + if (range_end != -1) { + range.second = range_end; + } + } + + return make_tuple(std::move(sample_name), std::move(haplotype_number), std::move(sequence_name), std::move(range), std::move(visits), std::move(tags)); +} + +void GFAParser::scan_p_visits(const chars_t& visit_range, + function visit_step) { + + return GFAParser::scan_visits(visit_range, 'P', visit_step); +} + +void GFAParser::scan_w_visits(const chars_t& visit_range, + function visit_step) { + + return GFAParser::scan_visits(visit_range, 'W', visit_step); +} + +void GFAParser::scan_visits(const chars_t& visit_range, char line_type, + function visit_step) { + + auto cursor = visit_range.first; + auto& end = visit_range.second; + int64_t rank = 0; + + while (cursor != end) { + // Until we run out of visit list range + + bool is_reverse; + chars_t name_range; + + // Parse name and orientation as appropriate for line type + if (line_type == 'P') { + // Parse like a path line + name_range = take_range_until(cursor, end, "+-", "parsing name of visited node"); + is_reverse = take_flag_character(cursor, end, '-', '+', "parsing orientation of visited node"); + } else if (line_type == 'W') { + // Parse like a walk line + is_reverse = take_flag_character(cursor, end, '<', '>', "parsing orientation of visited node"); + name_range = take_range_until_optional(cursor, end, "><\t\n", "parsing name of visited node"); + } else { + throw std::runtime_error("Unimplemented line type for scanning visits"); + } + + + if (!visit_step(rank, name_range, is_reverse)) { + // We should stop looping + return; + } + + if (line_type == 'P') { + // P lines might have comma separators + if (cursor != visit_range.second) { + // Go past the comma separator + take_character(cursor, end, ',', "parsing visit separator"); + } + } + // And advance the rank for the next visit + ++rank; + } + + if (rank == 0) { + // Nothing was visited. Show an empty path. + visit_step(-1, chars_t(visit_range.second, visit_range.second), false); + } +} + +bool GFAParser::decode_rgfa_tags(const tag_list_t& tags, + string* out_name, + int64_t* out_offset, + int64_t* out_rank) { + bool has_sn = false; + bool has_so = false; + bool has_sr = false; + for (auto& tag : tags) { + // Try and parse each tag. + // TODO: maybe check for SN:Z:, SO:i:, SR:i: as prefixes? 
+ size_t sep1 = tag.find(':'); + if (sep1 != string::npos) { + string tag_name = tag.substr(0, sep1); + if (tag_name == "SN" || tag_name == "SO" || tag_name == "SR") { + size_t sep2 = tag.find(':', sep1 + 1); + if (sep2 != string::npos) { + string tag_type = tag.substr(sep1 + 1, sep2 - sep1 - 1); + if (tag_name == "SN" && tag_type == "Z") { + // We found a string name + has_sn = true; + if (out_name) { + *out_name = tag.substr(sep2 + 1); + } + } else if (tag_name == "SO" && tag_type == "i") { + // We found an integer offset along the path + has_so = true; + if (out_offset) { + *out_offset = stoll(tag.substr(sep2 + 1)); + } + } else if (tag_name == "SR" && tag_type == "i") { + // We found an integer rank for the path + has_sr = true; + if (out_rank) { + *out_rank = stoll(tag.substr(sep2 + 1)); + } + } + } + } + } + if (has_sn && has_so && has_sr) { + break; + } + } + return has_sn && has_so && has_sr; +} + +nid_t GFAParser::assign_new_sequence_id(const string& str, GFAIDMapInfo& id_map_info) { + + auto found = id_map_info.name_to_id->find(str); + if (found != id_map_info.name_to_id->end()) { + // already in map, so bail out + return 0; + } + + nid_t node_id = -1; + if (id_map_info.numeric_mode) { + if (any_of(str.begin(), str.end(), [](char c) { return !isdigit(c); })) { + // non-numeric: use max id and add to map + id_map_info.numeric_mode = false; + } else { + node_id = stoll(str); + if (node_id <= 0) { + // treat <= 0 as non-numeric + id_map_info.numeric_mode = false; + } + } + } + + // if numeric, the id was set to stoll above, otherwise we take it from current max + if (!id_map_info.numeric_mode) { + node_id = id_map_info.max_id + 1; + } + + id_map_info.max_id = std::max(node_id, id_map_info.max_id); + id_map_info.name_to_id->emplace_hint(found, str, node_id); + + return node_id; +} + +nid_t GFAParser::find_existing_sequence_id(const string& str, GFAIDMapInfo& id_map_info) { + auto found = id_map_info.name_to_id->find(str); + if (found != id_map_info.name_to_id->end()) { + // already in map, just return + return found->second; + } + // Otherwise just fail + return 0; +} + +void GFAParser::parse(istream& in) { + if (!in) { + throw std::ios_base::failure("error:[GFAParser] Couldn't open input stream"); + } + + // Check if stream is seekable + in.clear(); + // This tracks the position of the current line + std::streampos in_pos = in.tellg(); + // And this is what we use when we want to say to read to the end of the file. + // We will fill it in with a real EOF position if the stream is seekable. + std::streampos eof_pos = -1; + bool stream_is_seekable = false; + if (in_pos >= 0 && in.good()) { + // Input stream is seekable. + stream_is_seekable = true; + // Find EOF + in.seekg(0, std::ios_base::end); + eof_pos = in.tellg(); + in.seekg(in_pos); + if (!in.good()) { + throw std::runtime_error("Could not get end of GFA file"); + } + } + // Reset error flags + in.clear(); + +#ifdef debug + std::cerr << "Stream seekable? " << stream_is_seekable << std::endl; +#endif + + bool has_rgfa_tags = false; + string line_buffer; // can be quite big + + // We should be able to parse in 2 passes. One to make all the nodes, and + // one to make all the things that reference nodes we hadn't seen yet. + // We track pass number 1-based. + size_t pass_number; + // And within a pass we remember the line number. Also 1-based. + size_t line_number; + + // We don't want to process any paths until we've seen the header, or we + // know there isn't one, because it affects interpretation of path lines. 
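As an illustrative aside (not part of this change), the three rGFA tags look like this when decoded; the tag values are made up, and public access to decode_rgfa_tags and tag_list_t is assumed:

```
#include "algorithms/gfa_to_handle.hpp"

#include <cstdint>
#include <iostream>
#include <string>

void decode_rgfa_demo() {
    // Path name, offset on that path, and rank, as carried on an rGFA S line.
    vg::algorithms::GFAParser::tag_list_t tags = {"SN:Z:chr1", "SO:i:1000", "SR:i:0"};
    std::string path_name;
    int64_t offset = 0;
    int64_t rank = 0;
    if (vg::algorithms::GFAParser::decode_rgfa_tags(tags, &path_name, &offset, &rank)) {
        // Only reached when all three tags are present.
        std::cerr << path_name << ":" << offset << " (rank " << rank << ")" << std::endl;
    }
}
```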
+ bool awaiting_header; + + // We buffer lines we can't actually handle right now. + + // If we can seek back to them, we keep this collection of ranges of + // unprocessed lines. If the second field is the max value, it extends to EOF. + // Third field is starting line number. + vector> unprocessed_ranges; + // And we keep this flag for knowing when we need to close a range. + bool last_line_handled = true; + + // If we can't seek to them, we put them into this temporary file. + string buffer_name; + ofstream buffer_out_stream; + + // For error handling, we need a way to tell an error where it is + auto annotate_error = [&](GFAFormatError& e) { + // Inform the error of where exactly it is. + // We have line buffer in scope and positions are relative to the line buffer. + e.pass_number = pass_number; + if (!stream_is_seekable && pass_number > 1) { + // We're working on this temp file. Report it so line numbers make sense. + e.file_name = buffer_name; + } + e.line_number = line_number; + if (e.has_position) { + // We can find the column we were at within the line. + e.column_number = 1 + (e.position - line_buffer.begin()); + } + }; + + // We call this to save a line either in the buffer or the collection of + // unprocessed ranges, until all lines have been seen once. + auto save_line_until_next_pass = [&]() { + if (stream_is_seekable) { + // We should be able to get back here. + if (unprocessed_ranges.empty() || get<1>(unprocessed_ranges.back()) != eof_pos) { + // There's not currently a run of unprocessed lines that we are a part of. We need to start a new run. + unprocessed_ranges.emplace_back(in_pos, eof_pos, line_number); +#ifdef debug + std::cerr << "Started new unprocessed range at " << in_pos << std::endl; +#endif + // Run will be closed when a line is handled. + } + } else { + if (buffer_name.empty()) { + // Make sure the buffer is available + buffer_name = temp_file::create(); + buffer_out_stream.open(buffer_name); + if (!buffer_out_stream) { + throw runtime_error("error:[GFAParser] Could not open fallback gfa temp file: " + buffer_name); + } + } + + // Store the line into it so we can move on to the next line + buffer_out_stream << line_buffer << "\n"; + } + }; + + // We call this to save a line either in the buffer or the collection of unprocessed ranges, + // until some time after the given node is observed. + auto save_line_until_node = [&](const string& missing_node_name) { + if (pass_number > 1) { + // We should only hit this on the first pass. If we hit it later we are missing a node. + throw GFAFormatError("GFA file references missing node " + missing_node_name); + } + if (!stream_is_seekable && buffer_name.empty()) { + // Warn that we are missing this node because it is the first missing node. The tests want us to. + #pragma omp critical (cerr) + std::cerr << "warning:[GFAParser] Streaming GFA file references node " << missing_node_name << " before it is defined. " + << "GFA lines will be buffered in a temporary file." << std::endl; + } + // TODO: We could be more efficient if we could notice as soon as the node arrives and handle the line. + // Sadly we can't yet, so just save it for the next pass. + save_line_until_next_pass(); + }; + + // We want to warn about unrecognized line types, but each only once. + set warned_line_types; + + // And we need to buffer all the rGFA visits until we have seen all the nodes. + // This is a visit at a path offset to a node. We also need the length so + // we can know when it abuts later visits. 
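As a hypothetical sketch (not part of this change) of the listener-driven design this buffering supports: the parser can also be driven directly without building a graph at all. Only members that the free listener-attaching functions above already use publicly are assumed; the helper name is made up:

```
#include "algorithms/gfa_to_handle.hpp"

#include <fstream>
#include <iostream>
#include <string>

// Count S lines and total sequence length by registering a node listener and
// letting the parser's own multi-pass logic handle line ordering.
void count_segments(const std::string& gfa_filename) {
    vg::algorithms::GFAParser parser;
    size_t segment_count = 0;
    size_t total_bp = 0;
    parser.node_listeners.push_back([&](vg::nid_t id,
                                        const vg::algorithms::GFAParser::chars_t& sequence,
                                        const vg::algorithms::GFAParser::tag_list_t& tags) {
        ++segment_count;
        total_bp += vg::algorithms::GFAParser::length(sequence);
    });
    std::ifstream in(gfa_filename);
    parser.parse(in);
    std::cerr << segment_count << " segments, " << total_bp << " bp" << std::endl;
}
```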
+ using rgfa_visit_t = tuple; + // This is a heap of those, in order + using visit_queue_t = std::priority_queue, std::greater>; + // This holds rGFA paths we have heard of, mapping from name to rank, start + // position of next visit that is safe to announce, and list of buffered + // visits in a min-heap + unordered_map> rgfa_path_cache; + + // We call this to handle the current line if it is ready to be handled, or + // buffer it if it can't. Return false if we are not ready for the line right + // now and we saved it, and true otherwise. + auto handle_line_if_ready = [&]() { + if (line_buffer.empty()) { + // No line to handle. + return true; + } + try { + switch(line_buffer[0]) { + case 'H': + // Header lines need tags examoned + { + tuple h_parse = GFAParser::parse_h(line_buffer); + auto& tags = get<0>(h_parse); + for (auto& listener : this->header_listeners) { + // Tell all the listener functions + listener(tags); + } + awaiting_header = false; + } + break; + case 'S': + // Sequence lines can always be handled right now + { + tuple s_parse = GFAParser::parse_s(line_buffer); + auto& node_name = get<0>(s_parse); + auto& sequence_range = get<1>(s_parse); + auto& tags = get<2>(s_parse); + nid_t assigned_id = GFAParser::assign_new_sequence_id(node_name, this->id_map()); + if (assigned_id == 0) { + // This name has been used already! + throw GFAFormatError("Duplicate sequence name: " + node_name); + } + for (auto& listener : this->node_listeners) { + // Tell all the listener functions + listener(assigned_id, sequence_range, tags); + } + if (this->max_rgfa_rank >= 0 && tags.size() >= 3) { + // We'll check for the 3 rGFA optional tags. + string rgfa_path_name; + int64_t rgfa_offset_on_path; + int64_t rgfa_path_rank; + if (decode_rgfa_tags(tags, &rgfa_path_name, &rgfa_offset_on_path, &rgfa_path_rank) && + rgfa_path_rank <= this->max_rgfa_rank) { + + // We need to remember this rGFA path visit + auto found = rgfa_path_cache.find(rgfa_path_name); + if (found == rgfa_path_cache.end()) { + // This is a completely new path, so record its rank + found = rgfa_path_cache.emplace_hint(found, rgfa_path_name, std::make_tuple(rgfa_path_rank, 0, visit_queue_t())); + } else { + // This path existed already. Make sure we aren't showing a conflicting rank + if (rgfa_path_rank != get<0>(found->second)) { + throw GFAFormatError("rGFA path " + rgfa_path_name + " has conflicting ranks " + std::to_string(rgfa_path_rank) + " and " + std::to_string(get<0>(found->second))); + } + } + auto& visit_queue = get<2>(found->second); + auto& next_offset = get<1>(found->second); + auto node_length = GFAParser::length(sequence_range); + if (next_offset == rgfa_offset_on_path) { + // It's safe to dispatch this visit right now since it's the next one expected along the path. + for (auto& listener : this->rgfa_listeners) { + // Tell all the listener functions about this visit + listener(assigned_id, rgfa_offset_on_path, node_length, rgfa_path_name, rgfa_path_rank); + } + // Advance the offset by the sequence length; + next_offset += node_length; + while (!visit_queue.empty() && next_offset == get<0>(visit_queue.top())) { + // The lowest-offset queued visit can be handled now because it abuts what we just did. + // Grab the visit. 
+ auto& visit = visit_queue.top(); + for (auto& listener : this->rgfa_listeners) { + // Tell all the listener functions about this visit + listener(get<1>(visit), get<0>(visit), get<2>(visit), rgfa_path_name, rgfa_path_rank); + } + // Advance the offset by the sequence length; + next_offset += get<2>(visit); + // And pop the visit off + visit_queue.pop(); + } + } else { + // Add this visit to the heap so we can handle it when we find the missing visits. + visit_queue.emplace(rgfa_offset_on_path, assigned_id, node_length); + } + } + } + return true; + } + break; + case 'L': + // Edges can be handled if their nodes exist already + { + tuple l_parse = GFAParser::parse_l(line_buffer); + + // We only get these IDs if they have been seen already as nodes + nid_t n1 = GFAParser::find_existing_sequence_id(get<0>(l_parse), this->id_map()); + if (!n1) { + save_line_until_node(get<0>(l_parse)); + return false; + } + nid_t n2 = GFAParser::find_existing_sequence_id(get<2>(l_parse), this->id_map()); + if (!n2) { + save_line_until_node(get<2>(l_parse)); + return false; + } + + for (auto& listener : this->edge_listeners) { + // Tell all the listener functions + listener(n1, get<1>(l_parse), n2, get<3>(l_parse), get<4>(l_parse), get<5>(l_parse)); + } + } + break; + case 'P': + // Paths can be handled if all their nodes have been seen, and we know enough about the header. + { + if (awaiting_header) { + save_line_until_next_pass(); + return false; + } + + bool missing = false; + string missing_name; + + // TODO: we don't check for duplicate path lines here. + // Listeners might. + + // Parse out the path pieces: name, visits, overlaps, tags + tuple p_parse = GFAParser::parse_p(line_buffer); + auto& path_name = get<0>(p_parse); + auto& visits = get<1>(p_parse); + auto& overlaps = get<2>(p_parse); + auto& tags = get<3>(p_parse); + + for(auto it = overlaps.first; it != overlaps.second; ++it) { + if (*it != '*' && *it != ',' && *it != 'M' && (*it < '0' || *it > '9')) { + // This overlap isn't just * or a list of * or a list of matches with numbers. + // We can't handle it + throw GFAFormatError("Path " + path_name + " has nontrivial overlaps and can't be handled", it); + } + } + + // Make sure we have all the nodes in the graph + GFAParser::scan_p_visits(visits, [&](int64_t step_rank, + const GFAParser::chars_t& step_id, + bool step_is_reverse) { + if (step_rank == -1) { + // Nothing to do for empty paths + return true; + } + + if (step_rank >= 0) { + string step_string = GFAParser::extract(step_id); + nid_t n = GFAParser::find_existing_sequence_id(step_string, this->id_map()); + if (!n) { + missing = true; + missing_name = std::move(step_string); + return false; + } + } + return true; + }); + if (missing) { + save_line_until_node(missing_name); + return false; + } + + for (auto& listener : this->path_listeners) { + // Tell all the listener functions + listener(path_name, visits, overlaps, tags); + } + } + break; + case 'W': + // Walks can be handled if all their nodes have been seen, and we know enough about the hneader. 
+ { + if (awaiting_header) { + save_line_until_next_pass(); + return false; + } + + bool missing = false; + string missing_name; + + // Fins the pieces of the walk line + tuple, chars_t, tag_list_t> w_parse = GFAParser::parse_w(line_buffer); + auto& sample_name = get<0>(w_parse); + auto& haplotype = get<1>(w_parse); + auto& contig_name = get<2>(w_parse); + auto& subrange = get<3>(w_parse); + auto& visits = get<4>(w_parse); + auto& tags = get<5>(w_parse); + + GFAParser::scan_w_visits(visits, [&](int64_t step_rank, + const GFAParser::chars_t& step_id, + bool step_is_reverse) { + if (step_rank == -1) { + // Nothing to do for empty paths + return true; + } + + // For every node the walk visits, make sure we have seen it. + string step_string = GFAParser::extract(step_id); + nid_t n = GFAParser::find_existing_sequence_id(step_string, this->id_map()); + if (!n) { + missing = true; + missing_name = std::move(step_string); + return false; + } + return true; + }); + + if (missing) { + save_line_until_node(missing_name); + return false; + } + + for (auto& listener : this->walk_listeners) { + // Tell all the listener functions + listener(sample_name, haplotype, contig_name, subrange, visits, tags); + } + } + break; + default: + if (!warned_line_types.count(line_buffer[0])) { + // Warn once about this weird line type. + warned_line_types.insert(line_buffer[0]); + cerr << "warning:[GFAParser] Ignoring unrecognized " << line_buffer[0] << " line type" << endl; + } + } + } catch (GFADuplicatePathError& e) { + // We couldn't do what this line said because we already have this path. + if (stop_on_duplicate_paths) { + // That's bad. Stop parsing. + throw; + } else { + // We can tolerate this. Just move on to the next line. + + // Make sure the error is annotated with position info. + // TODO: Invent some cool kind of stack-based context? + annotate_error(e); + + // And report it as a warning. + #pragma omp critical (cerr) + std::cerr << "warning:[GFAParser] Skipping GFA " << line_buffer[0] + << " line: " << e.what() << std::endl; + } + } + return true; + }; + + // We have a function to do a pass over a file of lines. It stops at the + // given max offset, if set. + auto process_lines_in_stream = [&](istream& in_stream, std::streampos max_offset) { + if (stream_is_seekable) { + // Keep our position in the input stream up to date. + in_pos = in_stream.tellg(); + } + while ((!stream_is_seekable || in_pos < max_offset) && getline(in_stream, line_buffer)) { + // For each line in the input file, before the max offset + if (!line_buffer.empty()) { + // Handle all lines in the stream that we can handle now. + if (handle_line_if_ready()) { + // If we handled the line, we need to mark the end of any unhandled range that might be running. + if (pass_number== 1 && stream_is_seekable && !unprocessed_ranges.empty() && + get<1>(unprocessed_ranges.back()) == eof_pos) { + // the unprocessed range ends where this line started. + get<1>(unprocessed_ranges.back()) = in_pos; +#ifdef debug + std::cerr << "Ended unprocessed range at " << in_pos << std::endl; +#endif + } + } + } + if (stream_is_seekable) { + // Keep our position in the original input stream up to date. 
+ in_pos = in_stream.tellg(); + } + line_number++; + } +#ifdef debug + std::cerr << "Stop processing run at " << in_pos << "/" << max_offset << std::endl; +#endif + }; + + try { + + pass_number = 1; + line_number = 1; + awaiting_header = true; + process_lines_in_stream(in, eof_pos); + + if (stream_is_seekable) { + if (!unprocessed_ranges.empty()) { + // Handle unprocessed ranges of the file by seeking back to them. + + // Make sure to clear out EOF. + in.clear(); + + pass_number = 2; + // There can't be any new headers. + awaiting_header = false; + for (auto& range : unprocessed_ranges) { + in.seekg(get<0>(range)); + if (!in.good()) { + throw std::runtime_error("Unable to seek in GFA stream that should be seekable"); + } + line_number = get<2>(range); + process_lines_in_stream(in, get<1>(range)); + } + } + } else { + if (!buffer_name.empty()) { + // We also have lines in the buffer to handle. + buffer_out_stream.close(); + ifstream buffer_in_stream(buffer_name); + pass_number = 2; + // We forget the original line numbers and restart. + line_number = 1; + // There can't be any new headers. + awaiting_header = false; + process_lines_in_stream(buffer_in_stream, eof_pos); + + // Clean up buffer before returning + // TODO: Who takes care of this on GFA format error? + buffer_in_stream.close(); + unlink(buffer_name.c_str()); + } + } + + + // Run through any rGFA paths that don't start at 0 or have gaps. + for (auto& kv : rgfa_path_cache) { + auto& rgfa_path_name = kv.first; + auto& rgfa_path_rank = get<0>(kv.second); + auto& visit_queue = get<2>(kv.second); + + while (!visit_queue.empty()) { + // Grab the visit. + auto& visit = visit_queue.top(); + for (auto& listener : this->rgfa_listeners) { + // Tell all the listener functions about this visit + listener(get<1>(visit), get<0>(visit), get<2>(visit), rgfa_path_name, rgfa_path_rank); + } + // And pop the visit off + visit_queue.pop(); + } + } + } catch (GFAFormatError& e) { + // Tell the error where it happened + annotate_error(e); + // Re-throw the new and improved error + throw e; + } +} + +GFAIDMapInfo& GFAParser::id_map() { + if (external_id_map) { + return *external_id_map; + } + if (!internal_id_map) { + internal_id_map = make_unique(); + } + return *internal_id_map; +}; + + +GFAFormatError::GFAFormatError(const string& message) : std::runtime_error(message) { + // Nothing to do! +} + +GFAFormatError::GFAFormatError(const string& message, const GFAParser::cursor_t& position, const char* parsing_state) : std::runtime_error(message + (parsing_state ? (" while " + std::string(parsing_state)) : "")), has_position(true), position(position) { + // Nothing to do! +} + +const char* GFAFormatError::what() const noexcept { + if (message_buffer.empty()) { + // We need to generate the message + stringstream ss; + ss << "GFA format error: "; + + if (pass_number != 0) { + // We do the pass first because we might need to report a buffer + // line number instead of an original line number. 
+ ss << "On pass " << pass_number << ": "; + } + if (!file_name.empty()) { + ss << "In file " << file_name << ": "; + } + if (line_number != 0) { + ss << "On line " << line_number << ": "; + } + if (column_number != 0) { + ss << "At column " << column_number << ": "; + } + + // Add on the message from the base class + ss << std::runtime_error::what(); + + // Save the composed message + message_buffer = ss.str(); + } + return message_buffer.c_str(); +} + +GFADuplicatePathError::GFADuplicatePathError(const std::string& path_name) : GFAFormatError("Duplicate path " + path_name + " exists in graph") { + // Nothing to do! +} + +GFADuplicatePathError::GFADuplicatePathError(const std::string& path_name, const GFAParser::cursor_t& position, const char* parsing_state) : GFAFormatError("Duplicate path " + path_name + " exists in graph", position, parsing_state) { + // Nothing to do! +} + + +} +} diff --git a/src/algorithms/gfa_to_handle.hpp b/src/algorithms/gfa_to_handle.hpp new file mode 100644 index 00000000000..c5e3ed49633 --- /dev/null +++ b/src/algorithms/gfa_to_handle.hpp @@ -0,0 +1,327 @@ +#ifndef VG_ALGORITHMS_GFA_TO_HANDLE_HPP_INCLUDED +#define VG_ALGORITHMS_GFA_TO_HANDLE_HPP_INCLUDED + +/** + * \file gfa_to_handle.hpp + * + * Defines algorithms for copying data from GFA files into handle graphs + */ + +#include +#include +#include + +#include "../handle.hpp" + +namespace vg { +namespace algorithms { +using namespace std; + +/** + * Stores ID information for a graph imported from a GFA. + * Either all IDs are numerically equivalent to their GFA string IDs, or they + * are stored in the name_to_id map. + */ +struct GFAIDMapInfo : public NamedNodeBackTranslation { + /// If true, GFA string IDs are just graph numerical IDs. + bool numeric_mode = true; + /// This holds the max node ID yet used. + nid_t max_id = 0; + /// This maps from GFA string ID to graph numerical ID. + /// This is behind a unique_ptr so it can be safely pointed into. + unique_ptr> name_to_id = std::make_unique>(); + + /// This inverts the name to ID map, and is populated when + /// invert_translation is called, so it can be accessed thread-safely. + unique_ptr> id_to_name; + + /** + * Prepare the backing data structures for get_back_graph_node_name(). Call after name_to_id is complete. + */ + void invert_translation(); + + /** + * Back-translation of node ranges. Is a no-op for imported GFA graphs that + * haven't been modified, since the GFA graph is itself the backing graph. + */ + std::vector translate_back(const oriented_node_range_t& range) const; + + /** + * Get the GFA sequence name of a node, given its ID. + * Assumes this will never be called until after name_to_id is fully populated. + */ + std::string get_back_graph_node_name(const nid_t& back_node_id) const; + +}; + +/// Read a GFA file for a blunt-ended graph into a HandleGraph. Give "-" as a filename for stdin. +/// +/// Throws GFAFormatError if the GFA file is not acceptable, and +/// std::ios_base::failure if an IO operation fails. Throws invalid_argument if +/// otherwise misused. +/// Does not give max ID hints, and so might be very slow when loading into an ODGI graph. +void gfa_to_handle_graph(const string& filename, + MutableHandleGraph* graph, + GFAIDMapInfo* translation = nullptr); + +/// Overload which serializes its translation to a file internally. 
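+/// A minimal usage sketch (hedged; bdsg::HashGraph stands in for any writable
+/// graph implementation, and the file names here are illustrative, not part of
+/// this API):
+///
+///     bdsg::HashGraph graph;
+///     vg::algorithms::gfa_to_handle_graph("input.gfa", &graph, "translation.tsv");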
+void gfa_to_handle_graph(const string& filename,
+                         MutableHandleGraph* graph,
+                         const string& translation_filename);
+
+/// Load a GFA from a stream (assumed not to be seekable or reopenable) into a HandleGraph.
+void gfa_to_handle_graph(istream& in,
+                         MutableHandleGraph* graph,
+                         GFAIDMapInfo* translation = nullptr);
+
+/// Same as gfa_to_handle_graph but also adds path elements from the GFA to the graph.
+void gfa_to_path_handle_graph(const string& filename,
+                              MutablePathMutableHandleGraph* graph,
+                              GFAIDMapInfo* translation = nullptr,
+                              int64_t max_rgfa_rank = numeric_limits<int64_t>::max(),
+                              unordered_set* ignore_sense = nullptr);
+
+/// Overload which serializes its translation to a file internally.
+void gfa_to_path_handle_graph(const string& filename,
+                              MutablePathMutableHandleGraph* graph,
+                              int64_t max_rgfa_rank,
+                              const string& translation_filename,
+                              unordered_set* ignore_sense = nullptr);
+
+/// Load a GFA from a stream (assumed not to be seekable or reopenable) into a PathHandleGraph.
+void gfa_to_path_handle_graph(istream& in,
+                              MutablePathMutableHandleGraph* graph,
+                              GFAIDMapInfo* translation = nullptr,
+                              int64_t max_rgfa_rank = numeric_limits<int64_t>::max(),
+                              unordered_set* ignore_sense = nullptr);
+
+/**
+ * Lower-level tools for parsing GFA elements.
+ *
+ * Parsing functions return the fields as strings, and don't support overlaps.
+ * Optional tags get read as strings in the vectors.
+ *
+ * Allows you to register "listeners" for different kinds of GFA file items, by
+ * adding functions to the various *_listeners vectors. These listeners can
+ * raise GFAFormatError or its subclasses if they do not like what the GFA is
+ * saying. Some types of GFAFormatError can be caught internally and
+ * processing of the file will continue with the next line, but *not* with the
+ * next listener for that line, so the user is responsible for worrying about
+ * what happens if some but not all listeners for something end up getting
+ * called because one failed.
+ */
+class GFAParser {
+public:
+
+    // We are going to split up existing line buffers.
+    // So we need a cursor into one.
+    using cursor_t = string::const_iterator;
+    // And a range in one
+    using chars_t = pair<cursor_t, cursor_t>;
+    // And a way to get the string value for one
+    inline static string extract(const chars_t& range) {
+        return string(range.first, range.second);
+    }
+    // And a way to get the length of one
+    inline static size_t length(const chars_t& range) {
+        return range.second - range.first;
+    }
+    // And a way to tell if one is empty
+    inline static bool empty(const chars_t& range) {
+        return range.second == range.first;
+    }
+    // And a type for a collection of GFA tags.
+    // This could become a range or list of ranges if we wanted to copy less.
+    using tag_list_t = vector<string>;
+
+    /**
+     * Parse tags out from a possibly empty range to a vector of tag strings.
+     */
+    static tag_list_t parse_tags(const chars_t& tag_range);
+
+    /**
+     * Parse an H line to tags
+     */
+    static tuple parse_h(const string& h_line);
+
+    /**
+     * Parse an S line to name, sequence, and tags
+     */
+    static tuple parse_s(const string& s_line);
+
+    /**
+     * Parse an L line to name, is_reverse, name, is_reverse, overlap, and tags
+     */
+    static tuple parse_l(const string& l_line);
+
+    /**
+     * Parse a P line into name, visits, overlaps, and tags.
+     */
+    static tuple parse_p(const string& p_line);
+
+    /**
+     * Parse a W line into sample, haplotype, sequence, range (start and end), walk, and tags. 
+     * If some or all of the range is missing, uses NO_SUBRANGE and NO_END_POSITION from PathMetadata.
+     * Doesn't include an end position if a start position isn't set.
+     */
+    static tuple, chars_t, tag_list_t> parse_w(const string& p_line);
+
+    /**
+     * Scan visits extracted from a P line.
+     * Calls a callback with all the steps.
+     * visit_step takes {rank (-1 if path empty), step node name, step reversed}
+     * and returns true if it wants to keep iterating (false means stop).
+     */
+    static void scan_p_visits(const chars_t& visit_range,
+                              function visit_step);
+
+    /**
+     * Scan visits extracted from a W line.
+     * Calls a callback with all the steps.
+     * visit_step takes {rank (-1 if path empty), step node name, step reversed}
+     * and returns true if it wants to keep iterating (false means stop).
+     */
+    static void scan_w_visits(const chars_t& visit_range,
+                              function visit_step);
+
+
+    /**
+     * Scan visits extracted from a P or W line, as specified in line_type.
+     * Calls a callback with all the steps.
+     * visit_step takes {rank (-1 if path empty), step node name, step reversed}
+     * and returns true if it wants to keep iterating (false means stop).
+     */
+    static void scan_visits(const chars_t& visit_range, char line_type,
+                            function visit_step);
+
+    /**
+     * Decode rGFA tags from the given list of tags from an S line.
+     * Stores rGFA parameters at the given locations if set.
+     * Returns true if a complete set of tags was found.
+     */
+    static bool decode_rgfa_tags(const tag_list_t& tags,
+                                 string* out_name = nullptr,
+                                 int64_t* out_offset = nullptr,
+                                 int64_t* out_rank = nullptr);
+
+    /**
+     * Parse a GFA name into a numeric id.
+     *
+     * If all ids are numeric, they will be converted directly with stol.
+     *
+     * If all ids are non-numeric, they will get incrementing ids beginning
+     * with 1, in the order they are visited.
+     *
+     * If they are a mix of numeric and non-numeric, the numeric ones will be
+     * converted with stol until the first non-numeric one is found, then it
+     * will revert to using max-id.
+     *
+     * Since non-numeric ids are dependent on the order the nodes are scanned,
+     * there is the unfortunate side effect that they will be different
+     * depending on whether the GFA is processed in lexicographic order or file
+     * order.
+     *
+     * If the string ID has been seen before, returns 0.
+     */
+    static nid_t assign_new_sequence_id(const string& str, GFAIDMapInfo& id_map_info);
+
+    /**
+     * Find the existing sequence ID for the given node name, or 0 if it has not been seen yet.
+     */
+    static nid_t find_existing_sequence_id(const string& str, GFAIDMapInfo& id_map_info);
+
+    // To actually parse GFA, we stick event listeners on here and then we go
+    // through the GFA. It is the parser's job to make sure events aren't fired
+    // before events they depend on (so a path is delayed until all the nodes
+    // in it are parsed).
+
+    // We can either use an internal ID map here
+    unique_ptr<GFAIDMapInfo> internal_id_map;
+    // Or have this pointed at an external one before we start parsing.
+    GFAIDMapInfo* external_id_map = nullptr;
+
+    /// Get the ID map we should be using for parsing.
+    inline GFAIDMapInfo& id_map();
+
+    /// These listeners are called for the header line(s), if any.
+    vector> header_listeners;
+    /// These listeners will be called with information for all nodes.
+    /// Listeners are protected from duplicate node IDs.
+    vector> node_listeners;
+    /// These listeners will be called with information for all edges, after
+    /// the node listeners for the involved nodes. 
+    /// Listeners are not protected from duplicate edges.
+    vector> edge_listeners;
+    /// These listeners will be called with information for all P line paths,
+    /// after the listeners for all involved nodes, and for the first header if any.
+    /// Listeners are not protected from duplicate path names.
+    vector> path_listeners;
+    /// These listeners will be called with information for all W line paths,
+    /// after the listeners for all involved nodes, and for the first header if any.
+    /// Listeners are not protected from duplicate path metadata.
+    vector& subrange, const chars_t& visits, const tag_list_t& tags)>> walk_listeners;
+    /// These listeners will be called with each visit of an rGFA path to a
+    /// node, after the node listeners for the involved node, but in an
+    /// unspecified order with respect to listeners for headers. They will be
+    /// called in order along each path. The listener is responsible for
+    /// detecting any gaps in the offset space and producing multiple subpaths
+    /// if necessary.
+    /// Listeners are protected from duplicate paths with the same name and
+    /// different ranks, but not from overlaps of nodes in path offset space.
+    vector> rgfa_listeners;
+
+    /// Include paths from rGFA tags at this rank or lower. Set to -1 to ignore rGFA tags.
+    int64_t max_rgfa_rank = -1;
+
+    /// Set to true to treat duplicate paths as errors. Otherwise, they will be
+    /// treated as warnings and the duplicates will be discarded. Some GFA
+    /// files, like the first HPRC graph releases, include duplicate paths.
+    bool stop_on_duplicate_paths = false;
+
+    /**
+     * Parse GFA from the given stream.
+     */
+    void parse(istream& in);
+
+};
+
+/// This exception will be thrown if the GFA data is not acceptable.
+struct GFAFormatError : public std::runtime_error {
+    /// We can make one from a message
+    GFAFormatError(const string& message);
+    /// We can also make one with a position and a possibly null parsing state
+    GFAFormatError(const string& message, const GFAParser::cursor_t& position, const char* parsing_state = nullptr);
+
+    // The error may or may not have a position in a buffer attached.
+    bool has_position = false;
+    GFAParser::cursor_t position;
+
+    // The error also can be annotated with file location information when it makes
+    // it up the stack to where those things are known.
+    // These are all 1-based
+    size_t pass_number = 0;
+    size_t line_number = 0;
+    size_t column_number = 0;
+    string file_name = "";
+
+    // For making what() messages we need our own message buffer.
+    mutable string message_buffer;
+
+    /// Return a pointer to a string describing this exception.
+    /// Not thread safe.
+    virtual const char* what() const noexcept;
+};
+
+/// This exception will be thrown if the GFA data includes multiple copies of
+/// what we take to be the same path. We need to be able to tolerate this
+/// situation at least in some cases because it is true of the HPRC first
+/// release graphs, which duplicate paths across P lines and rGFA tags. 
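+/// A minimal sketch of how a caller might surface these errors (illustrative
+/// only; "parser" and "in" are assumed to be set up elsewhere):
+///
+///     parser.stop_on_duplicate_paths = true;
+///     try {
+///         parser.parse(in);
+///     } catch (const GFADuplicatePathError& e) {
+///         std::cerr << e.what() << std::endl;
+///     }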
+struct GFADuplicatePathError : public GFAFormatError { + GFADuplicatePathError(const std::string& path_name); + GFADuplicatePathError(const std::string& path_name, const GFAParser::cursor_t& position, const char* parsing_state = nullptr); +}; + +} +} + +#endif diff --git a/src/algorithms/id_sort.cpp b/src/algorithms/id_sort.cpp new file mode 100644 index 00000000000..6447b8b3cd6 --- /dev/null +++ b/src/algorithms/id_sort.cpp @@ -0,0 +1,28 @@ +#include "id_sort.hpp" + +#include +#include + +namespace vg { +namespace algorithms { + +using namespace std; + +vector id_order(const HandleGraph* g) { + // We will fill and sort this + vector to_return; + g->for_each_handle([&](const handle_t& handle) { + // Collect all the handles + to_return.push_back(handle); + }); + + std::sort(to_return.begin(), to_return.end(), [&](const handle_t& a, const handle_t& b) { + // Sort in ID order with the standard algorithm + return g->get_id(a) < g->get_id(b); + }); + + return to_return; +} + +} +} diff --git a/src/algorithms/id_sort.hpp b/src/algorithms/id_sort.hpp new file mode 100644 index 00000000000..d7590819197 --- /dev/null +++ b/src/algorithms/id_sort.hpp @@ -0,0 +1,29 @@ +#ifndef VG_ALGORITHMS_ID_SORT_HPP_INCLUDED +#define VG_ALGORITHMS_ID_SORT_HPP_INCLUDED + +/** + * \file id_sort.hpp + * + * Defines a by-ID sort algorithm for handle graphs. + */ + +#include + +#include "../handle.hpp" + + +namespace vg { +namespace algorithms { + +using namespace std; + + +/** + * Order all the handles in the graph in ID order. All orientations are forward. + */ +vector id_order(const HandleGraph* g); + +} +} + +#endif diff --git a/src/algorithms/intersect_path_offsets.cpp b/src/algorithms/intersect_path_offsets.cpp new file mode 100644 index 00000000000..4cd37e03c14 --- /dev/null +++ b/src/algorithms/intersect_path_offsets.cpp @@ -0,0 +1,71 @@ +/** + * \file intersect_path_offsets.cpp + * + * Contains implementation of intersect_path_offsets function + */ + +#include "intersect_path_offsets.hpp" + +#include + +//#define debug + +namespace vg { +namespace algorithms { + +using namespace std; + + +bool intersect_path_offsets(const path_offset_collection_t& a_offsets, + const path_offset_collection_t& b_offsets, + size_t maximum_distance) { + + for (auto& kv : b_offsets) { + // For each path on the b side + const path_handle_t& path = kv.first; + + auto found = a_offsets.find(path); + if (found == a_offsets.end()) { + // Skip it if it's not also on the a side + continue; + } + // If it is on the a side, we'll do search against this sorted list. + auto& target_positions = found->second; + + for (auto& b_position : kv.second) { + // For each offset on the b side, we need to do a binary search. + + // Find the nearest thing to our right, if any. + auto falling_after_it = std::lower_bound(target_positions.begin(), target_positions.end(), b_position); + if (falling_after_it != target_positions.begin()) { + // There's also going to be something to our left. Check that first. + auto falling_before_it = falling_after_it; + --falling_before_it; + if (b_position.first - falling_before_it->first <= maximum_distance) { + // It is close enough before to constitute a hit + return true; + } + } + if (falling_after_it != target_positions.end()) { + // The thing to our right actually exists. Check it too. + if (falling_after_it->first - b_position.first <= maximum_distance) { + // It is close enough to constitute a hit + return true; + } + } + } + } + + // If we get here we found no matches on any paths. 
+ return false; +} + +void sort_path_offsets(path_offset_collection_t& offsets) { + for (auto& kv : offsets) { + // Sort along each path. + std::sort(kv.second.begin(), kv.second.end()); + } +} + +} +} diff --git a/src/algorithms/intersect_path_offsets.hpp b/src/algorithms/intersect_path_offsets.hpp new file mode 100644 index 00000000000..6be5e8d3606 --- /dev/null +++ b/src/algorithms/intersect_path_offsets.hpp @@ -0,0 +1,48 @@ +#ifndef VG_ALGORITHMS_INTERSECT_PATH_OFFSETS_HPP_INCLUDED +#define VG_ALGORITHMS_INTERSECT_PATH_OFFSETS_HPP_INCLUDED + +/** + * \file intersect_path_offsets.hpp + * + * Defines algorithm for finding whether any offsets on paths in one set are + * near any offsets on paths in a different set. + */ + +#include "../handle.hpp" + +#include "nearest_offsets_in_paths.hpp" + +namespace vg { +namespace algorithms { + +using namespace std; + +/** + * Given two maps from path handle to (position, orientation) pair vectors, + * determine if any positions in the two sets are on the same path, within the + * given maximum distance. + * + * The set expected to have more visits should be passed first. + * + * Orientation is ignored. + * + * The first set must be sorted, for binary search. We run binary search for + * each item in the second set, so the first set should be the larger one. + * + * We run in b log a time. + */ +bool intersect_path_offsets(const path_offset_collection_t& a_offsets, + const path_offset_collection_t& b_offsets, + size_t maximum_distance); + +/** + * Sort path offsets, so intersect_path_offsets() can use them as a target. + */ +void sort_path_offsets(path_offset_collection_t& offsets); + + +} +} + +#endif + diff --git a/src/algorithms/is_acyclic.cpp b/src/algorithms/is_acyclic.cpp deleted file mode 100644 index 0b9a73ef2df..00000000000 --- a/src/algorithms/is_acyclic.cpp +++ /dev/null @@ -1,88 +0,0 @@ -#include "is_acyclic.hpp" - -#include - -namespace vg { -namespace algorithms { - -using namespace std; - -bool is_acyclic(const HandleGraph* graph) { - - // the existence of reversing cycles is equivalent to whether a single stranded - // orientation exists - if (single_stranded_orientation(graph).size() < graph->node_size()) { - return false; - } - // the existence of non-reversing cycles is checked by the directed acyclic algorithm - return is_directed_acyclic(graph); -} - -bool is_directed_acyclic(const HandleGraph* graph) { - // We track the in and out degrees of all nodes. We then clean up degrees - // entries from tips until either we've cleaned up all the nodes or there - // are only directed cycles left. 
- - // Build the degrees map - unordered_map> degrees; - // And also the stack of tips to start at - vector stack; - graph->for_each_handle([&](const handle_t& here) { - size_t start_degree = 0; - graph->follow_edges(here, true, [&](const handle_t& ignored) { - start_degree++; - }); - - size_t end_degree = 0; - graph->follow_edges(here, false, [&](const handle_t& ignored) { - end_degree++; - }); - - degrees[graph->get_id(here)] = make_pair(start_degree, end_degree); - - if (start_degree == 0) { - // Tip looking forward - stack.push_back(here); - } - if (end_degree == 0) { - // Tip looking backward - stack.push_back(graph->flip(here)); - } - - }); - - - while (!stack.empty()) { - handle_t here = stack.back(); - stack.pop_back(); - - auto iter = degrees.find(graph->get_id(here)); - if (iter == degrees.end()) { - // Already processed - continue; - } - - degrees.erase(iter); - - graph->follow_edges(here, false, [&](const handle_t& next) { - auto next_iter = degrees.find(graph->get_id(next)); - if (next_iter != degrees.end()) { - // We have a node next that we haven't finished yet - - // Reduce its degree on the appropriate side. - int64_t& in_degree = graph->get_is_reverse(next) ? next_iter->second.second : next_iter->second.first; - in_degree--; - if (in_degree == 0) { - // This is a new tip in this orientation - stack.push_back(next); - } - } - }); - } - - // If we clean up the whole graph, it must have been directed-acyclic. - return degrees.empty(); -} - -} -} diff --git a/src/algorithms/is_acyclic.hpp b/src/algorithms/is_acyclic.hpp deleted file mode 100644 index ab1a0d564e3..00000000000 --- a/src/algorithms/is_acyclic.hpp +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef VG_ALGORITHMS_IS_ACYCLIC_HPP_INCLUDED -#define VG_ALGORITHMS_IS_ACYCLIC_HPP_INCLUDED - -/** - * \file is_acyclic.hpp - * - * Defines algorithms for deciding if a graph is acyclic or directed acyclic - */ - -#include "../handle.hpp" -#include "is_single_stranded.hpp" - -#include -#include - -namespace vg { -namespace algorithms { - -using namespace std; - -// Returns true if the graph contains no cycles (i.e. true if no node can reach itself -// along a bidirected walk). -bool is_acyclic(const HandleGraph* graph); - -/// Returns true if the graph contains no directed cycles. It may contain reversing -/// cycles (i.e. true if no node can reach itself in the same orientation along a -/// bidirected walk, but it might be able to reach itself in the opposite orientation). 
-bool is_directed_acyclic(const HandleGraph* graph); - -} -} - -#endif diff --git a/src/algorithms/is_single_stranded.cpp b/src/algorithms/is_single_stranded.cpp deleted file mode 100644 index e41e07eee39..00000000000 --- a/src/algorithms/is_single_stranded.cpp +++ /dev/null @@ -1,111 +0,0 @@ -#include "is_single_stranded.hpp" - -namespace vg { -namespace algorithms { - -using namespace std; - - bool is_single_stranded(const HandleGraph* graph) { - - bool single_stranded = true; - - function check_edges = [&](const handle_t& handle) { - - function check_edge = [&](const handle_t& next) { - single_stranded = (graph->get_is_reverse(handle) == graph->get_is_reverse(next)); - return single_stranded; - }; - - graph->follow_edges(handle, false, check_edge); - if (single_stranded) { - graph->follow_edges(handle, true, check_edge); - } - return single_stranded; - }; - - graph->for_each_handle(check_edges); - - return single_stranded; - } - - vector single_stranded_orientation(const HandleGraph* graph) { - - // the return value - vector orientation; - orientation.reserve(graph->node_size()); - - // keep track of which nodes have already been oriented and which orientation - unordered_map recorded_orientation; - - // keep track of whether we've encountered a node in two orientations - bool failed = false; - - // DFS through the graph - graph->for_each_handle([&](const handle_t& handle) { - if (recorded_orientation.count(graph->get_id(handle))) { - return true; - } - - // initialize the stack - vector stack(1, handle); - - // record the orientation of the seed for the traversal - orientation.push_back(handle); - recorded_orientation[graph->get_id(handle)] = graph->get_is_reverse(handle); - - function walk_edges = [&](const handle_t& next) { - auto iter = recorded_orientation.find(graph->get_id(next)); - if (iter != recorded_orientation.end()) { - // we've been here before, but make sure we're encountering it in the same orientation - failed = (iter->second != graph->get_is_reverse(next)); - } - else { - // add to the DFS stack - stack.push_back(next); - - // record the orientation we encountered it in - orientation.push_back(next); - recorded_orientation[graph->get_id(next)] = graph->get_is_reverse(next); - } - // continue if we haven't failed - return !failed; - }; - - // DFS - while (!stack.empty() && !failed) { - handle_t here = stack.back(); - stack.pop_back(); - - graph->follow_edges(here, true, walk_edges); - if (!failed) { - graph->follow_edges(here, false, walk_edges); - } - } - - // continue if there's any more to do and we haven't failed - return orientation.size() < graph->node_size() && !failed; - }); - - // if we failed, we return an empty vector as a sentinel - if (failed) { - orientation.clear(); - } - - return orientation; - } - - - unordered_set make_single_stranded(MutableHandleGraph* graph) { - - auto orientations = single_stranded_orientation(graph); - - if (orientations.size() != graph->node_size()) { - // we got the sentinel for an un-single-strandable graph - cerr << "error:[algorithms] attempted to apply single-stranded orientation to non-single stranded graph" << endl; - exit(1); - } - - return apply_orientations(graph, orientations); - } -} -} diff --git a/src/algorithms/is_single_stranded.hpp b/src/algorithms/is_single_stranded.hpp deleted file mode 100644 index d2b1d601ca6..00000000000 --- a/src/algorithms/is_single_stranded.hpp +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef VG_ALGORITHMS_IS_SINGLE_STRANDED_HPP_INCLUDED -#define VG_ALGORITHMS_IS_SINGLE_STRANDED_HPP_INCLUDED - 
-/** - * \file single_stranded.hpp - * - * Defines algorithms for deciding if a graph contains reversing edges. - */ - -#include "../handle.hpp" -#include "apply_bulk_modifications.hpp" - -#include -#include -#include - -namespace vg { -namespace algorithms { - -using namespace std; - - /// Returns true if the graph contains no reversing edges (i.e. edges that connected - /// the locally forward orientation of a node to the locally reverse orientation of - /// of another node). - bool is_single_stranded(const HandleGraph* graph); - - /// Returns a vector of handles where the orientation of each handle indicates an - /// orientation that could be used to convert the graph into a single-stranded graph. - /// That is, if all of the reverse handles (or all of the forward handles) were swapped - /// in orientation, the graph would contain no reversing edges. Returns an empty vector - /// if there is no such combination of node orientations (also if graph has no nodes). - vector single_stranded_orientation(const HandleGraph* graph); - - /// Finds a set of node orientations that can be applied so that there are no - /// reversing edges (i.e. every edge connects a locally forward node traversal - /// to another locally forward orientation). If no such combination of orientations - /// exists, produces an error and exits. Returns a set of the node IDs for nodes that - /// were swapped in orientation. Potentially invalidates any existing handles. - unordered_set make_single_stranded(MutableHandleGraph* graph); - -} -} - -#endif diff --git a/src/algorithms/jump_along_path.cpp b/src/algorithms/jump_along_path.cpp new file mode 100644 index 00000000000..338b9ca4ce2 --- /dev/null +++ b/src/algorithms/jump_along_path.cpp @@ -0,0 +1,120 @@ +/** + * \file jump_along_path.cpp + * + * Contains implementation of jump_along_closest_path algorithm + */ + +#include "jump_along_path.hpp" + +//#define debug_algorithms + +namespace vg { +namespace algorithms { + +using namespace std; + + vector jump_along_closest_path(const PathPositionHandleGraph* graph, + const pos_t& pos, int64_t jump_dist, + size_t max_search_dist) { +#ifdef debug_algorithms + cerr << "jumping " << jump_dist << " from " << pos << " with a max search dist of " << max_search_dist << endl; +#endif + + // intialize the return value + vector to_return; + + function look_for_jumpable_paths = [&](handle_t handle, int64_t search_dist, bool search_left) { + + bool found_jumpable_path = false; + +#ifdef debug_algorithms + cerr << "checking for jumpable paths for " << graph->get_id(handle) << (graph->get_is_reverse(handle) ? "-" : "+") << " at search dist " << search_dist << " from searching " << (search_left ? 
"leftwards" : "rightwards") << endl; +#endif + + for (const step_handle_t& step : graph->steps_of_handle(handle)) { + bool path_rev_strand = (graph->get_is_reverse(graph->get_handle_of_step(step)) + != graph->get_is_reverse(handle)); + + int64_t dist_to_jump; + if (path_rev_strand && search_left) { + dist_to_jump = -jump_dist - search_dist; + } + else if (path_rev_strand) { + dist_to_jump = -jump_dist + search_dist + graph->get_length(handle); + } + else if (search_left) { + dist_to_jump = jump_dist + search_dist + graph->get_length(handle); + } + else { + dist_to_jump = jump_dist - search_dist; + } + + int64_t target_path_pos = graph->get_position_of_step(step) + dist_to_jump; + if (target_path_pos >= 0 + && target_path_pos < graph->get_path_length(graph->get_path_handle_of_step(step))) { + + step_handle_t jump_step = graph->get_step_at_position(graph->get_path_handle_of_step(step), + target_path_pos); + + + size_t step_offset = graph->get_position_of_step(jump_step); + handle_t jump_handle = graph->get_handle_of_step(jump_step); + + to_return.emplace_back(graph->get_id(jump_handle), + graph->get_is_reverse(jump_handle) != path_rev_strand, + path_rev_strand + ? graph->get_position_of_step(jump_step) + graph->get_length(jump_handle) - target_path_pos + : target_path_pos - graph->get_position_of_step(jump_step)); + + found_jumpable_path = true; +#ifdef debug_algorithms + cerr << "found jump position " << to_return.back() << " at forward path offset " << target_path_pos << endl; +#endif + } + } + + return found_jumpable_path; + }; + + // records of (handle, searching left) prioritized by distance + // all distances are measured to the left end of the node + structures::RankPairingHeap, size_t> queue; + + handle_t handle = graph->get_handle(id(pos), is_rev(pos)); + + // add in the initial traversals in both directions from the start position + queue.push_or_reprioritize(make_pair(handle, true), offset(pos)); + queue.push_or_reprioritize(make_pair(handle, false), graph->get_length(handle) - offset(pos)); + + bool found_jumpable_path = false; + while (!queue.empty() && !found_jumpable_path) { + // get the queue that has the next shortest path + + auto here = queue.top(); + queue.pop(); + +#ifdef debug_algorithms + cerr << "traversing " << graph->get_id(here.first.first) << (graph->get_is_reverse(here.first.first) ? "-" : "+") << " in " << (here.first.second ? "leftward" : "rightward") << " direction at distance " << here.second << endl; +#endif + + graph->follow_edges(here.first.first, here.first.second, [&](const handle_t& next) { +#ifdef debug_algorithms + cerr << "\tfollowing edge to " << graph->get_id(next) << (graph->get_is_reverse(next) ? "-" : "+") << " at dist " << here.second << endl; +#endif + + found_jumpable_path = look_for_jumpable_paths(next, here.second, here.first.second); + + int64_t dist_thru = here.second + graph->get_length(next); + if (dist_thru <= (int64_t) max_search_dist) { + queue.push_or_reprioritize(make_pair(next, here.first.second), dist_thru); + } + + return !found_jumpable_path; + }); + } + + return to_return; + + } +} +} diff --git a/src/algorithms/jump_along_path.hpp b/src/algorithms/jump_along_path.hpp new file mode 100644 index 00000000000..a6c4bdc43ed --- /dev/null +++ b/src/algorithms/jump_along_path.hpp @@ -0,0 +1,32 @@ +#ifndef VG_ALGORITHMS_JUMP_ALONG_PATH_HPP_INCLUDED +#define VG_ALGORITHMS_JUMP_ALONG_PATH_HPP_INCLUDED + +/** + * \file jump_along_path.hpp + * + * Defines algorithm for jumping a given number of bases away from a position + * using paths. 
+ */ + +#include "../handle.hpp" + +#include "structures/rank_pairing_heap.hpp" + +namespace vg { +namespace algorithms { + +using namespace std; + + /// returns a vector of positions that are found by jumping a fixed oriented distance + /// along path(s) from the given position. if the position is not on a path, searches + /// from the position to a path and adds/subtracts the search distance to the jump + /// depending on the search direction. returns an empty vector if there is no path within + /// the max search distance or if the jump distance goes past the end of the path + vector jump_along_closest_path(const PathPositionHandleGraph* graph, + const pos_t& pos, int64_t jump_dist, + size_t max_search_dist); + +} +} + +#endif diff --git a/src/algorithms/k_widest_paths.cpp b/src/algorithms/k_widest_paths.cpp new file mode 100644 index 00000000000..e9cb9f303d4 --- /dev/null +++ b/src/algorithms/k_widest_paths.cpp @@ -0,0 +1,318 @@ +/** + * \file k_widest_paths.cpp + * + * Implementation of Yen's Algorithm over the bidirected graph. + */ + +#include "k_widest_paths.hpp" + +#include + +namespace vg { +namespace algorithms { + +using namespace structures; + +//#define debug_vg_algorithms + +pair> widest_dijkstra(const HandleGraph* g, handle_t source, handle_t sink, + function node_weight_callback, + function edge_weight_callback, + function is_node_ignored_callback, + function is_edge_ignored_callback, + bool greedy_avg) { + + // We keep a priority queue so we can visit the handle with the shortest + // distance next. We put handles in here whenever we see them with shorter + // distances (since STL priority queue can't update), so we also need to + // make sure nodes coming out haven't been visited already. + // (score, previous, current) + using Record = tuple; + + // We filter out handles that have already been visited. And keep track of predecessors + unordered_map> visited; + + // We need a custom ordering for the queue + struct IsFirstGreater { + inline bool operator()(const Record& a, const Record& b) { + return get<0>(a) < get<0>(b); + } + }; + + // We use a filtered priority queue for auto-Dijkstra + UpdateablePriorityQueue, IsFirstGreater> queue([](const Record& item) { + return get<2>(item); + }); + + // We toggle between greedy average flow and minimum flow here + function acc_node; + function acc_edge; + if (greedy_avg == false) { + // min flow + acc_node = [&](double next_score, handle_t next, double& total_score, double& total_length) { + return min(next_score, node_weight_callback(next)); + }; + acc_edge = [&](double next_score, edge_t edge, double& total_score, double& total_length) { + return min(next_score, edge_weight_callback(edge)); + }; + } else { + // heuristic average flow + acc_node = [&](double next_score, handle_t next, double& total_score, double& total_length) { + size_t length = g->get_length(next); + total_length += length; + total_score += node_weight_callback(next) * (double)length; + return total_score / (double)total_length; + }; + acc_edge = [&](double next_score, edge_t edge, double& total_score, double& total_length) { + total_length += 1; + total_score += edge_weight_callback(edge); + return total_score / (double)total_length; + }; + } + + // We keep a current handle + handle_t current; + handle_t previous; + // we don't include the score of the source + // todo: should this be an option? + double score = greedy_avg ? 
0 : numeric_limits::max(); + double total_score = 0; + double total_length = 0; + queue.push(make_tuple(score, source, source, total_score, total_length)); + + while (!queue.empty()) { + // While there are things in the queue, get the first. + tie(score, previous, current, total_score, total_length) = queue.top(); + queue.pop(); + +#ifdef debug_vg_algorithms + cerr << "Visit " << g->get_id(current) << " " << g->get_is_reverse(current) << " at width " << score << endl; +#endif + + // Remember that we made it here. + if (!visited.count(current) || score > visited[current].second) { + visited[current] = make_pair(previous, score); + } + + if (current != sink && current != g->flip(source)) { + g->follow_edges(current, false, [&](const handle_t& next) { + // For each handle to the right of here + if (!visited.count(next) && + !is_node_ignored_callback(next) && + !is_edge_ignored_callback(g->edge_handle(current, next))) { + + double next_score = score; + + if (next != source && next != sink) { + // we don't include the source / sink + // todo: should we? should it be an option? + next_score = acc_node(next_score, next, total_score, total_length); + } + next_score = acc_edge(next_score, g->edge_handle(current, next), total_score, total_length); + + // New shortest distance. Will never happen after the handle comes out of the queue because of Dijkstra. + queue.push(make_tuple(next_score, current, next, total_score, total_length)); + +#ifdef debug_vg_algorithms + cerr << "\tNew best path to " << g->get_id(next) << ":" << g->get_is_reverse(next) + << " at width " << next_score << endl; +#endif + + } else { +#ifdef debug_vg_algorithms + cerr << "\tDisregard path to " << g->get_id(next) << ":" << g->get_is_reverse(next) + << " at width " << score << " due to " << visited.count(next) << " || " + << is_node_ignored_callback(next) << " || " << is_edge_ignored_callback(g->edge_handle(current, next)) + < widest_path; + double width = 0; + if (visited.count(sink)) { + width = visited[sink].second; + for (handle_t tb = sink; widest_path.empty() || widest_path.back() != source; tb = visited[tb].first) { + widest_path.push_back(tb); + } + std::reverse(widest_path.begin(), widest_path.end()); + } + +#ifdef debug_vg_algorithms + cerr << "Returning wideset path (w=" << width << "): "; + for (auto h : widest_path) { + cerr << g->get_id(h) << ":" << g->get_is_reverse(h) << ", "; + } + cerr << endl; +#endif + + return make_pair(width, widest_path); +} + +// https://en.wikipedia.org/wiki/Yen%27s_algorithm +vector>> yens_k_widest_paths(const HandleGraph* g, handle_t source, handle_t sink, + size_t K, + function node_weight_callback, + function edge_weight_callback, + bool greedy_avg) { + + vector>> best_paths; + best_paths.reserve(K); + + // Eugene Lawler's optimization: keep track of spur nodes in the previous best path + // to not bother looking at the same prefix again + vector best_spurs; + + // get the widest path from dijkstra + best_paths.push_back(widest_dijkstra(g, source, sink, node_weight_callback, + edge_weight_callback, [](handle_t) {return false;}, + [](edge_t) {return false;}, greedy_avg)); + + // unable to get any kind of path. 
this is either a bug in the search or the graph + if (best_paths.back().second.empty()) { + best_paths.clear(); + return best_paths; + } + + best_spurs.push_back(0); + + // working path set, mapped to spur index (plus 1 -- ie next spot we want to look when finding new spurs) + // todo: make more efficient + map, size_t> B; + // used to pull out the biggest element in B + multimap, size_t>::iterator> score_to_B; + + // start scanning for our k-1 next-widest paths + for (size_t k = 1; k < K; ++k) { + + // we look for a "spur node" in the previous path. the current path will be the previous path + // up to that spur node, then a new path to the sink. (i is the index of the spur node in + // the previous (k - 1) path + vector& prev_path = best_paths[k - 1].second; + for (size_t i = best_spurs[k - 1]; i < prev_path.size() - 1; ++i) { + + handle_t spur_node = prev_path[i]; + // root path = prev_path[0 : i] + +#ifdef debug_vg_algorithms + cerr << "k=" << k << ": spur node=" << g->get_id(spur_node) << ":" << g->get_is_reverse(spur_node) << endl; +#endif + unordered_set forgotten_edges; + for (const auto& p_v : best_paths) { + const vector& p = p_v.second; + + // check if the root path is a prefix of p + bool is_common_root = true; + for (size_t j = 0; j <= i && is_common_root; ++j) { + if (j >= prev_path.size() || p[j] != prev_path[j]) { + is_common_root = false; + } + } + + // remove the links that are part of the previous shortest paths which share the same root path + if (is_common_root) { +#ifdef debug_vg_algorithms + cerr << "forgetting edge " << g->get_id(p[i]) << ":" << g->get_is_reverse(p[i]) << " -- " + << g->get_id(p[i+1]) << ":" << g->get_is_reverse(p[i+1]) << endl; +#endif + forgotten_edges.insert(g->edge_handle(p[i], p[i+1])); + } + } + + // forget the root path too (except spur node) + unordered_set forgotten_nodes; + for (int j = 0; j < (int)i - 1; ++j) { +#ifdef debug_vg_algorithms + cerr << "forgetting node " << g->get_id(prev_path[j]) << ":" << g->get_is_reverse(prev_path[j]) << endl; +#endif + forgotten_nodes.insert(prev_path[j]); + // don't allow loop-backs in paths + forgotten_nodes.insert(g->flip(prev_path[j])); + } + + // find our path from the the spur_node to the sink + pair> spur_path_v = widest_dijkstra(g, spur_node, sink, node_weight_callback, edge_weight_callback, + [&](handle_t h) {return forgotten_nodes.count(h);}, + [&](edge_t e) {return forgotten_edges.count(e);}, + greedy_avg); + + if (!spur_path_v.second.empty()) { + + // make the path by combining the root path and the spur path + pair> total_path; + double total_width = numeric_limits::max(); + for (size_t j = 0; j < i; ++j) { + total_path.second.push_back(prev_path[j]); + total_width = min(total_width, node_weight_callback(prev_path[j])); + if (j > 0) { + total_width = min(total_width, edge_weight_callback(g->edge_handle(prev_path[j-1], prev_path[j]))); + } + } + if (!total_path.second.empty()) { + total_width = min(total_width, edge_weight_callback(g->edge_handle(total_path.second.back(), spur_path_v.second.front()))); + } + + // insert the path into our sorted set + total_path.second.insert(total_path.second.end(), spur_path_v.second.begin(), spur_path_v.second.end()); + if (!greedy_avg) { + total_path.first = min(total_width, spur_path_v.first); + } else { + // todo: we can avoid recomputing the part that comes out of dijkstra by passing it out + size_t total_length = 0; + double total_support = 0; + for (size_t avg_i = 0; avg_i < total_path.second.size(); ++avg_i) { + size_t node_len = 
g->get_length(total_path.second[avg_i]);
+                        total_length += node_len;
+                        total_support += node_weight_callback(total_path.second[avg_i]) * (double)node_len;
+                        if (avg_i > 0) {
+                            total_length += 1;
+                            total_support += edge_weight_callback(g->edge_handle(total_path.second[avg_i-1], total_path.second[avg_i]));
+                        }
+                    }
+                    total_path.first = total_length > 0 ? total_support / total_length : 0;
+                }
+                pair, size_t>::iterator, bool> ins = B.insert(make_pair(total_path.second, i));
+                if (ins.second == true) {
+                    score_to_B.insert(make_pair(total_path.first, ins.first));
+                } // todo: is there any reason we'd need to update the score of an existing entry in B?
+
+                // if we're in greedy average mode, it's possible that the score got better. Choose this spur and move on
+                // to save looping through every spur. Could miss good paths, but will speed up search.
+                if (greedy_avg && total_path.first >= best_paths.front().first) {
+                    break;
+                }
+            }
+        }
+
+        if (B.empty()) {
+            break;
+        }
+
+        assert(score_to_B.size() == B.size());
+        multimap, size_t>::iterator>::iterator best_B_it = std::prev(score_to_B.end());
+
+        // the best path gets put into our output list
+        best_paths.push_back(make_pair(best_B_it->first, best_B_it->second->first));
+        best_spurs.push_back(best_B_it->second->second);
+        B.erase(best_B_it->second);
+        score_to_B.erase(best_B_it);
+
+#ifdef debug_vg_algorithms
+        cerr << "adding best path (w=" << best_paths.back().first << "): ";
+        for (auto h : best_paths.back().second) {
+            cerr << g->get_id(h) << ":" << g->get_is_reverse(h) << ", ";
+        }
+        cerr << endl;
+#endif
+
+    }
+
+    return best_paths;
+}
+
+
+}
+}
diff --git a/src/algorithms/k_widest_paths.hpp b/src/algorithms/k_widest_paths.hpp
new file mode 100644
index 00000000000..dea5994f7d1
--- /dev/null
+++ b/src/algorithms/k_widest_paths.hpp
@@ -0,0 +1,45 @@
+#ifndef VG_ALGORITHMS_K_WIDEST_PATHS_HPP_INCLUDED
+#define VG_ALGORITHMS_K_WIDEST_PATHS_HPP_INCLUDED
+
+/**
+ * \file k_widest_paths.hpp
+ *
+ * Yen's algorithm to find the K widest paths.
+ */
+
+#include 
+
+#include "../position.hpp"
+#include "../handle.hpp"
+
+namespace vg {
+namespace algorithms {
+
+/// This Dijkstra is the same underlying algorithm as the one in dijkstra.hpp
+/// but the interface is different enough that I opted to make it a separate
+/// thing rather than add loads of optional arguments. The key differences
+/// are these generalizations:
+/// -- looks for the "widest" path (maximum minimum weight) instead of shortest
+/// -- counts node and edge weights (via callbacks)
+/// -- returns the path as well as the score
+/// -- option for ignoring certain nodes and edges in search (required by Yen's algorithm)
+/// -- greedy_avg option switches the algorithm to a heuristic (no optimal guarantee) search
+///    using the running average support instead of min-flow support as objective function. 
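+///
+/// A minimal call sketch (hedged; "support_of_node" and "support_of_edge" are
+/// illustrative stand-ins for whatever weighting the caller wants to use):
+///
+///     auto result = vg::algorithms::widest_dijkstra(
+///         &graph, source_handle, sink_handle,
+///         [&](handle_t h) { return support_of_node(h); },  // node weight
+///         [&](edge_t e) { return support_of_edge(e); },    // edge weight
+///         [](handle_t) { return false; },                  // ignore no nodes
+///         [](edge_t) { return false; });                   // ignore no edges
+///     // result.first is the width of the best path; result.second is the
+///     // sequence of handles along it (empty if the sink was unreachable).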
+pair> widest_dijkstra(const HandleGraph* g, handle_t source, handle_t sink, + function node_weight_callback, + function edge_weight_callback, + function is_node_ignored_callback, + function is_edge_ignored_callbback, + bool greedy_avg = false); + +/// Find the k widest paths +vector>> yens_k_widest_paths(const HandleGraph* g, handle_t source, handle_t sink, + size_t K, + function node_weight_callback, + function edge_weight_callback, + bool greedy_avg = false); + +} +} + +#endif diff --git a/src/algorithms/kmer.cpp b/src/algorithms/kmer.cpp new file mode 100644 index 00000000000..1318423766b --- /dev/null +++ b/src/algorithms/kmer.cpp @@ -0,0 +1,110 @@ +#include "kmer.hpp" + +namespace vg { + +namespace algorithms { + +void for_each_kmer(const HandleGraph& graph, size_t k, size_t edge_max, + const std::function& lambda) { + graph.for_each_handle([&](const handle_t& h) { + // for the forward and reverse of this handle + // walk k bases from the end, so that any kmer starting on the node will be represented in the tree we build + for (auto handle_is_rev : { false, true }) { + handle_t handle = handle_is_rev ? graph.flip(h) : h; + std::list kmers; + // for each position in the node, set up a kmer with that start position and the node end or kmer length as the end position + // determine next positions + nid_t handle_id = graph.get_id(handle); + size_t handle_length = graph.get_length(handle); + std::string handle_seq = graph.get_sequence(handle); + for (size_t i = 0; i < handle_length; ++i) { + pos_t begin = make_pos_t(handle_id, handle_is_rev, i); + pos_t end = make_pos_t(handle_id, handle_is_rev, std::min(handle_length, i+k)); + kmer_t kmer = kmer_t(handle_seq.substr(offset(begin), offset(end)-offset(begin)), begin, end, handle); + if (kmer.seq.size() < k) { + size_t next_count = 0; + if (edge_max) graph.follow_edges(kmer.curr, false, [&](const handle_t& next) { ++next_count; return next_count <= 1; }); + //kmer.seq.reserve(k); // may reduce allocation costs + // follow edges if we haven't completed the kmer here + if (next_count > 1 && (edge_max && edge_max == kmer.forks)) { + } else { + graph.follow_edges(kmer.curr, false, [&](const handle_t& next) { + kmers.push_back(kmer); + auto& todo = kmers.back(); + todo.curr = next; + if (next_count > 1) { + ++todo.forks; + } + }); + } + } else { + kmers.push_back(kmer); + } + } + + // now expand the kmers until they reach k + while (!kmers.empty()) { + // first we check which ones have reached length k in the current handle; for each of these we run lambda and remove them from our list + auto kmers_end = kmers.end(); + for (std::list::iterator q = kmers.begin(); q != kmers_end; ++q) { + auto& kmer = *q; + // did we reach our target length? + if (kmer.seq.size() == k) { + // now pass the kmer to our callback + lambda(kmer); + q = kmers.erase(q); + } else { + // do we finish in the current node? 
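+                        // Extend the kmer with as many bases of the node we
+                        // just stepped into as we still need to reach length k,
+                        // then either queue up extensions into successor nodes
+                        // (respecting edge_max) or finish on the next sweep.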
+ nid_t curr_id = graph.get_id(kmer.curr); + size_t curr_length = graph.get_length(kmer.curr); + bool curr_is_rev = graph.get_is_reverse(kmer.curr); + std::string curr_seq = graph.get_sequence(kmer.curr); + size_t take = std::min(curr_length, k-kmer.seq.size()); + kmer.end = make_pos_t(curr_id, curr_is_rev, take); + kmer.seq.append(curr_seq.substr(0,take)); + if (kmer.seq.size() < k) { + size_t next_count = 0; + if (edge_max) graph.follow_edges(kmer.curr, false, [&](const handle_t& next) { ++next_count; return next_count <= 1; }); + //kmer.seq.reserve(k); // may reduce allocation costs + // follow edges if we haven't completed the kmer here + if (next_count > 1 && (edge_max && edge_max == kmer.forks)) { + } else { + graph.follow_edges(kmer.curr, false, [&](const handle_t& next) { + kmers.push_back(kmer); + auto& todo = kmers.back(); + todo.curr = next; + if (next_count > 1) { + ++todo.forks; + } + }); + } + // if not, we need to expand through the node then follow on + /* + graph.follow_edges(kmer.curr, false, [&](const handle_t& next) { + kmers.push_back(kmer); + auto& todo = kmers.back(); + todo.curr = next; + }); + */ + q = kmers.erase(q); + } else { + if (kmer.seq.size() > k) { + assert(kmer.seq.size() <= k); + } + } + } + } + } + } + }, true); +} + +std::ostream& operator<<(std::ostream& out, const kmer_t& kmer) { + out << kmer.seq << "\t" + << id(kmer.begin) << ":" << (is_rev(kmer.begin) ? "-":"") << offset(kmer.begin) << "\t"; + return out; +} + +} + +} diff --git a/src/algorithms/kmer.hpp b/src/algorithms/kmer.hpp new file mode 100644 index 00000000000..37c977cc9f7 --- /dev/null +++ b/src/algorithms/kmer.hpp @@ -0,0 +1,46 @@ +#pragma once + +#include +#include +#include +#include +#include +#include "position.hpp" + +/** \file + * Functions for working with `kmers_t`'s in HandleGraphs. + */ + +namespace vg { + +namespace algorithms { + +using namespace handlegraph; + +/// Stores a kmer in the context of a graph. +struct kmer_t { + kmer_t(const std::string& s, + const pos_t& b, + const pos_t& e, + const handle_t& c) + : seq(s), begin(b), end(e), curr(c) { }; + /// the kmer + std::string seq; + /// our start position + pos_t begin; + /// Used in construction + pos_t end; /// one past the (current) end of the kmer + handle_t curr; /// the next handle we extend into + uint16_t forks; /// how many branching edge crossings we took to get here +}; + +/// Iterate over all the kmers in the graph, running lambda on each +void for_each_kmer(const HandleGraph& graph, size_t k, size_t edge_max, + const std::function& lambda); + +/// Print a kmer_t to a stream. +std::ostream& operator<<(std::ostream& out, const kmer_t& kmer); + +} + +} diff --git a/src/algorithms/locally_expand_graph.cpp b/src/algorithms/locally_expand_graph.cpp new file mode 100644 index 00000000000..38c97ee5d1e --- /dev/null +++ b/src/algorithms/locally_expand_graph.cpp @@ -0,0 +1,53 @@ +/** + * \file locally_expand_graph.cpp + * + * Implementation for the locally_expand_graph algorithm. 
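+ *
+ * A minimal usage sketch (hedged; "parent" is any HandleGraph, "seed_id" is a
+ * node ID of interest, and bdsg::HashGraph is just one possible subgraph type):
+ *
+ *     bdsg::HashGraph subgraph;
+ *     subgraph.create_handle(parent.get_sequence(parent.get_handle(seed_id)), seed_id);
+ *     vg::algorithms::locally_expand_graph(parent, subgraph,
+ *                                          subgraph.get_handle(seed_id, false), 1000);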
+ */ + +#include "locally_expand_graph.hpp" +#include + +//#define debug_locally_expand_graph + +namespace vg { +namespace algorithms { + +using namespace structures; + +void locally_expand_graph(const HandleGraph& parent, MutableHandleGraph& subgraph, + handle_t from, int64_t max_dist) { + + RankPairingHeap> queue; + + handle_t start = parent.get_handle(subgraph.get_id(from), subgraph.get_is_reverse(from)); + + queue.push_or_reprioritize(start, -parent.get_length(start)); + while (!queue.empty()) { + handle_t handle; + int64_t dist; + tie(handle, dist) = queue.top(); + queue.pop(); + +#ifdef debug_locally_expand_graph + cerr << "at " << parent.get_id(handle) << " " << parent.get_is_reverse(handle) << ", dist " << dist << endl; +#endif + + int64_t dist_thru = dist + parent.get_length(handle); + if (dist_thru < max_dist) { + parent.follow_edges(handle, false, [&](const handle_t& next) { +#ifdef debug_locally_expand_graph + cerr << "\ttake edge to " << parent.get_id(next) << " " << parent.get_is_reverse(next) << ", dist " << dist_thru << endl; +#endif + queue.push_or_reprioritize(next, dist_thru); + if (!subgraph.has_node(parent.get_id(next))) { + subgraph.create_handle(parent.get_sequence(parent.forward(next)), parent.get_id(next)); + } + subgraph.create_edge(subgraph.get_handle(parent.get_id(handle), parent.get_is_reverse(handle)), + subgraph.get_handle(parent.get_id(next), parent.get_is_reverse(next))); + }); + } + } +} + +} +} diff --git a/src/algorithms/locally_expand_graph.hpp b/src/algorithms/locally_expand_graph.hpp new file mode 100644 index 00000000000..69df284773e --- /dev/null +++ b/src/algorithms/locally_expand_graph.hpp @@ -0,0 +1,24 @@ +#ifndef VG_ALGORITHMS_LOCALLY_EXPAND_GRAPH_HPP_INCLUDED +#define VG_ALGORITHMS_LOCALLY_EXPAND_GRAPH_HPP_INCLUDED + +/** + * \file locally_expand_graph.hpp + * + * Definitions for the locally_expand_graph algorithm. + */ + +#include "../handle.hpp" + +namespace vg { +namespace algorithms { + +/// Add to a subgraph (with same node ID space as parent) by walking forward from a given +/// node and adding all walks up to a maximum distance away. The handle provided graph +/// should be from the subgraph, not the parent graph. +void locally_expand_graph(const HandleGraph& parent, MutableHandleGraph& subgraph, + handle_t from, int64_t max_dist); + +} +} + +#endif diff --git a/src/algorithms/merge.cpp b/src/algorithms/merge.cpp new file mode 100644 index 00000000000..23c449e1e70 --- /dev/null +++ b/src/algorithms/merge.cpp @@ -0,0 +1,239 @@ +/** + * \file merge.cpp + * + * Defines an algorithm to merge parts of handles together. + */ + +#include "merge.hpp" + +#include +#include + +namespace vg { +namespace algorithms { + +using namespace std; + +void merge(handlegraph::MutablePathDeletableHandleGraph* graph, const vector>& start, size_t length) { + +#ifdef debug + cerr << "Merge " << length << " bases on:" << endl; + for(auto& handle_and_offset : start) { + cerr << "\t" << graph->get_id(handle_and_offset.first) + << (graph->get_is_reverse(handle_and_offset.first) ? '-' : '+') + << "@" << handle_and_offset.second << endl; + } +#endif + + if (start.empty()) { + // Nothing to do! + return; + } + + // Split out the desired ranges of all the handles and get the middle bits + vector middles; + middles.reserve(start.size()); + + for (auto& handle_and_offset : start) { + auto& to_split = handle_and_offset.first; + auto& offset = handle_and_offset.second; + + vector divisions; + // By default the part we are interested in will be at index 0. 
+ size_t output_index = 0; + if (offset != 0) { + // We need to break after the start of the node + divisions.push_back(offset); + // Our middle handle will end up at index 1 + output_index = 1; + } + if (offset + length != graph->get_length(to_split)) { + // We need to break before the end of the node + divisions.push_back(offset + length); + } + +#ifdef debug + cerr << "Splitting " << graph->get_id(to_split) << (graph->get_is_reverse(to_split) ? '-' : '+') << " at:"; + for (auto& index : divisions) { + cerr << " " << index; + } + cerr << endl; +#endif + + if (divisions.empty()) { + // Just use the whole node + middles.push_back(to_split); + } else { + // Actually split + auto parts = graph->divide_handle(to_split, divisions); + middles.push_back(parts.at(output_index)); + } + } + +#ifdef debug + cerr << "Got middles:" << endl; + for (auto& part : middles) { + cerr << "\t" << graph->get_id(part) << (graph->get_is_reverse(part) ? '-' : '+') << endl; + } +#endif + + + // Pick one to be the one that survives + handle_t merged = middles.back(); + middles.pop_back(); + +#ifdef debug + cerr << "Chose representative: " << graph->get_id(merged) << (graph->get_is_reverse(merged) ? '-' : '+') << endl; +#endif + + // Define a translator that reroutes edges from other middles to other + // middles to instead point to the final merged middle. + unordered_set other_middles(middles.begin(), middles.end()); + auto translate = [&](const handle_t neighbor) { + if (other_middles.count(neighbor)) { + return merged; + } else if (other_middles.count(graph->flip(neighbor))) { + return graph->flip(merged); + } else { + return neighbor; + } + }; + + // Make sets of neighbors we already have + unordered_set existing_right_neighbors; + unordered_set existing_left_neighbors; + + graph->follow_edges(merged, false, [&](const handle_t& h) { + // Look right and collect neighbors. + // Don't translate here; edges to other middles will be removed, so + // they will still need remaking. + existing_right_neighbors.insert(h); + }); + graph->follow_edges(merged, true, [&](const handle_t& h) { + // Look left and collect neighbors. + // Don't translate here; edges to other middles will be removed, so + // they will still need remaking. + existing_left_neighbors.insert(h); + }); + +#ifdef debug + cerr << "Existing right neighbors: " << endl; + for (auto& neighbor : existing_right_neighbors) { + cerr << "\t" << graph->get_id(neighbor) << (graph->get_is_reverse(neighbor) ? '-' : '+') << endl; + } + cerr << "Existing left neighbors: " << endl; + for (auto& neighbor : existing_left_neighbors) { + cerr << "\t" << graph->get_id(neighbor) << (graph->get_is_reverse(neighbor) ? '-' : '+') << endl; + } +#endif + + + // Create edges from everything attached to the other ones to the first one. + // We collect and then create edges to avoid upsetting iteration. + // We need to deduplicate due to + // https://github.com/vgteam/libbdsg/issues/39 and also because there are + // likely to be duplicates when merging siblings. + unordered_set right_neighbors; + unordered_set left_neighbors; + + for (auto& other : middles) { + // For each node we merge in + +#ifdef debug + cerr << "Other version " << graph->get_id(other) << (graph->get_is_reverse(other) ? 
'-' : '+') << " adds:" << endl; +#endif + + graph->follow_edges(other, false, [&](const handle_t& h) { + // Look right and collect new neighbors + + // Alias other middles to the true middle + auto dest = translate(h); + if (!existing_right_neighbors.count(dest)) { + right_neighbors.insert(dest); +#ifdef debug + cerr << "\tRight neighbor " << graph->get_id(h) << (graph->get_is_reverse(h) ? '-' : '+') + << " = " << graph->get_id(dest) << (graph->get_is_reverse(dest) ? '-' : '+') << endl; +#endif + } + }); + graph->follow_edges(other, true, [&](const handle_t& h) { + // Look left and collect new neighbors + + // Alias other middles to the true middle + auto dest = translate(h); + if (!existing_left_neighbors.count(dest)) { + left_neighbors.insert(dest); +#ifdef debug + cerr << "\tLeft neighbor " << graph->get_id(h) << (graph->get_is_reverse(h) ? '-' : '+') + << " = " << graph->get_id(dest) << (graph->get_is_reverse(dest) ? '-' : '+') << endl; +#endif + } + }); + } + + // Make sure the end-to-start self loop only gets made once. + // If it existed before, it won't be added. + // But if it didn't, we might see ourselves as both a right and a left neighbor. + if (right_neighbors.count(merged) && left_neighbors.count(merged)) { + // Erase from right so we only make an edge based on left. + right_neighbors.erase(merged); + } + + for (auto& h : right_neighbors) { + // Make all the right edges. Should be unique. + graph->create_edge(merged, h); + } + for (auto& h : left_neighbors) { + // Make all the left edges. Should be unique. + graph->create_edge(h, merged); + } + + // Move all the paths over to the first one + // Need to aggregate to avoid removing steps as we visit them. + // Also need to record orientation so we can preserve it + vector> to_move; + for (auto& other : middles) { + graph->for_each_step_on_handle(other, [&](const step_handle_t s) { + // Say we need to rewrite this step, and record an orientation: + // true if the step runs against the orientartion of other, and + // false if it runs with it. + to_move.emplace_back(s, graph->get_is_reverse(other) != graph->get_is_reverse(graph->get_handle_of_step(s))); + }); + } + for (auto& step_and_orientation : to_move) { + // For each thing we are moving, unpack it + auto& step = step_and_orientation.first; + auto& flip = step_and_orientation.second; + // Rewrite the path to go through merged forward if we went through the + // handle we're merging in forward, and merged reverse otherwise. + // Make sure to advance the end of the range because rewrite is end-exclusive (to allow insert). + graph->rewrite_segment(step, graph->get_next_step(step), {flip ? graph->flip(merged) : merged}); + } + + for (auto& other : middles) { + // Delete the other versions of the merged segment. + +#ifdef debug + cerr << "Destroy other version " << graph->get_id(other) << (graph->get_is_reverse(other) ? 
'-' : '+') << " and its edges" << endl; +#endif + + // First we have to delete each edge exactly once + unordered_set to_remove; + graph->follow_edges(other, false, [&](const handle_t& h) { + to_remove.insert(graph->edge_handle(other, h)); + }); + graph->follow_edges(other, true, [&](const handle_t& h) { + to_remove.insert(graph->edge_handle(h, other)); + }); + for (auto& e : to_remove) { + graph->destroy_edge(e); + } + // And then the node itself + graph->destroy_handle(other); + } +} + + +} +} + diff --git a/src/algorithms/merge.hpp b/src/algorithms/merge.hpp new file mode 100644 index 00000000000..1fa6c5cc0f4 --- /dev/null +++ b/src/algorithms/merge.hpp @@ -0,0 +1,24 @@ +#ifndef VG_ALGORITHMS_MERGE_HPP_INCLUDED +#define VG_ALGORITHMS_MERGE_HPP_INCLUDED + + +#include "../handle.hpp" +#include + +#include + +namespace vg { +namespace algorithms { + +using namespace std; + +/** + * Merge the given ranges of bases on the given handles together, rewriting paths. + * Sequences must match. Handles to a single node may occur no more than once. + */ +void merge(handlegraph::MutablePathDeletableHandleGraph* graph, const vector>& start, size_t length); + +} +} + +#endif diff --git a/src/algorithms/min_cut_graph.cpp b/src/algorithms/min_cut_graph.cpp new file mode 100644 index 00000000000..323faff4223 --- /dev/null +++ b/src/algorithms/min_cut_graph.cpp @@ -0,0 +1,453 @@ +/** + * \file min_cut_graph.cpp + * + * Contains implementation of min_cut_graph function + */ + +#include "min_cut_graph.hpp" +#include +#include +#include +#include +#include "../contracting_graph.hpp" +#include +#include +#include +#include +#include + +// #define debug_min_decomp +// #define debug_kargers_min +// #define debug_compute_min + +namespace vg { + namespace algorithms { + + using namespace std; + + pair>, size_t> kargers_min_cut(Graph graph, const int seed) { + + size_t V = graph.get_size(); + minstd_rand0 random_engine(seed); + ContractingGraph cg(graph); + unordered_map cgraph_total_edge_weights; + pair>, size_t> to_return; + + vector node_ids = graph.get_node_ids(); + // check for graph containing a node without an edge + for (auto& id : node_ids){ + if(graph.get_node_by_id(id).edges.size() <=0){ + //return empty container +#ifdef debug_kargers_min + cout << "============================================================================= " << endl; + cout << "Disconnected graph " << endl; + cout << "Node " <> disjoint_sets; + //disjoint sets will just be two sets, each containing one node + //using index starting at 0 for nodes + + + vector vnodes = graph.get_node_ids(); + for(auto& id : vnodes){ +#ifdef debug_kargers_min + + // node id + cout << "node "< supernode0 = {vnodes[1]}; + unordered_set supernode1 = {vnodes[0]}; + disjoint_sets.push_back(supernode0); + disjoint_sets.push_back(supernode1); + + //assumes weights from node 0->node 1 and node 1->node 0 are equal + size_t node_id = vnodes[0]; + size_t weight_of_cut = graph.get_node_by_id(node_id).edges[0].weight; + to_return = make_pair(disjoint_sets, weight_of_cut); + + } + + // get nodes will return the heads of all the nodes + // at first call all nodes will be heads + vector super_nodes = cg.get_nodes(); + //get the total edge weights for super_nodes in contracted graph + + for (int i =0; i < super_nodes.size(); i++){ + //get total edge weights for each super node + unordered_map supernode_edge_weights = cg.get_edges(super_nodes[i]); + +#ifdef debug_kargers_min + cout << "============================================================================= " << 
endl; + cout << "supernode edge weights for node " << super_nodes[i] << endl; +#endif + // tally up the total weights of incident edges + int total_weight = 0; + for(pair element: supernode_edge_weights){ + total_weight += element.second; +#ifdef debug_kargers_min + cout << element.first << ":" <2 nodes + //assumes the graph is connected, acyclic, graph + while(V > 2){ + + // create a vector with the node weights + vector node_weights; + size_t node_num; + for (int i =0; i nodes_distribution(begin(node_weights), end(node_weights)); + + //pick an node proportional to its total weight from incident edges + //and chooses uniformly from duplicates + //discrete distribution returns an index + int random_weight_idx = nodes_distribution(random_engine); + size_t random_node = super_nodes[random_weight_idx]; + + + //get the edge weights of random node + vector rand_ew; + unordered_map rand_node_edges = cg.get_edges(random_node); + + for_each(rand_node_edges.begin(), rand_node_edges.end() , [&](pair element){ + + //push back the weights + rand_ew.push_back(element.second); + }); + + + // create a discrete distrbution with edge weights + discrete_distribution edges_distribution(begin(rand_ew), end(rand_ew)); + + //pick a random edge weight proportional to its value + int other_node; + int random_edge_weight_idx = edges_distribution(random_engine); + + int count = 0; + for (pair element : rand_node_edges){ + //iterate through the unordered_map rand_node:{other_node: edge_weight} + //use the idx to access other node + if (count==random_edge_weight_idx){ + other_node = element.first; + break; + } + count++; + } + + //contract edge between random node and other node + cg.contract(random_node, other_node); + + //get nodes after contraction + super_nodes = cg.get_nodes(); + + //calculate new number of supernodes left in contracted graph + V = super_nodes.size(); + + // will hold most up-to-date contracted graphs weights + //clear it to recalculate and update + cgraph_total_edge_weights.clear(); + + //update contracted graph and total edge weights + for (int i =0; i < super_nodes.size(); i++){ + //get total edge weights for each super node + unordered_map supernode_edge_weights = cg.get_edges(super_nodes[i]); + + // tally up the total weights of incident edges + int total_weight = 0; + for(pair element: supernode_edge_weights){ + total_weight += element.second; + } + + //add total edge weight for each super node + //ex: 2: {1:1, 4:14}, sum up 1+14 and put that in the total edge weight list for node #2 + cgraph_total_edge_weights[super_nodes[i]] = total_weight; + + } +#ifdef debug_kargers_min + cout << "============================================================================= " << endl; + cout << "random node " << random_node << " and other node " << other_node << " have been unioned" << endl; + cout << "number of nodes after union: " << V << endl; + for (int i =0; i element: cgraph_total_edge_weights){ + cout << "node "<> disjoint_vector = cg.get_disjoint_sets(); + #ifdef debug_kargers_min + cout << "============================================================================= " << endl; + cout << "vector"< disjoint_set1(disjoint_vector[0].begin(), disjoint_vector[0].end()); + unordered_set disjoint_set2(disjoint_vector[1].begin(), disjoint_vector[1].end()); + vector> disjoint_sets; + disjoint_sets.push_back(disjoint_set1); + disjoint_sets.push_back(disjoint_set2); + + //compute the min cut of graph which is equal to the total edge weights of two supernodes + size_t weight_of_cut; + for (pair element: 
cgraph_total_edge_weights){ + weight_of_cut = element.second; + } +#ifdef debug_kargers_min + cout << "============================================================================= " << endl; + cout << "Weight of cut " << weight_of_cut<< endl; + cout << "============================================================================= " << endl; + +#endif + to_return = make_pair(disjoint_sets, weight_of_cut); + + } + + } + + + // or send back a pair containing min_cut, disjoint_sets + return to_return; + } + + pair>, size_t> compute_min_cut(Graph graph, const int seed){ + + // compute min-cut twice and choose the min-cut with least total graph weights + //the minimum total edge weight of graph will give us the min-cut + const int seed2 = seed+1; + + //TODO: generate seeds in here or send two seeds + pair>, size_t> to_return; + pair>, size_t> min_cut1 = kargers_min_cut(graph, seed); + pair>, size_t> min_cut2 = kargers_min_cut(graph, seed2); + + if (min_cut1.second == 0 || min_cut2.second == 0 ){ + // if pair is empty pair.first and pair.second will both be initialized to 0 during contruction + //return empty container +#ifdef debug_compute_min + cout << "============================================================================= " << endl; + cout << "RETURNING EMPTY MINCUT" <> disjoint_set = min_cut1.first; + for (size_t i = 0; i < disjoint_set.size(); i++){ + for (auto& x:disjoint_set[i] ) { + cout << "MCG set "<< i << "has" << x <> disjoint_set2 = min_cut2.first; + for (size_t i = 0; i < disjoint_set2.size(); i++){ + for (auto& x:disjoint_set2[i] ) { + cout << "MCG set "<< i << "has" << x <> min_cut_decomposition(Graph graph, const int seed){ + + vector> Gamma; + + const int rand_seed = seed; + + function recurse = [&](Graph graph){ +#ifdef debug_min_decomp + cout << "============================================================================= " << endl; + cout << "MIN-CUT-DECOMPOSITION" <=2 + if(disjoint_sets[0].size() >=2){ + Gamma.push_back(disjoint_sets[0]); + } + if(disjoint_sets[1].size() >=2){ + Gamma.push_back(disjoint_sets[1]); + } + + + + //build the subgraphs from disjoint sets + vector subgraph(2); + vector node_ids = graph.get_node_ids(); + for(size_t h =0; h < disjoint_sets.size(); h++){ + + for(auto& id : node_ids){ + // if node from original graph is in the disjoint set + if (disjoint_sets[h].count(id)==1){ + + size_t node_weight =0; + Node node; + Edge edge; + // check if any edges connect to other nodes in the disjoint set + for(size_t j =0; j < graph.get_node_by_id(id).edges.size(); j++){ + + if (disjoint_sets[h].count(graph.get_node_by_id(id).edges[j].other)==1){ + edge.other = graph.get_node_by_id(id).edges[j].other; + edge.weight = graph.get_node_by_id(id).edges[j].weight; + node_weight += edge.weight; + node.edges.push_back(edge); + + + } + } + //tally the node weight using the edges + node.weight = node_weight; + subgraph[h].add_node(id, node); + + + } + + } + + } + + recurse(subgraph[0]); + + recurse(subgraph[1]); + + + }; + + recurse(graph); +#ifdef debug_min_decomp + for(size_t i = 0; i < Gamma.size(); i++){ + for (auto& x:Gamma[i] ) { + cout << "Gamma " << i <<"has " << x < +#include +#include +#include +#include + + +namespace vg { + namespace algorithms { + + using namespace std; + + struct Edge{ + int other; //node at other end + int weight; + }; + + struct Node{ + int weight; + vector edges; + }; + struct Graph { + private: + unordered_map nodes; + + public: + inline vector get_node_ids(){ + vector node_ids; + for (auto& id_and_node : nodes){ + size_t 
node_id = id_and_node.first; + node_ids.push_back(node_id); + } + return node_ids; + + } + + inline size_t get_size(){ + return nodes.size(); + } + + inline Node& get_node_by_id(size_t node_id){ + Node& node = nodes.at(node_id); + return node; + } + + inline void add_node(size_t id, Node node){ + nodes.emplace(id, node); + } + + // only use this method for unittesting a linear graph with nodes that each contain biderectional edges between nodes + // since the prev node points to current node, and current node points back to it + // we can get the randomly generated edge weight for prev<-current from prev -> other (current) + inline size_t get_weight_using_other(Node prev_node, size_t other){ + size_t to_return; + for(size_t i = 0; i < prev_node.edges.size(); i++){ + if(prev_node.edges[i].other == other){ + to_return = prev_node.edges[i].weight; + } + } + return to_return; + + } + + }; + + pair>, size_t> kargers_min_cut(Graph graph, const int seed); + + pair>, size_t> compute_min_cut(Graph graph, const int seed); + + //Assumption: handles one connected component at a time + //Assumption: all edge weights are > 0 + vector> min_cut_decomposition(Graph graph, const int seed); + + + } +} + +#endif \ No newline at end of file diff --git a/src/algorithms/nearest_offsets_in_paths.cpp b/src/algorithms/nearest_offsets_in_paths.cpp new file mode 100644 index 00000000000..6d0c8bfd1e5 --- /dev/null +++ b/src/algorithms/nearest_offsets_in_paths.cpp @@ -0,0 +1,149 @@ +/** + * \file nearest_offsets_in_paths.cpp + * + * Contains implementation of nearest_offsets_in_paths function + */ + +#include "nearest_offsets_in_paths.hpp" + +//#define debug + +namespace vg { +namespace algorithms { + +using namespace std; + +path_offset_collection_t nearest_offsets_in_paths(const PathPositionHandleGraph* graph, + const pos_t& pos, + int64_t max_search, + const std::function* path_filter) { + + // init the return value + // This is a map from path handle, to vector of offset and orientation pairs + path_offset_collection_t return_val; + + // use greater so that we traverse in ascending order of distance + structures::RankPairingHeap, int64_t, greater> queue; + + // add in the initial traversals in both directions from the start position + // distances are measured to the left side of the node + handle_t start = graph->get_handle(id(pos), is_rev(pos)); + queue.push_or_reprioritize(make_pair(start, false), -offset(pos)); + queue.push_or_reprioritize(make_pair(graph->flip(start), true), offset(pos) - graph->get_length(start)); + + while (!queue.empty()) { + // get the queue that has the next shortest path + auto trav = queue.top(); + queue.pop(); + + // unpack this record + handle_t here = trav.first.first; + bool search_left = trav.first.second; + int64_t dist = trav.second; + +#ifdef debug_algorithms + cerr << "traversing " << graph->get_id(here) << (graph->get_is_reverse(here) ? "-" : "+") + << " in " << (search_left ? 
"leftward" : "rightward") << " direction at distance " << dist << endl; +#endif + + for (const step_handle_t& step : graph->steps_of_handle(here)) { + // For each path visit that occurs on this node +#ifdef debug + cerr << "handle is on step at path offset " << graph->get_position_of_step(step) << endl; +#endif + + path_handle_t path_handle = graph->get_path_handle_of_step(step); + + if (path_filter && !(*path_filter)(path_handle)) { + // We are to ignore this path +#ifdef debug + cerr << "handle is on ignored path " << graph->get_name(path_handle) << endl; +#endif + continue; + } + + // flip the handle back to the orientation it started in + handle_t oriented = search_left ? graph->flip(here) : here; + + // the orientation of the position relative to the forward strand of the path + bool rev_on_path = (oriented != graph->get_handle_of_step(step)); + + // the offset of this step on the forward strand + int64_t path_offset = graph->get_position_of_step(step); + + if (rev_on_path != search_left) { + path_offset += graph->get_length(oriented) + dist; + } + else { + path_offset -= dist; + } + +#ifdef debug + cerr << "after adding search distance and node offset, " << path_offset << " on strand " << rev_on_path << endl; +#endif + + // handle possible under/overflow from the search distance + path_offset = max(min(path_offset, graph->get_path_length(path_handle)), 0); + + // add in the search distance and add the result to the output + return_val[path_handle].emplace_back(path_offset, rev_on_path); + } + + if (!return_val.empty()) { + // we found the closest, we're done + break; + } + + int64_t dist_thru = dist + graph->get_length(here); + + if (dist_thru <= max_search) { + + // we can cross the node within our budget of search distance, enqueue + // the next nodes in the search direction + graph->follow_edges(here, false, [&](const handle_t& next) { + +#ifdef debug_algorithms + cerr << "\tfollowing edge to " << graph->get_id(next) << (graph->get_is_reverse(next) ? "-" : "+") + << " at dist " << dist_thru << endl; +#endif + + queue.push_or_reprioritize(make_pair(next, search_left), dist_thru); + }); + } + } + + return return_val; +} + +map>> offsets_in_paths(const PathPositionHandleGraph* graph, const pos_t& pos) { + auto offsets = nearest_offsets_in_paths(graph, pos, -1); + map>> named_offsets; + for (pair>>& offset : offsets) { + named_offsets[graph->get_path_name(offset.first)] = move(offset.second); + } + return named_offsets; +} + +path_offset_collection_t simple_offsets_in_paths(const PathPositionHandleGraph* graph, pos_t pos) { + path_offset_collection_t positions; + handle_t handle = graph->get_handle(id(pos), is_rev(pos)); + size_t handle_length = graph->get_length(handle); + for (const step_handle_t& step : graph->steps_of_handle(handle)) { + // the orientation of the position relative to the forward strand of the path + bool rev_path = graph->get_is_reverse(graph->get_handle_of_step(step)); + // the offset of this step on the forward strand + int64_t path_offset = graph->get_position_of_step(step); + auto& pos_in_path = positions[graph->get_path_handle_of_step(step)]; + // Make sure to interpret the pos_t offset on the correct strand. + size_t node_forward_strand_offset = is_rev(pos) ? (handle_length - offset(pos) - 1) : offset(pos); + // Normalize to a forward strand offset. + size_t off = path_offset + (rev_path ? 
+ (handle_length - node_forward_strand_offset - 1) : + node_forward_strand_offset); + pos_in_path.push_back(make_pair(off, rev_path)); + } + return positions; +} + +} +} diff --git a/src/algorithms/nearest_offsets_in_paths.hpp b/src/algorithms/nearest_offsets_in_paths.hpp new file mode 100644 index 00000000000..787b7dfa4b9 --- /dev/null +++ b/src/algorithms/nearest_offsets_in_paths.hpp @@ -0,0 +1,50 @@ +#ifndef VG_ALGORITHMS_FIND_CLOSEST_WITH_PATHS_HPP_INCLUDED +#define VG_ALGORITHMS_FIND_CLOSEST_WITH_PATHS_HPP_INCLUDED + +/** + * \file nearest_offsets_in_paths.hpp + * + * Defines algorithm for finding the nearest offset along a path of the + * the closest position in the graph that overlaps a path + */ + +#include "../handle.hpp" + +#include +#include +#include +#include + +#include "structures/rank_pairing_heap.hpp" + +namespace vg { +namespace algorithms { + +using namespace std; + +/// Represents a set of positions and orientations, along a collection of paths. +/// Positions and orientations may or may not be stored sorted. +using path_offset_collection_t = unordered_map>>; + +/// Return, for the nearest position in a path to the given position, +/// subject to the given max search distance, a mapping from path name to +/// all positions on each path where that pos_t occurs. +/// Stops search when path(s) are ancountered. +/// +/// If path_filter is set, ignores paths for which it returns false. +path_offset_collection_t nearest_offsets_in_paths(const PathPositionHandleGraph* graph, + const pos_t& pos, int64_t max_search, + const std::function* path_filter = nullptr); + +/// Wrapper for the above to support some earlier code. Only looks for paths +/// that directly touch the position, and returns the paths by name. +map>> offsets_in_paths(const PathPositionHandleGraph* graph, const pos_t& pos); + +/// A "simple" model for path position getting for debugging +path_offset_collection_t simple_offsets_in_paths(const PathPositionHandleGraph* graph, pos_t pos); + + +} +} + +#endif diff --git a/src/algorithms/next_pos_chars.cpp b/src/algorithms/next_pos_chars.cpp new file mode 100644 index 00000000000..2fea4272ce9 --- /dev/null +++ b/src/algorithms/next_pos_chars.cpp @@ -0,0 +1,24 @@ +#include "next_pos_chars.hpp" + + +namespace vg { +namespace algorithms { + +map next_pos_chars(const PathPositionHandleGraph& graph, pos_t pos) { + map nexts; + handle_t handle = graph.get_handle(id(pos), is_rev(pos)); + if (offset(pos) < graph.get_length(handle)-1) { + ++get_offset(pos); + char c = graph.get_base(handle, offset(pos)); + nexts[pos] = c; + } else { + graph.follow_edges(handle, false, [&](const handle_t& next) { + char c = graph.get_base(next, 0); + nexts[make_pos_t(graph.get_id(next), graph.get_is_reverse(next), 0)] = c; + }); + } + return nexts; +} + +} +} diff --git a/src/algorithms/next_pos_chars.hpp b/src/algorithms/next_pos_chars.hpp new file mode 100644 index 00000000000..9f1cf689e23 --- /dev/null +++ b/src/algorithms/next_pos_chars.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include "../handle.hpp" +#include +#include +#include "../position.hpp" + +namespace vg { +namespace algorithms { + +using namespace std; + +map next_pos_chars(const PathPositionHandleGraph& graph, pos_t pos); + +} + +} diff --git a/src/algorithms/normalize.cpp b/src/algorithms/normalize.cpp new file mode 100644 index 00000000000..6f7d9558a92 --- /dev/null +++ b/src/algorithms/normalize.cpp @@ -0,0 +1,57 @@ +/** + * \file normalize.cpp + * + * Defines an algorithm to normalize a graph. 
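+ *
+ * Illustrative usage sketch (not part of this change; `graph` stands for any
+ * MutablePathDeletableHandleGraph already in hand, and `merge_ok_ids` is a
+ * hypothetical set of node IDs the caller is willing to merge):
+ *
+ *     // up to 10 unchop/simplify iterations, with progress printed to stderr
+ *     algorithms::normalize(&graph, 10, true);
+ *
+ *     // only merge sibling nodes that are both on the approved list
+ *     algorithms::normalize(&graph, 10, false, [&](const handle_t& a, const handle_t& b) {
+ *         return merge_ok_ids.count(graph.get_id(a)) && merge_ok_ids.count(graph.get_id(b));
+ *     });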
+ */ + +#include "normalize.hpp" +#include "simplify_siblings.hpp" + +#include +#include +#include +#include +#include + +namespace vg { +namespace algorithms { + +using namespace std; + +void normalize(handlegraph::MutablePathDeletableHandleGraph* graph, int max_iter, bool debug, + function can_merge) { + + size_t last_len = 0; + if (max_iter > 1) { + last_len = graph->get_total_length(); + } + int iter = 0; + do { + // Ignore doubly reversing edges; that's not really a coherent concept + // for all handle graphs, or an obstacle to normality. + + // combine diced/chopped nodes (subpaths with no branching) + handlealgs::unchop(*graph); + // Resolve forks that shouldn't be + simplify_siblings(graph, can_merge); + + if (max_iter > 1) { + size_t curr_len = graph->get_total_length(); + if (debug) cerr << "[algorithms::normalize] iteration " << iter+1 << " current length " << curr_len << endl; + if (curr_len == last_len) break; + last_len = curr_len; + } + } while (++iter < max_iter); + if (max_iter > 1) { + if (debug) cerr << "[algorithms::normalize] normalized in " << iter << " steps" << endl; + } + + // there may now be some cut nodes that can be simplified + // This won't change the length. + handlealgs::unchop(*graph); +} + + +} +} + diff --git a/src/algorithms/normalize.hpp b/src/algorithms/normalize.hpp new file mode 100644 index 00000000000..f03dd5532e7 --- /dev/null +++ b/src/algorithms/normalize.hpp @@ -0,0 +1,29 @@ +#ifndef VG_ALGORITHMS_NORMALIZE_HPP_INCLUDED +#define VG_ALGORITHMS_NORMALIZE_HPP_INCLUDED + + +#include "../handle.hpp" +#include + +#include + +namespace vg { +namespace algorithms { + +using namespace std; + +/** + * Normalize a graph, performing up to the given number of iterations. + * Simplifies siblings and unchops runs of nodes, in a loop. 
+ * + * if "can_merge" specified, it must return true in order for a pair of nodes to get merged + */ +void normalize(handlegraph::MutablePathDeletableHandleGraph* graph, int max_iter = 1, + bool debug = false, + function can_merge = nullptr); + + +} +} + +#endif diff --git a/src/algorithms/path_string.cpp b/src/algorithms/path_string.cpp new file mode 100644 index 00000000000..2bf8aeee304 --- /dev/null +++ b/src/algorithms/path_string.cpp @@ -0,0 +1,42 @@ +#include "path_string.hpp" + +namespace vg { +namespace algorithms { + +using namespace std; +using namespace vg::io; + +void append_mapping_sequence(const Mapping& m, const string& node_seq, string& seq) { + size_t t = 0; + size_t f = m.position().offset(); + for (size_t i = 0; i < m.edit_size(); ++i) { + auto& e = m.edit(i); + if (edit_is_match(e)) { + seq.append(node_seq.substr(f, e.from_length())); + } else if (edit_is_sub(e)) { + seq.append(e.sequence()); + } else if (edit_is_insertion(e)) { + seq.append(e.sequence()); + } else if (edit_is_deletion(e)) { + // no-op + } + t += e.to_length(); + f += e.from_length(); + } +} + +string path_string(const HandleGraph& graph, const Path& path) { + string seq; + for (int i = 0; i < path.mapping_size(); ++i) { + auto& m = path.mapping(i); + append_mapping_sequence(m, + graph.get_sequence( + graph.get_handle(m.position().node_id(), + m.position().is_reverse())), + seq); + } + return seq; +} + +} +} diff --git a/src/algorithms/path_string.hpp b/src/algorithms/path_string.hpp new file mode 100644 index 00000000000..7e676c530d5 --- /dev/null +++ b/src/algorithms/path_string.hpp @@ -0,0 +1,21 @@ +#pragma once + +#include "../handle.hpp" +#include "vg/io/edit.hpp" +#include +#include +#include + +namespace vg { +namespace algorithms { + +using namespace std; + +/// use the given oriented node sequence and the mapping to reconstruct the sequence represented by the mapping +void append_mapping_sequence(const Mapping& m, const string& node_seq, string& seq); + +/// use the given graph and the path to determine our path string +std::string path_string(const HandleGraph& graph, const Path& path); + +} +} diff --git a/src/algorithms/prune.cpp b/src/algorithms/prune.cpp new file mode 100644 index 00000000000..b0601eda226 --- /dev/null +++ b/src/algorithms/prune.cpp @@ -0,0 +1,233 @@ +#include "algorithms/prune.hpp" +#include "hash_map.hpp" +#include "position.hpp" +#include "source_sink_overlay.hpp" + +#include + +namespace vg { +namespace algorithms { + +/// Record a <=k-length walk in the context of a graph. +struct walk_t { + walk_t(uint16_t l, + const pos_t& b, + const pos_t& e, + const handle_t& c, + uint16_t f) + : length(l), begin(b), end(e), curr(c), forks(f) { }; + /// our start position + pos_t begin; + pos_t end; /// one past the (current) end of the kmer + handle_t curr; /// the next handle we extend into + uint16_t forks; /// how many branching edge crossings we took to get here + uint16_t length; /// how far we've been +}; + +constexpr size_t PRUNE_THREAD_BUFFER_SIZE = 1024 * 1024; + +pair_hash_set find_edges_to_prune(const HandleGraph& graph, size_t k, size_t edge_max) { + + // Each thread collects edges to be deleted into a separate buffer. When the buffer grows + // large enough, flush it into a shared hash set. 
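+    // The flush itself runs inside an OpenMP critical section, so the shared set is
+    // never touched by two threads at once; PRUNE_THREAD_BUFFER_SIZE just bounds how
+    // much each thread batches up before paying that synchronization cost.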
+ pair_hash_set result; + auto flush_buffer = [&result](pair_hash_set& buffer) { +#pragma omp critical (prune_flush) + { + for (const edge_t& edge : buffer) { + result.insert(edge); + } + } + buffer.clear(); + }; + + // for each position on the forward and reverse of the graph + std::vector> buffers(get_thread_count()); + graph.for_each_handle([&](const handle_t& h) { + // for the forward and reverse of this handle + // walk k bases from the end, so that any kmer starting on the node will be represented in the tree we build + for (auto handle_is_rev : { false, true }) { + handle_t handle = handle_is_rev ? graph.flip(h) : h; + std::stack walks; + // for each position in the node, set up a kmer with that start position and the node end or kmer length as the end position + // determine next positions + id_t handle_id = graph.get_id(handle); + size_t handle_length = graph.get_length(handle); + for (size_t i = 0; i < handle_length; i++) { + pos_t begin = make_pos_t(handle_id, handle_is_rev, handle_length); + pos_t end = make_pos_t(handle_id, handle_is_rev, std::min(handle_length, i + k)); + // We are only interested in walks that did not reach length k in the initial node. + if (offset(end) - offset(begin) < k) { + size_t outdegree = graph.get_degree(handle, false); + graph.follow_edges(handle, false, [&](const handle_t& next) { + if (outdegree > 1 && edge_max == 0) { // our next step takes us over the max + int tid = omp_get_thread_num(); + buffers[tid].insert(graph.edge_handle(handle, next)); + if (buffers[tid].size() >= PRUNE_THREAD_BUFFER_SIZE) { + flush_buffer(buffers[tid]); + } + } else { + walk_t walk(offset(end) - offset(begin), begin, end, next, 0); + if (outdegree > 1) { + walk.forks++; + } + walks.push(walk); + } + }); + } + } + + // Now expand the kmers until they reach k. + while (!walks.empty()) { + walk_t walk = walks.top(); + walks.pop(); + // Did we reach our target length? + if (walk.length >= k) { + continue; + } + id_t curr_id = graph.get_id(walk.curr); + size_t curr_length = graph.get_length(walk.curr); + bool curr_is_rev = graph.get_is_reverse(walk.curr); + size_t take = min(curr_length, k - walk.length); + walk.end = make_pos_t(curr_id, curr_is_rev, take); + walk.length += take; + // Do we need to continue to the successor nodes? + if (walk.length < k) { + size_t outdegree = graph.get_degree(walk.curr, false); + graph.follow_edges(walk.curr, false, [&](const handle_t& next) { + if (outdegree > 1 && edge_max == walk.forks) { // our next step takes us over the max + int tid = omp_get_thread_num(); + buffers[tid].insert(graph.edge_handle(walk.curr, next)); + if (buffers[tid].size() >= PRUNE_THREAD_BUFFER_SIZE) { + flush_buffer(buffers[tid]); + } + } else { + walk_t next_walk = walk; + next_walk.curr = next; + if (outdegree > 1) { + next_walk.forks++; + } + walks.push(next_walk); + } + }); + } + } + } + }, true); + + // Flush the buffers and return the result. 
+ for (pair_hash_set& buffer : buffers) { + flush_buffer(buffer); + } + return result; +} + +size_t prune_complex(DeletableHandleGraph& graph, + int path_length, int edge_max) { + + auto edges_to_destroy = find_edges_to_prune(graph, path_length, edge_max); + for (auto& edge : edges_to_destroy) { + graph.destroy_edge(edge); + } + return edges_to_destroy.size(); +} + +size_t prune_complex_with_head_tail(DeletableHandleGraph& graph, + int path_length, int edge_max) { + + SourceSinkOverlay source_sink_graph(&graph, path_length); + + auto edges_to_destroy = find_edges_to_prune(source_sink_graph, + path_length, + edge_max); + + for (auto& edge : edges_to_destroy) { + auto ss_handle_1 = source_sink_graph.forward(edge.first); + auto ss_handle_2 = source_sink_graph.forward(edge.second); + if (ss_handle_1 != source_sink_graph.get_source_handle() + && ss_handle_1 != source_sink_graph.get_sink_handle() + && ss_handle_2 != source_sink_graph.get_source_handle() + && ss_handle_2 != source_sink_graph.get_sink_handle()) { + // this is not an edge involving the artificial source/sink nodes + graph.destroy_edge(source_sink_graph.get_underlying_handle(edge.first), + source_sink_graph.get_underlying_handle(edge.second)); + + } + } + return edges_to_destroy.size(); +} + +size_t prune_short_subgraphs(DeletableHandleGraph& graph, int min_size) { + + unordered_set to_destroy; + + // DFS from all tips + for (auto tip : handlealgs::find_tips(&graph)) { + //cerr << "begin trav from " << graph.get_id(tip) << " " << graph.get_is_reverse(tip) << endl; + auto start = graph.forward(tip); + if (to_destroy.count(start)) { + // we already found this subgraph from another tip + //cerr << "skipping" << endl; + continue; + } + vector stack(1, start); + unordered_set seen{start}; + int size_seen = 0; + // stop when we've seen a large enough subgraph + while (!stack.empty() && size_seen < min_size) { + handle_t handle = stack.back(); + stack.pop_back(); + size_seen += graph.get_length(handle); + //cerr << "destack " << graph.get_id(handle) << ", update size seen to " << size_seen << endl; + for (bool go_left : {true, false}) { + graph.follow_edges(handle, go_left, [&](const handle_t& next) { + handle_t fwd_next = graph.forward(next); + if (!seen.count(fwd_next)) { + //cerr << "stack up " << graph.get_id(fwd_next) << ", update size seen to " << size_seen << endl; + stack.push_back(fwd_next); + seen.insert(fwd_next); + } + }); + } + } + if (size_seen < min_size) { + //cerr << "component is small enough to destroy" << endl; + // this component is under the size limit, mark them for destruction + for (auto handle : seen) { + to_destroy.insert(handle); + } + } + } + + // destroy all handles that we marked + for (auto handle : to_destroy) { + graph.destroy_handle(handle); + } + + return to_destroy.size(); +} + + +size_t remove_high_degree_nodes(DeletableHandleGraph& g, int max_degree) { + vector to_remove; + g.for_each_handle([&](const handle_t& h) { + int edge_count = 0; + g.follow_edges(h, false, [&](const handle_t& ignored) { + ++edge_count; + }); + g.follow_edges(h, true, [&](const handle_t& ignored) { + ++edge_count; + }); + if (edge_count > max_degree) { + to_remove.push_back(h); + } + }); + // now destroy the high degree nodes + for (auto& h : to_remove) { + g.destroy_handle(h); + } + return to_remove.size(); +} + +} +} diff --git a/src/algorithms/prune.hpp b/src/algorithms/prune.hpp new file mode 100644 index 00000000000..18268342317 --- /dev/null +++ b/src/algorithms/prune.hpp @@ -0,0 +1,31 @@ +#ifndef 
VG_ALGORITHMS_PRUNE_HPP_INCLUDED +#define VG_ALGORITHMS_PRUNE_HPP_INCLUDED + +#include "../handle.hpp" + +namespace vg { +namespace algorithms { + +/// Take all nodes that would introduce paths of > edge_max edge crossings, remove them, and link their neighbors to +/// head_node or tail_node depending on which direction the path extension was stopped. +/// For pruning graph prior to indexing with gcsa2. Returns the number of edges removed. +size_t prune_complex(DeletableHandleGraph& graph, + int path_length, int edge_max); + +/// Wrap the graph with heads and tails (for GCSA2 indexing) and then prune as with +/// prune_complex. Returns the number of edges removed. +size_t prune_complex_with_head_tail(DeletableHandleGraph& graph, + int path_length, int edge_max); + +/// Remove any weakly connected components that have total sequence +/// length under the minimum size. Returns the number of nodes removed. +size_t prune_short_subgraphs(DeletableHandleGraph& graph, int min_size); + +/// Remove nodes with >= max_degree total edges on each side. Note that +/// end-to-start self loops count twice. Returns the number of nodes removed. +size_t remove_high_degree_nodes(DeletableHandleGraph& graph, int max_degree); + +} +} + +#endif diff --git a/src/algorithms/prune_to_connecting_graph.cpp b/src/algorithms/prune_to_connecting_graph.cpp new file mode 100644 index 00000000000..6e3253bd977 --- /dev/null +++ b/src/algorithms/prune_to_connecting_graph.cpp @@ -0,0 +1,71 @@ +#include "algorithms/prune_to_connecting_graph.hpp" + +#include +#include + +//#define debug + +namespace vg { +namespace algorithms { + +void prune_to_connecting_graph(DeletableHandleGraph& graph, + const handle_t& from, const handle_t& to) { + + // we use these to remember which hanndles were reached + unordered_set forward, backward; + + // do BFS from both positions + for (bool fwd : {true, false}) { + + auto& reached = fwd ? forward : backward; + handle_t start = fwd ? from : to; + + queue bfs_queue; + reached.emplace(start); + bfs_queue.emplace(start); + + while (!bfs_queue.empty()) { + + handle_t here = bfs_queue.front(); + bfs_queue.pop(); + +#ifdef debug + cerr << "BFS in direction fwd? 
" << fwd << " at " << graph.get_id(here) << " " << graph.get_is_reverse(here) << endl; +#endif + + graph.follow_edges(here, !fwd, [&](const handle_t& next) { +#ifdef debug + cerr << "follow edge to " << graph.get_id(next) << " " << graph.get_is_reverse(next) << endl; +#endif + if (!reached.count(next)) { + reached.emplace(next); + bfs_queue.emplace(next); +#ifdef debug + cerr << "add to queue" << endl; +#endif + } + }); + } + } + + // check that each node is on some path between the two handles + vector to_delete; + graph.for_each_handle([&](const handle_t& handle) { + auto flipped = graph.flip(handle); + if (!((forward.count(handle) && backward.count(handle)) + || (forward.count(flipped) && backward.count(flipped)))) { +#ifdef debug + cerr << "mark " << graph.get_id(handle) << " for deletion" << endl; +#endif + to_delete.push_back(handle); + } + }); + + // delete the handles that failed the test + for (auto handle : to_delete) { + graph.destroy_handle(handle); + } +} + +} +} diff --git a/src/algorithms/prune_to_connecting_graph.hpp b/src/algorithms/prune_to_connecting_graph.hpp new file mode 100644 index 00000000000..328f7593370 --- /dev/null +++ b/src/algorithms/prune_to_connecting_graph.hpp @@ -0,0 +1,15 @@ +#ifndef VG_ALGORITHMS_PRUNE_TO_CONNECTING_GRAPH_HPP_INCLUDED +#define VG_ALGORITHMS_PRUNE_TO_CONNECTING_GRAPH_HPP_INCLUDED + +#include "../handle.hpp" + +namespace vg { +namespace algorithms { + +/// Remove all parts of the graph that are not on some path between the two handles +void prune_to_connecting_graph(DeletableHandleGraph& graph, + const handle_t& from, const handle_t& to); +} +} + +#endif diff --git a/src/algorithms/ref_path_distance.cpp b/src/algorithms/ref_path_distance.cpp new file mode 100644 index 00000000000..0d7a91c6c6a --- /dev/null +++ b/src/algorithms/ref_path_distance.cpp @@ -0,0 +1,158 @@ +/** \file + * Implements the reference path distance function + */ + +#include "ref_path_distance.hpp" + +//#define debug_ref_path_distance + +namespace vg { +namespace algorithms { + +using namespace std; + +int64_t ref_path_distance(const PathPositionHandleGraph* graph, const pos_t& pos_1, const pos_t& pos_2, + const unordered_set& ref_paths, int64_t max_search_dist) { + +#ifdef debug_ref_path_distance + cerr << "[ref_path_distance] measuring approx reference dist from " << pos_1 << " to " << pos_2 << endl; +#endif + + // to record the nearest position on the strands of paths for each of the + // two positions + unordered_map, int64_t> nearby_paths_1, nearby_paths_2; + + // dijkstra priority queue, records of (node, from pos_1) + structures::RankPairingHeap, int64_t, greater> queue; + + // intialize in both directions from both positions + handle_t handle_1 = graph->get_handle(id(pos_1), is_rev(pos_1)); + handle_t handle_2 = graph->get_handle(id(pos_2), is_rev(pos_2)); + + // we only walk forward (toward each other) so that we know the positions can reach each other + queue.push_or_reprioritize(make_tuple(handle_1, true), -offset(pos_1)); + queue.push_or_reprioritize(make_tuple(handle_2, false), + offset(pos_2) - graph->get_length(handle_2)); + + vector> shared_refs; + while (!queue.empty() && shared_refs.empty()) { + + auto top = queue.top(); + handle_t handle; + bool from_pos_1; + tie(handle, from_pos_1) = top.first; + queue.pop(); + +#ifdef debug_ref_path_distance + cerr << "[ref_path_distance] dequeue " << graph->get_id(handle) << (graph->get_is_reverse(handle) ? "-" : "+") << ", left? " << !from_pos_1 << " from pos " << (from_pos_1 ? 
1 : 2) << endl; +#endif + + decltype(nearby_paths_1)* nearby_paths; + decltype(nearby_paths_1)* other_nearby_paths; + if (from_pos_1) { + nearby_paths = &nearby_paths_1; + other_nearby_paths = &nearby_paths_2; + } + else { + nearby_paths = &nearby_paths_2; + other_nearby_paths = &nearby_paths_1; + } + + graph->for_each_step_on_handle(handle, [&](const step_handle_t& step) { + pair oriented_path(graph->get_path_handle_of_step(step), + graph->get_handle_of_step(step) != handle); +#ifdef debug_ref_path_distance + cerr << "[ref_path_distance] on path " << graph->get_path_name(oriented_path.first) << ", on rev? " << oriented_path.second << ", step offset " << graph->get_position_of_step(step) << endl; +#endif + if (!nearby_paths->count(oriented_path)) { + + int64_t path_offset; + if (oriented_path.second && !from_pos_1) { + // traverse left, reverse strand of path + path_offset = (graph->get_path_length(graph->get_path_handle_of_step(step)) + - graph->get_position_of_step(step)); + } + else if (oriented_path.second) { + // traverse right, reverse strand of path + path_offset = (graph->get_path_length(graph->get_path_handle_of_step(step)) + - graph->get_position_of_step(step) - graph->get_length(handle)); + } + else if (!from_pos_1) { + // traverse left, forward strand of path + path_offset = graph->get_position_of_step(step) + graph->get_length(handle); + } + else { + // traverse right, forward strand of path + path_offset = graph->get_position_of_step(step); + } + + // we include the offset on the node if we actually started on the path + if (from_pos_1 && handle == handle_1) { + path_offset += offset(pos_1); + } + else if (!from_pos_1 && handle == handle_2) { + path_offset -= graph->get_length(handle) - offset(pos_2); + } + +#ifdef debug_ref_path_distance + cerr << "[ref_path_distance] first encounter, recording path offset of " << path_offset << endl; +#endif + + (*nearby_paths)[oriented_path] = path_offset; + if (ref_paths.count(oriented_path.first) && other_nearby_paths->count(oriented_path)) { + shared_refs.emplace_back(oriented_path); + } + } + }); + + // only queue up the next if we're still within the max distance and haven't found a shared + // reference path + if (shared_refs.empty()) { + int64_t dist_thru = top.second + graph->get_length(handle); + if (dist_thru <= max_search_dist) { + graph->follow_edges(handle, !from_pos_1, [&](const handle_t& next) { + queue.push_or_reprioritize(make_tuple(next, from_pos_1), dist_thru); + }); + } + } + } + + int64_t approx_ref_dist = numeric_limits::max(); + if (!shared_refs.empty()) { + // we found a labeled reference, measure distance using that + for (const auto& ref : shared_refs) { + int64_t dist = nearby_paths_2[ref] - nearby_paths_1[ref]; +#ifdef debug_ref_path_distance + cerr << "[ref_path_distance] distance on pre-labeled reference path " << graph->get_path_name(ref.first) << " is " << dist << " from interval " << nearby_paths_1[ref] << ":" << nearby_paths_2[ref] << endl; +#endif + if (approx_ref_dist == numeric_limits::max() || dist > approx_ref_dist) { + approx_ref_dist = dist; + } + } + } + else { + // try among the non-reference paths since we didn't find a reference + for (const auto& path_record_1 : nearby_paths_1) { + auto it = nearby_paths_2.find(path_record_1.first); + if (it != nearby_paths_2.end()) { + int64_t dist = it->second - path_record_1.second; + +#ifdef debug_ref_path_distance + cerr << "[ref_path_distance] distance on non-reference path " << graph->get_path_name(path_record_1.first.first) << " is " << dist << " from 
interval " << path_record_1.second << ":" << it->second << endl; +#endif + if (approx_ref_dist == numeric_limits::max() || dist > approx_ref_dist) { + approx_ref_dist = dist; + } + } + } + } + +#ifdef debug_ref_path_distance + cerr << "[ref_path_distance] approximate ref distance is " << approx_ref_dist << endl; +#endif + + return approx_ref_dist; +} + +} +} diff --git a/src/algorithms/ref_path_distance.hpp b/src/algorithms/ref_path_distance.hpp new file mode 100644 index 00000000000..f204dae484a --- /dev/null +++ b/src/algorithms/ref_path_distance.hpp @@ -0,0 +1,29 @@ +/** \file + * Measures the distance between two graph positions along the reference path + * (approximated by the longest connecting path) + */ + +#ifndef VG_ALGORITHMS_REF_PATH_DISTANCE_HPP_INCLUDED +#define VG_ALGORITHMS_REF_PATH_DISTANCE_HPP_INCLUDED + +#include + +#include "handle.hpp" +#include "position.hpp" + +namespace vg { +namespace algorithms { + +using namespace std; + +/// Search the local region around two positions and return the longest distance between +/// them along any paths found during this search. Returns numeric_limits::max() +/// if no shared path is found. +int64_t ref_path_distance(const PathPositionHandleGraph* graph, const pos_t& pos_1, const pos_t& pos_2, + const unordered_set& ref_paths, int64_t max_search_dist); + +} + +} + +#endif // VG_ALGORITHMS_REF_PATH_DISTANCE_HPP_INCLUDED diff --git a/src/algorithms/remove_high_degree.cpp b/src/algorithms/remove_high_degree.cpp deleted file mode 100644 index 08f677bc100..00000000000 --- a/src/algorithms/remove_high_degree.cpp +++ /dev/null @@ -1,29 +0,0 @@ -#include "remove_high_degree.hpp" - -namespace vg { -namespace algorithms { - -using namespace std; - -void remove_high_degree_nodes(MutableHandleGraph& g, int max_degree) { - vector to_remove; - g.for_each_handle([&](const handle_t& h) { - int edge_count = 0; - g.follow_edges(h, false, [&](const handle_t& ignored) { - ++edge_count; - }); - g.follow_edges(h, true, [&](const handle_t& ignored) { - ++edge_count; - }); - if (edge_count > max_degree) { - to_remove.push_back(h); - } - }); - // now destroy the high degree nodes - for (auto& h : to_remove) { - g.destroy_handle(h); - } -} - -} -} diff --git a/src/algorithms/remove_high_degree.hpp b/src/algorithms/remove_high_degree.hpp deleted file mode 100644 index eb0b3fe991a..00000000000 --- a/src/algorithms/remove_high_degree.hpp +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef VG_ALGORITHMS_REMOVE_HIGH_DEGREE_HPP_INCLUDED -#define VG_ALGORITHMS_REMOVE_HIGH_DEGREE_HPP_INCLUDED - -/** - * \file remove_high_degree.hpp - * - * Defines a process that removes high-degree nodes from a graph - */ - -#include "../vg.pb.h" -#include "../handle.hpp" -#include - -namespace vg { -namespace algorithms { - -using namespace std; - -void remove_high_degree_nodes(MutableHandleGraph& g, int max_degree); - -} -} - -#endif diff --git a/src/algorithms/shortest_cycle.cpp b/src/algorithms/shortest_cycle.cpp new file mode 100644 index 00000000000..63f7651c894 --- /dev/null +++ b/src/algorithms/shortest_cycle.cpp @@ -0,0 +1,203 @@ +#include "shortest_cycle.hpp" + +namespace vg { +namespace algorithms { + + /// An implementation of Bellman-Ford with Yen's ordering improvement applied + /// to a layout ideally has a small feedback arc set + size_t bellman_ford_shortest_cycle_length(const HandleGraph* graph, + const handle_t& source, + const vector& layout, + const unordered_map& handle_index, + const vector>& feedback_edges) { + + // init a dynamic programming structure + vector 
dp_length(layout.size(), numeric_limits::max()); + + // base case + size_t source_idx = handle_index.at(source); + dp_length[source_idx] = 0; + + size_t cycle_length = numeric_limits::max(); + + // use dynamic programming over an implicitly dagified graph + bool any_changed = true; + for (int64_t i = 0; i < feedback_edges.size() + 1 && any_changed; i++) { + any_changed = false; + // iterate over forward edges + for (const handle_t& handle : layout) { + size_t idx_from = handle_index.at(handle); + if (dp_length[idx_from] == numeric_limits::max()) { + // this node hasn't been reached yet (this saves us checking for overflow) + continue; + } + size_t dist_thru = dp_length[idx_from] + graph->get_length(handle); + graph->follow_edges(handle, false, [&](const handle_t& next) { + size_t idx_to = handle_index.at(next); + if (idx_from < idx_to) { + if (idx_to == source_idx) { + if (dist_thru < cycle_length) { + cycle_length = dist_thru; + any_changed = true; + } + } + else { + if (dist_thru < dp_length[idx_to]) { + dp_length[idx_to] = dist_thru; + any_changed = true; + } + } + } + }); + } + + // iterate over feedback edges + for (const pair& feedback_edge : feedback_edges) { + if (dp_length[feedback_edge.first] == numeric_limits::max()) { + // this node hasn't been reached yet (this saves us checking for overflow) + continue; + } + size_t dist_thru = dp_length[feedback_edge.first] + graph->get_length(layout[feedback_edge.first]); + if (feedback_edge.second == source_idx) { + if (dist_thru < cycle_length) { + cycle_length = dist_thru; + any_changed = true; + } + } + else { + if (dist_thru < dp_length[feedback_edge.second]) { + dp_length[feedback_edge.second] = dist_thru; + any_changed = true; + } + } + } + } + + return cycle_length; + } + + /// Simple Dijkstra implementation that computes shortest cycle + size_t dijkstra_shortest_cycle_length(const HandleGraph* graph, const handle_t& source) { + + // distance from start of source to incoming side of the handle + unordered_map distance_to; + + // init the queue + structures::RankPairingHeap> queue; + queue.push_or_reprioritize(source, 0); + + // Dijkstra traversal over entire graph + while (!queue.empty()) { + pair here = queue.top(); + queue.pop(); + + distance_to[here.first] = here.second; + + size_t dist_thru = here.second + graph->get_length(here.first); + graph->follow_edges(here.first, false, [&](const handle_t& next) { + queue.push_or_reprioritize(next, dist_thru); + }); + } + + // walk one step in the other direction to complete the cycle + size_t cycle_length = numeric_limits::max(); + graph->follow_edges(source, true, [&](const handle_t& prev) { + auto iter = distance_to.find(prev); + if (iter != distance_to.end()) { + cycle_length = min(cycle_length, iter->second + graph->get_length(prev)); + } + }); + return cycle_length; + } + + size_t shortest_cycle_length_internal(const HandleGraph* graph, + const handle_t& source, + const vector& layout, + const unordered_map& handle_index, + const vector>& feedback_edges) { + + size_t log_node_size = 0; + { + size_t log_counter = layout.size(); + while (log_counter) { + log_node_size++; + log_counter /= 2; + } + } + + // the Bellman-Ford implementation has run time proportional to the number of feedback + // arcs, so it has an advantage over Dijkstra if it is dominated by log |V| + if (feedback_edges.size() < log_node_size) { + return bellman_ford_shortest_cycle_length(graph, source, layout, handle_index, feedback_edges); + } + else { + return dijkstra_shortest_cycle_length(graph, source); + } + } + 
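+    // Illustrative usage sketch (not part of this change): the public entry points
+    // below give the graph-wide girth and the shortest cycle through one node.
+    // `graph` is any HandleGraph; node 1 is just an example ID, and callers should
+    // treat numeric_limits<size_t>::max() as "no cycle found".
+    //
+    //     size_t girth = algorithms::shortest_cycle_length(&graph);
+    //     size_t through_node_1 = algorithms::shortest_cycle_length(&graph, graph.get_handle(1));
+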
+ size_t shortest_cycle_length(const HandleGraph* graph, const handle_t& source) { + + // compute a small FAS layout + vector layout = handlealgs::eades_algorithm(graph); + + // identify each handle with its index in the layout + unordered_map handle_index; + for (size_t i = 0; i < layout.size(); i++) { + handle_index[layout[i]] = i; + } + + // collect the backward facing edges + vector> feedback_edges; + for (const handle_t& handle : layout) { + size_t idx_from = handle_index[handle]; + graph->follow_edges(handle, false, [&](const handle_t& next) { + size_t idx_to = handle_index[next]; + if (idx_from >= idx_to) { + feedback_edges.emplace_back(idx_from, idx_to); + } + }); + } + + return shortest_cycle_length_internal(graph, source, layout, handle_index, feedback_edges); + } + + size_t shortest_cycle_length(const HandleGraph* graph) { + + // compute a small FAS layout + vector layout = handlealgs::eades_algorithm(graph); + + // identify each handle with its index in the layout + unordered_map handle_index; + for (size_t i = 0; i < layout.size(); i++) { + handle_index[layout[i]] = i; + } + + // collect the backward facing edges + vector> feedback_edges; + for (const handle_t& handle : layout) { + size_t idx_from = handle_index[handle]; + graph->follow_edges(handle, false, [&](const handle_t& next) { + size_t idx_to = handle_index[next]; + if (idx_from >= idx_to) { + feedback_edges.emplace_back(idx_from, idx_to); + } + }); + } + + size_t min_cycle_length = numeric_limits::max(); + + // TODO: it shouldn't be necessary to do this on all nodes + for (const handle_t& handle : layout) { + size_t cycle_length = shortest_cycle_length_internal(graph, + handle, + layout, + handle_index, + feedback_edges); + min_cycle_length = min(min_cycle_length, cycle_length); + } + + return min_cycle_length; + } +} +} + diff --git a/src/algorithms/shortest_cycle.hpp b/src/algorithms/shortest_cycle.hpp new file mode 100644 index 00000000000..ff35b5b4be4 --- /dev/null +++ b/src/algorithms/shortest_cycle.hpp @@ -0,0 +1,25 @@ +#ifndef VG_ALGORITHMS_SHORTEST_CYCLE_HPP_INCLUDED +#define VG_ALGORITHMS_SHORTEST_CYCLE_HPP_INCLUDED + +#include + +#include "../handle.hpp" + +#include "structures/rank_pairing_heap.hpp" + +namespace vg { +namespace algorithms { + +using namespace std; + + /// Returns the length of the shortest cycle in the entire graph, or + /// numeric_limits::max() if no cycle exists. + size_t shortest_cycle_length(const HandleGraph* graph); + + /// Returns the length of the shortest cycle containing the source node, or + /// numeric_limits::max() if no cycle exists. + size_t shortest_cycle_length(const HandleGraph* graph, const handle_t& source); +} +} + +#endif diff --git a/src/algorithms/simplify_siblings.cpp b/src/algorithms/simplify_siblings.cpp new file mode 100644 index 00000000000..864016e8345 --- /dev/null +++ b/src/algorithms/simplify_siblings.cpp @@ -0,0 +1,293 @@ +/** + * \file simplify_siblings.cpp + * + * Defines an algorithm to merge nodes or parts of nodes with the same + * predecessors/successors. + */ + +#include "simplify_siblings.hpp" + +#include +#include + +namespace vg { +namespace algorithms { + +using namespace std; + +bool simplify_siblings(handlegraph::MutablePathDeletableHandleGraph* graph, + function can_merge) { + + // Each handle is part of a "family" of handles with the same parents on + // the left side and the same leading base. We elide the trivial ones. 
We + // also only get one family for either a handle or its flipped counterpart, + // because if both have families they need to be resolved one after the + // other. + vector> families; + + // This tracks all the node IDs we've already put in families we are going + // to merge. This ignores orientation, to ensure that if we're coming into + // a node on its local right side in one family, we don't come in on its + // left side in another. + unordered_set in_family; + + graph->for_each_handle([&](const handle_t& local_forward_node) { + // For each node local forward + + for (bool local_orientation : {false, true}) { + // For it local forward and local reverse + handle_t node = local_orientation ? graph->flip(local_forward_node) : local_forward_node; + +#ifdef debug + cerr << "Consider " << graph->get_id(node) << (graph->get_is_reverse(node) ? '-' : '+') << endl; +#endif + + if (in_family.count(graph->get_id(node))) { + // If it is in a family in one orientation, don't find a family for it in the other orientation. + // We can only merge from one end of a node at a time. +#ifdef debug + cerr << "Node " << graph->get_id(node) << " is already in a family to merge" << endl; +#endif + + return; + } + // For each handle where it or its RC isn't already in a superfamily, identify its superfamily. + unordered_set superfamily; + + // Look left from the node and make a set of the things you see. + unordered_set correct_parents; + graph->follow_edges(node, true, [&](const handle_t& parent) { + correct_parents.insert(parent); +#ifdef debug + cerr << "Parent set includes: " << graph->get_id(parent) << (graph->get_is_reverse(parent) ? '-' : '+') << endl; +#endif + }); + + // Keep a set of things that are partial siblings so we don't have to constantly check them + unordered_set partial_siblings; + for (auto& parent : correct_parents) { + graph->follow_edges(parent, false, [&](const handle_t& candidate) { + // Look right from parents and for each candidate family member + +#ifdef debug + cerr << "Parent " << graph->get_id(parent) << (graph->get_is_reverse(parent) ? '-' : '+') + << " suggests sibling " << graph->get_id(candidate) << (graph->get_is_reverse(candidate) ? '-' : '+') << endl; +#endif + + if (partial_siblings.count(candidate)) { + // Known non-member +#ifdef debug + cerr << "\tAlready checked." << endl; +#endif + return; + } + if (superfamily.count(candidate)) { + // Known member +#ifdef debug + cerr << "\tAlready taken." << endl; +#endif + return; + } + + if (in_family.count(graph->get_id(candidate))) { + // If it is in a family in one orientation, don't find a family for it in the other orientation. + // We can only merge from one end of a node at a time. +#ifdef debug + cerr << "\tAlready in a family to merge." << endl; +#endif + return; + } + + // Look left from it and see if it has the right parents. + size_t seen_parents = 0; + bool bad_parent = false; + graph->follow_edges(candidate, true, [&](const handle_t& candidate_parent) { + if (!correct_parents.count(candidate_parent)) { + // We have a parent we shouldn't + bad_parent = true; + +#ifdef debug + cerr << "\tHas unacceptable parent " + << graph->get_id(candidate_parent) << (graph->get_is_reverse(candidate_parent) ? '-' : '+') << endl; +#endif + + return false; + } else { + // Otherwise we found one of the right ones. + seen_parents++; + +#ifdef debug + cerr << "\tHas OK parent " + << graph->get_id(candidate_parent) << (graph->get_is_reverse(candidate_parent) ? 
'-' : '+') << endl; +#endif + + return true; + } + }); + +#ifdef debug + cerr << "\tHas " << seen_parents << "/" << correct_parents.size() << " required parents" << endl; +#endif + // If it has the correct parents, it is a member of the superfamily + bool superfamily_check = !bad_parent && seen_parents == correct_parents.size(); + if (can_merge != nullptr) { + // optional callback filter checks candidate against the super family + for (auto super_it = superfamily.begin(); superfamily_check && super_it != superfamily.end(); ++super_it) { + superfamily_check = can_merge(candidate, *super_it); + } + } + if (superfamily_check) { + // If it has the correct parents and passes the check callback, it is a member of the superfamily + superfamily.insert(candidate); + +#ifdef debug + cerr << "\tBelongs in superfamily" << endl; +#endif + + } else { + // Otherwise, it is out, so don't check it again if we find it from another parent. + partial_siblings.insert(candidate); + +#ifdef debug + cerr << "\tOnly a partial sibling" << endl; +#endif + + } + }); + } + + // Now we have a family. It can't overap with any existing ones. + + if (superfamily.size() > 1) { + // It is nontrivial + + // Make sure no node appears multiple times in the superfamily (in opposite orientations). + // TODO: somehow deal with merging on different ends of the same node + unordered_set seen; + bool qualified = true; + for (auto& h : superfamily) { + id_t id = graph->get_id(h); + if (seen.count(id)) { + // We need to disqualify this superfamily to avoid parallel merging on the same node. + qualified = false; + +#ifdef debug + cerr << "Disqualify superfamily due to duplicate node " << id << endl; +#endif + + break; + } + seen.insert(id); + } + + if (!qualified) { + // This may contain two nontrivial families for the same node. Skip it. + // TODO: Only disqualify actually-conflicting families + continue; + } + + // Now we know all the families in the superfamily can exist together. + + // Bucket by leading base + map> by_base; + for (auto& h : superfamily) { + if (graph->get_length(h) == 0) { + // Empty nodes probably shouldn't exist, but skip them. + +#ifdef debug + cerr << "Empty node: " << graph->get_id(h) << (graph->get_is_reverse(h) ? '-' : '+') << endl; +#endif + + continue; + } + + // Bucket by base into families + by_base[graph->get_base(h, 0)].emplace_back(h); + } + +#ifdef debug + cerr << "Found " << by_base.size() << " distinct start bases" << endl; +#endif + + for (auto& base_and_family : by_base) { + // For each family we found + auto& family = base_and_family.second; + + if (family.size() == 1) { + // Ignore the trivial ones + continue; + } + +#ifdef debug + cerr << "Nontrivial family of " << family.size() << " nodes starting with " << base_and_family.first << endl; +#endif + + for (auto& h : family) { + // We're going to do this family, so disqualify all the nodes from other families on the other side. + in_family.insert(graph->get_id(h)); + +#ifdef debug + cerr << "Ban node " << graph->get_id(h) << " from subsequent families" << endl; +#endif + + } + + // Then save the family as a real family to merge on + families.push_back(family); + } + } + } + }); + + in_family.clear(); + +#ifdef debug + cerr << "Found " << families.size() << " distinct nontrivial families" << endl; +#endif + + // Now we have a bunch of families that won't invalidate each others' handles. + + // We set this tro true if we do any work. 
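    // (A single call performs only one generation of merges; as noted at the
    // bottom of this function, callers that want a fully simplified graph are
    // expected to keep invoking it until it reports no further progress, e.g.
    //
    //     while (simplify_siblings(&graph)) {
    //         // repeat until a fixed point is reached
    //     }
    // )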
+ bool made_progress = false; + + for (auto& family : families) { + // Set up the merge + // everything needs to start at base 0 + vector> merge_from; + merge_from.reserve(family.size()); + merge_from.emplace_back(family.at(0), 0); + + // Work out the length of the longest common prefix + size_t lcp_length = graph->get_length(family.at(0)); + string reference_string = graph->get_sequence(family.at(0)); + for (size_t i = 1; i < family.size(); i++) { + // Create the merge start position + merge_from.emplace_back(family.at(i), 0); + + // See where the first (case-insensitive) mismatch is, and min that in with the LCP length + auto other_string = graph->get_sequence(family.at(i)); + auto mismatch_iters = std::mismatch(reference_string.begin(), reference_string.end(), other_string.begin(), + [](unsigned char c1, unsigned char c2) {return std::toupper(c1) == std::toupper(c2);}); + size_t match_length = mismatch_iters.first - reference_string.begin(); + lcp_length = std::min(lcp_length, match_length); + } + + // There should be at least one base of match because we bucketed by base. + assert(lcp_length >= 1); + + // Do the merge. It can only invalidate handles within this family. + merge(graph, merge_from, lcp_length); + + // We did a merge + made_progress = true; + } + + // To merge everything on the other side of stuff we just merged, we need to start from the top again. + // So return if we did anything and more might remain (or have been created) to do. + return made_progress; +} + + +} +} + diff --git a/src/algorithms/simplify_siblings.hpp b/src/algorithms/simplify_siblings.hpp new file mode 100644 index 00000000000..2b0ebb2f4f4 --- /dev/null +++ b/src/algorithms/simplify_siblings.hpp @@ -0,0 +1,35 @@ +#ifndef VG_ALGORITHMS_SIMPLIFY_SIBLINGS_HPP_INCLUDED +#define VG_ALGORITHMS_SIMPLIFY_SIBLINGS_HPP_INCLUDED + + +#include "../handle.hpp" +#include "merge.hpp" +#include + +namespace vg { +namespace algorithms { + +using namespace std; + +/** + * Simplify siblings in the given graph. + * + * When one base has two successors with the same base value, and those + * successors have the same set of predecessors, the successors will be merged. + * + * Performs only a subset of the possible merges. Can only merge in from one + * side of a given node in a single invocation. Returns true if it made + * progress and there may be more merging to do. + * + * Preserves paths. + * + * Optional can_merge callback will only let nodes get merged together if + * this pairwise check returns true. 
+ */ +bool simplify_siblings(handlegraph::MutablePathDeletableHandleGraph* graph, + function can_merge = nullptr); + +} +} + +#endif diff --git a/src/algorithms/split_strands.cpp b/src/algorithms/split_strands.cpp deleted file mode 100644 index 86cec66b308..00000000000 --- a/src/algorithms/split_strands.cpp +++ /dev/null @@ -1,61 +0,0 @@ -#include "split_strands.hpp" - -namespace vg { -namespace algorithms { - -using namespace std; - - unordered_map> split_strands(const HandleGraph* source, MutableHandleGraph* into) { - - if (into->node_size()) { - cerr << "error:[algorithms] attempted to create strand-splitted graph in a non-empty graph" << endl; - exit(1); - } - - unordered_map> node_translation; - - unordered_map forward_node; - unordered_map reverse_node; - - unordered_set edges; - - source->for_each_handle([&](const handle_t& handle) { - // create and record forward and reverse versions of each node - handle_t fwd_handle = into->create_handle(source->get_sequence(handle)); - handle_t rev_handle = into->create_handle(reverse_complement(source->get_sequence(handle))); - - forward_node[handle] = fwd_handle; - reverse_node[handle] = rev_handle; - - node_translation[into->get_id(fwd_handle)] = make_pair(source->get_id(handle), false); - node_translation[into->get_id(rev_handle)] = make_pair(source->get_id(handle), true); - - // collect all the edges - source->follow_edges(handle, true, [&](const handle_t& prev) { - edges.insert(source->edge_handle(prev, handle)); - }); - source->follow_edges(handle, false, [&](const handle_t& next) { - edges.insert(source->edge_handle(handle, next)); - }); - }); - - // translate each edge into two edges between forward-oriented nodes - for (edge_t edge : edges) { - handle_t fwd_prev = source->get_is_reverse(edge.first) ? reverse_node[source->flip(edge.first)] - : forward_node[edge.first]; - handle_t fwd_next = source->get_is_reverse(edge.second) ? reverse_node[source->flip(edge.second)] - : forward_node[edge.second]; - - handle_t rev_prev = source->get_is_reverse(edge.second) ? forward_node[source->flip(edge.second)] - : reverse_node[edge.second]; - handle_t rev_next = source->get_is_reverse(edge.first) ? forward_node[source->flip(edge.first)] - : reverse_node[edge.first]; - - into->create_edge(fwd_prev, fwd_next); - into->create_edge(rev_prev, rev_next); - } - - return move(node_translation); - } -} -} diff --git a/src/algorithms/split_strands.hpp b/src/algorithms/split_strands.hpp deleted file mode 100644 index f80bebd10c5..00000000000 --- a/src/algorithms/split_strands.hpp +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef VG_ALGORITHMS_SPLIT_STRANDS_HPP_INCLUDED -#define VG_ALGORITHMS_SPLIT_STRANDS_HPP_INCLUDED - -/** - * \file split_strands.hpp - * - * Defines algorithm for converting any graph into a single stranded graph. - */ - -#include "../handle.hpp" -#include "../utility.hpp" - -#include -#include - -namespace vg { -namespace algorithms { - -using namespace std; - - /// Fills a MutableHandleGraph 'into' with a graph that has the same sequence and path - /// space as 'source', but all of the sequences are on the forward strand. This is - /// accomplished by creating a new node for each node in the source graph with the reverse - /// complement sequence. Returns a map that translates node IDs from 'into' to their - /// node ID and orientation in 'source'. Reports an error and exits if 'into' is not - /// empty. 
- unordered_map> split_strands(const HandleGraph* source, - MutableHandleGraph* into); - -} -} - -#endif diff --git a/src/algorithms/strongly_connected_components.cpp b/src/algorithms/strongly_connected_components.cpp deleted file mode 100644 index 884bf1f2da7..00000000000 --- a/src/algorithms/strongly_connected_components.cpp +++ /dev/null @@ -1,127 +0,0 @@ -#include "strongly_connected_components.hpp" - -namespace vg { -namespace algorithms { - -using namespace std; - // recursion-free version of Tarjan's strongly connected components algorithm - // https://en.wikipedia.org/wiki/Tarjan%27s_strongly_connected_components_algorithm - // Generalized to bidirected graphs as described (confusingly) in - // "Decomposition of a bidirected graph into strongly connected components and - // its signed poset structure", by Kazutoshi Ando, Satoru Fujishige, and Toshio - // Nemoto. http://www.sciencedirect.com/science/article/pii/0166218X95000683 - - // The best way to think about that paper is that the edges are vectors in a - // vector space with number of dimensions equal to the number of nodes in the - // graph, and an edge attaching to the end a node is the positive unit vector in - // its dimension, and an edge attaching to the start of node is the negative - // unit vector in its dimension. - - // The basic idea is that you just consider the orientations as different nodes, - // and the edges as existing between both pairs of orientations they connect, - // and do connected components on that graph. Since we don't care about - // "consistent" or "inconsistent" strongly connected components, we just put a - // node in a component if either orientation is in it. But bear in mind that - // both orientations of a node might not actually be in the same strongly - // connected component in a bidirected graph, so now the components may overlap. - vector> strongly_connected_components(const HandleGraph* handle_graph) { - -#ifdef debug - cerr << "Computing strongly connected components" << endl; -#endif - - // What node visit step are we on? - int64_t index = 0; - // What's the search root from which a node was reached? - unordered_map roots; - // At what index step was each node discovered? - unordered_map discover_idx; - // We need our own copy of the DFS stack - vector stack; - // And our own set of nodes already on the stack - unordered_set on_stack; - // What components did we find? Because of the way strongly connected - // components generalizes, both orientations of a node always end up in the - // same component. - vector> components; - - dfs(*handle_graph, - [&](const handle_t& trav) { - // When a NodeTraversal is first visited -#ifdef debug - cerr << "First visit to " << handle_graph->get_id(trav) << " orientation " << handle_graph->get_is_reverse(trav) << endl; -#endif - // It is its own root - roots[trav] = trav; - // We discovered it at this step - discover_idx[trav] = index++; - // And it's on the stack - stack.push_back(trav); - on_stack.insert(trav); - }, - [&](const handle_t& trav) { - // When a NodeTraversal is done being recursed into -#ifdef debug - cerr << "Finishing " << handle_graph->get_id(trav) << " orientation " << handle_graph->get_is_reverse(trav) << endl; -#endif - // Go through all the NodeTraversals reachable reading onwards from this traversal. 
- handle_graph->follow_edges(trav, false, [&](const handle_t& next) { -#ifdef debug - cerr << "\tCould next reach " << handle_graph->get_id(next) << " orientation " << handle_graph->get_is_reverse(next) << endl; -#endif - if (on_stack.count(next)) { - // If any of those NodeTraversals are on the stack already -#ifdef debug - cerr << "\t\tIt is already on the stack, so maybe we want its root" << endl; -#endif - auto& node_root = roots[trav]; - auto& next_root = roots[next]; -#ifdef debug - cerr << "\t\t\tWe have root " << handle_graph->get_id(node_root) << " orientation " - << handle_graph->get_is_reverse(node_root) - << " discovered at time " << discover_idx[node_root] << endl; - cerr << "\t\t\tThey have root " << handle_graph->get_id(next_root) << " orientation " - << handle_graph->get_is_reverse(next_root) - << " discovered at time " << discover_idx[next_root] << endl; -#endif - // Adopt the root of the NodeTraversal that was discovered first. - roots[trav] = discover_idx[node_root] < discover_idx[next_root] ? - node_root : next_root; -#ifdef debug - cerr << "\t\t\tWinning root: " << handle_graph->get_id(roots[trav]) << " orientation " - << handle_graph->get_is_reverse(roots[trav]) << endl; -#endif - } - return true; - }); - - if (roots[trav] == trav) { - // If we didn't find a better root -#ifdef debug - cerr << "\tWe are our own best root, so glom up everything under us" << endl; -#endif - handle_t other; - components.emplace_back(); - auto& component = components.back(); - do - { - // Grab everything that was put on the DFS stack below us - // and put it in our component. - other = stack.back(); - stack.pop_back(); - on_stack.erase(other); - component.insert(handle_graph->get_id(other)); -#ifdef debug - cerr << "\t\tSnarf up node " << handle_graph->get_id(other) << " from handle in orientation " - << handle_graph->get_is_reverse(other) << endl; -#endif - } while (other != trav); - } - }, - vector(), unordered_set()); - - return components; - } - -} -} diff --git a/src/algorithms/strongly_connected_components.hpp b/src/algorithms/strongly_connected_components.hpp deleted file mode 100644 index 6f068276c3a..00000000000 --- a/src/algorithms/strongly_connected_components.hpp +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef VG_ALGORITHMS_STRONGLY_CONNECTED_COMPONENTS_HPP_INCLUDED -#define VG_ALGORITHMS_STRONGLY_CONNECTED_COMPONENTS_HPP_INCLUDED - -#include -#include "../handle.hpp" -#include "dfs.hpp" - -namespace vg { -namespace algorithms { - -using namespace std; - -/// Find all of the nodes with no edges on their left sides. 
-vector> strongly_connected_components(const HandleGraph* g); - -} -} - -#endif diff --git a/src/algorithms/subgraph.cpp b/src/algorithms/subgraph.cpp new file mode 100644 index 00000000000..2ea619ea952 --- /dev/null +++ b/src/algorithms/subgraph.cpp @@ -0,0 +1,402 @@ +#include "subgraph.hpp" +#include "../path.hpp" + +namespace vg { +namespace algorithms { + +void expand_subgraph_by_steps(const HandleGraph& source, MutableHandleGraph& subgraph, const uint64_t& steps, bool forward_only) { + std::vector curr_handles; + subgraph.for_each_handle([&](const handle_t& h) { + curr_handles.push_back(h); + }); + for (uint64_t i = 0; i < steps && !curr_handles.empty(); ++i) { + std::vector next_handles; + for (auto& h : curr_handles) { + handle_t old_h = source.get_handle(subgraph.get_id(h)); + source.follow_edges(old_h, false, [&](const handle_t& c) { + handle_t x; + if (!subgraph.has_node(source.get_id(c))) { + x = subgraph.create_handle(source.get_sequence(source.get_is_reverse(c)?source.flip(c):c), source.get_id(c)); + next_handles.push_back(x); + } else { + x = subgraph.get_handle(source.get_id(c)); + } + if (source.get_is_reverse(c)) { + x = subgraph.flip(x); + } + if (!subgraph.has_edge(h, x)) { + subgraph.create_edge(h, x); + } + }); + if (!forward_only) { + source.follow_edges(old_h, true, [&](const handle_t& c) { + handle_t x; + if (!subgraph.has_node(source.get_id(c))) { + x = subgraph.create_handle(source.get_sequence(source.get_is_reverse(c)?source.flip(c):c), source.get_id(c)); + next_handles.push_back(x); + } else { + x = subgraph.get_handle(source.get_id(c)); + } + if (source.get_is_reverse(c)) { + x = subgraph.flip(x); + } + if (!subgraph.has_edge(x, h)) { + subgraph.create_edge(x, h); + } + }); + } + } + curr_handles = std::move(next_handles); + } + add_connecting_edges_to_subgraph(source, subgraph); +} + +void expand_subgraph_to_node_count(const HandleGraph& source, MutableHandleGraph& subgraph, const uint64_t& node_count, bool forward_only) { + std::vector curr_handles; + subgraph.for_each_handle([&](const handle_t& h) { + curr_handles.push_back(h); + }); + while (subgraph.get_node_count() < node_count && subgraph.get_node_count()) { + std::vector next_handles; + for (auto& h : curr_handles) { + handle_t old_h = source.get_handle(subgraph.get_id(h)); + source.follow_edges(old_h, false, [&](const handle_t& c) { + handle_t x; + if (!subgraph.has_node(source.get_id(c))) { + x = subgraph.create_handle(source.get_sequence(source.get_is_reverse(c)?source.flip(c):c), source.get_id(c)); + next_handles.push_back(x); + } else { + x = subgraph.get_handle(source.get_id(c)); + } + if (source.get_is_reverse(c)) { + x = subgraph.flip(x); + } + if (!subgraph.has_edge(h, x)) { + subgraph.create_edge(h, x); + } + }); + if (!forward_only) { + source.follow_edges(old_h, true, [&](const handle_t& c) { + handle_t x; + if (!subgraph.has_node(source.get_id(c))) { + x = subgraph.create_handle(source.get_sequence(source.get_is_reverse(c)?source.flip(c):c), source.get_id(c)); + next_handles.push_back(x); + } else { + x = subgraph.get_handle(source.get_id(c)); + } + if (source.get_is_reverse(c)) { + x = subgraph.flip(x); + } + if (!subgraph.has_edge(x, h)) { + subgraph.create_edge(x, h); + } + }); + } + } + curr_handles = std::move(next_handles); + } + add_connecting_edges_to_subgraph(source, subgraph); +} + +void expand_subgraph_by_length(const HandleGraph& source, MutableHandleGraph& subgraph, const uint64_t& length, bool forward_only) { + uint64_t accumulated_length = 0; + std::vector curr_handles; + 
subgraph.for_each_handle([&](const handle_t& h) { + curr_handles.push_back(h); + }); + while (accumulated_length < length && !curr_handles.empty()) { + std::vector next_handles; + for (auto& h : curr_handles) { + handle_t old_h = source.get_handle(subgraph.get_id(h)); + source.follow_edges(old_h, false, [&](const handle_t& c) { + handle_t x; + if (!subgraph.has_node(source.get_id(c))) { + x = subgraph.create_handle(source.get_sequence(source.get_is_reverse(c)?source.flip(c):c), source.get_id(c)); + next_handles.push_back(x); + accumulated_length += subgraph.get_length(x); + } else { + x = subgraph.get_handle(source.get_id(c)); + } + if (source.get_is_reverse(c)) { + x = subgraph.flip(x); + } + if (!subgraph.has_edge(h, x)) { + subgraph.create_edge(h, x); + } + }); + if (!forward_only) { + source.follow_edges(old_h, true, [&](const handle_t& c) { + handle_t x; + if (!subgraph.has_node(source.get_id(c))) { + x = subgraph.create_handle(source.get_sequence(source.get_is_reverse(c)?source.flip(c):c), source.get_id(c)); + next_handles.push_back(x); + accumulated_length += subgraph.get_length(x); + } else { + x = subgraph.get_handle(source.get_id(c)); + } + if (source.get_is_reverse(c)) { + x = subgraph.flip(x); + } + if (!subgraph.has_edge(x, h)) { + subgraph.create_edge(x, h); + } + }); + } + } + curr_handles = std::move(next_handles); + } + add_connecting_edges_to_subgraph(source, subgraph); +} + +void expand_subgraph_to_length(const HandleGraph& source, MutableHandleGraph& subgraph, const uint64_t& length, bool forward_only) { + uint64_t total_length = 0; + std::vector curr_handles; + subgraph.for_each_handle([&](const handle_t& h) { + total_length += subgraph.get_length(h); + curr_handles.push_back(h); + }); + while (total_length < length && !curr_handles.empty()) { + std::vector next_handles; + for (auto& h : curr_handles) { + handle_t old_h = source.get_handle(subgraph.get_id(h)); + source.follow_edges(old_h, false, [&](const handle_t& c) { + handle_t x; + if (!subgraph.has_node(source.get_id(c))) { + x = subgraph.create_handle(source.get_sequence(source.get_is_reverse(c)?source.flip(c):c), source.get_id(c)); + next_handles.push_back(x); + total_length += subgraph.get_length(x); + } else { + x = subgraph.get_handle(source.get_id(c)); + } + if (source.get_is_reverse(c)) { + x = subgraph.flip(x); + } + if (!subgraph.has_edge(h, x)) { + subgraph.create_edge(h, x); + } + }); + if (!forward_only) { + source.follow_edges(old_h, true, [&](const handle_t& c) { + handle_t x; + if (!subgraph.has_node(source.get_id(c))) { + x = subgraph.create_handle(source.get_sequence(source.get_is_reverse(c)?source.flip(c):c), source.get_id(c)); + next_handles.push_back(x); + total_length += subgraph.get_length(x); + } else { + x = subgraph.get_handle(source.get_id(c)); + } + if (source.get_is_reverse(c)) { + x = subgraph.flip(x); + } + if (!subgraph.has_edge(x, h)) { + subgraph.create_edge(x, h); + } + }); + } + } + curr_handles = std::move(next_handles); + } + add_connecting_edges_to_subgraph(source, subgraph); +} + +/// expand the context around a single handle position +void extract_context(const HandleGraph& source, MutableHandleGraph& subgraph, const handle_t& handle, const uint64_t& offset, const uint64_t& length, bool fwd, bool rev) { + uint64_t total_length_fwd = source.get_length(handle)-offset; + uint64_t total_length_rev = offset; + uint64_t get_fwd = fwd && !rev ? length : length/2; + uint64_t get_rev = !fwd && rev ? 
length : length/2; + if (!subgraph.has_node(source.get_id(handle))) { + subgraph.create_handle(source.get_sequence(source.get_is_reverse(handle)?source.flip(handle):handle), source.get_id(handle)); + } + bool extended = true; + while (extended && (total_length_fwd < get_fwd || total_length_rev < get_rev)) { + std::vector curr_handles; + subgraph.for_each_handle([&](const handle_t& h) { + curr_handles.push_back(h); + }); + extended = false; + for (auto& h : curr_handles) { + handle_t old_h = source.get_handle(subgraph.get_id(h)); + if (total_length_fwd < get_fwd) { + source.follow_edges(old_h, false, [&](const handle_t& c) { + if (total_length_fwd < get_fwd) { + handle_t x; + if (!subgraph.has_node(source.get_id(c))) { + x = subgraph.create_handle(source.get_sequence(source.get_is_reverse(c)?source.flip(c):c), source.get_id(c)); + total_length_fwd += subgraph.get_length(x); + extended = true; + } else { + x = subgraph.get_handle(source.get_id(c)); + } + if (source.get_is_reverse(c)) { + x = subgraph.flip(x); + } + if (!subgraph.has_edge(h, x)) { + subgraph.create_edge(h, x); + } + } + }); + } + if (total_length_rev < get_rev) { + source.follow_edges(old_h, true, [&](const handle_t& c) { + if (total_length_rev < get_rev) { + handle_t x; + if (!subgraph.has_node(source.get_id(c))) { + x = subgraph.create_handle(source.get_sequence(source.get_is_reverse(c)?source.flip(c):c), source.get_id(c)); + total_length_rev += subgraph.get_length(x); + extended = true; + } else { + x = subgraph.get_handle(source.get_id(c)); + } + if (source.get_is_reverse(c)) { + x = subgraph.flip(x); + } + if (!subgraph.has_edge(x, h)) { + subgraph.create_edge(x, h); + } + } + }); + } + } + } + add_connecting_edges_to_subgraph(source, subgraph); +} + +void extract_id_range(const HandleGraph& source, const nid_t& id1, const nid_t& id2, MutableHandleGraph& subgraph) { + for (nid_t i = id1; i <= id2; ++i) { + if (!subgraph.has_node(i)) { + subgraph.create_handle(source.get_sequence(source.get_handle(i)), i); + } + } +} + +void extract_path_range(const PathPositionHandleGraph& source, path_handle_t path_handle, int64_t start, int64_t end, + MutableHandleGraph& subgraph) { + step_handle_t start_step = source.get_step_at_position(path_handle, start); + size_t start_position = source.get_position_of_step(start_step); + size_t size_needed = end < 0 ? 
numeric_limits::max() : end - start + 1 + start - start_position; + size_t running_length = 0; + + for (step_handle_t cur_step = start_step; cur_step != source.path_end(path_handle) && running_length < size_needed; + cur_step = source.get_next_step(cur_step)) { + handle_t cur_handle = source.get_handle_of_step(cur_step); + subgraph.create_handle(source.get_sequence(cur_handle), source.get_id(cur_handle)); + if (cur_step != start_step) { + handle_t prev_handle = source.get_handle_of_step(source.get_previous_step(cur_step)); + subgraph.create_edge(subgraph.get_handle(source.get_id(prev_handle), source.get_is_reverse(prev_handle)), + subgraph.get_handle(source.get_id(cur_handle), source.get_is_reverse(cur_handle))); + } + running_length += source.get_length(cur_handle); + } +} + +/// add subpaths to the subgraph, providing a concatenation of subpaths that are discontiguous over the subgraph +/// based on their order in the path position index provided by the source graph +/// will clear any path found in both graphs before writing the new steps into it +/// if subpath_naming is true, a suffix will be added to each path in the subgraph denoting its offset +/// in the source graph (unless the subpath was not cut up at all) +void add_subpaths_to_subgraph(const PathPositionHandleGraph& source, MutablePathHandleGraph& subgraph, + bool subpath_naming) { + std::unordered_map > subpaths; + subgraph.for_each_handle([&](const handle_t& h) { + handlegraph::nid_t id = subgraph.get_id(h); + if (source.has_node(id)) { + handle_t handle = source.get_handle(id); + source.for_each_step_position_on_handle(handle, [&](const step_handle_t& step, const bool& is_rev, const uint64_t& pos) { + path_handle_t path = source.get_path_handle_of_step(step); + std::string path_name = source.get_path_name(path); + subpaths[path_name][pos] = is_rev ? subgraph.flip(h) : h; + return true; + }); + } + }); + + function new_subpath = + [&subgraph](const string& path_name, bool is_circular, size_t subpath_offset) { + PathSense sense; + string sample; + string locus; + size_t haplotype; + size_t phase_block; + subrange_t subrange; + PathMetadata::parse_path_name(path_name, sense, sample, locus, haplotype, phase_block, subrange); + if (subrange == PathMetadata::NO_SUBRANGE) { + subrange.first = subpath_offset; + } else { + subrange.first += subpath_offset; + } + subrange.first = subpath_offset; + subrange.second = PathMetadata::NO_END_POSITION; + string subpath_name = PathMetadata::create_path_name(sense, sample, locus, haplotype, phase_block, subrange); + if (subgraph.has_path(subpath_name)) { + subgraph.destroy_path(subgraph.get_path_handle(subpath_name)); + } + return subgraph.create_path_handle(subpath_name, is_circular); + }; + + for (auto& subpath : subpaths) { + const std::string& path_name = subpath.first; + path_handle_t source_path_handle = source.get_path_handle(path_name); + // destroy the path if it exists + if (subgraph.has_path(path_name)) { + subgraph.destroy_path(subgraph.get_path_handle(path_name)); + } + // create a new path. 
give it a subpath name if the flag's on and its smaller than original + path_handle_t path; + if (!subpath_naming || subpath.second.size() == source.get_step_count(source_path_handle) || + subpath.second.empty()) { + path = subgraph.create_path_handle(path_name, source.get_is_circular(source_path_handle)); + } else { + path = new_subpath(path_name, source.get_is_circular(source_path_handle), subpath.second.begin()->first); + } + for (auto p = subpath.second.begin(); p != subpath.second.end(); ++p) { + const handle_t& handle = p->second; + if (p != subpath.second.begin() && subpath_naming) { + auto prev = p; + --prev; + const handle_t& prev_handle = prev->second; + // distance from map + size_t delta = p->first - prev->first; + // what the distance should be if they're contiguous depends on relative orienations + size_t cont_delta = subgraph.get_length(prev_handle); + if (delta != cont_delta) { + // we have a discontinuity! we'll make a new path can continue from there + assert(subgraph.get_step_count(path) > 0); + path = new_subpath(path_name, subgraph.get_is_circular(path), p->first); + } + } + //fill in the path information + subgraph.append_step(path, handle); + } + } +} + +/// We can accumulate a subgraph without accumulating all the edges between its nodes +/// this helper ensures that we get the full set +void add_connecting_edges_to_subgraph(const HandleGraph& source, MutableHandleGraph& subgraph) { + subgraph.for_each_handle([&](const handle_t& handle) { + nid_t id = subgraph.get_id(handle); + handle_t source_handle = source.get_handle(id, subgraph.get_is_reverse(handle)); + source.follow_edges(source_handle, false, [&](const handle_t& next) { + nid_t next_id = source.get_id(next); + if (subgraph.has_node(next_id)) { + handle_t subgraph_next = subgraph.get_handle(next_id, source.get_is_reverse(next)); + if (!subgraph.has_edge(handle, subgraph_next)) { + subgraph.create_edge(handle, subgraph_next); + } + } + }); + source.follow_edges(source_handle, true, [&](const handle_t& prev) { + nid_t prev_id = source.get_id(prev); + if (subgraph.has_node(prev_id)) { + handle_t subgraph_prev = subgraph.get_handle(prev_id, source.get_is_reverse(prev)); + if (!subgraph.has_edge(subgraph_prev, handle)) { + subgraph.create_edge(subgraph_prev, handle); + } + } + }); + }); +} + +} +} diff --git a/src/algorithms/subgraph.hpp b/src/algorithms/subgraph.hpp new file mode 100644 index 00000000000..fef9e174235 --- /dev/null +++ b/src/algorithms/subgraph.hpp @@ -0,0 +1,50 @@ +#pragma once + +#include "../handle.hpp" +//#include "../subgraph.hpp" +#include "../types.hpp" +#include "shortest_cycle.hpp" +#include + +namespace vg { +namespace algorithms { + +using namespace std; + +/// expand the subgraph iteratively for this many steps +void expand_subgraph_by_steps(const HandleGraph& source, MutableHandleGraph& subgraph, const uint64_t& steps, bool forward_only = false); + +/// expand the subgraph iteratively until its node count is at least node_count +void expand_subgraph_to_node_count(const HandleGraph& source, MutableHandleGraph& subgraph, const uint64_t& node_count, bool forward_only = false); + +/// expand the subgraph iteratively to include at least length new sequence +void expand_subgraph_by_length(const HandleGraph& source, MutableHandleGraph& subgraph, const uint64_t& length, bool forward_only = false); + +/// expand the subgraph iterativel until its total sequence length is greater than length +void expand_subgraph_to_length(const HandleGraph& source, MutableHandleGraph& subgraph, const uint64_t& 
length, bool forward_only = false); + +/// expand the context around a single handle position +void extract_context(const HandleGraph& source, MutableHandleGraph& subgraph, const handle_t& handle, const uint64_t& offset, const uint64_t& length, bool go_fwd = true, bool go_rev = true); + +/// extract the node id range +void extract_id_range(const HandleGraph& source, const nid_t& id1, const nid_t& id2, MutableHandleGraph& subgraph); + +/// extract the path range +/// nodes aren't cut, so the returned graph may start before start and/or end after end +/// if end < 0, then it will walk to the end of the path +void extract_path_range(const PathPositionHandleGraph& source, path_handle_t path_handle, int64_t start, int64_t end, MutableHandleGraph& subgraph); + +/// add subpaths to the subgraph, providing a concatenation of subpaths that are discontiguous over the subgraph +/// based on their order in the path position index provided by the source graph +/// will clear any path found in both graphs before writing the new steps into it +/// if subpath_naming is true, a suffix will be added to each path in the subgraph denoting its offset +/// in the source graph (unless the subpath was not cut up at all) +void add_subpaths_to_subgraph(const PathPositionHandleGraph& source, MutablePathHandleGraph& subgraph, + bool subpath_naming = false); + +/// We can accumulate a subgraph without accumulating all the edges between its nodes +/// this helper ensures that we get the full set +void add_connecting_edges_to_subgraph(const HandleGraph& source, MutableHandleGraph& subgraph); + +} +} diff --git a/src/algorithms/three_edge_connected_components.cpp b/src/algorithms/three_edge_connected_components.cpp new file mode 100644 index 00000000000..77a1c7ed432 --- /dev/null +++ b/src/algorithms/three_edge_connected_components.cpp @@ -0,0 +1,804 @@ +#include "three_edge_connected_components.hpp" + +extern "C" { +#include "sonLib/sonLibList.h" +#include "sonLib/sonLibTuples.h" +#include "sonLib/3_Absorb3edge2x.h" +} + +#include + +#include +#include +#include +#include + +//#define debug + +namespace vg { +namespace algorithms { + +using namespace std; + +void three_edge_connected_component_merges_dense(size_t node_count, size_t first_root, + const function&)>& for_each_connected_node, + const function& same_component) { + + // Independent implementation of Norouzi and Tsin (2014) "A simple 3-edge + // connected component algorithm revisited", which can't really be + // understood without Tsin (2007) "A Simple 3-Edge-Connected Component + // Algorithm". + + // That algorithm assumes that all bridge edges are removed (i.e. + // everything is at least 2-connected), but we hack it a bit to generalize + // to graphs with bridge edges. It also assumes there are no self loops, + // but this implementation detects and allows self loops. + + // The algorithm does a depth-first search through the graph, and is based + // on this "absorb-eject" operation. You do it at a node, across ("on") an + // edge. It (conceptually) steals all the edges from the node at the other + // end of the edge, deletes the edge, and deletes the other node as well if + // it has a degree greater than 2. (The original algorithm didn't have to + // deal with degree 1; here we treat it about the same as degree 2 and + // leave the node floating in its own 3 edge connected component, while + // hiding the single edge from the real logic of the algorithm.) 
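    // As a concrete illustration (ours, not the paper's): absorbing v across
    // an edge v--w when w has effective degree 2 just splices w's one
    // remaining edge onto v and leaves w alone in its own 3-edge-connected
    // component, while absorbing a higher-degree w merges it into v's
    // component and leaves v with effective degree deg(v) + deg(w) - 2,
    // which is exactly the update applied in absorb_all_along_path below.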
+ + // Because of guarantees about the order in which we traverse the graph, we + // don't actually have to *do* any of the absorb-eject graph topology + // modifications. Instead, we just have to keep track of updates to nodes' + // "effective degree" in what would be the modified graph, and allow + // certain paths that we track during the algorithm to traverse the stolen + // edges. + + // For each node, we keep track of a path. Because of guarantees we get + // from the algorithm, we know that paths can safely share tails. So to + // represent the tail of a path, we can just point to another node (or + // nowhere if the path ends). The head of a path is tougher, because a + // node's path can be empty. The node may not be on its own path. It is not + // immediately clear from analyzing the algorithm whether a node can have a + // nonempty path that it itself is not on, or be the tail of another node's + // path without being on that path. To support those cases, we also give + // each node a flag for whether it is on its own path. + + // TODO: should we template this on an integer size so we can fit more work + // in less memory bandwidth when possible? + using number_t = size_t; + assert(node_count < numeric_limits::max()); + + /// This defines the data we track for each node in the graph + struct TsinNode { + /// When in the DFS were we first visited? + number_t dfs_counter; + /// When in the DFS were we last visited? + /// Needed for finding replacement neighbors to implement path range + /// absorption in part 1.3, when we're asked for a range to a neighbor + /// that got eaten. + number_t dfs_exit; + /// What is our "low point" in the search. This is the earliest + /// dfs_counter for a node that this node or any node in its DFS + /// subtree has a back-edge to. + number_t low_point; + /// What is the effective degree of this node in the graph with all the + /// absorb-eject modifications applied? + number_t effective_degree = 0; + /// What node has the continuation of this node's path? If equal to + /// numeric_limits::max(), the path ends after here. + /// The node's path is the path from this node, into its DFS subtree, + /// to (one of) the nodes in the subtree that has the back-edge that + /// caused this node's low point to be so low. Basically a low point + /// traceback. + number_t path_tail; + /// Is this node actually on its own path? + /// Nodes can be removed from their paths if those nodes don't matter + /// any more (i.e. got absorbed) but their paths still need to be tails + /// for other paths. + bool is_on_path; + /// Has the node been visited yet? Must be 0. TODO: Move to its own + /// vector to make zeroing them all free-ish with page table + /// shenanigans. + bool visited = false; + }; + + // We need to have all the nodes pre-allocated, so node references don't + // invalidate when we follow edges. + vector nodes(node_count); + + // We need to say how to absorb-eject along a whole path. + // + // We let you specify the node to absorb into; if it isn't + // numeric_limits::max(), it is assumed to be the first node, and + // actually on the path, and path_start (if itself on its path) is also + // absorbed into it. This lets you absorb into a path with something + // prepended, without constructing the path. + // + // Similarly, we let you specify a past end to stop before. If this isn't + // numeric_limits::max(), we stop and don't absorb the specified + // node, if we reach it. 
This lets us implement absorbing a range of a + // path, as called for in the algorithm. + // + // If you specify a past_end, and we never reach it, but also don't have + // just a single-node, no-edge "null" path, then something has gone wrong + // and we've violated known truths about the algorithm. + auto absorb_all_along_path = [&](number_t into, number_t path_start, number_t path_past_end) { + +#ifdef debug + cerr << "(Absorbing all along path into " << into << " from " << path_start << " to before " << path_past_end << ")" << endl; +#endif + + // Set this to false as soon as we cross an edge + bool path_null = true; + + number_t here = path_start; + while (here != path_past_end) { + // Until we hit the end of the path + +#ifdef debug + cerr << "(\tAt " << here << ")" << endl; +#endif + + if (here == numeric_limits::max()) { + // We hit the end of the path and never saw path_past_end. + +#ifdef debug + cerr << "(\t\tReached path end and missed waypoint)" << endl; +#endif + // Only allowed if the path was actually edge-free and no merges needed to happen. + assert(path_null); + +#ifdef debug + cerr << "(\t\t\tBut path was empty of edges)" << endl; +#endif + // Stop now. + break; + } + + // Find the node we are at + auto& here_node = nodes[here]; + + if (here_node.is_on_path) { + // We're actually on the path. + +#ifdef debug + cerr << "(\t\tOn path)" << endl; +#endif + + if (into == numeric_limits::max()) { + // We haven't found a first node to merge into yet; it is + // this one. + +#ifdef debug + cerr << "(\t\tUse as into)" << endl; +#endif + + into = here; + } else { + // We already have a first node to merge into, so merge. + +#ifdef debug + cerr << "(\t\tMerge with " << into << ")" << endl; +#endif + + // We are doing a merge! We'd better actually find the + // ending range bound, or something is wrong with our + // implementation of the algorithm. + path_null = false; + + // Update the effective degrees as if we merged this node + // with the connected into node. + nodes[into].effective_degree = (nodes[into].effective_degree + + here_node.effective_degree - 2); + + // Merge us into the same 3 edge connected component + same_component(into, here); + } + } + + // Advance to the tail of the path + here = here_node.path_tail; + +#ifdef debug + cerr << "(\t\tNext: " << here << ")" << endl; +#endif + + } + +#ifdef debug + cerr << "(Done absorbing)" << endl; +#endif + + }; + + // For debugging, we need to be able to dump a node's stored path + auto path_to_string = [&](number_t node) { + stringstream s; + + number_t here = node; + bool first = true; + while (here != numeric_limits::max()) { + if (nodes[here].is_on_path) { + if (first && nodes[here].path_tail == numeric_limits::max()) { + // Just a single node, no edge + s << "(just " << here << ")"; + break; + } + + if (first) { + first = false; + } else { + s << "-"; + } + s << here; + } + here = nodes[here].path_tail; + } + + return s.str(); + }; + + // We need a DFS stack that we manage ourselves, to avoid stack-overflowing + // as we e.g. walk along big cycles. + struct DFSStackFrame { + /// Track the node that this stack frame represents + number_t current; + /// Track all the neighbors left to visit. + /// When we visit a neighbor we pop it off the back. + vector neighbors; + /// When we look at the neighbors, we need to be able to tell the tree + /// edge to the parent from further back edges to the parent. 
So we + /// have a flag for whether we have seen the parent tree edge already, + /// and the first neighbors entry that is our parent will get called + /// the tree edge. + bool saw_parent_tree_edge = false; + /// Track whether we made a recursive DFS call into the last neighbor + /// or not. If we did, we need to do some work when we come out of it + /// and return to this frame. + bool recursing = false; + }; + + vector stack; + + // We need a way to produce unvisited nodes when we run out of nodes in a + // connected component. This will always point to the next unvisited node + // in order. If it points to node_count, all nodes are visited. When we + // fisit this node, we have to scan ahead for the next unvisited node, in + // number order. + number_t next_unvisited = 0; + + // We also keep a global DFS counter, so we don't have to track parent + // relationships when filling it in on the nodes. + // + // The paper starts it at 1, so we do too. + number_t dfs_counter = 1; + + while (next_unvisited != node_count) { + // We haven't visited everything yet. + if (!nodes[first_root].visited) { + // If possible start at the suggested root + stack.emplace_back(); + stack.back().current = first_root; + } else { + // Stack up the next unvisited node. + stack.emplace_back(); + stack.back().current = next_unvisited; + } + +#ifdef debug + cerr << "Root a search at " << stack.back().current << endl; +#endif + + while (!stack.empty()) { + // While there's still nodes on the DFS stack from the last component we broke into + // Grab the stack frame. + // Note that this reference will be invalidated if we add stuff to the stack! + auto& frame = stack.back(); + // And the current node + auto& node = nodes[frame.current]; + + if (!node.visited) { + // This is the first time we are in this stack frame. We need + // to do the initial visit of the node and set up the frame + // with the list of edges to do. + node.visited = true; + +#ifdef debug + cerr << "First visit of node " << frame.current << endl; +#endif + + if (frame.current == next_unvisited) { + // We need to find the next unvisited node, if any, since + // we just visited what it used to be. + do { + next_unvisited++; + } while (next_unvisited != node_count && nodes[next_unvisited].visited); + } + + node.dfs_counter = dfs_counter; + dfs_counter++; + node.low_point = node.dfs_counter; + // Make sure the node's path is just itself + node.path_tail = numeric_limits::max(); + node.is_on_path = true; + +#ifdef debug + cerr << "\tDFS: " << node.dfs_counter + << " low point: " << node.low_point + << " degree: " << node.effective_degree + << " path: " << path_to_string(frame.current) << endl; +#endif + + // Stack up all the edges to follow. + for_each_connected_node(frame.current, [&](size_t connected) { + frame.neighbors.push_back(connected); + }); + +#ifdef debug + cerr << "\tPut " << frame.neighbors.size() << " edges on to do list" << endl; +#endif + + // Now we're in a state where we can process edges. + // So kick back to the work loop as if we just processed an edge. + continue; + } else { + // We have (possibly 0) edges left to do for this node. + if (!frame.neighbors.empty()) { + +#ifdef debug + cerr << "Return to node " << frame.current << " with more edges to do" << endl; + + cerr << "\tDFS: " << node.dfs_counter + << " low point: " << node.low_point + << " degree: " << node.effective_degree + << " path: " << path_to_string(frame.current) << endl; +#endif + + // We have an edge to do! + // Look up the neighboring node. 
+ number_t neighbor_number = frame.neighbors.back(); + auto& neighbor = nodes[neighbor_number]; + + if (!frame.recursing) { + // This is the first time we are thinking about this neighbor. + +#ifdef debug + cerr << "\tThink of edge to neighbor " << neighbor_number << " for the first time" << endl; +#endif + + // Increment degree of the node we're coming from + node.effective_degree++; + +#ifdef debug + cerr << "\t\tBump degree to " << node.effective_degree << endl; +#endif + + if (!neighbor.visited) { + // We need to recurse on this neighbor. + +#ifdef debug + cerr << "\t\tRecurse on unvisited neighbor" << endl; +#endif + + // So remember we are recursing. + frame.recursing = true; + // And set up the recursive frame. + stack.emplace_back(); + stack.back().current = neighbor_number; + // Kick back to the work loop; we will see the + // unvisited node on top of the stack and do its + // visit and add its edges to its to do list. + } else { + // No need to recurse.This is either a back-edge or the back side of the tree edge to the parent. + + if (stack.size() > 1 && neighbor_number == stack[stack.size() - 2].current && !frame.saw_parent_tree_edge) { + // This is the edge we took to get here (tree edge) +#ifdef debug + cerr << "\t\tNeighbor is parent; this is the tree edge in." << endl; +#endif + + // For tree edges, since they aren't either kind of back edge, neither 1.2 nor 1.3 fires. + // But the next edge to the parent will be a back edge. + frame.saw_parent_tree_edge = true; + } else if (neighbor.dfs_counter < node.dfs_counter) { + // The edge to the neighbor is an outgoing + // back-edge (i.e. the neighbor was visited + // first). Paper step 1.2. + +#ifdef debug + cerr << "\t\tNeighbor is upstream of us (outgoing back edge)." << endl; +#endif + + if (neighbor.dfs_counter < node.low_point) { + // The neighbor is below our low point. + +#ifdef debug + cerr << "\t\t\tNeighbor has a lower low point (" + << neighbor.dfs_counter << " < " << node.low_point << ")" << endl; + + cerr << "\t\t\t\tAbsorb along path to old low point source" << endl; +#endif + + // Absorb along our whole path. + absorb_all_along_path(numeric_limits::max(), + frame.current, + numeric_limits::max()); + + // Adopt the neighbor's DFS counter as our + // new, lower low point. + node.low_point = neighbor.dfs_counter; + +#ifdef debug + cerr << "\t\t\t\tNew lower low point " << node.low_point << endl; +#endif + + // Our path is now just us. + node.is_on_path = true; + node.path_tail = numeric_limits::max(); + +#ifdef debug + cerr << "\t\t\t\tNew path " << path_to_string(frame.current) << endl; +#endif + + } else { + +#ifdef debug + cerr << "\t\t\tWe have a sufficiently low low point" << endl; +#endif + + } + } else if (node.dfs_counter < neighbor.dfs_counter) { + // The edge to the neighbor is an incoming + // back-edge (i.e. we were visited first, but + // we recursed into something that got us to + // this neighbor already). Paper step 1.3. + +#ifdef debug + cerr << "\t\tWe are upstream of neighbor (incoming back edge)." << endl; +#endif + + // Drop our effective degree by 2 (I think + // we're closing a cycle or something?) + node.effective_degree -= 2; + +#ifdef debug + cerr << "\t\t\tDrop degree to " << node.effective_degree << endl; + + cerr << "\t\t\tWant to absorb along path towards low point source through neighbor" << endl; +#endif + + // Now, the algorithm says to absorb + // "P_w[w..u]", a notation that it does not + // rigorously define. w is here, and u is the + // neighbor. 
The neighbor is not necessarily + // actually *on* our path at this point, not + // least of which because the neighbor may have + // already been eaten and merged into another + // node, which in theory adopted the back edge + // we are looking at. In practice we don't have + // the data structure to find that node. So + // here's the part where we have to do + // something clever to "allow certain paths + // that we track to traverse the stolen edges". + + // What we have to do is find the node that + // *is* along our path that either is or ate + // the neighbor. We don't track the union-find + // logic we would need to answer that question, + // but both 2007 algorithm implementations I've + // seen deal with this by tracking DFS counter + // intervals/subtree sizes, and deciding that + // the last thin on our path visited no later + // than the neighbor, and exited no earlier + // than the neighbor (i.e. the last ancestor of + // the neighbor on our path) should be our + // replacement neighbor. + + // This makes sense because if the neighbor + // merged into anything, it's an ancestor of + // the neighbor. So we go looking for it. + + // TODO: let absorb_all_along_path do this instead? + + // Start out with ourselves as the replacement neighbor ancestor. + number_t replacement_neighbor_number = frame.current; + // Consider the next candidate + number_t candidate = nodes[replacement_neighbor_number].path_tail; + while (candidate != numeric_limits::max() && + nodes[candidate].dfs_counter <= neighbor.dfs_counter && + nodes[candidate].dfs_exit >= neighbor.dfs_exit) { + + // This candidate is a lower ancestor of the neighbor, so adopt it. + replacement_neighbor_number = candidate; + candidate = nodes[replacement_neighbor_number].path_tail; + } + + auto& replacement_neighbor = nodes[replacement_neighbor_number]; + +#ifdef debug + cerr << "\t\t\tNeighbor currently belongs to node " << replacement_neighbor_number << endl; + + cerr << "\t\t\tAbsorb along path towards low point source through there" << endl; +#endif + + // Absorb along our path from ourselves to the + // replacement neighbor, inclusive. + // Ignores trivial paths. + absorb_all_along_path(numeric_limits::max(), + frame.current, + replacement_neighbor.path_tail); + + // We also have to (or at least can) adopt the + // path of the replacement neighbor as our own + // path now. That's basically the rest of the + // path that we didn't merge. + // This isn't mentioned in the paper either, + // but I've seen the official implementation do + // it, and if we don't do it our path is going + // to go through a bunch of stuff we already + // merged, and waste time when we merge again. + + // If we ever merge us down our path again, + // continue with the part we didn't already + // eat. + node.path_tail = replacement_neighbor.path_tail; + } else { + // The other possibility is the neighbor is just + // us. Officially self loops aren't allowed, so + // we censor the edge. + +#ifdef debug + cerr << "\t\tWe are neighbor (self loop). Hide edge!" << endl; +#endif + + node.effective_degree--; + } + + // Clean up the neighbor from the to do list; we + // finished it without recursing. + frame.neighbors.pop_back(); + + // Kick back to the work loop to do the next + // neighbor, if any. + } + } else { + // We have returned from a recursive call on this neighbor. 
+ +#ifdef debug + cerr << "\tReturned from recursion on neighbor " << neighbor_number << endl; +#endif + + // Support bridge edges: detect if we are returning + // across a bridge edge and censor it. Norouzi and Tsin + // 2014 as written in the paper assumes no bridge + // edges, and what we're about to do relies on all + // neighbors connecting back somewhere. + if (neighbor.low_point == neighbor.dfs_counter) { + // It has no back-edges out of its own subtree, so it must be across a bridge. +#ifdef debug + cerr << "\t\tNeighbor is across a bridge edge! Hide edge!" << endl; +#endif + + // Hide the edge we just took from degree calculations. + neighbor.effective_degree--; + node.effective_degree--; + + // Don't do anything else with the edge + } else { + // Wasn't a bridge edge, so we care about more than just traversing that part of the graph. + + // Do steps 1.1.1 and 1.1.2 of the algorithm as described in the paper. + if (neighbor.effective_degree == 2) { + // This neighbor gets absorbed and possibly ejected. + +#ifdef debug + cerr << "\t\tNeighbor is on a stick" << endl; + + cerr << "\t\t\tEdge " << frame.current << "-" << neighbor_number << " should never be seen again" << endl; +#endif + + // Take it off of its own path. + neighbor.is_on_path = false; + +#ifdef debug + cerr << "\t\t\tNew neighbor path: " << path_to_string(neighbor_number) << endl; +#endif + } + + // Because we hid the bridge edges, degree 1 nodes should never happen + assert(neighbor.effective_degree != 1); + + if (node.low_point <= neighbor.low_point) { + +#ifdef debug + cerr << "\t\tWe have a sufficiently low low point; neighbor comes back in in our subtree" << endl; + + cerr << "\t\t\tAbsorb us and then the neighbor's path to the end" << endl; +#endif + + // Absorb all along the path starting with here and + // continuing with this neighbor's path, to the + // end. + absorb_all_along_path(frame.current, + neighbor_number, + numeric_limits::max()); + } else { +#ifdef debug + cerr << "\t\tNeighbor has a lower low point (" + << neighbor.low_point << " < " << node.low_point << "); comes back in outside our subtree" << endl; +#endif + + // Lower our low point to that of the neighbor + node.low_point = neighbor.low_point; + +#ifdef debug + cerr << "\t\t\tNew low point: " << node.low_point << endl; + + cerr << "\t\t\tAbsorb along path to old low point soure" << endl; +#endif + + // Absorb all along our own path + absorb_all_along_path(numeric_limits::max(), + frame.current, + numeric_limits::max()); + // Adjust our path to be us and then our neighbor's path + node.is_on_path = true; + node.path_tail = neighbor_number; + +#ifdef debug + cerr << "\t\t\tNew path " << path_to_string(frame.current) << endl; +#endif + } + } + + // Say we aren't coming back from a recursive call + // anymore. + frame.recursing = false; + + // Clean up the neighbor, + frame.neighbors.pop_back(); + + // Kick back to the work loop to do the next neighbor, + // if any. + } + +#ifdef debug + cerr << "\tDFS: " << node.dfs_counter + << " low point: " << node.low_point + << " degree: " << node.effective_degree + << " path: " << path_to_string(frame.current) << endl; +#endif + + } else { + // All the neighbors left to do for this node are done. + +#ifdef debug + cerr << "\tNode is visited and no neighbors are on the to do list." << endl; + + cerr << "\tDFS: " << node.dfs_counter + << " low point: " << node.low_point + << " degree: " << node.effective_degree + << " path: " << path_to_string(frame.current) << endl; +#endif + + // This node is done. 
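                    // (The exit time recorded just below is what step 1.3
                    // above consults, via dfs_exit, to find the last ancestor
                    // of an absorbed neighbor that is still on our path.)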
+ + // Remember when we exited it + node.dfs_exit = dfs_counter; + + // Clean up the stack frame. + stack.pop_back(); + } + } + } + } + + // When we run out of unvisited nodes and the stack is empty, we've + // completed out search through all connected components of the graph. +} + +void three_edge_connected_components_dense(size_t node_count, size_t first_root, + const function&)>& for_each_connected_node, + const function&)>&)>& component_callback) { + + // Make a union-find over all the nodes + structures::UnionFind uf(node_count, true); + + // Call Tsin's Algorithm + three_edge_connected_component_merges_dense(node_count, first_root, for_each_connected_node, [&](size_t a, size_t b) { + // When it says to do a merge, do it + uf.union_groups(a, b); + }); + + for (auto& component : uf.all_groups()) { + // Call the callback for each group + component_callback([&](const function& emit_member) { + // And whrn it asks for the members + for (auto& member : component) { + // Send them all + emit_member(member); + } + }); + } +} + +void three_edge_connected_components_dense_cactus(size_t node_count, + const function&)>& for_each_connected_node, + const function&)>&)>& component_callback) { + + // Use the known good pinchesAndCacti algorithm + + // Make the stList of all the vertices, where each vertex is an stList of single element stIntTuple items that point to the ranks of connected nodes. + // When an item is removed, use the list destructor on it. + stList* vertices = stList_construct3(0, (void(*)(void *)) stList_destruct); + + // TODO: No way to hint final size to the list, and we need the individual member lists to know their destructors for their elements. + +#ifdef debug + cerr << "Running Cactus 3ecc on " << node_count << " nodes" << endl; +#endif + + for (size_t rank = 0; rank < node_count; rank++) { + while (rank >= stList_length(vertices)) { + // Make sure we have an adjacency list allocated for the node + // When an item in the node's adjacency list is destroyed, run the int tuple destructor. + stList_append(vertices, stList_construct3(0, (void(*)(void *)) stIntTuple_destruct)); + } + + for_each_connected_node(rank, [&](size_t other_rank) { +#ifdef debug + cerr << "Connect node " << rank << " to node " << other_rank << endl; +#endif + + // For each edge on the node, represent it as a 1-tuple in the node's list. + stList_append((stList*) stList_get(vertices, rank), stIntTuple_construct1((int64_t) other_rank)); + // We don't have to do the back-edge now; we will do it when we visit the other node. + }); + } + + +#ifdef debug + for (size_t i = 0; i < stList_length(vertices); i++) { + cerr << "Vertex " << i << " adjacent to:"; + stList* adjacencies = (stList*) stList_get(vertices, i); + for (size_t j = 0; j < stList_length(adjacencies); j++) { + stIntTuple* adj = (stIntTuple*) stList_get(adjacencies, j); + cerr << " " << stIntTuple_get(adj, 0); + } + cerr << endl; + } +#endif + + // Now we have the graph in the format Tsin's Algorithm wants, so run it. + // The components come out as a list of lists, one for each component, with + // the entries in each component's list being 1-element stIntTuples with + // ranks in them. 
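+    // computeThreeEdgeConnectedComponents comes from the pinchesAndCacti
+    // library; we own the stList it returns and free it (along with the
+    // vertex adjacency lists) with stList_destruct once every component has
+    // been announced.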
+ stList* components = computeThreeEdgeConnectedComponents(vertices); + +#ifdef debug + cerr << "Got back " << stList_length(components) << " components" << endl; +#endif + + for(size_t i = 0; i < stList_length(components); i++) { + // For each component + stList* component = (stList*) stList_get(components, i); + // Announce the component + component_callback([&](const function& visit_member) { + // And when we get the function to feed the members to + for (size_t j = 0; j < stList_length(component); j++) { +#ifdef debug + cerr << "Component " << i << " contains node " << stIntTuple_get((stIntTuple*) stList_get(component, j), 0) << endl; +#endif + + // Call it with each member + visit_member(stIntTuple_get((stIntTuple*) stList_get(component, j), 0)); + } + }); + } + + // Clean up the component result + stList_destruct(components); + + // Clean up the vertex data + stList_destruct(vertices); +} + +} +} diff --git a/src/algorithms/three_edge_connected_components.hpp b/src/algorithms/three_edge_connected_components.hpp new file mode 100644 index 00000000000..ed036026709 --- /dev/null +++ b/src/algorithms/three_edge_connected_components.hpp @@ -0,0 +1,177 @@ +#ifndef VG_ALGORITHMS_THREE_EDGE_CONNECTED_COMPONENTS_HPP_INCLUDED +#define VG_ALGORITHMS_THREE_EDGE_CONNECTED_COMPONENTS_HPP_INCLUDED + +#include +#include +#include + +namespace vg { +namespace algorithms { + +using namespace std; + +// Interface + +/** + * Get the three-edge-connected components of an arbitrary graph (not + * necessarily a handle graph). Only recognizes one kind of edge and one kind + * of node. Nodes are arbitrary value types (which may need to be hashable). + * + * Takes a function that loops an iteratee over all nodes, and a function that, + * given a node, loops an iteratee over all nodes connected to it. + * + * For each component identified, calls the given callback with a function that + * iterates over all nodes in the component. + * + * If you have a graph where you can easily rank the nodes, don't use this. Use + * three_edge_connected_components_dense() instead. The first thing this + * function does is asign nodes a dense, 0-based rank space. + */ +template +void three_edge_connected_components(const function&)>& for_each_node, + const function&)>& for_each_connected_node, + const function&)>&)>& component_callback); + +/** + * Get the three-edge-connected components of an arbitrary graph (not + * necessarily a handle graph). Only recognizes one kind of edge and one kind + * of node. Nodes are arbitrary value types (which may need to be hashable). + * + * Takes a function that loops an iteratee over all nodes, and a function that, + * given a node, loops an iteratee over all nodes connected to it. + * + * Calls same_component with pairs of nodes in (at least) a spanning tree of + * the set of nodes in each component (not restricted to the input graph). + * Doing merge operations on a union-find can get you the set of components. + * The callback MUST NOT modify the graph! + * + * If you have a graph where you can easily rank the nodes, don't use this. Use + * three_edge_connected_components_dense() instead. The first thing this + * function does is asign nodes a dense, 0-based rank space. + */ +template +void three_edge_connected_component_merges(const function&)>& for_each_node, + const function&)>& for_each_connected_node, + const function& same_component); + + +/** + * Get the three-edge-connected components of an arbitrary graph (not + * necessarily a handle graph). 
Only recognizes one kind of edge and one kind + * of node. Nodes are dense positive integers starting with 0. + * + * Takes a total node count, a suggested root (or 0), and a function that, + * given a node, loops an iteratee over all nodes connected to it. + * + * Calls same_component with pairs of nodes in (at least) a spanning tree of + * the set of nodes in each component (not restricted to the input graph). + * Doing merge operations on a union-find can get you the set of components. + * The callback MUST NOT modify the graph! + */ +void three_edge_connected_component_merges_dense(size_t node_count, size_t first_root, + const function&)>& for_each_connected_node, + const function& same_component); + +/** + * Get the three-edge-connected components of an arbitrary graph (not + * necessarily a handle graph). Only recognizes one kind of edge and one kind + * of node. Nodes are dense positive integers starting with 0. + * + * Takes a total node count, a suggested root (or 0), and a function that, + * given a node, loops an iteratee over all nodes connected to it. + * + * For each component identified, calls the given callback with a function that + * iterates over all nodes in the component. + */ +void three_edge_connected_components_dense(size_t node_count, size_t first_root, + const function&)>& for_each_connected_node, + const function&)>&)>& component_callback); + +/** + * Get the three-edge-connected components of an arbitrary graph (not + * necessarily a handle graph). Only recognizes one kind of edge and one kind + * of node. Nodes are dense positive integers starting with 0. + * + * Wraps the known good the 3 edge connected components algorithm from the + * pinchesAndCacti library. + * + * Takes a total node count, and a function that, given a node, loops an + * iteratee over all nodes connected to it. + * + * For each component identified, calls the given callback with a function that + * iterates over all nodes in the component. + */ +void three_edge_connected_components_dense_cactus(size_t node_count, + const function&)>& for_each_connected_node, + const function&)>&)>& component_callback); + +// Implementation + +template +void three_edge_connected_components(const function&)>& for_each_node, + const function&)>& for_each_connected_node, + const function&)>&)>& component_callback) { + + // Convert to small positive integers + vector rank_to_node; + unordered_map node_to_rank; + + for_each_node([&](TECCNode node) { + // Populate the rank/node translation. + // TODO: can we condense this? + node_to_rank[node] = rank_to_node.size(); + rank_to_node.push_back(node); + }); + + three_edge_connected_components_dense(rank_to_node.size(), 0, [&](size_t rank, const function visit_connected) { + // Translate the rank we are asked about into a node + for_each_connected_node(rank_to_node[rank], [&](TECCNode connected) { + // And translate the node back into a rank + visit_connected(node_to_rank[connected]); + }); + }, [&](const function&)>& for_each_component_member) { + // When we get a component + // Call our component callback with a function that takes the iteratee + component_callback([&](const function& iteratee) { + for_each_component_member([&](size_t member) { + // And for each member of the component we got, translate it and send it off. 
+ iteratee(rank_to_node[member]); + }); + }); + }); +} + +template +void three_edge_connected_component_merges(const function&)>& for_each_node, + const function&)>& for_each_connected_node, + const function& same_component) { + + // Convert to small positive integers + vector rank_to_node; + unordered_map node_to_rank; + + for_each_node([&](TECCNode node) { + // Populate the rank/node translation. + // TODO: can we condense this? + node_to_rank[node] = rank_to_node.size(); + rank_to_node.push_back(node); + }); + + three_edge_connected_component_merges_dense(rank_to_node.size(), 0, [&](size_t rank, const function visit_connected) { + // Translate the rank we are asked about into a node + for_each_connected_node(rank_to_node[rank], [&](TECCNode connected) { + // And translate the node back into a rank + visit_connected(node_to_rank[connected]); + }); + }, [&](size_t a, size_t b) { + // When we find out two nodes should be in the same component + // Call our merge callback + same_component(rank_to_node[a], rank_to_node[b]); + }); +} + + +} +} + +#endif diff --git a/src/algorithms/topological_sort.cpp b/src/algorithms/topological_sort.cpp deleted file mode 100644 index c94a0530b55..00000000000 --- a/src/algorithms/topological_sort.cpp +++ /dev/null @@ -1,374 +0,0 @@ -#include "topological_sort.hpp" - -namespace vg { -namespace algorithms { - -using namespace std; - -vector head_nodes(const HandleGraph* g) { - vector to_return; - g->for_each_handle([&](const handle_t& found) { - // For each (locally forward) node - - bool no_left_edges = true; - g->follow_edges(found, true, [&](const handle_t& ignored) { - // We found a left edge! - no_left_edges = false; - // We only need one - return false; - }); - - if (no_left_edges) { - to_return.push_back(found); - } - }); - - return to_return; - -} - -vector tail_nodes(const HandleGraph* g) { - vector to_return; - g->for_each_handle([&](const handle_t& found) { - // For each (locally forward) node - - bool no_right_edges = true; - g->follow_edges(found, false, [&](const handle_t& ignored) { - // We found a right edge! - no_right_edges = false; - // We only need one - return false; - }); - - if (no_right_edges) { - to_return.push_back(found); - } - }); - - return to_return; - -} - -vector topological_order(const HandleGraph* g) { - - // Make a vector to hold the ordered and oriented nodes. - vector sorted; - sorted.reserve(g->node_size()); - - // Instead of actually removing edges, we add them to this set of masked edges. - unordered_set> masked_edges; - - // This (s) is our set of oriented nodes. - // using a map instead of a set ensures a stable sort across different systems - map s; - - // We find the head and tails, if there are any - vector heads{head_nodes(g)}; - // No need to fetch the tails since we don't use them - - - // Maps from node ID to first orientation we suggested for it. - map seeds; - - - for(handle_t& head : heads) { - // Dump all the heads into the oriented set, rather than having them as - // seeds. We will only go for cycle-breaking seeds when we run out of - // heads. This is bad for contiguity/ordering consistency in cyclic - // graphs and reversing graphs, but makes sure we work out to just - // topological sort on DAGs. It mimics the effect we used to get when we - // joined all the head nodes to a new root head node and seeded that. We - // ignore tails since we only orient right from nodes we pick. - s[g->get_id(head)] = head; - } - - // We will use an ordered map handles by ID for nodes we have not visited - // yet. 
This ensures a consistent sort order across systems. - map unvisited; - g->for_each_handle([&](const handle_t& found) { - if (!s.count(g->get_id(found))) { - // Only nodes that aren't yet in s are unvisited. - // Nodes in s are visited but just need to be added tot he ordering. - unvisited.emplace(g->get_id(found), found); - } - }); - - while(!unvisited.empty() || !s.empty()) { - - // Put something in s. First go through seeds until we can find one - // that's not already oriented. - while(s.empty() && !seeds.empty()) { - // Look at the first seed - auto first_seed = (*seeds.begin()).second; - - if(unvisited.count(g->get_id(first_seed))) { - // We have an unvisited seed. Use it -#ifdef debug -#pragma omp critical (cerr) - cerr << "Starting from seed " << g->get_id(first_seed) << " orientation " << g->get_is_reverse(first_seed) << endl; -#endif - - s[g->get_id(first_seed)] = first_seed; - unvisited.erase(g->get_id(first_seed)); - } - // Whether we used the seed or not, don't keep it around - seeds.erase(seeds.begin()); - } - - if(s.empty()) { - // If we couldn't find a seed, just grab any old node. - // Since map order is stable across systems, we can take the first node by id and put it locally forward. -#ifdef debug -#pragma omp critical (cerr) - cerr << "Starting from arbitrary node " << unvisited.begin()->first << " locally forward" << endl; -#endif - - s[unvisited.begin()->first] = unvisited.begin()->second; - unvisited.erase(unvisited.begin()->first); - } - - while (!s.empty()) { - // Grab an oriented node - auto n = s.begin()->second; - s.erase(g->get_id(n)); - // Emit it - sorted.push_back(n); -#ifdef debug -#pragma omp critical (cerr) - cerr << "Using oriented node " << g->get_id(n) << " orientation " << g->get_is_reverse(n) << endl; -#endif - - // See if it has an edge from its start to the start of some node - // where both were picked as places to break into cycles. A - // reversing self loop on a cycle entry point is a special case of - // this. - g->follow_edges(n, true, [&](const handle_t& prev_node) { - if(!unvisited.count(g->get_id(prev_node))) { - // Look at the edge - auto edge = g->edge_handle(prev_node, n); - if (masked_edges.count(edge)) { - // We removed this edge, so skip it. - return; - } - -#ifdef debug -#pragma omp critical (cerr) - cerr << "\tHas left-side edge to cycle entry point " << g->get_id(prev_node) - << " orientation " << g->get_is_reverse(prev_node) << endl; -#endif - - // Mask the edge - masked_edges.insert(edge); - -#ifdef debug -#pragma omp critical (cerr) - cerr << "\t\tEdge: " << g->get_id(edge.first) << " " << g->get_is_reverse(edge.first) - << " -> " << g->get_id(edge.second) << " " << g->get_is_reverse(edge.second) << endl; -#endif - } - }); - - // All other connections and self loops are handled by looking off the right side. - - // See what all comes next, minus deleted edges. - g->follow_edges(n, false, [&](const handle_t& next_node) { - - // Look at the edge - auto edge = g->edge_handle(n, next_node); - if (masked_edges.count(edge)) { - // We removed this edge, so skip it. 
- return; - } - -#ifdef debug -#pragma omp critical (cerr) - cerr << "\tHas edge to " << g->get_id(next_node) << " orientation " << g->get_is_reverse(next_node) << endl; -#endif - - // Mask the edge connecting these nodes in this order and - // relative orientation, so we can't traverse it again - -#ifdef debug -#pragma omp critical (cerr) - cerr << "\t\tEdge: " << g->get_id(edge.first) << " " << g->get_is_reverse(edge.first) - << " -> " << g->get_id(edge.second) << " " << g->get_is_reverse(edge.second) << endl; -#endif - - // Mask the edge - masked_edges.insert(edge); - - if(unvisited.count(g->get_id(next_node))) { - // We haven't already started here as an arbitrary cycle entry point - -#ifdef debug -#pragma omp critical (cerr) - cerr << "\t\tAnd node hasn't been visited yet" << endl; -#endif - - bool unmasked_incoming_edge = false; - g->follow_edges(next_node, true, [&](const handle_t& prev_node) { - // Get a handle for each incoming edge - auto prev_edge = g->edge_handle(prev_node, next_node); - - if (!masked_edges.count(prev_edge)) { - // We found such an edghe and can stop looking - unmasked_incoming_edge = true; - return false; - } - // Otherwise check all the edges on the left of this handle - return true; - }); - - if(!unmasked_incoming_edge) { - -#ifdef debug -#pragma omp critical (cerr) - cerr << "\t\t\tIs last incoming edge" << endl; -#endif - // Keep this orientation and put it here - s[g->get_id(next_node)] = next_node; - // Remember that we've visited and oriented this node, so we - // don't need to use it as a seed. - unvisited.erase(g->get_id(next_node)); - - } else if(!seeds.count(g->get_id(next_node))) { - // We came to this node in this orientation; when we need a - // new node and orientation to start from (i.e. an entry - // point to the node's cycle), we might as well pick this - // one. - // Only take it if we don't already know of an orientation for this node. - seeds[g->get_id(next_node)] = next_node; - -#ifdef debug -#pragma omp critical (cerr) - cerr << "\t\t\tSuggests seed " << g->get_id(next_node) << " orientation " << g->get_is_reverse(next_node) << endl; -#endif - } - } else { -#ifdef debug -#pragma omp critical (cerr) - cerr << "\t\tAnd node was already visited (to break a cycle)" << endl; -#endif - } - }); - } - } - - // Send away our sorted ordering. 
- return sorted; -} - -vector lazy_topological_order_internal(const HandleGraph* g, bool lazier) { - - // map that will contain the orientation and the in degree for each node - unordered_map inward_degree; - inward_degree.reserve(g->node_size()); - - // stack for the traversal - vector stack; - - if (lazier) { - // take the locally forward orientation as a single stranded orientation - g->for_each_handle([&](const handle_t& handle) { - int64_t& degree = inward_degree[handle]; - g->follow_edges(handle, true, [&](const handle_t& ignored) { - degree++; - }); - // initialize the stack with head nodes - if (degree == 0) { - stack.emplace_back(handle); - } - }); - } - else { - // get an orientation over which we can consider the graph single stranded - vector orientation = single_stranded_orientation(g); - - if (orientation.size() != g->node_size()) { - cerr << "error:[algorithms] attempting to use lazy topological sort on unorientable graph" << endl; - exit(1); - } - - // compute the degrees by following the edges backward - for (auto& handle : orientation) { - int64_t& degree = inward_degree[handle]; - g->follow_edges(handle, true, [&](const handle_t& ignored) { - degree++; - }); - // initialize the stack with head nodes - if (degree == 0) { - stack.emplace_back(handle); - } - } - } - - // the return value - vector order; - order.reserve(g->node_size()); - - while (!stack.empty()) { - // get a head node off the queue - handle_t here = stack.back(); - stack.pop_back(); - - // add it to the topological order - order.push_back(here); - - // remove its outgoing edges - g->follow_edges(here, false, [&](const handle_t& next) { - - auto iter = inward_degree.find(next); - // we should never be able to reach the opposite orientation of a node - assert(iter != inward_degree.end()); - // implicitly remove the edge - iter->second--; - if (iter->second == 0) { - // after removing this edge, the node is now a head, add it to the queue - stack.push_back(next); - } - }); - } - - if (order.size() != g->node_size()) { - cerr << "error:[algorithms] lazy topological sort is invalid on non-DAG graph, cannot complete algorithm" << endl; - exit(1); - } - - return order; -} - - -vector lazy_topological_order(const HandleGraph* g) { - return lazy_topological_order_internal(g, false); -} - -vector lazier_topological_order(const HandleGraph* g) { - return lazy_topological_order_internal(g, true); -} -void sort(MutableHandleGraph* g) { - if (g->node_size() <= 1) { - // A graph with <2 nodes has only one sort. - return; - } - - // No need to modify the graph; topological_sort is guaranteed to be stable. - - // Topologically sort, which orders and orients all the nodes, and apply the order to the backing graph - apply_ordering(g, topological_order(g)); -} - -void lazy_sort(MutableHandleGraph* g) { - apply_ordering(g, lazy_topological_order(g)); -} - -void lazier_sort(MutableHandleGraph* g) { - apply_ordering(g, lazier_topological_order(g)); -} - -unordered_set orient_nodes_forward(MutableHandleGraph* g) { - // Topologically sort, which orders and orients all the nodes, and apply the orientations to the backing graph. 
- return apply_orientations(g, topological_order(g)); -} - -} -} diff --git a/src/algorithms/topological_sort.hpp b/src/algorithms/topological_sort.hpp deleted file mode 100644 index 0326fb999aa..00000000000 --- a/src/algorithms/topological_sort.hpp +++ /dev/null @@ -1,117 +0,0 @@ -#ifndef VG_ALGORITHMS_TOPOLOGICAL_SORT_HPP_INCLUDED -#define VG_ALGORITHMS_TOPOLOGICAL_SORT_HPP_INCLUDED - -/** - * \file topological_sort.hpp - * - * Defines a topological sort algorithm for handle graphs. - */ - -#include - -#include "../position.hpp" -#include "../cached_position.hpp" -#include "../vg.pb.h" -#include "../hash_map.hpp" -#include "../handle.hpp" -#include "apply_bulk_modifications.hpp" -#include "is_single_stranded.hpp" - -namespace vg { -namespace algorithms { - -using namespace std; - -/// Find all of the nodes with no edges on their left sides. -vector head_nodes(const HandleGraph* g); - -/// Find all of the nodes with no edges on their right sides. -vector tail_nodes(const HandleGraph* g); - -/** - * Order and orient the nodes in the graph using a topological sort. The sort is - * guaranteed to be machine-independent given the initial graph's node and edge - * ordering. The algorithm is well-defined on non-DAG graphs, but the order is - * necessarily not a topological order. - * - * We use a bidirected adaptation of Kahn's topological sort (1962), which can handle components with no heads or tails. - * - * L ↠Empty list that will contain the sorted and oriented elements - * S ↠Set of nodes which have been oriented, but which have not had their downstream edges examined - * N ↠Set of all nodes that have not yet been put into S - * - * while N is nonempty do - * remove a node from N, orient it arbitrarily, and add it to S - * (In practice, we use "seeds": the heads all in a batch at the start, and any - * nodes we have seen that had too many incoming edges) - * while S is non-empty do - * remove an oriented node n from S - * add n to tail of L - * for each node m with an edge e from n to m do - * remove edge e from the graph - * if m has no other edges to that side then - * orient m such that the side the edge comes to is first - * remove m from N - * insert m into S - * otherwise - * put an oriented m on the list of arbitrary places to start when S is empty - * (This helps start at natural entry points to cycles) - * return L (a topologically sorted order and orientation) - */ -vector topological_order(const HandleGraph* g); - -/** - * Order the nodes in a graph using a topological sort. The sort is NOT guaranteed - * to be machine-independent, but it is faster than topological_order(). This algorithm - * is invalid in a graph that has any cycles. For safety, consider this property with - * algorithms::is_directed_acyclic(). - */ -vector lazy_topological_order(const HandleGraph* g); - -/** - * Order the nodes in a graph using a topological sort. Similar to lazy_topological_order - * but somewhat faster. The algorithm is invalid in a graph that has any cycles or - * any reversing edges. For safety, consider these properties with algorithms::is_acyclic() - * and algorithms::is_single_stranded(). - */ -vector lazier_topological_order(const HandleGraph* g); - -/** - * Topologically sort the given handle graph, and then apply that sort to re- - * order the nodes of the graph. The sort is guaranteed to be stable. This sort is well-defined - * on graphs that are not DAGs, but instead of finding a topological sort ti does a heuristic - * sort to minimize a feedback arc set. 
- */ -void sort(MutableHandleGraph* g); - -/** - * Topologically sort the given handle graph, and then apply that sort to re- - * order the nodes of the graph. The sort is NOT guaranteed to be stable or - * machine-independent, but it is faster than sort(). This algorithm is invalid - * in a graph that has any cycles. For safety, consider checking this property with - * algorithms::is_acyclic(). - */ -void lazy_sort(MutableHandleGraph* g); - -/** - * Topologically sort the given handle graph, and then apply that sort to re- - * order the nodes of the graph. The sort is NOT guaranteed to be stable or - * machine-independent, but it is faster than sort() and somewhat faster than - * lazy_sort(). This algorithm is invalid in a graph that has any cycles or reversing - * edges. For safety, consider checking these properties with algorithms::is_single_stranded() - * and algorithms::is_acyclic(). - */ -void lazier_sort(MutableHandleGraph* g); - -/** - * Topologically sort the given handle graph, and then apply that sort to orient - * all the nodes in the global forward direction. May invalidate any paths - * stored by the graph. The re-orientation is guaranteed to be stable. - * Invalidates all handles into the graph (since any node might be flipped). - */ -unordered_set orient_nodes_forward(MutableHandleGraph* g); - -} -} - -#endif diff --git a/src/algorithms/walk.cpp b/src/algorithms/walk.cpp new file mode 100644 index 00000000000..1f6fb358417 --- /dev/null +++ b/src/algorithms/walk.cpp @@ -0,0 +1,183 @@ +#include "walk.hpp" + +#include + +namespace vg { + +namespace algorithms { + +void for_each_walk(const HandleGraph& graph, size_t k, size_t edge_max, + const std::function& lambda) { + graph.for_each_handle([&](const handle_t& h) { + // for the forward and reverse of this handle + // walk k bases from the end, so that any walk starting on the node will be represented in the tree we build + for (auto handle_is_rev : { false, true }) { + handle_t handle = handle_is_rev ? 
graph.flip(h) : h; + std::list walks; + // for each position in the node, set up a walk with that start position and the node end or walk length as the end position + // determine next positions + nid_t handle_id = graph.get_id(handle); + size_t handle_length = graph.get_length(handle); + std::string handle_seq = graph.get_sequence(handle); + for (size_t i = 0; i < handle_length; ++i) { + pos_t begin = make_pos_t(handle_id, handle_is_rev, i); + pos_t end = make_pos_t(handle_id, handle_is_rev, std::min(handle_length, i+k)); + walk_t walk = walk_t(handle_seq.substr(offset(begin), offset(end)-offset(begin)), begin, end, handle); + if (walk.seq.size() < k) { + size_t next_count = 0; + if (edge_max) graph.follow_edges(walk.curr, false, [&](const handle_t& next) { ++next_count; return next_count <= 1; }); + //walk.seq.reserve(k); // may reduce allocation costs + // follow edges if we haven't completed the walk here + if (next_count > 1 && (edge_max && edge_max == walk.forks)) { + } else { + graph.follow_edges(walk.curr, false, [&](const handle_t& next) { + walks.push_back(walk); + auto& todo = walks.back(); + todo.curr = next; + if (next_count > 1) { + ++todo.forks; + } + }); + } + } else { + walks.push_back(walk); + } + } + + // now expand the walks until they reach k + while (!walks.empty()) { + // first we check which ones have reached length k in the current handle; for each of these we run lambda and remove them from our list + auto walks_end = walks.end(); + for (std::list::iterator q = walks.begin(); q != walks_end; ++q) { + auto& walk = *q; + // did we reach our target length? + if (walk.seq.size() == k) { + // TODO here check if we are at the beginning of the reverse head or the beginning of the forward tail and would need special handling + // establish the context + handle_t end_handle = graph.get_handle(id(walk.end), is_rev(walk.end)); + size_t end_length = graph.get_length(end_handle); + // now pass the walk to our callback + lambda(walk); + q = walks.erase(q); + } else { + // do we finish in the current node? + nid_t curr_id = graph.get_id(walk.curr); + size_t curr_length = graph.get_length(walk.curr); + bool curr_is_rev = graph.get_is_reverse(walk.curr); + std::string curr_seq = graph.get_sequence(walk.curr); + size_t take = std::min(curr_length, k-walk.seq.size()); + walk.end = make_pos_t(curr_id, curr_is_rev, take); + walk.seq.append(curr_seq.substr(0,take)); + walk.path.push_back(walk.curr); + if (walk.seq.size() < k) { + size_t next_count = 0; + if (edge_max) graph.follow_edges(walk.curr, false, [&](const handle_t& next) { ++next_count; return next_count <= 1; }); + //walk.seq.reserve(k); // may reduce allocation costs + // follow edges if we haven't completed the walk here + if (next_count > 1 && (edge_max && edge_max == walk.forks)) { + } else { + graph.follow_edges(walk.curr, false, [&](const handle_t& next) { + walks.push_back(walk); + auto& todo = walks.back(); + todo.curr = next; + if (next_count > 1) { + ++todo.forks; + } + }); + } + // if not, we need to expand through the node then follow on + /* + graph.follow_edges(walk.curr, false, [&](const handle_t& next) { + walks.push_back(walk); + auto& todo = walks.back(); + todo.curr = next; + }); + */ + q = walks.erase(q); + } else { + if (walk.seq.size() > k) { + assert(walk.seq.size() <= k); + } + } + } + } + } + } + }, true); +} + +std::ostream& operator<<(std::ostream& out, const walk_t& walk) { + out << walk.seq << "\t" + << id(walk.begin) << ":" << (is_rev(walk.begin) ? 
"-":"") << offset(walk.begin) << "\t"; + return out; +} + +uint64_t walk_haplotype_frequency(const HandleGraph& graph, + const gbwt::GBWT& haplotypes, + const walk_t& walk) { + if (walk.path.empty()) { + return 0; + } + auto& first_step = walk.path.front(); + gbwt::node_type start_node = gbwt::Node::encode(graph.get_id(first_step), graph.get_is_reverse(first_step)); + gbwt::SearchState search_state = haplotypes.find(start_node); + for (uint64_t i = 1; i < walk.path.size(); ++i) { + auto& next = walk.path[i]; + gbwt::node_type next_node = gbwt::Node::encode(graph.get_id(next), graph.get_is_reverse(next)); + search_state = haplotypes.extend(search_state, next_node); + if (search_state.empty()) { + break; + } + } + return search_state.size(); +} + +std::vector walk_haplotype_names(const HandleGraph& graph, + const gbwt::GBWT& haplotypes, + const walk_t& walk) { + std::vector names; + if (walk.path.empty()) { + return names; + } + auto& first_step = walk.path.front(); + gbwt::node_type start_node = gbwt::Node::encode(graph.get_id(first_step), graph.get_is_reverse(first_step)); + gbwt::SearchState search_state = haplotypes.find(start_node); + for (uint64_t i = 1; i < walk.path.size(); ++i) { + auto& next = walk.path[i]; + gbwt::node_type next_node = gbwt::Node::encode(graph.get_id(next), graph.get_is_reverse(next)); + search_state = haplotypes.extend(search_state, next_node); + if (search_state.empty()) { + break; + } + } + assert(haplotypes.hasMetadata() && haplotypes.metadata.hasSampleNames()); + // Pre-parse the reference samples. + // TODO: Can we pass this down? + auto reference_samples = gbwtgraph::parse_reference_samples_tag(haplotypes); + for (auto& thread : haplotypes.locate(search_state)) { + // For each match + auto id = gbwt::Path::id(thread); + + // Figure out what kind of path it is + PathSense sense = gbwtgraph::get_path_sense(haplotypes, id, reference_samples); + + // Figure out its haplotype number + auto haplotype = gbwtgraph::get_path_haplotype(haplotypes, id, sense); + if (haplotype == PathMetadata::NO_HAPLOTYPE) { + // If no haplotype is applicable, use 0. + haplotype = 0; + } + + // Compose a name for it + std::stringstream ss; + ss << gbwtgraph::get_path_sample_name(haplotypes, id, sense) << "#" << haplotype; + names.push_back(ss.str()); + // TODO: should we do something special for generic sense or NO_SAMPLE_NAME? + } + return names; +} + + +} + +} diff --git a/src/algorithms/walk.hpp b/src/algorithms/walk.hpp new file mode 100644 index 00000000000..b12a9f7a285 --- /dev/null +++ b/src/algorithms/walk.hpp @@ -0,0 +1,58 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include "position.hpp" +#include "gbwt_helper.hpp" + +/** \file + * Functions for working with `kmers_t`'s in HandleGraphs. + */ + +namespace vg { + +namespace algorithms { + +using namespace handlegraph; + +/// Stores a walk in the context of a graph. 
+struct walk_t { + walk_t(const std::string& s, + const pos_t& b, + const pos_t& e, + const handle_t& c) + : seq(s), begin(b), end(e), curr(c), path({c}) { }; + /// the walk + std::vector path; + /// the sequence + std::string seq; + /// our start position + pos_t begin; + /// Used in construction + pos_t end; /// one past the (current) end of the walk + handle_t curr; /// the next handle we extend into + uint16_t forks; /// how many branching edge crossings we took to get here +}; + +/// Iterate over all the walks in the graph, running lambda on each +void for_each_walk(const HandleGraph& graph, size_t k, size_t edge_max, + const std::function& lambda); + +/// Print a walk_t to a stream. +std::ostream& operator<<(std::ostream& out, const walk_t& walk); + +uint64_t walk_haplotype_frequency(const HandleGraph& graph, + const gbwt::GBWT& haplotypes, + const walk_t& walk); + +std::vector walk_haplotype_names(const HandleGraph& graph, + const gbwt::GBWT& haplotypes, + const walk_t& walk); + +} + +} diff --git a/src/algorithms/weakly_connected_components.cpp b/src/algorithms/weakly_connected_components.cpp deleted file mode 100644 index bc7d798c93f..00000000000 --- a/src/algorithms/weakly_connected_components.cpp +++ /dev/null @@ -1,54 +0,0 @@ -#include "weakly_connected_components.hpp" - -namespace vg { -namespace algorithms { - -using namespace std; - -vector> weakly_connected_components(const HandleGraph* graph) { - vector> to_return; - - // This only holds locally forward handles - unordered_set traversed; - - graph->for_each_handle([&](const handle_t& handle) { - - // Only think about it in the forward orientation - auto forward = graph->forward(handle); - - if (traversed.count(forward)) { - // Already have this node, so don't start a search from it. - return; - } - - // The stack only holds locally forward handles - vector stack{forward}; - to_return.emplace_back(); - while (!stack.empty()) { - handle_t here = stack.back(); - stack.pop_back(); - - traversed.insert(here); - to_return.back().insert(graph->get_id(here)); - - // We have a function to handle all connected handles - auto handle_other = [&](const handle_t& other) { - // Again, make it forward - auto other_forward = graph->forward(other); - - if (!traversed.count(other_forward)) { - stack.push_back(other_forward); - } - }; - - // Look at edges in both directions - graph->follow_edges(here, false, handle_other); - graph->follow_edges(here, true, handle_other); - - } - }); - return to_return; -} - -} -} diff --git a/src/algorithms/weakly_connected_components.hpp b/src/algorithms/weakly_connected_components.hpp deleted file mode 100644 index 243d8f30dd3..00000000000 --- a/src/algorithms/weakly_connected_components.hpp +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef VG_ALGORITHMS_WEAKLY_CONNECTED_COMPONENTS_HPP_INCLUDED -#define VG_ALGORITHMS_WEAKLY_CONNECTED_COMPONENTS_HPP_INCLUDED - -/** - * \file weakly_connected_components.hpp - * - * Defines an algorithm for finding weakly connected components in a graph. - */ - -#include "../handle.hpp" - -#include -#include - -namespace vg { -namespace algorithms { - -using namespace std; - -/// Returns sets of IDs defining components that are connected by any series -/// of nodes and edges, even if it is not a valid bidirected walk. TODO: It -/// might make sense to have a handle-returning version, but the consumers of -/// weakly connected components right now want IDs, and membership in a weakly -/// connected component is orientation-independent. 
-vector> weakly_connected_components(const HandleGraph* graph); - - -} -} - -#endif diff --git a/src/aligner.cpp b/src/aligner.cpp new file mode 100644 index 00000000000..f10c8512a27 --- /dev/null +++ b/src/aligner.cpp @@ -0,0 +1,2451 @@ +#include "aligner.hpp" + +#include "hash_map.hpp" +#include "alignment.hpp" +#include "path.hpp" +#include "utility.hpp" +#include "statistics.hpp" +#include "banded_global_aligner.hpp" +#include "reverse_graph.hpp" +#include "null_masking_graph.hpp" +#include "dozeu_pinning_overlay.hpp" +#include "algorithms/distance_to_tail.hpp" + +//#define debug_print_score_matrices + +namespace vg { + +using namespace std; +using namespace vg::io; + +int32_t score_gap(size_t gap_length, int32_t gap_open, int32_t gap_extension) { + return gap_length ? -gap_open - (gap_length - 1) * gap_extension : 0; +} + +static const double quality_scale_factor = 10.0 / log(10.0); +static const double exp_overflow_limit = log(std::numeric_limits::max()); + +GSSWAligner::~GSSWAligner(void) { + free(nt_table); + free(score_matrix); +} + +GSSWAligner::GSSWAligner(const int8_t* _score_matrix, + int8_t _gap_open, + int8_t _gap_extension, + int8_t _full_length_bonus, + double _gc_content) : deletion_aligner(_gap_open, _gap_extension) { + + log_base = recover_log_base(_score_matrix, _gc_content, 1e-12); + + // TODO: now that everything is in terms of score matrices, having match/mismatch is a bit + // misleading, but a fair amount of code depends on them + match = _score_matrix[0]; + mismatch = -_score_matrix[1]; + gap_open = _gap_open; + gap_extension = _gap_extension; + full_length_bonus = _full_length_bonus; + + // table to translate chars to their integer value + nt_table = gssw_create_nt_table(); +} + +gssw_graph* GSSWAligner::create_gssw_graph(const HandleGraph& g) const { + + vector topological_order = handlealgs::lazier_topological_order(&g); + + gssw_graph* graph = gssw_graph_create(g.get_node_count()); + unordered_map nodes; + + // compute the topological order + for (const handle_t& handle : topological_order) { + auto cleaned_seq = nonATGCNtoN(g.get_sequence(handle)); + gssw_node* node = gssw_node_create(nullptr, // TODO: the ID should be enough, don't need Node* too + g.get_id(handle), + cleaned_seq.c_str(), + nt_table, + score_matrix); // TODO: this arg isn't used, could edit + // in gssw + nodes[node->id] = node; + gssw_graph_add_node(graph, node); + } + + g.for_each_edge([&](const edge_t& edge) { + if(!g.get_is_reverse(edge.first) && !g.get_is_reverse(edge.second)) { + // This is a normal end to start edge. + gssw_nodes_add_edge(nodes[g.get_id(edge.first)], nodes[g.get_id(edge.second)]); + } + else if (g.get_is_reverse(edge.first) && g.get_is_reverse(edge.second)) { + // This is a start to end edge, but isn't reversing and can be converted to a normal end to start edge. + + // Flip the start and end + gssw_nodes_add_edge(nodes[g.get_id(edge.second)], nodes[g.get_id(edge.first)]); + } + else { + // TODO: It's a reversing edge, which gssw doesn't support yet. What + // we should really do is do a topological sort to break cycles, and + // then flip everything at the lower-rank end of this edge around, + // so we don't have to deal with its reversing-ness. But for now we + // just die so we don't get nonsense into gssw. +#pragma omp critical + { + // We need the critical section so we don't throw uncaught + // exceptions in multiple threads at once, leading to C++ trying + // to run termiante in parallel. This doesn't make it safe, just + // slightly safer. 
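+                // A reversing edge is one whose endpoints have mixed
+                // orientations: it attaches a node end to a node end or a
+                // node start to a node start, so it has no equivalent among
+                // the end-to-start edges gssw can represent.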
+ cerr << "Can't gssw over reversing edge " << g.get_id(edge.first) << (g.get_is_reverse(edge.first) ? "-" : "+") << " -> " << g.get_id(edge.second) << (g.get_is_reverse(edge.second) ? "-" : "+") << endl; + // TODO: there's no safe way to kill the program without a way + // to signal the master to do it, via a shared variable in the + // clause that made us parallel. + } + exit(1); + } + return true; + }); + + return graph; + +} + +unordered_set GSSWAligner::identify_pinning_points(const HandleGraph& graph) const { + + unordered_set return_val; + + // start at the sink nodes + vector sinks = handlealgs::tail_nodes(&graph); + + // walk backwards to find non-empty nodes if necessary + for (const handle_t& handle : sinks) { + vector stack(1, handle); + while (!stack.empty()) { + handle_t here = stack.back(); + stack.pop_back(); + + if (graph.get_length(here) > 0) { + return_val.insert(graph.get_id(here)); + } + else { + graph.follow_edges(here, true, [&](const handle_t& prev) { + // TODO: technically this won't filter out all redundant walks, but it should + // handle all cases we're practically interested in and it doesn't require a + // second set object + if (!return_val.count(graph.get_id(prev))) { + stack.push_back(prev); + } + }); + } + } + } + + return return_val; +} + +void GSSWAligner::gssw_mapping_to_alignment(gssw_graph* graph, + gssw_graph_mapping* gm, + Alignment& alignment, + bool pinned, + bool pin_left) const { + alignment.clear_path(); + alignment.set_score(gm->score); + alignment.set_query_position(0); + Path* path = alignment.mutable_path(); + //alignment.set_cigar(graph_cigar(gm)); + + gssw_graph_cigar* gc = &gm->cigar; + gssw_node_cigar* ncs = gc->elements; + //cerr << "gm->position " << gm->position << endl; + string& to_seq = *alignment.mutable_sequence(); + //cerr << "-------------" << endl; + +#ifdef debug_print_score_matrices + gssw_graph_print_score_matrices(graph, to_seq.c_str(), to_seq.size(), stderr); +#endif + + int to_pos = 0; + int from_pos = gm->position; + + for (int i = 0; i < gc->length; ++i) { + // check that the current alignment has a non-zero length + gssw_cigar* c = ncs[i].cigar; + int l = c->length; + if (l == 0) continue; + gssw_cigar_element* e = c->elements; + + gssw_node* node = ncs[i].node; + Mapping* mapping = path->add_mapping(); + + if (i > 0) { + // reset for each node after the first + from_pos = 0; + } + + mapping->mutable_position()->set_node_id(node->id); + mapping->mutable_position()->set_offset(from_pos); + mapping->set_rank(path->mapping_size()); + + //cerr << node->id << ":" << endl; + + for (int j=0; j < l; ++j, ++e) { + int32_t length = e->length; + //cerr << e->length << e->type << endl; + + Edit* edit; + switch (e->type) { + case 'M': + case 'X': + case 'N': { + //cerr << "j = " << j << ", type = " << e->type << endl; + // do the sequences match? 
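+                    // For example, aligning node sequence ACGT against read
+                    // ACTT across a single 4M element produces three edits:
+                    //   from_length 2, to_length 2             (the AC match)
+                    //   from_length 1, to_length 1, seq "T"    (the G>T SNP)
+                    //   from_length 1, to_length 1             (the final T match)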
+ // emit a stream of "SNPs" and matches + int h = from_pos; + int last_start = from_pos; + int k = to_pos; + for ( ; h < from_pos + length; ++h, ++k) { + //cerr << h << ":" << k << " " << node->seq[h] << " " << to_seq[k] << endl; + if (node->seq[h] != to_seq[k]) { + // emit the last "match" region + if (h - last_start > 0) { + edit = mapping->add_edit(); + edit->set_from_length(h-last_start); + edit->set_to_length(h-last_start); + } + // set up the SNP + edit = mapping->add_edit(); + edit->set_from_length(1); + edit->set_to_length(1); + edit->set_sequence(to_seq.substr(k,1)); + last_start = h+1; + } + } + // handles the match at the end or the case of no SNP + if (h - last_start > 0) { + edit = mapping->add_edit(); + edit->set_from_length(h-last_start); + edit->set_to_length(h-last_start); + } + to_pos += length; + from_pos += length; + } break; + case 'D': + edit = mapping->add_edit(); + edit->set_from_length(length); + edit->set_to_length(0); + from_pos += length; + break; + case 'I': + edit = mapping->add_edit(); + edit->set_from_length(0); + edit->set_to_length(length); + edit->set_sequence(to_seq.substr(to_pos, length)); + to_pos += length; + break; + case 'S': + // note that soft clips and insertions are semantically equivalent + // and can only be differentiated by their position in the read + // with soft clips coming at the start or end + edit = mapping->add_edit(); + edit->set_from_length(0); + edit->set_to_length(length); + edit->set_sequence(to_seq.substr(to_pos, length)); + to_pos += length; + break; + default: + cerr << "error:[Aligner::gssw_mapping_to_alignment] " + << "unsupported cigar op type " << e->type << endl; + exit(1); + break; + + } + } + } + + // compute and set identity + alignment.set_identity(identity(alignment.path())); +} + +void GSSWAligner::unreverse_graph(gssw_graph* graph) const { + // this is only for getting correct reference-relative edits, so we can get away with only + // reversing the sequences and not paying attention to the edges + + for (size_t i = 0; i < graph->size; i++) { + gssw_node* node = graph->nodes[i]; + for (int j = 0, stop = node->len / 2; j < stop; j++) { + std::swap(node->seq[j], node->seq[node->len - j - 1]); + } + } +} + +void GSSWAligner::unreverse_graph_mapping(gssw_graph_mapping* gm) const { + + gssw_graph_cigar* graph_cigar = &(gm->cigar); + gssw_node_cigar* node_cigars = graph_cigar->elements; + + // reverse the order of the node cigars + int32_t num_switching_nodes = graph_cigar->length / 2; + int32_t last_idx = graph_cigar->length - 1; + for (int32_t i = 0; i < num_switching_nodes; i++) { + std::swap(node_cigars[i], node_cigars[last_idx - i]); + } + + // reverse the actual cigar string for each node cigar + for (int32_t i = 0; i < graph_cigar->length; i++) { + gssw_cigar* node_cigar = node_cigars[i].cigar; + gssw_cigar_element* elements = node_cigar->elements; + + int32_t num_switching_elements = node_cigar->length / 2; + last_idx = node_cigar->length - 1; + for (int32_t j = 0; j < num_switching_elements; j++) { + std::swap(elements[j], elements[last_idx - j]); + } + } + + // compute the position in the first node + if (graph_cigar->length > 0) { + gssw_cigar_element* first_node_elements = node_cigars[0].cigar->elements; + int32_t num_first_node_elements = node_cigars[0].cigar->length; + uint32_t num_ref_aligned = 0; // the number of characters on the node sequence that are aligned + for (int32_t i = 0; i < num_first_node_elements; i++) { + switch (first_node_elements[i].type) { + case 'M': + case 'X': + case 'N': + case 
'D': + num_ref_aligned += first_node_elements[i].length; + break; + + } + } + gm->position = node_cigars[0].node->len - num_ref_aligned - (graph_cigar->length == 1 ? gm->position : 0); + } + else { + gm->position = 0; + } +} + +string GSSWAligner::graph_cigar(gssw_graph_mapping* gm) const { + + stringstream s; + gssw_graph_cigar* gc = &gm->cigar; + gssw_node_cigar* nc = gc->elements; + int to_pos = 0; + int from_pos = gm->position; + //string& to_seq = *alignment.mutable_sequence(); + s << from_pos << '@'; + for (int i = 0; i < gc->length; ++i, ++nc) { + if (i > 0) from_pos = 0; // reset for each node after the first + Node* from_node = (Node*) nc->node->data; + s << from_node->id() << ':'; + gssw_cigar* c = nc->cigar; + int l = c->length; + gssw_cigar_element* e = c->elements; + for (int j=0; j < l; ++j, ++e) { + s << e->length << e->type; + } + if (i + 1 < gc->length) { + s << ","; + } + } + return s.str(); +} + +double GSSWAligner::recover_log_base(const int8_t* score_matrix, double gc_content, double tol) const { + + // convert gc content into base-wise frequencies + double* nt_freqs = (double*) malloc(sizeof(double) * 4); + nt_freqs[0] = 0.5 * (1 - gc_content); + nt_freqs[1] = 0.5 * gc_content; + nt_freqs[2] = 0.5 * gc_content; + nt_freqs[3] = 0.5 * (1 - gc_content); + + if (!verify_valid_log_odds_score_matrix(score_matrix, nt_freqs)) { + cerr << "error:[Aligner] Score matrix is invalid. Must have a negative expected score against random sequence." << endl; + exit(1); + } + + // searching for a positive value (because it's a base of a logarithm) + double lower_bound; + double upper_bound; + + // arbitrary starting point greater than zero + double lambda = 1.0; + // exponential search for a window containing lambda where total probability is 1 + double partition = alignment_score_partition_function(lambda, score_matrix, nt_freqs); + if (partition < 1.0) { + lower_bound = lambda; + while (partition <= 1.0) { + lower_bound = lambda; + lambda *= 2.0; + partition = alignment_score_partition_function(lambda, score_matrix, nt_freqs); + } + upper_bound = lambda; + } + else { + upper_bound = lambda; + while (partition >= 1.0) { + upper_bound = lambda; + lambda /= 2.0; + partition = alignment_score_partition_function(lambda, score_matrix, nt_freqs); + } + lower_bound = lambda; + } + + // bisect to find a log base where total probability is 1 + while (upper_bound / lower_bound - 1.0 > tol) { + lambda = 0.5 * (lower_bound + upper_bound); + if (alignment_score_partition_function(lambda, score_matrix, nt_freqs) < 1.0) { + lower_bound = lambda; + } + else { + upper_bound = lambda; + } + } + + free(nt_freqs); + + return 0.5 * (lower_bound + upper_bound); +} + +bool GSSWAligner::verify_valid_log_odds_score_matrix(const int8_t* score_matrix, const double* nt_freqs) const { + bool contains_positive_score = false; + for (int i = 0; i < 16; i++) { + if (score_matrix[i] > 0) { + contains_positive_score = 1; + break; + } + } + if (!contains_positive_score) { + return false; + } + + double expected_score = 0.0; + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + expected_score += nt_freqs[i] * nt_freqs[j] * score_matrix[i * 4 + j]; + } + } + return expected_score < 0.0; +} + +double GSSWAligner::alignment_score_partition_function(double lambda, const int8_t* score_matrix, + const double* nt_freqs) const { + + double partition = 0.0; + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + partition += nt_freqs[i] * nt_freqs[j] * exp(lambda * score_matrix[i * 4 + j]); + } + } + + if 
(isnan(partition)) { + cerr << "error:[Aligner] overflow error in log-odds base recovery subroutine." << endl; + exit(1); + } + + return partition; +} + +int32_t GSSWAligner::score_gap(size_t gap_length) const { + return vg::score_gap(gap_length, gap_open, gap_extension); +} + +double GSSWAligner::first_mapping_quality_exact(const vector& scaled_scores, + const vector* multiplicities) { + return maximum_mapping_quality_exact(scaled_scores, nullptr, multiplicities); +} + +double GSSWAligner::first_mapping_quality_approx(const vector& scaled_scores, + const vector* multiplicities) { + return maximum_mapping_quality_approx(scaled_scores, nullptr, multiplicities); +} + +double GSSWAligner::maximum_mapping_quality_exact(const vector& scaled_scores, size_t* max_idx_out, + const vector* multiplicities) { + + // TODO: this isn't very well-named now that it also supports computing non-maximum + // mapping qualities + + // work in log transformed values to avoid risk of overflow + double log_sum_exp = numeric_limits::lowest(); + double to_score = numeric_limits::lowest(); + + // go in reverse order because this has fewer numerical problems when the scores are sorted (as usual) + for (int64_t i = scaled_scores.size() - 1; i >= 0; i--) { + // get the value of one copy of the score and check if it's the max + double score = scaled_scores.at(i); + if (max_idx_out && score >= to_score) { + // Since we are going in reverse order, make sure to break ties in favor of the earlier item. + *max_idx_out = i; + to_score = score; + } + + // add all copies of the score + if (multiplicities && multiplicities->at(i) > 1.0) { + score += log(multiplicities->at(i)); + } + + // accumulate the sum of all score + log_sum_exp = add_log(log_sum_exp, score); + } + + // if necessary, assume a null alignment of 0.0 for comparison since this is local + if (scaled_scores.size() == 1) { + if (multiplicities && multiplicities->at(0) <= 1.0) { + log_sum_exp = add_log(log_sum_exp, 0.0); + } + else if (!multiplicities) { + log_sum_exp = add_log(log_sum_exp, 0.0); + } + } + + if (!max_idx_out) { + to_score = scaled_scores.empty() ? 0.0 : scaled_scores.front(); + } + + double direct_mapq = -quality_scale_factor * subtract_log(0.0, to_score - log_sum_exp); + return std::isinf(direct_mapq) ? 
(double) numeric_limits::max() : direct_mapq; +} + +vector GSSWAligner::all_mapping_qualities_exact(const vector& scaled_scores, + const vector* multiplicities) const { + + vector mapping_qualities(scaled_scores.size()); + + // iterate backwards for improved numerical performance in sorted scores + double log_denom = 0.0; + for (int64_t i = scaled_scores.size() - 1; i >= 0; --i) { + double score = scaled_scores[i]; + if (multiplicities && (*multiplicities)[i] != 1.0) { + score += log((*multiplicities)[i]); + } + log_denom = add_log(log_denom, score); + } + // compute the mapping qualities + for (size_t i = 0; i < scaled_scores.size(); i++) { + double log_prob_error = log10(1.0 - exp(scaled_scores[i] - log_denom)); + if (isnormal(log_prob_error) || log_prob_error == 0.0) { + mapping_qualities[i] = -10.0 * log_prob_error; + } + else { + mapping_qualities[i] = (double) numeric_limits::max(); + } + } + + return mapping_qualities; +} + +double GSSWAligner::maximum_mapping_quality_approx(const vector& scaled_scores, size_t* max_idx_out, + const vector* multiplicities) { + assert(!scaled_scores.empty()); + + // TODO: this isn't very well-named now that it also supports computing non-maximum + // mapping qualities + + // determine the maximum score and the count of the next highest score + double max_score = scaled_scores.at(0); + size_t max_idx = 0; + + // we start with the possibility of a null score of 0.0 + double next_score = 0.0; + double next_count = 1.0; + + if (multiplicities) { + if (multiplicities->at(0) > 1.0) { + // there are extra copies of this one, so we'll init with those + next_score = max_score; + next_count = multiplicities->at(0) - 1.0; + } + } + + for (int32_t i = 1; i < scaled_scores.size(); ++i) { + double score = scaled_scores.at(i); + if (score > max_score) { + if (multiplicities && multiplicities->at(i) > 1.0) { + // there are extra counts of the new highest score due to multiplicity + next_score = score; + next_count = multiplicities->at(i) - 1.0; + } + else if (next_score == max_score) { + // the next highest was the same score as the old max, so we can + // add its count back in + next_count += 1.0; + } + else { + // the old max score is now the second highest + next_score = max_score; + next_count = multiplicities ? multiplicities->at(max_idx) : 1.0; + } + max_score = score; + max_idx = i; + } + else if (score > next_score) { + // the new score is the second highest + next_score = score; + next_count = multiplicities ? multiplicities->at(i) : 1.0; + } + else if (score == next_score) { + // the new score ties the second highest, so we combine their counts + next_count += multiplicities ? multiplicities->at(i) : 1.0; + } + } + + // record the index of the highest score + if (max_idx_out) { + *max_idx_out = max_idx; + } + if (max_idx_out || max_idx == 0) { + // we're either returning the mapping quality of whichever was the best, or we're + // returning the mapping quality of the first, which also is the best + return max(0.0, quality_scale_factor * (max_score - next_score - (next_count > 1.0 ? log(next_count) : 0.0))); + } + else { + // we're returning the mapping quality of the first, which is not the best. 
the approximation + // gets complicated here, so lets just fall back on the exact computation + return maximum_mapping_quality_exact(scaled_scores, nullptr, multiplicities); + } +} + +double GSSWAligner::group_mapping_quality_exact(const vector& scaled_scores, const vector& group, + const vector* multiplicities) const { + + // work in log transformed values to avoid risk of overflow + double total_log_sum_exp = numeric_limits::lowest(); + double non_group_log_sum_exp = numeric_limits::lowest(); + + // go in reverse order because this has fewer numerical problems when the scores are sorted (as usual) + int64_t group_idx = group.size() - 1; + for (int64_t i = scaled_scores.size() - 1; i >= 0; i--) { + + // the score of one alignment + double score = scaled_scores.at(i); + + // the score all the multiples of this score combined + double multiple_score = score; + if (multiplicities && multiplicities->at(i) > 1.0) { + multiple_score += log(multiplicities->at(i)); + } + + total_log_sum_exp = add_log(total_log_sum_exp, multiple_score); + + if (group_idx >= 0 && i == group[group_idx]) { + // this is the next index in the group + group_idx--; + if (multiplicities && multiplicities->at(i) > 1.0) { + // there's some remaining multiples of this score that don't get added into the group + non_group_log_sum_exp = add_log(non_group_log_sum_exp, + score + log(multiplicities->at(i) - 1.0)); + } + } + else { + // this index is not part of the group + non_group_log_sum_exp = add_log(non_group_log_sum_exp, multiple_score); + } + } + + if (scaled_scores.size() == 1) { + if (multiplicities && multiplicities->at(0) <= 1.0) { + // assume a null alignment of 0.0 for comparison since this is local + non_group_log_sum_exp = add_log(non_group_log_sum_exp, 0.0); + total_log_sum_exp = add_log(total_log_sum_exp, 0.0); + } + else if (!multiplicities) { + //TODO: repetitive, do I need to be this careful to not deref a null? + // assume a null alignment of 0.0 for comparison since this is local + non_group_log_sum_exp = add_log(non_group_log_sum_exp, 0.0); + total_log_sum_exp = add_log(total_log_sum_exp, 0.0); + } + } + + double direct_mapq = quality_scale_factor * (total_log_sum_exp - non_group_log_sum_exp); + return (std::isinf(direct_mapq) || direct_mapq > numeric_limits::max()) ? + (double) numeric_limits::max() : direct_mapq; +} + +void GSSWAligner::compute_mapping_quality(vector& alignments, + int max_mapping_quality, + bool fast_approximation, + double cluster_mq, + bool use_cluster_mq, + int overlap_count, + double mq_estimate, + double maybe_mq_threshold, + double identity_weight) const { + + assert(log_base > 0.0); + + if (alignments.empty()) { + return; + } + + vector scaled_scores(alignments.size()); + for (size_t i = 0; i < alignments.size(); i++) { + scaled_scores[i] = log_base * alignments[i].score(); + } + + double mapping_quality; + size_t max_idx; + if (!fast_approximation) { + mapping_quality = maximum_mapping_quality_exact(scaled_scores, &max_idx); + } + else { + mapping_quality = maximum_mapping_quality_approx(scaled_scores, &max_idx); + } + + if (use_cluster_mq) { + mapping_quality = prob_to_phred(sqrt(phred_to_prob(cluster_mq + mapping_quality))); + } + + if (overlap_count) { + mapping_quality -= quality_scale_factor * log(overlap_count); + } + + auto& max_aln = alignments.at(max_idx); + int l = max(alignment_to_length(max_aln), alignment_from_length(max_aln)); + double identity = 1. 
- (double)(l * match - max_aln.score()) / (match + mismatch) / l; + + mapping_quality /= 2; + + mapping_quality *= pow(identity, identity_weight); + + if (mq_estimate < maybe_mq_threshold && mq_estimate < mapping_quality) { + mapping_quality = prob_to_phred(sqrt(phred_to_prob(mq_estimate + mapping_quality))); + } + + if (mapping_quality > max_mapping_quality) { + mapping_quality = max_mapping_quality; + } + + if (alignments[max_idx].score() == 0) { + mapping_quality = 0; + } + + alignments[max_idx].set_mapping_quality(max(0, (int32_t) round(mapping_quality))); + for (int i = 1; i < alignments.size(); ++i) { + alignments[0].add_secondary_score(alignments[i].score()); + } +} + +int32_t GSSWAligner::compute_max_mapping_quality(const vector& scores, bool fast_approximation, + const vector* multiplicities) const { + + vector scaled_scores(scores.size()); + for (size_t i = 0; i < scores.size(); i++) { + scaled_scores[i] = log_base * scores[i]; + } + size_t idx; + return (int32_t) (fast_approximation ? maximum_mapping_quality_approx(scaled_scores, &idx, multiplicities) + : maximum_mapping_quality_exact(scaled_scores, &idx, multiplicities)); +} + +int32_t GSSWAligner::compute_first_mapping_quality(const vector& scores, bool fast_approximation, + const vector* multiplicities) const { + vector scaled_scores(scores.size()); + for (size_t i = 0; i < scores.size(); i++) { + scaled_scores[i] = log_base * scores[i]; + } + return (int32_t) (fast_approximation ? first_mapping_quality_approx(scaled_scores, multiplicities) + : first_mapping_quality_exact(scaled_scores, multiplicities)); +} + +int32_t GSSWAligner::compute_group_mapping_quality(const vector& scores, const vector& group, + const vector* multiplicities) const { + + // make a non-const local version in case we need to sort it + vector non_const_group; + const vector* grp_ptr = &group; + + // ensure that group is in sorted order as following function expects + if (!is_sorted(group.begin(), group.end())) { + non_const_group = group; + sort(non_const_group.begin(), non_const_group.end()); + grp_ptr = &non_const_group; + } + + vector scaled_scores(scores.size(), 0.0); + for (size_t i = 0; i < scores.size(); i++) { + scaled_scores[i] = log_base * scores[i]; + } + return group_mapping_quality_exact(scaled_scores, *grp_ptr, multiplicities); +} + +vector GSSWAligner::compute_all_mapping_qualities(const vector& scores, + const vector* multiplicities) const { + vector scaled_scores(scores.size(), 0.0); + for (size_t i = 0; i < scores.size(); i++) { + scaled_scores[i] = log_base * scores[i]; + } + vector double_mapqs = all_mapping_qualities_exact(scaled_scores, multiplicities); + vector to_return(double_mapqs.size(), 0); + for (size_t i = 0; i < to_return.size(); ++i) { + to_return[i] = double_mapqs[i]; + } + return to_return; +} + +void GSSWAligner::compute_paired_mapping_quality(pair, vector>& alignment_pairs, + const vector& frag_weights, + int max_mapping_quality1, + int max_mapping_quality2, + bool fast_approximation, + double cluster_mq, + bool use_cluster_mq, + int overlap_count1, + int overlap_count2, + double mq_estimate1, + double mq_estimate2, + double maybe_mq_threshold, + double identity_weight) const { + + assert(log_base > 0.0); + + size_t size = min(alignment_pairs.first.size(), + alignment_pairs.second.size()); + + if (size == 0) { + return; + } + + vector scaled_scores(size); + + for (size_t i = 0; i < size; i++) { + auto& aln1 = alignment_pairs.first[i]; + auto& aln2 = alignment_pairs.second[i]; + scaled_scores[i] = log_base * 
(aln1.score() + aln2.score()); + // + frag_weights[i]); + // ^^^ we could also incorporate the fragment weights, but this does not seem to help performance in the current form + } + + size_t max_idx; + double mapping_quality; + if (!fast_approximation) { + mapping_quality = maximum_mapping_quality_exact(scaled_scores, &max_idx); + } + else { + mapping_quality = maximum_mapping_quality_approx(scaled_scores, &max_idx); + } + + if (use_cluster_mq) { + mapping_quality = prob_to_phred(sqrt(phred_to_prob(cluster_mq + mapping_quality))); + } + + double mapping_quality1 = mapping_quality; + double mapping_quality2 = mapping_quality; + + if (overlap_count1) { + mapping_quality1 -= quality_scale_factor * log(overlap_count1); + } + if (overlap_count2) { + mapping_quality2 -= quality_scale_factor * log(overlap_count2); + } + + auto& max_aln1 = alignment_pairs.first.at(max_idx); + int len1 = max(alignment_to_length(max_aln1), alignment_from_length(max_aln1)); + double identity1 = 1. - (double)(len1 * match - max_aln1.score()) / (match + mismatch) / len1; + auto& max_aln2 = alignment_pairs.second.at(max_idx); + int len2 = max(alignment_to_length(max_aln2), alignment_from_length(max_aln2)); + double identity2 = 1. - (double)(len2 * match - max_aln2.score()) / (match + mismatch) / len2; + + mapping_quality1 /= 2; + mapping_quality2 /= 2; + + mapping_quality1 *= pow(identity1, identity_weight); + mapping_quality2 *= pow(identity2, identity_weight); + + double mq_estimate = min(mq_estimate1, mq_estimate2); + if (mq_estimate < maybe_mq_threshold && mq_estimate < mapping_quality1) { + mapping_quality1 = prob_to_phred(sqrt(phred_to_prob(mq_estimate + mapping_quality1))); + } + if (mq_estimate < maybe_mq_threshold && mq_estimate < mapping_quality2) { + mapping_quality2 = prob_to_phred(sqrt(phred_to_prob(mq_estimate + mapping_quality2))); + } + + if (mapping_quality1 > max_mapping_quality1) { + mapping_quality1 = max_mapping_quality1; + } + if (mapping_quality2 > max_mapping_quality2) { + mapping_quality2 = max_mapping_quality2; + } + + if (alignment_pairs.first[max_idx].score() == 0) { + mapping_quality1 = 0; + } + if (alignment_pairs.second[max_idx].score() == 0) { + mapping_quality2 = 0; + } + + mapping_quality = max(0, (int32_t)round(min(mapping_quality1, mapping_quality2))); + + alignment_pairs.first[max_idx].set_mapping_quality(mapping_quality); + alignment_pairs.second[max_idx].set_mapping_quality(mapping_quality); + + for (int i = 1; i < alignment_pairs.first.size(); ++i) { + alignment_pairs.first[0].add_secondary_score(alignment_pairs.first[i].score()); + } + for (int i = 1; i < alignment_pairs.second.size(); ++i) { + alignment_pairs.second[0].add_secondary_score(alignment_pairs.second[i].score()); + } + +} + +double GSSWAligner::mapping_quality_score_diff(double mapping_quality) const { + return mapping_quality / (quality_scale_factor * log_base); +} + +double GSSWAligner::estimate_next_best_score(int length, double min_diffs) const { + return ((length - min_diffs) * match - min_diffs * mismatch); +} + +double GSSWAligner::max_possible_mapping_quality(int length) const { + double max_score = log_base * length * match; + vector v = { max_score }; + size_t max_idx; + return maximum_mapping_quality_approx(v, &max_idx); +} + +double GSSWAligner::estimate_max_possible_mapping_quality(int length, double min_diffs, double next_min_diffs) const { + double max_score = log_base * ((length - min_diffs) * match - min_diffs * mismatch); + double next_max_score = log_base * ((length - next_min_diffs) * match - 
next_min_diffs * mismatch); + vector v = { max_score, next_max_score }; + size_t max_idx; + return maximum_mapping_quality_approx(v, &max_idx); +} + +double GSSWAligner::score_to_unnormalized_likelihood_ln(double score) const { + // Log base needs to be set, or this can't work. + assert(log_base != 0); + // Likelihood is proportional to e^(lambda * score), so ln is just the exponent. + return log_base * score; +} + +size_t GSSWAligner::longest_detectable_gap(const Alignment& alignment, const string::const_iterator& read_pos) const { + return longest_detectable_gap(alignment.sequence().size(), read_pos - alignment.sequence().begin()); +} + +size_t GSSWAligner::longest_detectable_gap(size_t read_length, size_t read_pos) const { + // algebraic solution for when score is > 0 assuming perfect match other than gap + assert(read_length >= read_pos); + int64_t overhang_length = min(read_pos, read_length - read_pos); + int64_t numer = match * overhang_length + full_length_bonus; + int64_t gap_length = (numer - gap_open) / gap_extension + 1; + return gap_length >= 0 && overhang_length > 0 ? gap_length : 0; +} + +size_t GSSWAligner::longest_detectable_gap(const Alignment& alignment) const { + // longest detectable gap across entire read is in the middle + return longest_detectable_gap(alignment.sequence().size(), alignment.sequence().size() / 2); +} + +size_t GSSWAligner::longest_detectable_gap(size_t read_length) const { + return longest_detectable_gap(read_length, read_length / 2); +} + +int32_t GSSWAligner::score_discontiguous_alignment(const Alignment& aln, const function& estimate_distance, + bool strip_bonuses) const { + + int score = 0; + int read_offset = 0; + auto& path = aln.path(); + + // We keep track of whether the last edit was a deletion for coalescing + // adjacent deletions across node boundaries + bool last_was_deletion = false; + + for (int i = 0; i < path.mapping_size(); ++i) { + // For each mapping + auto& mapping = path.mapping(i); + for (int j = 0; j < mapping.edit_size(); ++j) { + // For each edit in the mapping + auto& edit = mapping.edit(j); + + // Score the edit according to its type + if (edit_is_match(edit)) { + score += score_exact_match(aln, read_offset, edit.to_length()); + last_was_deletion = false; + } else if (edit_is_sub(edit)) { + score += score_mismatch(aln.sequence().begin() + read_offset, + aln.sequence().begin() + read_offset + edit.to_length(), + aln.quality().begin() + read_offset); + last_was_deletion = false; + } else if (edit_is_deletion(edit)) { + if (last_was_deletion) { + // No need to charge a gap open + score -= edit.from_length() * gap_extension; + } else { + // We need a gap open + score -= edit.from_length() ? gap_open + (edit.from_length() - 1) * gap_extension : 0; + } + + if (edit.from_length()) { + // We already charged a gap open + last_was_deletion = true; + } + // If there's a 0-length deletion, leave the last_was_deletion flag unchanged. + } else if (edit_is_insertion(edit) && !((i == 0 && j == 0) || + (i == path.mapping_size()-1 && j == mapping.edit_size()-1))) { + // todo how do we score this qual adjusted? + score -= edit.to_length() ? gap_open + (edit.to_length() - 1) * gap_extension : 0; + last_was_deletion = false; + // No need to track if the last edit was an insertion because + // insertions will be all together in a single edit at a point. + } else { + // Edit has no score effect. Probably a softclip. 
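// A small sketch of the affine gap convention this scorer uses, assuming a gap of
// length L is charged gap_open once plus gap_extension for each base after the first;
// the function name and the example values are illustrative only.
#include <cstddef>
#include <cstdint>

int32_t score_gap_sketch(size_t gap_length, int32_t gap_open, int32_t gap_extension) {
    // zero-length gaps cost nothing; otherwise open once, then extend
    if (gap_length == 0) {
        return 0;
    }
    return -(gap_open + (static_cast<int32_t>(gap_length) - 1) * gap_extension);
}

// Coalescing adjacent deletions across node boundaries preserves this convention:
// a 3 bp deletion split into 2 bp + 1 bp is charged one open and two extensions,
// e.g. score_gap_sketch(3, 6, 1) == -8 == score_gap_sketch(2, 6, 1) - 1 * 1.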
+ last_was_deletion = false; + } + read_offset += edit.to_length(); + } + // score any intervening gaps in mappings using approximate distances + if (i+1 < path.mapping_size()) { + // what is the distance between the last position of this mapping + // and the first of the next + Position last_pos = mapping.position(); + last_pos.set_offset(last_pos.offset() + mapping_from_length(mapping)); + Position next_pos = path.mapping(i+1).position(); + // Estimate the distance + int dist = estimate_distance(make_pos_t(last_pos), make_pos_t(next_pos), aln.sequence().size()); + if (dist > 0) { + // If it's nonzero, score it as a deletion gap + score -= gap_open + (dist - 1) * gap_extension; + } + } + } + + if (!strip_bonuses) { + // We should report any bonuses used in the DP in the final score + if (!softclip_start(aln)) { + score += score_full_length_bonus(true, aln); + } + if (!softclip_end(aln)) { + score += score_full_length_bonus(false, aln); + } + } + + return score; +} + +int32_t GSSWAligner::score_contiguous_alignment(const Alignment& aln, bool strip_bonuses) const { + return score_discontiguous_alignment(aln, [](pos_t, pos_t, size_t){return (size_t) 0;}, strip_bonuses); +} + +int32_t GSSWAligner::remove_bonuses(const Alignment& aln, bool pinned, bool pin_left) const { + int32_t score = aln.score(); + if (softclip_start(aln) == 0 && !(pinned && pin_left)) { + // No softclip at the start, and a left end bonus was applied. + score -= score_full_length_bonus(true, aln); + } + if (softclip_end(aln) == 0 && !(pinned && !pin_left)) { + // No softclip at the end, and a right end bonus was applied. + score -= score_full_length_bonus(false, aln); + } + return score; +} + +Aligner::Aligner(const int8_t* _score_matrix, + int8_t _gap_open, + int8_t _gap_extension, + int8_t _full_length_bonus, + double _gc_content) + : GSSWAligner(_score_matrix, _gap_open, _gap_extension, _full_length_bonus, _gc_content) +{ + + // add in the 5th row and column of 0s for N matches like GSSW wants + score_matrix = (int8_t*) malloc(sizeof(int8_t) * 25); + for (size_t i = 0, j = 0; i < 25; ++i) { + if (i % 5 == 4 || i / 5 == 4) { + score_matrix[i] = 0; + } + else { + score_matrix[i] = _score_matrix[j]; + ++j; + } + } + + // make an XdropAligner for each thread + int num_threads = get_thread_count(); + xdrops.reserve(num_threads); + for (size_t i = 0; i < num_threads; ++i) { + xdrops.emplace_back(_score_matrix, _gap_open, _gap_extension); + } +} + +void Aligner::align_internal(Alignment& alignment, vector* multi_alignments, const HandleGraph& g, + bool pinned, bool pin_left,int32_t max_alt_alns, bool traceback_aln) const { + // bench_start(bench); + // check input integrity + if (pin_left && !pinned) { + cerr << "error:[Aligner] cannot choose pinned end in non-pinned alignment" << endl; + exit(EXIT_FAILURE); + } + if (multi_alignments && !pinned) { + cerr << "error:[Aligner] multiple traceback is not implemented in local alignment, only pinned and global" << endl; + exit(EXIT_FAILURE); + } + if (!multi_alignments && max_alt_alns != 1) { + cerr << "error:[Aligner] cannot specify maximum number of alignments in single alignment" << endl; + exit(EXIT_FAILURE); + } + if (max_alt_alns <= 0) { + cerr << "error:[Aligner] cannot do less than 1 alignment" << endl; + exit(EXIT_FAILURE); + } + + // alignment pinning algorithm is based on pinning in bottom right corner, if pinning in top + // left we need to reverse all the sequences first and translate the alignment back later + + // make a place to reverse the graph and sequence if 
necessary + ReverseGraph reversed_graph(&g, false); + string reversed_sequence; + + // choose forward or reversed objects + const HandleGraph* oriented_graph = &g; + const string* align_sequence = &alignment.sequence(); + if (pin_left) { + // choose the reversed graph + oriented_graph = &reversed_graph; + + // make and assign the reversed sequence + reversed_sequence.resize(align_sequence->size()); + reverse_copy(align_sequence->begin(), align_sequence->end(), reversed_sequence.begin()); + align_sequence = &reversed_sequence; + } + + // to save compute, we won't make these unless we're doing pinning + unordered_set pinning_ids; + NullMaskingGraph* null_masked_graph = nullptr; + const HandleGraph* align_graph = oriented_graph; + if (pinned) { + pinning_ids = identify_pinning_points(*oriented_graph); + null_masked_graph = new NullMaskingGraph(oriented_graph); + align_graph = null_masked_graph; + } + + // convert into gssw graph + gssw_graph* graph = create_gssw_graph(*align_graph); + + // perform dynamic programming + gssw_graph_fill_pinned(graph, align_sequence->c_str(), + nt_table, score_matrix, + gap_open, gap_extension, full_length_bonus, + pinned ? 0 : full_length_bonus, 15, 2, traceback_aln); + + // traceback either from pinned position or optimal local alignment + if (traceback_aln) { + if (pinned) { + // we can only run gssw's DP on non-empty graphs, but we may have masked the entire graph + // if it consists of only empty nodes, so don't both with the DP in that case + gssw_graph_mapping** gms = nullptr; + if (align_graph->get_node_count() > 0) { + gssw_node** pinning_nodes = (gssw_node**) malloc(pinning_ids.size() * sizeof(gssw_node*)); + size_t j = 0; + for (size_t i = 0; i < graph->size; i++) { + gssw_node* node = graph->nodes[i]; + if (pinning_ids.count(node->id)) { + pinning_nodes[j] = node; + j++; + } + } + + // trace back pinned alignment + gms = gssw_graph_trace_back_pinned_multi (graph, + max_alt_alns, + true, + align_sequence->c_str(), + align_sequence->size(), + pinning_nodes, + pinning_ids.size(), + nt_table, + score_matrix, + gap_open, + gap_extension, + full_length_bonus, + 0); + + free(pinning_nodes); + } + + // did we both 1) do DP (i.e. the graph is non-empty), and 2) find a traceback with positive score? + if (gms ? 
gms[0]->score > 0 : false) { + + if (pin_left) { + // translate nodes and mappings into original sequence so that the cigars come out right + unreverse_graph(graph); + for (int32_t i = 0; i < max_alt_alns; i++) { + unreverse_graph_mapping(gms[i]); + } + } + + // have a mapping, can just convert normally + gssw_mapping_to_alignment(graph, gms[0], alignment, pinned, pin_left); + + if (multi_alignments) { + // determine how many non-null alignments were returned + int32_t num_non_null = max_alt_alns; + for (int32_t i = 1; i < max_alt_alns; i++) { + if (gms[i]->score <= 0) { + num_non_null = i; + break; + } + } + + // reserve to avoid illegal access errors that occur when the vector reallocates + multi_alignments->reserve(num_non_null); + + // copy the primary alignment + multi_alignments->emplace_back(alignment); + + // convert the alternate alignments and store them at the back of the vector (this will not + // execute if we are doing single alignment) + for (int32_t i = 1; i < num_non_null; i++) { + // make new alignment object + multi_alignments->emplace_back(); + Alignment& next_alignment = multi_alignments->back(); + + // copy over sequence information from the primary alignment + next_alignment.set_sequence(alignment.sequence()); + next_alignment.set_quality(alignment.quality()); + + // get path of the alternate alignment + gssw_mapping_to_alignment(graph, gms[i], next_alignment, pinned, pin_left); + } + } + } + else if (g.get_node_count() > 0) { + // we didn't get any alignments either because the graph was empty and we couldn't run + // gssw DP or because they had score 0 and gssw didn't want to do traceback. however, + // we can infer the location of softclips based on the pinning nodes, so we'll just make + // those manually + + // find the sink nodes of the oriented graph, which may be empty + auto pinning_points = handlealgs::tail_nodes(oriented_graph); + // impose a consistent ordering for machine independent behavior + sort(pinning_points.begin(), pinning_points.end(), [&](const handle_t& h1, const handle_t& h2) { + return oriented_graph->get_id(h1) < oriented_graph->get_id(h2); + }); + + for (size_t i = 0; i < max_alt_alns && i < pinning_points.size(); i++) { + // make a record in the multi alignments if we're using them + if (multi_alignments) { + multi_alignments->emplace_back(); + } + // choose an alignment object to construct the path in + Alignment& softclip_alignment = i == 0 ? alignment : multi_alignments->back(); + + handle_t& pinning_point = pinning_points[i]; + + Mapping* mapping = alignment.mutable_path()->add_mapping(); + mapping->set_rank(1); + + // locate at the beginning or end of the node + Position* position = mapping->mutable_position(); + position->set_node_id(oriented_graph->get_id(pinning_point)); + position->set_offset(pin_left ? 
0 : oriented_graph->get_length(pinning_point)); + + // soft clip + Edit* edit = mapping->add_edit(); + edit->set_to_length(alignment.sequence().length()); + edit->set_sequence(alignment.sequence()); + + // we want to also have the first alignment in the multi-alignment vector + if (i == 0 && multi_alignments) { + multi_alignments->back() = alignment; + } + } + } + if (gms) { + for (int32_t i = 0; i < max_alt_alns; i++) { + gssw_graph_mapping_destroy(gms[i]); + } + free(gms); + } + } + else { + // trace back local alignment + gssw_graph_mapping* gm = gssw_graph_trace_back (graph, + align_sequence->c_str(), + align_sequence->size(), + nt_table, + score_matrix, + gap_open, + gap_extension, + full_length_bonus, + full_length_bonus); + + gssw_mapping_to_alignment(graph, gm, alignment, pinned, pin_left); + gssw_graph_mapping_destroy(gm); + } + } else { + // get the alignment position and score + alignment.set_score(graph->max_node->alignment->score1); + Mapping* m = alignment.mutable_path()->add_mapping(); + Position* p = m->mutable_position(); + p->set_node_id(graph->max_node->id); + p->set_offset(graph->max_node->alignment->ref_end1); // mark end position; for de-duplication + } + + // this might be null if we're not doing pinned alignment, but delete doesn't care + delete null_masked_graph; + + gssw_graph_destroy(graph); + // bench_end(bench); +} + +void Aligner::align(Alignment& alignment, const HandleGraph& g, bool traceback_aln) const { + + align_internal(alignment, nullptr, g, false, false, 1, traceback_aln); +} + +void Aligner::align(Alignment& alignment, const HandleGraph& g, + const std::vector& topological_order) const { + + // Create a gssw_graph and a mapping from handles to nodes. + gssw_graph* graph = gssw_graph_create(topological_order.size()); + hash_map nodes; + nodes.reserve(topological_order.size()); + + // Create the nodes. Use offsets in the topological order as node ids. + for (size_t i = 0; i < topological_order.size(); i++) { + handle_t handle = topological_order[i]; + auto cleaned_seq = nonATGCNtoN(g.get_sequence(handle)); + gssw_node* node = gssw_node_create(nullptr, + i, + cleaned_seq.c_str(), + nt_table, + score_matrix); + nodes[handle] = node; + gssw_graph_add_node(graph, node); + } + + // Create the edges. + for (const handle_t& from : topological_order) { + gssw_node* from_node = nodes[from]; + g.follow_edges(from, false, [&](const handle_t& to) { + auto iter = nodes.find(to); + if (iter != nodes.end()) { + gssw_nodes_add_edge(from_node, iter->second); + } + }); + } + + // Align the read to the subgraph. + gssw_graph_fill_pinned(graph, alignment.sequence().c_str(), + nt_table, score_matrix, + gap_open, gap_extension, full_length_bonus, full_length_bonus, + 15, 2, true); + gssw_graph_mapping* gm = gssw_graph_trace_back(graph, + alignment.sequence().c_str(), alignment.sequence().length(), + nt_table, score_matrix, + gap_open, gap_extension, full_length_bonus, full_length_bonus); + + // Convert the mapping to Alignment. + this->gssw_mapping_to_alignment(graph, gm, alignment, false, false); + Path& path = *(alignment.mutable_path()); + for (size_t i = 0; i < path.mapping_size(); i++) { + Position& pos = *(path.mutable_mapping(i)->mutable_position()); + handle_t handle = topological_order[pos.node_id()]; + pos.set_node_id(g.get_id(handle)); + pos.set_is_reverse(g.get_is_reverse(handle)); + } + + // Destroy the temporary objects. 
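// A sketch of the id translation step above: while aligning, nodes are numbered by their
// offset in the topological order, so each path position is mapped back to the original
// node id and orientation afterwards. The types below are simplified stand-ins for the
// Protobuf Position and the handle graph.
#include <cstdint>
#include <utility>
#include <vector>

struct SimplePosition {
    int64_t node_id;   // during alignment, an offset into the topological order
    bool is_reverse;
};

void translate_positions_back(std::vector<SimplePosition>& path,
                              const std::vector<std::pair<int64_t, bool>>& order) {
    // order[i] records the original node id and orientation of the i-th handle in the
    // topological order that was used to build the temporary alignment graph
    for (auto& pos : path) {
        const auto& original = order[pos.node_id];
        pos.node_id = original.first;
        pos.is_reverse = original.second;
    }
}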
+ gssw_graph_mapping_destroy(gm); + gssw_graph_destroy(graph); +} + +void Aligner::align_pinned(Alignment& alignment, const HandleGraph& g, bool pin_left, bool xdrop, + uint16_t xdrop_max_gap_length) const { + + if (xdrop) { + // XdropAligner manages its own stack, so it can never be threadsafe without be recreated + // for every alignment, which meshes poorly with its stack implementation. We achieve + // thread-safety by having one per thread, which makes this method const-ish. + XdropAligner& xdrop = const_cast(xdrops[omp_get_thread_num()]); + + // dozeu declines to produce an alignment when the gap is set to 0 + xdrop_max_gap_length = max(xdrop_max_gap_length, 1); + + // wrap the graph so that empty pinning points are handled correctly + DozeuPinningOverlay overlay(&g, !pin_left); + + if (overlay.get_node_count() == 0 && g.get_node_count() != 0) { + // the only nodes in the graph are empty nodes for pinning, which got masked. + // we can still infer a pinned alignment based purely on the pinning point but + // dozeu won't handle this correctly + g.for_each_handle([&](const handle_t& handle) { + bool can_pin = g.follow_edges(handle, pin_left, [&](const handle_t& next) {return false;}); + if (can_pin) { + // manually make the softclip + Mapping* mapping = alignment.mutable_path()->add_mapping(); + Position* pos = mapping->mutable_position(); + pos->set_node_id(g.get_id(handle)); + pos->set_is_reverse(false); + pos->set_offset(pin_left ? 0 : g.get_length(handle)); + + mapping->set_rank(1); + + Edit* edit = mapping->add_edit(); + edit->set_from_length(0); + edit->set_to_length(alignment.sequence().size()); + edit->set_sequence(alignment.sequence()); + alignment.set_score(0); + return false; + } + return true; + }); + } + else { + // do the alignment + xdrop.align_pinned(alignment, overlay, pin_left, full_length_bonus, xdrop_max_gap_length); + + if (overlay.performed_duplications()) { + // the overlay is not a strict subset of the underlying graph, so we may + // need to translate some node IDs + translate_oriented_node_ids(*alignment.mutable_path(), [&](id_t node_id) { + handle_t under = overlay.get_underlying_handle(overlay.get_handle(node_id)); + return make_pair(g.get_id(under), g.get_is_reverse(under)); + }); + } + } + } + else { + align_internal(alignment, nullptr, g, true, pin_left, 1, true); + } +} + +void Aligner::align_pinned_multi(Alignment& alignment, vector& alt_alignments, const HandleGraph& g, + bool pin_left, int32_t max_alt_alns) const { + + if (alt_alignments.size() != 0) { + cerr << "error:[Aligner::align_pinned_multi] output vector must be empty for pinned multi-aligning" << endl; + exit(EXIT_FAILURE); + } + + align_internal(alignment, &alt_alignments, g, true, pin_left, max_alt_alns, true); +} + +void Aligner::align_global_banded(Alignment& alignment, const HandleGraph& g, + int32_t band_padding, bool permissive_banding) const { + + if (alignment.sequence().empty()) { + // we can save time by using a specialized deletion aligner for empty strings + deletion_aligner.align(alignment, g); + return; + } + + // We need to figure out what size ints we need to use. + // Get upper and lower bounds on the scores. 
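// A sketch of the width-selection logic the banded global aligners use: bound the best and
// worst possible cell values for this read and graph, then pick the narrowest integer type
// that holds both. The enum and bounds here are illustrative; the real code instantiates a
// templated banded aligner for the chosen width.
#include <algorithm>
#include <cstdint>
#include <limits>

enum class CellWidth { Int8, Int16, Int32, Int64 };

CellWidth pick_cell_width(int64_t read_length, int64_t total_graph_bases,
                          int64_t match, int64_t mismatch,
                          int64_t gap_open, int64_t gap_extension) {
    // best case: every read base is a match
    int64_t best = read_length * match;
    // worst case: every read and graph base incurs the largest penalty
    int64_t worst = -(read_length + total_graph_bases)
                    * std::max({mismatch, gap_open, gap_extension});
    auto fits = [&](int64_t lo, int64_t hi) { return best <= hi && worst >= lo; };
    if (fits(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max())) {
        return CellWidth::Int8;
    }
    if (fits(std::numeric_limits<int16_t>::min(), std::numeric_limits<int16_t>::max())) {
        return CellWidth::Int16;
    }
    if (fits(std::numeric_limits<int32_t>::min(), std::numeric_limits<int32_t>::max())) {
        return CellWidth::Int32;
    }
    return CellWidth::Int64;
}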
TODO: if these overflow int64 we're out of luck + int64_t best_score = alignment.sequence().size() * match; + size_t total_bases = 0; + g.for_each_handle([&](const handle_t& handle) { + total_bases += g.get_length(handle); + }); + int64_t worst_score = (alignment.sequence().size() + total_bases) * -max(max(mismatch, gap_open), gap_extension); + + // TODO: put this all into another template somehow? + + if (best_score <= numeric_limits::max() && worst_score >= numeric_limits::min()) { + // We'll fit in int8 + BandedGlobalAligner band_graph(alignment, + g, + band_padding, + permissive_banding, + false); + + band_graph.align(score_matrix, nt_table, gap_open, gap_extension); + } else if (best_score <= numeric_limits::max() && worst_score >= numeric_limits::min()) { + // We'll fit in int16 + BandedGlobalAligner band_graph(alignment, + g, + band_padding, + permissive_banding, + false); + + band_graph.align(score_matrix, nt_table, gap_open, gap_extension); + } else if (best_score <= numeric_limits::max() && worst_score >= numeric_limits::min()) { + // We'll fit in int32 + BandedGlobalAligner band_graph(alignment, + g, + band_padding, + permissive_banding, + false); + + band_graph.align(score_matrix, nt_table, gap_open, gap_extension); + } else { + // Fall back to int64 + BandedGlobalAligner band_graph(alignment, + g, + band_padding, + permissive_banding, + false); + + band_graph.align(score_matrix, nt_table, gap_open, gap_extension); + } +} + +void Aligner::align_global_banded_multi(Alignment& alignment, vector& alt_alignments, const HandleGraph& g, + int32_t max_alt_alns, int32_t band_padding, bool permissive_banding) const { + + if (alignment.sequence().empty()) { + // we can save time by using a specialized deletion aligner for empty strings + deletion_aligner.align_multi(alignment, alt_alignments, g, max_alt_alns); + return; + } + + // We need to figure out what size ints we need to use. + // Get upper and lower bounds on the scores. 
TODO: if these overflow int64 we're out of luck + int64_t best_score = alignment.sequence().size() * match; + size_t total_bases = 0; + g.for_each_handle([&](const handle_t& handle) { + total_bases += g.get_length(handle); + }); + int64_t worst_score = (alignment.sequence().size() + total_bases) * -max(max(mismatch, gap_open), gap_extension); + + if (best_score <= numeric_limits::max() && worst_score >= numeric_limits::min()) { + // We'll fit in int8 + BandedGlobalAligner band_graph(alignment, + g, + alt_alignments, + max_alt_alns, + band_padding, + permissive_banding, + false); + + band_graph.align(score_matrix, nt_table, gap_open, gap_extension); + } else if (best_score <= numeric_limits::max() && worst_score >= numeric_limits::min()) { + // We'll fit in int16 + BandedGlobalAligner band_graph(alignment, + g, + alt_alignments, + max_alt_alns, + band_padding, + permissive_banding, + false); + + band_graph.align(score_matrix, nt_table, gap_open, gap_extension); + } else if (best_score <= numeric_limits::max() && worst_score >= numeric_limits::min()) { + // We'll fit in int32 + BandedGlobalAligner band_graph(alignment, + g, + alt_alignments, + max_alt_alns, + band_padding, + permissive_banding, + false); + + band_graph.align(score_matrix, nt_table, gap_open, gap_extension); + } else { + // Fall back to int64 + BandedGlobalAligner band_graph(alignment, + g, + alt_alignments, + max_alt_alns, + band_padding, + permissive_banding, + false); + + band_graph.align(score_matrix, nt_table, gap_open, gap_extension); + } +} + +void Aligner::align_xdrop(Alignment& alignment, const HandleGraph& g, const vector& mems, + bool reverse_complemented, uint16_t max_gap_length) const +{ + align_xdrop(alignment, g, handlealgs::lazier_topological_order(&g), mems, reverse_complemented, + max_gap_length); +} + +void Aligner::align_xdrop(Alignment& alignment, const HandleGraph& g, const vector& order, + const vector& mems, bool reverse_complemented, uint16_t max_gap_length) const +{ + // XdropAligner manages its own stack, so it can never be threadsafe without be recreated + // for every alignment, which meshes poorly with its stack implementation. We achieve + // thread-safety by having one per thread, which makes this method const-ish. + XdropAligner& xdrop = const_cast(xdrops[omp_get_thread_num()]); + xdrop.align(alignment, g, order, mems, reverse_complemented, full_length_bonus, max_gap_length); + if (!alignment.has_path() && mems.empty()) { + // dozeu couldn't find an alignment, probably because it's seeding heuristic failed + // we'll just fall back on GSSW + // TODO: This is a bit inconsistent. GSSW gives a full-length bonus at both ends, while + // dozeu only gives it once. 
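// A sketch of the one-instance-per-thread pattern described above, assuming OpenMP worker
// threads and a stateful aligner that is cheap to keep around but unsafe to share;
// WorkerState is a stand-in for the per-thread xdrop/dozeu state.
#include <omp.h>
#include <vector>

struct WorkerState {
    std::vector<char> scratch;   // mutable per-alignment scratch space
};

class PerThreadPool {
public:
    PerThreadPool() : workers(omp_get_max_threads()) {}

    // each OpenMP thread only ever touches its own slot, so no locking is needed and
    // callers can treat the pool as logically const
    WorkerState& local() {
        return workers[omp_get_thread_num()];
    }

private:
    std::vector<WorkerState> workers;
};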
+ align(alignment, g, order); + } +} + + +// Scoring an exact match is very simple in an ordinary Aligner + +int32_t Aligner::score_exact_match(const Alignment& aln, size_t read_offset, size_t length) const { + return match * length; +} + +int32_t Aligner::score_exact_match(const string& sequence) const { + return match * sequence.length(); +} + +int32_t Aligner::score_exact_match(string::const_iterator seq_begin, string::const_iterator seq_end) const { + return match * (seq_end - seq_begin); +} + +int32_t Aligner::score_exact_match(const string& sequence, const string& base_quality) const { + return score_exact_match(sequence); +} + +int32_t Aligner::score_exact_match(string::const_iterator seq_begin, string::const_iterator seq_end, + string::const_iterator base_qual_begin) const { + return score_exact_match(seq_begin, seq_end); +} + +int32_t Aligner::score_mismatch(string::const_iterator seq_begin, string::const_iterator seq_end, + string::const_iterator base_qual_begin) const { + return -mismatch * (seq_end - seq_begin); +} + +int32_t Aligner::score_mismatch(size_t length) const { + return -match * length; +} + +int32_t Aligner::score_full_length_bonus(bool left_side, string::const_iterator seq_begin, + string::const_iterator seq_end, + string::const_iterator base_qual_begin) const { + return full_length_bonus; +} + +int32_t Aligner::score_full_length_bonus(bool left_side, const Alignment& alignment) const { + return full_length_bonus; +} + +int32_t Aligner::score_partial_alignment(const Alignment& alignment, const HandleGraph& graph, const path_t& path, + string::const_iterator seq_begin, bool no_read_end_scoring) const { + + int32_t score = 0; + string::const_iterator read_pos = seq_begin; + bool in_deletion = false; + for (size_t i = 0; i < path.mapping_size(); i++) { + const auto& mapping = path.mapping(i); + + for (size_t j = 0; j < mapping.edit_size(); j++) { + const auto& edit = mapping.edit(j); + + if (edit.from_length() > 0) { + if (edit.to_length() > 0) { + if (edit.sequence().empty()) { + // match + score += match * edit.from_length(); + } + else { + // mismatch + score -= mismatch * edit.from_length(); + } + + // apply full length bonus + if (read_pos == alignment.sequence().begin() && !no_read_end_scoring) { + score += score_full_length_bonus(true, alignment); + } + if (read_pos + edit.to_length() == alignment.sequence().end() + && !no_read_end_scoring) { + score += score_full_length_bonus(false, alignment); + } + in_deletion = false; + } + else if (in_deletion) { + score -= edit.from_length() * gap_extension; + } + else { + // deletion + score -= gap_open + (edit.from_length() - 1) * gap_extension; + in_deletion = true; + } + } + else if (edit.to_length() > 0) { + // don't score soft clips if scoring read ends + if (no_read_end_scoring || + (read_pos != alignment.sequence().begin() && + read_pos + edit.to_length() != alignment.sequence().end())) { + // insert + score -= gap_open + (edit.to_length() - 1) * gap_extension; + } + in_deletion = false; + } + + read_pos += edit.to_length(); + } + } + return score; +} + +QualAdjAligner::QualAdjAligner(const int8_t* _score_matrix, + int8_t _gap_open, + int8_t _gap_extension, + int8_t _full_length_bonus, + double _gc_content) + : GSSWAligner(_score_matrix, _gap_open, _gap_extension, _full_length_bonus, _gc_content) +{ + // TODO: this interface could really be improved in GSSW, oh well though + + // find the quality-adjusted scores + uint32_t max_base_qual = 255; + + // add in the 0s to the 5-th row and column for Ns + score_matrix = 
qual_adjusted_matrix(_score_matrix, _gc_content, max_base_qual); + + // compute the quality adjusted full length bonuses + qual_adj_full_length_bonuses = qual_adjusted_bonuses(_full_length_bonus, max_base_qual); + + // make a QualAdjXdropAligner for each thread + int num_threads = get_thread_count(); + xdrops.reserve(num_threads); + for (size_t i = 0; i < num_threads; ++i) { + xdrops.emplace_back(_score_matrix, score_matrix, _gap_open, _gap_extension); + } +} + +QualAdjAligner::~QualAdjAligner() { + free(qual_adj_full_length_bonuses); +} + +int8_t* QualAdjAligner::qual_adjusted_matrix(const int8_t* _score_matrix, double gc_content, uint32_t max_qual) const { + + // TODO: duplicative with GSSWAligner() + double* nt_freqs = (double*) malloc(sizeof(double) * 4); + nt_freqs[0] = 0.5 * (1 - gc_content); + nt_freqs[1] = 0.5 * gc_content; + nt_freqs[2] = 0.5 * gc_content; + nt_freqs[3] = 0.5 * (1 - gc_content); + + // recover the emission probabilities of the align state of the HMM + double* align_prob = (double*) malloc(sizeof(double) * 16); + + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + align_prob[i * 4 + j] = (exp(log_base * _score_matrix[i * 4 + j]) + * nt_freqs[i] * nt_freqs[j]); + } + } + + // compute the sum of the emission probabilities under a base error + double* align_complement_prob = (double*) malloc(sizeof(double) * 16); + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + align_complement_prob[i * 4 + j] = 0.0; + for (int k = 0; k < 4; k++) { + if (k != j) { + align_complement_prob[i * 4 + j] += align_prob[i * 4 + k]; + } + } + } + } + + // quality score of random guessing + int lowest_meaningful_qual = ceil(-10.0 * log10(0.75)); + + // compute the adjusted alignment scores for each quality level + int8_t* qual_adj_mat = (int8_t*) malloc(25 * (max_qual + 1) * sizeof(int8_t)); + for (int q = 0; q <= max_qual; q++) { + double err = pow(10.0, -q / 10.0); + for (int i = 0; i < 5; i++) { + for (int j = 0; j < 5; j++) { + int8_t score; + if (i == 4 || j == 4 || q < lowest_meaningful_qual) { + score = 0; + } + else { + score = round(log(((1.0 - err) * align_prob[i * 4 + j] + (err / 3.0) * align_complement_prob[i * 4 + j]) + / (nt_freqs[i] * ((1.0 - err) * nt_freqs[j] + (err / 3.0) * (1.0 - nt_freqs[j])))) / log_base); + } + qual_adj_mat[q * 25 + i * 5 + j] = round(score); + } + } + } + + free(align_complement_prob); + free(align_prob); + free(nt_freqs); + + return qual_adj_mat; +} + +int8_t* QualAdjAligner::qual_adjusted_bonuses(int8_t _full_length_bonus, uint32_t max_qual) const { + + + double p_full_len = exp(log_base * _full_length_bonus) / (1.0 + exp(log_base * _full_length_bonus)); + + int8_t* qual_adj_bonuses = (int8_t*) calloc(max_qual + 1, sizeof(int8_t)); + + int lowest_meaningful_qual = ceil(-10.0 * log10(0.75)); + // hack because i want the minimum qual value from illumina (2) to have zero score, but phred + // values are spaced out in a way to approximate this singularity well + ++lowest_meaningful_qual; + + for (int q = lowest_meaningful_qual; q <= max_qual; ++q) { + double err = pow(10.0, -q / 10.0); + double score = log(((1.0 - err * 4.0 / 3.0) * p_full_len + (err * 4.0 / 3.0) * (1.0 - p_full_len)) / (1.0 - p_full_len)) / log_base; + qual_adj_bonuses[q] = round(score); + } + + return qual_adj_bonuses; +} + +void QualAdjAligner::align_internal(Alignment& alignment, vector* multi_alignments, const HandleGraph& g, + bool pinned, bool pin_left, int32_t max_alt_alns, bool traceback_aln) const { + + // check input integrity + if (pin_left && 
!pinned) { + cerr << "error:[Aligner] cannot choose pinned end in non-pinned alignment" << endl; + exit(EXIT_FAILURE); + } + if (multi_alignments && !pinned) { + cerr << "error:[Aligner] multiple traceback is not implemented in local alignment, only pinned and global" << endl; + exit(EXIT_FAILURE); + } + if (!multi_alignments && max_alt_alns != 1) { + cerr << "error:[Aligner] cannot specify maximum number of alignments in single alignment" << endl; + exit(EXIT_FAILURE); + } + if (max_alt_alns <= 0) { + cerr << "error:[Aligner] cannot do less than 1 alignment" << endl; + exit(EXIT_FAILURE); + } + + // alignment pinning algorithm is based on pinning in bottom right corner, if pinning in top + // left we need to reverse all the sequences first and translate the alignment back later + + // make a place to reverse the graph and sequence if necessary + ReverseGraph reversed_graph(&g, false); + string reversed_sequence; + string reversed_quality; + + // choose forward or reversed objects + const HandleGraph* oriented_graph = &g; + const string* align_sequence = &alignment.sequence(); + const string* align_quality = &alignment.quality(); + if (pin_left) { + // choose the reversed graph + oriented_graph = &reversed_graph; + + // make and assign the reversed sequence + reversed_sequence.resize(align_sequence->size()); + reverse_copy(align_sequence->begin(), align_sequence->end(), reversed_sequence.begin()); + align_sequence = &reversed_sequence; + + // make and assign the reversed quality + reversed_quality.resize(align_quality->size()); + reverse_copy(align_quality->begin(), align_quality->end(), reversed_quality.begin()); + align_quality = &reversed_quality; + } + + if (align_quality->size() != align_sequence->size()) { + cerr << "error:[QualAdjAligner] Read " << alignment.name() << " has sequence and quality strings with different lengths. Cannot perform base quality adjusted alignment. Consider toggling off base quality adjusted alignment at the command line." << endl; + exit(EXIT_FAILURE); + } + + // to save compute, we won't make these unless we're doing pinning + unordered_set pinning_ids; + NullMaskingGraph* null_masked_graph = nullptr; + const HandleGraph* align_graph = oriented_graph; + if (pinned) { + pinning_ids = identify_pinning_points(*oriented_graph); + null_masked_graph = new NullMaskingGraph(oriented_graph); + align_graph = null_masked_graph; + } + + // convert into gssw graph + gssw_graph* graph = create_gssw_graph(*align_graph); + + int8_t front_full_length_bonus = qual_adj_full_length_bonuses[align_quality->front()]; + int8_t back_full_length_bonus = qual_adj_full_length_bonuses[align_quality->back()]; + + // perform dynamic programming + // offer a full length bonus on each end, or only on the left if the right end is pinned. + gssw_graph_fill_pinned_qual_adj(graph, align_sequence->c_str(), align_quality->c_str(), + nt_table, score_matrix, + gap_open, gap_extension, + front_full_length_bonus, + pinned ? 
0 : back_full_length_bonus, + 15, 2, traceback_aln); + + // traceback either from pinned position or optimal local alignment + if (traceback_aln) { + if (pinned) { + gssw_graph_mapping** gms = nullptr; + if (align_graph->get_node_count() > 0) { + + gssw_node** pinning_nodes = (gssw_node**) malloc(pinning_ids.size() * sizeof(gssw_node*)); + size_t j = 0; + for (size_t i = 0; i < graph->size; i++) { + gssw_node* node = graph->nodes[i]; + if (pinning_ids.count(node->id)) { + pinning_nodes[j] = node; + j++; + } + } + + // trace back pinned alignment + gms = gssw_graph_trace_back_pinned_qual_adj_multi (graph, + max_alt_alns, + true, + align_sequence->c_str(), + align_quality->c_str(), + align_sequence->size(), + pinning_nodes, + pinning_ids.size(), + nt_table, + score_matrix, + gap_open, + gap_extension, + front_full_length_bonus, + 0); + + free(pinning_nodes); + } + + // did we both 1) do DP (i.e. the graph is non-empty), and 2) find a traceback with positive score? + if (gms && gms[0]->score > 0) { + + if (pin_left) { + // translate graph and mappings into original node space + unreverse_graph(graph); + for (int32_t i = 0; i < max_alt_alns; i++) { + unreverse_graph_mapping(gms[i]); + } + } + + // have a mapping, can just convert normally + gssw_mapping_to_alignment(graph, gms[0], alignment, pinned, pin_left); + + if (multi_alignments) { + // determine how many non-null alignments were returned + int32_t num_non_null = max_alt_alns; + for (int32_t i = 1; i < max_alt_alns; i++) { + if (gms[i]->score <= 0) { + num_non_null = i; + break; + } + } + + // reserve to avoid illegal access errors that occur when the vector reallocates + multi_alignments->reserve(num_non_null); + + // copy the primary alignment + multi_alignments->emplace_back(alignment); + + // convert the alternate alignments and store them at the back of the vector (this will not + // execute if we are doing single alignment) + for (int32_t i = 1; i < num_non_null; i++) { + // make new alignment object + multi_alignments->emplace_back(); + Alignment& next_alignment = multi_alignments->back(); + + // copy over sequence information from the primary alignment + next_alignment.set_sequence(alignment.sequence()); + next_alignment.set_quality(alignment.quality()); + + // get path of the alternate alignment + gssw_mapping_to_alignment(graph, gms[i], next_alignment, pinned, pin_left); + } + } + } + else if (g.get_node_count() > 0) { + /// we didn't get any alignments either because the graph was empty and we couldn't run + // gssw DP or because they had score 0 and gssw didn't want to do traceback. however, + // we can infer the location of softclips based on the pinning nodes, so we'll just make + // those manually + + // find the sink nodes of the oriented graph, which may be empty + auto pinning_points = handlealgs::tail_nodes(oriented_graph); + // impose a consistent ordering for machine independent behavior + sort(pinning_points.begin(), pinning_points.end(), [&](const handle_t& h1, const handle_t& h2) { + return oriented_graph->get_id(h1) < oriented_graph->get_id(h2); + }); + + for (size_t i = 0; i < max_alt_alns && i < pinning_points.size(); i++) { + // make a record in the multi alignments if we're using them + if (multi_alignments) { + multi_alignments->emplace_back(); + } + // choose an alignment object to construct the path in + Alignment& softclip_alignment = i == 0 ? 
alignment : multi_alignments->back(); + + handle_t& pinning_point = pinning_points[i]; + + Mapping* mapping = alignment.mutable_path()->add_mapping(); + mapping->set_rank(1); + + // locate at the beginning or end of the node + Position* position = mapping->mutable_position(); + position->set_node_id(oriented_graph->get_id(pinning_point)); + position->set_offset(pin_left ? 0 : oriented_graph->get_length(pinning_point)); + + // soft clip + Edit* edit = mapping->add_edit(); + edit->set_to_length(alignment.sequence().length()); + edit->set_sequence(alignment.sequence()); + + // we want to also have the first alignment in the multi-alignment vector + if (i == 0 && multi_alignments) { + multi_alignments->back() = alignment; + } + } + } + + if (gms) { + for (int32_t i = 0; i < max_alt_alns; i++) { + gssw_graph_mapping_destroy(gms[i]); + } + free(gms); + } + } + else { + // trace back local alignment + gssw_graph_mapping* gm = gssw_graph_trace_back_qual_adj (graph, + align_sequence->c_str(), + align_quality->c_str(), + align_sequence->size(), + nt_table, + score_matrix, + gap_open, + gap_extension, + front_full_length_bonus, + back_full_length_bonus); + + gssw_mapping_to_alignment(graph, gm, alignment, pinned, pin_left); + gssw_graph_mapping_destroy(gm); + } + } else { + // get the alignment position and score + alignment.set_score(graph->max_node->alignment->score1); + Mapping* m = alignment.mutable_path()->add_mapping(); + Position* p = m->mutable_position(); + p->set_node_id(graph->max_node->id); + p->set_offset(graph->max_node->alignment->ref_end1); // mark end position; for de-duplication + } + + // this might be null if we're not doing pinned alignment, but delete doesn't care + delete null_masked_graph; + + gssw_graph_destroy(graph); + +} + +void QualAdjAligner::align(Alignment& alignment, const HandleGraph& g, bool traceback_aln) const { + + align_internal(alignment, nullptr, g, false, false, 1, traceback_aln); +} + +void QualAdjAligner::align_pinned(Alignment& alignment, const HandleGraph& g, bool pin_left, bool xdrop, + uint16_t xdrop_max_gap_length) const { + if (xdrop) { + // QualAdjXdropAligner manages its own stack, so it can never be threadsafe without be recreated + // for every alignment, which meshes poorly with its stack implementation. We achieve + // thread-safety by having one per thread, which makes this method const-ish. + QualAdjXdropAligner& xdrop = const_cast(xdrops[omp_get_thread_num()]); + + // wrap the graph so that empty pinning points are handled correctly + DozeuPinningOverlay overlay(&g, !pin_left); + if (overlay.get_node_count() == 0 && g.get_node_count() != 0) { + // the only nodes in the graph are empty nodes for pinning, which got masked. + // we can still infer a pinned alignment based purely on the pinning point but + // dozeu won't handle this correctly + g.for_each_handle([&](const handle_t& handle) { + bool can_pin = g.follow_edges(handle, pin_left, [&](const handle_t& next) {return false;}); + if (can_pin) { + // manually make the softclip + Mapping* mapping = alignment.mutable_path()->add_mapping(); + Position* pos = mapping->mutable_position(); + pos->set_node_id(g.get_id(handle)); + pos->set_is_reverse(false); + pos->set_offset(pin_left ? 
0 : g.get_length(handle)); + + mapping->set_rank(1); + + Edit* edit = mapping->add_edit(); + edit->set_from_length(0); + edit->set_to_length(alignment.sequence().size()); + edit->set_sequence(alignment.sequence()); + alignment.set_score(0); + return false; + } + return true; + }); + } + else { + + // dozeu declines to produce an alignment when the gap is set to 0 + xdrop_max_gap_length = max(xdrop_max_gap_length, 1); + + // get the quality adjusted bonus + int8_t bonus = qual_adj_full_length_bonuses[pin_left ? alignment.quality().back() : alignment.quality().front()]; + + xdrop.align_pinned(alignment, overlay, pin_left, bonus, xdrop_max_gap_length); + + if (overlay.performed_duplications()) { + // the overlay is not a strict subset of the underlying graph, so we may + // need to translate some node IDs + translate_oriented_node_ids(*alignment.mutable_path(), [&](id_t node_id) { + handle_t under = overlay.get_underlying_handle(overlay.get_handle(node_id)); + return make_pair(g.get_id(under), g.get_is_reverse(under)); + }); + } + } + } + else { + align_internal(alignment, nullptr, g, true, pin_left, 1, true); + } +} + +void QualAdjAligner::align_pinned_multi(Alignment& alignment, vector& alt_alignments, const HandleGraph& g, + bool pin_left, int32_t max_alt_alns) const { + align_internal(alignment, &alt_alignments, g, true, pin_left, max_alt_alns, true); +} + +void QualAdjAligner::align_global_banded(Alignment& alignment, const HandleGraph& g, + int32_t band_padding, bool permissive_banding) const { + + if (alignment.sequence().empty()) { + // we can save time by using a specialized deletion aligner for empty strings + deletion_aligner.align(alignment, g); + return; + } + + int64_t best_score = alignment.sequence().size() * match; + size_t total_bases = 0; + g.for_each_handle([&](const handle_t& handle) { + total_bases += g.get_length(handle); + }); + int64_t worst_score = (alignment.sequence().size() + total_bases) * -max(max(mismatch, gap_open), gap_extension); + + // TODO: put this all into another template somehow? 
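// A sketch of how a per-quality score matrix like the one built in the constructor can be
// laid out and queried: one 5x5 block (A, C, G, T, N) per Phred value, so entry
// (qual, ref, read) lives at qual * 25 + ref * 5 + read, and an exact match on base i at
// quality q sits at q * 25 + i * 6. The scores stored here are placeholders, not the
// probabilistic derivation the aligner uses to fill them in.
#include <cmath>
#include <cstdint>
#include <vector>

struct QualMatrixSketch {
    std::vector<int8_t> scores;   // (max_qual + 1) contiguous blocks of 25 entries

    explicit QualMatrixSketch(int max_qual) : scores(25 * (max_qual + 1), 0) {}

    int8_t& at(int qual, int ref_base, int read_base) {
        return scores[qual * 25 + ref_base * 5 + read_base];
    }
};

// Phred quality q encodes an error probability of 10^(-q / 10); at very low quality the
// base carries almost no information, which is why those rows collapse toward zero.
double phred_to_error_prob(int qual) {
    return std::pow(10.0, -qual / 10.0);
}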
+ + if (best_score <= numeric_limits::max() && worst_score >= numeric_limits::min()) { + // We'll fit in int8 + BandedGlobalAligner band_graph(alignment, + g, + band_padding, + permissive_banding, + true); + + band_graph.align(score_matrix, nt_table, gap_open, gap_extension); + } else if (best_score <= numeric_limits::max() && worst_score >= numeric_limits::min()) { + // We'll fit in int16 + BandedGlobalAligner band_graph(alignment, + g, + band_padding, + permissive_banding, + true); + + band_graph.align(score_matrix, nt_table, gap_open, gap_extension); + } else if (best_score <= numeric_limits::max() && worst_score >= numeric_limits::min()) { + // We'll fit in int32 + BandedGlobalAligner band_graph(alignment, + g, + band_padding, + permissive_banding, + true); + + band_graph.align(score_matrix, nt_table, gap_open, gap_extension); + } else { + // Fall back to int64 + BandedGlobalAligner band_graph(alignment, + g, + band_padding, + permissive_banding, + true); + + band_graph.align(score_matrix, nt_table, gap_open, gap_extension); + } +} + +void QualAdjAligner::align_global_banded_multi(Alignment& alignment, vector& alt_alignments, const HandleGraph& g, + int32_t max_alt_alns, int32_t band_padding, bool permissive_banding) const { + + if (alignment.sequence().empty()) { + // we can save time by using a specialized deletion aligner for empty strings + deletion_aligner.align_multi(alignment, alt_alignments, g, max_alt_alns); + return; + } + + // We need to figure out what size ints we need to use. + // Get upper and lower bounds on the scores. TODO: if these overflow int64 we're out of luck + int64_t best_score = alignment.sequence().size() * match; + size_t total_bases = 0; + g.for_each_handle([&](const handle_t& handle) { + total_bases += g.get_length(handle); + }); + int64_t worst_score = (alignment.sequence().size() + total_bases) * -max(max(mismatch, gap_open), gap_extension); + + if (best_score <= numeric_limits::max() && worst_score >= numeric_limits::min()) { + // We'll fit in int8 + BandedGlobalAligner band_graph(alignment, + g, + alt_alignments, + max_alt_alns, + band_padding, + permissive_banding, + true); + + band_graph.align(score_matrix, nt_table, gap_open, gap_extension); + } else if (best_score <= numeric_limits::max() && worst_score >= numeric_limits::min()) { + // We'll fit in int16 + BandedGlobalAligner band_graph(alignment, + g, + alt_alignments, + max_alt_alns, + band_padding, + permissive_banding, + true); + + band_graph.align(score_matrix, nt_table, gap_open, gap_extension); + } else if (best_score <= numeric_limits::max() && worst_score >= numeric_limits::min()) { + // We'll fit in int32 + BandedGlobalAligner band_graph(alignment, + g, + alt_alignments, + max_alt_alns, + band_padding, + permissive_banding, + true); + + band_graph.align(score_matrix, nt_table, gap_open, gap_extension); + } else { + // Fall back to int64 + BandedGlobalAligner band_graph(alignment, + g, + alt_alignments, + max_alt_alns, + band_padding, + permissive_banding, + true); + + band_graph.align(score_matrix, nt_table, gap_open, gap_extension); + } +} + +void QualAdjAligner::align_xdrop(Alignment& alignment, const HandleGraph& g, const vector& mems, + bool reverse_complemented, uint16_t max_gap_length) const +{ + align_xdrop(alignment, g, handlealgs::lazier_topological_order(&g), mems, reverse_complemented, max_gap_length); +} + +void QualAdjAligner::align_xdrop(Alignment& alignment, const HandleGraph& g, const vector& order, + const vector& mems, bool reverse_complemented, + uint16_t 
max_gap_length) const +{ + // QualAdjXdropAligner manages its own stack, so it can never be threadsafe without being recreated + // for every alignment, which meshes poorly with its stack implementation. We achieve + // thread-safety by having one per thread, which makes this method const-ish. + QualAdjXdropAligner& xdrop = const_cast(xdrops[omp_get_thread_num()]); + + // get the quality adjusted bonus + int8_t bonus = qual_adj_full_length_bonuses[reverse_complemented ? alignment.quality().front() : alignment.quality().back()]; + + xdrop.align(alignment, g, order, mems, reverse_complemented, bonus, max_gap_length); + if (!alignment.has_path() && mems.empty()) { + // dozeu couldn't find an alignment, probably because it's seeding heuristic failed + // we'll just fall back on GSSW + // TODO: This is a bit inconsistent. GSSW gives a full-length bonus at both ends, while + // dozeu only gives it once. + align(alignment, g, true); + } +} + +int32_t QualAdjAligner::score_exact_match(const Alignment& aln, size_t read_offset, size_t length) const { + auto& sequence = aln.sequence(); + auto& base_quality = aln.quality(); + int32_t score = 0; + for (int32_t i = 0; i < length; i++) { + // index 5 x 5 score matrices (ACGTN) + // always have match so that row and column index are same and can combine algebraically + score += score_matrix[25 * base_quality[read_offset + i] + 6 * nt_table[sequence[read_offset + i]]]; + } + return score; +} + +int32_t QualAdjAligner::score_exact_match(const string& sequence, const string& base_quality) const { + int32_t score = 0; + for (int32_t i = 0; i < sequence.length(); i++) { + // index 5 x 5 score matrices (ACGTN) + // always have match so that row and column index are same and can combine algebraically + score += score_matrix[25 * base_quality[i] + 6 * nt_table[sequence[i]]]; + } + return score; +} + + +int32_t QualAdjAligner::score_exact_match(string::const_iterator seq_begin, string::const_iterator seq_end, + string::const_iterator base_qual_begin) const { + int32_t score = 0; + for (auto seq_iter = seq_begin, qual_iter = base_qual_begin; seq_iter != seq_end; seq_iter++) { + // index 5 x 5 score matrices (ACGTN) + // always have match so that row and column index are same and can combine algebraically + score += score_matrix[25 * (*qual_iter) + 6 * nt_table[*seq_iter]]; + qual_iter++; + } + return score; +} + +int32_t QualAdjAligner::score_mismatch(string::const_iterator seq_begin, string::const_iterator seq_end, + string::const_iterator base_qual_begin) const { + int32_t score = 0; + for (auto seq_iter = seq_begin, qual_iter = base_qual_begin; seq_iter != seq_end; seq_iter++) { + // index 5 x 5 score matrices (ACGTN) + // always have match so that row and column index are same and can combine algebraically + score += score_matrix[25 * (*qual_iter) + 1]; + qual_iter++; + } + return score; +} + +int32_t QualAdjAligner::score_full_length_bonus(bool left_side, string::const_iterator seq_begin, + string::const_iterator seq_end, + string::const_iterator base_qual_begin) const { + if (seq_begin != seq_end) { + return qual_adj_full_length_bonuses[left_side ? 
*base_qual_begin : *(base_qual_begin + (seq_end - seq_begin) - 1)]; + } + else { + return 0; + } +} + +int32_t QualAdjAligner::score_full_length_bonus(bool left_side, const Alignment& alignment) const { + return score_full_length_bonus(left_side, alignment.sequence().begin(), alignment.sequence().end(), + alignment.quality().begin()); +} + +int32_t QualAdjAligner::score_partial_alignment(const Alignment& alignment, const HandleGraph& graph, const path_t& path, + string::const_iterator seq_begin, bool no_read_end_scoring) const { + + int32_t score = 0; + string::const_iterator read_pos = seq_begin; + string::const_iterator qual_pos = alignment.quality().begin() + (seq_begin - alignment.sequence().begin()); + + bool in_deletion = false; + for (size_t i = 0; i < path.mapping_size(); i++) { + const auto& mapping = path.mapping(i); + + // get the sequence of this node on the proper strand + string node_seq = graph.get_sequence(graph.get_handle(mapping.position().node_id(), + mapping.position().is_reverse())); + + string::const_iterator ref_pos = node_seq.begin() + mapping.position().offset(); + + for (size_t j = 0; j < mapping.edit_size(); j++) { + const auto& edit = mapping.edit(j); + + if (edit.from_length() > 0) { + if (edit.to_length() > 0) { + for (auto siter = read_pos, riter = ref_pos, qiter = qual_pos; + siter != read_pos + edit.to_length(); siter++, qiter++, riter++) { + score += score_matrix[25 * (*qiter) + 5 * nt_table[*riter] + nt_table[*siter]]; + } + + // apply full length bonus + if (read_pos == alignment.sequence().begin() && !no_read_end_scoring) { + score += score_full_length_bonus(true, alignment); + } + if (read_pos + edit.to_length() == alignment.sequence().end() + && !no_read_end_scoring) { + score += score_full_length_bonus(false, alignment); + } + in_deletion = false; + } + else if (in_deletion) { + score -= edit.from_length() * gap_extension; + } + else { + // deletion + score -= gap_open + (edit.from_length() - 1) * gap_extension; + in_deletion = true; + } + } + else if (edit.to_length() > 0) { + // don't score soft clips if read end scoring + if (no_read_end_scoring || + (read_pos != alignment.sequence().begin() && + read_pos + edit.to_length() != alignment.sequence().end())) { + // insert + score -= gap_open + (edit.to_length() - 1) * gap_extension; + } + in_deletion = false; + } + + read_pos += edit.to_length(); + qual_pos += edit.to_length(); + ref_pos += edit.from_length(); + } + } + return score; +} + +AlignerClient::AlignerClient(double gc_content_estimate) : gc_content_estimate(gc_content_estimate) { + + // Adopt the default scoring parameters and make the aligners + set_alignment_scores(default_score_matrix, + default_gap_open, default_gap_extension, + default_full_length_bonus); +} + +const GSSWAligner* AlignerClient::get_aligner(bool have_qualities) const { + return (have_qualities && adjust_alignments_for_base_quality) ? 
+ (GSSWAligner*) get_qual_adj_aligner() : + (GSSWAligner*) get_regular_aligner(); +} + +const QualAdjAligner* AlignerClient::get_qual_adj_aligner() const { + assert(qual_adj_aligner.get() != nullptr); + return qual_adj_aligner.get(); +} + +const Aligner* AlignerClient::get_regular_aligner() const { + assert(regular_aligner.get() != nullptr); + return regular_aligner.get(); +} + +int8_t* AlignerClient::parse_matrix(istream& matrix_stream) { + int8_t* matrix = (int8_t*) malloc(16 * sizeof(int8_t)); + for (size_t i = 0; i < 16; i++) { + if (!matrix_stream.good()) { + std::cerr << "error: vg Aligner::parse_matrix requires a 4x4 whitespace separated integer matrix\n"; + throw ""; + } + int score; + matrix_stream >> score; + if (score > 127 || score < -127) { + std::cerr << "error: vg Aligner::parse_matrix requires values in the range [-127,127]\n"; + throw ""; + } + matrix[i] = score; + } + return matrix; +} + +void AlignerClient::set_alignment_scores(int8_t match, int8_t mismatch, int8_t gap_open, int8_t gap_extend, + int8_t full_length_bonus) { + + int8_t* matrix = (int8_t*) malloc(sizeof(int8_t) * 16); + for (size_t i = 0; i < 16; ++i) { + if (i % 5 == 0) { + matrix[i] = match; + } + else { + matrix[i] = -mismatch; + } + } + + this->set_alignment_scores(matrix, gap_open, gap_extend, full_length_bonus); + + free(matrix); +} + + +void AlignerClient::set_alignment_scores(const int8_t* score_matrix, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus) { + + qual_adj_aligner = unique_ptr(new QualAdjAligner(score_matrix, gap_open, gap_extend, + full_length_bonus, gc_content_estimate)); + regular_aligner = unique_ptr(new Aligner(score_matrix, gap_open, gap_extend, + full_length_bonus, gc_content_estimate)); + +} + +void AlignerClient::set_alignment_scores(std::istream& matrix_stream, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus) { + int8_t* score_matrix = parse_matrix(matrix_stream); + this->set_alignment_scores(score_matrix, gap_open, gap_extend, full_length_bonus); + free(score_matrix); +} + +} diff --git a/src/aligner.hpp b/src/aligner.hpp new file mode 100644 index 00000000000..04a080fbce5 --- /dev/null +++ b/src/aligner.hpp @@ -0,0 +1,535 @@ +#ifndef VG_ALIGNER_HPP_INCLUDED +#define VG_ALIGNER_HPP_INCLUDED + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gssw.h" +#include "Variant.h" +#include "Fasta.h" +#include "handle.hpp" +#include "path.hpp" +#include "dozeu_interface.hpp" +#include "deletion_aligner.hpp" + +// #define BENCH +// #include "bench.h" + +namespace vg { + + static constexpr int8_t default_match = 1; + static constexpr int8_t default_mismatch = 4; + static constexpr int8_t default_score_matrix[16] = { + default_match, -default_mismatch, -default_mismatch, -default_mismatch, + -default_mismatch, default_match, -default_mismatch, -default_mismatch, + -default_mismatch, -default_mismatch, default_match, -default_mismatch, + -default_mismatch, -default_mismatch, -default_mismatch, default_match + }; + static constexpr int8_t default_gap_open = 6; + static constexpr int8_t default_gap_extension = 1; + static constexpr int8_t default_full_length_bonus = 5; + static constexpr double default_gc_content = 0.5; + + /// Score a gap with the given open and extension scores. + int32_t score_gap(size_t gap_length, int32_t gap_open, int32_t gap_extension); + + /** + * The abstract interface that any Aligner should implement. 
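+ *
+ * For intuition with the default scores above (match 1, mismatch 4, gap open 6, gap
+ * extension 1, full length bonus 5): a 100 bp read aligned end to end with no edits
+ * scores 100 * 1 + 2 * 5 = 110 when the bonus is applied at both ends, and a 3 bp
+ * gap costs 6 + (3 - 1) * 1 = 8. This is only a back-of-the-envelope sketch of the
+ * scoring model, not a description of any particular implementation below.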
+ */ + class BaseAligner { + public: + + /// Store optimal local alignment against a graph in the Alignment object. + /// Gives the full length bonus separately on each end of the alignment. + virtual void align(Alignment& alignment, const HandleGraph& g, bool traceback_aln) const = 0; + }; + + /** + * The basic GSSW-based core aligner implementation, which can then be quality-adjusted or not. + */ + class GSSWAligner : public BaseAligner { + + protected: + + GSSWAligner() = default; + GSSWAligner(const int8_t* _score_matrix, + int8_t _gap_open, + int8_t _gap_extension, + int8_t _full_length_bonus, + double _gc_content); + ~GSSWAligner(); + + // for construction + // needed when constructing an alignable graph from the nodes + gssw_graph* create_gssw_graph(const HandleGraph& g) const; + + // identify the IDs of nodes that should be used as pinning points in GSSW for pinned + // alignment ((i.e. non-empty nodes as close as possible to sinks)) + unordered_set identify_pinning_points(const HandleGraph& graph) const; + + // convert graph mapping back into unreversed node positions + void unreverse_graph_mapping(gssw_graph_mapping* gm) const; + // convert from graph sequences back into unrereversed form + void unreverse_graph(gssw_graph* graph) const; + + // alignment functions + void gssw_mapping_to_alignment(gssw_graph* graph, + gssw_graph_mapping* gm, + Alignment& alignment, + bool pinned, + bool pin_left) const; + string graph_cigar(gssw_graph_mapping* gm) const; + + public: + /// Given a nonempty vector of nonnegative scaled alignment scores, + /// compute the mapping quality of the maximal score in the vector. + /// Sets max_idx_out to the index of that score in the vector. + /// Optionally includes a vector of implicit counts >= 1 for the scores, but + /// the mapping quality is always calculated as if its multiplicity is 1. + static double maximum_mapping_quality_exact(const vector& scaled_scores, size_t* max_idx_out, + const vector* multiplicities = nullptr); + /// Given a nonempty vector of nonnegative scaled alignment scores, + /// approximate the mapping quality of the maximal score in the vector. + /// Sets max_idx_out to the index of that score in the vector. + /// Optionally includes a vector of implicit counts >= 1 for the scores, but + /// the mapping quality is always calculated as if its multiplicity is 1. 
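+ ///
+ /// As a sketch of the underlying idea (the treatment of multiplicities, caps, and
+ /// scaling in the code may differ): with scaled scores s_1..s_n on a natural-log
+ /// scale, the exact error probability of the best candidate is
+ /// (sum over i != max of exp(s_i)) / (sum over all i of exp(s_i)), and the quality
+ /// is -10 * log10 of that; e.g. scores {10, 5} give about
+ /// -10 * log10(1 / (1 + e^5)), roughly 22. The approximate version instead works
+ /// from the gap between the best and second-best scores, which is cheap and close
+ /// to the exact value when one candidate dominates.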
+ static double maximum_mapping_quality_approx(const vector& scaled_scores, size_t* max_idx_out, + const vector* multiplicities = nullptr); + + /// Same as maximum_mapping_quality_exact except alway s computes mapping + /// quality for the first score + static double first_mapping_quality_exact(const vector& scaled_scores, + const vector* multiplicities = nullptr); + /// Same as maximum_mapping_quality_approx except alway s computes mapping + /// quality for the first score + static double first_mapping_quality_approx(const vector& scaled_scores, + const vector* multiplicities = nullptr); + protected: + double group_mapping_quality_exact(const vector& scaled_scores, const vector& group, + const vector* multiplicities = nullptr) const; + double estimate_next_best_score(int length, double min_diffs) const; + + double recover_log_base(const int8_t* score_matrix, double gc_content, double tol) const; + + bool verify_valid_log_odds_score_matrix(const int8_t* score_matrix, const double* nt_freqs) const; + + double alignment_score_partition_function(double lambda, const int8_t* score_matrix, + const double* nt_freqs) const; + + vector all_mapping_qualities_exact(const vector& scaled_scores, + const vector* multiplicities = nullptr) const; + + public: + + double max_possible_mapping_quality(int length) const; + double estimate_max_possible_mapping_quality(int length, double min_diffs, double next_min_diffs) const; + + /// store optimal alignment against a graph in the Alignment object with one end of the sequence + /// guaranteed to align to a source/sink node. if xdrop is selected, use the xdrop heuristic, which + /// does not guarantee an optimal alignment. + /// + /// pinning left means that that the alignment starts with the first base of the read sequence and + /// the first base of a source node sequence, pinning right means that the alignment starts with + /// the final base of the read sequence and the final base of a sink node sequence + /// + /// Gives the full length bonus only on the non-pinned end of the alignment. + virtual void align_pinned(Alignment& alignment, const HandleGraph& g, bool pin_left, bool xdrop = false, + uint16_t xdrop_max_gap_length = default_xdrop_max_gap_length) const = 0; + + /// store the top scoring pinned alignments in the vector in descending score order up to a maximum + /// number of alignments (including the optimal one). if there are fewer than the maximum number in + /// the return value, then it includes all alignments with a positive score. the optimal alignment + /// will be stored in both the vector and in the main alignment object + virtual void align_pinned_multi(Alignment& alignment, vector& alt_alignments, const HandleGraph& g, + bool pin_left, int32_t max_alt_alns) const = 0; + + /// store optimal global alignment against a graph within a specified band in the Alignment object + /// permissive banding auto detects the width of band needed so that paths can travel + /// through every node in the graph + virtual void align_global_banded(Alignment& alignment, const HandleGraph& g, + int32_t band_padding = 0, bool permissive_banding = true) const = 0; + + /// store top scoring global alignments in the vector in descending score order up to a maximum number + /// of alternate alignments (including the optimal alignment). if there are fewer than the maximum + /// number of alignments in the return value, then the vector contains all possible alignments. 
the + /// optimal alignment will be stored in both the vector and the original alignment object + virtual void align_global_banded_multi(Alignment& alignment, vector& alt_alignments, + const HandleGraph& g, int32_t max_alt_alns, int32_t band_padding = 0, + bool permissive_banding = true) const = 0; + /// xdrop aligner + virtual void align_xdrop(Alignment& alignment, const HandleGraph& g, const vector& mems, + bool reverse_complemented, uint16_t max_gap_length = default_xdrop_max_gap_length) const = 0; + + /// xdrop aligner, but with a precomputed topological order on the graph, which need not include + /// all of the graph's handles and which may contain both orientations of a handle + virtual void align_xdrop(Alignment& alignment, const HandleGraph& g, const vector& order, + const vector& mems, bool reverse_complemented, + uint16_t max_gap_length = default_xdrop_max_gap_length) const = 0; + + /// Compute the score of an exact match in the given alignment, from the + /// given offset, of the given length. + virtual int32_t score_exact_match(const Alignment& aln, size_t read_offset, size_t length) const = 0; + /// Compute the score of an exact match of the given sequence with the given qualities. + /// Qualities may be ignored by some implementations. + virtual int32_t score_exact_match(const string& sequence, const string& base_quality) const = 0; + /// Compute the score of an exact match of the given range of sequence with the given qualities. + /// Qualities may be ignored by some implementations. + virtual int32_t score_exact_match(string::const_iterator seq_begin, string::const_iterator seq_end, + string::const_iterator base_qual_begin) const = 0; + /// Compute the score of a mismatch of the given range of sequence with the given qualities. + /// Qualities may be ignored by some implementations. + /// Note that the return value is SIGNED, and almost certainly NEGATIVE, because mismatches are bad. + virtual int32_t score_mismatch(string::const_iterator seq_begin, string::const_iterator seq_end, + string::const_iterator base_qual_begin) const = 0; + + virtual int32_t score_full_length_bonus(bool left_side, string::const_iterator seq_begin, + string::const_iterator seq_end, + string::const_iterator base_qual_begin) const = 0; + + virtual int32_t score_full_length_bonus(bool left_side, const Alignment& alignment) const = 0; + + /// Compute the score of a path against the given range of subsequence with the given qualities. + virtual int32_t score_partial_alignment(const Alignment& alignment, const HandleGraph& graph, const path_t& path, + string::const_iterator seq_begin, bool no_read_end_scoring = false) const = 0; + + /// Returns the score of an insert or deletion of the given length + int32_t score_gap(size_t gap_length) const; + + /// stores -10 * log_10(P_err) in alignment mapping_quality field where P_err is the + /// probability that the alignment is not the correct one (assuming that one of the alignments + /// in the vector is correct). 
alignments must have been created with this Aligner for quality + /// score to be valid + void compute_mapping_quality(vector& alignments, + int max_mapping_quality, + bool fast_approximation, + double cluster_mq, + bool use_cluster_mq, + int overlap_count, + double mq_estimate, + double maybe_mq_threshold, + double identity_weight) const; + /// same function for paired reads, mapping qualities are stored in both alignments in the pair + void compute_paired_mapping_quality(pair, vector>& alignment_pairs, + const vector& frag_weights, + int max_mapping_quality1, + int max_mapping_quality2, + bool fast_approximation, + double cluster_mq, + bool use_cluster_mq, + int overlap_count1, + int overlap_count2, + double mq_estimate1, + double mq_estimate2, + double maybe_mq_threshold, + double identity_weight) const; + + /// Computes mapping quality for the first score in a vector of scores. + /// Optionally includes a vector of implicit counts >= 1 for the scores, but + /// only 1 count can apply toward the mapping quality. + int32_t compute_first_mapping_quality(const vector& scores, bool fast_approximation, + const vector* multiplicities = nullptr) const; + + /// Computes mapping quality for the optimal score in a vector of scores. + /// Optionally includes a vector of implicit counts >= 1 for the scores, but + /// only 1 count can apply toward the mapping quality. + int32_t compute_max_mapping_quality(const vector& scores, bool fast_approximation, + const vector* multiplicities = nullptr) const; + + + /// Computes mapping quality for a group of scores in a vector of scores (group given by indexes). + /// Optionally includes a vector of implicit counts >= 1 for the score, but the mapping quality is always + /// calculated as if each member of the group has a count of 1. + int32_t compute_group_mapping_quality(const vector& scores, const vector& group, + const vector* multiplicities = nullptr) const; + + /// Computes mapping quality for all of a vector of scores. + /// Optionally includes a vector of implicit counts >= 1 for the scores, but + /// only 1 count can apply toward the mapping quality. + vector compute_all_mapping_qualities(const vector& scores, + const vector* multiplicities = nullptr) const; + + /// Returns the difference between an optimal and second-best alignment scores that would + /// result in this mapping quality using the fast mapping quality approximation + double mapping_quality_score_diff(double mapping_quality) const; + + /// Convert a score to an unnormalized log likelihood for the sequence. + /// Requires log_base to have been set. + double score_to_unnormalized_likelihood_ln(double score) const; + + /// The longest gap detectable from a read position without soft-clipping + size_t longest_detectable_gap(const Alignment& alignment, const string::const_iterator& read_pos) const; + + /// The longest gap detectable from a read position without soft-clipping, for a generic read. + size_t longest_detectable_gap(size_t read_length, size_t read_pos) const; + + /// The longest gap detectable from any read position without soft-clipping + size_t longest_detectable_gap(const Alignment& alignment) const; + + /// The longest gap detectable from any read position without soft-clipping, for a generic read. 
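+ ///
+ /// As a rough sketch of the usual reasoning (not necessarily the exact formula
+ /// implemented here): a gap of length L costs gap_open + (L - 1) * gap_extension,
+ /// so it remains worth keeping, rather than soft-clipping, only while roughly
+ /// L <= (match * flank + full_length_bonus - gap_open) / gap_extension + 1,
+ /// where flank is the number of read bases available past the gap.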
+ size_t longest_detectable_gap(size_t read_length) const; + + /// Use the score values in the aligner to score the given alignment, + /// scoring gaps caused by jumping between between nodes using a custom + /// gap length estimation function (which takes the from position, the + /// to position, and a search limit in bp that happens to be the read + /// length). + /// + /// May include full length bonus or not. TODO: bool flags are bad. + virtual int32_t score_discontiguous_alignment(const Alignment& aln, + const function& estimate_distance, + bool strip_bonuses = false) const; + + /// Use the score values in the aligner to score the given alignment assuming + /// that there are no gaps between Mappings in the Path + virtual int32_t score_contiguous_alignment(const Alignment& aln, + bool strip_bonuses = false) const; + + /// Without necessarily rescoring the entire alignment, return the score + /// of the given alignment with bonuses removed. Assumes that bonuses + /// are actually included in the score. + /// Needs to know if the alignment was pinned-end or not, and, if so, which end was pinned. + virtual int32_t remove_bonuses(const Alignment& aln, bool pinned = false, bool pin_left = false) const; + + // members + DeletionAligner deletion_aligner; + int8_t* nt_table = nullptr; + int8_t* score_matrix = nullptr; + int8_t match; + int8_t mismatch; + int8_t gap_open; + int8_t gap_extension; + int8_t full_length_bonus; + + // log of the base of the logarithm underlying the log-odds interpretation of the scores + double log_base = 0.0; + }; + + /** + * An ordinary aligner. + */ + class Aligner : public GSSWAligner { + + public: + + Aligner(const int8_t* _score_matrix = default_score_matrix, + int8_t _gap_open = default_gap_open, + int8_t _gap_extension = default_gap_extension, + int8_t _full_length_bonus = default_full_length_bonus, + double _gc_content = default_gc_content); + ~Aligner(void) = default; + + /// Store optimal local alignment against a graph in the Alignment object. + /// Gives the full length bonus separately on each end of the alignment. + void align(Alignment& alignment, const HandleGraph& g, bool traceback_aln) const; + + /// Align against a subgraph induced by a subset of nodes. The topological + /// order of the handles in the subgraph must be provided. + /// Store optimal local alignment in the Alignment object. + /// Gives the full length bonus separately on each end of the alignment. + void align(Alignment& alignment, const HandleGraph& g, + const std::vector& topological_order) const; + + /// store optimal alignment against a graph in the Alignment object with one end of the sequence + /// guaranteed to align to a source/sink node. if xdrop is selected, use the xdrop heuristic, which + /// does not guarantee an optimal alignment. + /// + /// pinning left means that that the alignment starts with the first base of the read sequence and + /// the first base of a source node sequence, pinning right means that the alignment starts with + /// the final base of the read sequence and the final base of a sink node sequence + /// + /// Gives the full length bonus only on the non-pinned end of the alignment. + void align_pinned(Alignment& alignment, const HandleGraph& g, bool pin_left, bool xdrop = false, + uint16_t xdrop_max_gap_length = default_xdrop_max_gap_length) const; + + /// store the top scoring pinned alignments in the vector in descending score order up to a maximum + /// number of alignments (including the optimal one). 
if there are fewer than the maximum number in + /// the return value, then it includes all alignments with a positive score. the optimal alignment + /// will be stored in both the vector and in the main alignment object + void align_pinned_multi(Alignment& alignment, vector& alt_alignments, const HandleGraph& g, + bool pin_left, int32_t max_alt_alns) const; + + /// store optimal global alignment against a graph within a specified band in the Alignment object + /// permissive banding auto detects the width of band needed so that paths can travel + /// through every node in the graph + void align_global_banded(Alignment& alignment, const HandleGraph& g, + int32_t band_padding = 0, bool permissive_banding = true) const; + + /// store top scoring global alignments in the vector in descending score order up to a maximum number + /// of alternate alignments (including the optimal alignment). if there are fewer than the maximum + /// number of alignments in the return value, then the vector contains all possible alignments. the + /// optimal alignment will be stored in both the vector and the original alignment object + void align_global_banded_multi(Alignment& alignment, vector& alt_alignments, const HandleGraph& g, + int32_t max_alt_alns, int32_t band_padding = 0, bool permissive_banding = true) const; + + /// xdrop aligner + void align_xdrop(Alignment& alignment, const HandleGraph& g, const vector& mems, + bool reverse_complemented, uint16_t max_gap_length = default_xdrop_max_gap_length) const; + + /// xdrop aligner, but with a precomputed topological order on the graph, which need not include + /// all of the graph's handles and which may contain both orientations of a handle + void align_xdrop(Alignment& alignment, const HandleGraph& g, const vector& order, + const vector& mems, bool reverse_complemented, + uint16_t max_gap_length = default_xdrop_max_gap_length) const; + + int32_t score_exact_match(const Alignment& aln, size_t read_offset, size_t length) const; + int32_t score_exact_match(const string& sequence, const string& base_quality) const; + int32_t score_exact_match(string::const_iterator seq_begin, string::const_iterator seq_end, + string::const_iterator base_qual_begin) const; + int32_t score_exact_match(const string& sequence) const; + int32_t score_exact_match(string::const_iterator seq_begin, string::const_iterator seq_end) const; + int32_t score_mismatch(string::const_iterator seq_begin, string::const_iterator seq_end, + string::const_iterator base_qual_begin) const; + + /// Score a mismatch given just the length. Only possible since we ignore qualities. + /// Return value is SIGNED, and almost certainly NEGATIVE + int32_t score_mismatch(size_t length) const; + + int32_t score_full_length_bonus(bool left_side, string::const_iterator seq_begin, + string::const_iterator seq_end, + string::const_iterator base_qual_begin) const; + + int32_t score_full_length_bonus(bool left_side, const Alignment& alignment) const; + + int32_t score_partial_alignment(const Alignment& alignment, const HandleGraph& graph, const path_t& path, + string::const_iterator seq_begin, bool no_read_end_scoring = false) const; + + private: + + // internal function interacting with gssw for pinned and local alignment + void align_internal(Alignment& alignment, vector* multi_alignments, const HandleGraph& g, + bool pinned, bool pin_left, int32_t max_alt_alns, + bool traceback_aln) const; + + // members + vector xdrops; + }; + + /** + * An aligner that uses read base qualities to adjust its scores and alignments. 
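+ *
+ * The quality-adjusted scores are laid out as one 5 x 5 (ACGTN) matrix per base
+ * quality value, concatenated, so the entry for quality q, reference base r, and
+ * read base c sits at score_matrix[25 * q + 5 * r + c]; for an exact match
+ * (r == c) this collapses to the 25 * q + 6 * nt indexing used by
+ * score_exact_match in aligner.cpp. This is a reading of that indexing code, not
+ * an independent specification of the layout.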
+ */ + class QualAdjAligner : public GSSWAligner { + public: + + QualAdjAligner(const int8_t* _score_matrix = default_score_matrix, + int8_t _gap_open = default_gap_open, + int8_t _gap_extension = default_gap_extension, + int8_t _full_length_bonus = default_full_length_bonus, + double _gc_content = default_gc_content); + + ~QualAdjAligner(void); + + // base quality adjusted counterparts to functions of same name from Aligner + + void align(Alignment& alignment, const HandleGraph& g, bool traceback_aln) const; + void align_global_banded(Alignment& alignment, const HandleGraph& g, + int32_t band_padding = 0, bool permissive_banding = true) const; + void align_pinned(Alignment& alignment, const HandleGraph& g, bool pin_left, bool xdrop = false, + uint16_t xdrop_max_gap_length = default_xdrop_max_gap_length) const; + void align_global_banded_multi(Alignment& alignment, vector& alt_alignments, const HandleGraph& g, + int32_t max_alt_alns, int32_t band_padding = 0, bool permissive_banding = true) const; + void align_pinned_multi(Alignment& alignment, vector& alt_alignments, const HandleGraph& g, + bool pin_left, int32_t max_alt_alns) const; + + void align_xdrop(Alignment& alignment, const HandleGraph& g, const vector& mems, + bool reverse_complemented, uint16_t max_gap_length = default_xdrop_max_gap_length) const; + void align_xdrop(Alignment& alignment, const HandleGraph& g, const vector& order, + const vector& mems, bool reverse_complemented, + uint16_t max_gap_length = default_xdrop_max_gap_length) const; + + int32_t score_exact_match(const Alignment& aln, size_t read_offset, size_t length) const; + int32_t score_exact_match(const string& sequence, const string& base_quality) const; + int32_t score_exact_match(string::const_iterator seq_begin, string::const_iterator seq_end, + string::const_iterator base_qual_begin) const; + int32_t score_mismatch(string::const_iterator seq_begin, string::const_iterator seq_end, + string::const_iterator base_qual_begin) const; + + int32_t score_full_length_bonus(bool left_side, string::const_iterator seq_begin, + string::const_iterator seq_end, + string::const_iterator base_qual_begin) const; + + int32_t score_full_length_bonus(bool left_side, const Alignment& alignment) const; + + int32_t score_partial_alignment(const Alignment& alignment, const HandleGraph& graph, const path_t& path, + string::const_iterator seq_begin, bool no_read_end_scoring = false) const; + + + protected: + + int8_t* qual_adjusted_matrix(const int8_t* score_matrix, double gc_content, uint32_t max_qual) const; + + int8_t* qual_adjusted_bonuses(int8_t _full_length_bonus, uint32_t max_qual) const; + + // internal function interacting with gssw for pinned and local alignment + void align_internal(Alignment& alignment, vector* multi_alignments, const HandleGraph& g, + bool pinned, bool pin_left, int32_t max_alt_alns, + bool traceback_aln) const; + + int8_t* qual_adj_full_length_bonuses = nullptr; + + // members + vector xdrops; + }; + + + /** + * Holds a set of alignment scores, and has methods to produce aligners of various types on demand, using those scores. + * Provides a get_aligner() method to get ahold of a useful, possibly quality-adjusted Aligner. + * Base functionality that is shared between alignment and surjections + */ + class AlignerClient { + protected: + + /// Create an AlignerClient, which creates the default aligner instances, + /// which can depend on a GC content estimate. 
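+ ///
+ /// Typical use from a subclass might look like the following sketch (the mapper
+ /// class, its read, and its graph are hypothetical stand-ins, not names from vg):
+ ///
+ ///     set_alignment_scores(1, 4, 6, 1, 5);  // match, mismatch, gap open, gap extend, full length bonus
+ ///     adjust_alignments_for_base_quality = true;
+ ///     const GSSWAligner* aligner = get_aligner(!read.quality().empty());
+ ///     aligner->align(read, graph, true);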
+ AlignerClient(double gc_content_estimate = vg::default_gc_content);
+
+ /// Get the appropriate aligner to use, based on
+ /// adjust_alignments_for_base_quality. By setting have_qualities to false,
+ /// you can force the non-quality-adjusted aligner, for reads that lack
+ /// quality scores.
+ const GSSWAligner* get_aligner(bool have_qualities = true) const;
+
+ // Sometimes you really do need the two kinds of aligners, to pass to code
+ // that expects one or the other.
+ const QualAdjAligner* get_qual_adj_aligner() const;
+ const Aligner* get_regular_aligner() const;
+
+ public:
+
+ /// Set all the aligner scoring parameters and create the stored aligner instances.
+ virtual void set_alignment_scores(int8_t match, int8_t mismatch, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus);
+
+ /// Set the aligner scoring parameters and create the stored aligner instances. The
+ /// stream should contain a 4 x 4 whitespace-separated substitution matrix (in the
+ /// order ACGT).
+ virtual void set_alignment_scores(std::istream& matrix_stream, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus);
+
+ /// Set the aligner scoring parameters and create the stored aligner instances. The
+ /// score matrix should be a 4 x 4 array in the order (ACGT).
+ /// Other overloads of set_alignment_scores all call this one.
+ virtual void set_alignment_scores(const int8_t* score_matrix, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus);
+
+ /// Allocates an array to hold a 4x4 substitution matrix and returns it
+ static int8_t* parse_matrix(std::istream& matrix_stream);
+
+ bool adjust_alignments_for_base_quality = false; // use base quality adjusted alignments
+
+ private:
+
+ // GSSW aligners
+ unique_ptr<QualAdjAligner> qual_adj_aligner;
+ unique_ptr<Aligner> regular_aligner;
+
+ // GC content estimate that we need for building the aligners.
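+ // (As a sketch of why it matters: the estimate presumably sets the background base
+ // frequencies used when recovering the log-odds base of the score matrix, e.g.
+ // gc = 0.5 gives A = C = G = T = 0.25 while gc = 0.4 gives A = T = 0.3 and
+ // C = G = 0.2; see recover_log_base and alignment_score_partition_function above.)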
+ double gc_content_estimate; + }; + +} // end namespace vg + +#endif diff --git a/src/alignment.cpp b/src/alignment.cpp index 42f224e02f9..665a84b03f8 100644 --- a/src/alignment.cpp +++ b/src/alignment.cpp @@ -1,20 +1,25 @@ #include "alignment.hpp" -#include "stream.hpp" +#include "vg/io/gafkluge.hpp" +#include "annotation.hpp" -#include +#include + +using namespace vg::io; namespace vg { -int hts_for_each(string& filename, function lambda, xg::XG* xgindex) { +int hts_for_each(string& filename, function lambda, const PathPositionHandleGraph* graph) { samFile *in = hts_open(filename.c_str(), "r"); if (in == NULL) return 0; bam_hdr_t *hdr = sam_hdr_read(in); map rg_sample; parse_rg_sample_map(hdr->text, rg_sample); + map tid_path_handle; + parse_tid_path_handle_map(hdr, graph, tid_path_handle); bam1_t *b = bam_init1(); while (sam_read1(in, hdr, b) >= 0) { - Alignment a = bam_to_alignment(b, rg_sample, hdr, xgindex); + Alignment a = bam_to_alignment(b, rg_sample, tid_path_handle, hdr, graph); lambda(a); } bam_destroy1(b); @@ -28,13 +33,16 @@ int hts_for_each(string& filename, function lambda) { return hts_for_each(filename, lambda, nullptr); } -int hts_for_each_parallel(string& filename, function lambda, xg::XG* xgindex) { +int hts_for_each_parallel(string& filename, function lambda, + const PathPositionHandleGraph* graph) { samFile *in = hts_open(filename.c_str(), "r"); if (in == NULL) return 0; bam_hdr_t *hdr = sam_hdr_read(in); map rg_sample; parse_rg_sample_map(hdr->text, rg_sample); + map tid_path_handle; + parse_tid_path_handle_map(hdr, graph, tid_path_handle); int thread_count = get_thread_count(); vector bs; bs.resize(thread_count); @@ -48,12 +56,18 @@ int hts_for_each_parallel(string& filename, function lambda, x int tid = omp_get_thread_num(); while (more_data) { bam1_t* b = bs[tid]; + // We need to track our own read operation's success separate from + // the global flag, or someone else encountering EOF will cause us + // to drop our read on the floor. + bool got_read = false; #pragma omp critical (hts_input) if (more_data) { - more_data = sam_read1(in, hdr, b) >= 0; + got_read = sam_read1(in, hdr, b) >= 0; + more_data &= got_read; } - if (more_data) { - Alignment a = bam_to_alignment(b, rg_sample, hdr, xgindex); + // Now we're outside the critical section so we can only rely on our own variables. + if (got_read) { + Alignment a = bam_to_alignment(b, rg_sample, tid_path_handle, hdr, graph); lambda(a); } } @@ -84,11 +98,22 @@ bam_hdr_t* hts_file_header(string& filename, string& header) { } bam_hdr_t* hts_string_header(string& header, - map& path_length, - map& rg_sample) { + const map& path_length, + const map& rg_sample) { + + // Copy the map into a vecotr in its own order + vector> path_order_and_length(path_length.begin(), path_length.end()); + + // Make header in that order. 
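+ // As an illustration of what the call below ultimately produces: with
+ // path_order_and_length = {{"chr1", 248956422}} the header begins with the
+ // tab-separated lines
+ //   @HD  VN:1.5  SO:unknown
+ //   @SQ  SN:chr1  LN:248956422
+ // followed by one @RG line per entry in rg_sample.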
+ return hts_string_header(header, path_order_and_length, rg_sample); +} + +bam_hdr_t* hts_string_header(string& header, + const vector>& path_order_and_length, + const map& rg_sample) { stringstream hdr; hdr << "@HD\tVN:1.5\tSO:unknown\n"; - for (auto& p : path_length) { + for (auto& p : path_order_and_length) { hdr << "@SQ\tSN:" << p.first << "\t" << "LN:" << p.second << "\n"; } for (auto& s : rg_sample) { @@ -108,32 +133,74 @@ bool get_next_alignment_from_fastq(gzFile fp, char* buffer, size_t len, Alignmen alignment.Clear(); bool is_fasta = false; // handle name - if (0!=gzgets(fp,buffer,len)) { + string name; + if (gzgets(fp,buffer,len) != 0) { buffer[strlen(buffer)-1] = '\0'; - string name = buffer; + name = buffer; if (name[0] == '@') { is_fasta = false; - } else if (name[0] = '>') { + } else if (name[0] == '>') { is_fasta = true; } else { throw runtime_error("Found unexpected delimiter " + name.substr(0,1) + " in fastq/fasta input"); } - name = name.substr(1, name.find(' ')); // trim off leading @ and things after the first whitespace + name = name.substr(1, name.find(' ') - 1); // trim off leading @ and things after the first whitespace // keep trailing /1 /2 alignment.set_name(name); - } else { return false; } + } + else { + // no more to get + return false; + } // handle sequence - if (0!=gzgets(fp,buffer,len)) { - buffer[strlen(buffer)-1] = '\0'; - alignment.set_sequence(buffer); - } else { - cerr << "[vg::alignment.cpp] error: incomplete fastq record" << endl; exit(1); + string sequence; + bool reading_sequence = true; + while (reading_sequence) { + if (gzgets(fp,buffer,len) == 0) { + if (sequence.empty()) { + // there was no sequence + throw runtime_error("[vg::alignment.cpp] incomplete fastq/fasta record " + name); + } + else { + // we hit the end of the file + break; + } + } + size_t size_read = strlen(buffer); + if (buffer[size_read - 1] == '\n') { + // we stopped because of a line end rather than because we filled the buffer + + // we don't want the newline in the sequence, so terminate the buffer 1 char earlier + --size_read; + if (!is_fasta) { + // we assume FASTQ sequences only take one line + reading_sequence = false; + } + else { + // peek ahead to check for a multi-line sequence + int c = gzgetc(fp); + if (c < 0) { + // this is the end of the file + reading_sequence = false; + } + else { + if (c == '>') { + // the next line is a sequence name + reading_sequence = false; + } + // un-peek + gzungetc(c, fp); + } + } + } + sequence.append(buffer, size_read); } + alignment.set_sequence(sequence); // handle "+" sep if (!is_fasta) { if (0!=gzgets(fp,buffer,len)) { } else { - cerr << "[vg::alignment.cpp] error: incomplete fastq record" << endl; exit(1); + cerr << "[vg::alignment.cpp] error: incomplete fastq record " << name << endl; exit(1); } // handle quality if (0!=gzgets(fp,buffer,len)) { @@ -142,7 +209,7 @@ bool get_next_alignment_from_fastq(gzFile fp, char* buffer, size_t len, Alignmen //cerr << string_quality_short_to_char(quality) << endl; alignment.set_quality(quality); } else { - cerr << "[vg::alignment.cpp] error: incomplete fastq record" << endl; exit(1); + cerr << "[vg::alignment.cpp] error: fastq record missing base quality " << name << endl; exit(1); } } @@ -158,187 +225,7 @@ bool get_next_alignment_pair_from_fastqs(gzFile fp1, gzFile fp2, char* buffer, s return get_next_alignment_from_fastq(fp1, buffer, len, mate1) && get_next_alignment_from_fastq(fp2, buffer, len, mate2); } -size_t unpaired_for_each_parallel(function get_read_if_available, function lambda) { 
- - size_t nLines = 0; - vector *batch = nullptr; - // number of batches currently being processed - uint64_t batches_outstanding = 0; -#pragma omp parallel default(none) shared(batches_outstanding, batch, nLines, get_read_if_available, lambda) -#pragma omp single - { - - // number of reads in each batch - const uint64_t batch_size = 1 << 9; // 512 - // max # of such batches to be holding in memory - uint64_t max_batches_outstanding = 1 << 9; // 512 - // max # we will ever increase the batch buffer to - const uint64_t max_max_batches_outstanding = 1 << 13; // 8192 - - // alignments to hold the incoming data - Alignment aln; - // did we find the end of the file yet? - bool more_data = true; - - while (more_data) { - // init a new batch - batch = new std::vector(); - batch->reserve(batch_size); - - // load up to the batch-size number of reads - for (int i = 0; i < batch_size; i++) { - - more_data = get_read_if_available(aln); - - if (more_data) { - batch->emplace_back(std::move(aln)); - nLines++; - } - else { - break; - } - } - - // did we get a batch? - if (batch->size()) { - - // how many batch tasks are outstanding currently, including this one? - uint64_t current_batches_outstanding; -#pragma omp atomic capture - current_batches_outstanding = ++batches_outstanding; - - if (current_batches_outstanding >= max_batches_outstanding) { - // do this batch in the current thread because we've spawned the maximum number of - // concurrent batch tasks - for (auto& aln : *batch) { - lambda(aln); - } - delete batch; -#pragma omp atomic capture - current_batches_outstanding = --batches_outstanding; - - if (4 * current_batches_outstanding / 3 < max_batches_outstanding - && max_batches_outstanding < max_max_batches_outstanding) { - // we went through at least 1/4 of the batch buffer while we were doing this thread's batch - // this looks risky, since we want the batch buffer to stay populated the entire time we're - // occupying this thread on compute, so let's increase the batch buffer size - - max_batches_outstanding *= 2; - } - } - else { - // spawn a new task to take care of this batch -#pragma omp task default(none) firstprivate(batch) shared(batches_outstanding, lambda) - { - for (auto& aln : *batch) { - lambda(aln); - } - delete batch; -#pragma omp atomic update - batches_outstanding--; - } - } - } - } - } - return nLines; -} - -size_t paired_for_each_parallel_after_wait(function get_pair_if_available, - function lambda, - function single_threaded_until_true) { - - - size_t nLines = 0; - vector > *batch = nullptr; - // number of batches currently being processed - uint64_t batches_outstanding = 0; - -#pragma omp parallel default(none) shared(batches_outstanding, batch, nLines, get_pair_if_available, single_threaded_until_true, lambda) -#pragma omp single - { - - // number of pairs in each batch - const uint64_t batch_size = 1 << 9; // 512 - // max # of such batches to be holding in memory - uint64_t max_batches_outstanding = 1 << 9; // 512 - // max # we will ever increase the batch buffer to - const uint64_t max_max_batches_outstanding = 1 << 13; // 8192 - - // alignments to hold the incoming data - Alignment mate1, mate2; - // did we find the end of the file yet? 
- bool more_data = true; - - while (more_data) { - // init a new batch - batch = new std::vector>(); - batch->reserve(batch_size); - - // load up to the batch-size number of pairs - for (int i = 0; i < batch_size; i++) { - - more_data = get_pair_if_available(mate1, mate2); - - if (more_data) { - batch->emplace_back(std::move(mate1), std::move(mate2)); - nLines++; - } - else { - break; - } - } - - // did we get a batch? - if (batch->size()) { - // how many batch tasks are outstanding currently, including this one? - uint64_t current_batches_outstanding; -#pragma omp atomic capture - current_batches_outstanding = ++batches_outstanding; - - bool do_single_threaded = !single_threaded_until_true(); - if (current_batches_outstanding >= max_batches_outstanding || do_single_threaded) { - // do this batch in the current thread because we've spawned the maximum number of - // concurrent batch tasks or because we are directed to work in a single thread - for (auto& p : *batch) { - lambda(p.first, p.second); - } - delete batch; -#pragma omp atomic capture - current_batches_outstanding = --batches_outstanding; - - if (4 * current_batches_outstanding / 3 < max_batches_outstanding - && max_batches_outstanding < max_max_batches_outstanding - && !do_single_threaded) { - // we went through at least 1/4 of the batch buffer while we were doing this thread's batch - // this looks risky, since we want the batch buffer to stay populated the entire time we're - // occupying this thread on compute, so let's increase the batch buffer size - // (skip this adjustment if you're in single-threaded mode and thus expect the buffer to be - // empty) - - max_batches_outstanding *= 2; - } - } - else { - // spawn a new task to take care of this batch -#pragma omp task default(none) firstprivate(batch) shared(batches_outstanding, lambda) - { - for (auto& p : *batch) { - lambda(p.first, p.second); - } - delete batch; -#pragma omp atomic update - batches_outstanding--; - } - } - } - } - } - - return nLines; -} - -size_t fastq_unpaired_for_each_parallel(const string& filename, function lambda) { +size_t fastq_unpaired_for_each_parallel(const string& filename, function lambda, uint64_t batch_size) { gzFile fp = (filename != "-") ? gzopen(filename.c_str(), "r") : gzdopen(fileno(stdin), "r"); if (!fp) { @@ -353,7 +240,7 @@ size_t fastq_unpaired_for_each_parallel(const string& filename, function lambda) { - return fastq_paired_interleaved_for_each_parallel_after_wait(filename, lambda, [](void) {return true;}); +size_t fastq_paired_interleaved_for_each_parallel(const string& filename, function lambda, uint64_t batch_size) { + return fastq_paired_interleaved_for_each_parallel_after_wait(filename, lambda, [](void) {return true;}, batch_size); } -size_t fastq_paired_two_files_for_each_parallel(const string& file1, const string& file2, function lambda) { - return fastq_paired_two_files_for_each_parallel_after_wait(file1, file2, lambda, [](void) {return true;}); +size_t fastq_paired_two_files_for_each_parallel(const string& file1, const string& file2, function lambda, uint64_t batch_size) { + return fastq_paired_two_files_for_each_parallel_after_wait(file1, file2, lambda, [](void) {return true;}, batch_size); } size_t fastq_paired_interleaved_for_each_parallel_after_wait(const string& filename, function lambda, - function single_threaded_until_true) { + function single_threaded_until_true, + uint64_t batch_size) { gzFile fp = (filename != "-") ? 
gzopen(filename.c_str(), "r") : gzdopen(fileno(stdin), "r"); if (!fp) { @@ -385,7 +273,7 @@ size_t fastq_paired_interleaved_for_each_parallel_after_wait(const string& filen return get_next_interleaved_alignment_pair_from_fastq(fp, buf, len, mate1, mate2); }; - size_t nLines = paired_for_each_parallel_after_wait(get_pair, lambda, single_threaded_until_true); + size_t nLines = paired_for_each_parallel_after_wait(get_pair, lambda, single_threaded_until_true, batch_size); delete[] buf; gzclose(fp); @@ -394,7 +282,8 @@ size_t fastq_paired_interleaved_for_each_parallel_after_wait(const string& filen size_t fastq_paired_two_files_for_each_parallel_after_wait(const string& file1, const string& file2, function lambda, - function single_threaded_until_true) { + function single_threaded_until_true, + uint64_t batch_size) { gzFile fp1 = (file1 != "-") ? gzopen(file1.c_str(), "r") : gzdopen(fileno(stdin), "r"); if (!fp1) { @@ -412,7 +301,7 @@ size_t fastq_paired_two_files_for_each_parallel_after_wait(const string& file1, return get_next_alignment_pair_from_fastqs(fp1, fp2, buf, len, mate1, mate2); }; - size_t nLines = paired_for_each_parallel_after_wait(get_pair, lambda, single_threaded_until_true); + size_t nLines = paired_for_each_parallel_after_wait(get_pair, lambda, single_threaded_until_true, batch_size); delete[] buf; gzclose(fp1); @@ -456,6 +345,7 @@ size_t fastq_paired_interleaved_for_each(const string& filename, function lambda) { gzFile fp1 = (file1 != "-") ? gzopen(file1.c_str(), "r") : gzdopen(fileno(stdin), "r"); if (!fp1) { @@ -539,44 +429,37 @@ void parse_rg_sample_map(char* hts_header, map& rg_sample) { } } -void write_alignments(std::ostream& out, vector& buf) { - function lambda = - [&buf] (size_t n) { - return buf[n]; - }; - stream::write(cout, buf.size(), lambda); -} - -short quality_char_to_short(char c) { - return static_cast(c) - 33; -} - -char quality_short_to_char(short i) { - return static_cast(i + 33); -} - -void alignment_quality_short_to_char(Alignment& alignment) { - alignment.set_quality(string_quality_short_to_char(alignment.quality())); -} - -string string_quality_short_to_char(const string& quality) { - string buffer; buffer.resize(quality.size()); - for (int i = 0; i < quality.size(); ++i) { - buffer[i] = quality_short_to_char(quality[i]); +void parse_tid_path_handle_map(const bam_hdr_t* hts_header, const PathHandleGraph* graph, map& tid_path_handle) { + if (!graph) { + // No path handles to find! + return; } - return buffer; -} - -void alignment_quality_char_to_short(Alignment& alignment) { - alignment.set_quality(string_quality_char_to_short(alignment.quality())); -} - -string string_quality_char_to_short(const string& quality) { - string buffer; buffer.resize(quality.size()); - for (int i = 0; i < quality.size(); ++i) { - buffer[i] = quality_char_to_short(quality[i]); + for (int i = 0; i < hts_header->n_targets; i++) { + // Pre-look-up all the paths mentioned in the header + string target_name(hts_header->target_name[i]); + if (graph->has_path(target_name)) { + path_handle_t target = graph->get_path_handle(target_name); + if (graph->get_sense(target) != PathSense::HAPLOTYPE) { + // Non-haplotype paths are allowed in the mapping because they + // are always path-position indexed. + + // Store the handles for the paths we find, under their HTSlib target numbers. + tid_path_handle.emplace(i, target); + } else { + // TODO: Decide we need to positional-index this path? Make + // PackedReferencePathOverlay take a collection of paths to + // index and use this one? 
+ #pragma omp critical (cerr) + std::cerr << "error[vg::parse_tid_path_handle_map] Path " << target_name + << " referenced in header exists in graph, but as a haplotype." + << " It is probably not indexed for positional lookup. Make the" + << " path a reference path" + << " " + << " and try again." << std::endl; + exit(1); + } + } } - return buffer; } // Internal conversion function for both paired and unpaired codepaths @@ -584,19 +467,30 @@ string alignment_to_sam_internal(const Alignment& alignment, const string& refseq, const int32_t refpos, const bool refrev, - const string& cigar, + const vector>& cigar, const string& mateseq, const int32_t matepos, + bool materev, const int32_t tlen, - bool paired) { - + bool paired, + const int32_t tlen_max) { + // Determine flags, using orientation, next/prev fragments, and pairing status. - int32_t flags = sam_flag(alignment, refrev, paired); - + int32_t flags = determine_flag(alignment, refseq, refpos, refrev, mateseq, matepos, materev, tlen, paired, tlen_max); + + string alignment_name; + if (paired) { + // We need to strip the /1 and /2 or _1 and _2 from paired reads so the two ends have the same name. + alignment_name = regex_replace(alignment.name(), regex("[/_][12]$"), ""); + } else { + // Keep the alignment name as is because even if the name looks paired, the reads are semantically unpaired. + alignment_name = alignment.name(); + } + // Have One True Flag for whether the read is mapped (and should have its // mapping stuff set) or unmapped (and should have things *'d out). bool mapped = !(flags & BAM_FUNMAP); - + if (mapped) { // Make sure we have everything assert(!refseq.empty()); @@ -605,28 +499,19 @@ string alignment_to_sam_internal(const Alignment& alignment, assert(alignment.has_path()); assert(alignment.path().mapping_size() > 0); } - - // We've observed some reads with the unmapped flag set and also a CIGAR string set, which shouldn't happen. - // We will check for this. The CIGAR string will only be set in the output if the alignment has a path. - assert((bool)(flags & BAM_FUNMAP) != (alignment.has_path() && alignment.path().mapping_size())); + + // We apply the convention of unmapped reads getting their mate's coordinates + // See section 2.4.1 https://samtools.github.io/hts-specs/SAMv1.pdf + bool use_mate_loc = !mapped && paired && !mateseq.empty(); stringstream sam; - string alignment_name; - if (paired) { - // We need to strip the /1 and /2 or _1 and _2 from paired reads so the two ends have the same name. - alignment_name = regex_replace(alignment.name(), regex("[/_][12]$"), ""); - } else { - // Keep the alignment name as is because even if the name looks paired, the reads are semantically unpaired. - alignment_name = alignment.name(); - } - sam << (!alignment_name.empty() ? alignment_name : "*") << "\t" << flags << "\t" - << (mapped ? refseq : "*") << "\t" - << refpos + 1 << "\t" + << (mapped ? refseq : use_mate_loc ? mateseq : "*") << "\t" + << (use_mate_loc ? matepos + 1 : refpos + 1) << "\t" << (mapped ? alignment.mapping_quality() : 0) << "\t" - << (mapped ? cigar : "*") << "\t" + << (mapped ? cigar_string(cigar) : "*") << "\t" << (mateseq == "" ? "*" : (mateseq == refseq ? 
"=" : mateseq)) << "\t" << matepos + 1 << "\t" << tlen << "\t" @@ -650,16 +535,69 @@ string alignment_to_sam_internal(const Alignment& alignment, return sam.str(); } +int32_t determine_flag(const Alignment& alignment, + const string& refseq, + const int32_t refpos, + const bool refrev, + const string& mateseq, + const int32_t matepos, + bool materev, + const int32_t tlen, + bool paired, + const int32_t tlen_max) { + + // Determine flags, using orientation, next/prev fragments, and pairing status. + int32_t flags = sam_flag(alignment, refrev, paired); + + // We've observed some reads with the unmapped flag set and also a CIGAR string set, which shouldn't happen. + // We will check for this. The CIGAR string will only be set in the output if the alignment has a path. + assert((bool)(flags & BAM_FUNMAP) != (alignment.has_path() && alignment.path().mapping_size())); + + if (!((bool)(flags & BAM_FUNMAP)) && paired && !refseq.empty() && refseq == mateseq) { + // Properly paired if both mates mapped to same sequence, in inward-facing orientations. + // We know they're on the same sequence, so check orientation. + + // If we are first, mate needs to be reverse, and if mate is first, we need to be reverse. + // If we are at the same position either way is fine. + bool facing = ((refpos <= matepos) && !refrev && materev) || ((matepos <= refpos) && refrev && !materev); + + // We are close enough if there is not tlen limit, or if there is one and we do not exceed it + bool close_enough = (tlen_max == 0) || abs(tlen) <= tlen_max; + + if (facing && close_enough) { + // We can't find anything wrong with this pair; it's properly paired. + flags |= BAM_FPROPER_PAIR; + } + + // TODO: Support sequencing technologies where "proper" pairing may + // have a different meaning or expected combination of orientations. + } + + if (paired && mateseq.empty()) { + // Set the flag for the mate being unmapped + flags |= BAM_FMUNMAP; + } + + if (paired && materev) { + // Set the flag for the mate being reversed + flags |= BAM_FMREVERSE; + } + + return flags; +} + string alignment_to_sam(const Alignment& alignment, const string& refseq, const int32_t refpos, const bool refrev, - const string& cigar, + const vector>& cigar, const string& mateseq, const int32_t matepos, - const int32_t tlen) { + bool materev, + const int32_t tlen, + const int32_t tlen_max) { - return alignment_to_sam_internal(alignment, refseq, refpos, refrev, cigar, mateseq, matepos, tlen, true); + return alignment_to_sam_internal(alignment, refseq, refpos, refrev, cigar, mateseq, matepos, materev, tlen, true, tlen_max); } @@ -667,70 +605,258 @@ string alignment_to_sam(const Alignment& alignment, const string& refseq, const int32_t refpos, const bool refrev, - const string& cigar) { + const vector>& cigar) { - return alignment_to_sam_internal(alignment, refseq, refpos, refrev, cigar, "", -1, 0, false); + return alignment_to_sam_internal(alignment, refseq, refpos, refrev, cigar, "", -1, false, 0, false, 0); } // Internal conversion function for both paired and unpaired codepaths -bam1_t* alignment_to_bam_internal(const string& sam_header, +bam1_t* alignment_to_bam_internal(bam_hdr_t* header, const Alignment& alignment, const string& refseq, const int32_t refpos, const bool refrev, - const string& cigar, + const vector>& cigar, const string& mateseq, const int32_t matepos, + bool materev, const int32_t tlen, - bool paired) { - - assert(!sam_header.empty()); - - // Make a tiny SAM file. 
Remember to URL-encode it, since it may contain '%' - string sam_file = "data:," + percent_url_encode(sam_header + - alignment_to_sam_internal(alignment, refseq, refpos, refrev, cigar, mateseq, matepos, tlen, paired)); - const char* sam = sam_file.c_str(); - samFile *in = sam_open(sam, "r"); - bam_hdr_t *header = sam_hdr_read(in); - bam1_t *aln = bam_init1(); - if (sam_read1(in, header, aln) >= 0) { - bam_hdr_destroy(header); - sam_close(in); // clean up - return aln; - } else { - cerr << "[vg::alignment] Failure to parse SAM record" << endl - << sam << endl; - exit(1); + bool paired, + const int32_t tlen_max) { + + // this table doesn't seem to be reproduced in htslib publicly, so I'm copying + // it from the CRAM conversion code + static const char nt_encoding[256] = { + 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, + 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, + 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, + 15,15,15,15,15,15,15,15,15,15,15,15,15, 0,15,15, + 15, 1,14, 2,13,15,15, 4,11,15,15,12,15, 3,15,15, + 15,15, 5, 6, 8,15, 7, 9,15,10,15,15,15,15,15,15, + 15, 1,14, 2,13,15,15, 4,11,15,15,12,15, 3,15,15, + 15,15, 5, 6, 8,15, 7, 9,15,10,15,15,15,15,15,15, + 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, + 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, + 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, + 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, + 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, + 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, + 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, + 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15 + }; + + // init an empty BAM record + bam1_t* bam = bam_init1(); + + // strip the pair order identifiers + string alignment_name = alignment.name(); + if (paired && alignment_name.size() >= 2) { + // We need to strip the /1 and /2 or _1 and _2 from paired reads so the two ends have the same name. + char c1 = alignment_name[alignment_name.size() - 2]; + char c2 = alignment_name[alignment_name.size() - 1]; + if ((c1 == '_' || c1 == '/') && (c2 == '1' || c2 == '2')) { + alignment_name = alignment_name.substr(0, alignment_name.size() - 2); + } } -} + + // calculate the size in bytes of the variable length fields (which are all concatenated in memory) + int qname_nulls = 4 - alignment_name.size() % 4; + int qname_data_size = alignment_name.size() + qname_nulls; + int cigar_data_size = 4 * cigar.size(); + int seq_data_size = (alignment.sequence().size() + 1) / 2; // round up + int qual_data_size = alignment.sequence().size(); // we will allocate this even if quality doesn't exist + + // allocate the joint variable length fields + int var_field_data_size = qname_data_size + cigar_data_size + seq_data_size + qual_data_size; + bam->data = (uint8_t*) calloc(var_field_data_size, sizeof(uint8_t)); + + // TODO: what ID is this? CRAM seems to ignore it, so maybe we can too... 
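+ // Worked example of the size bookkeeping above (illustrative numbers only): for a
+ // 7-character name, 3 CIGAR operations, and a 100 bp read, qname_nulls = 4 - 7 % 4 = 1,
+ // so qname_data_size = 8, cigar_data_size = 12, seq_data_size = (100 + 1) / 2 = 50
+ // (two bases per byte, rounded up), and qual_data_size = 100, for a 170-byte buffer
+ // laid out as qname-cigar-seq-qual; aux tags are appended later with bam_aux_append.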
+ //bam->id = 0; + bam->l_data = var_field_data_size; // current length of data + bam->m_data = var_field_data_size; // max length of data + + bam1_core_t& core = bam->core; + // mapping position + core.pos = refpos; + // ID of sequence mapped to + core.tid = sam_hdr_name2tid(header, refseq.c_str()); + // MAPQ + core.qual = alignment.mapping_quality(); + // number of nulls (above 1) used to pad read name string + core.l_extranul = qname_nulls - 1; + // bit flag + core.flag = determine_flag(alignment, refseq, refpos, refrev, mateseq, matepos, materev, tlen, paired, tlen_max); + // length of read name, including nulls + core.l_qname = qname_data_size; + // number of cigar operations + core.n_cigar = cigar.size(); + // length of read + core.l_qseq = alignment.sequence().size(); + // ID of sequence mate is mapped to + core.mtid = sam_hdr_name2tid(header, mateseq.c_str()); // TODO: what if there is no mate + // mapping position of mate + core.mpos = matepos; + // insert length of fragment + core.isize = tlen; + + // all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux + + // write query name, padded by nulls + uint8_t* name_data = bam->data; + for (size_t i = 0; i < alignment_name.size(); ++i) { + name_data[i] = (uint8_t) alignment_name[i]; + } + for (size_t i = 0; i < qname_nulls; ++i) { + name_data[i + alignment_name.size()] = '\0'; + } + + // encode cigar and copy into data -bam1_t* alignment_to_bam(const string& sam_header, - const Alignment& alignment, - const string& refseq, - const int32_t refpos, - const bool refrev, - const string& cigar, - const string& mateseq, - const int32_t matepos, - const int32_t tlen) { + uint32_t* cigar_data = (uint32_t*) (name_data + qname_data_size); - return alignment_to_bam_internal(sam_header, alignment, refseq, refpos, refrev, cigar, mateseq, matepos, tlen, true); + auto refend = core.pos; + for (size_t i = 0; i < cigar.size(); ++i) { + uint32_t op; + switch (cigar[i].second) { + case 'M': + case 'm': + op = BAM_CMATCH; + refend += cigar[i].first; + break; + case 'I': + case 'i': + op = BAM_CINS; + break; + case 'D': + case 'd': + op = BAM_CDEL; + refend += cigar[i].first; + break; + case 'N': + case 'n': + op = BAM_CREF_SKIP; + refend += cigar[i].first; + break; + case 'S': + case 's': + op = BAM_CSOFT_CLIP; + break; + case 'H': + case 'h': + op = BAM_CHARD_CLIP; + break; + case 'P': + case 'p': + op = BAM_CPAD; + break; + case '=': + op = BAM_CEQUAL; + refend += cigar[i].first; + break; + case 'X': + case 'x': + op = BAM_CDIFF; + refend += cigar[i].first; + break; + default: + throw runtime_error("Invalid CIGAR operation " + string(1, cigar[i].second)); + break; + } + cigar_data[i] = bam_cigar_gen(cigar[i].first, op); + } + + + // now we know where it ends, we can compute the bin + // copied from cram/cram_samtools.h + core.bin = hts_reg2bin(refpos, refend - 1, 14, 5); // TODO: not sure if end is past-the-last + + // convert sequence to 4-bit (nibble) encoding + uint8_t* seq_data = (uint8_t*) (cigar_data + cigar.size()); + const string* seq = &alignment.sequence(); + string rev_seq; + const string* qual = &alignment.quality(); + string rev_qual; + if (refrev) { + // Sequence and quality both need to be flipped to target forward orientation + rev_seq = reverse_complement(*seq); + seq = &rev_seq; + reverse_copy(qual->begin(), qual->end(), back_inserter(rev_qual)); + qual = &rev_qual; + } + for (size_t i = 0; i < alignment.sequence().size(); i += 2) { + if (i + 1 < alignment.sequence().size()) { + seq_data[i / 2] = 
(nt_encoding[seq->at(i)] << 4) | nt_encoding[seq->at(i + 1)]; + } + else { + seq_data[i / 2] = nt_encoding[seq->at(i)] << 4; + } + } + + // write the quality directly (it should already have the +33 offset removed) + uint8_t* qual_data = seq_data + seq_data_size; + for (size_t i = 0; i < alignment.sequence().size(); ++i) { + if (alignment.quality().empty()) { + // hacky, but this seems to be what they do in CRAM anyway + qual_data[i] = '\xff'; + } + else { + qual_data[i] = qual->at(i); + } + } + + if (~core.flag & BAM_FUNMAP) { + // we've decided that it is aligned + int32_t score = alignment.score(); + bam_aux_append(bam, "AS", 'i', sizeof(int32_t), (uint8_t*) &score); + } + + if (!alignment.read_group().empty()) { + bam_aux_append(bam, "RG", 'Z', alignment.read_group().size() + 1, (uint8_t*) alignment.read_group().c_str()); + } + + // this annotation comes from surject and should be retained in the BAM + if (has_annotation(alignment, "all_scores")) { + string all_scores = get_annotation(alignment, "all_scores"); + bam_aux_append(bam, "SS", 'Z', all_scores.size() + 1, (uint8_t*) all_scores.c_str()); + } + + // TODO: this does not seem to be a standardized field (https://samtools.github.io/hts-specs/SAMtags.pdf) +// if (!alignment.sample_name()) { +// +// } + + return bam; +} + +bam1_t* alignment_to_bam(bam_hdr_t* bam_header, + const Alignment& alignment, + const string& refseq, + const int32_t refpos, + const bool refrev, + const vector>& cigar, + const string& mateseq, + const int32_t matepos, + bool materev, + const int32_t tlen, + const int32_t tlen_max) { + + return alignment_to_bam_internal(bam_header, alignment, refseq, refpos, refrev, cigar, mateseq, matepos, materev, tlen, true, tlen_max); } -bam1_t* alignment_to_bam(const string& sam_header, - const Alignment& alignment, - const string& refseq, - const int32_t refpos, - const bool refrev, - const string& cigar) { +bam1_t* alignment_to_bam(bam_hdr_t* bam_header, + const Alignment& alignment, + const string& refseq, + const int32_t refpos, + const bool refrev, + const vector>& cigar) { - return alignment_to_bam_internal(sam_header, alignment, refseq, refpos, refrev, cigar, "", -1, 0, false); + return alignment_to_bam_internal(bam_header, alignment, refseq, refpos, refrev, cigar, "", -1, false, 0, false, 0); } -string cigar_string(vector >& cigar) { +string cigar_string(const vector >& cigar) { vector > cigar_comp; pair cur = make_pair(0, '\0'); for (auto& e : cigar) { @@ -784,39 +910,39 @@ string mapping_string(const string& source, const Mapping& mapping) { return result; } -void mapping_cigar(const Mapping& mapping, vector >& cigar) { +void mapping_cigar(const Mapping& mapping, vector>& cigar) { for (const auto& edit : mapping.edit()) { if (edit.from_length() && edit.from_length() == edit.to_length()) { // *matches* from_length == to_length, or from_length > 0 and offset unset // match state - cigar.push_back(make_pair(edit.from_length(), 'M')); + append_cigar_operation(edit.from_length(), 'M', cigar); //cerr << "match " << edit.from_length() << endl; } else { // mismatch/sub state // *snps* from_length == to_length; sequence = alt if (edit.from_length() == edit.to_length()) { - cigar.push_back(make_pair(edit.from_length(), 'M')); + append_cigar_operation(edit.from_length(), 'M', cigar); //cerr << "match " << edit.from_length() << endl; } else if (edit.from_length() > edit.to_length()) { // *deletions* from_length > to_length; sequence may be unset or empty int32_t del = edit.from_length() - edit.to_length(); int32_t eq = 
edit.to_length(); - if (eq) cigar.push_back(make_pair(eq, 'M')); - cigar.push_back(make_pair(del, 'D')); + if (eq) append_cigar_operation(eq, 'M', cigar); + append_cigar_operation(del, 'D', cigar); //cerr << "del " << edit.from_length() - edit.to_length() << endl; } else if (edit.from_length() < edit.to_length()) { // *insertions* from_length < to_length; sequence contains relative insertion int32_t ins = edit.to_length() - edit.from_length(); int32_t eq = edit.from_length(); - if (eq) cigar.push_back(make_pair(eq, 'M')); - cigar.push_back(make_pair(ins, 'I')); + if (eq) append_cigar_operation(eq, 'M', cigar); + append_cigar_operation(ins, 'I', cigar); //cerr << "ins " << edit.to_length() - edit.from_length() << endl; } } } } -int64_t cigar_mapping(const bam1_t *b, Mapping* mapping, xg::XG* xgindex) { +int64_t cigar_mapping(const bam1_t *b, Mapping* mapping) { int64_t ref_length = 0; int64_t query_length = 0; @@ -849,30 +975,28 @@ int64_t cigar_mapping(const bam1_t *b, Mapping* mapping, xg::XG* xgindex) { return ref_length; } -void mapping_against_path(Alignment& alignment, const bam1_t *b, char* chr, xg::XG* xgindex, bool on_reverse_strand) { +void mapping_against_path(Alignment& alignment, const bam1_t *b, const path_handle_t& path, const PathPositionHandleGraph* graph, bool on_reverse_strand) { if (b->core.pos == -1) return; Mapping mapping; - int64_t length = cigar_mapping(b, &mapping, xgindex); + int64_t length = cigar_mapping(b, &mapping); - Alignment aln = xgindex->target_alignment(chr, b->core.pos, b->core.pos + length, "", on_reverse_strand, mapping); + Alignment aln = target_alignment(graph, path, b->core.pos, b->core.pos + length, "", on_reverse_strand, mapping); *alignment.mutable_path() = aln.path(); Position* refpos = alignment.add_refpos(); - refpos->set_name(chr); + refpos->set_name(graph->get_path_name(path)); refpos->set_offset(b->core.pos); refpos->set_is_reverse(on_reverse_strand); } -// act like the path this is against is the reference -// and generate an equivalent cigar -// Produces CIGAR in forward strand space of the reference sequence. -string cigar_against_path(const Alignment& alignment, bool on_reverse_strand, int64_t& pos, size_t path_len, size_t softclip_suppress) { +vector> cigar_against_path(const Alignment& alignment, bool on_reverse_strand, int64_t& pos, size_t path_len, size_t softclip_suppress) { vector > cigar; - if (!alignment.has_path() || alignment.path().mapping_size() == 0) return ""; + + if (!alignment.has_path() || alignment.path().mapping_size() == 0) return cigar; const Path& path = alignment.path(); int l = 0; @@ -887,6 +1011,12 @@ string cigar_against_path(const Alignment& alignment, bool on_reverse_strand, in // handle soft clips, which are just insertions at the start or end // back + if (cigar.size() > 1 && cigar.back().second == 'D' && cigar[cigar.size() - 2].second == 'I') { + // Swap insert to the outside so it can be a softclip. + // When making the CIGAR we should put D before I but when flipping the + // strand they may switch. 
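+        // Illustrative example: a CIGAR ending in ...5M 3I 2D is reordered here to ...5M 2D 3I,
+        // so that the trailing insertion can be converted into a softclip by the check below.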
+ std::swap(cigar.back(), cigar[cigar.size() - 2]); + } if (cigar.back().second == 'I') { // make sure we stay in the reference sequence when suppressing the softclips if (cigar.back().first <= softclip_suppress @@ -897,6 +1027,10 @@ string cigar_against_path(const Alignment& alignment, bool on_reverse_strand, in } } // front + if (cigar.size() > 1 && cigar.front().second == 'D' && cigar[1].second == 'I') { + // Swap insert to the outside so it can be a softclip + std::swap(cigar.front(), cigar[1]); + } if (cigar.front().second == 'I') { // make sure we stay in the reference sequence when suppressing the softclips if (cigar.front().first <= softclip_suppress @@ -907,8 +1041,116 @@ string cigar_against_path(const Alignment& alignment, bool on_reverse_strand, in cigar.front().second = 'S'; } } + + simplify_cigar(cigar); + + return cigar; +} + +void simplify_cigar(vector>& cigar) { + + size_t removed = 0; + for (size_t i = 0, j = 0; i < cigar.size(); ++j) { + if (j == cigar.size() || (cigar[j].second != 'I' && cigar[j].second != 'D')) { + // this is the end boundary of a runs of I/D operations + if (j - i >= 3) { + // we have at least 3 adjacent I/D operations, which means they should + // be re-consolidated + int d_total = 0, i_total = 0; + for (size_t k = i - removed, end = j - removed; k < end; ++k) { + if (cigar[k].second == 'D') { + d_total += cigar[k].first; + } + else { + i_total += cigar[k].first; + } + } + + cigar[i - removed] = make_pair(d_total, 'D'); + cigar[i - removed + 1] = make_pair(i_total, 'I'); + + // mark that we've removed cigar operations + removed += j - i - 2; + } + // move the start of the next I/D run beyond the current operation + i = j + 1; + } + if (j < cigar.size()) { + cigar[j - removed] = cigar[j]; + } + } + cigar.resize(cigar.size() - removed); + // do a second pass removing empty operations and consolidating non I/D operations + removed = 0; + for (size_t i = 0; i < cigar.size(); ++i) { + if (cigar[i].first == 0) { + ++removed; + } + else if (i > removed && cigar[i].second == cigar[i - removed - 1].second) { + cigar[i - removed - 1].first += cigar[i].first; + ++removed; + } + else if (removed) { + cigar[i - removed] = cigar[i]; + } + } + cigar.resize(cigar.size() - removed); +} + +pair compute_template_lengths(const int64_t& pos1, const vector>& cigar1, + const int64_t& pos2, const vector>& cigar2) { + + // Compute signed distance from outermost matched/mismatched base of each + // alignment to the outermost matched/mismatched base of the other. + + // We work with CIGARs because it's easier than reverse complementing + // Alignment objects without node lengths. + + // Work out the low and high mapped bases for each side + auto find_bounds = [](const int64_t& pos, const vector>& cigar) { + // Initialize bounds to represent no mapped bases + int64_t low = numeric_limits::max(); + int64_t high = numeric_limits::min(); + + // Track position in the reference + int64_t here = pos; + for (auto& item : cigar) { + // Trace along the cigar + if (item.second == 'M') { + // Bases are matched. 
Count them in the bounds and execute the operation + low = min(low, here); + here += item.first; + high = max(high, here); + } else if (item.second == 'D') { + // Only other way to advance in the reference + here += item.first; + } + } + + return make_pair(low, high); + }; + + auto bounds1 = find_bounds(pos1, cigar1); + auto bounds2 = find_bounds(pos2, cigar2); + + // Compute the separation + int32_t dist = 0; + if (bounds1.first < bounds2.second) { + // The reads are in order + dist = bounds2.second - bounds1.first; + } else if (bounds2.first < bounds1.second) { + // The reads are out of order so the other bounds apply + dist = bounds1.second - bounds2.first; + } + + if (pos1 < pos2) { + // Count read 1 as the overall "leftmost", so its value will be positive + return make_pair(dist, -dist); + } else { + // Count read 2 as the overall leftmost + return make_pair(-dist, dist); + } - return cigar_string(cigar); } int32_t sam_flag(const Alignment& alignment, bool on_reverse_strand, bool paired) { @@ -935,11 +1177,7 @@ int32_t sam_flag(const Alignment& alignment, bool on_reverse_strand, bool paired if (!alignment.has_path() || alignment.path().mapping_size() == 0) { // unmapped flag |= BAM_FUNMAP; - } else if (flag & BAM_FPAIRED) { - // Aligned and in a pair, so assume it's properly paired. - // TODO: this relies on us not emitting improperly paired reads - flag |= BAM_FPROPER_PAIR; - } + } if (on_reverse_strand) { flag |= BAM_FREVERSE; } @@ -947,12 +1185,14 @@ int32_t sam_flag(const Alignment& alignment, bool on_reverse_strand, bool paired flag |= BAM_FSECONDARY; } - - return flag; } -Alignment bam_to_alignment(const bam1_t *b, map& rg_sample, const bam_hdr_t *bh, xg::XG* xgindex) { +Alignment bam_to_alignment(const bam1_t *b, + const map& rg_sample, + const map& tid_path_handle, + const bam_hdr_t *bh, + const PathPositionHandleGraph* graph) { Alignment alignment; @@ -972,11 +1212,14 @@ Alignment bam_to_alignment(const bam1_t *b, map& rg_sample, cons // get the read group and sample name uint8_t *rgptr = bam_aux_get(b, "RG"); - char* rg = (char*) (rgptr+1); - //if (!rg_sample + string read_group; string sname; - if (!rg_sample.empty()) { - sname = rg_sample[string(rg)]; + if (rgptr && !rg_sample.empty()) { + read_group = string((char*) (rgptr+1)); + auto found = rg_sample.find(read_group); + if (found != rg_sample.end()) { + sname = found->second; + } } // Now name the read after the scaffold @@ -1013,23 +1256,31 @@ Alignment bam_to_alignment(const bam1_t *b, map& rg_sample, cons } - if (xgindex != nullptr && bh != nullptr) { + if (graph != nullptr && bh != nullptr && b->core.tid >= 0) { alignment.set_mapping_quality(b->core.qual); - mapping_against_path(alignment, b, bh->target_name[b->core.tid], xgindex, b->core.flag & BAM_FREVERSE); + // Look for the path handle this is against. + auto found = tid_path_handle.find(b->core.tid); + if (found == tid_path_handle.end()) { + cerr << "[vg::alignment.cpp] error: alignment references path not present in graph: " + << bh->target_name[b->core.tid] << endl; + exit(1); + } + mapping_against_path(alignment, b, found->second, graph, b->core.flag & BAM_FREVERSE); } // TODO: htslib doesn't wrap this flag for some reason. 
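+    // (BAM_FSECONDARY is the standard 0x100 "secondary alignment" bit of the SAM flag field.)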
alignment.set_is_secondary(b->core.flag & BAM_FSECONDARY); - if (sname.size()) { + if (!sname.empty()) { alignment.set_sample_name(sname); - alignment.set_read_group(rg); + // We know the sample name came from a read group + alignment.set_read_group(read_group); } return alignment; } -Alignment bam_to_alignment(const bam1_t *b, map& rg_sample) { - return bam_to_alignment(b, rg_sample, nullptr, nullptr); +Alignment bam_to_alignment(const bam1_t *b, const map& rg_sample, const map& tid_path_handle) { + return bam_to_alignment(b, rg_sample, tid_path_handle, nullptr, nullptr); } int alignment_to_length(const Alignment& a) { @@ -1311,6 +1562,21 @@ int softclip_end(const Alignment& alignment) { return 0; } +int softclip_trim(Alignment& alignment) { + // Trim the softclips off of every read + // Work out were to cut + int cut_start = softclip_start(alignment); + int cut_end = softclip_end(alignment); + // Cut the sequence and quality + alignment.set_sequence(alignment.sequence().substr(cut_start, alignment.sequence().size() - cut_start - cut_end)); + if (alignment.quality().size() != 0) { + alignment.set_quality(alignment.quality().substr(cut_start, alignment.quality().size() - cut_start - cut_end)); + } + // Trim the path + *alignment.mutable_path() = trim_hanging_ends(alignment.path()); + return cut_start + cut_end; +} + int query_overlap(const Alignment& aln1, const Alignment& aln2) { if (!alignment_to_length(aln1) || !alignment_to_length(aln2) || !aln1.path().mapping_size() || !aln2.path().mapping_size() @@ -1364,12 +1630,224 @@ Alignment simplify(const Alignment& a, bool trim_internal_deletions) { } return aln; } + +void normalize_alignment(Alignment& alignment) { + + enum edit_type_t {None, Match, Mismatch, Insert, Delete, N}; + + size_t cumul_to_length = 0; + + // we only build the normalized path if we find things we need to normalize + // (this makes the whole algorithm a little fucky, but it should be less overhead) + bool doing_normalization = false; + Path normalized; + + const Path& path = alignment.path(); + const string& seq = alignment.sequence(); + + auto ensure_init_normalized_path = [&](size_t i, size_t j) { + // we won't copy the already normalized prefix unless we have to + if (!doing_normalization) { + for (size_t k = 0; k < i; k++) { + *normalized.add_mapping() = path.mapping(k); + } + Mapping* mapping = normalized.add_mapping(); + *mapping->mutable_position() = path.mapping(i).position(); + mapping->set_rank(path.mapping_size()); + for (size_t k = 0; k < j; k++) { + *mapping->add_edit() = path.mapping(i).edit(k); + } + doing_normalization = true; + } + }; + + edit_type_t prev = None; + + for (size_t i = 0; i < path.mapping_size(); ++i) { + + const Mapping& mapping = path.mapping(i); + prev = None; + + if (doing_normalization) { + // we're maintaining the normalized path, so we need to add mappings + // as we go + Mapping* norm_mapping = normalized.add_mapping(); + *norm_mapping->mutable_position() = mapping.position(); + norm_mapping->set_rank(normalized.mapping_size()); + } + + for (size_t j = 0; j < mapping.edit_size(); ++j) { + + const Edit& edit = mapping.edit(j); + + if (edit.from_length() > 0 && edit.to_length() == 0) { + + if (prev == Delete || doing_normalization) { + // we need to modify the normalized path this round + ensure_init_normalized_path(i, j); + Mapping* norm_mapping = normalized.mutable_mapping(normalized.mapping_size() - 1); + if (prev == Delete) { + // merge with the previous + Edit* norm_edit = norm_mapping->mutable_edit(norm_mapping->edit_size() - 
1); + norm_edit->set_from_length(norm_edit->from_length() + edit.from_length()); + } + else { + // just copy + *norm_mapping->add_edit() = edit; + } + } + + prev = Delete; + } + else if (edit.from_length() == 0 && edit.to_length() > 0) { + + if (prev == Insert || doing_normalization) { + // we need to modify the normalized path this round + ensure_init_normalized_path(i, j); + Mapping* norm_mapping = normalized.mutable_mapping(normalized.mapping_size() - 1); + if (prev == Insert) { + // merge with the previous + Edit* norm_edit = norm_mapping->mutable_edit(norm_mapping->edit_size() - 1); + norm_edit->set_to_length(norm_edit->to_length() + edit.to_length()); + norm_edit->mutable_sequence()->append(edit.sequence()); + } + else { + // just copy + *norm_mapping->add_edit() = edit; + } + } + + cumul_to_length += edit.to_length(); + prev = Insert; + } + else { + auto begin = seq.begin() + cumul_to_length; + auto end = begin + edit.to_length(); + + auto first_N = find(begin, end, 'N'); + + edit_type_t type = edit.sequence().empty() ? Match : Mismatch; + + if (prev == type || first_N != end || doing_normalization) { + // we have to do some normalization here + ensure_init_normalized_path(i, j); + + Mapping* norm_mapping = normalized.mutable_mapping(normalized.mapping_size() - 1); + if (first_N == end && prev != type) { + // just need to copy, no fancy normalization + *norm_mapping->add_edit() = edit; + prev = type; + } + else if (first_N == end) { + // we need to extend the previous edit, but we don't need + // to worry about Ns + Edit* norm_edit = norm_mapping->mutable_edit(norm_mapping->edit_size() - 1); + norm_edit->set_from_length(norm_edit->from_length() + edit.from_length()); + norm_edit->set_to_length(norm_edit->to_length() + edit.to_length()); + if (type == Mismatch) { + norm_edit->mutable_sequence()->append(edit.sequence()); + } + } + else { + bool on_Ns = first_N == begin; + auto next_pos = begin; + // iterate until we've handled the whole edit sequence + while (next_pos != end) { + // find the next place where we switch from N to non-N or the reverse + auto next_end = find_if(next_pos, end, [&](char c) { + return c == 'N' != on_Ns; + }); + + if ((prev == N && on_Ns) || (prev == type && !on_Ns)) { + // we need to merge with the previous edit + Edit* norm_edit = norm_mapping->mutable_edit(norm_mapping->edit_size() - 1); + norm_edit->set_from_length(norm_edit->from_length() + edit.from_length()); + norm_edit->set_to_length(norm_edit->to_length() + edit.to_length()); + + // we copy sequence for Ns and for mismatches only + if ((prev == N && on_Ns) || (prev == type && !on_Ns && type == Mismatch)) { + norm_edit->mutable_sequence()->append(next_pos, next_end); + } + } + else { + // we can just copy + Edit* norm_edit = norm_mapping->add_edit(); + norm_edit->set_from_length(next_end - next_pos); + norm_edit->set_to_length(next_end - next_pos); + *norm_edit->mutable_sequence() = string(next_pos, next_end); + } + + next_pos = next_end; + prev = on_Ns ? 
N : type; + on_Ns = !on_Ns; + } + } + } + else { + // no normalization yet + prev = type; + } + + cumul_to_length += edit.to_length(); + } + } + } + + if (doing_normalization) { + // we found things we needed to normalize away, so we must have built the normalized + // path, now replace the original with it + *alignment.mutable_path() = move(normalized); + } +} -void write_alignment_to_file(const Alignment& aln, const string& filename) { - ofstream out(filename); - vector alnz = { aln }; - stream::write_buffered(out, alnz, 1); - out.close(); +bool uses_Us(const Alignment& alignment) { + + for (char nt : alignment.sequence()) { + switch (nt) { + case 'U': + return true; + break; + + case 'T': + return false; + break; + + default: + break; + } + } + return false; +} + +void convert_alignment_char(Alignment& alignment, char from, char to) { + auto& seq = *alignment.mutable_sequence(); + for (size_t i = 0; i < seq.size(); ++i) { + if (seq[i] == from) { + seq[i] = to; + } + } + if (alignment.has_path()) { + for (Mapping& mapping : *alignment.mutable_path()->mutable_mapping()) { + for (Edit& edit : *mapping.mutable_edit()) { + if (!edit.sequence().empty()) { + auto& eseq = *edit.mutable_sequence(); + for (size_t i = 0; i < eseq.size(); ++i) { + if (eseq[i] == from) { + eseq[i] = to; + } + } + } + } + } + } +} + +void convert_Us_to_Ts(Alignment& alignment) { + convert_alignment_char(alignment, 'U', 'T'); +} + +void convert_Ts_to_Us(Alignment& alignment) { + convert_alignment_char(alignment, 'T', 'U'); } map alignment_quality_per_node(const Alignment& aln) { @@ -1419,7 +1897,7 @@ pair signature(const Alignment& aln1, const Alignment& aln2) { } void parse_bed_regions(istream& bedstream, - xg::XG* xgindex, + const PathPositionHandleGraph* graph, vector* out_alignments) { out_alignments->clear(); if (!bedstream) { @@ -1443,23 +1921,25 @@ void parse_bed_regions(istream& bedstream, istringstream ss(row); ss >> seq; - if (xgindex->path_rank(seq) == 0) { + if (!graph->has_path(seq)) { // This path doesn't exist, and we'll get a segfault or worse if // we go look for positions in it. cerr << "warning: path \"" << seq << "\" not found in index, skipping" << endl; continue; } + path_handle_t path_handle = graph->get_path_handle(seq); + ss >> sbuf; ss >> ebuf; if (ss.fail()) { // Skip lines that can't be parsed - cerr << "Error parsing bed line " << line << ": " << row << endl; + cerr << "warning: Error parsing bed line " << line << ", skipping: " << row << endl; continue; } - if (sbuf >= ebuf && !xgindex->path_is_circular(seq)) { + if (sbuf >= ebuf && !graph->get_is_circular(path_handle)) { // The start of the region can be after the end of the region only if the underlying path is circular. // That's not the case, so complain and skip the region. cerr << "warning: path \"" << seq << "\" is not circular, skipping end-spanning region on line " @@ -1467,6 +1947,20 @@ void parse_bed_regions(istream& bedstream, continue; } + if (ebuf > graph->get_path_length(path_handle)) { + // Skip ends that are too late + cerr << "warning: out of range path end " << ebuf << " > " << graph->get_path_length(path_handle) + << " in bed line " << line << ", skipping: " << row << endl; + continue; + } + + if (sbuf >= graph->get_path_length(path_handle)) { + // Skip starts that are too late + cerr << "warning: out of range path start " << sbuf << " >= " << graph->get_path_length(path_handle) + << " in bed line " << line << ", skipping: " << row << endl; + continue; + } + // Try parsing the optional fields. 
If they fail, ignore the problem, because they're optional. ss >> name; ss >> score; @@ -1478,7 +1972,7 @@ void parse_bed_regions(istream& bedstream, } // Make the Alignment - Alignment alignment = xgindex->target_alignment(seq, sbuf, ebuf, name, is_reverse); + Alignment alignment = target_alignment(graph, path_handle, sbuf, ebuf, name, is_reverse); alignment.set_score(score); out_alignments->push_back(alignment); @@ -1486,7 +1980,7 @@ void parse_bed_regions(istream& bedstream, } void parse_gff_regions(istream& gffstream, - xg::XG* xgindex, + const PathPositionHandleGraph* graph, vector* out_alignments) { out_alignments->clear(); if (!gffstream) { @@ -1515,8 +2009,10 @@ void parse_gff_regions(istream& gffstream, getline(ss, source, '\t'); getline(ss, type, '\t'); getline(ss, buf, '\t'); - sbuf = atoi(buf.c_str()); + // Convert to 0-based + sbuf = atoi(buf.c_str()) - 1; getline(ss, buf, '\t'); + // 1-based inclusive == 0-based exclusive ebuf = atoi(buf.c_str()); if (ss.fail() || !(sbuf < ebuf)) { @@ -1527,23 +2023,33 @@ void parse_gff_regions(istream& gffstream, getline(ss, num, '\t'); getline(ss, annotations, '\t'); vector vals = split(annotations, ";"); + + string name = ""; + for (auto& s : vals) { if (s.find("Name=") == 0) { name = s.substr(5); } } + // Skips annotations where the name can not be parsed. Empty names can + // results in undefinable behavior downstream. + if (name.empty()) { + cerr << "warning: could not parse annotation name (Name=), skipping line " << line << endl; + continue; + } + bool is_reverse = false; if(!ss.fail() && strand.compare("-") == 0) { is_reverse = true; } - if (xgindex->path_rank(seq) == 0) { + if (!graph->has_path(seq)) { // This path doesn't exist, and we'll get a segfault or worse if // we go look for positions in it. cerr << "warning: path \"" << seq << "\" not found in index, skipping" << endl; } else { - Alignment alignment = xgindex->target_alignment(seq, sbuf, ebuf, name, is_reverse); + Alignment alignment = target_alignment(graph, graph->get_path_handle(seq), sbuf, ebuf, name, is_reverse); out_alignments->push_back(alignment); } @@ -1577,12 +2083,12 @@ map > > alignment_refpos_to_path_offsets(const return offsets; } -void alignment_set_distance_to_correct(Alignment& aln, const Alignment& base) { +void alignment_set_distance_to_correct(Alignment& aln, const Alignment& base, const unordered_map* translation) { auto base_offsets = alignment_refpos_to_path_offsets(base); - return alignment_set_distance_to_correct(aln, base_offsets); + return alignment_set_distance_to_correct(aln, base_offsets, translation); } -void alignment_set_distance_to_correct(Alignment& aln, const map > >& base_offsets) { +void alignment_set_distance_to_correct(Alignment& aln, const map > >& base_offsets, const unordered_map* translation) { auto aln_offsets = alignment_refpos_to_path_offsets(aln); // bail out if we can't compare if (!(aln_offsets.size() && base_offsets.size())) return; @@ -1590,7 +2096,15 @@ void alignment_set_distance_to_correct(Alignment& aln, const map::max(); for (auto& path : aln_offsets) { - auto& name = path.first; + auto name = path.first; + if (translation) { + // See if we need to translate the name of the path + auto found = translation->find(name); + if (found != translation->end()) { + // We have a replacement so apply it. 
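+            // (this rewrites the refpos path name into the naming scheme used by base_offsets before the lookup below)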
+ name = found->second; + } + } auto& aln_positions = path.second; auto f = base_offsets.find(name); if (f == base_offsets.end()) continue; @@ -1616,4 +2130,302 @@ void alignment_set_distance_to_correct(Alignment& aln, const maphas_node(mapping.position().node_id())) { + std::stringstream ss; + ss << "Node " << mapping.position().node_id() << " not found in graph"; + return { + AlignmentValidity::NODE_MISSING, + i, + ss.str() + }; + } + size_t node_len = hgraph->get_length(hgraph->get_handle(mapping.position().node_id())); + if (mapping_from_length(mapping) + mapping.position().offset() > node_len) { + std::stringstream ss; + ss << "Length of node " + << mapping.position().node_id() << " (" << node_len << ") exceeded by Mapping with offset " + << mapping.position().offset() << " and from-length " << mapping_from_length(mapping); + return { + AlignmentValidity::NODE_TOO_SHORT, + i, + ss.str() + }; + } + } + return {AlignmentValidity::OK}; +} + +Alignment target_alignment(const PathPositionHandleGraph* graph, const path_handle_t& path, size_t pos1, size_t pos2, + const string& feature, bool is_reverse, Mapping& cigar_mapping) { + Alignment aln; + + // How long is the path? + auto path_len = graph->get_path_length(path); + + if (pos2 < pos1) { + // Looks like we want to span the origin of a circular path + if (!graph->get_is_circular(path)) { + // But the path isn't circular, which is a problem + throw runtime_error("Cannot extract Alignment from " + to_string(pos1) + + " to " + to_string(pos2) + " across the junction of non-circular path " + + graph->get_path_name(path)); + } + + if (pos1 >= path_len) { + // We want to start off the end of the path, which is no good. + throw runtime_error("Cannot extract Alignment starting at " + to_string(pos1) + + " which is past end " + to_string(path_len) + " of path " + + graph->get_path_name(path)); + } + + if (pos2 > path_len) { + // We want to end off the end of the path, which is no good either. 
+ throw runtime_error("Cannot extract Alignment ending at " + to_string(pos2) + + " which is past end " + to_string(path_len) + " of path " + + graph->get_path_name(path)); + } + + // Split the proivided Mapping of edits at the path end/start junction + auto part_mappings = cut_mapping_offset(cigar_mapping, path_len - pos1); + + // We extract from pos1 to the end + Alignment aln1 = target_alignment(graph, path, pos1, path_len, feature, is_reverse, part_mappings.first); + + // And then from the start to pos2 + Alignment aln2 = target_alignment(graph, path, 0, pos2, feature, is_reverse, part_mappings.second); + + if (is_reverse) { + // The alignments were flipped, so the second has to be first + return merge_alignments(aln2, aln1); + } else { + // The alignments get merged in the same order + return merge_alignments(aln1, aln2); + } + } + + // Otherwise, the base case is that we don't go over the circular path junction + + if (pos1 >= path_len) { + throw runtime_error("Cannot extract Alignment starting at " + to_string(pos1) + + " which is past end " + to_string(path_len) + " of path " + + graph->get_path_name(path)); + } + if (pos2 > path_len) { + throw runtime_error("Cannot extract Alignment ending at " + to_string(pos2) + + " which is past end " + to_string(path_len) + " of path " + + graph->get_path_name(path)); + } + + step_handle_t step = graph->get_step_at_position(path, pos1); + size_t step_start = graph->get_position_of_step(step); + handle_t handle = graph->get_handle_of_step(step); + + int64_t trim_start = pos1 - step_start; + { + Mapping* first_mapping = aln.mutable_path()->add_mapping(); + first_mapping->mutable_position()->set_node_id(graph->get_id(handle)); + first_mapping->mutable_position()->set_is_reverse(graph->get_is_reverse(handle)); + first_mapping->mutable_position()->set_offset(trim_start); + + auto mappings = cut_mapping_offset(cigar_mapping, graph->get_length(handle)-trim_start); + first_mapping->clear_edit(); + + string from_seq = graph->get_sequence(handle); + int from_pos = trim_start; + for (size_t j = 0; j < mappings.first.edit_size(); ++j) { + if (mappings.first.edit(j).to_length() == mappings.first.edit(j).from_length()) {// if (mappings.first.edit(j).sequence() != nullptr) { + // do the sequences match? 
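+                // (compare the node sequence against this length-preserving edit one base at a time,
+                //  splitting it into runs of matches and single-base substitutions)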
+ // emit a stream of "SNPs" and matches + int last_start = from_pos; + int k = 0; + Edit* edit; + for (int to_pos = 0 ; to_pos < mappings.first.edit(j).to_length() ; ++to_pos, ++from_pos) { + //cerr << h << ":" << k << " " << from_seq[h] << " " << to_seq[k] << endl; + if (from_seq[from_pos] != mappings.first.edit(j).sequence()[to_pos]) { + // emit the last "match" region + if (from_pos - last_start > 0) { + edit = first_mapping->add_edit(); + edit->set_from_length(from_pos-last_start); + edit->set_to_length(from_pos-last_start); + } + // set up the SNP + edit = first_mapping->add_edit(); + edit->set_from_length(1); + edit->set_to_length(1); + edit->set_sequence(from_seq.substr(to_pos,1)); + last_start = from_pos+1; + } + } + // handles the match at the end or the case of no SNP + if (from_pos - last_start > 0) { + edit = first_mapping->add_edit(); + edit->set_from_length(from_pos-last_start); + edit->set_to_length(from_pos-last_start); + } + // to_pos += length; + // from_pos += length; + } else { + // Edit* edit = first_mapping->add_edit(); + // *edit = mappings.first.edit(j); + *first_mapping->add_edit() = mappings.first.edit(j); + from_pos += mappings.first.edit(j).from_length(); + } + } + cigar_mapping = mappings.second; + } + // get p to point to the next step (or past it, if we're a feature on a single node) + int64_t p = step_start + graph->get_length(handle); + step = graph->get_next_step(step); + while (p < pos2) { + handle = graph->get_handle_of_step(step); + + auto mappings = cut_mapping_offset(cigar_mapping, graph->get_length(handle)); + + Mapping m; + m.mutable_position()->set_node_id(graph->get_id(handle)); + m.mutable_position()->set_is_reverse(graph->get_is_reverse(handle)); + + string from_seq = graph->get_sequence(handle); + int from_pos = 0; + for (size_t j = 0 ; j < mappings.first.edit_size(); ++j) { + if (mappings.first.edit(j).to_length() == mappings.first.edit(j).from_length()) { + // do the sequences match? 
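+                // (same per-base comparison as for the first mapping above, repeated for each subsequent node on the path)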
+ // emit a stream of "SNPs" and matches + int last_start = from_pos; + int k = 0; + Edit* edit; + for (int to_pos = 0 ; to_pos < mappings.first.edit(j).to_length() ; ++to_pos, ++from_pos) { + //cerr << h << ":" << k << " " << from_seq[h] << " " << to_seq[k] << endl; + if (from_seq[from_pos] != mappings.first.edit(j).sequence()[to_pos]) { + // emit the last "match" region + if (from_pos - last_start > 0) { + edit = m.add_edit(); + edit->set_from_length(from_pos-last_start); + edit->set_to_length(from_pos-last_start); + } + // set up the SNP + edit = m.add_edit(); + edit->set_from_length(1); + edit->set_to_length(1); + edit->set_sequence(from_seq.substr(to_pos,1)); + last_start = from_pos+1; + } + } + // handles the match at the end or the case of no SNP + if (from_pos - last_start > 0) { + edit = m.add_edit(); + edit->set_from_length(from_pos-last_start); + edit->set_to_length(from_pos-last_start); + } + // to_pos += length; + // from_pos += length; + } else { + *m.add_edit() = mappings.first.edit(j); + from_pos += mappings.first.edit(j).from_length(); + } + } + cigar_mapping = mappings.second; + *aln.mutable_path()->add_mapping() = m; + p += mapping_from_length(aln.path().mapping(aln.path().mapping_size()-1)); + step = graph->get_next_step(step); + } + aln.set_name(feature); + if (is_reverse) { + reverse_complement_alignment_in_place(&aln, [&](vg::id_t node_id) { return graph->get_length(graph->get_handle(node_id)); }); + } + return aln; +} + +Alignment target_alignment(const PathPositionHandleGraph* graph, const path_handle_t& path, size_t pos1, size_t pos2, + const string& feature, bool is_reverse) { + Alignment aln; + + + if (pos2 < pos1) { + // Looks like we want to span the origin of a circular path + if (!graph->get_is_circular(path)) { + // But the path isn't circular, which is a problem + throw runtime_error("Cannot extract Alignment from " + to_string(pos1) + + " to " + to_string(pos2) + " across the junction of non-circular path " + + graph->get_path_name(path)); + } + + // How long is the path? + auto path_len = graph->get_path_length(path); + + if (pos1 >= path_len) { + // We want to start off the end of the path, which is no good. + throw runtime_error("Cannot extract Alignment starting at " + to_string(pos1) + + " which is past end " + to_string(path_len) + " of path " + + graph->get_path_name(path)); + } + + if (pos2 > path_len) { + // We want to end off the end of the path, which is no good either. + throw runtime_error("Cannot extract Alignment ending at " + to_string(pos2) + + " which is past end " + to_string(path_len) + " of path " + + graph->get_path_name(path)); + } + + // We extract from pos1 to the end + Alignment aln1 = target_alignment(graph, path, pos1, path_len, feature, is_reverse); + + // And then from the start to pos2 + Alignment aln2 = target_alignment(graph, path, 0, pos2, feature, is_reverse); + + if (is_reverse) { + // The alignments were flipped, so the second has to be first + return merge_alignments(aln2, aln1); + } else { + // The alignments get merged in the same order + return merge_alignments(aln1, aln2); + } + } + + // If we get here, we do the normal non-circular path case. 
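+    // Walk path steps starting from the one covering pos1, emitting one full-length match Mapping
+    // per node; the first is trimmed by an offset and the last is trimmed in length so the
+    // Alignment covers exactly the interval [pos1, pos2).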
+ + step_handle_t step = graph->get_step_at_position(path, pos1); + size_t step_start = graph->get_position_of_step(step); + handle_t handle = graph->get_handle_of_step(step); + + int64_t trim_start = pos1 - step_start; + { + Mapping* first_mapping = aln.mutable_path()->add_mapping(); + first_mapping->mutable_position()->set_node_id(graph->get_id(handle)); + first_mapping->mutable_position()->set_is_reverse(graph->get_is_reverse(handle)); + first_mapping->mutable_position()->set_offset(trim_start); + + Edit* e = first_mapping->add_edit(); + size_t edit_len = min(graph->get_length(handle) - trim_start, pos2 - pos1); + e->set_from_length(edit_len); + e->set_to_length(edit_len); + } + // get p to point to the next step (or past it, if we're a feature on a single node) + int64_t p = step_start + graph->get_length(handle); + step = graph->get_next_step(step); + while (p < pos2) { + handle = graph->get_handle_of_step(step); + + Mapping* m = aln.mutable_path()->add_mapping(); + m->mutable_position()->set_node_id(graph->get_id(handle)); + m->mutable_position()->set_is_reverse(graph->get_is_reverse(handle)); + + Edit* e = m->add_edit(); + size_t edit_len = min(graph->get_length(handle), pos2 - p); + e->set_from_length(edit_len); + e->set_to_length(edit_len); + + p += graph->get_length(handle); + step = graph->get_next_step(step); + } + + aln.set_name(feature); + if (is_reverse) { + reverse_complement_alignment_in_place(&aln, [&](vg::id_t node_id) { return graph->get_length(graph->get_handle(node_id)); }); + } + return aln; +} } diff --git a/src/alignment.hpp b/src/alignment.hpp index 0784e9b743d..d5aa5457242 100644 --- a/src/alignment.hpp +++ b/src/alignment.hpp @@ -7,13 +7,14 @@ #include "utility.hpp" #include "path.hpp" #include "position.hpp" -#include "vg.pb.h" -#include "xg.hpp" -#include "edit.hpp" -#include "htslib/hfile.h" -#include "htslib/hts.h" -#include "htslib/sam.h" -#include "htslib/vcf.h" +#include +#include "vg/io/edit.hpp" +#include +#include +#include +#include +#include "handle.hpp" +#include "vg/io/alignment_io.hpp" namespace vg { @@ -21,9 +22,13 @@ const char* const BAM_DNA_LOOKUP = "=ACMGRSVTWYHKDBN"; int hts_for_each(string& filename, function lambda); int hts_for_each_parallel(string& filename, function lambda); -int hts_for_each(string& filename, function lambda, xg::XG* xgindex); -int hts_for_each_parallel(string& filename, function lambda, xg::XG* xgindex); +int hts_for_each(string& filename, function lambda, + const PathPositionHandleGraph* graph); +int hts_for_each_parallel(string& filename, function lambda, + const PathPositionHandleGraph* graph); int fastq_for_each(string& filename, function lambda); + +// fastq bool get_next_alignment_from_fastq(gzFile fp, char* buffer, size_t len, Alignment& alignment); bool get_next_interleaved_alignment_pair_from_fastq(gzFile fp, char* buffer, size_t len, Alignment& mate1, Alignment& mate2); bool get_next_alignment_pair_from_fastqs(gzFile fp1, gzFile fp2, char* buffer, size_t len, Alignment& mate1, Alignment& mate2); @@ -33,58 +38,103 @@ size_t fastq_paired_interleaved_for_each(const string& filename, function lambda); // parallel versions of above size_t fastq_unpaired_for_each_parallel(const string& filename, - function lambda); + function lambda, + uint64_t batch_size = vg::io::DEFAULT_PARALLEL_BATCHSIZE); size_t fastq_paired_interleaved_for_each_parallel(const string& filename, - function lambda); + function lambda, + uint64_t batch_size = vg::io::DEFAULT_PARALLEL_BATCHSIZE); size_t 
fastq_paired_interleaved_for_each_parallel_after_wait(const string& filename, function lambda, - function single_threaded_until_true); + function single_threaded_until_true, + uint64_t batch_size = vg::io::DEFAULT_PARALLEL_BATCHSIZE); size_t fastq_paired_two_files_for_each_parallel(const string& file1, const string& file2, - function lambda); + function lambda, + uint64_t batch_size = vg::io::DEFAULT_PARALLEL_BATCHSIZE); size_t fastq_paired_two_files_for_each_parallel_after_wait(const string& file1, const string& file2, function lambda, - function single_threaded_until_true); + function single_threaded_until_true, + uint64_t batch_size = vg::io::DEFAULT_PARALLEL_BATCHSIZE); bam_hdr_t* hts_file_header(string& filename, string& header); bam_hdr_t* hts_string_header(string& header, - map& path_length, - map& rg_sample); -/// Write alighnents to the given output stream. -/// Does not write an EOF marker, so stream::finish() must be run on the stream after writing is done. -void write_alignments(std::ostream& out, vector& buf); + const map& path_length, + const map& rg_sample); +bam_hdr_t* hts_string_header(string& header, + const vector>& path_order_and_length, + const map& rg_sample); void write_alignment_to_file(const Alignment& aln, const string& filename); void mapping_cigar(const Mapping& mapping, vector >& cigar); -string cigar_string(vector >& cigar); +string cigar_string(const vector >& cigar); string mapping_string(const string& source, const Mapping& mapping); -void cigar_mapping(const bam1_t *b, Mapping& mapping, xg::XG* xgindex); +void cigar_mapping(const bam1_t *b, Mapping& mapping); -Alignment bam_to_alignment(const bam1_t *b, map& rg_sample, const bam_hdr_t *bh, xg::XG* xgindex); -Alignment bam_to_alignment(const bam1_t *b, map& rg_sample); +Alignment bam_to_alignment(const bam1_t *b, + const map& rg_sample, + const map& tid_path_handle, + const bam_hdr_t *bh, + const PathPositionHandleGraph* graph); +Alignment bam_to_alignment(const bam1_t *b, const map& rg_sample, const map& tid_path_handle); + +/** + * Add a CIGAR operation to a vector representing the parsed CIGAR string. + * + * Coalesces adjacent operations of the same type. Coalesces runs of inserts + * and deletes into a signle delete followed by a single insert. + */ +inline void append_cigar_operation(const int length, const char operation, vector>& cigar) { + if (cigar.empty()) { + // Always append to an empty CIGAR + cigar.emplace_back(length, operation); + } else if (operation != cigar.back().second) { + // We have changed operations + if (operation == 'D' && cigar.back().second == 'I') { + // This deletion needs to come before the adjacent insertion + if (cigar.size() > 1 && cigar[cigar.size() - 2].second == 'D') { + // Add to the deletion that laready exists before the insertion + cigar[cigar.size() - 2].first += length; + } else { + // Create a new deletion + cigar.emplace_back(length, operation); + // Put it under the insertion + std::swap(cigar[cigar.size() - 2], cigar.back()); + } + } else { + // This is an ordinary change of operations. + cigar.emplace_back(length, operation); + } + } else { + cigar.back().first += length; + } +} /** * Convert a paired Alignment to a BAM record. If the alignment is unmapped, * refpos must be -1. Otherwise, refpos must be the position on the reference * sequence to which the alignment is aligned. Similarly, refseq must be the * sequence aligned to, or "" if unaligned. The mateseq and matepos fields must - * be set similarly for the mate. Note that mateseq must not be "=". 
+ * be set similarly for the mate. Note that mateseq must not be "=". If + * tlen_max is given, it is a limit on the magnitude of tlen to consider the + * read properly paired. * * Remember to clean up with bam_destroy1(b); */ -bam1_t* alignment_to_bam(const string& sam_header, +bam1_t* alignment_to_bam(bam_hdr_t* bam_header, const Alignment& alignment, const string& refseq, const int32_t refpos, const bool refrev, - const string& cigar, + const vector>& cigar, const string& mateseq, const int32_t matepos, - const int32_t tlen); + bool materev, + const int32_t tlen, + const int32_t tlen_max = 0); /** * Convert an unpaired Alignment to a BAM record. If the alignment is unmapped, @@ -94,28 +144,32 @@ bam1_t* alignment_to_bam(const string& sam_header, * * Remember to clean up with bam_destroy1(b); */ -bam1_t* alignment_to_bam(const string& sam_header, - const Alignment& alignment, - const string& refseq, - const int32_t refpos, - const bool refrev, - const string& cigar); +bam1_t* alignment_to_bam(bam_hdr_t* bam_header, + const Alignment& alignment, + const string& refseq, + const int32_t refpos, + const bool refrev, + const vector>& cigar); /** * Convert a paired Alignment to a SAM record. If the alignment is unmapped, * refpos must be -1. Otherwise, refpos must be the position on the reference * sequence to which the alignment is aligned. Similarly, refseq must be the * sequence aligned to, or "" if unaligned. The mateseq and matepos fields must - * be set similarly for the mate. Note that mateseq must not be "=". + * be set similarly for the mate. Note that mateseq must not be "=". If + * tlen_max is given, it is a limit on the magnitude of tlen to consider the + * read properly paired. */ string alignment_to_sam(const Alignment& alignment, const string& refseq, const int32_t refpos, const bool refrev, - const string& cigar, + const vector>& cigar, const string& mateseq, const int32_t matepos, - const int32_t tlen); + bool materev, + const int32_t tlen, + const int32_t tlen_max = 0); /** * Convert an unpaired Alignment to a SAM record. If the alignment is unmapped, @@ -127,21 +181,50 @@ string alignment_to_sam(const Alignment& alignment, const string& refseq, const int32_t refpos, const bool refrev, - const string& cigar); + const vector>& cigar); +/// Returns the SAM bit-coded flag for alignment with +int32_t determine_flag(const Alignment& alignment, + const string& refseq, + const int32_t refpos, + const bool refrev, + const string& mateseq, + const int32_t matepos, + bool materev, + const int32_t tlen, + bool paired, + const int32_t tlen_max); + +/// Create a CIGAR from the given Alignment. If softclip_suppress is nonzero, +/// suppress softclips up to that length. This will necessitate adjusting pos, +/// which is why it is passed by reference. +vector> cigar_against_path(const Alignment& alignment, bool on_reverse_strand, int64_t& pos, size_t path_len, size_t softclip_suppress); -string cigar_against_path(const Alignment& alignment, bool on_reverse_strand, int64_t& pos, size_t path_len, size_t softclip_suppress); -void mapping_against_path(Alignment& alignment, const bam1_t *b, xg::XG* xgindex, bool on_reverse_strand); +/// Merge runs of successive I/D operations into a single I and D, remove 0-length +/// operations, and merge adjacent operations of the same type +void simplify_cigar(vector>& cigar); + + +/// Translate the CIGAR in the given BAM record into mappings in the given +/// Alignment against the given path in the given graph. 
+void mapping_against_path(Alignment& alignment, const bam1_t *b, + const path_handle_t& path, const PathPositionHandleGraph* graph, + bool on_reverse_strand); + +/// Work out the TLEN values for two reads. The magnitude is the distance +/// between the outermost aligned bases, and the sign is positive for the +/// leftmost read and negative for the rightmost. +pair compute_template_lengths(const int64_t& pos1, const vector>& cigar1, + const int64_t& pos2, const vector>& cigar2); int32_t sam_flag(const Alignment& alignment, bool on_reverse_strand, bool paired); -short quality_char_to_short(char c); -char quality_short_to_char(short i); -string string_quality_char_to_short(const string& quality); -string string_quality_short_to_char(const string& quality); -void alignment_quality_char_to_short(Alignment& alignment); -void alignment_quality_short_to_char(Alignment& alignment); +/// Populate a mapping from read group to sample name, given the text BAM header. void parse_rg_sample_map(char* hts_header, map& rg_sample); +/// Populate a mapping from target ID number to path handle in the given graph, +/// given a parsed BAM header. The graph may be null. Missing target paths in +/// the graph produce no warning or error and no map entry. +void parse_tid_path_handle_map(const bam_hdr_t* hts_header, const PathHandleGraph* graph, map& tid_path_handle); int alignment_to_length(const Alignment& a); int alignment_from_length(const Alignment& a); // Adds a2 onto the end of a1, returns reference to a1 @@ -169,6 +252,7 @@ int non_match_start(const Alignment& alignment); int non_match_end(const Alignment& alignment); int softclip_start(const Alignment& alignment); int softclip_end(const Alignment& alignment); +int softclip_trim(Alignment& alignment); int query_overlap(const Alignment& aln1, const Alignment& aln2); int edit_count(const Alignment& alignment); size_t to_length_after_pos(const Alignment& aln, const Position& pos); @@ -187,10 +271,24 @@ void translate_nodes(Alignment& a, const unordered_map >& // listed. It needs a callback to ask the length of any given node. void flip_nodes(Alignment& a, const set& ids, const std::function& node_length); +/// Returns true if the alignment sequence contains any U's and false if the alignment sequence contains +/// and T's. In the case that both T's and U's are included, responds according to whichever comes first. +/// If the sequence contains neither U's nor T's, returns false. +bool uses_Us(const Alignment& alignment); + +/// Replaces any U's in the sequence or the Path with T's +void convert_Us_to_Ts(Alignment& alignment); + +/// Replaces any T's in the sequence or the Path with U's +void convert_Ts_to_Us(Alignment& alignment); + /// Simplifies the Path in the Alignment. Note that this removes deletions at /// the start and end of Mappings, so code that handles simplified Alignments /// needs to handle offsets on internal Mappings. Alignment simplify(const Alignment& a, bool trim_internal_deletions = true); + +/// Merge adjacent edits of the same type and convert all N matches to mismatches. +void normalize_alignment(Alignment& alignment); // quality information; a kind of poor man's pileup map alignment_quality_per_node(const Alignment& aln); @@ -198,17 +296,59 @@ map alignment_quality_per_node(const Alignment& aln); /// Parse regions from the given BED file into Alignments in a vector. /// Reads the optional name, is_reverse, and score fields if present, and populates the relevant Alignment fields. 
/// Skips and warns about malformed or illegal BED records. -void parse_bed_regions(istream& bedstream, xg::XG* xgindex, vector* out_alignments); -void parse_gff_regions(istream& gtfstream, xg::XG* xgindex, vector* out_alignments); +void parse_bed_regions(istream& bedstream, const PathPositionHandleGraph* graph, vector* out_alignments); +void parse_gff_regions(istream& gtfstream, const PathPositionHandleGraph* graph, vector* out_alignments); Position alignment_start(const Alignment& aln); Position alignment_end(const Alignment& aln);Position alignment_start(const Alignment& aln); /// return the path offsets as cached in the alignment map > > alignment_refpos_to_path_offsets(const Alignment& aln); -/// annotate the first alignment with its minimum distance to the second in their annotated paths -void alignment_set_distance_to_correct(Alignment& aln, const Alignment& base); -void alignment_set_distance_to_correct(Alignment& aln, const map > >& base_offsets); +/// Annotate the first alignment with its minimum distance to the second in +/// their annotated paths. If translation is set, replace path names in aln +/// using that mapping, if they are found in it. +void alignment_set_distance_to_correct(Alignment& aln, const Alignment& base, const unordered_map* translation = nullptr); +void alignment_set_distance_to_correct(Alignment& aln, const map>>& base_offsets, const unordered_map* translation = nullptr); + +/** + * Represents a report on whether an alignment makes sense in the context of a graph. + */ +struct AlignmentValidity { + /// The different kinds of possible problems with alignments + enum Problem { + OK, + NODE_MISSING, + NODE_TOO_SHORT + }; + + /// The kind of problem with the alignment. + Problem problem = OK; + /// The mapping in the alignment's path at which the problem was encountered. + size_t bad_mapping_index = 0; + /// An explanation for the problem. + std::string message = ""; + + /// We are truthy if the alignment has no problem, and falsey otherwise. + inline operator bool() const { + return problem == OK; + } +}; + +/// Check to make sure edits on the alignment's path don't assume incorrect +/// node lengths or ids. Result can be used like a bool or inspected for +/// further details. Does not log anything itself about bad alignments. +AlignmentValidity alignment_is_valid(const Alignment& aln, const HandleGraph* hgraph); + +/// Make an Alignment corresponding to a subregion of a stored path. +/// Positions are 0-based, and pos2 is excluded. +/// Respects path circularity, so pos2 < pos1 is not a problem. +/// If pos1 == pos2, returns an empty alignment. +Alignment target_alignment(const PathPositionHandleGraph* graph, const path_handle_t& path, size_t pos1, size_t pos2, + const string& feature, bool is_reverse); +/// Same as above, but uses the given Mapping, translated directly form a CIGAR string, as a source of edits. +/// The edits are inserted into the generated Alignment, cut as necessary to fit into the Alignment's Mappings. 
+Alignment target_alignment(const PathPositionHandleGraph* graph, const path_handle_t& path, size_t pos1, size_t pos2, + const string& feature, bool is_reverse, Mapping& cigar_mapping); } diff --git a/src/annotation.hpp b/src/annotation.hpp index efc9c1caea1..fd7ce0b177b 100644 --- a/src/annotation.hpp +++ b/src/annotation.hpp @@ -8,6 +8,9 @@ #include #include #include +#include +#include +#include #include @@ -19,6 +22,10 @@ using namespace std; // API //////////////////////////////////////////////////////////////////////// +/// Returns true if the Protobuf object has an annotation with this name +template +bool has_annotation(const Annotated& annotated, const string& name); + /// Get the annotation with the given name and return it. /// If not present, returns the Protobuf default value for the annotation type. /// The value may be a primitive type or an entire Protobuf object. @@ -55,6 +62,15 @@ void clear_annotation(Annotated* annotated, const string& name); template void clear_annotation(Annotated& annotated, const string& name); +/// Apply a lambda to all annotations, except for Struct and ListValue annotations (which cannot +/// be easily typed without exposing ugly Protobuf internals +template +void for_each_basic_annotation(const Annotated& annotated, + const function null_lambda, + const function double_lambda, + const function bool_lambda, + const function string_lambda); + //////////////////////////////////////////////////////////////////////// // Internal Definitions //////////////////////////////////////////////////////////////////////// @@ -111,6 +127,20 @@ inline bool value_cast(const google::protobuf::Value& value) { template<> inline double value_cast(const google::protobuf::Value& value) { + if (value.kind_case() == google::protobuf::Value::KindCase::kStringValue) { + // If someone puts in an infinite or NAN double, Protobuf refuses to + // stringify those, so we do it ourselves. But now they want the double + // back so we need to undo that. + if (value.string_value() == "Infinity") { + return std::numeric_limits::infinity(); + } else if (value.string_value() == "-Infinity") { + return -std::numeric_limits::infinity(); + } else if (value.string_value() == "NaN") { + return nan(""); + } else { + throw std::runtime_error("Cannot understand " + value.string_value() + " as a double."); + } + } assert(value.kind_case() == google::protobuf::Value::KindCase::kNumberValue); return value.number_value(); } @@ -131,7 +161,15 @@ inline google::protobuf::Value value_cast(const bool& wrap) { template<> inline google::protobuf::Value value_cast(const double& wrap) { google::protobuf::Value to_return; - to_return.set_number_value(wrap); + // We need to represent inf and nan values as something else, since Protobuf now refuses to serialize them as anything. + // Previously it made them "Infinity", "-Infinity" and "NaN", so we do that too. + if (isinf(wrap)) { + to_return.set_string_value(wrap > 0 ? "Infinity" : "-Infinity"); + } else if (isnan(wrap)) { + to_return.set_string_value("NaN"); + } else { + to_return.set_number_value(wrap); + } return to_return; } @@ -142,6 +180,19 @@ inline google::protobuf::Value value_cast(const string& wrap) { return to_return; } +// Helpers for dumping integral types to double. +// May lose precision for large numbers. 
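+// (a double has a 53-bit significand, so e.g. size_t values above 2^53 may not round-trip exactly)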
+ +template<> +inline google::protobuf::Value value_cast(const size_t& wrap) { + return value_cast((double) wrap); +} + +template<> +inline google::protobuf::Value value_cast(const int& wrap) { + return value_cast((double) wrap); +} + // We also have implementations for vectors and other push_back-able containers. template @@ -169,6 +220,14 @@ inline google::protobuf::Value value_cast(const Container& wrap) { return to_return; } +template +inline bool has_annotation(const Annotated& annotated, const string& name) { + // Grab the whole annotation struct + auto annotation_struct = Annotation::get(annotated); + // Check for the annotation + return annotation_struct.fields().count(name); +} + // TODO: more value casts for e.g. ints and embedded messages. template @@ -221,6 +280,34 @@ inline void clear_annotation(Annotated& annotated, const string& name) { clear_annotation(&annotated, name); } +template +void for_each_basic_annotation(const Annotated& annotated, + const function null_lambda, + const function double_lambda, + const function bool_lambda, + const function string_lambda) { + + for (auto it = annotated.annotation().fields().begin(), end = annotated.annotation().fields().end(); it != end; ++it) { + switch (it->second.kind_case()) { + case google::protobuf::Value::KindCase::kBoolValue: + bool_lambda(it->first, it->second.bool_value()); + break; + case google::protobuf::Value::KindCase::kNumberValue: + double_lambda(it->first, it->second.number_value()); + break; + case google::protobuf::Value::KindCase::kStringValue: + string_lambda(it->first, it->second.string_value()); + break; + case google::protobuf::Value::KindCase::kNullValue: + null_lambda(it->first); + break; + default: + // TODO: skip ListValue and Struct, how to include? + break; + } + } +} + } diff --git a/src/augment.cpp b/src/augment.cpp new file mode 100644 index 00000000000..e36d5bb0844 --- /dev/null +++ b/src/augment.cpp @@ -0,0 +1,1329 @@ +#include "vg.hpp" +#include +#include + +#include "augment.hpp" +#include "alignment.hpp" +#include "packer.hpp" +#include "annotation.hpp" +//#define debug + +using namespace vg::io; + +namespace vg { + +using namespace std; + +// The correct way to edit the graph +void augment(MutablePathMutableHandleGraph* graph, + const string& gam_path, + const string& aln_format, + vector* out_translations, + const string& gam_out_path, + bool embed_paths, + bool break_at_ends, + bool remove_softclips, + bool filter_out_of_graph_alignments, + double min_baseq, + double min_mapq, + Packer* packer, + size_t min_bp_coverage, + double max_frac_n, + bool edges_only) { + + // memory-wasting hack: we need node lengths from the original graph in order to parse the GAF. 
Unlesss we + // store them, they will be lost in the 2nd pass + unordered_map id_to_length; + if (aln_format == "GAF") { + graph->for_each_handle([&](handle_t handle) { + id_to_length[graph->get_id(handle)] = graph->get_length(handle); + }); + } + + function, bool, bool)> iterate_gam = + [&gam_path, &aln_format, &graph, &packer, &id_to_length] (function aln_callback, bool second_pass, bool parallel) { + if (aln_format == "GAM") { + get_input_file(gam_path, [&](istream& gam_stream) { + if (parallel) { + vg::io::for_each_parallel(gam_stream, aln_callback, Packer::estimate_batch_size(get_thread_count())); + } else { + vg::io::for_each(gam_stream, aln_callback); + } + }); + } else { + assert(aln_format == "GAF"); + function node_to_length; + function node_to_sequence; + if (second_pass) { + // graph has changed, need to fall back on our table we saved from the original graph + node_to_length = [&id_to_length](nid_t node_id) { + return id_to_length[node_id]; + }; + // try to do without sequences + node_to_sequence = nullptr; + } else { + // graph is valid on the first pass + node_to_length = [&graph](nid_t node_id) { + return graph->get_length(graph->get_handle(node_id)); + }; + node_to_sequence = [&graph](nid_t node_id, bool is_reversed) { + return graph->get_sequence(graph->get_handle(node_id, is_reversed)); + }; + } + if (parallel) { + vg::io::gaf_unpaired_for_each_parallel(node_to_length, node_to_sequence, gam_path, aln_callback); + } else { + vg::io::gaf_unpaired_for_each(node_to_length, node_to_sequence, gam_path, aln_callback); + } + } + }; + + augment_impl(graph, + iterate_gam, + aln_format, + out_translations, + gam_out_path, + embed_paths, + break_at_ends, + remove_softclips, + filter_out_of_graph_alignments, + min_baseq, + min_mapq, + packer, + min_bp_coverage, + max_frac_n, + edges_only); +} + +void augment(MutablePathMutableHandleGraph* graph, + vector& path_vector, + const string& aln_format, + vector* out_translations, + const string& gam_out_path, + bool embed_paths, + bool break_at_ends, + bool remove_softclips, + bool filter_out_of_graph_alignments, + double min_baseq, + double min_mapq, + Packer* packer, + size_t min_bp_coverage, + double max_frac_n, + bool edges_only) { + + function, bool, bool)> iterate_gam = + [&path_vector] (function aln_callback, bool second_pass, bool parallel) { + if (parallel) { +#pragma omp parallel for + for (size_t i = 0; i < path_vector.size(); ++i) { + Path& path = path_vector[i]; + Alignment aln; + *aln.mutable_path() = path; + aln.set_name(path.name()); + aln_callback(aln); + } + + } + else { + for (Path& path : path_vector) { + Alignment aln; + *aln.mutable_path() = path; + aln.set_name(path.name()); + aln_callback(aln); + } + } + }; + + augment_impl(graph, + iterate_gam, + aln_format, + out_translations, + gam_out_path, + embed_paths, + break_at_ends, + remove_softclips, + filter_out_of_graph_alignments, + min_baseq, + min_mapq, + packer, + min_bp_coverage, + max_frac_n, + edges_only); +} + +// Check if alignment contains node that's not in the graph +static inline bool check_in_graph(const Path& path, HandleGraph* graph) { + for (size_t i = 0; i < path.mapping_size(); ++i) { + if (!graph->has_node(path.mapping(i).position().node_id())) { + return false; + } + } + return true; +} + +// Check if alignment contains node that's not in the graph (via node sizes map) +static inline bool check_in_graph(const Path& path, const unordered_map& node_map) { + for (size_t i = 0; i < path.mapping_size(); ++i) { + if 
(!node_map.count(path.mapping(i).position().node_id())) { + return false; + } + } + return true; +} + +void augment_impl(MutablePathMutableHandleGraph* graph, + function,bool, bool)> iterate_gam, + const string& aln_format, + vector* out_translations, + const string& gam_out_path, + bool embed_paths, + bool break_at_ends, + bool remove_softclips, + bool filter_out_of_graph_alignments, + double min_baseq, + double min_mapq, + Packer* packer, + size_t min_bp_coverage, + double max_frac_n, + bool edges_only) { + + if (edges_only) { + // just add edges between consecutive mappings that aren't already in the graph + // note: offsets are completely ignored. if we want to take them into account, + // it's probably best to use below logic, but filtering non-deletion edits out + // of the GAM first. but keeping just the node-adjacency information as we + // do here is hopefully sufficient to improve sv genotyping with pack/call + add_edges_only(graph, iterate_gam, min_mapq, min_bp_coverage); + return; + } + + // toggle between using Packer to store breakpoints or the STL map + bool packed_mode = min_bp_coverage > 0 || min_baseq > 0 || max_frac_n < 1.; + assert(!packed_mode || packer != nullptr); + + unordered_map> breakpoints; + + // First pass: find the breakpoints + iterate_gam((function)[&](Alignment& aln) { +#ifdef debug + cerr << pb2json(aln.path()) << endl; +#endif + if (aln.mapping_quality() < min_mapq || (filter_out_of_graph_alignments && !check_in_graph(aln.path(), graph))) { + return; + } + + if (aln_format == "GAF" && has_annotation(aln, "from_cg") && get_annotation(aln, "from_cg")) { +#pragma omp critical (cerr) + { + cerr << "[vg augment] error: GAF with cg cigars contains insufficient information for augmenting: cs cigars required." << endl; + } + exit(1); + } + + if (remove_softclips) { + softclip_trim(aln); + } + + // Simplify the path, just to eliminate adjacent match Edits in the same + // Mapping (because we don't have or want a breakpoint there) + Path simplified_path = simplify(aln.path()); + + // Add in breakpoints from each path + if (packed_mode) { + find_packed_breakpoints(simplified_path, *packer, break_at_ends, aln.quality(), min_baseq, max_frac_n); + } else { + // note: we cannot pass non-zero min_baseq here. it relies on filter_breakpoints_by_coverage + // to work correctly, and must be passed in only via find_packed_breakpoints. + find_breakpoints(simplified_path, breakpoints, break_at_ends, "", 0, 1.); + } + }, false, packed_mode); + + if (packed_mode) { + // Filter the breakpoints by coverage + breakpoints = filter_breakpoints_by_coverage(*packer, min_bp_coverage); + } else { + // Invert the breakpoints that are on the reverse strand + breakpoints = forwardize_breakpoints(graph, breakpoints); + } + + // don't need this anymore: free up some memory + if (packer != nullptr) { + packer->clear(); + } + + // get the node sizes, for use when making the translation + unordered_map orig_node_sizes; + orig_node_sizes.reserve(graph->get_node_count()); + graph->for_each_handle([&](handle_t node) { + orig_node_sizes[graph->get_id(node)] = graph->get_length(node); + }); + + // Break any nodes that need to be broken. Save the map we need to translate + // from offsets on old nodes to new nodes. Note that this would mess up the + // ranks of nodes in their existing paths, which is why we clear and rebuild + // them. 
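Both passes over the reads above are driven through the same `iterate_gam` closure, so GAM files, GAF files and in-memory vectors only differ in how that closure is constructed. A rough standalone sketch of the pattern, with a hypothetical `Aln` struct standing in for the Protobuf `Alignment`:

```cpp
#include <functional>
#include <iostream>
#include <string>
#include <vector>

// Hypothetical stand-in for an alignment record; the real code uses vg's
// Protobuf Alignment type.
struct Aln { std::string name; };

// Shape of the closure: it takes the per-alignment callback plus the
// "second pass" and "parallel" flags, and hides where the records come from.
using IterateFn = std::function<void(std::function<void(Aln&)>, bool, bool)>;

int main() {
    std::vector<Aln> alns{{"read1"}, {"read2"}};
    IterateFn iterate = [&alns](std::function<void(Aln&)> callback, bool, bool) {
        for (Aln& a : alns) callback(a);
    };
    // First pass: gather information (here we only count).
    size_t count = 0;
    iterate([&count](Aln&) { ++count; }, false, false);
    // Second pass: revisit every alignment with different logic, same traversal.
    iterate([](Aln& a) { std::cout << a.name << "\n"; }, true, false);
    std::cout << "visited " << count << " alignments per pass\n";
    return 0;
}
```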
+ auto node_translation = ensure_breakpoints(graph, breakpoints); + + // we remember the sequences of nodes we've added at particular positions on the forward strand + unordered_map, vector> added_seqs; + // we will record the nodes that we add, so we can correctly make the returned translation + unordered_map added_nodes; + // output alignment emitter and buffer + unique_ptr aln_emitter; + if (!gam_out_path.empty()) { + aln_emitter = vg::io::get_non_hts_alignment_emitter(gam_out_path, aln_format, {}, get_thread_count(), graph); + } + vector aln_buffer; + + // Second pass: add the nodes and edges + iterate_gam((function)[&](Alignment& aln) { + if (aln.mapping_quality() < min_mapq || (filter_out_of_graph_alignments && !check_in_graph(aln.path(), orig_node_sizes))) { + return; + } + + if (remove_softclips) { + softclip_trim(aln); + } + + // Simplify the path, just to eliminate adjacent match Edits in the same + // Mapping (because we don't have or want a breakpoint there) + // Note: We're electing to re-simplify in a second pass to avoid storing all + // the input paths in memory + Path simplified_path = simplify(aln.path()); + + // Filter out edits corresponding to breakpoints that didn't meet our coverage + // criteria + if (min_bp_coverage > 0) { + simplify_filtered_edits(graph, aln, simplified_path, node_translation, orig_node_sizes, + min_baseq, max_frac_n); + } + + // Create new nodes/wire things up. Get the added version of the path. + Path added = add_nodes_and_edges(graph, simplified_path, node_translation, added_seqs, + added_nodes, orig_node_sizes); + + // Copy over the name + *added.mutable_name() = aln.name(); + + if (embed_paths) { + add_path_to_graph(graph, added); + } + + // something is off about this check. + // assuming the GAM path is sorted, let's double-check that its edges are here + for (size_t i = 1; i < added.mapping_size(); ++i) { + auto& m1 = added.mapping(i-1); + auto& m2 = added.mapping(i); + // we're no longer sorting our input paths, so we assume they are sorted + assert((m1.rank() == 0 && m2.rank() == 0) || (m1.rank() + 1 == m2.rank())); + //if (!adjacent_mappings(m1, m2)) continue; // the path is completely represented here + auto s1 = graph->get_handle(m1.position().node_id(), m1.position().is_reverse()); + auto s2 = graph->get_handle(m2.position().node_id(), m2.position().is_reverse()); + // Ensure that we always have an edge between the two nodes in the correct direction + graph->create_edge(s1, s2); + } + + // optionally write out the modified path to GAM + if (!gam_out_path.empty()) { + *aln.mutable_path() = added; + aln_buffer.push_back(aln); + if (aln_buffer.size() >= 100) { + aln_emitter->emit_singles(vector(aln_buffer)); + aln_buffer.clear(); + } + } + }, true, false); + if (!aln_buffer.empty()) { + // Flush the buffer + aln_emitter->emit_singles(vector(aln_buffer)); + } + + // perform the same check as above, but on the paths that were already in the graph + // assuming the graph's paths are sorted, let's double-check that the edges are here + graph->for_each_path_handle([&](path_handle_t path_handle) { + step_handle_t prev_handle; + int i = 0; + graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { + handle_t handle = graph->get_handle_of_step(step_handle); + if (i > 0) { + // Ensure the edge that the path follows exists. + // TODO: Should this be an error if it doesn't exist instead? 
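The GAM output above is buffered and handed to the emitter in batches of 100 records rather than one at a time, with a final flush for the leftover partial batch. The same buffering pattern on plain strings, as a small illustration (the `flush` lambda is a stand-in for `emit_singles`):

```cpp
#include <iostream>
#include <string>
#include <vector>

int main() {
    const size_t batch_size = 100;
    std::vector<std::string> buffer;
    auto flush = [&buffer]() {
        std::cout << "emitting " << buffer.size() << " records\n";
        buffer.clear();
    };
    for (size_t i = 0; i < 250; ++i) {
        buffer.push_back("record" + std::to_string(i));
        if (buffer.size() >= batch_size) {
            flush();
        }
    }
    if (!buffer.empty()) {
        flush(); // final partial batch, like the post-loop flush above
    }
    return 0;
}
```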
+ graph->create_edge(graph->get_handle_of_step(prev_handle), handle); + } + prev_handle = step_handle; + }); + }); + + // make the translation + if (out_translations != nullptr) { + *out_translations = make_translation(graph, node_translation, added_nodes, orig_node_sizes); + } + + VG* vg_graph = dynamic_cast(graph); + + // This code got run after augment in VG::edit, so we make sure it happens here too + if (vg_graph != nullptr) { + // Rebuild path ranks, aux mapping, etc. by compacting the path ranks + // Todo: can we just do this once? + vg_graph->paths.compact_ranks(); + + // execute a semi partial order sort on the nodes + vg_graph->sort(); + } + +} + +double get_avg_baseq(const Edit& edit, const string& base_quals, size_t position_in_read) { + double avg_qual = numeric_limits::max(); + if (!base_quals.empty() && !edit.sequence().empty() && (edit_is_sub(edit) || edit_is_insertion(edit))) { + double tot_qual = 0; + for (int i = 0; i < edit.sequence().length(); ++i) { + tot_qual += base_quals[position_in_read + i]; + } + avg_qual = tot_qual / (double)edit.sequence().length(); + } + return avg_qual; +} + +// returns breakpoints on the forward strand of the nodes +void find_breakpoints(const Path& path, unordered_map>& breakpoints, bool break_ends, + const string& base_quals, double min_baseq, double max_frac_n) { + // We need to work out what offsets we will need to break each node at, if + // we want to add in all the new material and edges in this path. + +#ifdef debug + cerr << "Processing path..." << endl; +#endif + + // The base position in the edit + size_t position_in_read = 0; + + for (size_t i = 0; i < path.mapping_size(); ++i) { + // For each Mapping in the path + const Mapping& m = path.mapping(i); + + // What node are we on? + id_t node_id = m.position().node_id(); + + if(node_id == 0) { + // Skip Mappings that aren't actually to nodes. + continue; + } + + // See where the next edit starts in the node. It is always included + // (even when the edit runs backward), unless the edit has 0 length in + // the reference. + pos_t edit_first_position = make_pos_t(m.position()); + +#ifdef debug + cerr << "Processing mapping " << pb2json(m) << endl; +#endif + + for(size_t j = 0; j < m.edit_size(); ++j) { + // For each Edit in the mapping + const Edit& e = m.edit(j); + + // We know where the mapping starts in its node. But where does it + // end (inclusive)? Note that if the edit has 0 reference length, + // this may not actually be included in the edit (and + // edit_first_position will be further along than + // edit_last_position). + pos_t edit_last_position = edit_first_position; + if (e.from_length()) { + get_offset(edit_last_position) += e.from_length(); + } + +#ifdef debug + cerr << "Edit on " << node_id << " from " << edit_first_position << " to " << edit_last_position << endl; + cerr << pb2json(e) << endl; +#endif + + // Do the base quality check if applicable. If it fails we just ignore the edit + if ((min_baseq == 0 || get_avg_baseq(e, base_quals, position_in_read) >= min_baseq) && + (max_frac_n == 1. || get_fraction_of_ns(e.sequence()) <= max_frac_n)) { + + + if (!edit_is_match(e) || (j == 0 && (i != 0 || break_ends))) { + // If this edit is not a perfect match, or if this is the first + // edit in this mapping and either we had a previous mapping we + // may need to connect to or we want to break at the path's + // start, we need to make sure we have a breakpoint at the start + // of this edit. 
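`get_avg_baseq` above averages the raw quality bytes over the interval of the read that an edit covers, and returns the maximum double when there is nothing to score so that such edits always pass the filter. A standalone version of the same arithmetic on plain strings (the real function additionally requires the edit to be a substitution or insertion):

```cpp
#include <iostream>
#include <limits>
#include <string>

// Only edits that carry novel sequence get scored; everything else gets the
// max double so it always passes the min_baseq filter.
double avg_baseq(const std::string& edit_seq, const std::string& base_quals,
                 size_t position_in_read) {
    if (base_quals.empty() || edit_seq.empty()) {
        return std::numeric_limits<double>::max();
    }
    double total = 0;
    for (size_t i = 0; i < edit_seq.length(); ++i) {
        total += base_quals[position_in_read + i];
    }
    return total / static_cast<double>(edit_seq.length());
}

int main() {
    // GAM stores qualities as raw phred bytes, not ASCII-offset characters.
    std::string quals = {30, 30, 10, 10, 40};
    std::cout << avg_baseq("AC", quals, 2) << "\n"; // averages positions 2..3 -> 10
    return 0;
}
```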
+ +#ifdef debug + cerr << "Need to break " << node_id << " at edit lower end " << + edit_first_position << endl; +#endif + + // We need to snip between edit_first_position and edit_first_position - direction. + // Note that it doesn't matter if we put breakpoints at 0 and 1-past-the-end; those will be ignored. + breakpoints[node_id].insert(edit_first_position); + } + + if (!edit_is_match(e) || (j == m.edit_size() - 1 && (i != path.mapping_size() - 1 || break_ends))) { + // If this edit is not a perfect match, or if it is the last + // edit in a mapping and we have a subsequent mapping we might + // need to connect to or we want to break at the path ends, make + // sure we have a breakpoint at the end of this edit. + +#ifdef debug + cerr << "Need to break " << node_id << " at past edit upper end " << + edit_last_position << endl; +#endif + + // We also need to snip between edit_last_position and edit_last_position + direction. + breakpoints[node_id].insert(edit_last_position); + } + } + // TODO: for an insertion or substitution, note that we need a new + // node and two new edges. + + // TODO: for a deletion, note that we need an edge. TODO: Catch + // and complain about some things we can't handle (like a path with + // a leading/trailing deletion)? Or just skip deletions when wiring. + + // Use up the portion of the node taken by this mapping, so we know + // where the next mapping will start. + edit_first_position = edit_last_position; + + position_in_read += e.to_length(); + } + } + +} + +unordered_map> forwardize_breakpoints(const HandleGraph* graph, + const unordered_map>& breakpoints) { + unordered_map> fwd; + for (auto& p : breakpoints) { + id_t node_id = p.first; + if (!graph->has_node(node_id)) { + throw runtime_error("Node from GAM \"" + std::to_string(node_id) + "\" not found in graph. If you are sure" + " the input graph is a subgraph of that used to create the GAM, you can ignore this error" + " with \"vg augment -s\""); + } + assert(graph->has_node(node_id)); + size_t node_length = graph->get_length(graph->get_handle(node_id)); + auto bp = p.second; + for (auto& pos : bp) { + pos_t x = pos; + if (offset(pos) == node_length) continue; + if (offset(pos) > node_length) { + cerr << "forwardize_breakpoints error: failure, position " << pos << " is not inside node " + << node_id << endl; + assert(false); + } + if (is_rev(pos)) { + fwd[node_id].insert(reverse(pos, node_length)); + } else { + fwd[node_id].insert(pos); + } + } + } + return fwd; +} + + +// returns breakpoints on the forward strand of the nodes +void find_packed_breakpoints(const Path& path, Packer& packed_breakpoints, bool break_ends, + const string& base_quals, double min_baseq, double max_frac_n) { + // use existing methods to find the breakpoints, then copy them into a packer + // todo: streamline? 
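Breakpoints are kept per node as a set of cut offsets on the forward strand; `forwardize_breakpoints` above flips any reverse-strand positions, so a cut at offset o on the reverse strand of a length-L node becomes the same cut at L - o forward, and cuts at the node boundaries are dropped. A toy version of that bookkeeping, with plain integers instead of vg's `pos_t`:

```cpp
#include <cassert>
#include <cstdint>
#include <map>
#include <set>

// Cut offsets are stored per node, forward strand only.
using Breakpoints = std::map<int64_t, std::set<size_t>>;

void add_breakpoint(Breakpoints& bp, int64_t node_id, size_t node_length,
                    size_t offset, bool is_reverse) {
    size_t fwd_offset = is_reverse ? node_length - offset : offset;
    // Offsets 0 and node_length are existing node boundaries, so skip them,
    // as the real code does.
    if (fwd_offset == 0 || fwd_offset == node_length) {
        return;
    }
    bp[node_id].insert(fwd_offset);
}

int main() {
    Breakpoints bp;
    add_breakpoint(bp, 7, 10, 3, false); // forward cut at offset 3
    add_breakpoint(bp, 7, 10, 7, true);  // reverse cut at 7 == forward cut at 3
    assert(bp[7].size() == 1 && *bp[7].begin() == 3);
    return 0;
}
```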
+ unordered_map> breakpoints; + find_breakpoints(path, breakpoints, break_ends, base_quals, min_baseq, max_frac_n); + breakpoints = forwardize_breakpoints(packed_breakpoints.get_graph(), breakpoints); + const HandleGraph* graph = packed_breakpoints.get_graph(); + for (auto& id_set : breakpoints) { + size_t node_len = graph->get_length(graph->get_handle(id_set.first)); + Position position; + position.set_node_id(id_set.first); + for (auto pos : id_set.second) { + size_t offset = get_offset(pos); + if (offset <= node_len - 1) { + position.set_offset(offset); + packed_breakpoints.increment_coverage(packed_breakpoints.position_in_basis(position)); + } + } + } +} + +unordered_map> filter_breakpoints_by_coverage(const Packer& packed_breakpoints, size_t min_bp_coverage) { + vector>> bp_maps(get_thread_count()); + size_t n = packed_breakpoints.coverage_size(); + const VectorizableHandleGraph* vec_graph = dynamic_cast(packed_breakpoints.get_graph()); + // we assume our position vector is much larger than the number of filtered breakpoints + // and scan it in parallel in a first pass +#pragma omp parallel for + for (size_t i = 0; i < n; ++i) { + if (packed_breakpoints.coverage_at_position(i) >= min_bp_coverage) { + auto& bp_map = bp_maps[omp_get_thread_num()]; + nid_t node_id = vec_graph->node_at_vector_offset(i+1); + size_t offset = i - vec_graph->node_vector_offset(node_id); + bp_map[node_id].insert(make_pos_t(node_id, false, offset)); + } + } + // then collect up the breakpoints sequentially in a second pass + for (size_t i = 1; i < bp_maps.size(); ++i) { + for (auto& kv : bp_maps[i]) { + bp_maps[0][kv.first].insert(kv.second.begin(), kv.second.end()); + } + } + + return bp_maps[0]; +} + + +path_handle_t add_path_to_graph(MutablePathHandleGraph* graph, const Path& path) { + path_handle_t path_handle = graph->create_path_handle(path.name(), path.is_circular()); + for (int i = 0; i < path.mapping_size(); ++i) { + graph->append_step(path_handle, graph->get_handle(path.mapping(i).position().node_id(), + path.mapping(i).position().is_reverse())); + } + return path_handle; +} + +map ensure_breakpoints(MutableHandleGraph* graph, + const unordered_map>& breakpoints) { + // Set up the map we will fill in with the new node start positions in the + // old nodes. + map toReturn; + + for(auto& kv : breakpoints) { + // Go through all the nodes we need to break up + auto original_node_id = kv.first; + + // Save the original node length. We don't want to break here (or later) + // because that would be off the end. + id_t original_node_length = graph->get_length(graph->get_handle(original_node_id)); + + // We are going through the breakpoints left to right, so we need to + // keep the node pointer for the right part that still needs further + // dividing. + handle_t right_part = graph->get_handle(original_node_id); + handle_t left_part; + + pos_t last_bp = make_pos_t(original_node_id, false, 0); + // How far into the original node does our right part start? + id_t current_offset = 0; + + for(auto breakpoint : kv.second) { + // For every point at which we need to make a new node, in ascending + // order (due to the way sets store ints)... + + // ensure that we're on the forward strand (should be the case due to forwardize_breakpoints) + assert(!is_rev(breakpoint)); + + // This breakpoint already exists, because the node starts or ends here + if(offset(breakpoint) == 0 + || offset(breakpoint) == original_node_length) { + continue; + } + + // How far in do we need to break the remaining right part? 
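`filter_breakpoints_by_coverage` above keeps only the cut points that enough reads asked for; the counts live in the Packer's linearized coverage vector. The same thresholding with an ordinary map, just to show the idea (node IDs, offsets and the threshold here are made up):

```cpp
#include <cstdint>
#include <iostream>
#include <map>
#include <set>
#include <utility>

int main() {
    // (node, offset) -> number of reads supporting a cut there
    std::map<std::pair<int64_t, size_t>, size_t> support;
    auto observe = [&support](int64_t node, size_t offset) { ++support[{node, offset}]; };

    observe(1, 5); observe(1, 5); observe(1, 5); // three reads agree on this cut
    observe(2, 8);                               // a single noisy read

    const size_t min_coverage = 2;
    std::map<int64_t, std::set<size_t>> kept;
    for (const auto& entry : support) {
        if (entry.second >= min_coverage) {
            kept[entry.first.first].insert(entry.first.second);
        }
    }
    std::cout << "kept breakpoints on " << kept.size() << " node(s)\n"; // 1
    return 0;
}
```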
And how + // many bases will be in this new left part? + id_t divide_offset = offset(breakpoint) - current_offset; + + +#ifdef debug + cerr << "Need to divide original " << original_node_id << " at " << breakpoint << "/" << + + original_node_length << endl; + cerr << "Translates to " << graph->get_id(right_part) << " at " << divide_offset << "/" << + graph->get_length(right_part) << endl; + cerr << "divide offset is " << divide_offset << endl; +#endif + + if (offset(breakpoint) <= 0) { cerr << "breakpoint is " << breakpoint << endl; } + assert(offset(breakpoint) > 0); + if (offset(breakpoint) >= original_node_length) { cerr << "breakpoint is " << breakpoint << endl; } + assert(offset(breakpoint) < original_node_length); + + // Make a new left part and right part. This updates all the + // existing perfect match paths in the graph. + std::tie(left_part, right_part) = graph->divide_handle(right_part, divide_offset); + +#ifdef debug + + cerr << "Produced " << graph->get_id(left_part) << " (" << graph->get_length(left_part) << " bp)" << endl; + cerr << "Left " << graph->get_id(right_part) << " (" << graph->get_length(right_part) << " bp)" << endl; +#endif + + // The left part is now done. We know it started at current_offset + // and ended before breakpoint, so record it by start position. + + // record forward and reverse + toReturn[last_bp] = graph->get_id(left_part); + toReturn[reverse(breakpoint, original_node_length)] = graph->get_id(left_part); + + // Record that more sequence has been consumed + current_offset += divide_offset; + last_bp = breakpoint; + + } + + // Now the right part is done too. It's going to be the part + // corresponding to the remainder of the original node. + toReturn[last_bp] = graph->get_id(right_part); + toReturn[make_pos_t(original_node_id, true, 0)] = graph->get_id(right_part); + + // and record the start and end of the node + toReturn[make_pos_t(original_node_id, true, original_node_length)] = 0; + toReturn[make_pos_t(original_node_id, false, original_node_length)] = 0; + + } + + return toReturn; +} + +// We use this function to get the id of the node that contains a position on an +// original node. +static nid_t find_new_node(HandleGraph* graph, pos_t old_pos, const map& node_translation) { + if(node_translation.find(make_pos_t(id(old_pos), false, 0)) == node_translation.end()) { + // The node is unchanged + return id(old_pos); + } + // Otherwise, get the first new node starting after that position, and + // then look left. + auto found = node_translation.upper_bound(old_pos); + assert(found != node_translation.end()); + if (id(found->first) != id(old_pos) + || is_rev(found->first) != is_rev(old_pos)) { + return id_t(0); + } + // Get the thing before that (last key <= the position we want + --found; + assert(graph->has_node(found->second)); + + // Return the node we found. 
+ return found->second; +}; + + +bool simplify_filtered_edits(HandleGraph* graph, Alignment& aln, Path& path, const map& node_translation, + const unordered_map& orig_node_sizes, + double min_baseq, double max_frac_n) { + + // check if an edit position is chopped at its next or prev position + auto is_chopped = [&](pos_t edit_position, bool forward) { + // todo: better coverage support at node ends (problem is pack structure doesn't have that extra bin) + bool chopped = offset(edit_position) >= orig_node_sizes.find(id(edit_position))->second - 1 || offset(edit_position) <= 0; + if (!chopped) { + if (forward) { + auto edit_next_position = edit_position; + ++get_offset(edit_next_position); + chopped = find_new_node(graph, edit_position, node_translation) != find_new_node(graph, edit_next_position, node_translation); + } else { + auto edit_prev_position = edit_position; + --get_offset(edit_prev_position); + chopped = find_new_node(graph, edit_position, node_translation) != find_new_node(graph, edit_prev_position, node_translation); + } + } + return chopped; + }; + + bool filtered_an_edit = false; + bool kept_an_edit = false; + + // The base position in the edit + size_t position_in_read = 0; + + // stuff that's getting cut out of the read, which requires cuts to + // quality and and the alignment string + vector> read_deletions; + + for (size_t i = 0; i < path.mapping_size(); ++i) { + // For each Mapping in the path + Mapping& m = *path.mutable_mapping(i); + + // What node are we on? In old node ID space. + id_t node_id = m.position().node_id(); + + // See where the next edit starts in the node. It is always included + // (even when the edit runs backward), unless the edit has 0 length in + // the reference. + pos_t edit_first_position = make_pos_t(m.position()); + + for(size_t j = 0; j < m.edit_size(); ++j) { + // For each Edit in the mapping + Edit& e = *m.mutable_edit(j); + size_t orig_to_length = e.to_length(); // remember here, as we may filter an insertion + + // Work out where its end position on the original node is (inclusive) + // We don't use this on insertions, so 0-from-length edits don't matter. + pos_t edit_last_position = edit_first_position; + get_offset(edit_last_position) += (e.from_length()?e.from_length()-1:0); + + // skip edits whose breakpoitns weren't added due to the coverage filter + // or edits whose avg base quality fails the min_baseq filter + if (!edit_is_match(e)) { + bool chopped; + if (e.from_length() == 0) { + // Just need one-side (prev) test when insertion + chopped = is_chopped(edit_first_position, false); + } else { + chopped = is_chopped(edit_first_position, false) && is_chopped(edit_last_position, true); + } + if (!chopped || + (min_baseq > 0 && get_avg_baseq(e, aln.quality(), position_in_read) < min_baseq) || + (max_frac_n < 1. && get_fraction_of_ns(e.sequence()) > max_frac_n)) { + if (e.from_length() == e.to_length() && !aln.sequence().empty()) { + // if we're smoothing a match out, patch the alignment sequence right away + // todo: actually look up the correct sequence from the translation. + } else if (edit_is_insertion(e)) { + // we're trimming off the filtered insertion. so the alignment's sequence and quality + // will need to get updated. + read_deletions.push_back(make_pair(position_in_read, e.to_length())); + } + e.set_to_length(e.from_length()); + e.set_sequence(""); + filtered_an_edit = true; + + } else { + kept_an_edit = true; + } + } + + // Advance in the right direction along the original node for this edit. 
+ // This way the next one will start at the right place. + get_offset(edit_first_position) += e.from_length(); + + position_in_read += orig_to_length; + } + } + + if (filtered_an_edit) { + // there's something to simplify + path = simplify(path); + + if (!read_deletions.empty()) { + // cut out deleted parts of the read from the sequence and quality + const string& seq = aln.sequence(); + const string& qual = aln.quality(); + string cut_seq; + string cut_qual; + int j = 0; + for (int i = 0; i < seq.length(); ++i) { + if (j < read_deletions.size() && i == read_deletions[j].first) { + // skip a deleted interval + i += read_deletions[j].second - 1; + ++j; + } else { + // copy a single position that wasn't skipped + cut_seq.push_back(seq[i]); + if (!qual.empty()) { + cut_qual.push_back(qual[i]); + } + } + } + aln.set_sequence(cut_seq); + aln.set_quality(cut_qual); + } + } + + return kept_an_edit; +} + +Path add_nodes_and_edges(MutableHandleGraph* graph, + const Path& path, + const map& node_translation, + unordered_map, vector>& added_seqs, + unordered_map& added_nodes, + const unordered_map& orig_node_sizes, + size_t max_node_size) { + + set dangling; + return add_nodes_and_edges(graph, + path, + node_translation, + added_seqs, + added_nodes, + orig_node_sizes, + dangling, + max_node_size); + +} + + +Path add_nodes_and_edges(MutableHandleGraph* graph, + const Path& path, + const map& node_translation, + unordered_map, vector>& added_seqs, + unordered_map& added_nodes, + const unordered_map& orig_node_sizes, + set& dangling, + size_t max_node_size) { + + // The basic algorithm is to traverse the path edit by edit, keeping track + // of a NodeSide for the last piece of sequence we were on. If we hit an + // edit that creates new sequence, we check if it has been added before If + // it has, we use it. If not, we create that new sequence as a node, and + // attach it to the dangling NodeSide(s), and leave its end dangling + // instead. If we hit an edit that corresponds to a match, we know that + // there's a breakpoint on each end (since it's bordered by a non-perfect- + // match or the end of a node), so we can attach its start to the dangling + // NodeSide(s) and leave its end dangling instead. + + // We need node_translation to translate between node ID space, where the + // paths are articulated, and new node ID space, where the edges are being + // made. + + // We need orig_node_sizes so we can remember the sizes of nodes that we + // modified, so we can interpret the paths. It only holds sizes for modified + // nodes. + + // This is where we will keep the version of the path articulated as + // actually embedded in the graph. 
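The comment above describes the core wiring trick of `add_nodes_and_edges`: remember the side left "dangling" by the previous piece of the path, attach it to the near side of whatever gets materialized next, then dangle that piece's far side in its place. A toy walk-through of just that bookkeeping, with bare (node id, is_end) pairs standing in for vg's `NodeSide` and invented node IDs:

```cpp
#include <cstdint>
#include <iostream>
#include <set>
#include <utility>
#include <vector>

using NodeSide = std::pair<int64_t, bool>; // (node id, is_end)

int main() {
    std::vector<int64_t> pieces{11, 12, 13};          // nodes for the path, in order
    std::vector<std::pair<NodeSide, NodeSide>> edges; // wiring we would create
    std::set<NodeSide> dangling;
    for (int64_t node : pieces) {
        for (const NodeSide& d : dangling) {
            edges.push_back({d, {node, false}});      // attach to the start of this piece
        }
        dangling.clear();
        dangling.insert({node, true});                // now its end dangles
    }
    for (const auto& e : edges) {
        std::cout << e.first.first << (e.first.second ? " end" : " start") << " -> "
                  << e.second.first << (e.second.second ? " end" : " start") << "\n";
    }
    return 0;
}
```

Running this prints the two edges 11 end -> 12 start and 12 end -> 13 start, which is exactly the chain the real code would create for three consecutive pieces on the forward strand.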
+ Path embedded; + embedded.set_name(path.name()); + + auto create_new_mappings = [&](pos_t p1, pos_t p2, bool is_rev) { + vector mappings; + vector nodes; + for (pos_t p = p1; p <= p2; ++get_offset(p)) { + auto n = find_new_node(graph, p, node_translation); + assert(n != 0); + nodes.push_back(find_new_node(graph, p, node_translation)); + } + auto np = nodes.begin(); + while (np != nodes.end()) { + size_t c = 0; + auto n1 = np; + while (np != nodes.end() && *n1 == *np) { + ++c; + ++np; // we'll always increment once + } + assert(c); + // set the mapping position + Mapping m; + m.mutable_position()->set_node_id(*n1); + m.mutable_position()->set_is_reverse(is_rev); + // and the edit that says we match + Edit* e = m.add_edit(); + e->set_from_length(c); + e->set_to_length(c); + mappings.push_back(m); + } + return mappings; + }; + + for (size_t i = 0; i < path.mapping_size(); ++i) { + // For each Mapping in the path + const Mapping& m = path.mapping(i); + + // What node are we on? In old node ID space. + id_t node_id = m.position().node_id(); + + // See where the next edit starts in the node. It is always included + // (even when the edit runs backward), unless the edit has 0 length in + // the reference. + pos_t edit_first_position = make_pos_t(m.position()); + + for(size_t j = 0; j < m.edit_size(); ++j) { + // For each Edit in the mapping + const Edit& e = m.edit(j); + + // Work out where its end position on the original node is (inclusive) + // We don't use this on insertions, so 0-from-length edits don't matter. + pos_t edit_last_position = edit_first_position; + //get_offset(edit_last_position) += (e.from_length()?e.from_length()-1:0); + get_offset(edit_last_position) += (e.from_length()?e.from_length()-1:0); + +//#define debug_edit true +#ifdef debug_edit + cerr << "Edit on " << node_id << " from " << edit_first_position << " to " << edit_last_position << endl; + cerr << pb2json(e) << endl; +#endif + + if(edit_is_insertion(e) || edit_is_sub(e)) { + // This edit introduces new sequence. +#ifdef debug_edit + cerr << "Handling ins/sub relative to " << node_id << endl; +#endif + // store the path representing this novel sequence in the translation table + auto prev_position = edit_first_position; + Path from_path; + auto prev_from_mapping = from_path.add_mapping(); + *prev_from_mapping->mutable_position() = make_position(prev_position); + auto from_edit = prev_from_mapping->add_edit(); + from_edit->set_sequence(e.sequence()); + from_edit->set_to_length(e.to_length()); + from_edit->set_from_length(e.from_length()); + // find the position after the edit + // if the edit is not the last in a mapping, the position after is from_length of the edit after this + pos_t next_position; + if (j + 1 < m.edit_size()) { + next_position = prev_position; + get_offset(next_position) += e.from_length(); + auto next_from_mapping = from_path.add_mapping(); + *next_from_mapping->mutable_position() = make_position(next_position); + } else { // implicitly (j + 1 == m.edit_size()) + // if the edit is the last in a mapping, look at the next mapping position + if (i + 1 < path.mapping_size()) { + auto& next_mapping = path.mapping(i+1); + auto next_from_mapping = from_path.add_mapping(); + *next_from_mapping->mutable_position() = next_mapping.position(); + } else { + // if we are at the end of the path, then this insertion has no end, and we do nothing + } + } + // TODO what about forward into reverse???? 
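The `create_new_mappings` lambda above translates each old base position to the new node it now lives on and then collapses runs of identical node IDs into single match mappings. The grouping step on plain data, as a sketch (node IDs are invented):

```cpp
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

int main() {
    std::vector<int64_t> nodes{4, 4, 4, 9, 9, 12};    // new node per old base position
    std::vector<std::pair<int64_t, size_t>> mappings; // (node id, match length)
    for (size_t i = 0; i < nodes.size();) {
        size_t j = i;
        while (j < nodes.size() && nodes[j] == nodes[i]) {
            ++j;
        }
        mappings.emplace_back(nodes[i], j - i);
        i = j;
    }
    for (const auto& m : mappings) {
        std::cout << "node " << m.first << ": match of length " << m.second << "\n";
    }
    return 0;
}
```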
+ if (is_rev(prev_position)) { + from_path = simplify( + reverse_complement_path(from_path, [&](int64_t id) { + auto l = orig_node_sizes.find(id); + if (l == orig_node_sizes.end()) { + // The node has no entry, so it must not have been broken + return graph->get_length(graph->get_handle(id)); + } else { + return l->second; + } + })); + } + + // Create the new nodes, reversing it if we are reversed + vector new_nodes; + pos_t start_pos = make_pos_t(from_path.mapping(0).position()); + // We put in the reverse of our sdequence if we are an insert on + // the revers of a node, to keep the graph pointing mostly the + // same direction. + auto fwd_seq = m.position().is_reverse() ? + reverse_complement(e.sequence()) + : e.sequence(); + auto novel_edit_key = make_pair(start_pos, fwd_seq); + auto added = added_seqs.find(novel_edit_key); + if (added != added_seqs.end()) { + // if we have the node run already, don't make it again, just use the existing one + new_nodes = added->second; +#ifdef debug_edit + cerr << "Re-using already added nodes: "; + for (auto n : new_nodes) { + cerr << n << " "; + } + cerr << endl; +#endif + } else { + // Make a new run of nodes of up to max_node_size each + + // Make sure that we are trying to make a run of nodes of + // the length we're supposed to be. + assert(path_to_length(from_path) == fwd_seq.size()); + + size_t cursor = 0; + while (cursor < fwd_seq.size()) { + // Until we used up all the sequence, make nodes + handle_t new_node = graph->create_handle(fwd_seq.substr(cursor, max_node_size)); + cursor += max_node_size; + +#ifdef debug_edit + cerr << "Create new node " << pb2json(*new_node) << endl; +#endif + if (!new_nodes.empty()) { + // Connect each to the previous node in the chain. + graph->create_edge(graph->get_handle(new_nodes.back()), new_node); +#ifdef debug_edit + cerr << "Create edge " << new_nodes.back() << "," << graph->get_id(new_node) << endl; +#endif + } + + // Remember the new node + new_nodes.push_back(graph->get_id(new_node)); + + // Chop the front of the from path off and associate it + // with this node. TODO: this is n^2 in number of nodes + // we add because we copy the whole path each time. + Path front_path; + + if (path_to_length(from_path) > graph->get_length(new_node)) { + // There will still be path left, so we cut the path + tie(front_path, from_path) = cut_path(from_path, graph->get_length(new_node)); + } else { + // We consume the rest of the path. Don't bother cutting it. + swap(front_path, from_path); + } + + // The front bit of the path belongs to this new node + added_nodes[graph->get_id(new_node)] = front_path; + + } + + // reverse the order of the nodes if we did a rev-comp + if (m.position().is_reverse()) { + std::reverse(new_nodes.begin(), new_nodes.end()); + } + + // TODO: fwd_seq can't be empty or problems will be happen + // because we'll have an empty vector of created nodes. I + // think the edit won't be an insert or sub if it is, + // though. + + // Remember that this run belongs to this edit + added_seqs[novel_edit_key] = new_nodes; + + } + + for (auto new_node : new_nodes) { + // Add a mapping to each newly created node + Mapping& nm = *embedded.add_mapping(); + nm.mutable_position()->set_node_id(new_node); + nm.mutable_position()->set_is_reverse(m.position().is_reverse()); + + // Don't set a rank; since we're going through the input + // path in order, the auto-generated ranks will put our + // newly created mappings in order. 
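Novel sequence introduced by an insertion or substitution is not added as one giant node: the loop above consumes it `max_node_size` characters at a time and chains the pieces with edges. The same chunking on a bare string (the sequence and size here are arbitrary):

```cpp
#include <iostream>
#include <string>
#include <vector>

int main() {
    const std::string novel_seq = "ACGTACGTACGT";
    const size_t max_node_size = 5;
    std::vector<std::string> pieces;
    for (size_t cursor = 0; cursor < novel_seq.size(); cursor += max_node_size) {
        // substr clamps at the end of the string, so the last piece may be short
        pieces.push_back(novel_seq.substr(cursor, max_node_size));
    }
    for (size_t i = 0; i < pieces.size(); ++i) {
        std::cout << "node " << i + 1 << ": " << pieces[i];
        if (i + 1 < pieces.size()) {
            std::cout << "  -> edge to node " << i + 2;
        }
        std::cout << "\n";
    }
    return 0;
}
```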
+ + Edit* e = nm.add_edit(); + size_t l = graph->get_length(graph->get_handle(new_node)); + e->set_from_length(l); + e->set_to_length(l); + } + + for (auto& dangler : dangling) { + // This actually referrs to a node. + + // Attach what was dangling to the early-in-the-alignment side of the newly created run. + auto to_attach = NodeSide(m.position().is_reverse() ? new_nodes.back() : new_nodes.front(), + m.position().is_reverse()); + +#ifdef debug_edit + cerr << "Connecting " << dangler << " and " << to_attach << endl; +#endif + // Add an edge from the dangling NodeSide to the start of this new node + auto from_handle = graph->get_handle(dangler.node, !dangler.is_end); + auto to_handle = graph->get_handle(to_attach.node, to_attach.is_end); + graph->create_edge(from_handle, to_handle); + } + + // Dangle the late-in-the-alignment end of this run of new nodes + dangling.clear(); + dangling.insert(NodeSide(m.position().is_reverse() ? new_nodes.front() : new_nodes.back(), + !m.position().is_reverse())); + + // save edit into translated path + + } else if(edit_is_match(e)) { + // We're using existing sequence + + // We know we have breakpoints on both sides, but we also might + // have additional breakpoints in the middle. So we need the + // left node, that contains the first base of the match, and the + // right node, that contains the last base of the match. + id_t left_node = find_new_node(graph, edit_first_position, node_translation); + id_t right_node = find_new_node(graph, edit_last_position, node_translation); + + // TODO: we just assume the outer edges of these nodes are in + // the right places. They should be if we cut the breakpoints + // right. + + // get the set of new nodes that we map to + // and use the lengths of each to create new mappings + // and append them to the path we are including + for (auto nm : create_new_mappings(edit_first_position, + edit_last_position, + m.position().is_reverse())) { + + *embedded.add_mapping() = nm; + + // Don't set a rank; since we're going through the input + // path in order, the auto-generated ranks will put our + // newly created mappings in order. + } + +#ifdef debug_edit + cerr << "Handling match relative to " << node_id << endl; +#endif + + for (auto& dangler : dangling) { +#ifdef debug_edit + cerr << "Connecting " << dangler << " and " << NodeSide(left_node->id(), m.position().is_reverse()) << endl; +#endif + + // Connect the left end of the left node we matched in the direction we matched it + auto from_handle = graph->get_handle(dangler.node, !dangler.is_end); + auto to_handle = graph->get_handle(left_node, m.position().is_reverse()); + graph->create_edge(from_handle, to_handle); + } + + // Dangle the right end of the right node in the direction we matched it. + if (right_node != 0) { + dangling.clear(); + dangling.insert(NodeSide(right_node, !m.position().is_reverse())); + } + } else { + // We don't need to deal with deletions since we'll deal with the actual match/insert edits on either side +#ifdef debug_edit + cerr << "Skipping other edit relative to " << node_id << endl; +#endif + } + + // Advance in the right direction along the original node for this edit. + // This way the next one will start at the right place. + get_offset(edit_first_position) += e.from_length(); + +//#undef debug_edut + } + + } + + // Actually return the embedded path. 
+ return embedded; + +} + +vector make_translation(const HandleGraph* graph, + const map& node_translation, + const unordered_map& added_nodes, + const unordered_map& orig_node_sizes) { + vector translation; + // invert the translation + map inv_node_trans; + for (auto& t : node_translation) { + if (!is_rev(t.first)) { + inv_node_trans[t.second] = t.first; + } + } + // walk the whole graph + graph->for_each_handle([&](handle_t handle) { + id_t node = graph->get_id(handle); + translation.emplace_back(); + auto& trans = translation.back(); + auto f = inv_node_trans.find(node); + auto added = added_nodes.find(node); + if (f != inv_node_trans.end()) { + // if the node is in the inverted translation, use the position to make a mapping + auto pos = f->second; + auto from_mapping = trans.mutable_from()->add_mapping(); + auto to_mapping = trans.mutable_to()->add_mapping(); + // Make sure the to mapping is in the same orientation as the + // from mapping, since we're going to be making translations on + // both strands and the new node is the same local orientation + // as the old node. + *to_mapping->mutable_position() = make_position(node, is_rev(pos), 0); + *from_mapping->mutable_position() = make_position(pos); + auto match_length = graph->get_length(handle); + auto to_edit = to_mapping->add_edit(); + to_edit->set_to_length(match_length); + to_edit->set_from_length(match_length); + auto from_edit = from_mapping->add_edit(); + from_edit->set_to_length(match_length); + from_edit->set_from_length(match_length); + } else if (added != added_nodes.end()) { + // the node is novel + auto to_mapping = trans.mutable_to()->add_mapping(); + *to_mapping->mutable_position() = make_position(node, false, 0); + auto to_edit = to_mapping->add_edit(); + to_edit->set_to_length(graph->get_length(handle)); + to_edit->set_from_length(graph->get_length(handle)); + auto from_path = trans.mutable_from(); + *trans.mutable_from() = added->second; + } else { + // otherwise we assume that the graph is unchanged + auto from_mapping = trans.mutable_from()->add_mapping(); + auto to_mapping = trans.mutable_to()->add_mapping(); + *to_mapping->mutable_position() = make_position(node, false, 0); + *from_mapping->mutable_position() = make_position(node, false, 0); + auto match_length = graph->get_length(handle); + auto to_edit = to_mapping->add_edit(); + to_edit->set_to_length(match_length); + to_edit->set_from_length(match_length); + auto from_edit = from_mapping->add_edit(); + from_edit->set_to_length(match_length); + from_edit->set_from_length(match_length); + } + }); + std::sort(translation.begin(), translation.end(), + [&](const Translation& t1, const Translation& t2) { + if (!t1.from().mapping_size() && !t2.from().mapping_size()) { + // warning: this won't work if we don't have to mappings + // this guards against the lurking segfault + return t1.to().mapping_size() && t2.to().mapping_size() + && make_pos_t(t1.to().mapping(0).position()) + < make_pos_t(t2.to().mapping(0).position()); + } else if (!t1.from().mapping_size()) { + return true; + } else if (!t2.from().mapping_size()) { + return false; + } else { + return make_pos_t(t1.from().mapping(0).position()) + < make_pos_t(t2.from().mapping(0).position()); + } + }); + // append the reverse complement of the translation + translation.reserve(translation.size() * 2); + auto get_curr_node_length = [&](id_t id) { + return graph->get_length(graph->get_handle(id)); + }; + auto get_orig_node_length = [&](id_t id) { + auto f = orig_node_sizes.find(id); + if (f == 
orig_node_sizes.end()) { + // The node has no entry, so it must not have been broken + return graph->get_length(graph->get_handle(id)); + } + return f->second; + }; + for (auto& trans : translation) { + translation.emplace_back(); + auto& rev_trans = translation.back(); + *rev_trans.mutable_to() = simplify(reverse_complement_path(trans.to(), get_curr_node_length)); + *rev_trans.mutable_from() = simplify(reverse_complement_path(trans.from(), get_orig_node_length)); + } + return translation; +} + + +void add_edges_only(MutableHandleGraph* graph, + function,bool, bool)> iterate_gam, + double min_mapq, + size_t min_bp_coverage) { + // occurrence of each non-graph edge + // todo: is this too big? do we need something more compact? + // novel non-graph edges from read-mappings should be pretty local (and not quadratically exploding) + // in general, i think. + vector> edge_counts(get_thread_count()); + + // scan every non-graph edge in the alignment paths. if we have a coverage threshold, + // then fill in the edge_counts map, otherwise just add the edges as soon as they're found + iterate_gam((function)[&](Alignment& aln) { + + if (aln.mapping_quality() < min_mapq) { + return; + } + + handle_t prev_handle; + for (size_t i = 0; i < aln.path().mapping_size(); ++i) { + const Mapping& mapping = aln.path().mapping(i); + const Position& pos = mapping.position(); + handle_t handle = graph->get_handle(pos.node_id(), pos.is_reverse()); + if (i > 0) { + edge_t edge = graph->edge_handle(prev_handle, handle); + if (!graph->has_edge(edge)) { + if (min_bp_coverage > 1) { + edge_counts[omp_get_thread_num()][edge]++; + } else { + graph->create_edge(edge); + } + } + } + prev_handle = handle; + } + }, false, min_bp_coverage > 1); + + if (min_bp_coverage > 1) { + // second pass required to add edges that meet threshold + + // start by merging the thread counters into the first + for (size_t i = 1; i < edge_counts.size(); ++i) { + for (const auto& ec : edge_counts[i]) { + edge_counts[0][ec.first] += ec.second; + } + edge_counts[i].clear(); + } + // then add all the edges that meet threshold + for (const auto& ec : edge_counts[0]) { + if (ec.second >= min_bp_coverage) { + graph->create_edge(ec.first); + } + } + } +} + +} diff --git a/src/augment.hpp b/src/augment.hpp new file mode 100644 index 00000000000..9e5e3902494 --- /dev/null +++ b/src/augment.hpp @@ -0,0 +1,212 @@ +#ifndef VG_AUGMENT_HPP_INCLUDED +#define VG_AUGMENT_HPP_INCLUDED + +#include +#include +#include +#include +#include +#include + +#include "handle.hpp" + +namespace vg { + +class Packer; + +using namespace std; + +/// %Edit the graph to include all the sequence and edges added by the given +/// paths. Can handle paths that visit nodes in any orientation. Note that +/// this method sorts the graph and rebuilds the path index, so it should +/// not be called in a loop. +/// +/// if gam_path is "-", then stdin used +/// if gam_out_path is "-", then stdout used +/// If gam_out_path is not empty, the paths will be modified to reflect their +/// embedding in the modified graph and written to the path. +/// aln_format used to toggle between GAM and GAF +/// If out_translation is not null, a list of translations, one per node existing +/// after the edit, describing +/// how each new or conserved node is embedded in the old graph. +/// if embed_paths is true, then the augmented alignemnents will be saved as embededed paths in the graph +/// in order to add it back to the graph. 
+/// If break_at_ends is true, nodes will be broken at
+/// the ends of paths that start/end with perfect matches, so the paths can
+/// be added to the vg graph's paths object.
+/// If remove_soft_clips is true, soft clips will be removed from the input paths
+/// before processing, and the dangling ends won't end up in the graph.
+/// If filter_out_of_graph_alignments is true, some extra time will be taken to check if
+/// all nodes in the alignment are in the graph. If they aren't, the alignment will be ignored.
+/// If an edit sequence's avg base quality is less than min_baseq it will be ignored (considered a match).
+/// If an alignment's mapping quality is less than min_mapq it is ignored.
+/// A packer is required for all non-mapq filters.
+/// If a breakpoint has less than min_bp_coverage it is not included in the graph.
+/// Edits with more than max_frac_n N content will be ignored.
+void augment(MutablePathMutableHandleGraph* graph,
+             const string& gam_path,
+             const string& aln_format = "GAM",
+             vector<Translation>* out_translation = nullptr,
+             const string& gam_out_path = "",
+             bool embed_paths = false,
+             bool break_at_ends = false,
+             bool remove_soft_clips = false,
+             bool filter_out_of_graph_alignments = false,
+             double min_baseq = 0,
+             double min_mapq = 0,
+             Packer* packer = nullptr,
+             size_t min_bp_coverage = 0,
+             double max_frac_n = 1.,
+             bool edges_only = false);
+
+/// Like above, but operates on a vector of Paths instead of a file.
+/// (Note: it is best to use the file interface to stream large numbers of alignments to save memory.)
+void augment(MutablePathMutableHandleGraph* graph,
+             vector<Path>& path_vector,
+             const string& aln_format = "GAM",
+             vector<Translation>* out_translation = nullptr,
+             const string& gam_out_path = "",
+             bool embed_paths = false,
+             bool break_at_ends = false,
+             bool remove_soft_clips = false,
+             bool filter_out_of_graph_alignments = false,
+             double min_baseq = 0,
+             double min_mapq = 0,
+             Packer* packer = nullptr,
+             size_t min_bp_coverage = 0,
+             double max_frac_n = 1.,
+             bool edges_only = false);
+
+/// Generic version used to implement the above augment() methods.
+void augment_impl(MutablePathMutableHandleGraph* graph,
+                  function<void(function<void(Alignment&)>, bool, bool)> iterate_gam,
+                  const string& aln_format,
+                  vector<Translation>* out_translation,
+                  const string& gam_out_path,
+                  bool embed_paths,
+                  bool break_at_ends,
+                  bool remove_soft_clips,
+                  bool filter_out_of_graph_alignments,
+                  double min_baseq,
+                  double min_mapq,
+                  Packer* packer,
+                  size_t min_bp_coverage,
+                  double max_frac_n,
+                  bool edges_only);
+
+/// Add a path to the graph. This is like VG::extend, and expects
+/// a path with no edits, and for all the nodes and edges in the path
+/// to exist exactly in the graph.
+path_handle_t add_path_to_graph(MutablePathHandleGraph* graph, const Path& path);
+
+/// Compute the average base quality of an edit.
+/// If the edit has no sequence or there are no base_quals given,
+/// then double_max is returned.
+double get_avg_baseq(const Edit& edit, const string& base_quals, size_t position_in_read);
+
+/// Find all the points at which a Path enters or leaves nodes in the graph. Adds
+/// them to the given map by node ID of sets of bases in the node that will need
+/// to become the starts of new nodes.
+///
+/// If break_ends is true, emits breakpoints at the ends of the path, even
+/// if it starts/ends with perfect matches.
+void find_breakpoints(const Path& path, unordered_map<id_t, set<pos_t>>& breakpoints, bool break_ends = true,
+                      const string& base_quals = "", double min_baseq = 0, double max_frac_n = 1.);
+
+/// Flips the breakpoints onto the forward strand.
+unordered_map<id_t, set<pos_t>> forwardize_breakpoints(const HandleGraph* graph,
+                                                       const unordered_map<id_t, set<pos_t>>& breakpoints);
+
+/// Like find_breakpoints, but stores the breakpoints in a packed structure
+/// (better for large GAMs, and enables the coverage filter).
+void find_packed_breakpoints(const Path& path, Packer& packed_breakpoints, bool break_ends = true,
+                             const string& base_quals = "", double min_baseq = 0, double max_frac_n = 1.);
+
+/// Filters the breakpoints by coverage, and converts them back from the Packer to the STL map
+/// expected by the following methods.
+unordered_map<id_t, set<pos_t>> filter_breakpoints_by_coverage(const Packer& packed_breakpoints, size_t min_bp_coverage);
+
+/// Take a map from node ID to a set of offsets at which new nodes should
+/// start (which may include 0 and 1-past-the-end, which should be ignored),
+/// break the specified nodes at those positions. Returns a map from old
+/// node start position to new node in the graph. Note that the
+/// caller will have to clear and rebuild path rank data.
+///
+/// Returns a map from old node start position to new node. This map
+/// contains some entries pointing to null, for positions past the ends of
+/// original nodes. It also maps from positions on either strand of the old
+/// node to the same new node; the new node's forward strand is
+/// always the same as the old node's forward strand.
+map<pos_t, id_t> ensure_breakpoints(MutableHandleGraph* graph,
+                                    const unordered_map<id_t, set<pos_t>>& breakpoints);
+
+/// Remove edits in the path that don't correspond to breakpoints (i.e. were effectively filtered
+/// out due to insufficient coverage). This way, subsequent logic in add_nodes_and_edges
+/// can be run correctly. Returns true if at least one edit survived the filter.
+bool simplify_filtered_edits(HandleGraph* graph, Alignment& aln, Path& path, const map<pos_t, id_t>& node_translation,
+                             const unordered_map<id_t, size_t>& orig_node_sizes,
+                             double min_baseq = 0, double max_frac_n = 1.);
+
+/// Given a path on nodes that may or may not exist, and a map from start
+/// position in the old graph to a node in the current graph, add all the
+/// new sequence and edges required by the path. The given path must not
+/// contain adjacent perfect match edits in the same mapping, or any
+/// deletions on the start or end of mappings (the removal of which can be
+/// accomplished with the Path::simplify() function).
+///
+/// Outputs (and caches for subsequent calls) novel node runs in added_seqs,
+/// and Paths describing where novel nodes translate back to in the original
+/// graph in added_nodes. Also needs a map of the original sizes of nodes
+/// deleted from the original graph, for reverse complementing. If dangling
+/// is nonempty, left edges of nodes created for initial inserts will
+/// connect to the specified sides. At the end, dangling is populated with
+/// the side corresponding to the last edit in the path.
+///
+/// Returns a fully embedded version of the path, after all node insertions,
+/// divisions, and translations.
+Path add_nodes_and_edges(MutableHandleGraph* graph, + const Path& path, + const map& node_translation, + unordered_map, vector>& added_seqs, + unordered_map& added_nodes, + const unordered_map& orig_node_sizes, + set& dangling, + size_t max_node_size = 1024); + +/// This version doesn't require a set of dangling sides to populate +Path add_nodes_and_edges(MutableHandleGraph* graph, + const Path& path, + const map& node_translation, + unordered_map, vector>& added_seqs, + unordered_map& added_nodes, + const unordered_map& orig_node_sizes, + size_t max_node_size = 1024); + +/// Produce a graph Translation object from information about the editing process. +vector make_translation(const HandleGraph* graph, + const map& node_translation, + const unordered_map& added_nodes, + const unordered_map& orig_node_sizes); + +/// Add edges between consecutive mappings that aren't already in the graph +/// note: offsets are completely ignored (a simplifying assumption designed +/// to help with SV genotpying with pack/call as edge packing works similarly) +/// +/// No existing nodes or edges are modified, and no nodes are added, just edges +/// So the output graph will be id-space compatible, and any GAM/GAF will continue +/// to be valid for it. +void add_edges_only(MutableHandleGraph* graph, + function,bool, bool)> iterate_gam, + double min_mapq, + size_t min_bp_coverage); + +} + +#endif diff --git a/src/back_translating_alignment_emitter.cpp b/src/back_translating_alignment_emitter.cpp new file mode 100644 index 00000000000..f9fe6f14246 --- /dev/null +++ b/src/back_translating_alignment_emitter.cpp @@ -0,0 +1,72 @@ +/** + * \file back_translating_alignment_emitter.cpp + * Implementation for BackTranslatingAlignmentEmitter + */ + + +#include "back_translating_alignment_emitter.hpp" +#include "algorithms/back_translate.hpp" + +namespace vg { + +using namespace std; + +BackTranslatingAlignmentEmitter::BackTranslatingAlignmentEmitter(const NamedNodeBackTranslation* translation, + unique_ptr&& backing) : translation(translation), backing(std::move(backing)) { + // Nothing to do! 
+} + +void BackTranslatingAlignmentEmitter::back_translate_alignments_in_place(vector& alns) const { + for (auto& aln : alns) { + algorithms::back_translate_in_place(translation, *aln.mutable_path()); + } +} + +void BackTranslatingAlignmentEmitter::emit_singles(vector&& aln_batch) { + // Intercept the batch on its way + vector aln_batch_caught(aln_batch); + // Process it in place + back_translate_alignments_in_place(aln_batch_caught); + // Forward it along + backing->emit_singles(std::move(aln_batch_caught)); +} + +void BackTranslatingAlignmentEmitter::emit_mapped_singles(vector>&& alns_batch) { + // Intercept the batch on its way + vector> alns_batch_caught(alns_batch); + for (auto& mappings : alns_batch_caught) { + // Surject all mappings in place + back_translate_alignments_in_place(mappings); + } + // Forward it along + backing->emit_mapped_singles(std::move(alns_batch_caught)); +} + +void BackTranslatingAlignmentEmitter::emit_pairs(vector&& aln1_batch, vector&& aln2_batch, vector&& tlen_limit_batch) { + // Intercept the batch on its way + vector aln1_batch_caught(aln1_batch); + vector aln2_batch_caught(aln2_batch); + // Process it in place + back_translate_alignments_in_place(aln1_batch_caught); + back_translate_alignments_in_place(aln2_batch_caught); + // Forward it along + backing->emit_pairs(std::move(aln1_batch_caught), std::move(aln2_batch_caught), std::move(tlen_limit_batch)); +} + +void BackTranslatingAlignmentEmitter::emit_mapped_pairs(vector>&& alns1_batch, vector>&& alns2_batch, vector&& tlen_limit_batch) { + // Intercept the batch on its way + vector> alns1_batch_caught(alns1_batch); + vector> alns2_batch_caught(alns2_batch); + for (auto& mappings : alns1_batch_caught) { + // Process all mappings in place + back_translate_alignments_in_place(mappings); + } + for (auto& mappings : alns2_batch_caught) { + // Process all mappings in place + back_translate_alignments_in_place(mappings); + } + // Forward it along + backing->emit_mapped_pairs(std::move(alns1_batch_caught), std::move(alns2_batch_caught), std::move(tlen_limit_batch)); +} + +} diff --git a/src/back_translating_alignment_emitter.hpp b/src/back_translating_alignment_emitter.hpp new file mode 100644 index 00000000000..160fd667d88 --- /dev/null +++ b/src/back_translating_alignment_emitter.hpp @@ -0,0 +1,70 @@ +#ifndef VG_BACK_TRANSLATING_ALIGNMENT_EMITTER_HPP_INCLUDED +#define VG_BACK_TRANSLATING_ALIGNMENT_EMITTER_HPP_INCLUDED + +/** \file + * + * Holds a back-translating wrapper AlignmentEmitter. + */ + + +#include "vg/io/alignment_emitter.hpp" +#include "handle.hpp" + +#include +#include +#include + +namespace vg { + +using namespace std; + +/** + * An AlignmentEmitter implementation that translates alignments into + * named-segment space coordinates before emitting them via a backing + * AlignmentEmitter, which it owns. + */ +class BackTranslatingAlignmentEmitter : public vg::io::AlignmentEmitter { +public: + + /** + * Make an alignment emitter that translates alignments using the given + * translation and emits them to the given backing AlignmentEmitter. + * Takes ownership of the AlignmentEmitter. + */ + BackTranslatingAlignmentEmitter(const NamedNodeBackTranslation* translation, + unique_ptr&& backing); + + /// Emit a batch of Alignments + virtual void emit_singles(vector&& aln_batch); + /// Emit batch of Alignments with secondaries. All secondaries must have is_secondary set already. + virtual void emit_mapped_singles(vector>&& alns_batch); + /// Emit a batch of pairs of Alignments. 
The tlen_limit_batch, if + /// specified, is the maximum pairing distance for each pair to flag + /// properly paired, if the output format cares about such things. TODO: + /// Move to a properly paired annotation that runs with the Alignment. + virtual void emit_pairs(vector&& aln1_batch, vector&& aln2_batch, + vector&& tlen_limit_batch); + /// Emit the mappings of a batch of pairs of Alignments. All secondaries + /// must have is_secondary set already. The tlen_limit_batch, if specified, + /// is the maximum pairing distance for each pair to flag properly paired, + /// if the output format cares about such things. TODO: Move to a properly + /// paired annotation that runs with the Alignment. + /// + /// Both ends of each pair must have the same number of mappings. + virtual void emit_mapped_pairs(vector>&& alns1_batch, + vector>&& alns2_batch, vector&& tlen_limit_batch); + +protected: + /// Translation to use to translate node IDs to pieces of named segments. + const NamedNodeBackTranslation* translation; + + /// AlignmentEmitter to emit to once done + unique_ptr backing; + + /// Back-translate alignments in place. + void back_translate_alignments_in_place(vector& alns) const; +}; + +} + +#endif diff --git a/src/banded_global_aligner.cpp b/src/banded_global_aligner.cpp index a5a68221812..0dca6eb05bb 100644 --- a/src/banded_global_aligner.cpp +++ b/src/banded_global_aligner.cpp @@ -6,13 +6,18 @@ // #include "banded_global_aligner.hpp" -#include "json2pb.h" +#include "vg/io/json2pb.h" //#define debug_banded_aligner_objects //#define debug_banded_aligner_graph_processing //#define debug_banded_aligner_fill_matrix //#define debug_banded_aligner_traceback //#define debug_banded_aligner_print_matrices +//#define debug_jemalloc + +#ifdef debug_jemalloc +#include +#endif namespace vg { @@ -21,7 +26,7 @@ BandedGlobalAligner::BABuilder::BABuilder(Alignment& alignment) : alignment(alignment), matrix_state(Match), matching(false), - current_node(nullptr), + current_node_id(0), edit_length(0), edit_read_end_idx(0) { @@ -34,22 +39,23 @@ BandedGlobalAligner::BABuilder::~BABuilder() { } template -void BandedGlobalAligner::BABuilder::update_state(matrix_t matrix, Node* node, - int64_t read_idx, int64_t node_idx, +void BandedGlobalAligner::BABuilder::update_state(const HandleGraph& graph, matrix_t matrix, + const handle_t& node, int64_t read_idx, int64_t node_idx, bool empty_node_seq) { #ifdef debug_banded_aligner_traceback cerr << "[BABuilder::update_state] beginning " << (empty_node_seq ? "" : "non-") << "empty state update for read index " << read_idx << ", node seq index " << node_idx << endl; #endif - if (node != current_node) { + if (graph.get_id(node) != current_node_id) { #ifdef debug_banded_aligner_traceback - cerr << "[BABuilder::update_state] at new node " << (node ? node->id() : -1) << " previously " << (current_node ?
current_node->id() : -1) << endl; + cerr << "[BABuilder::update_state] at new node " << graph.get_id(node) << " previously " << current_node_id << endl; #endif // conclude current mapping and proceed to next node finish_current_node(); - current_node = node; + current_node_id = graph.get_id(node); + current_node_sequence = graph.get_sequence(node); matrix_state = matrix; if (matrix_state == Match) { - matching = (alignment.sequence()[read_idx] == current_node->sequence()[node_idx]); + matching = (alignment.sequence()[read_idx] == current_node_sequence[node_idx]); } edit_length = !empty_node_seq; edit_read_end_idx = read_idx; @@ -64,13 +70,13 @@ void BandedGlobalAligner::BABuilder::update_state(matrix_t matrix, Node } matrix_state = matrix; if (matrix_state == Match) { - matching = (alignment.sequence()[read_idx] == current_node->sequence()[node_idx]); + matching = (alignment.sequence()[read_idx] == current_node_sequence[node_idx]); } edit_length = 1; edit_read_end_idx = read_idx; } else if (matrix == Match && - (alignment.sequence()[read_idx] == current_node->sequence()[node_idx]) != matching) { + (alignment.sequence()[read_idx] == current_node_sequence[node_idx]) != matching) { #ifdef debug_banded_aligner_traceback cerr << "[BABuilder::update_state] switching between match and mismatch" << endl; #endif @@ -86,7 +92,7 @@ void BandedGlobalAligner::BABuilder::update_state(matrix_t matrix, Node } #ifdef debug_banded_aligner_traceback - cerr << "[BABuilder::update_state] finished updating state, matrix is " << (matrix_state == Match ? "match" : (matrix_state == InsertRow ? "insert row" : "insert column" )) << ", is matching? " << (matching ? "yes" : "no") << ", edit length " << edit_length << ", edit end index (on read) " << edit_read_end_idx << ", current node " << current_node->id() << endl; + cerr << "[BABuilder::update_state] finished updating state, matrix is " << (matrix_state == Match ? "match" : (matrix_state == InsertRow ? "insert row" : "insert column" )) << ", is matching? " << (matching ? 
"yes" : "no") << ", edit length " << edit_length << ", edit end index (on read) " << edit_read_end_idx << ", current node " << current_node_id << endl; #endif } @@ -137,7 +143,7 @@ template void BandedGlobalAligner::BABuilder::finish_current_node() { // sentinel for first iteration - if (current_node == nullptr) { + if (current_node_id == 0) { #ifdef debug_banded_aligner_traceback cerr << "[BABuilder::finish_current_node] at beginning of traceback, not creating a mapping" << endl; #endif @@ -147,7 +153,7 @@ void BandedGlobalAligner::BABuilder::finish_current_node() { finish_current_edit(); #ifdef debug_banded_aligner_traceback - cerr << "[BABuilder::finish_current_node] finishing mapping for node " << current_node->id() << endl; + cerr << "[BABuilder::finish_current_node] finishing mapping for node " << current_node_id << endl; #endif node_mappings.emplace_front(); @@ -161,7 +167,7 @@ void BandedGlobalAligner::BABuilder::finish_current_node() { mapping_edits.clear(); - (*(node_mappings.front().mutable_position())).set_node_id(current_node->id()); + (*(node_mappings.front().mutable_position())).set_node_id(current_node_id); // note: global alignment always starts at beginning of node, default offset 0 is correct } @@ -202,15 +208,13 @@ void BandedGlobalAligner::BABuilder::finalize_alignment(const list -BandedGlobalAligner::BAMatrix::BAMatrix(Alignment& alignment, Node* node, int64_t top_diag, - int64_t bottom_diag, BAMatrix** seeds, int64_t num_seeds, - int64_t cumulative_seq_len) : +BandedGlobalAligner::BAMatrix::BAMatrix(Alignment& alignment, handle_t node, int64_t top_diag, int64_t bottom_diag, + const vector& seeds, int64_t cumulative_seq_len) : node(node), top_diag(top_diag), bottom_diag(bottom_diag), seeds(seeds), alignment(alignment), - num_seeds(num_seeds), cumulative_seq_len(cumulative_seq_len), match(nullptr), insert_col(nullptr), @@ -218,50 +222,89 @@ BandedGlobalAligner::BAMatrix::BAMatrix(Alignment& alignment, Node* nod { // nothing to do #ifdef debug_banded_aligner_objects - cerr << "[BAMatrix]: constructor for node " << node->id() << " with sequence " << node->sequence() << " and band from " << top_diag << " to " << bottom_diag << endl;; + cerr << "[BAMatrix]: constructor for node " << as_integer(node) << " and band from " << top_diag << " to " << bottom_diag << endl;; #endif } template BandedGlobalAligner::BAMatrix::~BAMatrix() { #ifdef debug_banded_aligner_objects - if (node != nullptr) { - cerr << "[BAMatrix::~BAMatrix] destructing matrix for node " << node->id() << endl; - } - else { - cerr << "[BAMatrix::~BAMatrix] destructing null matrix" << endl; - } + cerr << "[BAMatrix::~BAMatrix] destructing matrix for handle " << handlegraph::as_integer(node) << endl; #endif free(match); free(insert_row); free(insert_col); - free(seeds); } template -void BandedGlobalAligner::BAMatrix::fill_matrix(int8_t* score_mat, int8_t* nt_table, int8_t gap_open, - int8_t gap_extend, bool qual_adjusted, IntType min_inf) { +void BandedGlobalAligner::BAMatrix::fill_matrix(const HandleGraph& graph, int8_t* score_mat, int8_t* nt_table, + int8_t gap_open, int8_t gap_extend, bool qual_adjusted, IntType min_inf) { #ifdef debug_banded_aligner_fill_matrix - cerr << "[BAMatrix::fill_matrix] beginning DP on matrix for node " << node->id() << endl;; + cerr << "[BAMatrix::fill_matrix] beginning DP on matrix for node " << as_integer(node) << endl;; #endif // note: bottom has the higher index int64_t band_height = bottom_diag - top_diag + 1; - int64_t ncols = node->sequence().length(); + int64_t ncols = 
graph.get_length(node); int64_t band_size = band_height * ncols; #ifdef debug_banded_aligner_fill_matrix cerr << "[BAMatrix::fill_matrix]: allocating matrices of height " << band_height << " and width " << ncols << " for a total cell count of " << band_size << endl; #endif - const string& node_seq = node->sequence(); + string node_seq = graph.get_sequence(node); const string& read = alignment.sequence(); const string& base_quality = alignment.quality(); match = (IntType*) malloc(sizeof(IntType) * band_size); insert_col = (IntType*) malloc(sizeof(IntType) * band_size); insert_row = (IntType*) malloc(sizeof(IntType) * band_size); + if (!match || !insert_col || !insert_row) { + // An allocation has failed. + // We may have run out of virtual memory. + +#ifdef debug_jemalloc + size_t requested_size = sizeof(IntType) * band_size; + size_t usable_size[3] = {0, 0, 0}; +#endif + + // Free up what we are holding, and also report how much usable memory jemalloc gave us for anything that passed. + if (match) { +#ifdef debug_jemalloc + usable_size[0] = malloc_usable_size(match); +#endif + free(match); + } + if (insert_col) { +#ifdef debug_jemalloc + usable_size[1] = malloc_usable_size(insert_col); +#endif + free(insert_col); + } + if (insert_row) { +#ifdef debug_jemalloc + usable_size[2] = malloc_usable_size(insert_row); +#endif + free(insert_row); + } + + cerr << "[BAMatrix::fill_matrix]: failed to allocate matrices of height " << band_height << " and width " << ncols << " for a total cell count of " << band_size << endl; +#ifdef debug_jemalloc + cerr << "[BAMatrix::fill_matrix]: requested: " << requested_size << " actually obtained: " << usable_size[0] << " " << usable_size[1] << " " << usable_size[2] << endl; +#endif + cerr << "[BAMatrix::fill_matrix]: is the alignment problem too big for your virtual or physical memory?" << endl; + +#ifdef debug_jemalloc + // Dump the stats from the allocator. + // TODO: skip when not building with jemalloc somehow.
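+ // For a sense of scale (illustrative arithmetic only, not a measurement from
+ // this code): each of the three matrices holds band_height * ncols cells of
+ // sizeof(IntType) bytes, so a band 1,000 rows tall over a 10,000 bp node with
+ // 16-bit scores requests about 3 * 1,000 * 10,000 * 2 bytes, roughly 60 MB.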
+ malloc_stats_print(nullptr, nullptr, ""); +#endif + + // Bail out relatively safely + throw std::bad_alloc(); + } + /* these represent a band in a matrix, but we store it as a rectangle with chopped * corners * @@ -308,16 +351,14 @@ void BandedGlobalAligner::BAMatrix::fill_matrix(int8_t* score_mat, int8 // we will allow the alignment to treat this node as a source if it has no seeds or if it // is connected to a source node by a length 0 path (which we will check later) - bool treat_as_source = (num_seeds == 0); + bool treat_as_source = seeds.empty(); - list seed_queue; - for (int64_t seed_num = 0; seed_num < num_seeds; seed_num++) { - seed_queue.push_back(seeds[seed_num]); - } + // initialize the queue with all of the predecessors + vector seed_queue = seeds; while (!seed_queue.empty()) { - BAMatrix* seed = seed_queue.front(); - seed_queue.pop_front(); + BAMatrix* seed = seed_queue.back(); + seed_queue.pop_back(); if (seed == nullptr) { #ifdef debug_banded_aligner_fill_matrix @@ -327,20 +368,20 @@ void BandedGlobalAligner::BAMatrix::fill_matrix(int8_t* score_mat, int8 } #ifdef debug_banded_aligner_fill_matrix - cerr << "[BAMatrix::fill_matrix]: doing POA across boundary from seed node " << seed->node->id() << " to node " << node->id() << endl; + cerr << "[BAMatrix::fill_matrix]: doing POA across boundary from seed node " << graph.get_id(seed->node) << " to node " << graph.get_id(node) << endl; #endif - int64_t seed_node_seq_len = seed->node->sequence().length(); + int64_t seed_node_seq_len = graph.get_length(seed->node); if (seed_node_seq_len == 0) { #ifdef debug_banded_aligner_fill_matrix - cerr << "[BAMatrix::fill_matrix]: seed node " << seed->node->id() << " has no sequence, adding its predecessors as seed nodes" << endl; + cerr << "[BAMatrix::fill_matrix]: seed node " << graph.get_id(seed->node) << " has no sequence, adding its predecessors as seed nodes" << endl; #endif // this is a length 0 node, so let this seed's predecessors seed into this one or // identify this node as a "source" - treat_as_source = treat_as_source || (seed->num_seeds == 0); - for (int64_t seed_num = 0; seed_num < seed->num_seeds; seed_num++) { - seed_queue.push_back(seed->seeds[seed_num]); + treat_as_source = treat_as_source || seed->seeds.empty(); + for (auto& seed_of_seed : seed->seeds) { + seed_queue.push_back(seed_of_seed); } continue; } @@ -492,7 +533,7 @@ void BandedGlobalAligner::BAMatrix::fill_matrix(int8_t* score_mat, int8 if (treat_as_source && ncols > 0) { if (cumulative_seq_len != 0) { cerr << "error:[BandedGlobalAligner] banded alignment has no node predecessor for node in middle of path" << endl; - assert(0); + exit(1); } #ifdef debug_banded_aligner_fill_matrix @@ -506,8 +547,7 @@ void BandedGlobalAligner::BAMatrix::fill_matrix(int8_t* score_mat, int8 idx = iter_start * ncols; // cap stop index if last diagonal is below bottom of matrix - int64_t iter_stop = bottom_diag > (int64_t) read.length() ? band_height + (int64_t) read.length() - bottom_diag - 1 : band_height; - + int64_t iter_stop = bottom_diag >= (int64_t) read.length() ? 
band_height + (int64_t) read.length() - bottom_diag - 1 : band_height; // match of first nucleotides if (qual_adjusted) { match[idx] = max(score_mat[25 * base_quality[0] + 5 * nt_table[node_seq[0]] + nt_table[read[0]]], match[idx]); @@ -542,7 +582,6 @@ void BandedGlobalAligner::BAMatrix::fill_matrix(int8_t* score_mat, int8 insert_col[up_idx] - gap_open); // must take two gaps to get into first column insert_col[idx] = max(-2 * gap_open - (top_diag + i) * gap_extend, insert_col[idx]); - #ifdef debug_banded_aligner_fill_matrix cerr << "[BAMatrix::fill_matrix]: on left edge of matrix at rectangle coords (" << i << ", " << 0 << "), match score of node char " << 0 << " (" << node_seq[0] << ") and read char " << i + top_diag << " (" << read[i + top_diag] << ") is " << (int) match_score << ", leading gap length is " << top_diag + i << " for total match matrix score of " << (int) match[idx] << endl; #endif @@ -571,12 +610,12 @@ void BandedGlobalAligner::BAMatrix::fill_matrix(int8_t* score_mat, int8 for (int64_t j = 1; j < ncols; j++) { // are we clipping any diagonals because they are outside the range of the matrix in this column? - bool bottom_diag_outside = bottom_diag + j >= (int64_t) read.length(); + bool bottom_diag_outside = bottom_diag + j >= int64_t(read.size()); bool top_diag_outside = top_diag + j < 0; bool top_diag_abutting = top_diag + j == 0; int64_t iter_start = top_diag_outside ? -(top_diag + j) : 0; - int64_t iter_stop = bottom_diag_outside ? band_height + (int64_t) read.length() - bottom_diag - j - 1 : band_height; + int64_t iter_stop = bottom_diag_outside ? band_height + int64_t(read.size()) - bottom_diag - j - 1 : band_height; idx = iter_start * ncols + j; @@ -683,53 +722,43 @@ void BandedGlobalAligner::BAMatrix::fill_matrix(int8_t* score_mat, int8 } #ifdef debug_banded_aligner_print_matrices - print_full_matrices(); - print_rectangularized_bands(); + print_full_matrices(graph); + print_rectangularized_bands(graph); #endif } + template -void BandedGlobalAligner::BAMatrix::traceback(BABuilder& builder, AltTracebackStack& traceback_stack, matrix_t start_mat, - int8_t* score_mat, int8_t* nt_table, int8_t gap_open, int8_t gap_extend, - bool qual_adjusted, IntType min_inf) { +void BandedGlobalAligner::BAMatrix::init_traceback_indexes(const HandleGraph& graph, int64_t& i, int64_t& j) { // get coordinates of bottom right corner const string& read = alignment.sequence(); - int64_t ncols = node->sequence().length(); - int64_t row = bottom_diag + ncols > (int64_t) read.length() ? (int64_t) read.length() - top_diag - ncols : bottom_diag - top_diag; - int64_t col = ncols - 1; - - -#ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback] beginning traceback in matrices for node " << node->id() << " starting matrix is " << (start_mat == Match ? "match" : (start_mat == InsertCol ? "insert column" : "insert row")) << endl; -#endif - - traceback_internal(builder, traceback_stack, row, col, start_mat, alignment.sequence().empty(), score_mat, nt_table, gap_open, gap_extend, - qual_adjusted, min_inf); + int64_t ncols = graph.get_length(node); + i = bottom_diag + ncols > (int64_t) read.length() ? 
(int64_t) read.length() - top_diag - ncols : bottom_diag - top_diag; + j = ncols - 1; } template -void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& builder, AltTracebackStack& traceback_stack, - int64_t start_row, int64_t start_col, matrix_t start_mat, - bool in_lead_gap, int8_t* score_mat, int8_t* nt_table, - int8_t gap_open, int8_t gap_extend, bool qual_adjusted, - IntType min_inf) { +void BandedGlobalAligner::BAMatrix::traceback(const HandleGraph& graph, BABuilder& builder, + AltTracebackStack& traceback_stack, + int64_t& i, int64_t& j, matrix_t& mat, bool& in_lead_gap, + const int8_t* score_mat, const int8_t* nt_table, + const int8_t gap_open, const int8_t gap_extend, + const bool qual_adjusted, const IntType min_inf) { #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] starting traceback back through node " << node->id() << " from rectangular coordinates (" << start_row << ", " << start_col << "), currently " << (in_lead_gap ? "" : "not ") << "in a lead gap" << endl; + cerr << "[BAMatrix::traceback] starting traceback back through node " << graph.get_id(node) << " from rectangular coordinates (" << i << ", " << j << "), currently " << (in_lead_gap ? "" : "not ") << "in a lead gap" << endl; #endif const string& read = alignment.sequence(); const string& base_quality = alignment.quality(); int64_t band_height = bottom_diag - top_diag + 1; - const char* node_seq = node->sequence().c_str(); - int64_t ncols = node->sequence().length(); - int64_t node_id = node->id(); + string node_seq = graph.get_sequence(node); + int64_t ncols = node_seq.size(); + int64_t node_id = graph.get_id(node); int64_t idx, next_idx; - int64_t i = start_row, j = start_col; - matrix_t curr_mat = start_mat; IntType curr_score; IntType source_score; IntType score_diff; @@ -738,20 +767,20 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build // do node traceback unless we are in the lead gap implied at the edge of the DP matrix or we // are already at a node boundary trying to get across - while ((j > 0 || curr_mat == InsertRow) && !in_lead_gap) { + while ((j > 0 || mat == InsertRow) && !in_lead_gap) { #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] traceback coordinates (" << i << ", " << j << "), current matrix is " << (curr_mat == Match ? "match" : (curr_mat == InsertCol ? "insert column" : "insert row")) << endl; + cerr << "[BAMatrix::traceback] traceback coordinates (" << i << ", " << j << "), current matrix is " << (mat == Match ? "match" : (mat == InsertCol ? "insert column" : "insert row")) << endl; #endif // add traceback step to alignment - builder.update_state(curr_mat, node, i + top_diag + j, j); + builder.update_state(graph, mat, node, i + top_diag + j, j); // check for a deflection if (traceback_stack.at_next_deflection(node_id, i, j)) { // move to the next position as dictated by current matrix - switch (curr_mat) { + switch (mat) { case Match: j--; break; @@ -772,10 +801,10 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build } // take the deflection and advance the deflection iterator - curr_mat = traceback_stack.deflect_to_matrix(); + mat = traceback_stack.deflect_to_matrix(); #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] taking inside matrix deflection to " << (curr_mat == Match ? "match" : (curr_mat == InsertCol ? "insert column" : "insert row")) << endl; + cerr << "[BAMatrix::traceback] taking inside matrix deflection to " << (mat == Match ? 
"match" : (mat == InsertCol ? "insert column" : "insert row")) << endl; #endif continue; @@ -784,15 +813,15 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build // find optimal traceback idx = i * ncols + j; bool found_trace = false; - switch (curr_mat) { + switch (mat) { case Match: { if (i + j == -top_diag) { #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] next cell is outside matrix, opening implied lead gap" << endl; + cerr << "[BAMatrix::traceback] next cell is outside matrix, opening implied lead gap" << endl; #endif // at top of matrix, move into implied lead gap along top edge - curr_mat = InsertCol; + mat = InsertCol; j--; in_lead_gap = true; // no where else to go, so break out of switch statement without checking alts @@ -811,16 +840,16 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build } #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] transitioning from match, current score " << (int) match[idx] << " match/mismatch score " << (int) match_score << " from node char " << j << " (" << node_seq[j] << ") and read char " << i + top_diag + j << " (" << read[i + top_diag + j] << ")" << endl; + cerr << "[BAMatrix::traceback] transitioning from match, current score " << (int) match[idx] << " match/mismatch score " << (int) match_score << " from node char " << j << " (" << node_seq[j] << ") and read char " << i + top_diag + j << " (" << read[i + top_diag + j] << ")" << endl; #endif source_score = match[next_idx]; score_diff = curr_score - (source_score + match_score); if (score_diff == 0) { #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] found next cell in match matrix with score " << (int) match[next_idx] << endl; + cerr << "[BAMatrix::traceback] found next cell in match matrix with score " << (int) match[next_idx] << endl; #endif - curr_mat = Match; + mat = Match; found_trace = true; } else if (source_score != min_inf) { @@ -833,9 +862,9 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build score_diff = curr_score - (source_score + match_score); if (!found_trace && score_diff == 0) { #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] found next cell in insert row matrix with score " << (int) insert_row[next_idx] << endl; + cerr << "[BAMatrix::traceback] found next cell in insert row matrix with score " << (int) insert_row[next_idx] << endl; #endif - curr_mat = InsertRow; + mat = InsertRow; found_trace = true; } else { @@ -849,9 +878,9 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build score_diff = curr_score - (source_score + match_score); if (!found_trace && score_diff == 0) { #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] found next cell in insert column matrix with score " << (int) insert_col[next_idx] << endl; + cerr << "[BAMatrix::traceback] found next cell in insert column matrix with score " << (int) insert_col[next_idx] << endl; #endif - curr_mat = InsertCol; + mat = InsertCol; found_trace = true; } else if (source_score != min_inf) { @@ -893,7 +922,7 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build source_score = match[next_idx]; score_diff = curr_score - (source_score - gap_open); if (score_diff == 0) { - curr_mat = Match; + mat = Match; found_trace = true; } else if (source_score != min_inf) { @@ -905,7 +934,7 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build if (source_score > 
min_inf) { score_diff = curr_score - (source_score - gap_extend); if (!found_trace && score_diff == 0) { - curr_mat = InsertRow; + mat = InsertRow; found_trace = true; } else if (source_score != min_inf) { @@ -918,7 +947,7 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build if (source_score > min_inf) { score_diff = curr_score - (source_score - gap_open); if (!found_trace && score_diff == 0) { - curr_mat = InsertCol; + mat = InsertCol; found_trace = true; } else if (source_score != min_inf) { @@ -952,7 +981,7 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build source_score = match[next_idx]; score_diff = curr_score - (source_score - gap_open); if (score_diff == 0) { - curr_mat = Match; + mat = Match; found_trace = true; } else if (source_score != min_inf) { @@ -964,7 +993,7 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build if (source_score > min_inf) { score_diff = curr_score - (source_score - gap_open); if (!found_trace && score_diff == 0) { - curr_mat = InsertRow; + mat = InsertRow; found_trace = true; } else if (source_score != min_inf) { @@ -977,7 +1006,7 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build if (source_score > min_inf) { score_diff = curr_score - (source_score - gap_extend); if (!found_trace && score_diff == 0) { - curr_mat = InsertCol; + mat = InsertCol; found_trace = true; } else if (source_score != min_inf) { @@ -1008,24 +1037,35 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build if (in_lead_gap) { #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] running through node sequence in a lead gap" << endl; + cerr << "[BAMatrix::traceback] running through node sequence in a lead gap" << endl; #endif // add lead column gaps until reaching edge of node - curr_mat = InsertCol; + mat = InsertCol; while (j > 0) { - builder.update_state(curr_mat, node, -1, j); + builder.update_state(graph, mat, node, -1, j); j--; i++; } } +} + +template +void BandedGlobalAligner::BAMatrix::traceback_over_edge(const HandleGraph& graph, BABuilder& builder, + AltTracebackStack& traceback_stack, + int64_t& i, int64_t& j, matrix_t& mat, + bool& in_lead_gap, int64_t& node_id, + const int8_t* score_mat, const int8_t* nt_table, + const int8_t gap_open, const int8_t gap_extend, + const bool qual_adjusted, IntType const min_inf) { + // begin POA across the boundary bool treat_as_source = false; unordered_set traceback_source_nodes; - if (num_seeds == 0) { + if (seeds.empty()) { treat_as_source = true; - traceback_source_nodes.insert(node->id()); + traceback_source_nodes.insert(graph.get_id(node)); } BAMatrix* traceback_seed = nullptr; @@ -1033,24 +1073,36 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build int64_t traceback_seed_col = std::numeric_limits::min(); matrix_t traceback_mat = Match; + const string& read = alignment.sequence(); + const string& base_quality = alignment.quality(); + + int64_t idx, next_idx; + IntType score_diff; + IntType alt_score; + IntType curr_score; + IntType source_score; + IntType curr_traceback_score = traceback_stack.current_traceback_score(); + int64_t curr_diag = top_diag + i; + string node_seq = graph.get_sequence(node); + int64_t ncols = graph.get_length(node); if (traceback_stack.at_next_deflection(node_id, i, j)) { #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] at boundary, taking a deflection" << endl; + cerr << "[BAMatrix::traceback_over_edge] at boundary, taking a 
deflection" << endl; #endif - builder.update_state(curr_mat, node, curr_diag, 0); + builder.update_state(graph, mat, node, curr_diag, 0); // where to deflect to? int64_t deflect_node_id; matrix_t deflect_matrix = traceback_stack.deflect_to_matrix(deflect_node_id); // find which seed matrix to deflect to (don't have a better way of looking this up right now) - list seed_path; - for (int64_t k = 0; k < num_seeds; k++) { + vector seed_path; + for (auto initial_seed : seeds) { - list seed_stack{seeds[k]}; + vector seed_stack{initial_seed}; while (!seed_stack.empty()) { BAMatrix* seed = seed_stack.back(); @@ -1060,18 +1112,18 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build // pop off the path if we hit the stack marker seed_path.pop_back(); } - else if (seed->node->id() == deflect_node_id) { + else if (graph.get_id(seed->node) == deflect_node_id) { // we found the traceback node seed_path.push_back(seed); break; } - else if (seed->node->sequence().length() == 0) { + else if (graph.get_length(seed->node) == 0) { // this is not the traceback node, but it is an empty node so the traceback // might be on the other side of it seed_path.push_back(seed); seed_stack.push_back(nullptr); - for (int64_t l = 0; l < seed->num_seeds; l++) { - seed_stack.push_back(seed->seeds[l]); + for (auto seed_of_seed : seed->seeds) { + seed_stack.push_back(seed_of_seed); } } } @@ -1092,25 +1144,26 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build auto end = seed_path.end(); end--; for (auto iter = seed_path.begin(); iter != end; iter++) { - builder.update_state(curr_mat, (*iter)->node, i, 0, true); + builder.update_state(graph, mat, (*iter)->node, i, 0, true); } } BAMatrix* seed = seed_path.back(); - int64_t seed_ncols = seed->node->sequence().length(); - traceback_seed_row = curr_diag - seed->top_diag - seed_ncols + (curr_mat == InsertCol); + int64_t seed_ncols = graph.get_length(seed->node); + traceback_seed_row = curr_diag - seed->top_diag - seed_ncols + (mat == InsertCol); traceback_seed_col = seed_ncols - 1; // check whether we're crossing into a lead gap - in_lead_gap = (curr_diag == 0 && curr_mat == Match) || in_lead_gap; + in_lead_gap = (curr_diag == 0 && mat == Match) || in_lead_gap; #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] taking node boundary deflection to " << (deflect_matrix == Match ? "match" : (deflect_matrix == InsertCol ? "insert column" : "insert row")) << " in node " << deflect_node_id << ", will start at coordinates (" << traceback_seed_row << ", " << traceback_seed_col << ")" << endl; + cerr << "[BAMatrix::traceback_over_edge] taking node boundary deflection to " << (deflect_matrix == Match ? "match" : (deflect_matrix == InsertCol ? 
"insert column" : "insert row")) << " in node " << deflect_node_id << ", will start at coordinates (" << traceback_seed_row << ", " << traceback_seed_col << ")" << endl; #endif - // continue traceback in the next node - seed->traceback_internal(builder, traceback_stack, traceback_seed_row, traceback_seed_col, deflect_matrix, - in_lead_gap, score_mat, nt_table, gap_open, gap_extend, qual_adjusted, min_inf); + i = traceback_seed_row; + j = traceback_seed_col; + mat = deflect_matrix; + node_id = deflect_node_id; return; } @@ -1120,26 +1173,26 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build vector empty_source_path; // a queue of seeds and their empty predecessors - list>> seed_queue; - for (int64_t k = 0; k < num_seeds; k++) { - seed_queue.emplace_back(seeds[k], vector()); + vector>> seed_queue; + for (auto seed : seeds) { + seed_queue.emplace_back(seed, vector()); } if (in_lead_gap) { #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] at boundary, following seed backward from a lead gap" << endl; + cerr << "[BAMatrix::traceback_over_edge] at boundary, following seed backward from a lead gap" << endl; #endif // we are in the implied lead gap along the top of the matrix // take the shortest path back to the origin of the global alignment // add final read deletion of node - builder.update_state(curr_mat, node, -1, j); + builder.update_state(graph, mat, node, -1, j); while (!seed_queue.empty()) { - auto seed_record = seed_queue.front(); + auto seed_record = seed_queue.back(); BAMatrix* seed = seed_record.first; - seed_queue.pop_front(); + seed_queue.pop_back(); // is seed masked? if (seed == nullptr) { @@ -1147,35 +1200,35 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build } #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] checking seed node " << seed->node->id() << endl; + cerr << "[BAMatrix::traceback_over_edge] checking seed node " << graph.get_id(seed->node) << endl; #endif // if this node is empty, add its predecessors to the queue - if (seed->node->sequence().length() == 0) { + if (graph.get_length(seed->node) == 0) { #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] seed node " << seed->node->id() << " is empty, checking predecessors" << endl; + cerr << "[BAMatrix::traceback_over_edge] seed node " << graph.get_id(seed->node) << " is empty, checking predecessors" << endl; #endif // record that this seed comes before its predecessors in the traceback seed_record.second.push_back(seed); - if (seed->num_seeds == 0) { + if (seed->seeds.empty()) { #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] empty seed node " << seed->node->id() << " is a source" << endl; + cerr << "[BAMatrix::traceback_over_edge] empty seed node " << graph.get_id(seed->node) << " is a source" << endl; #endif treat_as_source = true; - traceback_source_nodes.insert(seed->node->id()); + traceback_source_nodes.insert(graph.get_id(seed->node)); empty_source_path = seed_record.second; } - for (int64_t seed_num = 0; seed_num < seed->num_seeds; seed_num++) { - seed_queue.push_back(make_pair(seed->seeds[seed_num], seed_record.second)); + for (auto seed_of_seed : seed->seeds) { + seed_queue.push_back(make_pair(seed_of_seed, seed_record.second)); } continue; } - score_diff = gap_extend * (seed->cumulative_seq_len + seed->node->sequence().length() - cumulative_seq_len); + score_diff = gap_extend * (seed->cumulative_seq_len + graph.get_length(seed->node) - 
cumulative_seq_len); if (score_diff == 0 && !found_trace) { #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] found a lead gap traceback to node " << seed->node->id() << endl; + cerr << "[BAMatrix::traceback_over_edge] found a lead gap traceback to node " << graph.get_id(seed->node) << endl; #endif traceback_seed = seed; empty_intermediate_nodes = seed_record.second; @@ -1183,13 +1236,13 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build } else { alt_score = curr_traceback_score - score_diff; - traceback_stack.propose_deflection(alt_score, node_id, i, j, seed->node->id(), InsertCol); + traceback_stack.propose_deflection(alt_score, node_id, i, j, graph.get_id(seed->node), InsertCol); } } if (traceback_seed) { // where in the matrix is this? - int64_t seed_ncols = traceback_seed->node->sequence().length(); + int64_t seed_ncols = graph.get_length(traceback_seed->node); int64_t seed_extended_top_diag = traceback_seed->top_diag + seed_ncols; traceback_seed_row = top_diag - seed_extended_top_diag + i + 1; traceback_seed_col = seed_ncols - 1; @@ -1197,10 +1250,10 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build } else { - builder.update_state(curr_mat, node, curr_diag, 0); + builder.update_state(graph, mat, node, curr_diag, 0); IntType match_score; - switch (curr_mat) { + switch (mat) { case Match: { curr_score = match[i * ncols]; @@ -1235,7 +1288,7 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build } #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] at boundary, in node " << node->id() << " following seed backward from " << (curr_mat == Match ? "match" : "insert column") << " matrix with score " << (int) curr_score << endl; + cerr << "[BAMatrix::traceback_over_edge] at boundary, in node " << graph.get_id(node) << " following seed backward from " << (mat == Match ? "match" : "insert column") << " matrix with score " << (int) curr_score << endl; #endif // matches stay on same diagonal, column insertions move over one diagonal @@ -1243,26 +1296,26 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build // check traceback goes to each seed matrix while (!seed_queue.empty()) { - auto seed_record = seed_queue.front(); + auto seed_record = seed_queue.back(); BAMatrix* seed = seed_record.first; - seed_queue.pop_front(); + seed_queue.pop_back(); // is the matrix masked? 
if (seed == nullptr) { #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] seed is masked" << endl; + cerr << "[BAMatrix::traceback_over_edge] seed is masked" << endl; #endif continue; } - if (seed->node->sequence().length() == 0) { + if (graph.get_length(seed->node) == 0) { - for (int64_t seed_num = 0; seed_num < seed->num_seeds; seed_num++) { - seed_queue.push_back(make_pair(seed->seeds[seed_num], seed_record.second)); + for (auto seed_of_seed : seed->seeds) { + seed_queue.push_back(make_pair(seed_of_seed, seed_record.second)); seed_queue.back().second.push_back(seed); } - if (seed->num_seeds == 0) { + if (seed->seeds.empty()) { treat_as_source = true; // keep track of the path through empty nodes to a source empty_source_path = seed_record.second; @@ -1273,43 +1326,43 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] checking seed node " << seed->node->id() << endl; + cerr << "[BAMatrix::traceback_over_edge] checking seed node " << graph.get_id(seed->node) << endl; #endif - int64_t seed_node_id = seed->node->id(); - int64_t seed_ncols = seed->node->sequence().length(); + int64_t seed_node_id = graph.get_id(seed->node); + int64_t seed_ncols = graph.get_length(seed->node); // the diagonals in the current matrix that this seed extends to int64_t seed_extended_top_diag = seed->top_diag + seed_ncols; int64_t seed_extended_bottom_diag = seed->bottom_diag + seed_ncols; // does the traceback diagonal extend backward to this matrix? - if (curr_diag > seed_extended_bottom_diag - (curr_mat == InsertCol) // col inserts hit 1 less of band + if (curr_diag > seed_extended_bottom_diag - (mat == InsertCol) // col inserts hit 1 less of band || curr_diag < seed_extended_top_diag) { #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] seed extended diags are top: " << seed_extended_top_diag << ", bottom: " << seed_extended_bottom_diag << " and curr mat is " << (curr_mat == InsertCol ? "" : "not") << " insert column, so we cannot extend from this seed to the current diag " << curr_diag << endl; + cerr << "[BAMatrix::traceback_over_edge] seed extended diags are top: " << seed_extended_top_diag << ", bottom: " << seed_extended_bottom_diag << " and curr mat is " << (mat == InsertCol ? 
"" : "not") << " insert column, so we cannot extend from this seed to the current diag " << curr_diag << endl; #endif continue; } int64_t seed_col = seed_ncols - 1; - int64_t seed_row = -(seed_extended_top_diag - top_diag) + i + (curr_mat == InsertCol); + int64_t seed_row = -(seed_extended_top_diag - top_diag) + i + (mat == InsertCol); next_idx = seed_row * seed_ncols + seed_col; #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] checking seed rectangular coordinates (" << seed_row << ", " << seed_col << "), with indices calculated from current diagonal " << curr_diag << " (top diag " << top_diag << " + offset " << i << "), seed top diagonal " << seed->top_diag << ", seed seq length " << seed_ncols << " with insert column offset " << (curr_mat == InsertCol) << endl; + cerr << "[BAMatrix::traceback_over_edge] checking seed rectangular coordinates (" << seed_row << ", " << seed_col << "), with indices calculated from current diagonal " << curr_diag << " (top diag " << top_diag << " + offset " << i << "), seed top diagonal " << seed->top_diag << ", seed seq length " << seed_ncols << " with insert column offset " << (mat == InsertCol) << endl; #endif - switch (curr_mat) { + switch (mat) { case Match: { #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] poa backwards from match, seed extended top diag " << seed_extended_top_diag << endl; + cerr << "[BAMatrix::traceback_over_edge] poa backwards from match, seed extended top diag " << seed_extended_top_diag << endl; #endif // does match lead into a lead row gap? if (seed->top_diag + seed_row + seed_col == -1) { #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] traceback points to a lead column gap of length " << seed->cumulative_seq_len + seed_ncols << " with score " << (int) -gap_open - (seed->cumulative_seq_len + seed_ncols - 1) * gap_extend << " extending to score here of " << (int) curr_score << " with match score " << (int) match_score << endl; + cerr << "[BAMatrix::traceback_over_edge] traceback points to a lead column gap of length " << seed->cumulative_seq_len + seed_ncols << " with score " << (int) -gap_open - (seed->cumulative_seq_len + seed_ncols - 1) * gap_extend << " extending to score here of " << (int) curr_score << " with match score " << (int) match_score << endl; #endif // score of implied column gap source_score = -gap_open - (seed->cumulative_seq_len + seed_ncols - 1) * gap_extend; @@ -1323,7 +1376,7 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build found_trace = true; empty_intermediate_nodes = seed_record.second; #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] hit found in lead gap with score " << -gap_open - (seed->cumulative_seq_len + seed_ncols - 1) * gap_extend << endl; + cerr << "[BAMatrix::traceback_over_edge] hit found in lead gap with score " << -gap_open - (seed->cumulative_seq_len + seed_ncols - 1) * gap_extend << endl; #endif } else { @@ -1346,7 +1399,7 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build found_trace = true; empty_intermediate_nodes = seed_record.second; #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] hit found in match matrix with score " << (int) seed->match[next_idx] << endl; + cerr << "[BAMatrix::traceback_over_edge] hit found in match matrix with score " << (int) seed->match[next_idx] << endl; #endif } else if (source_score != min_inf) { @@ -1360,7 +1413,7 @@ void 
BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build score_diff = curr_score - (source_score + match_score); if (score_diff == 0 && !found_trace) { #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] hit found in insert column matrix with score " << (int) seed->insert_col[next_idx] << endl; + cerr << "[BAMatrix::traceback_over_edge] hit found in insert column matrix with score " << (int) seed->insert_col[next_idx] << endl; #endif traceback_mat = InsertCol; traceback_seed = seed; @@ -1381,7 +1434,7 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build score_diff = curr_score - (source_score + match_score); if (score_diff == 0 && !found_trace) { #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] hit found in insert row matrix with score " << (int) seed->insert_row[next_idx] << endl; + cerr << "[BAMatrix::traceback_over_edge] hit found in insert row matrix with score " << (int) seed->insert_row[next_idx] << endl; #endif traceback_mat = InsertRow; traceback_seed = seed; @@ -1406,7 +1459,7 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build score_diff = curr_score - (source_score - gap_open); if (score_diff == 0 && !found_trace) { #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] hit found in match matrix with score " << (int) seed->match[next_idx] << endl; + cerr << "[BAMatrix::traceback_over_edge] hit found in match matrix with score " << (int) seed->match[next_idx] << endl; #endif traceback_mat = Match; traceback_seed = seed; @@ -1418,7 +1471,7 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build else if (source_score != min_inf) { alt_score = curr_traceback_score - score_diff; #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] no hit in match matrix, proposing deflection with alt score " << (int) alt_score << " from current traceback score " << (int) curr_traceback_score << " and score diff " << (int) score_diff << endl; + cerr << "[BAMatrix::traceback_over_edge] no hit in match matrix, proposing deflection with alt score " << (int) alt_score << " from current traceback score " << (int) curr_traceback_score << " and score diff " << (int) score_diff << endl; #endif traceback_stack.propose_deflection(alt_score, node_id, i, j, seed_node_id, Match); } @@ -1429,7 +1482,7 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build score_diff = curr_score - (source_score - gap_extend); if (score_diff == 0 && !found_trace) { #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] hit found in insert column matrix with score " << (int) seed->match[next_idx] << endl; + cerr << "[BAMatrix::traceback_over_edge] hit found in insert column matrix with score " << (int) seed->match[next_idx] << endl; #endif traceback_mat = InsertCol; traceback_seed = seed; @@ -1441,7 +1494,7 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build else { alt_score = curr_traceback_score - score_diff; #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] no hit in insert row matrix, proposing deflection with alt score " << (int) alt_score << " from current traceback score " << (int) curr_traceback_score << " and score diff " << (int) score_diff << endl; + cerr << "[BAMatrix::traceback_over_edge] no hit in insert row matrix, proposing deflection with alt score " << (int) alt_score << " from current traceback score " << (int) curr_traceback_score << " 
and score diff " << (int) score_diff << endl; #endif traceback_stack.propose_deflection(alt_score, node_id, i, j, seed_node_id, InsertCol); } @@ -1453,7 +1506,7 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build score_diff = curr_score - (source_score - gap_open); if (score_diff == 0 && !found_trace) { #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] hit found in insert row matrix with score " << (int) seed->match[next_idx] << endl; + cerr << "[BAMatrix::traceback_over_edge] hit found in insert row matrix with score " << (int) seed->match[next_idx] << endl; #endif traceback_mat = InsertRow; traceback_seed = seed; @@ -1465,7 +1518,7 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build else { alt_score = curr_traceback_score - score_diff; #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] no hit in insert column matrix, proposing deflection with alt score " << (int) alt_score << " from current traceback score " << (int) curr_traceback_score << " and score diff " << (int) score_diff << endl; + cerr << "[BAMatrix::traceback_over_edge] no hit in insert column matrix, proposing deflection with alt score " << (int) alt_score << " from current traceback score " << (int) curr_traceback_score << " and score diff " << (int) score_diff << endl; #endif traceback_stack.propose_deflection(alt_score, node_id, i, j, seed_node_id, InsertRow); } @@ -1497,7 +1550,7 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build // this is a source node, or it is connected to one by a zero-length path #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] at beginning of first node in alignment" << endl; + cerr << "[BAMatrix::traceback_over_edge] at beginning of first node in alignment" << endl; #endif if (in_lead_gap && !found_trace) { // this will always be the shortest gap @@ -1506,7 +1559,7 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build i++; } else { - switch (curr_mat) { + switch (mat) { case Match: { IntType match_score; @@ -1521,9 +1574,9 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build score_diff = curr_score - (source_score + match_score); if (score_diff == 0 && !found_trace) { #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] alignment starts with match, adding read char " << top_diag + i << ": " << read[top_diag + i] << endl; + cerr << "[BAMatrix::traceback_over_edge] alignment starts with match, adding read char " << top_diag + i << ": " << read[top_diag + i] << endl; #endif - curr_mat = InsertRow; + mat = InsertRow; found_source_trace = true; } else { @@ -1542,9 +1595,9 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build score_diff = curr_score - (source_score - gap_open); if (score_diff == 0 && !found_trace) { #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] alignment starts with column gap" << endl; + cerr << "[BAMatrix::traceback_over_edge] alignment starts with column gap" << endl; #endif - curr_mat = InsertRow; + mat = InsertRow; found_source_trace = true; } else { @@ -1572,27 +1625,30 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build if (found_source_trace) { // if we traversed any empty nodes before finding the traceback, add empty updates for them for (BAMatrix* seed_path_node : empty_source_path) { - builder.update_state(InsertCol, seed_path_node->node, i + top_diag, 0, true); + 
builder.update_state(graph, InsertCol, seed_path_node->node, i + top_diag, 0, true); } // add any lead row gaps necessary while (top_diag + i > 0) { #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] initial row gaps are present, adding read char " << top_diag + i - 1 << ": " << read[top_diag + i - 1] << endl; + cerr << "[BAMatrix::traceback_over_edge] initial row gaps are present, adding read char " << top_diag + i - 1 << ": " << read[top_diag + i - 1] << endl; #endif i--; - Node* end_node = empty_source_path.empty() ? node : empty_source_path.back()->node; - builder.update_state(InsertRow, end_node, i + top_diag, -1, end_node->sequence().empty()); + const handle_t& end_node = empty_source_path.empty() ? node : empty_source_path.back()->node; + builder.update_state(graph, InsertRow, end_node, i + top_diag, -1, graph.get_length(end_node) == 0); } + + // set the node ID to 0 to indicate completion + node_id = 0; return; } else { #ifdef debug_banded_aligner_traceback - cerr << "[BAMatrix::traceback_internal] traversed " << empty_intermediate_nodes.size() << " empty nodes before finding trace" << endl; + cerr << "[BAMatrix::traceback_over_edge] traversed " << empty_intermediate_nodes.size() << " empty nodes before finding trace" << endl; #endif // if we traversed any empty nodes before finding the traceback, add empty updates for them for (BAMatrix* intermediate_node : empty_intermediate_nodes) { - builder.update_state(curr_mat, intermediate_node->node, i + top_diag, 0, true); + builder.update_state(graph, mat, intermediate_node->node, i + top_diag, 0, true); } } @@ -1602,45 +1658,47 @@ void BandedGlobalAligner::BAMatrix::traceback_internal(BABuilder& build assert(0); } - // continue traceback in the next node - traceback_seed->traceback_internal(builder, traceback_stack, traceback_seed_row, traceback_seed_col, traceback_mat, - in_lead_gap, score_mat, nt_table, gap_open, gap_extend, qual_adjusted, min_inf); + // set the traceback values for the next matrix + i = traceback_seed_row; + j = traceback_seed_col; + mat = traceback_mat; + node_id = graph.get_id(traceback_seed->node); } template -void BandedGlobalAligner::BAMatrix::print_full_matrices() { +void BandedGlobalAligner::BAMatrix::print_full_matrices(const HandleGraph& graph) { if (match == nullptr) { cerr << "error:[BandedGlobalAligner] cannot print matrix before performing dynamic programming" << endl; assert(0); } - cerr << "matrices for node " << node->id() << ":" << endl; + cerr << "matrices for node " << graph.get_id(node) << ":" << endl; for (matrix_t mat : {Match, InsertRow, InsertCol}) { - print_matrix(mat); + print_matrix(graph, mat); } } template -void BandedGlobalAligner::BAMatrix::print_rectangularized_bands() { +void BandedGlobalAligner::BAMatrix::print_rectangularized_bands(const HandleGraph& graph) { if (match == nullptr) { cerr << "error:[BandedGlobalAligner] cannot print band before performing dynamic programming" << endl; assert(0); } - cerr << "rectangularized bands for node " << node->id() << ":" << endl; + cerr << "rectangularized bands for node " << graph.get_id(node) << ":" << endl; for (matrix_t mat : {Match, InsertRow, InsertCol}) { - print_band(mat); + print_band(graph, mat); } } template -void BandedGlobalAligner::BAMatrix::print_matrix(matrix_t which_mat) { +void BandedGlobalAligner::BAMatrix::print_matrix(const HandleGraph& graph, matrix_t which_mat) { const string& read = alignment.sequence(); - const string& node_seq = node->sequence(); + string node_seq = 
graph.get_sequence(node); IntType* band_rect; switch (which_mat) { @@ -1688,10 +1746,10 @@ void BandedGlobalAligner::BAMatrix::print_matrix(matrix_t which_mat) { } template -void BandedGlobalAligner::BAMatrix::print_band(matrix_t which_mat) { +void BandedGlobalAligner::BAMatrix::print_band(const HandleGraph& graph, matrix_t which_mat) { const string& read = alignment.sequence(); - const string& node_seq = node->sequence(); + string node_seq = graph.get_sequence(node); IntType* band_rect; switch (which_mat) { @@ -1749,7 +1807,7 @@ void BandedGlobalAligner::BAMatrix::print_band(matrix_t which_mat) { } template -BandedGlobalAligner::BandedGlobalAligner(Alignment& alignment, Graph& g, +BandedGlobalAligner::BandedGlobalAligner(Alignment& alignment, const HandleGraph& g, int64_t band_padding, bool permissive_banding, bool adjust_for_base_quality) : BandedGlobalAligner(alignment, g, @@ -1762,7 +1820,7 @@ BandedGlobalAligner::BandedGlobalAligner(Alignment& alignment, Graph& g } template -BandedGlobalAligner::BandedGlobalAligner(Alignment& alignment, Graph& g, +BandedGlobalAligner::BandedGlobalAligner(Alignment& alignment, const HandleGraph& g, vector& alt_alignments, int64_t max_multi_alns, int64_t band_padding, bool permissive_banding, @@ -1782,16 +1840,21 @@ BandedGlobalAligner::BandedGlobalAligner(Alignment& alignment, Graph& g } template -BandedGlobalAligner::BandedGlobalAligner(Alignment& alignment, Graph& g, +BandedGlobalAligner::BandedGlobalAligner(Alignment& alignment, const HandleGraph& g, vector* alt_alignments, int64_t max_multi_alns, int64_t band_padding, bool permissive_banding, bool adjust_for_base_quality) : + graph(g), alignment(alignment), alt_alignments(alt_alignments), max_multi_alns(max_multi_alns), - adjust_for_base_quality(adjust_for_base_quality) + adjust_for_base_quality(adjust_for_base_quality), + // compute some graph features we will be frequently reusing + topological_order(handlealgs::lazier_topological_order(&g)), + source_nodes(handlealgs::head_nodes(&g)), + sink_nodes(handlealgs::tail_nodes(&g)) { #ifdef debug_banded_aligner_objects cerr << "[BandedGlobalAligner]: constructing BandedBlobalAligner with " << band_padding << " padding, " << permissive_banding << " permissive, " << adjust_for_base_quality << " quality adjusted" << endl; @@ -1803,6 +1866,10 @@ BandedGlobalAligner::BandedGlobalAligner(Alignment& alignment, Graph& g } } + if (topological_order.size() < graph.get_node_count()) { + cerr << "error:[BandedGlobalAligner] alignment graph must be a DAG" << endl; + } + // TODO: this can waste memory, but reallocating the vector seems to throw an error in protobuf and // we won't know if there are fewer alignments than the max until the cycle is over if (alt_alignments) { @@ -1810,50 +1877,12 @@ BandedGlobalAligner::BandedGlobalAligner(Alignment& alignment, Graph& g } // map node ids to indices - for (int64_t i = 0; i < g.node_size(); i++) { - node_id_to_idx[g.node(i).id()] = i; - } - -#ifdef debug_banded_aligner_objects - cerr << "[BandedGlobalAligner]: constructing edge lists by node" << endl; -#endif - - // convert the graph into adjacency list representation - vector> node_edges_in; - vector> node_edges_out; - graph_edge_lists(g, true, node_edges_out); - graph_edge_lists(g, false, node_edges_in); - -#ifdef debug_banded_aligner_objects - cerr << "[BandedGlobalAligner]: performing topological sort" << endl; -#endif - - // compute topological ordering - topological_sort(g, node_edges_out, topological_order); - -#ifdef debug_banded_aligner_objects - cerr << 
"[BandedGlobalAligner]: identifying source and sink nodes" << endl; -#endif - - // identify source and sink nodes in the graph - for (int64_t i = 0; i < g.node_size(); i++) { - if (node_edges_in[i].empty()) { - source_nodes.insert(g.mutable_node(i)); - } - if (node_edges_out[i].empty()) { - sink_nodes.insert(g.mutable_node(i)); - } - } - - if (source_nodes.empty() || sink_nodes.empty()) { - cerr << "error:[BandedGlobalAligner] alignment graph must be a DAG" << endl; + for (int64_t i = 0; i < topological_order.size(); i++) { + node_id_to_idx[graph.get_id(topological_order[i])] = i; } #ifdef debug_banded_aligner_objects cerr << "[BandedGlobalAligner]: " << source_nodes.size() << " sources and " << sink_nodes.size() << " sinks" << endl; -#endif - -#ifdef debug_banded_aligner_objects cerr << "[BandedGlobalAligner]: computing node bands" << endl; #endif @@ -1861,7 +1890,7 @@ BandedGlobalAligner::BandedGlobalAligner(Alignment& alignment, Graph& g // global alignment within the band vector node_masked; vector> band_ends; - find_banded_paths(alignment.sequence(), permissive_banding, node_edges_in, node_edges_out, band_padding, node_masked, band_ends); + find_banded_paths(permissive_banding, band_padding, node_masked, band_ends); #ifdef debug_banded_aligner_objects cerr << "[BandedGlobalAligner]: identifying shortest paths" << endl; @@ -1870,69 +1899,52 @@ BandedGlobalAligner::BandedGlobalAligner(Alignment& alignment, Graph& g // find the shortest sequence leading to each node so we can infer the length // of lead deletions vector shortest_seqs; - shortest_seq_paths(node_edges_out, source_nodes, shortest_seqs); + shortest_seq_paths(shortest_seqs); #ifdef debug_banded_aligner_objects cerr << "[BandedGlobalAligner]: constructing banded matrix objects" << endl; #endif // initialize DP matrices for each node - banded_matrices.resize(g.node_size()); - for (int64_t i = 0; i < g.node_size(); i++) { + banded_matrices.resize(graph.get_node_count(), nullptr); + for (int64_t i = 0; i < topological_order.size(); i++) { #ifdef debug_banded_aligner_objects - cerr << "[BandedGlobalAligner]: creating matrix object for node " << topological_order[i]->id() << " at index " << i << endl; + cerr << "[BandedGlobalAligner]: creating matrix object for node " << graph.get_id(topological_order[i]) << " at index " << i << endl; #endif - Node* node = topological_order[i]; - int64_t node_idx = node_id_to_idx[node->id()]; - if (node_masked[node_idx]) { -#ifdef debug_banded_aligner_objects - cerr << "[BandedGlobalAligner]: node is masked, creating dummy matrix object" << endl; -#endif + if (!node_masked[i]) { - banded_matrices[node_idx] = nullptr; - } - else { - int64_t node_seq_len = node->sequence().length(); + const handle_t& node = topological_order[i]; + + int64_t node_seq_len = graph.get_length(node); #ifdef debug_banded_aligner_objects - cerr << "[BandedGlobalAligner]: establishing seed list for node " << node->id() << " at index " << i << endl; + cerr << "[BandedGlobalAligner]: establishing seed list for node " << graph.get_id(node) << " at index " << i << endl; #endif // POA predecessor matrices - BAMatrix** seeds; - vector& edges_in = node_edges_in[node_idx]; - if (edges_in.empty()) { - -#ifdef debug_banded_aligner_objects - cerr << "[BandedGlobalAligner]: no seeds, setting array to null" << endl; -#endif - - seeds = nullptr; - } - else { - seeds = (BAMatrix**) malloc(sizeof(BAMatrix**) * edges_in.size()); - for (int64_t j = 0; j < edges_in.size(); j++) { - seeds[j] = banded_matrices[edges_in[j]]; - } - } + vector 
seeds; + graph.follow_edges(node, true, [&](const handle_t& prev) { + seeds.push_back(banded_matrices[node_id_to_idx[graph.get_id(prev)]]); + }); - banded_matrices[node_idx] = new BAMatrix(alignment, - node, - band_ends[node_idx].first, - band_ends[node_idx].second, - seeds, - edges_in.size(), - shortest_seqs[node_idx]); + banded_matrices[i] = new BAMatrix(alignment, + node, + band_ends[i].first, + band_ends[i].second, + std::move(seeds), + shortest_seqs[i]); } } + // check to see if we chose a banding that is too restrictive for making an + // alignment if (!permissive_banding) { bool sinks_masked = true; - for (Node* node : sink_nodes) { - if (banded_matrices[node_id_to_idx[node->id()]] != nullptr) { + for (const handle_t& node : sink_nodes) { + if (banded_matrices[node_id_to_idx[graph.get_id(node)]] != nullptr) { sinks_masked = false; break; } @@ -1956,220 +1968,131 @@ BandedGlobalAligner::~BandedGlobalAligner() { } } -// fills a vector with vectors ids that have edges to/from each node -template -void BandedGlobalAligner::graph_edge_lists(Graph& g, bool outgoing_edges, vector>& out_edge_list) { - out_edge_list = vector>(g.node_size()); - for (int64_t i = 0; i < g.edge_size(); i++) { - // Find the connected nodes - const Edge& edge = g.edge(i); - id_t from = edge.from(); - id_t to = edge.to(); - // We know the edge can't be reversing (since we align to DAGs), but it might be doubly reversing. - if (edge.from_start() && edge.to_end()) { - std::swap(from, to); - } - - if (outgoing_edges) { - // We want to store destinations by sources - out_edge_list[node_id_to_idx.at(from)].push_back(node_id_to_idx.at(to)); - } else { - // We want to store sources by destinations - out_edge_list[node_id_to_idx.at(to)].push_back(node_id_to_idx.at(from)); - } - } - -} - -// standard DFS-based topological sort algorithm -// NOTE: this is only valid if the Graph g has been dag-ified first and there are no from_start -or to_end edges. 
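For orientation before the deleted helpers that follow: the HandleGraph-based replacements (shortest_seq_paths, path_lengths_to_sinks, find_banded_paths) all share the same shape, namely seed a per-node value at the sources (or sinks) and then sweep the precomputed topological order, pushing updates across edges with follow_edges. Below is a minimal sketch of that forward sweep, using only HandleGraph calls that already appear in this patch; it assumes the members the new constructor sets up and is not literal vg source.

```cpp
// Sketch only. Assumes the members initialized by the constructor above:
//   graph             : const HandleGraph&
//   topological_order : vector<handle_t> from handlealgs::lazier_topological_order()
//   node_id_to_idx    : map from node ID to index in topological_order
//   source_nodes      : vector<handle_t> from handlealgs::head_nodes()
vector<int64_t> shortest_from_source(topological_order.size(), numeric_limits<int64_t>::max());
for (const handle_t& source : source_nodes) {
    shortest_from_source[node_id_to_idx.at(graph.get_id(source))] = 0;
}
for (size_t i = 0; i < topological_order.size(); i++) {
    // every node of the DAG is reachable from some head node, so by the time the
    // sweep reaches index i this value is already finite
    int64_t through = shortest_from_source[i] + graph.get_length(topological_order[i]);
    graph.follow_edges(topological_order[i], false, [&](const handle_t& next) {
        int64_t j = node_id_to_idx.at(graph.get_id(next));
        shortest_from_source[j] = min(shortest_from_source[j], through);
    });
}
```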
-template -void BandedGlobalAligner::topological_sort(Graph& g, vector>& node_edges_out, - vector& out_topological_order) { - if (g.node_size() == 0) { - cerr << "warning:[BandedGlobalAligner] attempted to perform topological sort on empty graph" << endl; - return; - } - - // initialize return value - out_topological_order = vector(g.node_size()); - size_t order_index = g.node_size() - 1; - - // initialize iteration structures - vector enqueued = vector(g.node_size()); - vector edge_index = vector(g.node_size()); - vector stack; - - // iterate through starting nodes - for (int64_t init_node_id = 0; init_node_id < g.node_size(); init_node_id++) { - if (enqueued[init_node_id]) { - continue; - } - // navigate through graph with DFS - stack.push_back(init_node_id); - enqueued[init_node_id] = true; - while (!stack.empty()) { - int64_t node_id = stack[stack.size() - 1]; - if (edge_index[node_id] < node_edges_out[node_id].size()) { - int64_t target_id = node_edges_out[node_id][edge_index[node_id]]; - if (enqueued[target_id]) { - edge_index[node_id]++; - } - else { - stack.push_back(target_id); - enqueued[target_id] = true; - } - } - else { - // add to topological order in reverse finishing order - stack.pop_back(); - out_topological_order[order_index] = g.mutable_node(node_id); - order_index--; - } - } - } -} - template -void BandedGlobalAligner::path_lengths_to_sinks(const string& read, vector>& node_edges_in, - vector& shortest_path_to_sink, +void BandedGlobalAligner::path_lengths_to_sinks(vector& shortest_path_to_sink, vector& longest_path_to_sink) { #ifdef debug_banded_aligner_graph_processing cerr << "[BandedGlobalAligner::path_lengths_to_sinks]: finding longest and shortest paths to sink node" << endl; #endif // find the longest path from the right side of each matrix to the end of the graph - longest_path_to_sink = vector(topological_order.size()); - shortest_path_to_sink = vector(topological_order.size()); - // set initial values -- 0 default value is sufficient for longest path - for (int64_t& initial_path_length : shortest_path_to_sink) { - initial_path_length = numeric_limits::max(); - } + // set initial values + longest_path_to_sink.resize(topological_order.size(), 0); + shortest_path_to_sink.resize(topological_order.size(), numeric_limits::max()); + // set base case (longest path already set to 0) - for (Node* node : sink_nodes) { - shortest_path_to_sink[node_id_to_idx.at(node->id())] = 0; + for (const handle_t& handle : sink_nodes) { + shortest_path_to_sink[node_id_to_idx.at(graph.get_id(handle))] = 0; } // iterate in reverse order - for (auto iter = topological_order.rbegin(); iter != topological_order.rend(); iter++) { - Node* node = *iter; - int64_t node_seq_len = node->sequence().length(); - int64_t node_idx = node_id_to_idx.at(node->id()); - // compute longest path through this node to right side of incoming matrices - int64_t longest_path_length = longest_path_to_sink[node_idx] + node_seq_len; - int64_t shortest_path_length = shortest_path_to_sink[node_idx] + node_seq_len; + for (int64_t i = topological_order.size() - 1; i >= 0; i--) { -#ifdef debug_banded_aligner_graph_processing - cerr << "[BandedGlobalAligner::path_lengths_to_sinks]: processing node " << node->id() << " at index " << node_idx << " with longest/shortest distance to sink " << longest_path_to_sink[node_idx] << "/" << shortest_path_to_sink[node_idx] << " and sequence " << node->sequence() << " for total path length of " << longest_path_length << "/" << shortest_path_length << endl; -#endif + int64_t 
node_seq_len = graph.get_length(topological_order[i]); + // compute longest path through this node to right side of incoming matrices + int64_t longest_path_length = longest_path_to_sink[i] + node_seq_len; + int64_t shortest_path_length = shortest_path_to_sink[i] + node_seq_len; - for (int64_t node_in_idx : node_edges_in[node_idx] ) { - if (longest_path_to_sink[node_in_idx] < longest_path_length) { + graph.follow_edges(topological_order[i], true, [&](const handle_t& prev) { + + int64_t prev_idx = node_id_to_idx.at(graph.get_id(prev)); + + if (longest_path_to_sink[prev_idx] < longest_path_length) { #ifdef debug_banded_aligner_graph_processing - cerr << "[BandedGlobalAligner::path_lengths_to_sinks]: path through " << node->id() << " of length " << longest_path_length << " to node at index " << node_in_idx << " is longer than current longest path " << longest_path_to_sink[node_in_idx] << ", updating it now" << endl; + cerr << "[BandedGlobalAligner::path_lengths_to_sinks]: path through " << graph.get_id(prev) << " of length " << longest_path_length << " to node at index " << prev_idx << " is longer than current longest path " << longest_path_to_sink[prev_idx] << ", updating it now" << endl; #endif - longest_path_to_sink[node_in_idx] = longest_path_length; + longest_path_to_sink[prev_idx] = longest_path_length; } - - if (shortest_path_to_sink[node_in_idx] > shortest_path_length) { + if (shortest_path_to_sink[prev_idx] > shortest_path_length) { #ifdef debug_banded_aligner_graph_processing - cerr << "[BandedGlobalAligner::path_lengths_to_sinks]: path through " << node->id() << " of length " << shortest_path_length << " to node at index " << node_in_idx << " is shorter than current shortest path " << shortest_path_to_sink[node_in_idx] << ", updating it now" << endl; + cerr << "[BandedGlobalAligner::path_lengths_to_sinks]: path through " << graph.get_id(prev) << " of length " << shortest_path_length << " to node at index " << prev_idx << " is shorter than current shortest path " << shortest_path_to_sink[prev_idx] << ", updating it now" << endl; #endif - shortest_path_to_sink[node_in_idx] = shortest_path_length; + shortest_path_to_sink[prev_idx] = shortest_path_length; } - } + }); } } // fills vectors with whether nodes are masked by the band width, and the band ends of each node template -void BandedGlobalAligner::find_banded_paths(const string& read, bool permissive_banding, - vector>& node_edges_in, - vector>& node_edges_out, - int64_t band_padding, vector& node_masked, +void BandedGlobalAligner::find_banded_paths(bool permissive_banding, int64_t band_padding, + vector& node_masked, vector>& band_ends) { - // find the longest and shortest path from each node to any sink - vector shortest_path_to_sink; - vector longest_path_to_sink; - path_lengths_to_sinks(read, node_edges_in, shortest_path_to_sink, longest_path_to_sink); - // keeps track of which nodes cannot reach the bottom corner within the band - node_masked = vector(topological_order.size()); + node_masked.resize(topological_order.size(), false); // the bottom and top indices of the band in the rightmost column of each node's matrix - band_ends = vector>(topological_order.size()); - - // set band ends to identities of max / min functions - for (int64_t i = 0; i < topological_order.size(); i++) { - band_ends[i].first = numeric_limits::max(); - band_ends[i].second = numeric_limits::min(); - } + band_ends.resize(topological_order.size(), make_pair(numeric_limits::max(), + numeric_limits::min())); + // find the longest and shortest path from 
each node to any sink + vector shortest_path_to_sink; + vector longest_path_to_sink; + path_lengths_to_sinks(shortest_path_to_sink, longest_path_to_sink); if (permissive_banding) { // initialize with wide enough bands that every source can hit every connected sink - for (Node* init_node : source_nodes) { - int64_t init_node_idx = node_id_to_idx.at(init_node->id()); - int64_t init_node_seq_len = init_node->sequence().length(); + for (const handle_t& init_node : source_nodes) { + int64_t init_node_idx = node_id_to_idx.at(graph.get_id(init_node)); + int64_t init_node_seq_len = graph.get_length(init_node); band_ends[init_node_idx].first = min(-band_padding, - (int64_t) read.length() - (init_node_seq_len + longest_path_to_sink[init_node_idx]) - band_padding); + int64_t(alignment.sequence().size()) + - (init_node_seq_len + longest_path_to_sink[init_node_idx]) - band_padding); band_ends[init_node_idx].second = max(band_padding, - (int64_t) read.length() - (init_node_seq_len + shortest_path_to_sink[init_node_idx]) + band_padding); + int64_t(alignment.sequence().size()) + - (init_node_seq_len + shortest_path_to_sink[init_node_idx]) + band_padding); #ifdef debug_banded_aligner_graph_processing - cerr << "[BandedGlobalAligner::find_banded_paths]: initializing band path end at node " << init_node->id() << " at index " << init_node_idx << " to top " << band_ends[init_node_idx].first << ", and bottom " << band_ends[init_node_idx].second << " from shortest and longest paths of length " << shortest_path_to_sink[init_node_idx] << " and " << longest_path_to_sink[init_node_idx] << " compared to read length " << read.length() << " with padding " << band_padding << endl; + cerr << "[BandedGlobalAligner::find_banded_paths]: initializing band path end at node " << graph.get_id(init_node) << " at index " << init_node_idx << " to top " << band_ends[init_node_idx].first << ", and bottom " << band_ends[init_node_idx].second << " from shortest and longest paths of length " << shortest_path_to_sink[init_node_idx] << " and " << longest_path_to_sink[init_node_idx] << " compared to read length " << alignment.sequence().size() << " with padding " << band_padding << endl; #endif } } else { // initialize with band ends beginning with source nodes - for (Node* init_node : source_nodes) { - int64_t init_node_idx = node_id_to_idx.at(init_node->id()); - int64_t init_node_seq_len = init_node->sequence().length(); + for (const handle_t& handle : source_nodes) { + int64_t init_node_idx = node_id_to_idx.at(graph.get_id(handle)); + int64_t init_node_seq_len = graph.get_length(handle); band_ends[init_node_idx].first = -band_padding; band_ends[init_node_idx].second = band_padding; #ifdef debug_banded_aligner_graph_processing - cerr << "[BandedGlobalAligner::find_banded_paths]: initializing band path end at node " << init_node->id() << " at index " << init_node_idx << " to top " << band_ends[init_node_idx].first << ", and bottom " << band_ends[init_node_idx].second << endl; + cerr << "[BandedGlobalAligner::find_banded_paths]: initializing band path end at node " << graph.get_id(handle) << " at index " << init_node_idx << " to top " << band_ends[init_node_idx].first << ", and bottom " << band_ends[init_node_idx].second << endl; #endif } } // iterate through the rest of the nodes in topological order for (int64_t i = 0; i < topological_order.size(); i++) { - Node* node = topological_order[i]; - int64_t node_idx = node_id_to_idx.at(node->id()); - int64_t node_seq_len = node->sequence().length(); - vector& edges_out = 
node_edges_out[node_idx]; + const handle_t& node = topological_order[i]; + int64_t node_seq_len = graph.get_length(node); - int64_t extended_band_top = band_ends[node_idx].first + node_seq_len; - int64_t extended_band_bottom = band_ends[node_idx].second + node_seq_len; + int64_t extended_band_top = band_ends[i].first + node_seq_len; + int64_t extended_band_bottom = band_ends[i].second + node_seq_len; #ifdef debug_banded_aligner_graph_processing - cerr << "[BandedGlobalAligner::find_banded_paths]: following edges out of node " << node->id() << " at index " << node_idx << " with sequence " << node->sequence() << ", band of " << band_ends[node_idx].first << ", " << band_ends[node_idx].second << " extending to " << extended_band_top << ", " << extended_band_bottom << endl; + cerr << "[BandedGlobalAligner::find_banded_paths]: following edges out of node " << graph.get_id(node) << " at index " << i << " with sequence " << graph.get_sequence(node) << ", band of " << band_ends[i].first << ", " << band_ends[i].second << " extending to " << extended_band_top << ", " << extended_band_bottom << endl; #endif // can alignments from this node reach the bottom right corner within the band? - if (extended_band_top + shortest_path_to_sink[node_idx] > (int64_t) read.length() - || extended_band_bottom + longest_path_to_sink[node_idx] < (int64_t) read.length()) { + if (extended_band_top + shortest_path_to_sink[i] > int64_t(alignment.sequence().size()) + || extended_band_bottom + longest_path_to_sink[i] < int64_t(alignment.sequence().size())) { - node_masked[node_idx] = true; + node_masked[i] = true; #ifdef debug_banded_aligner_graph_processing - cerr << "[BandedGlobalAligner::find_banded_paths]: cannot complete alignment to read of length " << read.length() << " along shortest path " << shortest_path_to_sink[node_idx] << " or longest path " << longest_path_to_sink[node_idx] << ", which reach range " << extended_band_top + shortest_path_to_sink[node_idx] << ", " << extended_band_bottom + longest_path_to_sink[node_idx] << endl; + cerr << "[BandedGlobalAligner::find_banded_paths]: cannot complete alignment to read of length " << alignment.sequence().size() << " along shortest path " << shortest_path_to_sink[i] << " or longest path " << longest_path_to_sink[i] << ", which reach range " << extended_band_top + shortest_path_to_sink[i] << ", " << extended_band_bottom + longest_path_to_sink[i] << endl; #endif continue; } - // check if each edge out requires expanding the bands - for (int64_t j = 0; j < edges_out.size(); j++) { - int64_t node_out_idx = edges_out[j]; + graph.follow_edges(node, false, [&](const handle_t& next) { + + int64_t node_out_idx = node_id_to_idx.at(graph.get_id(next)); #ifdef debug_banded_aligner_graph_processing cerr << "[BandedGlobalAligner::find_banded_paths]: extending band to node at index " << node_out_idx << endl; @@ -2188,37 +2111,34 @@ void BandedGlobalAligner::find_banded_paths(const string& read, bool pe #endif band_ends[node_out_idx].second = extended_band_bottom; } - } + }); } } // returns the shortest sequence from any source node to each node template -void BandedGlobalAligner::shortest_seq_paths(vector>& node_edges_out, - unordered_set& source_nodes, - vector& seq_lens_out) { +void BandedGlobalAligner::shortest_seq_paths(vector& seq_lens_out) { // initialize vector with min identity to store sequence lengths - seq_lens_out = vector(topological_order.size(), numeric_limits::max()); + seq_lens_out.resize(topological_order.size(), numeric_limits::max()); // base cases - for (Node* 
node : source_nodes) { - seq_lens_out[node_id_to_idx[node->id()]] = 0; + for (const handle_t& handle : source_nodes) { + seq_lens_out[node_id_to_idx[graph.get_id(handle)]] = 0; } // dynamic programming to calculate sequence lengths for rest of nodes - for (auto iter = topological_order.begin(); iter != topological_order.end(); iter++) { - Node* node = *iter; - int64_t node_idx = node_id_to_idx.at(node->id()); - int64_t seq_len = node->sequence().length() + seq_lens_out[node_idx]; + for (size_t i = 0; i < topological_order.size(); i++) { + int64_t seq_len = graph.get_length(topological_order[i]) + seq_lens_out[i]; - for (int64_t target_idx : node_edges_out[node_idx]) { + graph.follow_edges(topological_order[i], false, [&](const handle_t& handle) { + int64_t target_idx = node_id_to_idx.at(graph.get_id(handle)); // find the shortest sequence that can reach the top left corner of the matrix if (seq_len < seq_lens_out[target_idx]) { seq_lens_out[target_idx] = seq_len; } - } + }); } } @@ -2234,25 +2154,22 @@ void BandedGlobalAligner::align(int8_t* score_mat, int8_t* nt_table, in // fill each nodes matrix in topological order - for (int64_t i = 0; i < topological_order.size(); i++) { - Node* node = topological_order[i]; - int64_t node_idx = node_id_to_idx.at(node->id()); - BAMatrix* band_matrix = banded_matrices[node_idx]; -#ifdef debug_banded_aligner_fill_matrix - cerr << "[BandedGlobalAligner::align] checking node " << node->id() << " at index " << node_idx << " with sequence " << node->sequence() << " and topological position " << i << endl; -#endif + for (int64_t i = 0; i < banded_matrices.size(); i++) { + BAMatrix* band_matrix = banded_matrices[i]; // skip masked nodes if (band_matrix == nullptr) { #ifdef debug_banded_aligner_fill_matrix - cerr << "[BandedGlobalAligner::align] node is masked, skipping" << endl; + cerr << "[BandedGlobalAligner::align] node " << graph.get_id(topological_order[i]) << " is masked, skipping" << endl; #endif continue; } + #ifdef debug_banded_aligner_fill_matrix + cerr << "[BandedGlobalAligner::align] at node " << graph.get_id(band_matrix->node) << " at index " << i << " with sequence " << graph.get_sequence(band_matrix->node) << endl; cerr << "[BandedGlobalAligner::align] node is not masked, filling matrix" << endl; #endif - band_matrix->fill_matrix(score_mat, nt_table, gap_open, gap_extend, adjust_for_base_quality, min_inf); + band_matrix->fill_matrix(graph, score_mat, nt_table, gap_open, gap_extend, adjust_for_base_quality, min_inf); } traceback(score_mat, nt_table, gap_open, gap_extend, min_inf); @@ -2264,25 +2181,21 @@ void BandedGlobalAligner::traceback(int8_t* score_mat, int8_t* nt_table // get the sink and source node matrices for alignment stack unordered_set sink_node_matrices; unordered_set source_node_matrices; - for (Node* node : sink_nodes) { - sink_node_matrices.insert(banded_matrices[node_id_to_idx[node->id()]]); + for (const handle_t& node : sink_nodes) { + sink_node_matrices.insert(banded_matrices[node_id_to_idx[graph.get_id(node)]]); } - for (Node* node : source_nodes) { - source_node_matrices.insert(banded_matrices[node_id_to_idx[node->id()]]); + for (const handle_t& node : source_nodes) { + source_node_matrices.insert(banded_matrices[node_id_to_idx[graph.get_id(node)]]); } int64_t read_length = alignment.sequence().length(); int32_t empty_score = read_length > 0 ? 
-gap_open - (read_length - 1) * gap_extend : 0; // find the optimal alignment(s) and initialize stack - AltTracebackStack traceback_stack(max_multi_alns, empty_score, source_node_matrices, sink_node_matrices, + AltTracebackStack traceback_stack(graph, max_multi_alns, empty_score, source_node_matrices, sink_node_matrices, gap_open, gap_extend, min_inf); while (traceback_stack.has_next()) { - int64_t end_node_id; - matrix_t end_matrix; - traceback_stack.get_alignment_start(end_node_id, end_matrix); - int64_t end_node_idx = node_id_to_idx[end_node_id]; Alignment* next_alignment; if (!alt_alignments) { @@ -2301,27 +2214,52 @@ } if (traceback_stack.next_is_empty()) { + traceback_stack.next_empty_alignment(*next_alignment); #ifdef debug_banded_aligner_traceback cerr << "[BandedGlobalAligner::traceback] taking the next full empty alignment" << endl; + cerr << pb2json(*next_alignment) << endl; #endif - traceback_stack.next_empty_alignment(*next_alignment); } else { + // what node does the alignment start at + int64_t node_id; + matrix_t mat; + traceback_stack.get_alignment_start(node_id, mat); + + // start the row and column trackers + int64_t i, j; + banded_matrices[node_id_to_idx[node_id]]->init_traceback_indexes(graph, i, j); + + // we only start in a lead gap if the sequence is empty and there is nothing to align + bool in_lead_gap = alignment.sequence().empty(); + #ifdef debug_banded_aligner_traceback - cerr << "[BandedGlobalAligner::traceback] beginning traceback ending at node " << end_node_id << " in matrix " << (end_matrix == Match ? "match" : (end_matrix == InsertCol ? 
"insert column" : "insert row")) << endl; #endif - // add score to alignment - next_alignment->set_score(traceback_stack.current_traceback_score()); + // do traceback BABuilder builder(*next_alignment); - banded_matrices[end_node_idx]->traceback(builder, traceback_stack, end_matrix, score_mat, nt_table, - gap_open, gap_extend, adjust_for_base_quality, min_inf); + + while (node_id != 0) { + int64_t node_idx = node_id_to_idx[node_id]; + // trace through the matrix + banded_matrices[node_idx]->traceback(graph, builder, traceback_stack, i, j, mat, in_lead_gap, score_mat, + nt_table, gap_open, gap_extend, adjust_for_base_quality, min_inf); + // trace over edges + banded_matrices[node_idx]->traceback_over_edge(graph, builder, traceback_stack, i, j, mat, in_lead_gap, + node_id, score_mat, nt_table, gap_open, gap_extend, + adjust_for_base_quality, min_inf); + } // construct the alignment path builder.finalize_alignment(traceback_stack.current_empty_prefix()); + // add score to alignment + next_alignment->set_score(traceback_stack.current_traceback_score()); + + // advance to the next traceback traceback_stack.next_traceback_alignment(); } @@ -2335,13 +2273,15 @@ void BandedGlobalAligner::traceback(int8_t* score_mat, int8_t* nt_table } template -BandedGlobalAligner::AltTracebackStack::AltTracebackStack(int64_t max_multi_alns, +BandedGlobalAligner::AltTracebackStack::AltTracebackStack(const HandleGraph& graph, + int64_t max_multi_alns, int32_t empty_score, unordered_set& source_node_matrices, unordered_set& sink_node_matrices, int8_t gap_open, int8_t gap_extend, IntType min_inf) : + graph(graph), empty_score(empty_score), max_multi_alns(max_multi_alns) { @@ -2357,7 +2297,7 @@ BandedGlobalAligner::AltTracebackStack::AltTracebackStack(int64_t max_m if (sink_matrix->match == nullptr) { cerr << "error:[BandedGlobalAligner] must fill dynamic programming matrices before finding optimal score" << endl; - assert(0); + exit(1); } list band_stack{sink_matrix}; @@ -2367,16 +2307,19 @@ BandedGlobalAligner::AltTracebackStack::AltTracebackStack(int64_t max_m band_stack.pop_back(); if (!band_matrix) { +#ifdef debug_banded_aligner_traceback + cerr << "[BandedGlobalAligner::traceback] found stack marker, pulling " << path.front() << " from path" << endl; +#endif path.pop_front(); continue; } - if (band_matrix->node->sequence().length() == 0) { - path.push_front(band_matrix->node->id()); + if (graph.get_length(band_matrix->node) == 0) { + path.push_front(graph.get_id(band_matrix->node)); band_stack.push_back(nullptr); #ifdef debug_banded_aligner_traceback - cerr << "[BandedGlobalAligner::traceback] traversing initial empty path on " << band_matrix->node->id() << endl; + cerr << "[BandedGlobalAligner::traceback] traversing initial empty path on " << graph.get_id(band_matrix->node) << endl; #endif // we went all the way from a source to a sink using only nodes with @@ -2384,11 +2327,16 @@ BandedGlobalAligner::AltTracebackStack::AltTracebackStack(int64_t max_m // whether they are sufficiently high scoring alignments to yield if (source_node_matrices.count(band_matrix)) { empty_full_paths.push_back(path); +#ifdef debug_banded_aligner_traceback + cerr << "[BandedGlobalAligner::traceback] found empty full path" << endl; + for (auto nid : path ) { + cerr << "\t" << nid << endl; + } +#endif continue; } - for (int64_t i = 0; i < band_matrix->num_seeds; i++) { - BAMatrix* seed = band_matrix->seeds[i]; + for (auto seed : band_matrix->seeds) { if (seed) { band_stack.push_back(seed); } @@ -2397,11 +2345,14 @@ 
BandedGlobalAligner::AltTracebackStack::AltTracebackStack(int64_t max_m else { // get the coordinates of the bottom right corner - Node* node = band_matrix->node; - int64_t node_id = node->id(); - + const handle_t& node = band_matrix->node; + int64_t node_id = graph.get_id(node); int64_t read_length = band_matrix->alignment.sequence().length(); - int64_t ncols = node->sequence().length(); + int64_t ncols = graph.get_length(node); + +#ifdef debug_banded_aligner_traceback + cerr << "[BandedGlobalAligner::traceback] initializing tracebacks on node " << node_id << endl; +#endif int64_t final_col = ncols - 1; int64_t final_row = band_matrix->bottom_diag + ncols > read_length ? read_length - band_matrix->top_diag - ncols : band_matrix->bottom_diag - band_matrix->top_diag; @@ -2410,7 +2361,7 @@ BandedGlobalAligner::AltTracebackStack::AltTracebackStack(int64_t max_m if (band_matrix->alignment.sequence().empty()) { // if the read sequence is empty then we can only insert relative to the graph - size_t graph_length = band_matrix->cumulative_seq_len + band_matrix->node->sequence().size(); + size_t graph_length = band_matrix->cumulative_seq_len + graph.get_length(band_matrix->node); IntType insert_score = graph_length ? (graph_length - 1) * (-gap_extend) - gap_open : 0; insert_traceback(null_prefix, insert_score, node_id, final_row, final_col, node_id, InsertCol, path); } diff --git a/src/banded_global_aligner.hpp b/src/banded_global_aligner.hpp index 1ebbc9444ca..b77f9fe449f 100644 --- a/src/banded_global_aligner.hpp +++ b/src/banded_global_aligner.hpp @@ -16,7 +16,8 @@ #include #include #include -#include "vg.pb.h" + +#include "handle.hpp" using namespace std; @@ -40,6 +41,10 @@ namespace vg { * start node. Any signed integer type can be used for the dynamic programming matrices, but there * are no checks for overflow. * + * THIS IS A COMPONENT OF THE ALIGNER CLASS. + * + * Use Aligner::align_global_banded() instead. 
+ * */ template class BandedGlobalAligner { @@ -53,7 +58,7 @@ namespace vg { /// permissive_banding expand band, not necessarily symmetrically, to allow all node paths /// adjust_for_base_quality perform base quality adjusted alignment (see QualAdjAligner) /// - BandedGlobalAligner(Alignment& alignment, Graph& g, + BandedGlobalAligner(Alignment& alignment, const HandleGraph& g, int64_t band_padding, bool permissive_banding = false, bool adjust_for_base_quality = false); @@ -70,7 +75,7 @@ namespace vg { /// band_padding width to expand band by /// permissive_banding expand band, not necessarily symmetrically, to allow all node paths /// adjust_for_base_quality perform base quality adjusted alignment (see QualAdjAligner) - BandedGlobalAligner(Alignment& alignment, Graph& g, + BandedGlobalAligner(Alignment& alignment, const HandleGraph& g, vector& alt_alignments, int64_t max_multi_alns, int64_t band_padding, bool permissive_banding = false, bool adjust_for_base_quality = false); @@ -102,6 +107,7 @@ namespace vg { /// Matrices used in Smith-Waterman-Gotoh alignment algorithm enum matrix_t {Match, InsertCol, InsertRow}; + const HandleGraph& graph; /// The primary alignment Alignment& alignment; /// Vector for alternate alignments, or null if not making any @@ -117,14 +123,14 @@ namespace vg { /// Map from node IDs to the index used in internal vectors unordered_map node_id_to_idx; /// A topological ordering of the nodes - vector topological_order; + vector topological_order; /// Source nodes in the graph - unordered_set source_nodes; + vector source_nodes; /// Sink nodes in the graph - unordered_set sink_nodes; + vector sink_nodes; /// Internal constructor that the public constructors funnel into - BandedGlobalAligner(Alignment& alignment, Graph& g, + BandedGlobalAligner(Alignment& alignment, const HandleGraph& g, vector* alt_alignments, int64_t max_multi_alns, int64_t band_padding, bool permissive_banding = false, bool adjust_for_base_quality = false); @@ -132,20 +138,13 @@ namespace vg { /// Traceback through dynamic programming matrices to compute alignment void traceback(int8_t* score_mat, int8_t* nt_table, int8_t gap_open, int8_t gap_extend, IntType min_inf); - /// Constructor helper function: converts Graph object into adjacency list representation - void graph_edge_lists(Graph& g, bool outgoing_edges, vector>& out_edge_list); - /// Constructor helper function: compute topoligical ordering - void topological_sort(Graph& g, vector>& node_edges_out, vector& out_topological_order); /// Constructor helper function: compute the longest and shortest path to a sink for each node - void path_lengths_to_sinks(const string& read, vector>& node_edges_in, - vector& shortest_path_to_sink, vector& longest_path_to_sink); + void path_lengths_to_sinks(vector& shortest_path_to_sink, vector& longest_path_to_sink); /// Constructor helper function: compute which diagonals the bands cover on each node's matrix - void find_banded_paths(const string& read, bool permissive_banding, vector>& node_edges_in, - vector>& node_edges_out, int64_t band_padding, - vector& node_masked, vector>& band_ends); + void find_banded_paths(bool permissive_banding, int64_t band_padding, vector& node_masked, + vector>& band_ends); /// Constructor helper function: compute the shortest path from a source to each node - void shortest_seq_paths(vector>& node_edges_out, unordered_set& source_nodes, - vector& seq_lens_out); + void shortest_seq_paths(vector& seq_lens_out); }; /** @@ -156,22 +155,30 @@ namespace vg { class 
BandedGlobalAligner::BAMatrix { public: - BAMatrix(Alignment& alignment, Node* node, int64_t top_diag, int64_t bottom_diag, - BAMatrix** seeds, int64_t num_seeds, int64_t cumulative_seq_len); + BAMatrix(Alignment& alignment, handle_t node, int64_t top_diag, int64_t bottom_diag, + const vector& seeds, int64_t cumulative_seq_len); ~BAMatrix(); /// Use DP to fill the band with alignment scores - void fill_matrix(int8_t* score_mat, int8_t* nt_table, int8_t gap_open, int8_t gap_extend, bool qual_adjusted, - IntType min_inf); + void fill_matrix(const HandleGraph& graph, int8_t* score_mat, int8_t* nt_table, int8_t gap_open, + int8_t gap_extend, bool qual_adjusted, IntType min_inf); - /// Traceback through the band after using DP to fill it - void traceback(BABuilder& builder, AltTracebackStack& traceback_stack, matrix_t start_mat, int8_t* score_mat, - int8_t* nt_table, int8_t gap_open, int8_t gap_extend, bool qual_adjusted, IntType min_inf); + void init_traceback_indexes(const HandleGraph& graph, int64_t& i, int64_t& j); + + void traceback(const HandleGraph& graph, BABuilder& builder, AltTracebackStack& traceback_stack, + int64_t& i, int64_t& j, matrix_t& mat, bool& in_lead_gap, + const int8_t* score_mat, const int8_t* nt_table, const int8_t gap_open, const int8_t gap_extend, + const bool qual_adjusted, IntType const min_inf); + + void traceback_over_edge(const HandleGraph& graph, BABuilder& builder, AltTracebackStack& traceback_stack, + int64_t& i, int64_t& j, matrix_t& mat, bool& in_lead_gap, int64_t& node_id, + const int8_t* score_mat, const int8_t* nt_table, const int8_t gap_open, + const int8_t gap_extend, const bool qual_adjusted, IntType const min_inf); /// Debugging function - void print_full_matrices(); + void print_full_matrices(const HandleGraph& graph); /// Debugging function - void print_rectangularized_bands(); + void print_rectangularized_bands(const HandleGraph& graph); private: @@ -179,7 +186,7 @@ namespace vg { int64_t top_diag; int64_t bottom_diag; - Node* node; + handle_t node; Alignment& alignment; @@ -187,8 +194,7 @@ namespace vg { int64_t cumulative_seq_len; /// Matrices for nodes with edges into this node - BAMatrix** seeds; - int64_t num_seeds; + vector seeds; /// DP matrix IntType* match; @@ -197,18 +203,15 @@ namespace vg { /// DP matrix IntType* insert_row; - void traceback_internal(BABuilder& builder, AltTracebackStack& traceback_stack, int64_t start_row, - int64_t start_col, matrix_t start_mat, bool in_lead_gap, int8_t* score_mat, - int8_t* nt_table, int8_t gap_open, int8_t gap_extend, bool qual_adjusted, - IntType min_inf); - /// Debugging function - void print_matrix(matrix_t which_mat); + void print_matrix(const HandleGraph& graph, matrix_t which_mat); /// Debugging function - void print_band(matrix_t which_mat); + void print_band(const HandleGraph& graph, matrix_t which_mat); friend class BABuilder; friend class AltTracebackStack; // not a fan of this one, but constructor ugly without it + friend class BandedGlobalAligner; // also not a fan of this one but i have to refactor some + // debug statements without it }; /** @@ -220,7 +223,7 @@ namespace vg { template class BandedGlobalAligner::AltTracebackStack { public: - AltTracebackStack(int64_t max_multi_alns, int32_t empty_score, + AltTracebackStack(const HandleGraph& graph, int64_t max_multi_alns, int32_t empty_score, unordered_set& source_node_matrices, unordered_set& sink_node_matrices, int8_t gap_open, @@ -275,6 +278,7 @@ namespace vg { /// All of the paths through the graph that take only empty nodes list> 
empty_full_paths; int32_t empty_score; + const HandleGraph& graph; /// Pointer to the traceback directions for the alignment we are currently tracing back typename list, IntType, list>>::iterator curr_traceback; @@ -321,8 +325,8 @@ namespace vg { ~BABuilder(); /// Add next step in traceback - void update_state(matrix_t matrix, Node* node, int64_t read_idx, int64_t node_idx, - bool empty_node_seq = false); + void update_state(const HandleGraph& graph, matrix_t matrix, const handle_t& node, int64_t read_idx, + int64_t node_idx, bool empty_node_seq = false); /// Call after concluding traceback to finish adding edits to alignment void finalize_alignment(const list& empty_prefix); @@ -334,7 +338,8 @@ namespace vg { matrix_t matrix_state; bool matching = false; - Node* current_node = nullptr; + id_t current_node_id = 0; + string current_node_sequence = ""; int64_t edit_length = 0; int64_t edit_read_end_idx = 0; diff --git a/src/bin2ascii.h b/src/bin2ascii.h deleted file mode 100644 index 40097a9c0b9..00000000000 --- a/src/bin2ascii.h +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Copyright (c) 2013 Pavel Shramov - * - * json2pb is free software; you can redistribute it and/or modify - * it under the terms of the MIT license. See LICENSE for details. - */ - -#ifndef VG_BIN2ASCII_H_INCLUDED -#define VG_BIN2ASCII_H_INCLUDED - -#include -#include - -inline std::string hex2bin(const std::string &s) -{ - if (s.size() % 2) - throw std::runtime_error("Odd hex data size"); - static const char lookup[] = "" - "\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80" // 0x00 - "\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80" // 0x10 - "\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80" // 0x20 - "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x80\x80\x80\x80\x80\x80" // 0x30 - "\x80\x0a\x0b\x0c\x0d\x0e\x0f\x80\x80\x80\x80\x80\x80\x80\x80\x80" // 0x40 - "\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80" // 0x50 - "\x80\x0a\x0b\x0c\x0d\x0e\x0f\x80\x80\x80\x80\x80\x80\x80\x80\x80" // 0x60 - "\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80" // 0x70 - "\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80" // 0x80 - "\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80" // 0x90 - "\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80" // 0xa0 - "\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80" // 0xb0 - "\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80" // 0xc0 - "\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80" // 0xd0 - "\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80" // 0xe0 - "\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80" // 0xf0 - ""; - std::string r; - r.reserve(s.size() / 2); - for (size_t i = 0; i < s.size(); i += 2) { - char hi = lookup[s[i]]; - char lo = lookup[s[i+1]]; - if (0x80 & (hi | lo)) - throw std::runtime_error("Invalid hex data: " + s.substr(i, 6)); - r.push_back((hi << 4) | lo); - } - return r; -} - -inline std::string bin2hex(const std::string &s) -{ - static const char lookup[] = "0123456789abcdef"; - std::string r; - r.reserve(s.size() * 2); - for (size_t i = 0; i < s.size(); i++) { - char hi = s[i] >> 4; - char lo = s[i] & 0xf; - r.push_back(lookup[hi]); - r.push_back(lookup[lo]); - } - return r; -} - -inline std::string b64_encode(const std::string &s) -{ - typedef unsigned char u1; - static const char lookup[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; - const u1 * data = 
(const u1 *) s.c_str(); - std::string r; - r.reserve(s.size() * 4 / 3 + 3); - for (size_t i = 0; i < s.size(); i += 3) { - unsigned n = data[i] << 16; - if (i + 1 < s.size()) n |= data[i + 1] << 8; - if (i + 2 < s.size()) n |= data[i + 2]; - - u1 n0 = (u1)(n >> 18) & 0x3f; - u1 n1 = (u1)(n >> 12) & 0x3f; - u1 n2 = (u1)(n >> 6) & 0x3f; - u1 n3 = (u1)(n ) & 0x3f; - - r.push_back(lookup[n0]); - r.push_back(lookup[n1]); - if (i + 1 < s.size()) r.push_back(lookup[n2]); - if (i + 2 < s.size()) r.push_back(lookup[n3]); - } - for (int i = 0; i < (3 - s.size() % 3) % 3; i++) - r.push_back('='); - return r; -} - -inline std::string b64_decode(const std::string &s) -{ - typedef unsigned char u1; - static const char lookup[] = "" - "\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80" // 0x00 - "\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80" // 0x10 - "\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x3e\x80\x80\x80\x3f" // 0x20 - "\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\x80\x80\x80\x00\x80\x80" // 0x30 - "\x80\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e" // 0x40 - "\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x80\x80\x80\x80\x80" // 0x50 - "\x80\x1a\x1b\x1c\x1d\x1e\x1f\x20\x21\x22\x23\x24\x25\x26\x27\x28" // 0x60 - "\x29\x2a\x2b\x2c\x2d\x2e\x2f\x30\x31\x32\x33\x80\x80\x80\x80\x80" // 0x70 - "\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80" // 0x80 - "\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80" // 0x90 - "\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80" // 0xa0 - "\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80" // 0xb0 - "\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80" // 0xc0 - "\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80" // 0xd0 - "\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80" // 0xe0 - "\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80" // 0xf0 - ""; - std::string r; - if (!s.size()) return r; - if (s.size() % 4) - throw std::runtime_error("Invalid base64 data size"); - size_t pad = 0; - if (s[s.size() - 1] == '=') pad++; - if (s[s.size() - 2] == '=') pad++; - - r.reserve(s.size() * 3 / 4 + 3); - for (size_t i = 0; i < s.size(); i += 4) { - u1 n0 = lookup[(u1) s[i+0]]; - u1 n1 = lookup[(u1) s[i+1]]; - u1 n2 = lookup[(u1) s[i+2]]; - u1 n3 = lookup[(u1) s[i+3]]; - if (0x80 & (n0 | n1 | n2 | n3)) - throw std::runtime_error("Invalid hex data: " + s.substr(i, 4)); - unsigned n = (n0 << 18) | (n1 << 12) | (n2 << 6) | n3; - r.push_back((n >> 16) & 0xff); - if (s[i+2] != '=') r.push_back((n >> 8) & 0xff); - if (s[i+3] != '=') r.push_back((n ) & 0xff); - } - return r; -} - -#endif//VG_BIN2ASCII_H_INCLUDED diff --git a/src/blocked_gzip_input_stream.cpp b/src/blocked_gzip_input_stream.cpp deleted file mode 100644 index f757c93d06f..00000000000 --- a/src/blocked_gzip_input_stream.cpp +++ /dev/null @@ -1,250 +0,0 @@ -#include "blocked_gzip_input_stream.hpp" - -#include "hfile_cppstream.hpp" - -// We need the hFILE* internals available. 
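The BlockedGzipInputStream implementation removed below reports positions as BGZF virtual offsets (its Tell() returns either bgzf_tell(handle) or htell(handle->fp) << 16). As a reminder of the arithmetic those calls rely on, here is the standard htslib packing, shown only for context and not taken from this patch.

```cpp
#include <cstdint>

// A BGZF virtual offset packs two coordinates into one int64_t:
//   upper 48 bits: byte offset of the compressed block within the file (block_address)
//   lower 16 bits: offset of the target byte within the uncompressed block (block_offset)
inline int64_t bgzf_virtual_offset(int64_t block_address, int block_offset) {
    return (block_address << 16) | (block_offset & 0xFFFF);
}

// Example: a byte 100 bytes into the block that starts at compressed offset 4096
// has virtual offset (4096 << 16) | 100 == 268435556.
```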
-#include - -namespace vg { - -namespace stream { - -using namespace std; - -BlockedGzipInputStream::BlockedGzipInputStream(std::istream& stream) : handle(nullptr), byte_count(0), - know_offset(false) { - - // See where the stream is - stream.clear(); - auto file_start = stream.tellg(); - bool good = stream.good(); - - // Wrap the stream in an hFILE* - hFILE* wrapped = hfile_wrap(stream); - if (wrapped == nullptr) { - throw runtime_error("Unable to wrap stream"); - } - - // Give ownership of it to a BGZF that reads, which we in turn own. - handle = bgzf_hopen(wrapped, "r"); - if (handle == nullptr) { - throw runtime_error("Unable to set up BGZF library on wrapped stream"); - } - - if (file_start >= 0 && good && bgzf_compression(handle) == 2) { - // The stream we are wrapping is seekable, and the data is block-compressed - - // We need to make sure BGZF knows where its blocks are starting. - - // We just freshly opened the BGZF so it thinks it is at 0. - - // Tell the BGZF where its next block is actually starting. - handle->block_address = file_start; - - // Remember the virtual offsets will be valid - know_offset = true; - } -} - -BlockedGzipInputStream::~BlockedGzipInputStream() { - // Close the GBZF - bgzf_close(handle); -} - -bool BlockedGzipInputStream::Next(const void** data, int* size) { - if (handle->block_length != 0 && handle->block_offset != handle->block_length) { - // We aren't just after a seek, but we also aren't at the end of a block. We backed up. - - // The already-read data may have started at an offset. But we don't - // care, because if we back up by X bytes we always re-read the last X - // bytes of the block. - - // Return the unread part of the BGZF file's buffer - *data = (void*)((char*)handle->uncompressed_block + handle->block_offset); - *size = handle->block_length - handle->block_offset; - - // Send the offset to the end of the block again - handle->block_offset = handle->block_length; - handle->uncompressed_address += *size; - -#ifdef debug - cerr << "Re-emit " << *size << " bytes of backed-up data, move to " << handle->block_offset << "/" << handle->block_length << endl; - cerr << "errcode: " << handle->errcode << endl; -#endif - - return true; - } else { - // We need new data. Either we did a seek, or we are at the end of the previous block. - - if (bgzf_compression(handle) != 2) { - // We're not BGZF compressed. bgzf_read_block only resets the - // block_offset (when not seeking) for BGZF files. We have to do it - // manually. - handle->block_offset = 0; - } - -#ifdef debug - cerr << "Compression mode: " << bgzf_compression(handle) << endl; - cerr << "Read next block; offset is " << handle->block_offset << endl; - if (handle->gz_stream != nullptr) { - cerr << "\tavail_in: " << handle->gz_stream->avail_in << endl; - if (handle->gz_stream->avail_in > 0) { - cerr << "\t\tShould not read from backing file!" 
<< endl; - } - cerr << "\tavail_out: " << handle->gz_stream->avail_out << endl; - } -#endif - - // Make the BGZF read the next block - if (bgzf_read_block(handle) != 0) { - // We have encountered an error - -#ifdef debug - cerr << "Failed to read next block" << endl; - cerr << "\terrcode: " << handle->errcode << endl; -#endif - - return false; - } - -#ifdef debug - cerr << "See a block of length " << handle->block_length << " at " << handle->block_address << endl; - cerr << "\terrcode: " << handle->errcode << endl; - if (handle->gz_stream != nullptr) { - cerr << "\tavail_in: " << handle->gz_stream->avail_in << endl; - cerr << "\tavail_out: " << handle->gz_stream->avail_out << endl; - } -#endif - - if (handle->block_length == 0) { - // We have hit EOF - -#ifdef debug - cerr << "Next block reports length 0 (EOF)" << endl; - cerr << "\terrcode: " << handle->errcode << endl; -#endif - - return false; - } - - // Otherwise we have data. - - if (handle->block_offset > handle->block_length) { - // We don't have enough data to fulfill the most recent seek. Signal an error. - -#ifdef debug - cerr << "Tried to seek to " << handle->block_offset << " but got only " << handle->block_length << " bytes" << endl; - cerr << "\terrcode: " << handle->errcode << endl; -#endif - - return false; - } - - // Send out the address and size, accounting for seek offset - *data = (void*)((char*)handle->uncompressed_block + handle->block_offset); - *size = handle->block_length - handle->block_offset; - - // Record the bytes read - byte_count += handle->block_length - handle->block_offset; - - // Tell the BGZF that the cursor is at the end of the block (because it - // is; subsequent reads come from there) - handle->block_offset = handle->block_length; - handle->uncompressed_address += *size; - -#ifdef debug - cerr << "Emit " << *size << " bytes in fresh block" << endl; - cerr << "\terrcode: " << handle->errcode << endl; -#endif - - return true; - } -} - -void BlockedGzipInputStream::BackUp(int count) { - assert(count <= handle->block_offset); - handle->block_offset -= count; - handle->uncompressed_address -= count; - -#ifdef debug - cerr << "Back up " << count << " bytes to " << handle->block_offset << "/" << handle->block_length << endl; -#endif -} - -bool BlockedGzipInputStream::Skip(int count) { - // We just implement this in terms of next and back up. There's not really - // a more efficient way, since we can't do relative seeks. - - // We have to support this happening immediately after a seek. - - while (count > 0) { - // Keep nexting until we get the block that is count away from where we are. - const void* ignored_data; - int size; - - if (!Next(&ignored_data, &size)) { - // We hit EOF, or had an error. - return false; - } - - // We accomplished this much skipping. - count -= size; - } - - if (count < 0) { - // We went too far. But we know we want to go somewhere in this buffer, - // or we would have finished the loop before we did. - BackUp(-count); - count = 0; - } - - return true; - -} - -int64_t BlockedGzipInputStream::ByteCount() const { - return byte_count; -} - -int64_t BlockedGzipInputStream::Tell() const { - if (know_offset) { - // Our virtual offsets are true. - - // But since we are happy to leave the BGZF's cursor at the ends of - // blocks, we have to work out what the real virtual offset should be - // in that case (byte 0 of the next block) - if (handle->block_offset == handle->block_length) { - // We need to know where the next block is - - // We don't have bgzf_htell so we fake it. 
- // We also manually shift the block address to the right place. - return htell(handle->fp) << 16; - - } else { - // Since we use the BGZF's internal cursor correctly, we can rely on its tell function. - return bgzf_tell(handle); - } - } else { - // We don't know where the zero position in the stream was, so we can't - // trust BGZF's virtual offsets. - return -1; - } -} - -bool BlockedGzipInputStream::Seek(int64_t virtual_offset) { - if (!know_offset) { - // We can't seek - return false; - } - - // Do the seek, and return whether it worked. - // This will set handle->block_length to 0, so we know we need to read the block when we read next. - return bgzf_seek(handle, virtual_offset, SEEK_SET) == 0; - - // We won't find out if there's actually data there until we try to read... -} - -} - -} - diff --git a/src/blocked_gzip_input_stream.hpp b/src/blocked_gzip_input_stream.hpp deleted file mode 100644 index a91b554560a..00000000000 --- a/src/blocked_gzip_input_stream.hpp +++ /dev/null @@ -1,99 +0,0 @@ -#ifndef VG_BLOCKED_GZIP_INPUT_STREAM_HPP_INCLUDED -#define VG_BLOCKED_GZIP_INPUT_STREAM_HPP_INCLUDED - -#include - -#include - -namespace vg { - -namespace stream { - - -/// Protobuf-style ZeroCopyInputStream that reads data from blocked gzip -/// format, and allows interacting with virtual offsets. -/// Cannot be moved or copied, because the base class can't be moved or copied. -class BlockedGzipInputStream : public ::google::protobuf::io::ZeroCopyInputStream { - -public: - - // Does not support construction off a raw BGZF because there's no way to - // force the file cursor to the start of a new block. And because it's a - // bad API anyway. - - /// Make a new stream reading from the given C++ std::istream, wrapping it - /// in a BGZF. The stream must be at a BGZF block header, since the header - /// info is peeked. - BlockedGzipInputStream(std::istream& stream); - - /// Destroy the stream. - virtual ~BlockedGzipInputStream(); - - // Explicitly say we can't be copied/moved, to simplify errors. - BlockedGzipInputStream(const BlockedGzipInputStream& other) = delete; - BlockedGzipInputStream& operator=(const BlockedGzipInputStream& other) = delete; - BlockedGzipInputStream(BlockedGzipInputStream&& other) = delete; - BlockedGzipInputStream& operator=(BlockedGzipInputStream&& other) = delete; - - /////////////////////////////////////////////////////////////////////////// - // ZeroCopyInputStream interface - /////////////////////////////////////////////////////////////////////////// - - /// Get a buffer to read from. Saves the address of the buffer where data - /// points, and the size of the buffer where size points. Returns false on - /// an unrecoverable error or EOF, and true if a buffer was gotten. The - /// data pointer must be valid until the next read call or until the stream - /// is destroyed. - virtual bool Next(const void** data, int* size); - - /// When called after Next(), mark the last count bytes of the buffer that - /// Next() produced as not having been read. - virtual void BackUp(int count); - - /// Skip ahead the given number of bytes. Return false if the end of the - /// stream is reached, or an error occurs. If the end of the stream is hit, - /// advances to the end of the stream. - virtual bool Skip(int count); - - /// Get the number of bytes read since the stream was constructed. 
- virtual int64_t ByteCount() const; - - /////////////////////////////////////////////////////////////////////////// - // BGZF support interface - /////////////////////////////////////////////////////////////////////////// - - /// Return the blocked gzip virtual offset at which the next fresh buffer - /// returned by Next() will start, or -1 if operating on an untellable - /// stream like standard input or on a non-blocked file. Note that this - /// will only get you the position of the next read if anything you are - /// reading through is fully backed up to the next actually-unread byte. - /// See Protobuf's CodedInputStream::Trim(). - virtual int64_t Tell() const; - - /// Seek to the given virtual offset. Return true if successful, or false - /// if the backing stream is unseekable, or not blocked. Note that this - /// will cause problems if something reading from this stream is still - /// operating on outstanding buffers; Any CodedInputStreams reading from - /// this stream *must* be destroyed before this function is called. - virtual bool Seek(int64_t virtual_offset); - -protected: - - /// The open BGZF handle being read from. We use the BGZF's buffer as our - /// buffer, and its block_offset for our seeks and back-ups. - BGZF* handle; - - /// The counter to back ByteCount - size_t byte_count; - - /// Flag for whether our backing stream is tellable. - bool know_offset; - -}; - -} - -} - - -#endif diff --git a/src/blocked_gzip_output_stream.cpp b/src/blocked_gzip_output_stream.cpp deleted file mode 100644 index 769a1ebd9dd..00000000000 --- a/src/blocked_gzip_output_stream.cpp +++ /dev/null @@ -1,221 +0,0 @@ -#include "blocked_gzip_output_stream.hpp" - -#include "hfile_cppstream.hpp" - -// We need the hFILE* internals available. -#include - -namespace vg { - -namespace stream { - -using namespace std; - -BlockedGzipOutputStream::BlockedGzipOutputStream(BGZF* bgzf_handle) : handle(bgzf_handle), buffer(), backed_up(0), byte_count(0), - know_offset(false), end_file(false) { - - if (handle->mt) { - // I don't want to deal with BGZF multithreading, because I'm going to be hacking its internals - throw runtime_error("Multithreaded BGZF is not supported"); - } - - // Force the BGZF to start a new block by flushing the old one, if it exists. - if (bgzf_flush(handle) != 0) { - throw runtime_error("Unable to flush BGZF"); - } - - // Try seeking the hfile's backend to exactly the position it is at, to get the actual offset. - // This lets us know if the stream is really seekable/tellable, because htell always works. - auto cur_pos = (*(handle->fp->backend->seek))(handle->fp, 0, SEEK_CUR); - if (cur_pos >= 0) { - // The seek succeeded. We know where we are, and so, we assume, does - // the hFILE. - - // Tell the BGZF where it is (which is at the hFILE's position rather - // than the backend's, but we know the hFILE position is correct) - handle->block_address = htell(handle->fp); - - // We are backed by a tellable stream - know_offset = true; - } -} - -BlockedGzipOutputStream::BlockedGzipOutputStream(std::ostream& stream) : handle(nullptr), buffer(), backed_up(0), byte_count(0), - know_offset(false), end_file(false) { - - // Wrap the stream in an hFILE* - hFILE* wrapped = hfile_wrap(stream); - if (wrapped == nullptr) { - throw runtime_error("Unable to wrap stream"); - } - - // Give ownership of it to a BGZF that writes, which we in turn own. 
- handle = bgzf_hopen(wrapped, "w"); - if (handle == nullptr) { - throw runtime_error("Unable to set up BGZF library on wrapped stream"); - } - - stream.clear(); - auto file_start = stream.tellp(); - if (file_start >= 0 && stream.good()) { - // The stream we are wrapping is seekable. - - // We need to make sure BGZF knows where its blocks are starting. - - // No need to flush because we just freshly opened the BGZF - - // Tell the BGZF where its next block is actually starting. - handle->block_address = file_start; - - // Remember the virtual offsets will be valid - know_offset = true; - } -} - -BlockedGzipOutputStream::~BlockedGzipOutputStream() { - // Make sure to finish writing before destructing. - flush(); - - if (end_file) { - // Close the file with an EOF block. -#ifdef debug - cerr << "Close normally" << endl; -#endif - bgzf_close(handle); - } else { - // Close the BGZF *without* writing an EOF block. -#ifdef debug - cerr << "Force close" << endl; -#endif - force_close(); - } -} - -bool BlockedGzipOutputStream::Next(void** data, int* size) { - try { - // Dump data if we have it - flush(); - - // Allocate some space in the buffer - buffer.resize(4096); - -#ifdef debug - cerr << "Allocate buffer of " << buffer.size() << " bytes " << endl; -#endif - - // None of it is backed up - backed_up = 0; - - // Tell the caller where to write - *data = (void*)&buffer[0]; - *size = buffer.size(); - - // It worked - return true; - - } catch(exception e) { - return false; - } -} - -void BlockedGzipOutputStream::BackUp(int count) { - backed_up += count; - assert(backed_up <= buffer.size()); - -#ifdef debug - cerr << "Back up " << count << " bytes to " << (buffer.size() - backed_up) << " still written" << endl; -#endif -} - -int64_t BlockedGzipOutputStream::ByteCount() const { -#ifdef debug - cerr << "Report total bytes written as " << byte_count << endl; -#endif - return byte_count; -} - -bool BlockedGzipOutputStream::WriteAliasedRaw(const void* data, int size) { - // Not allowed - return false; -} - -bool BlockedGzipOutputStream::AllowsAliasing() const { - return false; -} - - -int64_t BlockedGzipOutputStream::Tell() { - if (know_offset) { - // Our virtual offsets are true. - - // Make sure all data has been sent to BGZF - flush(); - - // See where we are now. No de-aliasing is necessary; the BGZF never - // leaves the cursor past the end of the block when writing, so we - // always have the cannonical virtual offset. - return bgzf_tell(handle); - } else { - // We don't know where the zero position in the stream was, so we can't - // trust BGZF's virtual offsets. - return -1; - } -} - -void BlockedGzipOutputStream::StartFile() { - // We know since nothing has been written that we are working with a fresh - // BGZF at what it thinks is virtual offset 0. - assert(bgzf_tell(handle) == 0); - know_offset = true; -} - -void BlockedGzipOutputStream::EndFile() { - end_file = true; -} - -void BlockedGzipOutputStream::flush() { - // How many bytes are left to write? - auto outstanding = buffer.size() - backed_up; - if (outstanding > 0) { -#ifdef debug - cerr << "Flush " << outstanding << " bytes to BGZF" << endl; -#endif - - // Save the buffer - auto written = bgzf_write(handle, (void*)&buffer[0], outstanding); - - if (written != outstanding) { - // This only happens when there is an error - throw runtime_error("IO error writing data in BlockedGzipOutputStream"); - } - - // Record the actual write - byte_count += written; - - // Make sure we don't try and write the same data twice by scrapping the buffer. 
- buffer.resize(0); - backed_up = 0; - } -} - -void BlockedGzipOutputStream::force_close() { - // Sneakily close the BGZF file without letting it write an EOF empty block marker. - - // Flush the data, which the close function won't do in the path we want it to take - if (bgzf_flush(handle) != 0) { - throw runtime_error("Could not flush the BGZF"); - } - - // Lie to BGZF and tell it that it did not just write compressed data. - // This causes close to bypass the EOF block write. - handle->is_compressed = 0; - - // Do the close operation, which does all the other cleanup still. - bgzf_close(handle); - handle = nullptr; -} - -} - -} - diff --git a/src/blocked_gzip_output_stream.hpp b/src/blocked_gzip_output_stream.hpp deleted file mode 100644 index 82a0fcf1fad..00000000000 --- a/src/blocked_gzip_output_stream.hpp +++ /dev/null @@ -1,132 +0,0 @@ -#ifndef VG_BLOCKED_GZIP_OUTPUT_STREAM_HPP_INCLUDED -#define VG_BLOCKED_GZIP_OUTPUT_STREAM_HPP_INCLUDED - -#include - -#include - -namespace vg { - -namespace stream { - - -/// Protobuf-style ZeroCopyOutputStream that writes data in blocked gzip -/// format, and allows interacting with virtual offsets. Does NOT emit the BGZF -/// end-of-file marker unless told to, because we don't want an empty block -/// after every vg stream::write call. -class BlockedGzipOutputStream : public ::google::protobuf::io::ZeroCopyOutputStream { - -public: - /// Make a new stream outputting to the given open BGZF file handle. - /// The stream will own the BGZF file and close it when destructed. - BlockedGzipOutputStream(BGZF* bgzf_handle); - - /// Make a new stream outputting to the given C++ std::ostream, wrapping it - /// in a BGZF. - BlockedGzipOutputStream(std::ostream& stream); - - /// Destroy the stream, finishing all writes if necessary. - virtual ~BlockedGzipOutputStream(); - - // Explicitly say we can't be copied/moved, to simplify errors. - BlockedGzipOutputStream(const BlockedGzipOutputStream& other) = delete; - BlockedGzipOutputStream& operator=(const BlockedGzipOutputStream& other) = delete; - BlockedGzipOutputStream(BlockedGzipOutputStream&& other) = delete; - BlockedGzipOutputStream& operator=(BlockedGzipOutputStream&& other) = delete; - - /////////////////////////////////////////////////////////////////////////// - // ZeroCopyOutputStream interface - /////////////////////////////////////////////////////////////////////////// - - /// Get a buffer to write to. Saves the address of the buffer where data - /// points, and the size of the buffer where size points. Returns false on - /// an unrecoverable error, and true if a buffer was gotten. The stream is - /// responsible for making sure data in the buffer makes it into the - /// output. The data pointer must be valid until the next write call or - /// until the stream is destroyed. - virtual bool Next(void** data, int* size); - - /// When called after Next(), mark the last count bytes of the buffer that - /// Next() produced as not to be written to the output. The user must not - /// have touched those bytes. - virtual void BackUp(int count); - - /// Get the number of bytes written since the stream was constructed. - virtual int64_t ByteCount() const; - - /// Take the given data at the given address into the stream as written. - /// Only works if AllowsAliasing() returns true. Returns true on success, - /// and false on an unrecoverable error. - virtual bool WriteAliasedRaw(const void * data, int size); - - /// Return true if WriteAliasedRaw() is actually available, and false otherwise. 
- virtual bool AllowsAliasing() const; - - /////////////////////////////////////////////////////////////////////////// - // BGZF support interface - /////////////////////////////////////////////////////////////////////////// - - /// Return the blocked gzip virtual offset at which the next buffer - /// returned by Next() will start, or -1 if operating on an untellable - /// stream like standard output. Note that this will only get you the - /// position of the next write if anything you are writing through is fully - /// backed up to the next actually-unwritten byte. See Protobuf's - /// CodedOutputStream::Trim(). Not const because buffered data may need to - /// be sent to the compressor to get the virtual offset. - virtual int64_t Tell(); - - // Seek is not supported because it is not allowed by the backing BGZF - // library for writable files. - - /// Tell this BlockedGzipOutputStream that it is at the beginning of a - /// file, when the backing stream is unseekable. Must be called before - /// anything has been written. Enables Tell() and sets the current virtual - /// offset to 0. - virtual void StartFile(); - - /// Make this BlockedGzipOutputStream write the BGZF-required empty end of - /// file block, when it finishes writing to the BGZF. These blocks are - /// permitted in the interior of files, but we don't want to add them all - /// the time because they're superfluous and they are supposed to be EOF - /// indicators while we are supposed to be able to append data to a file in - /// progress. - virtual void EndFile(); - -protected: - - /// Actually dump the buffer data to the BGZF, if needed. Sadly, we can't - /// really be zero-copy because the BGZF library isn't. - /// Throws on failure. - void flush(); - - /// Force the BGZF handle closed without letting the library write its EOF marker. - /// TODO: This is necessarily a hack that depends strongly on htslib internals. - /// Should not be called unless data has been flushed into the BGZF. - void force_close(); - - /// The open BGZF handle being written to - BGZF* handle; - - /// This vector will own the memory we use as our void* buffer. - std::vector buffer; - - /// The number of characters that have been backed up from the end of the buffer - size_t backed_up; - - /// The counter to back ByteCount - size_t byte_count; - - /// Flag for whether our backing stream is tellable. - bool know_offset; - - /// Flag for whether we are supposed to close out the BGZF file. 
- bool end_file; - -}; - -} - -} - - -#endif diff --git a/src/build_index.cpp b/src/build_index.cpp index 14eb936a792..aa6a38950b3 100644 --- a/src/build_index.cpp +++ b/src/build_index.cpp @@ -1,36 +1,32 @@ #include "build_index.hpp" +#include "source_sink_overlay.hpp" +#include "utility.hpp" namespace vg { -void build_gcsa_lcp(VG& graph, +void build_gcsa_lcp(const HandleGraph& graph, gcsa::GCSA*& gcsa, gcsa::LCPArray*& lcp, int kmer_size, size_t doubling_steps, size_t size_limit, const string& base_file_name) { - id_t max_id=0; - graph.for_each_handle([&max_id,&graph](const handle_t& h) { max_id = max(graph.get_id(h), max_id); }); - id_t head_id = max_id+1; - id_t tail_id = max_id+2; - Node* head_node = nullptr; Node* tail_node = nullptr; - // TODO add this for MutableHandleGraphs - graph.add_start_end_markers(kmer_size, '#', '$', head_node, tail_node, head_id, tail_id); - + + // Add an overlay with the source and sink nodes for GCSA + SourceSinkOverlay overlay(&graph, kmer_size); gcsa::ConstructionParameters params; params.setSteps(doubling_steps); params.setLimit(size_limit); // Generate the kmers and reduce the size limit by their size. size_t kmer_bytes = params.getLimitBytes(); - string tmpfile = write_gcsa_kmers_to_tmpfile(graph, kmer_size, + string tmpfile = write_gcsa_kmers_to_tmpfile(overlay, kmer_size, kmer_bytes, - head_id, tail_id, + overlay.get_id(overlay.get_source_handle()), + overlay.get_id(overlay.get_sink_handle()), base_file_name); params.reduceLimit(kmer_bytes); - graph.destroy_node(head_node); - graph.destroy_node(tail_node); // set up the input graph using the kmers gcsa::InputGraph input_graph({ tmpfile }, true); // run the GCSA construction diff --git a/src/build_index.hpp b/src/build_index.hpp index 71b1ed0ce63..9b09977b298 100644 --- a/src/build_index.hpp +++ b/src/build_index.hpp @@ -1,15 +1,12 @@ #ifndef VG_BUILD_INDEX_HPP_INCLUDED #define VG_BUILD_INDEX_HPP_INCLUDED -#include "vg.pb.h" +#include #include -#include "json2pb.h" #include "handle.hpp" -#include "utility.hpp" #include "gcsa/gcsa.h" #include "gcsa/lcp.h" #include "kmer.hpp" -#include "vg.hpp" /** \file * Functions for building GCSA2/LCP indexes from HandleGraphs @@ -19,7 +16,7 @@ namespace vg { using namespace std; -void build_gcsa_lcp(VG& graph, +void build_gcsa_lcp(const HandleGraph& graph, gcsa::GCSA*& gcsa, gcsa::LCPArray*& lcp, int kmer_size, diff --git a/src/cached_position.cpp b/src/cached_position.cpp deleted file mode 100644 index 63ad42e1fa4..00000000000 --- a/src/cached_position.cpp +++ /dev/null @@ -1,296 +0,0 @@ -#include "cached_position.hpp" - -namespace vg { - -Node xg_cached_node(id_t id, xg::XG* xgidx, LRUCache& node_cache) { - pair cached = node_cache.retrieve(id); - if(!cached.second) { - cached.first = xgidx->node(id); - node_cache.put(id, cached.first); - } - Node& node = cached.first; - return node; -} - -vector xg_cached_edges_of(id_t id, xg::XG* xgidx, LRUCache >& edge_cache) { - pair, bool> cached = edge_cache.retrieve(id); - if(!cached.second) { - for (auto& edge : xgidx->edges_of(id)) { - cached.first.push_back(edge); - } - edge_cache.put(id, cached.first); - } - return cached.first; -} - -vector xg_cached_edges_on_start(id_t id, xg::XG* xgidx, LRUCache >& edge_cache) { - vector all_edges = xg_cached_edges_of(id, xgidx, edge_cache); - auto new_end = std::remove_if(all_edges.begin(), all_edges.end(), - [&](const Edge& edge) { - return (edge.from() == id && edge.from_start()) || - (edge.to() == id && !edge.to_end()); - }); - all_edges.resize(new_end - all_edges.begin()); - 
return all_edges; -} - -vector xg_cached_edges_on_end(id_t id, xg::XG* xgidx, LRUCache >& edge_cache) { - vector all_edges = xg_cached_edges_of(id, xgidx, edge_cache); - auto new_end = std::remove_if(all_edges.begin(), all_edges.end(), - [&](const Edge& edge) { - return (edge.from() == id && !edge.from_start()) || - (edge.to() == id && edge.to_end()); - }); - all_edges.resize(new_end - all_edges.begin()); - return all_edges; -} - -string xg_cached_node_sequence(id_t id, xg::XG* xgidx, LRUCache& node_cache) { - pair cached = node_cache.retrieve(id); - if(!cached.second) { - cached.first = xgidx->node(id); - node_cache.put(id, cached.first); - } - Node& node = cached.first; - return node.sequence(); -} - -size_t xg_cached_node_length(id_t id, xg::XG* xgidx, LRUCache& node_cache) { - pair cached = node_cache.retrieve(id); - if(!cached.second) { - cached.first = xgidx->node(id); - node_cache.put(id, cached.first); - } - Node& node = cached.first; - return node.sequence().size(); -} - -int64_t xg_cached_node_start(id_t id, xg::XG* xgidx, LRUCache& node_start_cache) { - pair cached = node_start_cache.retrieve(id); - if(!cached.second) { - cached.first = (int64_t)xgidx->node_start(id); - node_start_cache.put(id, cached.first); - } - return cached.first; -} - -char xg_cached_pos_char(pos_t pos, xg::XG* xgidx, LRUCache& node_cache) { - pair cached = node_cache.retrieve(id(pos)); - if(!cached.second) { - // If it's not in the cache, put it in - cached.first = xgidx->node(id(pos)); - node_cache.put(id(pos), cached.first); - } - Node& node = cached.first; - if (is_rev(pos)) { - /* - cerr << "reversed... " << endl; - cerr << "rev pos " << offset(reverse(pos, node.sequence().size())) << endl; - cerr << "seq is " << node.sequence() << " and got " << - reverse_complement(node.sequence()[offset(reverse(pos, node.sequence().size()))-1]) << endl; - */ - return reverse_complement(node.sequence()[offset(reverse(pos, node.sequence().size()))-1]); - } else { - /* - cerr << "forward... " << endl; - cerr << "seq is " << node.sequence() << " and got " << node.sequence().at(offset(pos)) << endl; - */ - return node.sequence().at(offset(pos)); - } -} - -map xg_cached_next_pos_chars(pos_t pos, xg::XG* xgidx, LRUCache& node_cache, LRUCache >& edge_cache) { - - map nexts; - // See if the node is cached (did we just visit it?) - pair cached = node_cache.retrieve(id(pos)); - if(!cached.second) { - // If it's not in the cache, put it in - cached.first = xgidx->node(id(pos)); - node_cache.put(id(pos), cached.first); - } - Node& node = cached.first; - // if we are still in the node, return the next position and character - if (offset(pos) < node.sequence().size()-1) { - ++get_offset(pos); - nexts[pos] = xg_cached_pos_char(pos, xgidx, node_cache); - } else { - // helper - auto is_inverting = [](const Edge& e) { - return !(e.from_start() == e.to_end()) - && (e.from_start() || e.to_end()); - }; - // check our cache - pair, bool> cached = edge_cache.retrieve(id(pos)); - if(!cached.second) { - // If it's not in the cache, put it in - for (auto& edge : xgidx->edges_of(id(pos))) { - cached.first.push_back(edge); - } - edge_cache.put(id(pos), cached.first); - } - auto& edges = cached.first; - // look at the next positions we could reach - if (!is_rev(pos)) { - // we are on the forward strand, the next things from this node come off the end - for (auto& edge : edges) { - if((edge.to() == id(pos) && edge.to_end()) || (edge.from() == id(pos) && !edge.from_start())) { - id_t nid = (edge.from() == id(pos) ? 
- edge.to() - : edge.from()); - pos_t p = make_pos_t(nid, is_inverting(edge), 0); - nexts[p] = xg_cached_pos_char(p, xgidx, node_cache); - } - } - } else { - // we are on the reverse strand, the next things from this node come off the start - for (auto& edge : edges) { - if((edge.to() == id(pos) && !edge.to_end()) || (edge.from() == id(pos) && edge.from_start())) { - id_t nid = (edge.to() == id(pos) ? - edge.from() - : edge.to()); - pos_t p = make_pos_t(nid, !is_inverting(edge), 0); - nexts[p] = xg_cached_pos_char(p, xgidx, node_cache); - } - } - } - } - return nexts; -} - -set xg_cached_next_pos(pos_t pos, bool whole_node, xg::XG* xgidx, LRUCache& node_cache, LRUCache >& edge_cache) { - set nexts; - // See if the node is cached (did we just visit it?) - pair cached = node_cache.retrieve(id(pos)); - if(!cached.second) { - // If it's not in the cache, put it in - cached.first = xgidx->node(id(pos)); - node_cache.put(id(pos), cached.first); - } - Node& node = cached.first; - // if we are still in the node, return the next position and character - if (!whole_node && offset(pos) < node.sequence().size()-1) { - ++get_offset(pos); - nexts.insert(pos); - } else { - // helper - auto is_inverting = [](const Edge& e) { - return !(e.from_start() == e.to_end()) - && (e.from_start() || e.to_end()); - }; - // check our cache - pair, bool> cached = edge_cache.retrieve(id(pos)); - if(!cached.second) { - // If it's not in the cache, put it in - for (auto& edge : xgidx->edges_of(id(pos))) { - cached.first.push_back(edge); - } - edge_cache.put(id(pos), cached.first); - } - auto& edges = cached.first; - // look at the next positions we could reach - if (!is_rev(pos)) { - // we are on the forward strand, the next things from this node come off the end - for (auto& edge : edges) { - if((edge.to() == id(pos) && edge.to_end()) || (edge.from() == id(pos) && !edge.from_start())) { - id_t nid = (edge.from() == id(pos) ? - edge.to() - : edge.from()); - nexts.insert(make_pos_t(nid, is_inverting(edge), 0)); - } - } - } else { - // we are on the reverse strand, the next things from this node come off the start - for (auto& edge : edges) { - if((edge.to() == id(pos) && !edge.to_end()) || (edge.from() == id(pos) && edge.from_start())) { - id_t nid = (edge.to() == id(pos) ? - edge.from() - : edge.to()); - nexts.insert(make_pos_t(nid, !is_inverting(edge), 0)); - } - } - } - } - return nexts; -} - -int64_t xg_cached_distance(pos_t pos1, pos_t pos2, int64_t maximum, xg::XG* xgidx, LRUCache& node_cache, LRUCache >& edge_cache) { - //cerr << "distance from " << pos1 << " to " << pos2 << endl; - if (pos1 == pos2) return 0; - int64_t adj = (offset(pos1) == xg_cached_node_length(id(pos1), xgidx, node_cache) ? 
0 : 1); - set seen; - set nexts = xg_cached_next_pos(pos1, false, xgidx, node_cache, edge_cache); - int64_t distance = 0; - while (!nexts.empty()) { - set todo; - for (auto& next : nexts) { - if (!seen.count(next)) { - seen.insert(next); - if (next == pos2) { - return distance+adj; - } - // handle the edge case that we are looking for the position after the end of this node - if (make_pos_t(id(next), is_rev(next), offset(next)+1) == pos2) { - return distance+adj+1; - } - for (auto& x : xg_cached_next_pos(next, false, xgidx, node_cache, edge_cache)) { - todo.insert(x); - } - } - } - if (distance == maximum) { - break; - } - nexts = todo; - ++distance; - } - return numeric_limits::max(); -} - -set xg_cached_positions_bp_from(pos_t pos, int64_t distance, bool rev, xg::XG* xgidx, LRUCache& node_cache, LRUCache >& edge_cache) { - // handle base case - //size_t xg_cached_node_length(id_t id, xg::XG* xgidx, LRUCache& node_cache); - if (rev) { - pos = reverse(pos, xg_cached_node_length(id(pos), xgidx, node_cache)); - } - set positions; - if (distance == 0) { - positions.insert(pos); - //return positions; - } else { - set seen; - set nexts = xg_cached_next_pos(pos, false, xgidx, node_cache, edge_cache); - int64_t walked = 0; - while (!nexts.empty()) { - if (walked+1 == distance) { - for (auto& next : nexts) { - positions.insert(next); - } - break; - } - set todo; - for (auto& next : nexts) { - if (!seen.count(next)) { - seen.insert(next); - for (auto& x : xg_cached_next_pos(next, false, xgidx, node_cache, edge_cache)) { - todo.insert(x); - } - } - } - nexts = todo; - ++walked; - } - } - if (rev) { - set rev_pos; - for (auto& p : positions) { - rev_pos.insert(reverse(p, xg_cached_node_length(id(p), xgidx, node_cache))); - } - return rev_pos; - } else { - return positions; - } -} - - -} diff --git a/src/cached_position.hpp b/src/cached_position.hpp deleted file mode 100644 index 0d7ca836b04..00000000000 --- a/src/cached_position.hpp +++ /dev/null @@ -1,43 +0,0 @@ -#ifndef VG_CACHED_POS_HPP_INCLUDED -#define VG_CACHED_POS_HPP_INCLUDED - -#include "vg.pb.h" -#include "types.hpp" -#include "xg.hpp" -#include "lru_cache.h" -#include "utility.hpp" -#include "json2pb.h" -#include -#include - -/** \file - * Functions for working with cached Positions and `pos_t`s. - */ - -namespace vg { - -using namespace std; - -// xg/position traversal helpers with caching -// used by the Sampler and by the Mapper -string xg_cached_node_sequence(id_t id, xg::XG* xgidx, LRUCache& node_cache); -/// Get the length of a Node from an xg::XG index, with cacheing of deserialized nodes. -size_t xg_cached_node_length(id_t id, xg::XG* xgidx, LRUCache& node_cache); -/// Get the node start position in the sequence vector -int64_t xg_cached_node_start(id_t id, xg::XG* xgidx, LRUCache& node_start_cache); -/// Get the character at a position in an xg::XG index, with cacheing of deserialized nodes. -char xg_cached_pos_char(pos_t pos, xg::XG* xgidx, LRUCache& node_cache); -/// Get the characters at positions after the given position from an xg::XG index, with cacheing of deserialized nodes. 
-map xg_cached_next_pos_chars(pos_t pos, xg::XG* xgidx, LRUCache& node_cache, LRUCache >& edge_cache); -set xg_cached_next_pos(pos_t pos, bool whole_node, xg::XG* xgidx, LRUCache& node_cache, LRUCache >& edge_cache); -int64_t xg_cached_distance(pos_t pos1, pos_t pos2, int64_t maximum, xg::XG* xgidx, LRUCache& node_cache, LRUCache >& edge_cache); -set xg_cached_positions_bp_from(pos_t pos, int64_t distance, bool rev, xg::XG* xgidx, LRUCache& node_cache, LRUCache >& edge_cache); -//void xg_cached_graph_context(VG& graph, const pos_t& pos, int length, xg::XG* xgidx, LRUCache& node_cache, LRUCache >& edge_cache); -Node xg_cached_node(id_t id, xg::XG* xgidx, LRUCache& node_cache); -vector xg_cached_edges_of(id_t id, xg::XG* xgidx, LRUCache >& edge_cache); -vector xg_cached_edges_on_start(id_t id, xg::XG* xgidx, LRUCache >& edge_cache); -vector xg_cached_edges_on_end(id_t id, xg::XG* xgidx, LRUCache >& edge_cache); - -} - -#endif diff --git a/src/cactus.cpp b/src/cactus.cpp index d99791e7293..7d5a6bc8ac3 100644 --- a/src/cactus.cpp +++ b/src/cactus.cpp @@ -1,15 +1,8 @@ #include #include "cactus.hpp" #include "vg.hpp" -#include "algorithms/topological_sort.hpp" -#include "algorithms/weakly_connected_components.hpp" -#include "algorithms/strongly_connected_components.hpp" -#include "algorithms/find_shortest_paths.hpp" - -extern "C" { -#include "sonLib.h" -#include "stCactusGraphs.h" -} +#include "handle.hpp" +#include "algorithms/dfs.hpp" //#define debug @@ -203,7 +196,8 @@ void addArbitraryTelomerePair(vector ends, stList *telomeres) // Step 2) Make a Cactus Graph. Returns the graph and a list of paired // cactusEdgeEnd telomeres, one after the other. Both members of the return // value must be destroyed. -pair handle_graph_to_cactus(PathHandleGraph& graph, const unordered_set& hint_paths) { +pair handle_graph_to_cactus(const PathHandleGraph& graph, const unordered_set& hint_paths, + bool single_component) { // in a cactus graph, every node is an adjacency component. // every edge is a *vg* node connecting the component @@ -264,7 +258,7 @@ pair handle_graph_to_cactus(PathHandleGraph& graph, con cac_side2->node = other_node_id; cac_side2->is_end = other_is_end; #ifdef debug - cerr << "Creating cactus edge for sides " << pb2json(graph.to_visit(side)) << " -- " << pb2json(graph.to_visit(other_side)) << ": " << i << " -> " << j << endl; + //cerr << "Creating cactus edge for sides " << pb2json(graph.to_visit(side)) << " -- " << pb2json(graph.to_visit(other_side)) << ": " << i << " -> " << j << endl; #endif // We get the cactusEdgeEnd corresponding to the side stored in side. @@ -289,41 +283,55 @@ pair handle_graph_to_cactus(PathHandleGraph& graph, con // Now we decide on telomere pairs. // We need one for each weakly connected component in the graph, so first we break into connected components. - vector> weak_components = algorithms::weakly_connected_components(&graph); + vector> weak_components_all; + if (single_component == false) { + weak_components_all = handlealgs::weakly_connected_components(&graph); + } else { + // the calling function knows it's just one component, so we skip the calculation + weak_components_all.resize(1); + graph.for_each_handle([&weak_components_all, &graph](handle_t handle) { + weak_components_all[0].insert(graph.get_id(handle)); + }); + } + + // If we feed size 1 components through to Cactus it will apparently crash.
+ bool warned = false; + vector> weak_components; + weak_components.reserve(weak_components_all.size()); + for (auto& component : weak_components_all) { + if (component.size() > 1) { + weak_components.push_back(std::move(component)); + } else if (!warned) { + cerr << "Warning: Cactus does not currently support finding snarls in a single-node connected component" << endl; + warned = true; + } + } + weak_components_all.clear(); + if (weak_components.empty()) { + throw runtime_error("Cactus does not currently support finding snarls in graph of single-node connected components"); + } // We also want a map so we can efficiently find which component a node lives in. unordered_map node_to_component; for (size_t i = 0; i < weak_components.size(); i++) { - if (weak_components[i].size() == 1) { - // If we feed this through to Cactus it will crash. - throw runtime_error("Cactus does not currently support finding snarls in a single-node connected component"); - } - for (auto& id : weak_components[i]) { node_to_component[id] = i; } } - // Then we find the heads and tails - auto all_heads = algorithms::head_nodes(&graph); - auto all_tails = algorithms::tail_nodes(&graph); + // Then we find all the tips, inward-facing + auto all_tips = handlealgs::find_tips(&graph); #ifdef debug - cerr << "Found " << all_heads.size() << " heads and " << all_tails.size() << " tails in graph" << endl; + cerr << "Found " << all_tips.size() << " tips in graph" << endl; #endif - // Alot them to components. We store tips in an inward-facing direction + // Allot them to components. We store tips in an inward-facing direction vector> component_tips(weak_components.size()); - for (auto& head : all_heads) { - component_tips[node_to_component[graph.get_id(head)]].insert(head); -#ifdef debug - cerr << "Found head " << graph.get_id(head) << " in component " << node_to_component[graph.get_id(head)] << endl; -#endif - } - for (auto& tail : all_tails) { - component_tips[node_to_component[graph.get_id(tail)]].insert(graph.flip(tail)); + for (auto& tip : all_tips) { + component_tips[node_to_component[graph.get_id(tip)]].insert(tip); #ifdef debug - cerr << "Found tail " << graph.get_id(tail) << " in component " << node_to_component[graph.get_id(tail)] << endl; + cerr << "Found tip " << graph.get_id(tip) << (graph.get_is_reverse(tip) ? '-' : '+') << " in component " << node_to_component[graph.get_id(tip)] << endl; #endif } @@ -334,7 +342,7 @@ pair handle_graph_to_cactus(PathHandleGraph& graph, con graph.for_each_path_handle([&](const path_handle_t& path_handle) { - if (graph.get_occurrence_count(path_handle) == 0) { + if (graph.is_empty(path_handle)) { // Not a real useful path, so skip it. Some alt paths used for // haplotype generation are empty. 
return; @@ -342,9 +350,9 @@ pair handle_graph_to_cactus(PathHandleGraph& graph, con string name = graph.get_path_name(path_handle); - occurrence_handle_t occurrence_handle = graph.get_first_occurrence(path_handle); + step_handle_t step_handle = graph.path_begin(path_handle); - auto component = node_to_component[graph.get_id(graph.get_occurrence(occurrence_handle))]; + auto component = node_to_component[graph.get_id(graph.get_handle_of_step(step_handle))]; component_paths[component].push_back(name); @@ -353,21 +361,14 @@ pair handle_graph_to_cactus(PathHandleGraph& graph, con cerr << "Path " << name << " belongs to component " << component << endl; #endif - auto process_occurrence = [&](const occurrence_handle_t& occurrence_handle) { - handle_t handle = graph.get_occurrence(occurrence_handle); + for (handle_t handle : graph.scan_path(path_handle)) { path_length[name] += graph.get_length(handle); if (node_to_component[graph.get_id(handle)] != component) { // If we use a path like this to pick telomeres we will segfault Cactus. throw runtime_error("Path " + name + " spans multiple connected components!"); } - }; - - while (graph.has_next_occurrence(occurrence_handle)) { - process_occurrence(occurrence_handle); - occurrence_handle = graph.get_next_occurrence(occurrence_handle); } - process_occurrence(occurrence_handle); #ifdef debug cerr << "\tPath " << name << " has length " << path_length[name] << endl; @@ -378,7 +379,7 @@ pair handle_graph_to_cactus(PathHandleGraph& graph, con // This holds all the strongly connected components that live in each weakly connected component. vector>> component_strong_components(weak_components.size()); size_t strong_component_count = 0; - for (auto& strong_component : algorithms::strongly_connected_components(&graph)) { + for (auto& strong_component : handlealgs::strongly_connected_components(&graph)) { // For each strongly connected component assert(!strong_component.empty()); // Assign it to the weak component that some node in it belongs to @@ -439,13 +440,14 @@ pair handle_graph_to_cactus(PathHandleGraph& graph, con //auto& path_mappings = graph.paths.get_path(path_name); #ifdef debug - cerr << "\tPath " << path_name << " has " << graph.get_occurrence_count(path_handle) << " mappings" << endl; + cerr << "\tPath " << path_name << " has " << graph.get_step_count(path_handle) << " mappings" << endl; #endif // See if I can get two tips on its ends. // Get the inward-facing start and end handles. - handle_t path_start = graph.get_occurrence(graph.get_first_occurrence(path_handle)); - handle_t path_end = graph.flip(graph.get_occurrence(graph.get_last_occurrence(path_handle))); + handle_t path_start = graph.get_handle_of_step(graph.path_begin(path_handle)); + step_handle_t final_step = graph.get_previous_step(graph.path_end(path_handle)); + handle_t path_end = graph.flip(graph.get_handle_of_step(final_step)); if (component_tips[i].count(path_start) && component_tips[i].count(path_end)) { // This path ends in two tips so we can consider it @@ -528,7 +530,7 @@ pair handle_graph_to_cactus(PathHandleGraph& graph, con << graph.get_id(key.first) << " " << graph.get_is_reverse(key.first) << endl; #endif - unordered_map distances = algorithms::find_shortest_paths(&graph, key.first); + unordered_map distances = handlealgs::find_shortest_paths(&graph, key.first); for (auto& other_tip : component_tips[i]) { // And save the distances for everything reachable or unreachable. 
@@ -658,8 +660,8 @@ pair handle_graph_to_cactus(PathHandleGraph& graph, con #endif // Dijkstra in both directions - unordered_map distances_right = algorithms::find_shortest_paths(&graph, start); - unordered_map distances_left = algorithms::find_shortest_paths(&graph, graph.flip(start)); + unordered_map distances_right = handlealgs::find_shortest_paths(&graph, start); + unordered_map distances_left = handlealgs::find_shortest_paths(&graph, graph.flip(start)); // Find the furthest-out reachable tip on each side handle_t furthest_right_tip; @@ -712,7 +714,7 @@ pair handle_graph_to_cactus(PathHandleGraph& graph, con #endif // Dijkstra out from the current starting tip - unordered_map distances = algorithms::find_shortest_paths(&graph, best_tips[starting_tip]); + unordered_map distances = handlealgs::find_shortest_paths(&graph, best_tips[starting_tip]); // Find the other tip that is furthest away (stored in tip orientation and not Dijkstra orientation) handle_t maximal_tip = best_tips[!starting_tip]; diff --git a/src/cactus.hpp b/src/cactus.hpp index d1d12406cee..36d53f2fab6 100644 --- a/src/cactus.hpp +++ b/src/cactus.hpp @@ -15,8 +15,8 @@ #include "vg.hpp" extern "C" { -#include "sonLib.h" -#include "stCactusGraphs.h" +#include +#include } using namespace std; @@ -31,10 +31,13 @@ struct CactusSide { // Convert VG to Cactus Graph. Takes a list of path names to use to find // telomeres if present in a connected component. +// If we know the graph is a single weakly connected component, single_component can +// be set to true to avoid recomputing components. // Notes: // - returned cactus graph needs to be freed by stCactusGraph_destruct // - returns a Cactus graph, and a list of stCactusEdgeEnd* telomeres, in pairs of adjacent items. -pair handle_graph_to_cactus(PathHandleGraph& graph, const unordered_set& hint_paths); +pair handle_graph_to_cactus(const PathHandleGraph& graph, const unordered_set& hint_paths, + bool single_component = false); // Convert back from Cactus to VG // (to, for example, display using vg view) diff --git a/src/cactus_snarl_finder.cpp b/src/cactus_snarl_finder.cpp new file mode 100644 index 00000000000..20d8fca995c --- /dev/null +++ b/src/cactus_snarl_finder.cpp @@ -0,0 +1,432 @@ +/// +/// \file cactus_snarl_finder.cpp +/// +/// + +//#define debug + +#include "subgraph_overlay.hpp" +#include "handle.hpp" + +#include "cactus_snarl_finder.hpp" + +namespace vg { + +using namespace std; + +CactusSnarlFinder::CactusSnarlFinder(const PathHandleGraph& graph, const string& hint_path) : + graph(&graph) { + if (!hint_path.empty()) { + hint_paths.insert(hint_path); + // TODO: actually use it + } +} + +SnarlManager CactusSnarlFinder::find_snarls_impl(bool known_single_component, bool finish_index) { + + if (graph->get_node_count() <= 1) { + // No snarls here! + return SnarlManager(); + } + // convert to cactus + pair cac_pair = handle_graph_to_cactus(*graph, hint_paths, known_single_component); + stCactusGraph* cactus_graph = cac_pair.first; + stList* telomeres = cac_pair.second; + + // get the snarl decomposition as a C struct + stSnarlDecomposition *snarls = stCactusGraph_getSnarlDecomposition(cactus_graph, telomeres); + + // Get a non-owning pointer to the list of chains (which are themselves lists of snarls).
+ stList* cactus_chains_list = snarls->topLevelChains; + + // And one to the list of top-level unary snarls + stList* cactus_unary_snarls_list = snarls->topLevelUnarySnarls; + + + // We'll fill this with all the snarls + SnarlManager snarl_manager; + + // Fill the manager with all of the snarls, recursively. + recursively_emit_snarls(Visit(), Visit(), Visit(), Visit(), cactus_chains_list, cactus_unary_snarls_list, snarl_manager); + + // Free the decomposition + stSnarlDecomposition_destruct(snarls); + + // Free the telomeres + stList_destruct(telomeres); + + // free the cactus graph + stCactusGraph_destruct(cactus_graph); + + if (finish_index) { + // Finish the SnarlManager + snarl_manager.finish(); + } + + // Return the completed SnarlManager + return snarl_manager; + +} + +SnarlManager CactusSnarlFinder::find_snarls() { + return find_snarls_impl(false, true); +} + +SnarlManager CactusSnarlFinder::find_snarls_parallel() { + + vector> weak_components = handlealgs::weakly_connected_components(graph); + vector snarl_managers(weak_components.size()); + +#pragma omp parallel for schedule(dynamic, 1) + for (size_t i = 0; i < weak_components.size(); ++i) { + const PathHandleGraph* subgraph; + if (weak_components.size() == 1) { + subgraph = graph; + } else { + // turn the component into a graph + subgraph = new PathSubgraphOverlay(graph, &weak_components[i]); + } + string hint_path = !hint_paths.empty() ? *hint_paths.begin() : ""; + CactusSnarlFinder finder(*subgraph, hint_path); + // find the snarls, telling the finder that the graph is a single component + // and that we don't want to finish the snarl index + snarl_managers[i] = finder.find_snarls_impl(true, false); + if (weak_components.size() != 1) { + // delete our component graph overlay + delete subgraph; + } + } + + // merge the managers into the biggest one. + size_t biggest_snarl_idx = 0; + for (size_t i = 1; i < snarl_managers.size(); ++i) { + if (snarl_managers[i].num_snarls() > snarl_managers[biggest_snarl_idx].num_snarls()) { + biggest_snarl_idx = i; + } + } + for (size_t i = 0; i < snarl_managers.size(); ++i) { + if (i != biggest_snarl_idx) { + snarl_managers[i].for_each_snarl_unindexed([&](const Snarl* snarl) { + snarl_managers[biggest_snarl_idx].add_snarl(*snarl); + }); + } + } + snarl_managers[biggest_snarl_idx].finish(); + return std::move(snarl_managers[biggest_snarl_idx]); +} + + +const Snarl* CactusSnarlFinder::recursively_emit_snarls(const Visit& start, const Visit& end, + const Visit& parent_start, const Visit& parent_end, + stList* chains_list, stList* unary_snarls_list, SnarlManager& destination) { + +#ifdef debug + cerr << "Explore snarl " << start << " -> " << end << endl; +#endif + + // This is the snarl we are filling in to add to the SnarlManager, or an + // empty snarl if we're a fake root snarl. + Snarl snarl; + + if (start.node_id() != 0 && end.node_id() != 0) { + // This is a real snarl + + // Set up the start and end + *snarl.mutable_start() = start; + *snarl.mutable_end() = end; + + if (parent_start.node_id() != 0 && parent_end.node_id() != 0) { + // We have a parent that isn't the fake root, so fill in its ends + *snarl.mutable_parent()->mutable_start() = parent_start; + *snarl.mutable_parent()->mutable_end() = parent_end; + } + } + + // This will hold the pointer to the copy of the snarl in the SnarlManager, + // or null if the snarl is a fake root and we don't add it.
+ const Snarl* managed = nullptr; + + // Before we can pass our snarl to the snarl manager, we need to look at all + // its children so we can get connectivity info. + + // We have a vector of the snarls made for the child snarls in each ordinary + // chain, plus trivial chains for the unary snarls. + vector child_chains; + +#ifdef debug + cerr << "Look at " << stList_length(chains_list) << " child chains" << endl; +#endif + + int chain_offset = 0; + for (int64_t i = 0; i < stList_length(chains_list); i++) { + // For each child chain + stList* cactus_chain = (stList*)stList_get(chains_list, i); + + // Make a new chain. + // We aren't going to pass it on to the snarl manager, because chains need to be recomputed for consistency. + // But we need it for computing the internal snarl connectivity. + child_chains.emplace_back(); + auto& chain = child_chains.back(); + +#ifdef debug + cerr << "Chain " << i << " has " << stList_length(cactus_chain) << " child snarls" << endl; +#endif + + for (int64_t j = 0; j < stList_length(cactus_chain); j++) { + // for each child snarl in the chain + stSnarl* child_snarl = (stSnarl*)stList_get(cactus_chain, j); + + // scrape the vg coordinate information out of the cactus ends where we stuck + // it during cactus construction + CactusSide* cac_child_side1 = (CactusSide*)stCactusEdgeEnd_getObject(child_snarl->edgeEnd1); + CactusSide* cac_child_side2 = (CactusSide*)stCactusEdgeEnd_getObject(child_snarl->edgeEnd2); + + // Convert from CactusSide (the interior endpoint of each node) to Visit (inward at start, outward at end) + Visit child_start; + child_start.set_node_id(cac_child_side1->node); + // Start is backward if the interior is not an end + child_start.set_backward(!cac_child_side1->is_end); + Visit child_end; + child_end.set_node_id(cac_child_side2->node); + // End is backward if the interior is an end + child_end.set_backward(cac_child_side2->is_end); + + // Recursively create a snarl for the child + const Snarl* converted_child = recursively_emit_snarls(child_start, child_end, start, end, + child_snarl->chains, child_snarl->unarySnarls, destination); + // Work out if it should be backward in the chain + bool backward_in_chain = false; + if (!chain.empty()) { + bool last_backward_in_chain = chain.back().second; + auto dangling_id = last_backward_in_chain ? chain.back().first->end().node_id() : chain.back().first->start().node_id(); + // We are backward if our end is shared with the previous snarl in the chain. + backward_in_chain = converted_child->end().node_id() == dangling_id; + } + + // And then add it to this chain. 
+ chain.emplace_back(converted_child, backward_in_chain); + } + } + +#ifdef debug + cerr << "Look at " << stList_length(unary_snarls_list) << " child unary snarls" << endl; +#endif + + for (int64_t i = 0; i < stList_length(unary_snarls_list); i++) { + // for each child unary snarl + stSnarl* child_snarl = (stSnarl*)stList_get(unary_snarls_list, i); + + // TODO: deduplicate this code + + // scrape the vg coordinate information out of the cactus ends where we stuck + // it during cactus construction + CactusSide* cac_child_side1 = (CactusSide*)stCactusEdgeEnd_getObject(child_snarl->edgeEnd1); + CactusSide* cac_child_side2 = (CactusSide*)stCactusEdgeEnd_getObject(child_snarl->edgeEnd2); + + // Convert from CactusSide (the interior endpoint of each node) to Visit (inward at start, outward at end) + Visit child_start; + child_start.set_node_id(cac_child_side1->node); + // Start is backward if the interior is not an end + child_start.set_backward(!cac_child_side1->is_end); + Visit child_end; + child_end.set_node_id(cac_child_side2->node); + // End is backward if the interior is an end + child_end.set_backward(cac_child_side2->is_end); + + // Make a trivial chain + child_chains.emplace_back(); + auto& chain = child_chains.back(); + + // Recursively create a snarl for the child, and then add it to the trivial chain as forward + chain.emplace_back(recursively_emit_snarls(child_start, child_end, start, end, + child_snarl->chains, child_snarl->unarySnarls, destination), false); + } + + if (snarl.start().node_id() != 0 || snarl.end().node_id() != 0) { + // This snarl is real, we care about type and connectivity. + + // First determine connectivity + { + + // Make a net graph for the snarl that uses internal connectivity + NetGraph connectivity_net_graph(start, end, child_chains, graph, true); + + // Evaluate connectivity + // A snarl is minimal, so we know our start and end will be normal nodes. + handle_t start_handle = connectivity_net_graph.get_handle(start.node_id(), start.backward()); + handle_t end_handle = connectivity_net_graph.get_handle(end.node_id(), end.backward()); + + // Start out by assuming we aren't connected + bool connected_start_start = false; + bool connected_end_end = false; + bool connected_start_end = false; + + // We do a couple of directed walk searches to test connectivity. + list queue{start_handle}; + unordered_set queued{start_handle}; + auto handle_edge = [&](const handle_t& other) { +#ifdef debug + cerr << "\tCan reach " << connectivity_net_graph.get_id(other) + << " " << connectivity_net_graph.get_is_reverse(other) << endl; +#endif + + // Whenever we see a new node orientation, queue it. + if (!queued.count(other)) { + queue.push_back(other); + queued.insert(other); + } + }; + +#ifdef debug + cerr << "Looking for start-start turnarounds and through connections from " + << connectivity_net_graph.get_id(start_handle) << " " << + connectivity_net_graph.get_is_reverse(start_handle) << endl; +#endif + + while (!queue.empty()) { + handle_t here = queue.front(); + queue.pop_front(); + + if (here == end_handle) { + // Start can reach the end + connected_start_end = true; + } + + if (here == connectivity_net_graph.flip(start_handle)) { + // Start can reach itself the other way around + connected_start_start = true; + } + + if (connected_start_end && connected_start_start) { + // No more searching needed + break; + } + + // Look at everything reachable on a proper rightward directed walk.
+ connectivity_net_graph.follow_edges(here, false, handle_edge); + } + + auto end_inward = connectivity_net_graph.flip(end_handle); + +#ifdef debug + cerr << "Looking for end-end turnarounds from " << connectivity_net_graph.get_id(end_inward) + << " " << connectivity_net_graph.get_is_reverse(end_inward) << endl; +#endif + + // Reset and search the other way from the end to see if it can find itself. + queue = {end_inward}; + queued = {end_inward}; + while (!queue.empty()) { + handle_t here = queue.front(); + queue.pop_front(); + +#ifdef debug + cerr << "Got to " << connectivity_net_graph.get_id(here) << " " + << connectivity_net_graph.get_is_reverse(here) << endl; +#endif + + if (here == end_handle) { + // End can reach itself the other way around + connected_end_end = true; + break; + } + + // Look at everything reachable on a proper rightward directed walk. + connectivity_net_graph.follow_edges(here, false, handle_edge); + } + + // Save the connectivity info. TODO: should the connectivity flags be + // calculated based on just the net graph, or based on actual connectivity + // within child snarls. + snarl.set_start_self_reachable(connected_start_start); + snarl.set_end_self_reachable(connected_end_end); + snarl.set_start_end_reachable(connected_start_end); + +#ifdef debug + cerr << "Connectivity: " << connected_start_start << " " << connected_end_end << " " << connected_start_end << endl; +#endif + + + } + + { + // Determine cyclicity/acyclicity + + // Make a net graph that just pretends child snarls/chains are ordinary nodes + NetGraph flat_net_graph(start, end, child_chains, graph); + + // This definitely should be calculated based on the internal-connectivity-ignoring net graph. + snarl.set_directed_acyclic_net_graph(handlealgs::is_directed_acyclic(&flat_net_graph)); + } + + // Now we need to work out if the snarl can be a unary snarl or an ultrabubble or what. + if (start.node_id() == end.node_id()) { + // Snarl has the same start and end (or no start or end, in which case we don't care). + snarl.set_type(UNARY); +#ifdef debug + cerr << "Snarl is UNARY" << endl; +#endif + } else if (!snarl.start_end_reachable()) { + // Can't be an ultrabubble if we're not connected through. + snarl.set_type(UNCLASSIFIED); +#ifdef debug + cerr << "Snarl is UNCLASSIFIED because it doesn't connect through" << endl; +#endif + } else if (snarl.start_self_reachable() || snarl.end_self_reachable()) { + // Can't be an ultrabubble if we have these cycles + snarl.set_type(UNCLASSIFIED); + +#ifdef debug + cerr << "Snarl is UNCLASSIFIED because it allows turning around, creating a directed cycle" << endl; +#endif + + } else { + // See if we have all ultrabubble children + bool all_ultrabubble_children = true; + for (auto& chain : child_chains) { + for (auto& child : chain) { + if (child.first->type() != ULTRABUBBLE) { + all_ultrabubble_children = false; + break; + } + } + if (!all_ultrabubble_children) { + break; + } + } + + // Note that ultrabubbles *can* loop back on their start or end. + + if (!all_ultrabubble_children) { + // If we have non-ultrabubble children, we can't be an ultrabubble. 
+ snarl.set_type(UNCLASSIFIED); +#ifdef debug + cerr << "Snarl is UNCLASSIFIED because it has non-ultrabubble children" << endl; +#endif + } else if (!snarl.directed_acyclic_net_graph()) { + // If all our children are ultrabubbles but we ourselves are cyclic, we can't be an ultrabubble + snarl.set_type(UNCLASSIFIED); + +#ifdef debug + cerr << "Snarl is UNCLASSIFIED because it is not directed-acyclic" << endl; +#endif + } else { + // We have only ultrabubble children and are acyclic. + // We're an ultrabubble. + snarl.set_type(ULTRABUBBLE); +#ifdef debug + cerr << "Snarl is an ULTRABUBBLE" << endl; +#endif + } + } + + // Now we know enough about the snarl to actually put it in the SnarlManager + managed = destination.add_snarl(snarl); + + } + + // Return a pointer to the managed snarl. + return managed; +} + +} diff --git a/src/cactus_snarl_finder.hpp b/src/cactus_snarl_finder.hpp new file mode 100644 index 00000000000..91c1c48e1cb --- /dev/null +++ b/src/cactus_snarl_finder.hpp @@ -0,0 +1,74 @@ +/// +/// \file cactus_snarl_finder.hpp +/// +/// Defines a widget for finding snarls using the pinchesAndCacti library. +/// + +#ifndef VG_CACTUS_SNARL_FINDER_HPP_INCLUDED +#define VG_CACTUS_SNARL_FINDER_HPP_INCLUDED + +#include "snarls.hpp" + +namespace vg { + +using namespace std; + + +/** + * Class for finding all snarls using the base-level Cactus snarl decomposition + * interface. + */ +class CactusSnarlFinder : public SnarlFinder { + +protected: + /// Holds the vg graph we are looking for sites in. + const PathHandleGraph* graph; + + /// Holds the names of reference path hints + unordered_set hint_paths; + + /// Create a snarl in the given SnarlManager with the given start and end, + /// containing the given child snarls in the list of chains of children and + /// the given list of unary children. Recursively creates snarls in the + /// SnarlManager for the children. Returns a pointer to the finished snarl + /// in the SnarlManager. Start and end may be empty visits, in which case no + /// snarl is created, all the child chains are added as root chains, and + /// null is returned. If parent_start and parent_end are empty Visits, no + /// parent() is added to the produced snarl. + const Snarl* recursively_emit_snarls(const Visit& start, const Visit& end, + const Visit& parent_start, const Visit& parent_end, + stList* chains_list, stList* unary_snarls_list, SnarlManager& destination); + + /** + * Find all the snarls with Cactus, and put them into a SnarlManager. + * Skip breaking into connected components if "known_single_component" is true + * Skip making the snarl manager index if finish_index is false + */ + virtual SnarlManager find_snarls_impl(bool known_single_component, bool finish_index); + +public: + /** + * Make a new CactusSnarlFinder to find snarls in the given graph. + * We can't filter trivial bubbles because that would break our chains. + * + * Optionally takes a hint path name. + */ + CactusSnarlFinder(const PathHandleGraph& graph, const string& hint_path = ""); + + /** + * Find all the snarls with Cactus, and put them into a SnarlManager. + */ + virtual SnarlManager find_snarls(); + + /** + * Find all the snarls of weakly connected components in parallel. + * Even single-threaded, this may be worth using as it will use less + * memory by only considering each component in the context of itself. 
+ */ + virtual SnarlManager find_snarls_parallel(); + +}; + +} + +#endif diff --git a/src/chunker.cpp b/src/chunker.cpp index c2a6ceb5cfa..00fb6f6a3e9 100644 --- a/src/chunker.cpp +++ b/src/chunker.cpp @@ -1,17 +1,18 @@ #include #include -#include "stream.hpp" +#include #include "chunker.hpp" +#include "algorithms/subgraph.hpp" +#include "vg.hpp" +#include "clip.hpp" +//#define debug namespace vg { using namespace std; -using namespace xg; - - -PathChunker::PathChunker(xg::XG* xindex) : xg(xindex) { +PathChunker::PathChunker(const PathPositionHandleGraph* graph) : graph(graph) { } @@ -19,93 +20,417 @@ PathChunker::~PathChunker() { } -void PathChunker::extract_subgraph(const Region& region, int context, int length, - bool forward_only, VG& subgraph, Region& out_region) { - - Graph g; - - // convert to 0-based inclusive - int64_t start = region.start; - +void PathChunker::extract_subgraph(const Region& region, int64_t context, int64_t length, bool forward_only, + MutablePathMutableHandleGraph& subgraph, Region& out_region) { + // This method still depends on VG + // (not a super high priority to port, as calling can now be done at genome scale and we no longer + // have to chunk up paths) + VG* vg_subgraph = dynamic_cast(&subgraph); + if (vg_subgraph == nullptr) { + vg_subgraph = new VG(); + assert(subgraph.get_node_count() == 0); + } + // extract our path range into the graph + path_handle_t path_handle = graph->get_path_handle(region.seq); + step_handle_t start_step = graph->get_step_at_position(path_handle, region.start); + handle_t start_handle = graph->get_handle_of_step(start_step); + step_handle_t end_step = graph->get_step_at_position(path_handle, region.end); + handle_t end_handle = graph->get_handle_of_step(end_step); - - // Commenting out till I can be sure it's not doing weird things to paths - //xg->get_path_range(region.seq, region.start, region.end - 1, g); +#ifdef debug +#pragma omp critical(cerr) + { + cerr << "extracting subgraph range for " << region.seq << ":" << region.start << "-" << region.end + << ", which maps to handle range " << graph->get_id(start_handle) << ":" << graph->get_is_reverse(start_handle) << "-" + << graph->get_id(end_handle) << ":" << graph->get_is_reverse(end_handle) << endl; + } +#endif - xg->for_path_range(region.seq, region.start, region.end, [&](int64_t id) { - *g.add_node() = xg->node(id); - }); - + step_handle_t end_plus_one_step = graph->has_next_step(end_step) ? graph->get_next_step(end_step) : graph->path_end(path_handle) ; + for (step_handle_t step = start_step; step != end_plus_one_step; step = graph->get_next_step(step)) { + handle_t step_handle = graph->get_handle_of_step(step); + if (graph->get_is_reverse(step_handle)) { + step_handle = graph->flip(step_handle); + } + if (!vg_subgraph->has_node(graph->get_id(step_handle))) { + vg_subgraph->create_handle(graph->get_sequence(step_handle), graph->get_id(step_handle)); + } + }; // expand the context and get path information // if forward_only true, then we only go forward.
- xg->expand_context(g, context, true, true, true, !forward_only); - if (length) { - xg->expand_context(g, context, true, false, true, !forward_only); + if (context > 0) { + algorithms::expand_subgraph_by_steps(*graph, *vg_subgraph, context, forward_only); + } + if (length > 0) { + algorithms::expand_subgraph_by_length(*graph, *vg_subgraph, context, forward_only); + } + else if (context == 0 && length == 0) { + algorithms::add_connecting_edges_to_subgraph(*graph, *vg_subgraph); + } + algorithms::add_subpaths_to_subgraph(*graph, *vg_subgraph, true); + + // merge back our reference path to use the old chopping code + // todo: work with subpaths somehow? + if (!vg_subgraph->has_path(region.seq)) { + map ref_subpaths; + vg_subgraph->for_each_path_handle([&](path_handle_t path_handle) { + string path_name = vg_subgraph->get_path_name(path_handle); + subrange_t subrange; + path_name = Paths::strip_subrange(path_name, &subrange); + if (subrange != PathMetadata::NO_SUBRANGE && path_name == region.seq) { + ref_subpaths[subrange.first] = path_handle; + } + }); + path_handle_t new_ref_path = vg_subgraph->create_path_handle(region.seq, graph->get_is_circular(path_handle)); + for (auto& ref_subpath : ref_subpaths) { + vg_subgraph->for_each_step_in_path(ref_subpath.second, [&] (step_handle_t subpath_step) { + vg_subgraph->append_step(new_ref_path, vg_subgraph->get_handle_of_step(subpath_step)); + }); + vg_subgraph->destroy_path(ref_subpath.second); + } } + + // build the vg of the subgraph + vg_subgraph->remove_orphan_edges(); + + // get our range endpoints before context expansion + list& mappings = vg_subgraph->paths.get_path(region.seq); + assert(!mappings.empty()); + size_t mappings_size = mappings.size(); + int64_t input_start_node = graph->get_id(start_handle); + int64_t input_end_node = graph->get_id(end_handle); + +#ifdef debug +#pragma omp critical(cerr) + { + cerr << "Path range in expanded subgraph is " << *mappings.begin() << "-" << *mappings.rbegin() << endl; + } +#endif + + // replaces old xg position_in_path() to check node counts in path + function(const PathHandleGraph&, handle_t, path_handle_t)> path_steps_of_handle = + [] (const PathHandleGraph& graph, handle_t handle, path_handle_t path_handle) { + vector node_steps = graph.steps_of_handle(handle); + vector node_path_steps; + for (auto step : node_steps) { + if (graph.get_path_handle_of_step(step) == path_handle) { + node_path_steps.push_back(step); + } + } + return node_path_steps; + }; + + // we have no direct way of getting our steps out of the subgraph, so we + // go through node ids. the problem is that cycles can introduce + // ambiguity. we check for that here (only to punt on it later) + vector start_node_path_steps = path_steps_of_handle(*graph, start_handle, path_handle); + vector end_node_path_steps = path_steps_of_handle(*graph, end_handle, path_handle); + bool end_points_on_cycle = start_node_path_steps.size() > 1 || end_node_path_steps.size() > 1; + + // keep track of the edges in our original path + set, pair>> path_edge_set = + // walking out with the context length (as supported below) won't always work as expansion + // can grab an arbitrary amount of path regardless of context. so we load up the entire path: + // (todo: could sniff out limits from subgraph...) 
+ get_path_edge_index(graph->path_begin(path_handle), graph->path_back(path_handle), std::max(context, length)); + + // the distance between them and the nodes in our input range + size_t left_padding = 0; + size_t right_padding = 0; + // do we need to rewrite back to our graph? + bool rewrite_paths = false; + + if (!end_points_on_cycle) { + // start and end of our expanded chunk + auto start_it = mappings.begin(); + auto end_it = --mappings.end(); + + // find our input range in the expanded path. we know these nodes only appear once. + for (; start_it != mappings.end() && start_it->node_id() != input_start_node; ++start_it); + for (; end_it != mappings.begin() && end_it->node_id() != input_end_node; --end_it); + + // walk back our start point as we can without rank discontinuities. doesn't matter + // if we encounter cycles here, because we keep a running path length + auto cur_it = start_it; + auto prev_it = cur_it; + if (prev_it != mappings.begin()) { + for (; prev_it != mappings.begin(); --prev_it) { + cur_it = prev_it; + --cur_it; + handle_t prev_handle = vg_subgraph->get_handle(prev_it->node_id(), + prev_it->is_reverse()); + handle_t cur_handle = vg_subgraph->get_handle(cur_it->node_id(), + cur_it->is_reverse()); + edge_t edge = vg_subgraph->edge_handle(cur_handle, prev_handle); + if (!path_edge_set.count(make_pair(make_pair(vg_subgraph->get_id(edge.first), vg_subgraph->get_is_reverse(edge.first)), + make_pair(vg_subgraph->get_id(edge.second), vg_subgraph->get_is_reverse(edge.second))))) { +#ifdef debug +#pragma omp critical(cerr) + { + cerr << "found discontinuity between when left scanning path in subgraph: " << *cur_it << " and " << *prev_it << endl; + + } +#endif + break; + } + left_padding += cur_it->length; + } + } + start_it = prev_it; + // walk forward the end point + cur_it = end_it; + prev_it = cur_it; + for (++cur_it; cur_it != mappings.end(); ++prev_it, ++cur_it) { + handle_t prev_handle = vg_subgraph->get_handle(prev_it->node_id(), + prev_it->is_reverse()); + handle_t cur_handle = vg_subgraph->get_handle(cur_it->node_id(), + cur_it->is_reverse()); + edge_t edge = vg_subgraph->edge_handle(prev_handle, cur_handle); + if (!path_edge_set.count(make_pair(make_pair(vg_subgraph->get_id(edge.first), vg_subgraph->get_is_reverse(edge.first)), + make_pair(vg_subgraph->get_id(edge.second), vg_subgraph->get_is_reverse(edge.second))))) { +#ifdef debug +#pragma omp critical(cerr) + { + cerr << "found discontinuity between when right scanning path in subgraph: " << *prev_it << " and " << *cur_it << endl; + + } +#endif + break; + } + right_padding += cur_it->length; + } + end_it = prev_it; + + rewrite_paths = start_it != mappings.begin() || end_it != --mappings.end(); - // build the vg - subgraph.extend(g); - subgraph.remove_orphan_edges(); + // cut out nodes before and after discontinuity + mappings.erase(mappings.begin(), start_it); + mappings.erase(++end_it, mappings.end()); + } + // We're clipping at a cycle in the reference path. Just preserve the path as-is from the + // input region. + else { + mappings.clear(); + for (step_handle_t step = start_step; step != end_plus_one_step; step = graph->get_next_step(step)) { + handle_t step_handle = graph->get_handle_of_step(step); + mapping_t mapping; + mapping.set_node_id(graph->get_id(step_handle)); + mapping.set_is_reverse(graph->get_is_reverse(step_handle)); + mappings.push_back(mapping); + } + rewrite_paths = true; + } - // what node contains our input starting position? 
- int64_t input_start_node = xg->node_at_path_position(region.seq, region.start); + // Cut our graph so that our reference path end points are graph tips. This will let the + // snarl finder use the path to find telomeres. + path_handle_t sg_path_handle = vg_subgraph->get_path_handle(region.seq); + Node* start_node = vg_subgraph->get_node(mappings.begin()->node_id()); + auto sg_start_steps = path_steps_of_handle(*vg_subgraph, vg_subgraph->get_handle(start_node->id()), sg_path_handle); + if (rewrite_paths && sg_start_steps.size() == 1) { + if (!mappings.begin()->is_reverse() && vg_subgraph->start_degree(start_node) != 0) { + for (auto edge : vg_subgraph->edges_to(start_node)) { +#ifdef debug +#pragma omp critical(cerr) + { + cerr << "clipping out edge " << pb2json(*edge) << " in order to make path start a tip" << endl; + } +#endif + vg_subgraph->destroy_edge(edge); + } + } else if (mappings.begin()->is_reverse() && vg_subgraph->end_degree(start_node) != 0) { + for (auto edge : vg_subgraph->edges_from(start_node)) { +#ifdef debug +#pragma omp critical(cerr) + { + cerr << "clipping out edge " << pb2json(*edge) << " in order to make path start a tip" << endl; + } +#endif + vg_subgraph->destroy_edge(edge); + } + } + } + Node* end_node = vg_subgraph->get_node(mappings.rbegin()->node_id()); + auto sg_end_steps = path_steps_of_handle(*vg_subgraph, vg_subgraph->get_handle(end_node->id()), sg_path_handle); + if (rewrite_paths && sg_end_steps.size() == 1) { + if (!mappings.rbegin()->is_reverse() && vg_subgraph->end_degree(end_node) != 0) { + for (auto edge : vg_subgraph->edges_from(end_node)) { +#ifdef debug +#pragma omp critical(cerr) + { + cerr << "clipping out edge " << pb2json(*edge) << " in order to make path end a tip" << endl; + } +#endif + vg_subgraph->destroy_edge(edge); + } + } else if (mappings.rbegin()->is_reverse() && vg_subgraph->start_degree(end_node) != 0) { + for (auto edge : vg_subgraph->edges_to(end_node)) { +#ifdef debug +#pragma omp critical(cerr) + { + cerr << "clipping out edge " << pb2json(*edge) << " in order to make path end a tip" << endl; + } +#endif + vg_subgraph->destroy_edge(edge); + } + } + } + + // Sync our updated paths lists back into the Graph protobuf + if (rewrite_paths) { + vg_subgraph->paths.rebuild_node_mapping(); + vg_subgraph->paths.rebuild_mapping_aux(); + vg_subgraph->graph.clear_path(); + vg_subgraph->paths.to_graph(vg_subgraph->graph); + } + + // copy back out of vg if necessary + if (dynamic_cast(&subgraph) == nullptr) { + handlealgs::copy_path_handle_graph(vg_subgraph, &subgraph); + delete vg_subgraph; + } // start could fall inside a node. we find out where in the path the // 0-offset point of the node is. - int64_t input_start_pos = xg->node_start_at_path_position(region.seq, region.start); - assert(input_start_pos <= region.start && - input_start_pos + xg->node_length(input_start_node) > region.start); - - // find out the start position of the first node in the path in the - // subgraph. 
take the last occurance before the input_start_pos - // todo: there are probably some cases involving cycles where this breaks - Path path = subgraph.paths.path(region.seq); - int64_t chunk_start_node = path.mapping(0).position().node_id(); - int64_t chunk_start_pos = -1; - int64_t best_delta = numeric_limits::max(); - vector first_positions = xg->position_in_path(chunk_start_node, region.seq); - for (auto fp : first_positions) { - int64_t delta = input_start_pos - (int64_t)fp; - if (delta >= 0 && delta < best_delta) { - best_delta = delta; - chunk_start_pos = fp; + int64_t input_start_pos = graph->get_position_of_step(start_step); + int64_t input_end_pos = graph->get_position_of_step(end_step); + out_region.seq = region.seq; + out_region.start = input_start_pos - left_padding; + out_region.end = input_end_pos + graph->get_length(end_handle) + right_padding - 1; +} + +void PathChunker::extract_snarls(const Region& region, SnarlManager& snarl_manager, MutablePathMutableHandleGraph& subgraph) { + + // copy over the path extraction code from above: + + // extract our path range into the graph + path_handle_t path_handle = graph->get_path_handle(region.seq); + step_handle_t start_step = graph->get_step_at_position(path_handle, region.start); + handle_t start_handle = graph->get_handle_of_step(start_step); + step_handle_t end_step = graph->get_step_at_position(path_handle, region.end); + handle_t end_handle = graph->get_handle_of_step(end_step); + +#ifdef debug +#pragma omp critical(cerr) + { + cerr << "extracting subgraph range for " << region.seq << ":" << region.start << "-" << region.end + << ", wich maps to handle range " << graph->get_id(start_handle) << ":" << graph->get_is_reverse(start_handle) << "-" + << graph->get_id(end_handle) << ":" << graph->get_is_reverse(end_handle) << endl; + } +#endif + + step_handle_t end_plus_one_step = graph->has_next_step(end_step) ? graph->get_next_step(end_step) : graph->path_end(path_handle) ; + for (step_handle_t step = start_step; step != end_plus_one_step; step = graph->get_next_step(step)) { + handle_t step_handle = graph->get_handle_of_step(step); + if (graph->get_is_reverse(step_handle)) { + step_handle = graph->flip(step_handle); + } + if (!subgraph.has_node(graph->get_id(step_handle))) { + subgraph.create_handle(graph->get_sequence(step_handle), graph->get_id(step_handle)); } } - assert(chunk_start_pos >= 0); - out_region.seq = region.seq; - out_region.start = chunk_start_pos; - out_region.end = out_region.start - 1; - // Is there a better way to get path length? 
- Path output_path = subgraph.paths.path(out_region.seq); - for (size_t j = 0; j < output_path.mapping_size(); ++j) { - int64_t op_node = output_path.mapping(j).position().node_id(); - out_region.end += subgraph.get_node(op_node)->sequence().length(); + // now fill in the snarls using the vg clip api + // todo: we can specifiy multiple regions here + visit_contained_snarls(graph, {region}, snarl_manager, false, + [&](const Snarl* snarl, step_handle_t start_step, step_handle_t end_step, + int64_t start_node, int64_t end_node, bool steps_reversed, + const Region* containing_region) { + + pair, unordered_set > snarl_contents = snarl_manager.deep_contents(snarl, *graph, true); + for (id_t snarl_node : snarl_contents.first) { + if (!subgraph.has_node(snarl_node)) { + subgraph.create_handle(graph->get_sequence(graph->get_handle(snarl_node)), snarl_node); + } + } + + }); + + // now fill in the edges + algorithms::add_connecting_edges_to_subgraph(*graph, subgraph); + + // now fill in the paths + algorithms::add_subpaths_to_subgraph(*graph, subgraph, true); +} + +void PathChunker::extract_path_component(const string& path_name, MutablePathMutableHandleGraph& subgraph, Region& out_region) { + unordered_set path_ids; + + path_handle_t path_handle = graph->get_path_handle(path_name); + for (handle_t handle : graph->scan_path(path_handle)) { + path_ids.insert(graph->get_id(handle)); } + + extract_component(path_ids, subgraph, true); + out_region.seq = path_name; } -void PathChunker::extract_id_range(vg::id_t start, vg::id_t end, int context, int length, - bool forward_only, VG& subgraph, Region& out_region) { +void PathChunker::extract_component(const unordered_set& node_ids, MutablePathMutableHandleGraph& subgraph, bool subpath_naming) { + + for (nid_t node_id : node_ids) { + subgraph.create_handle(graph->get_sequence(graph->get_handle(node_id)), node_id); + } + + algorithms::expand_subgraph_by_steps(*graph, subgraph, numeric_limits::max()); + algorithms::add_subpaths_to_subgraph(*graph, subgraph, subpath_naming); +} - Graph g; +void PathChunker::extract_id_range(vg::id_t start, vg::id_t end, int64_t context, int64_t length, + bool forward_only, MutablePathMutableHandleGraph& subgraph, + Region& out_region) { for (vg::id_t i = start; i <= end; ++i) { - *g.add_node() = xg->node(i); + subgraph.create_handle(graph->get_sequence(graph->get_handle(i)), i); } - + // expand the context and get path information // if forward_only true, then we only go forward. 
- xg->expand_context(g, context, true, true, true, !forward_only); + algorithms::expand_subgraph_by_steps(*graph, subgraph, context, forward_only); if (length) { - xg->expand_context(g, context, true, false, true, !forward_only); + algorithms::expand_subgraph_by_length(*graph, subgraph, context, forward_only); } + algorithms::add_subpaths_to_subgraph(*graph, subgraph, true); // build the vg - subgraph.extend(g); - subgraph.remove_orphan_edges(); - out_region.start = subgraph.min_node_id(); out_region.end = subgraph.max_node_id(); } +set, pair>> PathChunker::get_path_edge_index(step_handle_t start_step, + step_handle_t end_step, int64_t context) const { + // we don't use handles as we're going to use this structure to compare edges across different graphs + set, pair>> path_edges; + + function add_edge = [&](step_handle_t step) { + step_handle_t next = graph->get_next_step(step); + edge_t edge = graph->edge_handle(graph->get_handle_of_step(step), graph->get_handle_of_step(next)); + path_edges.insert(make_pair(make_pair(graph->get_id(edge.first), graph->get_is_reverse(edge.first)), + make_pair(graph->get_id(edge.second), graph->get_is_reverse(edge.second)))); + }; + + // edges from left context + int i = 0; + for (step_handle_t step = start_step; graph->has_previous_step(step) && i <= context; + step = graph->get_previous_step(step), ++i) { + add_edge(graph->get_previous_step(step)); + } + + // edges from range + for (step_handle_t step = start_step; step != end_step; step = graph->get_next_step(step)) { + if (graph->has_next_step(step)) { + add_edge(step); + } + } + + // edges from right context + i = 0; + for (step_handle_t step = end_step; graph->has_next_step(step) && i <= context; + step = graph->get_next_step(step), ++i) { + add_edge(step); + } + + return path_edges; +} + + } diff --git a/src/chunker.hpp b/src/chunker.hpp index 6fdb05e3707..b9d88aeb60d 100644 --- a/src/chunker.hpp +++ b/src/chunker.hpp @@ -5,11 +5,10 @@ #include #include #include -#include "lru_cache.h" -#include "vg.hpp" -#include "xg.hpp" -#include "json2pb.h" +#include "vg/io/json2pb.h" #include "region.hpp" +#include "handle.hpp" +#include "snarls.hpp" namespace vg { @@ -24,10 +23,10 @@ class PathChunker { public: - // xg index used for all path splitting and subgraphing operations - xg::XG* xg; + // graph used for all path splitting and subgraphing operations + const PathPositionHandleGraph* graph; - PathChunker(xg::XG* xg = NULL); + PathChunker(const PathPositionHandleGraph* graph = NULL); ~PathChunker(); /** Extract subgraph corresponding to given path region into its @@ -36,17 +35,40 @@ class PathChunker { * cut nodes) are written to out_region. If forward_only set, context * is only expanded in the forward direction * - * NOTE: we follow convention of Region coordinates being 1-based + * NOTE: we follow convention of Region coordinates being 0-based * inclusive. * */ - void extract_subgraph(const Region& region, int context, int length, bool forward_only, - VG& subgraph, Region& out_region); + void extract_subgraph(const Region& region, int64_t context, int64_t length, bool forward_only, + MutablePathMutableHandleGraph& subgraph, Region& out_region); + + + /** Extract the region along the given path, and any snarls fully contained in it. 
This will often + * give more intuitive results than messing with context steps, which can run way + * outside the region of interest */ + void extract_snarls(const Region& region, SnarlManager& snarl_manager, + MutablePathMutableHandleGraph& subgraph); + + /** + * Extract a connected component containing a given path + */ + void extract_path_component(const string& path_name, MutablePathMutableHandleGraph& subgraph, Region& out_region); + + /** + * Extract a connected component starting from an id set + */ + void extract_component(const unordered_set& node_ids, MutablePathMutableHandleGraph& subgraph, bool subpath_naming); /** * Like above, but use (inclusive) id range instead of region on path. */ - void extract_id_range(vg::id_t start, vg::id_t end, int context, int length, bool forward_only, - VG& subgraph, Region& out_region); + void extract_id_range(vg::id_t start, vg::id_t end, int64_t context, int64_t length, bool forward_only, + MutablePathMutableHandleGraph& subgraph, Region& out_region); + + /** + * Get a set of all edges in the graph along a path region (to check for discontinuities later on) + */ + set, pair>> get_path_edge_index(step_handle_t start_step, + step_handle_t end_step, int64_t context) const; }; diff --git a/src/clip.cpp b/src/clip.cpp new file mode 100644 index 00000000000..0d2c5efd78d --- /dev/null +++ b/src/clip.cpp @@ -0,0 +1,1211 @@ +#include "clip.hpp" +#include "traversal_finder.hpp" +#include +#include +#include +#include +#include "bdsg/internal/hash_map.hpp" +#include "bdsg/internal/packed_structs.hpp" + +//#define debug + +namespace vg { + +using namespace std; + +// find the snarl's spanning interval on every reference path traversal through it +// as soon as one of these intervals is found that is contained within an interval in the input index +// then return it (or nullptr if none found) +// also return the snarl's interval (as pair of offsets) in the path +// this logic is mostly lifted from deconstructor which does the same thing to get vcf coordinates. 
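+// (illustration, with made-up coordinates: for input regions chr1:1000-5000 and chr1:2000-2500, a snarl whose
+// reference traversal spans chr1:1500-2200 is contained only by the first region, so that region is returned;
+// when several regions contain a traversal, the ranking below prefers the largest region, then the largest traversal)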
+static tuple get_containing_region(const PathPositionHandleGraph* graph, + PathTraversalFinder& trav_finder, + const Snarl* snarl, + unordered_map>& contig_to_interval_tree, + bool include_endpoints) { + + // every path through the snarl + pair, vector > > travs = trav_finder.find_path_traversals(*snarl); + + // we sort by (region-size, interval-size) to choose the biggest interval from the biggest contig + // intuition: pggb graph with tons of unplaced contigs and one reference contig -- we want the reference contig + // then in all the possible traversals of that reference contig, we take the biggest (which should fit better + // with the other heuristics) + multimap, + tuple> ranked_intervals; + + // check each one against the interval tree + for (size_t i = 0; i < travs.first.size(); ++i) { + auto& step_pair = travs.second[i]; + auto& ref_trav = travs.first[i]; + + path_handle_t path_handle = graph->get_path_handle_of_step(step_pair.first); + string path_name = graph->get_path_name(path_handle); + int64_t path_offset = 0; + subrange_t subrange; + path_name = Paths::strip_subrange(path_name, &subrange); + if (subrange != PathMetadata::NO_SUBRANGE) { + path_offset = subrange.first; + } + + if (contig_to_interval_tree.count(path_name)) { + + IntervalTree& interval_tree = contig_to_interval_tree.at(path_name); + + // first_path_pos computation copied from deconstructor.cpp (it does not include the start node) + step_handle_t start_step = step_pair.first; + step_handle_t end_step = step_pair.second; + handle_t start_handle = graph->get_handle_of_step(start_step); + handle_t end_handle = graph->get_handle_of_step(end_step); + size_t start_pos = graph->get_position_of_step(start_step); + size_t end_pos = graph->get_position_of_step(end_step); + bool use_start = start_pos < end_pos; + handle_t first_path_handle = use_start ? start_handle : end_handle; + int64_t first_path_pos = use_start ? start_pos : end_pos; + // Get the first visit of our snarl traversal + const Visit& first_trav_visit = use_start ? ref_trav.visit(0) : ref_trav.visit(ref_trav.visit_size() - 1); + if ((use_start && first_trav_visit.backward() == graph->get_is_reverse(first_path_handle)) || + (!use_start && first_trav_visit.backward() != graph->get_is_reverse(first_path_handle))) { + // Our path and traversal have consistent orientation. leave off the end of the start node going forward + first_path_pos += graph->get_length(first_path_handle); + } + + size_t length_from_start = 0; + for (size_t j = 1; j < ref_trav.visit_size() - 1; ++j) { + length_from_start += graph->get_length(graph->get_handle(ref_trav.visit(j).node_id())); + } + + if (include_endpoints) { + first_path_pos -= graph->get_length(first_path_handle); + length_from_start += graph->get_length(graph->get_handle(ref_trav.visit(ref_trav.visit_size() - 1).node_id())); + } + int64_t last_path_pos = length_from_start == 0 ? 
first_path_pos : first_path_pos + length_from_start - 1; + auto overlapping_intervals = interval_tree.findOverlapping(first_path_pos, last_path_pos); + int64_t traversal_interval_length = last_path_pos - first_path_pos + 1; + for (auto& interval : overlapping_intervals) { + if (interval.start <= first_path_pos && interval.stop >= last_path_pos) { + int64_t region_interval_length = interval.stop - interval.start + 1; + ranked_intervals.insert(make_pair(make_pair(region_interval_length, traversal_interval_length), + make_tuple(interval.value, start_step, end_step, first_path_pos, last_path_pos, !use_start))); + } + } + } + } + + + if (!ranked_intervals.empty()) { + return ranked_intervals.rbegin()->second; + } + return make_tuple(nullptr, step_handle_t(), step_handle_t(), -1, -1, false); +} + +void visit_contained_snarls(const PathPositionHandleGraph* graph, const vector& regions, SnarlManager& snarl_manager, + bool include_endpoints, + function visit_fn) { + + // make an interval tree of regions for each contig + unordered_map::interval>> region_intervals; + for (const Region & region : regions) { + vector::interval>& intervals = region_intervals[region.seq]; + intervals.push_back(IntervalTree::interval(region.start, region.end, ®ion)); + } + unordered_map> contig_to_interval_tree; + for (auto seq_intervals : region_intervals) { + IntervalTree interval_tree(std::move(seq_intervals.second)); + contig_to_interval_tree[seq_intervals.first] = std::move(interval_tree); + } + region_intervals.clear(); + + // make a path traversal finder for all affected reference paths taking into account + // subpaths in the graph + unordered_set path_name_set; + for (const Region& region : regions) { + path_name_set.insert(region.seq); + } + unordered_set graph_path_name_set; + graph->for_each_path_handle([&](path_handle_t path_handle) { + string graph_path_name = graph->get_path_name(path_handle); + if (path_name_set.count(Paths::strip_subrange(graph_path_name))) { + graph_path_name_set.insert(graph_path_name); + } + }); + vector path_names; + for (const string& path_name : graph_path_name_set) { + path_names.push_back(path_name); + } + path_name_set.clear(); + graph_path_name_set.clear(); + PathTraversalFinder trav_finder(*graph, snarl_manager, path_names); + + // Do the top-level snarls, the recurse as needed (framework copied from deconstructor.cpp) + snarl_manager.for_each_top_level_snarl([&](const Snarl* snarl) { + vector todo(1, snarl); + vector next; + while (!todo.empty()) { + for (auto next_snarl : todo) { + auto containing_region_info = get_containing_region(graph, trav_finder, next_snarl, contig_to_interval_tree, include_endpoints); + if (get<0>(containing_region_info) != nullptr) { + visit_fn(next_snarl, get<1>(containing_region_info), get<2>(containing_region_info), get<3>(containing_region_info), + get<4>(containing_region_info), get<5>(containing_region_info), get<0>(containing_region_info)); + } else { + const vector& children = snarl_manager.children_of(next_snarl); + next.insert(next.end(), children.begin(), children.end()); + } + } + swap(todo, next); + next.clear(); + } + }); +} + +// note: end_step is after the subpath +// end_offset is also one-past the interval +static path_handle_t create_path_fragment(MutablePathMutableHandleGraph* graph, const string& base_name, step_handle_t first_step, + step_handle_t end_step, int64_t start_offset, int64_t end_offset) { + assert(end_offset > start_offset); + PathSense sense; + std::string sample; + std::string locus; + size_t haplotype; + size_t 
phase_block; + subrange_t subrange; + PathMetadata::parse_path_name(base_name, sense, sample, locus, haplotype, phase_block, subrange); + assert(subrange == PathMetadata::NO_SUBRANGE); + subrange.first = start_offset; + subrange.second = end_offset; + string subpath_name = PathMetadata::create_path_name(sense, sample, locus, haplotype, phase_block, subrange); +#ifdef debug + cerr << "making fragment " << subpath_name << endl; +#endif + path_handle_t subpath_handle = graph->create_path_handle(subpath_name); + for (step_handle_t step = first_step; step != end_step; step = graph->get_next_step(step)) { + graph->append_step(subpath_handle, graph->get_handle_of_step(step)); + } + return subpath_handle; +} + +// note: clip-vg.cpp has a more general version (if ever needed) that can chop nodes on path positions +void delete_nodes_and_chop_paths(MutablePathMutableHandleGraph* graph, const unordered_set& nodes_to_delete, + const unordered_set& edges_to_delete, int64_t min_fragment_len, + unordered_map* fragments_per_path) { + // chop the paths + vector path_handles; + graph->for_each_path_handle([&](path_handle_t path_handle) { + path_handles.push_back(path_handle); + }); + for (path_handle_t& path_handle : path_handles) { + + string path_name = graph->get_path_name(path_handle); +#ifdef debug + cerr << "processing path " << path_name << endl; +#endif + int64_t path_offset = 0; + subrange_t subrange; + path_name = Paths::strip_subrange(path_name, &subrange); + if (subrange != PathMetadata::NO_SUBRANGE) { + path_offset = subrange.first; + } + + int64_t cur_step_offset = path_offset; + step_handle_t start_step = graph->path_begin(path_handle); + int64_t start_step_offset = cur_step_offset; + step_handle_t prev_step; + step_handle_t end_step = graph->path_end(path_handle); + bool in_path = !nodes_to_delete.count(graph->get_id(graph->get_handle_of_step(start_step))); + bool was_chopped = false; + for (step_handle_t cur_step = start_step; cur_step != end_step; cur_step = graph->get_next_step(cur_step)) { + handle_t cur_handle = graph->get_handle_of_step(cur_step); + nid_t cur_id = graph->get_id(cur_handle); + bool step_deleted = nodes_to_delete.count(cur_id); + bool edge_deleted = false; + if (in_path && cur_step_offset > start_step_offset) { + if (!step_deleted) { + edge_deleted = edges_to_delete.count(graph->edge_handle(graph->get_handle_of_step(prev_step), cur_handle)); + } + if (step_deleted || edge_deleted) { + // we hit a deleted node (or edge): make a path fragment for eveything to it + if (step_deleted || cur_step_offset - start_step_offset >= min_fragment_len) { + create_path_fragment(graph, path_name, start_step, cur_step, start_step_offset, cur_step_offset); + if (fragments_per_path) { + ++(*fragments_per_path)[path_name]; + } + } + } + if (edge_deleted) { // need to handle this case by popping off the path now + in_path = false; + } + } + if (!in_path && !step_deleted) { + // we hit the first undeleted node after a run of deleteions, start a new fragment + start_step_offset = cur_step_offset; + start_step = cur_step; + } + in_path = !step_deleted; + cur_step_offset += graph->get_length(cur_handle); + prev_step = cur_step; + was_chopped = was_chopped || step_deleted || edge_deleted; + } + + if (was_chopped && in_path && cur_step_offset > start_step_offset) { + // get that last fragment + if (cur_step_offset - start_step_offset >= min_fragment_len) { + create_path_fragment(graph, path_name, start_step, graph->path_end(path_handle), start_step_offset, cur_step_offset); + if (fragments_per_path) 
{ + ++(*fragments_per_path)[path_name]; + } + } + } + + if (was_chopped) { + graph->destroy_path(path_handle); + } + } + + DeletableHandleGraph* del_graph = dynamic_cast(graph); + // delete the edges + for (edge_t edge : edges_to_delete) { + del_graph->destroy_edge(edge); + } + + // finally, delete the nodes + for (nid_t node_id : nodes_to_delete) { + handle_t handle = graph->get_handle(node_id); + assert(graph->steps_of_handle(handle).empty()); + del_graph->destroy_handle(handle); + } + +} + +// determine if a snarl is complex enough to warrant flattening by measuring some very basic stats +static bool snarl_is_complex(PathPositionHandleGraph* graph, const Snarl* snarl, + const pair, unordered_set >& contents, + const pair, unordered_set >& contents_shallow, + int64_t ref_interval_length, const Region& region, path_handle_t path_handle, + size_t max_nodes, size_t max_edges, + size_t max_nodes_shallow, size_t max_edges_shallow, + double max_avg_degree, double max_reflen_prop, size_t max_reflen, + double& out_avg_degree) { + + out_avg_degree = -1.; + + // if our snarl is to big vs the reference path, we do not process it + double ref_prop = (double)ref_interval_length / (double)graph->get_path_length(path_handle); + if (ref_prop > max_reflen_prop || ref_interval_length > max_reflen) { +#ifdef debug + cerr << "skipping snarl " << pb2json(*snarl) << " with interval length " << ref_interval_length + << " because its ref_prop of " << region.seq << " is " << ref_prop << " which is greater than " << max_reflen_prop + << " or its ref length " << ref_interval_length << " is greater than " << max_reflen << endl; +#endif + return false; + } + + // check the stats + bool filter_on = max_nodes > 0 || max_edges > 0 || max_nodes_shallow > 0 || max_edges_shallow > 0 || max_avg_degree > 0.; + bool complex_nodes = contents.first.size() > max_nodes; + bool complex_edges = contents.second.size() > max_edges; + bool complex_nodes_shallow = contents_shallow.first.size() > max_nodes_shallow; + bool complex_edges_shallow = contents_shallow.second.size() > max_edges_shallow; + size_t total_degree = 0; + for (id_t node_id : contents.first) { + handle_t handle = graph->get_handle(node_id); + total_degree += graph->get_degree(handle, true) + graph->get_degree(handle, false); + } + // degree averaged over node sides to be a bit more intuitive, hence 2X in denominator: + out_avg_degree = (double)total_degree / (2. 
*(double)contents.first.size()); + bool complex_degree = out_avg_degree > max_avg_degree; + + return !filter_on || (complex_nodes && complex_edges && complex_nodes_shallow && complex_edges_shallow && complex_degree); +} + +void clip_contained_snarls(MutablePathMutableHandleGraph* graph, PathPositionHandleGraph* pp_graph, const vector& regions, + SnarlManager& snarl_manager, bool include_endpoints, int64_t min_fragment_len, + size_t max_nodes, size_t max_edges, size_t max_nodes_shallow, size_t max_edges_shallow, + double max_avg_degree, double max_reflen_prop, size_t max_reflen, + bool out_bed, bool verbose) { + + // find all nodes in the snarl that are not on the reference interval (reference path name from containing interval) + unordered_set nodes_to_delete; + + // and all the edges + unordered_set edges_to_delete; + + // just for logging + unordered_map clip_counts; + + // for making the whitelist + unordered_set ref_prefixes; + for (const Region& region : regions) { + ref_prefixes.insert(region.seq); + } + + visit_contained_snarls(pp_graph, regions, snarl_manager, include_endpoints, [&](const Snarl* snarl, step_handle_t start_step, step_handle_t end_step, + int64_t start_pos, int64_t end_pos, + bool steps_reversed, const Region* containing_region) { + +#ifdef debug + cerr << "Clipping snarl " << pb2json(*snarl) << " because it lies in region " + << containing_region->seq << ":" << containing_region->start << "-" << containing_region->end << endl; + cerr << "Passed in steps are " << pp_graph->get_id(pp_graph->get_handle_of_step(start_step)) << ":" << pp_graph->get_is_reverse(pp_graph->get_handle_of_step(start_step)) << " - " + << pp_graph->get_id(pp_graph->get_handle_of_step(end_step)) << ":" << pp_graph->get_is_reverse(pp_graph->get_handle_of_step(end_step)) << endl; +#endif + + unordered_set whitelist; + if (steps_reversed) { + step_handle_t past_end_step = pp_graph->get_previous_step(end_step); + for (step_handle_t step = start_step ; step != past_end_step; step = graph->get_previous_step(step)) { + whitelist.insert(pp_graph->get_id(pp_graph->get_handle_of_step(step))); + } + } else { + step_handle_t past_end_step = pp_graph->get_next_step(end_step); + for (step_handle_t step = start_step ; step != past_end_step; step = graph->get_next_step(step)) { + whitelist.insert(pp_graph->get_id(pp_graph->get_handle_of_step(step))); + } + } + + edge_t deletion_edge = graph->edge_handle(graph->get_handle(snarl->start().node_id(), snarl->start().backward()), + graph->get_handle(snarl->end().node_id(), snarl->end().backward())); + bool deletion_on_whitelist = false; + // check if the snarl-spanning deletion edge is on a reference path. if it is, we whitelist it, otherwise it + // goes. 
this gets treated separately as it would not otherwise get deleted by erasing every node in the snarl contents + if (graph->has_edge(deletion_edge)) { + graph->for_each_step_on_handle(graph->get_handle(snarl->start().node_id(), snarl->start().backward()), [&](step_handle_t step_handle) { + string path_name = graph->get_path_name(graph->get_path_handle_of_step(step_handle)); + for (const string& ref_prefix : ref_prefixes) { + if (path_name.compare(0, ref_prefix.length(), ref_prefix) == 0) { + step_handle_t next_step = graph->get_next_step(step_handle); + if (next_step != graph->path_end(graph->get_path_handle_of_step(step_handle)) && + graph->edge_handle(graph->get_handle_of_step(step_handle), graph->get_handle_of_step(next_step)) == deletion_edge) { + deletion_on_whitelist = true; + return false; + } + step_handle_t prev_step = graph->get_previous_step(step_handle); + if (prev_step != graph->path_front_end(graph->get_path_handle_of_step(step_handle)) && + graph->edge_handle(graph->get_handle_of_step(prev_step), graph->get_handle_of_step(step_handle)) == deletion_edge) { + deletion_on_whitelist = true; + return false; + } + } + } + return true; + }); + } + + size_t ref_interval_length = 0; + for (nid_t node_id : whitelist) { + // don't count snarl ends here. todo: should this be an option? + if (node_id != snarl->start().node_id() && node_id != snarl->end().node_id()) { + ref_interval_length += pp_graph->get_length(pp_graph->get_handle(node_id)); + } + } + path_handle_t path_handle = pp_graph->get_path_handle_of_step(start_step); + pair, unordered_set > contents = snarl_manager.deep_contents(snarl, *pp_graph, false); + pair, unordered_set > contents_shallow = snarl_manager.shallow_contents(snarl, *pp_graph, false); + // add other reference paths to the whitelist to make sure they don't get cut + if (!ref_prefixes.empty()) { + for (const id_t& node_id : contents.first) { + if (!whitelist.count(node_id)) { + graph->for_each_step_on_handle(graph->get_handle(node_id), [&](step_handle_t step_handle) { + string path_name = graph->get_path_name(graph->get_path_handle_of_step(step_handle)); + for (const string& ref_prefix : ref_prefixes) { + if (path_name.compare(0, ref_prefix.length(), ref_prefix) == 0) { + whitelist.insert(node_id); + return false; + } + } + return true; + }); + } + } + } + + double avg_degree = -1; + if (snarl_is_complex(pp_graph, snarl, contents, contents_shallow, ref_interval_length, *containing_region, path_handle, max_nodes, max_edges, + max_nodes_shallow, max_edges_shallow, max_avg_degree, max_reflen_prop, max_reflen, avg_degree)) { + if (out_bed) { + string snarl_name = (snarl->start().backward() ? "<" : ">") + std::to_string(snarl->start().node_id()) + + (snarl->end().backward() ? 
"<" : ">") + std::to_string(snarl->end().node_id()); + cout << containing_region->seq << "\t" << start_pos << "\t" << (end_pos + 1) << "\t" << snarl_name << "\t" + << contents.first.size() << "\t" << contents.second.size() << "\t" + << contents_shallow.first.size() << "\t" << contents_shallow.second.size() << "\t" + << avg_degree << "\n"; + } else { + for (id_t node_id : contents.first) { + if (!whitelist.count(node_id)) { + nodes_to_delete.insert(node_id); + ++clip_counts[containing_region->seq]; + } + } + // since we're deleting all alt alleles, the only edge that could be left is a snarl-spanning deletion + if (!deletion_on_whitelist && graph->has_edge(deletion_edge)) { + edges_to_delete.insert(deletion_edge); + } + } + } +#ifdef debug + cerr << "snarl was not deemed complex enough to clip" << endl; +#endif + }); + + if (verbose && !out_bed) { + if (clip_counts.size() > 1) { + for (const auto& kv : clip_counts) { + cerr << "[vg-clip]: Removing " << kv.second << " nodes due to intervals on path " << kv.first << endl; + } + } + cerr << "[vg-clip]: Removing total of " << nodes_to_delete.size() << " nodes and " << edges_to_delete.size() << " snarl-spanning edges from snarls in regions" << endl; + clip_counts.clear(); + } + + // cut out the nodes and chop up paths + if (!out_bed) { + delete_nodes_and_chop_paths(graph, nodes_to_delete, edges_to_delete, min_fragment_len, verbose ? &clip_counts : nullptr); + if (verbose) { + for (const auto& kv : clip_counts) { + cerr << "[vg-clip]: Creating " << kv.second << " fragments from path " << kv.first << endl; + } + clip_counts.clear(); + } + } +} + +struct BBEdgeHash { + uint64_t operator()(const edge_t& edge, uint64_t seed = 0xAAAAAAAA55555555ULL) const { + uint64_t hsh1 = boomphf::SingleHashFunctor()(as_integer(edge.first), seed); + uint64_t hsh2 = boomphf::SingleHashFunctor()(as_integer(edge.second), seed); + // Boost combine for hash values + return hsh1 ^ (hsh2 + 0x9e3779b9 + (hsh1<<6) + (hsh1>>2)); + } +}; + +void clip_low_depth_nodes_and_edges_generic(MutablePathMutableHandleGraph* graph, + function)> iterate_handles, + function)> iterate_edges, + int64_t min_depth, const vector& ref_prefixes, + int64_t min_fragment_len, bool verbose) { + + // find all nodes in the snarl that are not on the reference interval (reference path name from containing interval) + unordered_set to_delete; + + // just for logging + unordered_map clip_counts; + + function check_prefixes = [&ref_prefixes] (const string& path_name) { + for (const string& ref_prefix : ref_prefixes) { + if (path_name.compare(0, ref_prefix.length(), ref_prefix) == 0) { + return true; + } + } + return false; + }; + + function visit_handle = [&](handle_t handle, const Region* region) { + bool on_ref = false; + size_t depth = 0; + graph->for_each_step_on_handle(handle, [&](step_handle_t step_handle) { + ++depth; + if (depth > min_depth || on_ref) { + return false; + } + if (!ref_prefixes.empty() || region) { + // if we have a region, do exact comparison to it. 
+ // otherwise, do a prefix check against ref_prefix + string path_name = graph->get_path_name(graph->get_path_handle_of_step(step_handle)); + if ((region && region->seq == path_name) || (!region && check_prefixes(path_name))) { + on_ref = true; + return false; + } + } + return true; + }); + if (!on_ref && depth < min_depth) { + to_delete.insert(graph->get_id(handle)); + } + }; + + iterate_handles(visit_handle); + + if (verbose) { + cerr << "[vg-clip]: Removing " << to_delete.size() << " nodes with path coverage less than " << min_depth << endl; + } + + // now do the edges + vector edges; + graph->for_each_edge([&](edge_t edge) { + edges.push_back(edge); + }); + size_t edge_count = edges.size(); + boomphf::mphf edge_hash(edge_count, edges, get_thread_count(), 2.0, false, false); + edges.clear(); + bdsg::PackedVector<> edge_depths; + edge_depths.resize(edge_count + 1); + + graph->for_each_path_handle([&](path_handle_t path_handle) { + bool is_ref_path = check_prefixes(graph->get_path_name(path_handle)); + handle_t prev_handle; + bool first = true; + graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { + handle_t handle = graph->get_handle_of_step(step_handle); + if (!first) { + edge_t edge = graph->edge_handle(prev_handle, handle); + size_t edge_rank = edge_hash.lookup(edge); + int64_t edge_depth = edge_depths.get(edge_rank); + if (edge_depth < min_depth) { + if (is_ref_path) { + // we never want to remove and edge on a reference path, + // so automatically bump such edges past the threshold + edge_depths.set(edge_rank, min_depth); + } else { + edge_depths.set(edge_rank, edge_depth + 1); + } + } + } else { + first = false; + } + prev_handle = handle; + }); + }); + + unordered_set edges_to_delete; + function visit_edge = [&](edge_t edge, const Region* region) { + size_t edge_rank = edge_hash.lookup(edge); + if (edge_depths.get(edge_rank) < min_depth) { + edges_to_delete.insert(edge); + } + }; + + iterate_edges(visit_edge); + + if (verbose) { + cerr << "[vg-clip]: Removing " << edges_to_delete.size() << " edges with path coverage less than " << min_depth << endl; + } + + // cut out the nodes and chop up paths + delete_nodes_and_chop_paths(graph, to_delete, edges_to_delete, min_fragment_len, verbose ? 
&clip_counts : nullptr); + + if (verbose) { + for (const auto& kv : clip_counts) { + cerr << "[vg-clip]: Creating " << kv.second << " fragments from path" << kv.first << endl; + } + clip_counts.clear(); + } + + // use the reference path prefix (if given) to clip out components that aren't anchored to it + // (this would take care of above filter, but we leave that one as it's not dependent on path name) + if (!ref_prefixes.empty()) { + size_t removed_node_count = 0; + size_t removed_component_count = 0; + vector> components = handlealgs::weakly_connected_components(graph); + for (auto& component : components) { + bool ref_anchored = false; + for (auto ni = component.begin(); !ref_anchored && ni != component.end(); ++ni) { + vector steps = graph->steps_of_handle(graph->get_handle(*ni)); + for (size_t si = 0; !ref_anchored && si < steps.size(); ++si) { + string step_path_name = graph->get_path_name(graph->get_path_handle_of_step(steps[si])); + if (check_prefixes(step_path_name)) { + ref_anchored = true; + } + } + } + if (!ref_anchored) { + ++removed_component_count; + for (auto node_id : component) { + handle_t node_handle = graph->get_handle(node_id); + dynamic_cast(graph)->destroy_handle(node_handle); + ++removed_node_count; + } + } + } + if (verbose) { + cerr << "[vg-clip]: Removing " << removed_node_count << " nodes in " << removed_component_count << " disconnected components" << endl; + } + } +} + +void clip_low_depth_nodes_and_edges(MutablePathMutableHandleGraph* graph, int64_t min_depth, const vector& ref_prefixes, + int64_t min_fragment_len, bool verbose) { + + function)> iterate_handles = [&] (function visit_handle) { + graph->for_each_handle([&](handle_t handle) { + visit_handle(handle, nullptr); + }); + }; + + function)> iterate_edges = [&] (function visit_edge) { + graph->for_each_edge([&](edge_t edge) { + visit_edge(edge, nullptr); + }); + }; + + clip_low_depth_nodes_and_edges_generic(graph, iterate_handles, iterate_edges, min_depth, ref_prefixes, min_fragment_len, verbose); +} + +void clip_contained_low_depth_nodes_and_edges(MutablePathMutableHandleGraph* graph, PathPositionHandleGraph* pp_graph, const vector& regions, + SnarlManager& snarl_manager, bool include_endpoints, int64_t min_depth, int64_t min_fragment_len, bool verbose) { + + function)> iterate_handles = [&] (function visit_handle) { + + visit_contained_snarls(pp_graph, regions, snarl_manager, include_endpoints, [&](const Snarl* snarl, step_handle_t start_step, step_handle_t end_step, + int64_t start_pos, int64_t end_pos, + bool steps_reversed, const Region* containing_region) { + + pair, unordered_set > contents = snarl_manager.deep_contents(snarl, *pp_graph, false); + for (id_t node_id : contents.first) { + visit_handle(graph->get_handle(node_id), containing_region); + } + }); + }; + + // todo: duplicating this is very wasteful, and is only happening because edge support was added after the fact. 
+ // something needs to be refactored in order to fix this, but it's a fairly esoteric codepath and may not be worth it + function)> iterate_edges = [&] (function visit_edge) { + + visit_contained_snarls(pp_graph, regions, snarl_manager, include_endpoints, [&](const Snarl* snarl, step_handle_t start_step, step_handle_t end_step, + int64_t start_pos, int64_t end_pos, + bool steps_reversed, const Region* containing_region) { + + pair, unordered_set > contents = snarl_manager.deep_contents(snarl, *pp_graph, false); + for (const edge_t& edge : contents.second) { + visit_edge(edge, containing_region); + } + }); + }; + + // the edge depths are computed globally, without looking at regions. as such, they need some notion of reference paths + // so we shimmy a set in from the regions + set ref_path_set; + for (const Region& region : regions) { + ref_path_set.insert(region.seq); + } + vector ref_paths_from_regions(ref_path_set.begin(), ref_path_set.end()); + + clip_low_depth_nodes_and_edges_generic(graph, iterate_handles, iterate_edges, min_depth, ref_paths_from_regions, min_fragment_len, verbose); + +} + +// we avoid the path position interface since we only want reference coordinates +static unordered_map> make_ref_index(PathHandleGraph* graph, path_handle_t ref_path, + unordered_set& out_ref_edges) { + unordered_map> handle_to_position; + int64_t pos = 0; + handle_t prev_handle; + bool has_prev = false; + graph->for_each_step_in_path(ref_path, [&](step_handle_t step_handle) { + handle_t handle = graph->get_handle_of_step(step_handle); + int64_t handle_len = graph->get_length(handle); + handle_to_position[handle].push_back(pos); + handle_to_position[graph->flip(handle)].push_back(pos + handle_len - 1); + pos += handle_len; + if (has_prev) { + out_ref_edges.insert(graph->edge_handle(prev_handle, handle)); + } + has_prev = true; + prev_handle = handle; + }); + return handle_to_position; +} + +// walk context steps out from reference path, flagging each node encountered with its +// minimum and maximum position on the path +static multimap find_deletion_candidate_edges(PathHandleGraph* graph, path_handle_t ref_path, + const unordered_map>& handle_to_position, + int64_t max_deletion, int64_t context_steps, + const unordered_set& edge_blacklist) { + vector> pos_handles; + int64_t cur_pos = 0; + graph->for_each_step_in_path(ref_path, [&](step_handle_t step_handle) { + handle_t handle = graph->get_handle_of_step(step_handle); + pos_handles.push_back(make_pair(cur_pos, handle)); + cur_pos += graph->get_length(handle); + }); + + vector> id_to_min_pos_threads(get_thread_count()); + vector> id_to_max_pos_threads(get_thread_count()); + +#pragma omp parallel for + for (size_t i = 0; i < pos_handles.size(); ++i) { + int64_t pos = pos_handles[i].first; + handle_t handle = pos_handles[i].second; + unordered_map& id_to_min_pos = id_to_min_pos_threads[omp_get_thread_num()]; + unordered_map& id_to_max_pos = id_to_max_pos_threads[omp_get_thread_num()]; + + // scan a context of our current step, trying to avoid touching back + // on the reference path + // todo: can do better job of constraining with more step_on_path lookups + unordered_set context; + vector cur_handles = {handle}; + for (int64_t i = 0; i < context_steps; ++i) { + vector next_handles; + for (auto& h : cur_handles) { + nid_t cur_id = graph->get_id(h); + if (!context.count(cur_id)) { + context.insert(cur_id); + graph->follow_edges(h, false, [&](handle_t n) { + if (!edge_blacklist.count(graph->edge_handle(h, n)) && !handle_to_position.count(n)) { + 
next_handles.push_back(n); + } + }); + graph->follow_edges(h, true, [&](handle_t p) { + if (!edge_blacklist.count(graph->edge_handle(p, h)) && !handle_to_position.count(p)) { + next_handles.push_back(p); + } + }); + } + } + cur_handles = std::move(next_handles); + } + + // assig everything in the context to the current position + for (nid_t id : context) { + auto it = id_to_min_pos.find(id); + if (it == id_to_min_pos.end()) { + id_to_min_pos[id] = pos; + id_to_max_pos[id] = pos; + } else { + id_to_min_pos[id] = min(pos, it->second); + id_to_max_pos[id] = max(pos, id_to_max_pos.at(id)); + } + } + } + + for (size_t i = 1; i < id_to_max_pos_threads.size(); ++i) { + for (const auto& id_max : id_to_max_pos_threads[i]) { + id_to_max_pos_threads[0][id_max.first] = max(id_to_max_pos_threads[0][id_max.first], id_max.second); + } + id_to_max_pos_threads[i].clear(); + for (const auto& id_min : id_to_min_pos_threads[i]) { + if (id_to_min_pos_threads[0].count(id_min.first)) { + id_to_min_pos_threads[0][id_min.first] = min(id_to_min_pos_threads[0][id_min.first], id_min.second); + } else { + id_to_min_pos_threads[0][id_min.first] = id_min.second; + } + } + id_to_min_pos_threads[i].clear(); + } + + auto& id_to_min_pos = id_to_min_pos_threads[0]; + auto& id_to_max_pos = id_to_max_pos_threads[0]; + + // scan every edge to find minimum distance according to positions found above + multimap length_to_edge; + unordered_set edges_visited; + vector neighbours; + for (const auto& id_pos : id_to_min_pos) { + handle_t handle = graph->get_handle(id_pos.first); + graph->follow_edges(handle, false, [&] (handle_t next) { + edge_t edge = graph->edge_handle(handle, next); + if (id_to_min_pos.count(graph->get_id(next)) && !edges_visited.count(edge)) { + edges_visited.insert(edge); + neighbours.push_back(edge); + } + }); + graph->follow_edges(handle, true, [&] (handle_t next) { + edge_t edge = graph->edge_handle(next, handle); + if (id_to_min_pos.count(graph->get_id(next)) && !edges_visited.count(edge)) { + edges_visited.insert(edge); + neighbours.push_back(edge); + } + }); + for (const edge_t& edge : neighbours) { + assert(graph->has_edge(edge)); + nid_t id1 = graph->get_id(edge.first); + nid_t id2 = graph->get_id(edge.second); + int64_t delta = max(abs(id_to_max_pos.at(id1) - id_to_min_pos.at(id2)), abs(id_to_max_pos.at(id2) - id_to_min_pos.at(id1))); + + if (delta > max_deletion) { + length_to_edge.insert(make_pair(delta, edge)); + } + } + neighbours.clear(); + } + + return length_to_edge; +} + +// walk from given edge back to reference path, returning the maximum distance found +static int64_t get_max_deletion(const PathHandleGraph* graph, + const unordered_map>& handle_to_position, + int64_t context_steps, + edge_t edge, + const unordered_set& edge_blacklist) { + + function(handle_t)> get_ref_context = [&] (handle_t handle) { + unordered_set context; + unordered_set ref_context; + vector cur_handles = {handle}; + for (int64_t i = 0; i < context_steps; ++i) { + vector next_handles; + for (auto& h : cur_handles) { + if (!context.count(h)) { + context.insert(h); + if (i == 0) { + // keep search directional from origin + context.insert(graph->flip(h)); + } + // stop search once we hit reference path + if (handle_to_position.count(h)) { + ref_context.insert(h); + } else { + graph->follow_edges(h, false, [&](handle_t n) { + if (!edge_blacklist.count(graph->edge_handle(h, n))) { + next_handles.push_back(n); + } + }); + if (i > 0) { + // keep search directional from origin + graph->follow_edges(h, true, [&](handle_t p) { + if 
(!edge_blacklist.count(graph->edge_handle(p, h))) { + next_handles.push_back(p); + } + }); + } + } + } + } + cur_handles = std::move(next_handles); + } + return ref_context; + }; + + // search away from the start of the edge, finding the leftmost and rightmost reference + // path positions + unordered_set left_context = get_ref_context(graph->flip(edge.first)); + if (left_context.empty()) { + return -1; + } + int64_t min_pos_left = numeric_limits::max(); + int64_t max_pos_left = -1; + for (handle_t handle : left_context) { + auto it = handle_to_position.find(handle); + assert(it != handle_to_position.end()); + const vector& positions = it->second; + for (int64_t pos : positions) { + min_pos_left = min(min_pos_left, pos); + max_pos_left = max(max_pos_left, pos); + } + } + + // search away from the end of the edge, finding the leftmost and rightmost reference + // path positions + unordered_set right_context = get_ref_context(edge.second); + if (right_context.empty()) { + return -1; + } + int64_t min_pos_right = numeric_limits::max(); + int64_t max_pos_right = -1; + for (handle_t handle : right_context) { + auto it = handle_to_position.find(handle); + assert(it != handle_to_position.end()); + const vector& positions = it->second; + for (int64_t pos : positions) { + min_pos_right = min(min_pos_right, pos); + max_pos_right = max(max_pos_right, pos); + } + } + + // compute the maximum deletion + int64_t delta = max(abs(max_pos_left - min_pos_right), abs(max_pos_right - min_pos_left)); + + return delta; +} + +void clip_deletion_edges(MutablePathMutableHandleGraph* graph, int64_t max_deletion, + int64_t context_steps, + const vector& ref_prefixes, int64_t min_fragment_len, bool verbose) { + + // load up the reference paths and their ids + unordered_set ref_paths; + graph->for_each_path_handle([&](path_handle_t path_handle) { + string path_name = graph->get_path_name(path_handle); + for (const string& ref_prefix : ref_prefixes) { + if (path_name.compare(0, ref_prefix.length(), ref_prefix) == 0) { + ref_paths.insert(path_handle); + break; + } + } + }); + + // all deletion edges for all paths + unordered_set deletion_edges; + // all reference edges + deletion edges + unordered_set edge_blacklist; + + for (path_handle_t ref_path : ref_paths) { + + // index the path to avoid path position interface dep + unordered_map> handle_to_position = make_ref_index(graph, ref_path, edge_blacklist); + + // find set of deletion candidates sorted by length by walking out from the reference path + if (verbose) { + cerr << "[vg clip]: Searching for deletion candidates on " << graph->get_path_name(ref_path) + << " with " << context_steps << " context steps and " << get_thread_count() << " threads" << endl; + } + multimap length_to_edge = find_deletion_candidate_edges(graph, ref_path, handle_to_position, + max_deletion, context_steps, edge_blacklist); + + if (verbose) { + cerr << "[vg clip]: Found " << length_to_edge.size() << " candidate deletion edges for " << graph->get_path_name(ref_path); + if (!length_to_edge.empty()) { + cerr << " with sizes ranging from " << length_to_edge.begin()->first << " to " << length_to_edge.rbegin()->first; + } + cerr << endl; + } + + // for every deletion candidate, walk *back* to the reference path and add it if it forms a deletion + // we do this one-at-a-time to make sure that the edges are still deletions after the preceding edges were deleted + int64_t candidate_i = 0; + for (auto it = length_to_edge.rbegin(); it != length_to_edge.rend(); ++it) { + const edge_t& edge = it->second; 
+ assert(graph->has_edge(edge)); + int64_t deletion_size = get_max_deletion(graph, handle_to_position, context_steps, edge, edge_blacklist); + + if (deletion_size > max_deletion) { + deletion_edges.insert(edge); + edge_blacklist.insert(edge); + if (verbose) { + if (verbose) { + cerr << "[vg clip]: Found deletion edge for candidate " << candidate_i << " on " << graph->get_path_name(ref_path) << ": " + << (graph->get_is_reverse(edge.first) ? "<" : ">") << graph->get_id(edge.first) + << (graph->get_is_reverse(edge.second) ? "<" : ">") << graph->get_id(edge.second) + << " with reference length " << deletion_size << endl; + } + } + } + ++candidate_i; + } + } + + // just for logging + unordered_map clip_counts; + + if (verbose) { + cerr << "[vg-clip]: Clipping " << deletion_edges.size() << " edges" << endl; + } + + // delete the edges + delete_nodes_and_chop_paths(graph, {}, deletion_edges, min_fragment_len, verbose ? &clip_counts : nullptr); + + if (verbose) { + for (const auto& kv : clip_counts) { + cerr << "[vg-clip]: Creating " << kv.second << " fragments from path " << kv.first << endl; + } + clip_counts.clear(); + } +} + +void clip_stubs_generic(MutablePathMutableHandleGraph* graph, + function)> iterate_handles, + function handle_in_range, + const vector& ref_prefixes, + int64_t min_fragment_len, + bool verbose) { + + unordered_set to_delete; + + // just for logging + unordered_map clip_counts; + + // frontier for recursing on stub neighbours + unordered_map stub_neighbours_1; + + // test if a node is "reference" using a name check + function check_prefixes = [&ref_prefixes] (const string& path_name) { + for (const string& ref_prefix : ref_prefixes) { + if (path_name.compare(0, ref_prefix.length(), ref_prefix) == 0) { + return true; + } + } + return false; + }; + + // test if a node is a stub. + // we consider a node a stub if either (or both) sides have degree 0. + function is_stub = [&to_delete, &graph] (const handle_t& handle) { + size_t left_degree = 0; + graph->follow_edges(handle, true, [&](handle_t left) { + if (!to_delete.count(graph->get_id(left))) { + ++left_degree; + return false; + } + return true; + }); + size_t right_degree = 1; + if (left_degree > 0) { + right_degree = 0; + graph->follow_edges(handle, false, [&](handle_t right) { + if (!to_delete.count(graph->get_id(right))) { + ++right_degree; + return false; + } + return true; + }); + } + return left_degree == 0 || right_degree == 0; + }; + + function visit_handle = [&](handle_t handle, const Region* region) { + + if (!to_delete.count(graph->get_id(handle)) && is_stub(handle)) { + bool on_ref = false; + graph->for_each_step_on_handle(handle, [&](step_handle_t step_handle) { + if (!ref_prefixes.empty() || region) { + // if we have a region, do exact comparison to it. + // otherwise, do a prefix check against ref_prefix + string path_name = graph->get_path_name(graph->get_path_handle_of_step(step_handle)); + if ((region && region->seq == path_name) || (!region && check_prefixes(path_name))) { + on_ref = true; + return false; + } + } + return true; + }); + if (!on_ref) { + to_delete.insert(graph->get_id(handle)); + + // remember the neighbours -- they can be new stubs! 
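+ // (these neighbours are revisited in the loop further below until no new stubs are produced)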
+ graph->follow_edges(handle, true, [&](handle_t prev) { + if (handle_in_range(prev) && !to_delete.count(graph->get_id(prev)) && graph->get_id(handle) != graph->get_id(prev)) { + stub_neighbours_1[prev] = region; + } + }); + graph->follow_edges(handle, false, [&](handle_t next) { + if (handle_in_range(next) && !to_delete.count(graph->get_id(next)) && graph->get_id(handle) != graph->get_id(next)) { + stub_neighbours_1[next] = region; + } + }); + } + } + }; + + // first pass: find all the stubs in iterate_handles + // and populate stub_neighbours_1 + iterate_handles(visit_handle); + + // keep doing the same thing on the neighbours until none left, using + // handle_in_range to make sure we don't step out of bounds (in the case we're doing BED regions) + unordered_map stub_neighbours_2; + while (!stub_neighbours_1.empty()) { + stub_neighbours_2.clear(); + swap(stub_neighbours_1, stub_neighbours_2); + for (const auto& neighbour_pair : stub_neighbours_2) { + visit_handle(neighbour_pair.first, neighbour_pair.second); + } + stub_neighbours_2.clear(); + } + + if (verbose) { + cerr << "[vg-clip]: Removing " << to_delete.size() << " nodes from (non-reference) stubs." << endl; + } + + // cut out the nodes and chop up paths + delete_nodes_and_chop_paths(graph, to_delete, {}, min_fragment_len, verbose ? &clip_counts : nullptr); + + if (verbose) { + for (const auto& kv : clip_counts) { + cerr << "[vg-clip]: Creating " << kv.second << " fragments from path" << kv.first << endl; + } + clip_counts.clear(); + } + +} + +void clip_stubs(MutablePathMutableHandleGraph* graph, const vector& ref_prefixes, int64_t min_fragment_len, bool verbose) { + + function)> iterate_handles = [&] (function visit_handle) { + graph->for_each_handle([&](handle_t handle) { + visit_handle(handle, nullptr); + }); + }; + + function handle_in_range = [](handle_t) { + return true; + }; + + clip_stubs_generic(graph, iterate_handles, handle_in_range, ref_prefixes, min_fragment_len, verbose); +} + +void clip_contained_stubs(MutablePathMutableHandleGraph* graph, PathPositionHandleGraph* pp_graph, const vector& regions, + SnarlManager& snarl_manager, bool include_endpoints, int64_t min_fragment_len, bool verbose) { + + unordered_set all_handles; + function handle_in_range = [&all_handles](handle_t handle) { + return all_handles.count(handle); + }; + + function)> iterate_handles = [&] (function visit_handle) { + + visit_contained_snarls(pp_graph, regions, snarl_manager, include_endpoints, [&](const Snarl* snarl, step_handle_t start_step, step_handle_t end_step, + int64_t start_pos, int64_t end_pos, + bool steps_reversed, const Region* containing_region) { + + pair, unordered_set > contents = snarl_manager.deep_contents(snarl, *pp_graph, false); + for (id_t node_id : contents.first) { + visit_handle(graph->get_handle(node_id), containing_region); + all_handles.insert(graph->get_handle(node_id)); + } + }); + }; + + // the edge depths are computed globally, without looking at regions. 
as such, they need some notion of reference paths + // so we shimmy a set in from the regions + set ref_path_set; + for (const Region& region : regions) { + ref_path_set.insert(region.seq); + } + vector ref_paths_from_regions(ref_path_set.begin(), ref_path_set.end()); + + clip_stubs_generic(graph, iterate_handles, handle_in_range, ref_paths_from_regions, min_fragment_len, verbose); + +} + +void stubbify_ref_paths(MutablePathMutableHandleGraph* graph, const vector& ref_prefixes, int64_t min_fragment_len, bool verbose) { + unordered_set edges_to_delete; + int64_t stubbified_path_count = 0; // just for logging + graph->for_each_path_handle([&](path_handle_t path_handle) { + string path_name = graph->get_path_name(path_handle); + for (const string& ref_prefix : ref_prefixes) { + bool was_stubbified = false; + if (path_name.compare(0, ref_prefix.length(), ref_prefix) == 0) { + step_handle_t first_step = graph->path_begin(path_handle); + handle_t first_handle = graph->get_handle_of_step(first_step); + graph->follow_edges(first_handle, !graph->get_is_reverse(first_handle), [&](handle_t next_handle) { + edge_t edge = graph->get_is_reverse(first_handle) ? graph->edge_handle(first_handle, next_handle) : + graph->edge_handle(next_handle, first_handle); + edges_to_delete.insert(edge); + was_stubbified = true; + }); + + step_handle_t last_step = graph->path_back(path_handle); + handle_t last_handle = graph->get_handle_of_step(last_step); + graph->follow_edges(last_handle, graph->get_is_reverse(last_handle), [&](handle_t next_handle) { + edge_t edge = graph->get_is_reverse(last_handle) ? graph->edge_handle(next_handle, last_handle) : + graph->edge_handle(last_handle, next_handle); + edges_to_delete.insert(edge); + was_stubbified = true; + }); + } + if (was_stubbified) { + ++stubbified_path_count; + } + } + }); + + // just for logging + unordered_map clip_counts; + + if (verbose) { + cerr << "[vg-clip]: Clipping " << edges_to_delete.size() << " edges to stubbify " << stubbified_path_count + << " reference paths" << endl; + } + + // delete the edges + delete_nodes_and_chop_paths(graph, {}, edges_to_delete, min_fragment_len, verbose ? &clip_counts : nullptr); + + if (verbose) { + for (const auto& kv : clip_counts) { + cerr << "[vg-clip]: Ref path stubbification creating " << kv.second << " fragments from path " << kv.first << endl; + } + clip_counts.clear(); + } +} + +} diff --git a/src/clip.hpp b/src/clip.hpp new file mode 100644 index 00000000000..e57a260b98c --- /dev/null +++ b/src/clip.hpp @@ -0,0 +1,104 @@ +#ifndef VG_CLIP_HPP_INCLUDED +#define VG_CLIP_HPP_INCLUDED + +/** + * \file clip.hpp + * + * Clip regions out of a graph + */ + +#include "handle.hpp" +#include "snarls.hpp" +#include "region.hpp" + +namespace vg { + +using namespace std; + +/** + * Visit each snarl if it is fully contained in at least one region from the input set. + * Only the top-most snarl is visited. 
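+ * (i.e. once a snarl is found to be contained in a region it is reported and its children are skipped;
+ * children are only descended into when the parent snarl is not contained in any region)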
+ * The parameters to visit_fn are: + * + */ +void visit_contained_snarls(const PathPositionHandleGraph* graph, const vector& regions, SnarlManager& snarl_manager, + bool include_endpoints, + function visit_fn); + +/* + * Cut nodes out of a graph, and chop up any paths that contain them, using (and resolving) supbath + * naming conventions from Paths class in path.hpp + * If a chopped path has a fragment with length < min_fragment_len, don't bother writing the new path + * The fragments_per_path map is optional, and will collect some stats if present + */ +void delete_nodes_and_chop_paths(MutablePathMutableHandleGraph* graph, + const unordered_set& nodes_to_delete, + const unordered_set& edges_to_delete, + int64_t min_fragment_len, + unordered_map* fragments_per_path = nullptr); + + +/** + * If a given bed region spans a snarl (overlaps its end nodes, and forms a traversal) + * then clip out all other nodes (ie nodes that don't lie on the traversal) + * + * IMPORTANT: for any given snarl, the first region that contains it is used. + * (but other reference paths now whitelisted via ref_prefixes) + * + * Update: now accepts some snarl complexity thresholds to ignore simple enough snarls + */ +void clip_contained_snarls(MutablePathMutableHandleGraph* graph, PathPositionHandleGraph* pp_graph, const vector& regions, + SnarlManager& snarl_manager, bool include_endpoints, int64_t min_fragment_len, + size_t max_nodes, size_t max_edges, size_t max_nodes_shallow, size_t max_edges_shallow, + double max_avg_degree, double max_reflen_prop, size_t max_reflen, + bool out_bed, bool verbose); + + +/** + * Clip out nodes that don't pass depth threshold (depth < min_depth). + * "depth" is the number of paths that step on the node. + * Nodes on path with given prefix ignored (todo: should really switch to regex or something) + * iterate_handles is a hack to generalize this function to whole graphs or snarls + */ +void clip_low_depth_nodes_and_edges_generic(MutablePathMutableHandleGraph* graph, + function)> iterate_handles, + function)> iterate_edges, + int64_t min_depth, const vector& ref_prefixes, + int64_t min_fragment_len, bool verbose); + +/** + * Run above function on graph + */ +void clip_low_depth_nodes_and_edges(MutablePathMutableHandleGraph* graph, int64_t min_depth, const vector& ref_prefixes, + int64_t min_fragment_len, bool verbose); + +/** + * Or on contained snarls + */ +void clip_contained_low_depth_nodes_and_edges(MutablePathMutableHandleGraph* graph, PathPositionHandleGraph* pp_graph, const vector& regions, + SnarlManager& snarl_manager, bool include_endpoints, int64_t min_depth, int64_t min_fragment_len, bool verbose); + +/** + * clip out deletion edges + */ +void clip_deletion_edges(MutablePathMutableHandleGraph* graph, int64_t max_deletion, int64_t context_steps, + const vector& ref_prefixes, int64_t min_fragment_len, bool verbose); + +/** +* clip out stubs +*/ +void clip_stubs(MutablePathMutableHandleGraph* graph, const vector& ref_prefixes, int64_t min_fragment_len, bool verbose); + +void clip_contained_stubs(MutablePathMutableHandleGraph* graph, PathPositionHandleGraph* pp_graph, const vector& regions, + SnarlManager& snarl_manager, bool include_endpoints, int64_t min_fragment_len, bool verbose); + + +/** + * stubbify reference + */ +void stubbify_ref_paths(MutablePathMutableHandleGraph* graph, const vector& ref_prefixes, int64_t min_fragment_len, bool verbose); + +} + + +#endif diff --git a/src/cluster.cpp b/src/cluster.cpp index 401d68550cb..8c3a9b60419 100644 --- a/src/cluster.cpp 
+++ b/src/cluster.cpp @@ -4,8 +4,11 @@ #include #include "cluster.hpp" +#include "algorithms/subgraph.hpp" +#include "algorithms/extract_containing_graph.hpp" +#include "utility.hpp" -//#define debug_od_clusterer +//#define debug_mem_clusterer using namespace std; using namespace structures; @@ -25,7 +28,7 @@ MEMChainModel::MEMChainModel( const vector& aln_lengths, const vector >& matches, const function& approx_position, - const function > >(pos_t)>& path_position, + const function > >(pos_t)>& path_position, const function& transition_weight, int band_width, int position_depth, @@ -47,7 +50,7 @@ MEMChainModel::MEMChainModel( m.prev = nullptr; m.score = 0; m.mem.positions = path_position(pos); - m.mem.positions[""].push_back(make_pair(approx_position(pos), is_rev(pos))); + m.mem.positions[handlegraph::as_path_handle(0)].push_back(make_pair(approx_position(pos), is_rev(pos))); m.mem.nodes.clear(); m.mem.nodes.push_back(node); m.mem.fragment = frag_n; @@ -76,56 +79,10 @@ MEMChainModel::MEMChainModel( pos.resize(min(pos.size(), (size_t)position_depth)); } } - // for each vertex merge if we go equivalently forward in the positional space and forward in the read to the next position - // scan forward - for (map::iterator> > >::iterator c = positions.begin(); c != positions.end(); ++c) { - for (map::iterator> >::iterator p = c->second.begin(); p != c->second.end(); ++p) { - for (auto& v1 : p->second) { - if (redundant_vertexes.count(v1)) continue; - auto q = p; - while (++q != c->second.end() && abs(p->first - q->first) < band_width) { - for (auto& v2 : q->second) { - if (redundant_vertexes.count(v2)) continue; - if (mems_overlap(v1->mem, v2->mem) - && abs(v2->mem.begin - v1->mem.begin) == abs(q->first - p->first)) { - if (v2->mem.length() < v1->mem.length()) { - redundant_vertexes.insert(v2); - if (v2->mem.end > v1->mem.end) { - v1->weight += v2->mem.end - v1->mem.end; - } - } - } - } - } - } - } - } - // scan reverse - for (map::iterator> > >::iterator c = positions.begin(); c != positions.end(); ++c) { - for (map::iterator> >::reverse_iterator p = c->second.rbegin(); p != c->second.rend(); ++p) { - for (auto& v1 : p->second) { - if (redundant_vertexes.count(v1)) continue; - auto q = p; - while (++q != c->second.rend() && abs(p->first - q->first) < band_width) { - for (auto& v2 : q->second) { - if (redundant_vertexes.count(v2)) continue; - if (mems_overlap(v1->mem, v2->mem) - && abs(v2->mem.begin - v1->mem.begin) == abs(p->first - q->first)) { - if (v2->mem.length() < v1->mem.length()) { - redundant_vertexes.insert(v2); - if (v2->mem.end > v1->mem.end) { - v1->weight += v2->mem.end - v1->mem.end; - } - } - } - } - } - } - } - } // now build up the model using the positional bandwidth set::iterator, vector::iterator> > seen; - for (map::iterator> > >::iterator c = positions.begin(); c != positions.end(); ++c) { + for (unordered_map::iterator> > >::iterator c = positions.begin(); c != positions.end(); ++ +c) { for (map::iterator> >::iterator p = c->second.begin(); p != c->second.end(); ++p) { for (auto& v1 : p->second) { // For each vertex... 
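(Editorial note, not part of the patch: the hunk above switches the MEM `positions` map from string path names to `handlegraph::path_handle_t` keys, with `as_path_handle(0)` acting as the placeholder bucket that the empty string `""` used to provide for the approximate position. The standalone sketch below only illustrates that handle/integer round-trip; the include paths and the small `positions` container are assumptions for illustration, since inside vg these helpers arrive via `handle.hpp`.)

```cpp
// Illustration only -- not code from this patch. Shows the path_handle_t round-trip
// used above (handlegraph::as_path_handle / handlegraph::as_integer).
#include <handlegraph/types.hpp>   // assumed include path for path_handle_t
#include <handlegraph/util.hpp>    // assumed include path for as_path_handle / as_integer
#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

int main() {
    using handlegraph::path_handle_t;

    // Hypothetical stand-in for mem.positions: (offset, is_reverse) records keyed
    // by path handle instead of by path name string.
    std::vector<std::pair<path_handle_t, std::pair<size_t, bool>>> positions;

    // Path handle 0 plays the role the empty string "" played before:
    // a sentinel bucket for the approximate, non-path position.
    path_handle_t sentinel = handlegraph::as_path_handle(0);
    positions.push_back({sentinel, {42, false}});

    for (const auto& rec : positions) {
        // as_integer() recovers the underlying number, as display_dot() does further below.
        std::cout << handlegraph::as_integer(rec.first) << ":"
                  << rec.second.first << ":"
                  << (rec.second.second ? "-" : "+") << "\n";
    }
    return 0;
}
```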
@@ -146,7 +103,7 @@ MEMChainModel::MEMChainModel( // There are not too many connections yet seen.insert(make_pair(v1, v2)); if (v1->mem.fragment < v2->mem.fragment - || v1->mem.fragment == v2->mem.fragment && v1->mem.begin < v2->mem.begin) { + || (v1->mem.fragment == v2->mem.fragment && v1->mem.begin < v2->mem.begin)) { // Transition is allowable because the first comes before the second double weight = transition_weight(v1->mem, v2->mem); @@ -155,7 +112,7 @@ MEMChainModel::MEMChainModel( v2->prev_cost.push_back(make_pair(&*v1, weight)); } } else if (v1->mem.fragment > v2->mem.fragment - || v1->mem.fragment == v2->mem.fragment && v1->mem.begin > v2->mem.begin) { + || (v1->mem.fragment == v2->mem.fragment && v1->mem.begin > v2->mem.begin)) { // Really we want to think about the transition going the other way double weight = transition_weight(v2->mem, v1->mem); @@ -274,6 +231,7 @@ vector > MEMChainModel::traceback(int alt_alns, bool p } mem_trace.push_back(vertex.mem); } + //display_dot(cerr, vertex_trace); // for debugging } return traces; } @@ -317,7 +275,53 @@ void MEMChainModel::display(ostream& out) { out << endl; } } - + +void MEMChainModel::display_dot(ostream& out, vector vertex_trace) { + map vertex_ids; + int i = 0; + for (auto& vertex : model) { + vertex_ids[&vertex] = ++i; + } + map in_trace; + i = 0; + for (auto& v : vertex_trace) { + in_trace[v] = ++i; + } + out << "digraph memchain {" << endl; + for (auto& vertex : model) { + out << vertex_ids[&vertex] + << " [label=\"id:" << vertex_ids[&vertex] + << " seq:" << vertex.mem.sequence() + << " score:" << vertex.score + << " pos:["; + for (auto& p : vertex.mem.positions) { + for (auto& o : p.second) { + out << handlegraph::as_integer(p.first) << ":" << o.first << ":" << (o.second?"-":"+") << ","; + } + } + out << "]\""; + if (in_trace.find(&vertex) != in_trace.end()) { + out << ",color=red"; + } + out << ",shape=box];" << endl; + /* + for (auto& p : vertex.prev_cost) { + out << vertex_ids[p.first] << " -> " << vertex_ids[&vertex] << " [label=\"" << p.second << "\"];" << endl; + } + */ + for (auto& p : vertex.next_cost) { + //out << in_trace[&vertex] << " " << in_trace[p.first] << endl; + out << vertex_ids[&vertex] << " -> " << vertex_ids[p.first] << " [label=\"" << p.second << "\""; + if (in_trace.find(&vertex) != in_trace.end() && in_trace.find(p.first) != in_trace.end() && + in_trace[&vertex] - 1 == in_trace[p.first]) { + out << ",color=red"; + } + out << "];" << endl; + } + } + out << "}" << endl; +} + ShuffledPairs::ShuffledPairs(size_t num_items) : num_items(num_items), num_pairs(num_items * num_items), larger_prime(1), primitive_root(1) { // Find a prime that is at least as large as but at most a constant factor @@ -390,1576 +394,2061 @@ bool ShuffledPairs::iterator::operator==(const iterator& other) const { bool ShuffledPairs::iterator::operator!=(const iterator& other) const { return !(*this == other); } - -OrientedDistanceClusterer::OrientedDistanceClusterer(const Alignment& alignment, - const vector& mems, - const QualAdjAligner& aligner, - xg::XG* xgindex, - size_t max_expected_dist_approx_error, - size_t min_mem_length, - bool unstranded, - paths_of_node_memo_t* paths_of_node_memo, - oriented_occurences_memo_t* oriented_occurences_memo, - handle_memo_t* handle_memo) : - OrientedDistanceClusterer(alignment, mems, nullptr, &aligner, xgindex, max_expected_dist_approx_error, - min_mem_length, unstranded, paths_of_node_memo, oriented_occurences_memo, handle_memo) { - // nothing else to do + +int32_t 
MEMClusterer::estimate_edge_score(const MaximalExactMatch* mem_1, const MaximalExactMatch* mem_2, + int64_t graph_dist, const GSSWAligner* aligner) const { + + // the length of the sequence in between the MEMs (can be negative if they overlap) + int64_t between_length = mem_2->begin - mem_1->end; + + if (between_length < 0) { + // the MEMs overlap, but this can occur in some insertions and deletions + // because the SMEM algorithm is "greedy" in taking up as much of the read + // as possible + // we can check if this happened directly, but it's expensive + // so for now we just give it the benefit of the doubt but adjust the edge + // score so that the matches don't get double counted + + int64_t extra_dist = abs(graph_dist - between_length); + + return aligner->match * between_length + - (extra_dist ? (extra_dist - 1) * aligner->gap_extension + aligner->gap_open : 0); + } + else { + int64_t gap_length = abs(between_length - graph_dist); + // the read length in between the MEMs is the same as the distance, suggesting a pure mismatch + return gap_length ? -((gap_length - 1) * aligner->gap_extension + aligner->gap_open) : 0; + } } -OrientedDistanceClusterer::OrientedDistanceClusterer(const Alignment& alignment, - const vector& mems, - const Aligner& aligner, - xg::XG* xgindex, - size_t max_expected_dist_approx_error, - size_t min_mem_length, - bool unstranded, - paths_of_node_memo_t* paths_of_node_memo, - oriented_occurences_memo_t* oriented_occurences_memo, - handle_memo_t* handle_memo) : - OrientedDistanceClusterer(alignment, mems, &aligner, nullptr, xgindex, max_expected_dist_approx_error, - min_mem_length, unstranded, paths_of_node_memo, oriented_occurences_memo, handle_memo) { - // nothing else to do +void MEMClusterer::deduplicate_cluster_pairs(vector, int64_t>>& cluster_pairs, + int64_t optimal_separation) { + + // sort so that pairs with same clusters are adjacent + sort(cluster_pairs.begin(), cluster_pairs.end()); + +#ifdef debug_mem_clusterer + cerr << "pairs before deduplicating:" << endl; + for (const auto& pair_record : cluster_pairs) { + cerr << pair_record.first.first << ", " << pair_record.first.second << ": " << pair_record.second << endl; + } + cerr << "target separation " << optimal_separation << endl; +#endif + + size_t removed_so_far = 0; + + for (size_t i = 0; i < cluster_pairs.size();) { + // find the range of values that have the same pair of indices + size_t range_end = i + 1; + while (range_end < cluster_pairs.size() ? 
cluster_pairs[i].first == cluster_pairs[range_end].first : false) { + range_end++; + } + + // find the pair that is closest to the middle of the target interval + int64_t best_separation = cluster_pairs[i].second; + size_t best_idx = i; + for (size_t j = i + 1; j < range_end; j++) { + if (abs(cluster_pairs[j].second - optimal_separation) < abs(best_separation - optimal_separation)) { + best_separation = cluster_pairs[j].second; + best_idx = j; + } + } + + // move the best pair with these indices into the part of the vector we will keep + cluster_pairs[i - removed_so_far] = cluster_pairs[best_idx]; + + // we remove the entire interval except for one + removed_so_far += range_end - i - 1; + i = range_end; + } + + // trim off the end of the vector, which now contains arbitrary values + cluster_pairs.resize(cluster_pairs.size() - removed_so_far); + +#ifdef debug_mem_clusterer + cerr << "pairs after deduplicating:" << endl; + for (const auto& pair_record : cluster_pairs) { + cerr << pair_record.first.first << ", " << pair_record.first.second << ": " << pair_record.second << endl; + } +#endif } - -OrientedDistanceClusterer::OrientedDistanceClusterer(const Alignment& alignment, - const vector& mems, - const Aligner* aligner, - const QualAdjAligner* qual_adj_aligner, - xg::XG* xgindex, - size_t max_expected_dist_approx_error, - size_t min_mem_length, - bool unstranded, - paths_of_node_memo_t* paths_of_node_memo, - oriented_occurences_memo_t* oriented_occurences_memo, - handle_memo_t* handle_memo) : aligner(aligner), qual_adj_aligner(qual_adj_aligner) { +MEMClusterer::HitGraph::HitGraph(const vector& mems, const Alignment& alignment, + const GSSWAligner* aligner, size_t min_mem_length, bool track_components, + const match_fanouts_t* fanouts) : + track_components(track_components), components(0, false) +{ // there generally will be at least as many nodes as MEMs, so we can speed up the reallocation nodes.reserve(mems.size()); for (const MaximalExactMatch& mem : mems) { -//#pragma omp atomic -// MEM_TOTAL += mem.nodes.size(); + //#pragma omp atomic + // MEM_TOTAL += mem.nodes.size(); if (mem.length() < min_mem_length) { -#ifdef debug_od_clusterer +#ifdef debug_mem_clusterer cerr << "skipping short MEM " << mem << endl; #endif -//#pragma omp atomic -// MEM_FILTER_COUNTER += mem.nodes.size(); + //#pragma omp atomic + // MEM_FILTER_COUNTER += mem.nodes.size(); continue; } - // calculate the longest gaps we could detect to the left and right of this MEM - int32_t mem_score; - if (aligner) { - mem_score = aligner->score_exact_match(mem.begin, mem.end); - } - else { - mem_score = qual_adj_aligner->score_exact_match(mem.begin, mem.end, alignment.quality().begin() - + (mem.begin - alignment.sequence().begin())); + int32_t mem_score = aligner->score_exact_match(mem.begin, mem.end, + alignment.quality().begin() + (mem.begin - alignment.sequence().begin())); + + // adjust the score downward for fan-out mismatches + if (fanouts && fanouts->count(&mem)) { + for (const auto& fanout : fanouts->at(&mem)) { + mem_score += (aligner->score_mismatch(fanout.first, fanout.first + 1, + alignment.quality().begin() + (fanout.first - alignment.sequence().begin())) + - aligner->score_exact_match(fanout.first, fanout.first + 1, + alignment.quality().begin() + (fanout.first - alignment.sequence().begin()))); + } } -#ifdef debug_od_clusterer - cerr << "adding nodes for MEM " << mem << endl; +#ifdef debug_mem_clusterer + cerr << "adding nodes for MEM " << mem << " with score " << mem_score << endl; + if (fanouts && 
fanouts->count(&mem)) { + cerr << "fanouts:" << endl; + for (auto fanout : fanouts->at(&mem)) { + cerr << "\t" << (fanout.first - mem.begin) << ": " << *fanout.first << " -> " << fanout.second << endl; + } + } + cerr << "locations:" << endl; #endif for (gcsa::node_type mem_hit : mem.nodes) { nodes.emplace_back(mem, make_pos_t(mem_hit), mem_score); -#ifdef debug_od_clusterer +#ifdef debug_mem_clusterer cerr << "\t" << nodes.size() - 1 << ": " << make_pos_t(mem_hit) << endl; #endif } } - // Get all the distances between nodes, in a forrest of unrooted trees of - // nodes that we know are on a consistent strand. - unordered_map, int64_t> recorded_finite_dists = get_on_strand_distance_tree(nodes.size(), unstranded, xgindex, - [&](size_t node_number) { - return nodes[node_number].start_pos; - }, - [&](size_t node_number) { - return 0; - }, - paths_of_node_memo, - oriented_occurences_memo, - handle_memo); - - // Flatten the trees to maps of relative position by node ID. - vector> strand_relative_position = flatten_distance_tree(nodes.size(), recorded_finite_dists); + // init the component tracker + if (track_components) { + components = UnionFind(nodes.size(), false); + } +} + +void MEMClusterer::HitGraph::add_edge(size_t from, size_t to, int32_t weight, int64_t distance) { + nodes[from].edges_from.emplace_back(to, weight, distance); + nodes[to].edges_to.emplace_back(from, weight, distance); -#ifdef debug_od_clusterer - for (const auto& strand : strand_relative_position) { - cerr << "strand reconstruction: " << endl; - vector order; - for (const auto& record : strand) { - order.push_back(record.first); - } - sort(order.begin(), order.end(), [&](size_t a, size_t b) {return strand.at(a) < strand.at(b);}); - for (const auto i : order) { - int64_t strand_pos = strand.at(i); - cerr << "\t" << i << ":\t" << strand_pos << "\t" << nodes[i].mem->sequence() << endl; - } + if (track_components) { + components.union_groups(from, to); } -#endif +} - // now we use the strand clusters and the estimated distances to make the DAG for the - // approximate MEM alignment +void MEMClusterer::HitGraph::connected_components(vector>& components_out) const { - int64_t match_score, mismatch_score, gap_open_score, gap_extension_score, max_gap; - if (aligner) { - match_score = aligner->match; - gap_open_score = aligner->gap_open; - gap_extension_score = aligner->gap_extension; - max_gap = aligner->longest_detectable_gap(alignment); - } - else { - match_score = qual_adj_aligner->match; - gap_open_score = qual_adj_aligner->gap_open; - gap_extension_score = qual_adj_aligner->gap_extension; - max_gap = qual_adj_aligner->longest_detectable_gap(alignment); - } + components_out.clear(); + vector enqueued(nodes.size()); - int64_t forward_gap_length = max_gap + max_expected_dist_approx_error; - for (const unordered_map& relative_pos : strand_relative_position) { - - // sort the nodes by relative position - vector> sorted_pos; - for (const pair& pos_record : relative_pos) { - sorted_pos.emplace_back(pos_record.second, pos_record.first); + // check each node in turn to find new components + for (size_t i = 0; i < nodes.size(); i++) { + if (enqueued[i]) { + // we've already found this node from some component + continue; } - std::sort(sorted_pos.begin(), sorted_pos.end()); - // find edges within each strand cluster by first identifying the interval of MEMs that meets - // the graph distance constrant for each MEM and then checking for read colinearity and the - // reverse distance constraint - int64_t last_idx = sorted_pos.size() - 1; 
- int64_t low = 0, hi = 0; - for (int64_t i = 0; i < sorted_pos.size(); i++) { - - int64_t strand_pos = sorted_pos[i].first; - size_t pivot_idx = sorted_pos[i].second; - ODNode& pivot = nodes[pivot_idx]; - int64_t pivot_length = pivot.mem->end - pivot.mem->begin; - int64_t suffix_length = alignment.sequence().end() - pivot.mem->end; + // this node belongs to a component we haven't found yet, use DFS to find the rest + vector stack {i}; + enqueued[i] = true; + components_out.emplace_back(1, i); + + while (!stack.empty()) { - // the limits of how far away we might detect edges to add to the clustering graph - int64_t target_hi_pos, target_low_pos; - if (unstranded) { - target_low_pos = strand_pos - suffix_length - forward_gap_length; - target_hi_pos = strand_pos + suffix_length + forward_gap_length; - } - else { - target_low_pos = strand_pos - max_expected_dist_approx_error; - target_hi_pos = strand_pos + suffix_length + forward_gap_length; - } + const HitNode& node = nodes[stack.back()]; + stack.pop_back(); - // move the lower boundary of the search interval to the lowest value inside the - // the target interval - if (sorted_pos[low].first > target_low_pos) { - while (low > 0 ? sorted_pos[low - 1].first > target_low_pos : false) { - low--; - } - } - else { - while (low < sorted_pos.size() ? sorted_pos[low].first < target_low_pos : false) { - low++; - } - } + // search in both forward and backward directions - // move the upper boundary of the search interval to the highest value inside the - // the target interval - if (sorted_pos[hi].first > target_hi_pos) { - while (hi > 0 ? sorted_pos[hi].first > target_hi_pos : false) { - hi--; - } - } - else { - while (hi < last_idx ? sorted_pos[hi + 1].first <= target_hi_pos : false) { - hi++; + for (const HitEdge& edge : node.edges_from) { + + if (!enqueued[edge.to_idx]) { + stack.push_back(edge.to_idx); + enqueued[edge.to_idx] = true; + components_out.back().push_back(edge.to_idx); } } -#ifdef debug_od_clusterer - cerr << "checking for possible edges from " << sorted_pos[i].second << " to MEMs between " << sorted_pos[low].first << "(" << sorted_pos[low].second << ") and " << sorted_pos[hi].first << "(" << sorted_pos[hi].second << "), which is inside the interval (" << target_low_pos << ", " << target_hi_pos << ")" << endl; -#endif - - for (int64_t j = low; j <= hi; j++) { - // don't make self edges - if (i == j) { - continue; - } - - int64_t next_idx = sorted_pos[j].second; - ODNode& next = nodes[next_idx]; - - // the length of the sequence in between the MEMs (can be negative if they overlap) - int64_t between_length = next.mem->begin - pivot.mem->end; - - // the estimated distance between the end of the pivot and the start of the next MEM in the graph - int64_t graph_dist; - if (unstranded) { - // here we make the charitable assumption that it is on the correct strand - graph_dist = abs(sorted_pos[j].first - strand_pos) - pivot_length; - } - else { - graph_dist = sorted_pos[j].first - strand_pos - pivot_length; - } - - if (next.mem->begin >= pivot.mem->begin && next.mem->end <= pivot.mem->end - && abs((sorted_pos[j].first - strand_pos) - (next.mem->begin - pivot.mem->begin)) <= 1) { - // this looks like a redundant sub-MEM - - // we add a dummy edge, but only to connect the nodes' components and join the clusters, - // not to actually use in dynamic programming (given arbitrary low weight that should not - // cause overflow) - pivot.edges_from.emplace_back(next_idx, numeric_limits::lowest() / 2, graph_dist); - 
next.edges_to.emplace_back(pivot_idx, numeric_limits::lowest() / 2, graph_dist); - - continue; - } - else if (next.mem->begin <= pivot.mem->begin || next.mem->end <= pivot.mem->end) { - // these MEMs cannot be colinear along the read - - // note: we allow one of the start/end positions to be the same here even though they can't - // techinically overlap because it tends to soak up redundant sub-MEMs into the same connected - // component so that they don't get their own cluster - - continue; - } + for (const HitEdge& edge : node.edges_to) { - int32_t edge_score; - if (between_length < 0) { - // the MEMs overlap, but this can occur in some insertions and deletions - // because the SMEM algorithm is "greedy" in taking up as much of the read - // as possible - // we can check if this happened directly, but it's expensive - // so for now we just give it the benefit of the doubt but adjust the edge - // score so that the matches don't get double counted - - int64_t extra_dist = abs(graph_dist - between_length); - - edge_score = match_score * between_length - - (extra_dist ? (extra_dist - 1) * gap_extension_score + gap_open_score : 0); - } - else { - int64_t gap_length = abs(between_length - graph_dist); - // the read length in between the MEMs is the same as the distance, suggesting a pure mismatch - edge_score = gap_length ? -((gap_length - 1) * gap_extension_score + gap_open_score) : 0; + if (!enqueued[edge.to_idx]) { + stack.push_back(edge.to_idx); + enqueued[edge.to_idx] = true; + components_out.back().push_back(edge.to_idx); } - -#ifdef debug_od_clusterer - cerr << "adding edge to MEM " << sorted_pos[j].first << "(" << sorted_pos[j].second << ") with weight " << edge_score << endl; -#endif - - // add the edges in - pivot.edges_from.emplace_back(next_idx, edge_score, graph_dist); - next.edges_to.emplace_back(pivot_idx, edge_score, graph_dist); } } } } - -unordered_map, int64_t> OrientedDistanceClusterer::get_on_strand_distance_tree(size_t num_items, bool unstranded, xg::XG* xgindex, - const function& get_position, - const function& get_offset, - paths_of_node_memo_t* paths_of_node_memo, - oriented_occurences_memo_t* oriented_occurences_memo, - handle_memo_t* handle_memo) { - // for recording the distance of any pair that we check with a finite distance - unordered_map, int64_t> recorded_finite_dists; +void MEMClusterer::HitGraph::prune_low_scoring_edges(vector>& components, size_t component_idx, double score_factor) { - // for recording the number of times elements of a strand cluster have been compared - // and found an infinite distance - map, size_t> num_infinite_dists; + vector& component = components[component_idx]; - // we use a union find to keep track of which MEMs have been identified as being on the same strand - UnionFind component_union_find(num_items); + // get the topological order within this component (expressed in indexes into the component vector) + vector component_order; + component_topological_order(component, component_order); - size_t num_possible_merges_remaining = (num_items * (num_items - 1)) / 2; +#ifdef debug_mem_clusterer + cerr << "doing backwards DP" << endl; +#endif - int64_t max_failed_distance_probes = 2; + vector backwards_dp_score(component.size()); + unordered_map node_idx_to_component_idx; + for (size_t i = 0; i < component.size(); i++) { + backwards_dp_score[i] = nodes[component[i]].score; + node_idx_to_component_idx[component[i]] = i; + } - // an initial pass that only looks at nodes on path - if (unstranded) { - 
extend_dist_tree_by_path_buckets(max_failed_distance_probes, num_possible_merges_remaining,component_union_find, recorded_finite_dists, - num_infinite_dists, num_items, xgindex, get_position, get_offset, paths_of_node_memo, oriented_occurences_memo, - handle_memo); + // do dynamic programming backwards within this component + for (int64_t i = component_order.size() - 1; i >= 0; i--) { + size_t idx = component_order[i]; + size_t node_idx = component[idx]; + for (HitEdge& edge : nodes[node_idx].edges_to) { + size_t local_to_idx = node_idx_to_component_idx[edge.to_idx]; + int32_t dp_score = backwards_dp_score[idx] + edge.weight; + if (dp_score > backwards_dp_score[local_to_idx]) { + backwards_dp_score[local_to_idx] = dp_score; + } + } } - else { - extend_dist_tree_by_strand_buckets(max_failed_distance_probes, num_possible_merges_remaining,component_union_find, recorded_finite_dists, - num_infinite_dists, num_items, xgindex, get_position, get_offset, paths_of_node_memo, oriented_occurences_memo, - handle_memo); + +#ifdef debug_mem_clusterer + cerr << "backwards dp scores:" << endl; + for (size_t i = 0; i < component.size(); i++) { + cerr << "\t" << component[i] << ": " << backwards_dp_score[i] << endl; } +#endif - // TODO: permutations that try to assign singletons + // the minimum score we will require each edge to be a part of + int32_t min_score = *max_element(backwards_dp_score.begin(), backwards_dp_score.end()) * score_factor; - // a second pass that tries fill in the tree by traversing to the nearest shared path - size_t nlogn = ceil(num_items * log(num_items)); - extend_dist_tree_by_permutations(max_failed_distance_probes, 50, nlogn, num_possible_merges_remaining, component_union_find, - recorded_finite_dists, num_infinite_dists, unstranded, num_items, xgindex, get_position, get_offset, paths_of_node_memo, - oriented_occurences_memo, handle_memo); - - return recorded_finite_dists; -} - -void OrientedDistanceClusterer::exclude_dist_tree_merges_by_components(int64_t max_failed_distance_probes, - size_t& num_possible_merges_remaining, - UnionFind& component_union_find, - map, size_t>& num_infinite_dists, - unordered_map neighbors_on_paths, - xg::XG* xgindex, - const function& get_position, - const function& get_offset, - paths_of_node_memo_t* paths_of_node_memo) { - - -#ifdef debug_od_clusterer - cerr << "using path component index to exclude strand merges" << endl; +#ifdef debug_mem_clusterer + cerr << "looking for edges with max score less than " << min_score << endl; #endif - // use the component path set index to exclude some distance measurements between groups we can tell are on separate - // strands a priori - // TODO: I wonder if there's a way to do this without the quadratic loop (although it's quadratic in number of connected - // components, so probably not all that bad) - vector> bucketed_groups = component_union_find.all_groups(); -#ifdef debug_od_clusterer - cerr << "groups: " << endl; - for (auto& group : bucketed_groups) { - for (size_t idx : group) { - cerr << idx << " "; - } - cerr << endl; - } + for (size_t i = 0; i < component.size(); i++) { + size_t node_idx = component[i]; + HitNode& node = nodes[node_idx]; + for (size_t j = 0; j < node.edges_from.size(); ) { + HitEdge& edge = node.edges_from[j]; + + // don't remove edges that look nearly perfect (helps keep redundant sub-MEMs in the cluster with + // their parent so that they can be removed later) + if (abs((edge.distance + (node.mem->end - node.mem->begin)) + - (nodes[edge.to_idx].mem->begin - node.mem->begin)) <= 
1) { +#ifdef debug_mem_clusterer + cerr << "preserving edge because distance looks good" << endl; #endif - for (size_t i = 1; i < bucketed_groups.size(); i++) { - // find the first member of the 'i' group that is on a path or associated with a neighbor on a path - vector& i_group = bucketed_groups[i]; - size_t i_idx = 0; - while (i_idx < i_group.size() ? (neighbors_on_paths.count(i_group[i_idx]) ? neighbors_on_paths[i_group[i_idx]] == 0 : false ) : false) { - i_idx++; - } - - // check if these two hits are on a path that is on a separate component - if (i_idx < i_group.size()) { - id_t i_node_id = neighbors_on_paths.count(i_group[i_idx]) ? neighbors_on_paths[i_group[i_idx]] : id(get_position(i_group[i_idx])); - size_t i_path = paths_of_node_memo->at(i_node_id).front(); + j++; + continue; + } - for (size_t j = 0; j < i; j++) { - // find the first member of the 'j' group that is on a path or associated with a neighbor on a path - vector& j_group = bucketed_groups[j]; - size_t j_idx = 0; - while (j_idx < j_group.size() ? (neighbors_on_paths.count(j_group[j_idx]) ? neighbors_on_paths[j_group[j_idx]] == 0 : false ) : false) { - j_idx++; - } + // the forward-backward score of this edge + int32_t edge_score = node.dp_score + edge.weight + backwards_dp_score[node_idx_to_component_idx[edge.to_idx]]; + + // is the max score across this edge too low? + if (edge_score < min_score) { - if (j_idx < j_group.size()) { -#ifdef debug_od_clusterer - cerr << "checking for shared component using strand cluster representatives " << get_position(i_group[i_idx]) << " and " << get_position(j_group[j_idx]) << endl; +#ifdef debug_mem_clusterer + cerr << "removing edge " << node_idx << "->" << edge.to_idx << " with weight " << edge.weight << " and max score " << edge_score << endl; #endif - size_t j_node_id = neighbors_on_paths.count(j_group[j_idx]) ? 
neighbors_on_paths[j_group[j_idx]] : id(get_position(j_group[j_idx])); - size_t j_path = paths_of_node_memo->at(j_node_id).front(); - - if (!xgindex->paths_on_same_component(i_path, j_path)) { - // these hits are associated with strands that are on separated components of the graph - // so we can rule out these strand merges a priori - size_t i_strand = component_union_find.find_group(i_group[i_idx]); - size_t j_strand = component_union_find.find_group(j_group[j_idx]); - - num_infinite_dists[make_pair(i_strand, j_strand)] = max_failed_distance_probes + 1; - num_infinite_dists[make_pair(j_strand, i_strand)] = max_failed_distance_probes + 1; - - num_possible_merges_remaining -= component_union_find.group_size(i_strand) * component_union_find.group_size(j_strand); - -#ifdef debug_od_clusterer - cerr << "representatives are on separate components, blocking strand merge and decreasing possible merges by " << component_union_find.group_size(i_strand) * component_union_find.group_size(j_strand) << " to " << num_possible_merges_remaining << endl; + + // remove it's reverse counterpart + HitNode& dest_node = nodes[edge.to_idx]; + for (size_t k = 0; k < dest_node.edges_to.size(); k++) { + if (dest_node.edges_to[k].to_idx == node_idx) { +#ifdef debug_mem_clusterer + cerr << "removing bwd edge " << edge.to_idx << "->" << dest_node.edges_to[k].to_idx << " with weight " << dest_node.edges_to[k].weight << " and max score " << edge_score << endl; #endif + dest_node.edges_to[k] = dest_node.edges_to.back(); + dest_node.edges_to.pop_back(); + break; } } + + // remove the edge + node.edges_from[j] = node.edges_from.back(); + node.edges_from.pop_back(); + } + else { + j++; } } } -} - -void OrientedDistanceClusterer::extend_dist_tree_by_path_buckets(int64_t max_failed_distance_probes, - size_t& num_possible_merges_remaining, - UnionFind& component_union_find, - unordered_map, int64_t>& recorded_finite_dists, - map, size_t>& num_infinite_dists, - size_t num_items, - xg::XG* xgindex, - const function& get_position, - const function& get_offset, - paths_of_node_memo_t* paths_of_node_memo, - oriented_occurences_memo_t* oriented_occurences_memo, - handle_memo_t* handle_memo) { -#ifdef debug_od_clusterer - cerr << "using paths to bucket distance comparisons" << endl; +#ifdef debug_mem_clusterer + cerr << "reidentifying connected components" << endl; #endif - if (!paths_of_node_memo) { - return; - } - - // enter which paths occur on the nodes of each hit into the memo - for (size_t i = 0; i < num_items; i++) { - pos_t pos = get_position(i); - if (!paths_of_node_memo->count(id(pos))) { - (*paths_of_node_memo)[id(pos)] = xgindex->paths_of_node(id(pos)); - } - } + // use DFS to identify the connected components again + vector> new_components; - // reverse the memo so that it tells us which hits occur on a strand of a path and identify hits with no paths - unordered_map> items_on_path; - // record which hits aren't on a path and associate them with their nearest neighbor's node ID - unordered_map non_path_hits; - for (size_t i = 0; i < num_items; i++) { - pos_t pos = get_position(i); - vector& paths = paths_of_node_memo->at(id(pos)); - if (paths.empty()) { - // just add a sentinel for now - non_path_hits[i] = 0; + vector enqueued(component.size(), false); + for (size_t i = 0; i < component.size(); i++) { + if (enqueued[i]) { + continue; } - else { - for (size_t path : paths) { - items_on_path[path].push_back(i); + new_components.emplace_back(); + vector stack(1, component[i]); + enqueued[i] = true; + while 
(!stack.empty()) { + size_t node_idx = stack.back(); + stack.pop_back(); + + new_components.back().push_back(node_idx); + + for (HitEdge& edge : nodes[node_idx].edges_from) { + size_t local_idx = node_idx_to_component_idx[edge.to_idx]; + if (!enqueued[local_idx]) { + stack.push_back(edge.to_idx); + enqueued[local_idx] = true; + } + } + + for (HitEdge& edge : nodes[node_idx].edges_to) { + size_t local_idx = node_idx_to_component_idx[edge.to_idx]; + if (!enqueued[local_idx]) { + stack.push_back(edge.to_idx); + enqueued[local_idx] = true; + } } } } - // check the nearest nodes to each singleton to see if we can use it to bucket the item - for (pair& non_path_hit : non_path_hits) { - pos_t pos = get_position(non_path_hit.first); - handle_t handle = xgindex->memoized_get_handle(id(pos), is_rev(pos), handle_memo); - size_t right_dist = xgindex->get_length(handle) - offset(pos); - size_t trav_dist = min(offset(pos), right_dist); - // TODO: magic number (matches the distance used in the permutations step) - if (trav_dist <= 50) { - bool go_left = offset(pos) < right_dist; - function bucket_using_neighbors = [&](const handle_t& handle) { - id_t neighbor_id = xgindex->get_id(handle); - bool neighbor_rev = xgindex->get_is_reverse(handle); - if (!paths_of_node_memo->count(neighbor_id)) { - (*paths_of_node_memo)[neighbor_id] = xgindex->paths_of_node(neighbor_id); - } - auto& neighbor_paths = paths_of_node_memo->at(neighbor_id); - for (size_t path : neighbor_paths) { - items_on_path[path].push_back(non_path_hit.first); - } - // replace the sentinel value with the actual neighbor's ID (but only if we find a path on it) - if (!neighbor_paths.empty()) { - non_path_hit.second = neighbor_id; - } - return true; - }; - xgindex->follow_edges(handle, go_left, bucket_using_neighbors); + // did we break this connected component into multiple connected components? 
+ if (new_components.size() > 1) { +#ifdef debug_mem_clusterer + stringstream strm; + strm << "splitting cluster:" << endl; + for (auto& comp : new_components) { + for (size_t i : comp) { + strm << "\t" << i << " " << nodes[i].mem->sequence() << " " << nodes[i].start_pos << endl; + } + strm << endl; + } + cerr << strm.str(); +#endif + // the the original component + components[component_idx] = move(new_components[0]); + // add the remaining to the end + for (size_t i = 1; i < new_components.size(); i++) { + components.emplace_back(move(new_components[i])); } } +} + +size_t MEMClusterer::HitGraph::median_mem_coverage(const vector& component, const Alignment& aln) const { - // make sure the items are unique with each list of hits and generate a system-independent ordering over strands - vector buckets; - buckets.reserve(items_on_path.size()); - for (pair>& path_bucket : items_on_path) { - sort(path_bucket.second.begin(), path_bucket.second.end()); - auto new_end = unique(path_bucket.second.begin(), path_bucket.second.end()); - path_bucket.second.resize(new_end - path_bucket.second.begin()); - buckets.push_back(path_bucket.first); + // express the MEMs as intervals along the read sequence + vector> mem_intervals; + for (size_t node_idx : component) { + mem_intervals.emplace_back(nodes[node_idx].mem->begin - aln.sequence().begin(), nodes[node_idx].mem->end - aln.sequence().begin()); } - sort(buckets.begin(), buckets.end()); -#ifdef debug_od_clusterer - cerr << "path buckets:" << endl; - for (auto buck : buckets) { - cerr << "\t"; - for (auto i : items_on_path[buck]) { - cerr << i << " "; - } - cerr << endl; + // put the intervals in order by starting index and then descending by length + sort(mem_intervals.begin(), mem_intervals.end(), [](const pair& a, const pair& b) { + return a.first < b.first || (a.first == b.first && a.second > b.second); + }); + +#ifdef debug_median_algorithm + cerr << "intervals:" << endl; + for (const auto& interval : mem_intervals) { + cerr << "\t[" << interval.first << ", " << interval.second << ")" << endl; } #endif + unordered_map coverage_count; - // use the path strands to bucket distance measurements - for (size_t path_bucket : buckets) { -#ifdef debug_od_clusterer - cerr << "doing a bucketed comparison of items on the path ranked " << path_bucket << endl; -#endif - vector& bucket = items_on_path[path_bucket]; - for (size_t i = 1; i < bucket.size(); i++) { - size_t prev = bucket[i - 1]; - size_t here = bucket[i]; - - // have these items already been identified as on the same strand? - if (component_union_find.find_group(prev) == component_union_find.find_group(here)) { - continue; - } - - // estimate the distance - pos_t pos_prev = get_position(prev); - pos_t pos_here = get_position(here); - -#ifdef debug_od_clusterer - cerr << "measuring distance between " << prev << " at " << pos_prev << " and " << here << " at " << pos_here << endl; -#endif - - int64_t dist = xgindex->closest_shared_path_unstranded_distance(id(pos_prev), offset(pos_prev), is_rev(pos_prev), - id(pos_here), offset(pos_here), is_rev(pos_here), - 50, paths_of_node_memo, oriented_occurences_memo, handle_memo); - - - // did we get a successful estimation? 
- if (dist == numeric_limits::max()) { -#ifdef debug_od_clusterer - cerr << "they don't appear to be on the same path, skipping" << endl; -#endif - continue; - } - - // add the fixed offset from the hit position - dist += get_offset(here) - get_offset(prev); - -#ifdef debug_od_clusterer - cerr << "recording distance at " << dist << endl; -#endif - - // merge them into a strand cluster - recorded_finite_dists[make_pair(prev, here)] = dist; - num_possible_merges_remaining -= component_union_find.group_size(prev) * component_union_find.group_size(here); - component_union_find.union_groups(prev, here); - } - } + // a pointer to the read index we're currently at + int64_t at = 0; + // to keep track of how many intervals cover the current segment + int64_t depth = 0; - exclude_dist_tree_merges_by_components(max_failed_distance_probes, num_possible_merges_remaining, component_union_find, num_infinite_dists, - non_path_hits, xgindex, get_position, get_offset, paths_of_node_memo); -} + // we can keep track of the SMEM we're in by checking whether we've passed its final index + pair curr_smem(0, 0); + // and the number of hits of this SMEM we've seen + int64_t curr_smem_hit_count = 0; + // we will skip one copy of each sub-MEM (heurstically assuming it's redundant with the parent) + // per copy of the SMEM + unordered_map, int64_t> skipped_sub_mems; -void OrientedDistanceClusterer::extend_dist_tree_by_strand_buckets(int64_t max_failed_distance_probes, - size_t& num_possible_merges_remaining, - UnionFind& component_union_find, - unordered_map, int64_t>& recorded_finite_dists, - map, size_t>& num_infinite_dists, - size_t num_items, - xg::XG* xgindex, - const function& get_position, - const function& get_offset, - paths_of_node_memo_t* paths_of_node_memo, - oriented_occurences_memo_t* oriented_occurences_memo, - handle_memo_t* handle_memo) { - if (!paths_of_node_memo || !oriented_occurences_memo) { - return; - } + // the sort order ensures we will encounter the interval starts in order, we use a priority queue + // to also ensure that we will encounter their ends in order + priority_queue, greater> ends; -#ifdef debug_od_clusterer - cerr << "using strands to bucket distance comparisons" << endl; + for (size_t i = 0; i < mem_intervals.size(); i++) { + pair& interval = mem_intervals[i]; + +#ifdef debug_median_algorithm + cerr << "iter for interval [" << interval.first << ", " << interval.second << "), starting at " << at << endl; #endif - - // enter which paths occur on the nodes of each hit into the memo - for (size_t i = 0; i < num_items; i++) { - pos_t pos = get_position(i); -#ifdef debug_od_clusterer - cerr << "adding position " << pos << " to memo" << endl; + + if (interval.second > curr_smem.second) { + // we're in a MEM that covers distinct sequence from the current SMEM, so this is + // a new SMEM (because of sort order) + curr_smem = interval; + curr_smem_hit_count = 1; +#ifdef debug_median_algorithm + cerr << "\tthis is a new SMEM" << endl; #endif - - if (!paths_of_node_memo->count(id(pos))) { - (*paths_of_node_memo)[id(pos)] = xgindex->paths_of_node(id(pos)); } - for (size_t path : paths_of_node_memo->at(id(pos))) { - if (!oriented_occurences_memo->count(make_pair(id(pos), path))) { - (*oriented_occurences_memo)[make_pair(id(pos), path)] = xgindex->oriented_occurrences_on_path(id(pos), path); -#ifdef debug_od_clusterer - cerr << "node " << id(pos) << " has occurrences on path " << path << ":" << endl; - for (auto occurrence : (*oriented_occurences_memo)[make_pair(id(pos), path)]) { - cerr << 
"\t" << occurrence.first << " " << (occurrence.second ? "rev" : "fwd") << endl; - } + else if (interval == curr_smem) { + // this is another hit of the same SMEM, increase the count + curr_smem_hit_count++; +#ifdef debug_median_algorithm + cerr << "\tthis is a repeat of the current SMEM" << endl; #endif - } } - } - -#ifdef debug_od_clusterer - cerr << "reversing node to strand memo" << endl; + else if (skipped_sub_mems[interval] < curr_smem_hit_count) { + // we're in a MEM that covers a strict subinterval of the current SMEM, so skip + // one sub-MEM per hit of the SMEM on the assumption that it's redundant + skipped_sub_mems[interval]++; +#ifdef debug_median_algorithm + cerr << "\tthis is a sub-MEM we must skip" << endl; #endif - - // reverse the memo so that it tells us which hits occur on a strand of a path and identify hits with no paths - unordered_map, vector> items_on_path_strand; - // record which hits aren't on a path and associate them with their nearest neighbor's node ID - unordered_map non_path_hits; - for (size_t i = 0; i < num_items; i++) { - pos_t pos = get_position(i); - vector& paths = paths_of_node_memo->at(id(pos)); - if (paths.empty()) { - // just add a sentinel for now - non_path_hits[i] = 0; + continue; } - else { - for (size_t path : paths) { - for (pair oriented_occurrence : oriented_occurences_memo->at(make_pair(id(pos), path))) { -#ifdef debug_od_clusterer - cerr << "position " << pos << " is on strand " << path << (oriented_occurrence.second != is_rev(pos) ? "-" : "+") << endl; + + // add the coverage of any segments that come before the start of this interval + while (ends.empty() ? false : ends.top() <= interval.first) { +#ifdef debug_median_algorithm + cerr << "\ttraversing interval end at " << ends.top() << " adding " << ends.top() - at << " to depth " << depth << endl; #endif - items_on_path_strand[make_pair(path, oriented_occurrence.second != is_rev(pos))].push_back(i); - } - } + coverage_count[depth] += ends.top() - at; + at = ends.top(); + ends.pop(); + + // an interval is leaving scope, decrement the depth + depth--; } - } - - // check the nearest nodes to each singleton to see if we can use it to bucket the item - for (pair& non_path_hit : non_path_hits) { - pos_t pos = get_position(non_path_hit.first); - handle_t handle = xgindex->memoized_get_handle(id(pos), is_rev(pos), handle_memo); - size_t right_dist = xgindex->get_length(handle) - offset(pos); - size_t trav_dist = min(offset(pos), right_dist); - // TODO: magic number (matches the distance used in the permutations step) - if (trav_dist <= 50) { - bool go_left = offset(pos) < right_dist; - function bucket_using_neighbors = [&](const handle_t& handle) { - id_t neighbor_id = xgindex->get_id(handle); - bool neighbor_rev = xgindex->get_is_reverse(handle); - if (!paths_of_node_memo->count(neighbor_id)) { - (*paths_of_node_memo)[neighbor_id] = xgindex->paths_of_node(neighbor_id); - } - auto& neighbor_paths = paths_of_node_memo->at(neighbor_id); - for (size_t path : neighbor_paths) { - for (pair& node_occurence : xgindex->memoized_oriented_occurrences_on_path(neighbor_id, path, oriented_occurences_memo)) { -#ifdef debug_od_clusterer - cerr << "position " << pos << " has neighbor " << neighbor_id << " on strand " << path << (node_occurence.second != neighbor_rev ? 
"-" : "+") << endl; -#endif - items_on_path_strand[make_pair(path, node_occurence.second != neighbor_rev)].push_back(non_path_hit.first); - } - } - // replace the sentinel value with the actual neighbor's ID (but only if we find a path on it) - if (!neighbor_paths.empty()) { - non_path_hit.second = neighbor_id; - } - return true; - }; - xgindex->follow_edges(handle, go_left, bucket_using_neighbors); + + // if there's an initial interval of 0 depth, we ignore it (helps with read-end effects from sequencers) + if (at > 0 || depth > 0) { +#ifdef debug_median_algorithm + cerr << "\ttraversing pre-interval segment staring from " << at << " adding " << interval.first - at << " to depth " << depth << endl; +#endif + coverage_count[depth] += interval.first - at; + } +#ifdef debug_median_algorithm + else { + cerr << "\tskipping an initial segment from " << at << " to " << interval.first << " with depth " << depth << endl; } +#endif + + + at = interval.first; + // an interval is entering scope, increment the depth + depth++; + ends.push(interval.second); + } + + // run through the rest of the ends + while (!ends.empty()) { +#ifdef debug_median_algorithm + cerr << "\ttraversing interval end at " << ends.top() << " adding " << ends.top() - at << " to depth " << depth << endl; +#endif + coverage_count[depth] += ends.top() - at; + at = ends.top(); + ends.pop(); + + // an interval is leaving scope, decrement the depth + depth--; + } + + // NOTE: we used to count the final interval of depth 0 here, but now we ignore 0-depth terminal intervals + // because it seems to help with the read-end effects of sequencers (which can lead to match dropout) + //coverage_count[0] += aln.sequence().size() - at; + + // convert it into a CDF over read coverage + vector> cumul_coverage_count(coverage_count.begin(), coverage_count.end()); + sort(cumul_coverage_count.begin(), cumul_coverage_count.end()); + +#ifdef debug_median_algorithm + cerr << "\tcoverage distr is: " ; + for (const auto& record : cumul_coverage_count) { + cerr << record.first << ":" << record.second << " "; } + cerr << endl; +#endif - // make sure the items are unique with each list of hits and generate a system-independent ordering over strands - vector> buckets; - buckets.reserve(items_on_path_strand.size()); - for (pair, vector>& strand_bucket : items_on_path_strand) { - sort(strand_bucket.second.begin(), strand_bucket.second.end()); - auto new_end = unique(strand_bucket.second.begin(), strand_bucket.second.end()); - strand_bucket.second.resize(new_end - strand_bucket.second.begin()); - buckets.push_back(strand_bucket.first); + int64_t cumul = 0; + for (pair& coverage_record : cumul_coverage_count) { + coverage_record.second += cumul; + cumul = coverage_record.second; } - sort(buckets.begin(), buckets.end()); -#ifdef debug_od_clusterer - cerr << "strand buckets:" << endl; - for (auto buck : buckets) { - cerr << "\t"; - for (auto i : items_on_path_strand[buck]) { - cerr << i << " "; + // bisect to find the median + int64_t target = aln.sequence().size() / 2; + if (target <= cumul_coverage_count[0].second) { + return cumul_coverage_count[0].first; + } + int64_t low = 0; + int64_t hi = cumul_coverage_count.size() - 1; + int64_t mid; + while (hi > low + 1) { + mid = (hi + low) / 2; + + if (target <= cumul_coverage_count[mid].second) { + hi = mid; + } + else { + low = mid; } - cerr << endl; } +#ifdef debug_median_algorithm + cerr << "\tmedian is " << cumul_coverage_count[hi].first << endl; #endif + return cumul_coverage_count[hi].first; +} - // use the path 
strands to bucket distance measurements - for (pair& strand_bucket : buckets) { -#ifdef debug_od_clusterer - cerr << "doing a bucketed comparison of items on the path ranked " << strand_bucket.first << ", strand " << (strand_bucket.second ? "-" : "+") << endl; +void MEMClusterer::HitGraph::perform_dp() { + + for (HitNode& node : nodes) { + // as in local alignment, minimum score is the score of node itself + node.dp_score = node.score; + } + +#ifdef debug_mem_clusterer + cerr << "computing topological order for clustering DP" << endl; #endif - vector& bucket = items_on_path_strand[strand_bucket]; - for (size_t i = 1; i < bucket.size(); i++) { - size_t prev = bucket[i - 1]; - size_t here = bucket[i]; - - // have these items already been identified as on the same strand? - if (component_union_find.find_group(prev) == component_union_find.find_group(here)) { - continue; - } - - // estimate the distance - pos_t pos_prev = get_position(prev); - pos_t pos_here = get_position(here); - -#ifdef debug_od_clusterer - cerr << "measuring distance between " << prev << " at " << pos_prev << " and " << here << " at " << pos_here << endl; + + vector order; + topological_order(order); + + for (size_t i : order) { + HitNode& node = nodes[i]; +#ifdef debug_mem_clusterer + cerr << "at node " << i << " with DP score " << node.dp_score << " and node score " << node.score << endl; #endif + // for each edge out of this node + for (HitEdge& edge : node.edges_from) { - int64_t dist = xgindex->closest_shared_path_oriented_distance(id(pos_prev), offset(pos_prev), is_rev(pos_prev), - id(pos_here), offset(pos_here), is_rev(pos_here), - false, 50, paths_of_node_memo, oriented_occurences_memo, handle_memo); - - - // did we get a successful estimation? - if (dist == numeric_limits::max()) { -#ifdef debug_od_clusterer - cerr << "they don't appear to be on the same path, skipping" << endl; + // check if the path through the node out of this edge increase score of target node + HitNode& target_node = nodes[edge.to_idx]; + int32_t extend_score = node.dp_score + edge.weight + target_node.score; + if (extend_score > target_node.dp_score) { +#ifdef debug_mem_clusterer + cerr << "extending DP to node " << edge.to_idx << " with score " << extend_score << endl; #endif - continue; + target_node.dp_score = extend_score; } - - // add the fixed offset from the hit position - dist += get_offset(here) - get_offset(prev); - -#ifdef debug_od_clusterer - cerr << "recording distance at " << dist << endl; -#endif - - // merge them into a strand cluster - recorded_finite_dists[make_pair(prev, here)] = dist; - num_possible_merges_remaining -= component_union_find.group_size(prev) * component_union_find.group_size(here); - component_union_find.union_groups(prev, here); } } - - exclude_dist_tree_merges_by_components(max_failed_distance_probes, num_possible_merges_remaining, component_union_find, num_infinite_dists, - non_path_hits, xgindex, get_position, get_offset, paths_of_node_memo); } + +void MEMClusterer::HitGraph::topological_order(vector& order_out) const { -void OrientedDistanceClusterer::extend_dist_tree_by_permutations(int64_t max_failed_distance_probes, - int64_t max_search_distance_to_path, - size_t decrement_frequency, - size_t& num_possible_merges_remaining, - UnionFind& component_union_find, - unordered_map, int64_t>& recorded_finite_dists, - map, size_t>& num_infinite_dists, - bool unstranded, - size_t num_items, - xg::XG* xgindex, - const function& get_position, - const function& get_offset, - paths_of_node_memo_t* 
paths_of_node_memo, - oriented_occurences_memo_t* oriented_occurences_memo, - handle_memo_t* handle_memo) { - - // We want to run through all possible pairsets of node numbers in a permuted order. - ShuffledPairs shuffled_pairs(num_items); - auto current_pair = shuffled_pairs.begin(); - size_t pairs_checked = 0; + // initialize return value + order_out.clear(); + order_out.resize(nodes.size()); + size_t order_idx = nodes.size() - 1; - // a simulated annealing parameter loosely inspired by the cutoff for an Erdos-Renyi random graph - // to be connected with probability approaching 1 - size_t current_max_num_probes = max_failed_distance_probes; + // initialize iteration structures + vector enqueued(nodes.size(), false); + vector edge_index(nodes.size(), 0); + vector stack; - while (num_possible_merges_remaining > 0 && current_pair != shuffled_pairs.end() && current_max_num_probes > 0) { - // slowly lower the number of distances we need to check before we believe that two clusters are on - // separate strands -#ifdef debug_od_clusterer - cerr << "checked " << pairs_checked << " pairs with max probes " << current_max_num_probes << ", decrement frequency " << decrement_frequency << ", merges remaining " << num_possible_merges_remaining << endl; -#endif - - if (pairs_checked % decrement_frequency == 0 && pairs_checked != 0) { - current_max_num_probes--; -#ifdef debug_od_clusterer - cerr << "reducing the max number of probes to " << current_max_num_probes << endl; -#endif - for (const pair, size_t>& inf_dist_record : num_infinite_dists) { - // break symmetry so we don't repeat the operation twice - if (inf_dist_record.first.first < inf_dist_record.first.second && inf_dist_record.second == current_max_num_probes) { - // this merge just fell below the new maximum number of distance probes - size_t strand_size_1 = component_union_find.group_size(inf_dist_record.first.first); - size_t strand_size_2 = component_union_find.group_size(inf_dist_record.first.second); - num_possible_merges_remaining -= strand_size_1 * strand_size_2; -#ifdef debug_od_clusterer - cerr << "after reduction, the total number of probes between strand " << inf_dist_record.first.first << " and " << inf_dist_record.first.second << " is above max, reducing possible merges by " << strand_size_1 * strand_size_2 << " to " << num_possible_merges_remaining << endl; -#endif + // iterate through starting nodes + for (size_t init_node_idx = 0; init_node_idx < nodes.size(); init_node_idx++) { + if (enqueued[init_node_idx]) { + continue; + } + // navigate through graph with DFS + stack.push_back(init_node_idx); + enqueued[init_node_idx] = true; + while (!stack.empty()) { + size_t node_idx = stack.back(); + size_t& edge_idx = edge_index[node_idx]; + if (edge_idx < nodes[node_idx].edges_from.size()) { + size_t target_idx = nodes[node_idx].edges_from[edge_idx].to_idx; + if (enqueued[target_idx]) { + edge_index[node_idx]++; + } + else { + stack.push_back(target_idx); + enqueued[target_idx] = true; } } + else { + // add to topological order in reverse finishing order + stack.pop_back(); + order_out[order_idx] = node_idx; + order_idx--; + } } - - - pair node_pair = *current_pair; - ++current_pair; - - pairs_checked++; - - size_t strand_1 = component_union_find.find_group(node_pair.first); - size_t strand_2 = component_union_find.find_group(node_pair.second); - -#ifdef debug_od_clusterer - cerr << "checking MEMs " << node_pair.first << " and " << node_pair.second << " in cluster " << strand_1 << " and " << strand_2 << endl; -#endif - - if (strand_1 
== strand_2) { - // these are already identified as on the same strand, don't need to do it again -#ifdef debug_od_clusterer - cerr << "already on same strand" << endl; -#endif - continue; - } - - auto num_failed_probes = num_infinite_dists.find(make_pair(strand_1, strand_2)); - if (num_failed_probes == num_infinite_dists.end() ? false : num_failed_probes->second >= current_max_num_probes) { - // we've already checked multiple distances between these strand clusters and - // none have returned a finite distance, so we conclude that they are in fact - // on separate clusters and decline to check any more distances -#ifdef debug_od_clusterer - cerr << "already have checked distance above maximum number of probes" << endl; -#endif - continue; + } +} + +void MEMClusterer::HitGraph::component_topological_order(const vector& component, + vector& order_out) const { + // initialize return value + order_out.clear(); + order_out.resize(component.size()); + + vector in_degree(component.size()); + vector stack; + unordered_map node_idx_to_component_idx; + for (size_t i = 0; i < component.size(); i++) { + in_degree[i] = nodes[component[i]].edges_to.size(); + if (in_degree[i] == 0) { + stack.push_back(i); } + node_idx_to_component_idx[component[i]] = i; + } + + size_t order_idx = 0; + while (!stack.empty()) { + size_t i = stack.back(); + stack.pop_back(); - const pos_t& pos_1 = get_position(node_pair.first); - const pos_t& pos_2 = get_position(node_pair.second); + order_out[order_idx] = i; + order_idx++; - int64_t oriented_dist; - if (unstranded) { - oriented_dist = xgindex->closest_shared_path_unstranded_distance(id(pos_1), offset(pos_1), is_rev(pos_1), - id(pos_2), offset(pos_2), is_rev(pos_2), - max_search_distance_to_path, paths_of_node_memo, - oriented_occurences_memo, handle_memo); - } - else { - oriented_dist = xgindex->closest_shared_path_oriented_distance(id(pos_1), offset(pos_1), is_rev(pos_1), - id(pos_2), offset(pos_2), is_rev(pos_2), false, - max_search_distance_to_path, paths_of_node_memo, - oriented_occurences_memo, handle_memo); + for (const HitEdge& edge : nodes[component[i]].edges_from) { + size_t j = node_idx_to_component_idx[edge.to_idx]; + in_degree[j]--; + if (in_degree[j] == 0) { + stack.push_back(j); + } + } + } +} + +void MEMClusterer::HitGraph::identify_sources_and_sinks(vector& sources_out, + vector& sinks_out) const { + + sources_out.clear(); + sinks_out.clear(); + + vector is_source(nodes.size(), true); + + for (size_t i = 0; i < nodes.size(); i++) { + if (nodes[i].edges_from.empty()) { + sinks_out.push_back(i); } -#ifdef debug_od_clusterer - cerr << "distance between " << pos_1 << " and " << pos_2 << " estimated at " << oriented_dist << endl; -#endif - - if (oriented_dist == std::numeric_limits::max()) { - // distance is estimated at infinity, so these are either on different strands - // or the path heuristic failed to find a shared path - - if (num_failed_probes == num_infinite_dists.end()) { - num_failed_probes = num_infinite_dists.insert(pair, size_t>(make_pair(strand_1, strand_2), 1)).first; - num_infinite_dists[make_pair(strand_2, strand_1)] = 1; - } - else { - num_failed_probes->second++; - num_infinite_dists[make_pair(strand_2, strand_1)]++; - } - - - // this infinite distance pushed the count over the maximum number of probes, so remove - // these merges from the pool of potential merges remaining - if (num_failed_probes->second >= current_max_num_probes) { - size_t strand_size_1 = component_union_find.group_size(strand_1); - size_t strand_size_2 = 
component_union_find.group_size(strand_2); - - num_possible_merges_remaining -= strand_size_1 * strand_size_2; - -#ifdef debug_od_clusterer - cerr << "number of probes " << num_failed_probes->second << " crossed max threshold of " << current_max_num_probes << ", reducing possible merges by " << strand_size_1 * strand_size_2 << " to " << num_possible_merges_remaining << endl; + for (const HitEdge& edge : nodes[i].edges_from) { + is_source[edge.to_idx] = false; + } + } + + for (size_t i = 0; i < nodes.size(); i++) { + if (is_source[i]) { + sources_out.push_back(i); + } + } +} + +vector MEMClusterer::HitGraph::clusters(const Alignment& alignment, + const GSSWAligner* aligner, + int32_t max_qual_score, + int32_t log_likelihood_approx_factor, + size_t min_median_mem_coverage_for_split, + double suboptimal_edge_pruning_factor, + double cluster_multiplicity_diff) { + + vector to_return; + if (nodes.size() == 0) { + // this should only happen if we have filtered out all MEMs, so there are none to cluster + return to_return; + } + +#ifdef debug_mem_clusterer + cerr << "performing approximate DP across MEMs" << endl; +#endif + perform_dp(); + +#ifdef debug_mem_clusterer + cerr << "finding top tracebacks within connected components" << endl; #endif + // find the weakly connected components, which should correspond to mappings + vector> components; + connected_components(components); + +#ifdef debug_mem_clusterer + cerr << "traceback returns the following components: " << endl; + for (size_t i = 0; i < components.size(); i++) { + vector& component = components[i]; + cerr << "\tcomponent " << i << ":" << endl; + for (size_t idx : component) { + cerr << "\t\t" << idx << " " << nodes[idx].start_pos << " "; + for (auto iter = nodes[idx].mem->begin; iter != nodes[idx].mem->end; iter++) { + cerr << *iter; } + cerr << endl; } - else { - // the distance is finite, so merge the strand clusters - - // add the fixed offset of the hit from the start position - oriented_dist += get_offset(node_pair.second) - get_offset(node_pair.first); - - recorded_finite_dists[node_pair] = oriented_dist; - - size_t strand_size_1 = component_union_find.group_size(strand_1); - size_t strand_size_2 = component_union_find.group_size(strand_2); - - component_union_find.union_groups(node_pair.first, node_pair.second); - - // remove these from the pool of remaining merges - num_possible_merges_remaining -= strand_size_1 * strand_size_2; - - size_t strand_retaining = component_union_find.find_group(node_pair.first); - size_t strand_removing = strand_retaining == strand_1 ? 
strand_2 : strand_1; - -#ifdef debug_od_clusterer - cerr << "probe triggered group merge, reducing possible merges by " << strand_size_1 * strand_size_2 << " to " << num_possible_merges_remaining << " and retaining strand " << strand_retaining << endl; + } #endif - - // get the ranges in the counter for failed distance probe records for both of the strands - auto removing_iter = num_infinite_dists.lower_bound(make_pair(strand_removing, 0)); - auto removing_end = num_infinite_dists.upper_bound(make_pair(strand_removing, numeric_limits::max())); - auto retaining_iter = num_infinite_dists.lower_bound(make_pair(strand_retaining, 0)); - auto retaining_end = num_infinite_dists.upper_bound(make_pair(strand_retaining, numeric_limits::max())); - - vector> unseen_comparisons; - while (removing_iter != removing_end && retaining_iter != retaining_end) { - if (removing_iter->first.second == retaining_iter->first.second) { - // both the removing and the retaining strand cluster have failed probes against this cluster so - // we need to combine the records - - // check if we've already marked some of these merges as off limits - bool retaining_already_blocked = retaining_iter->second >= current_max_num_probes; - bool removing_already_blocked = removing_iter->second >= current_max_num_probes; - - // add the counts together - retaining_iter->second += removing_iter->second; - num_infinite_dists[make_pair(retaining_iter->first.second, strand_retaining)] += removing_iter->second; - - // update the number of possible merges remaining - if (retaining_already_blocked && !removing_already_blocked) { - num_possible_merges_remaining -= (strand_retaining == strand_1 ? strand_size_2 : strand_size_1) * component_union_find.group_size(removing_iter->first.second); - -#ifdef debug_od_clusterer - cerr << "after merge, the total number of probes against strand " << removing_iter->first.second << " increased to " << retaining_iter->second << ", above current max of " << current_max_num_probes << ", but the retaining strand is already blocked, reducing possible merges by " << (strand_retaining == strand_1 ? strand_size_2 : strand_size_1) * component_union_find.group_size(removing_iter->first.second) << " to " << num_possible_merges_remaining << endl; + + if (min_median_mem_coverage_for_split) { +#ifdef debug_mem_clusterer + cerr << "looking for high coverage clusters to split" << endl; #endif - } - else if (removing_already_blocked && !retaining_already_blocked) { - num_possible_merges_remaining -= (strand_retaining == strand_1 ? strand_size_1 : strand_size_2) * component_union_find.group_size(removing_iter->first.second); - -#ifdef debug_od_clusterer - cerr << "after merge, the total number of probes against strand " << removing_iter->first.second << " increased to " << retaining_iter->second << ", above current max of " << current_max_num_probes << ", but the removing strand is already blocked, reducing possible merges by " << (strand_retaining == strand_1 ? 
strand_size_1 : strand_size_2) * component_union_find.group_size(removing_iter->first.second) << " to " << num_possible_merges_remaining << endl; + size_t num_original_components = components.size(); + for (size_t i = 0; i < num_original_components; i++) { +#ifdef debug_mem_clusterer + cerr << "component " << i << " has median coverage " << median_mem_coverage(components[i], alignment) << endl; #endif - } - else if (!retaining_already_blocked && !removing_already_blocked && retaining_iter->second >= current_max_num_probes) { - num_possible_merges_remaining -= (strand_size_1 + strand_size_2) * component_union_find.group_size(removing_iter->first.second); - -#ifdef debug_od_clusterer - cerr << "after merge, the total number of probes against strand " << removing_iter->first.second << " increased to " << retaining_iter->second << ", above current max of " << current_max_num_probes << ", reducing possible merges by " << (strand_size_1 + strand_size_2) * component_union_find.group_size(removing_iter->first.second) << " to " << num_possible_merges_remaining << endl; + size_t curr_num_components = components.size(); + if (median_mem_coverage(components[i], alignment) >= min_median_mem_coverage_for_split) { + //#pragma omp atomic + // SPLIT_ATTEMPT_COUNTER++; +#ifdef debug_mem_clusterer + cerr << "attempting to prune and split cluster" << endl; #endif - - } - removing_iter++; - retaining_iter++; - } - else if (removing_iter->first.second < retaining_iter->first.second) { - // the strand being removed has probes against this strand cluster, but the strand being - // retained does not, mark this and save it for later so that we don't invalidate the range - unseen_comparisons.emplace_back(removing_iter->first.second, removing_iter->second); - removing_iter++; + + prune_low_scoring_edges(components, i, suboptimal_edge_pruning_factor); + + if (components.size() > curr_num_components) { + //#pragma omp atomic + // SUCCESSFUL_SPLIT_ATTEMPT_COUNTER++; } - else { - // the strand being retained has probes against this strand cluster, but the strand being - // removed does not, check if we need to add the removing strand to the remaining merges - // counter - if (retaining_iter->second >= current_max_num_probes) { - num_possible_merges_remaining -= (strand_retaining == strand_1 ? strand_size_2 : strand_size_1) * component_union_find.group_size(retaining_iter->first.second); - -#ifdef debug_od_clusterer - cerr << "after merge, the total number of probes against strand " << retaining_iter->first.second << " increased to " << retaining_iter->second << ", above current max of " << current_max_num_probes << ", but the retaining strand is already blocked, reducing possible merges by " << (strand_retaining == strand_1 ? 
strand_size_2 : strand_size_1) * component_union_find.group_size(retaining_iter->first.second) << " to " << num_possible_merges_remaining << endl; + } + } +#ifdef debug_mem_clusterer + vector> current_components; + connected_components(current_components); + cerr << "after splitting, from " << num_original_components << " to " << current_components.size() << " connected components" << endl; #endif - } - retaining_iter++; - } + //#pragma omp atomic + // PRE_SPLIT_CLUSTER_COUNTER += num_original_components; + //#pragma omp atomic + // POST_SPLIT_CLUSTER_COUNTER += components.size(); + } + + + // find the node with the highest DP score in each connected component + // each record is a pair of (score lower bound, node index) + vector>> component_traceback_ends(components.size(), + make_pair(numeric_limits::min(), vector())); + for (size_t i = 0; i < components.size(); i++) { + vector& component = components[i]; + pair>& traceback_end = component_traceback_ends[i]; + for (size_t j = 0; j < component.size(); j++) { + int32_t dp_score = nodes[component[j]].dp_score; + if (dp_score > traceback_end.first) { + // this is better than all previous scores, so throw anything we have away + traceback_end.first = dp_score; + traceback_end.second.clear(); + traceback_end.second.push_back(component[j]); } - - // finish off either range - while (removing_iter != removing_end) { - unseen_comparisons.emplace_back(removing_iter->first.second, removing_iter->second); - removing_iter++; + else if (dp_score == traceback_end.first) { + // this is equivalent to the current best, so hold onto both + traceback_end.second.push_back(component[j]); } - while (retaining_iter != retaining_end) { - if (retaining_iter->second >= current_max_num_probes) { - num_possible_merges_remaining -= (strand_retaining == strand_1 ? strand_size_2 : strand_size_1) * component_union_find.group_size(retaining_iter->first.second); - -#ifdef debug_od_clusterer - cerr << "after merge, the total number of probes against strand " << retaining_iter->first.second << " increased to " << retaining_iter->second << ", above current max of " << current_max_num_probes << ", but the retaining strand is already blocked, reducing possible merges by " << (strand_retaining == strand_1 ? 
strand_size_2 : strand_size_1) * component_union_find.group_size(retaining_iter->first.second) << " to " << num_possible_merges_remaining << endl; + } + } + //#pragma omp atomic + // CLUSTER_TOTAL += component_traceback_ends.size(); + + std::make_heap(component_traceback_ends.begin(), component_traceback_ends.end()); + + // estimate the minimum score a cluster must obtain to even affect the mapping quality + // TODO: this approximation could break down sometimes, need to look into it + int32_t top_score = component_traceback_ends.front().first; + int32_t suboptimal_score_cutoff = top_score - log_likelihood_approx_factor * aligner->mapping_quality_score_diff(max_qual_score); + // keep track of the scores of the clusters we take off the heap + vector returned_cluster_scores; + while (!component_traceback_ends.empty()) { + // get the next highest scoring traceback end(s) + auto traceback_end = component_traceback_ends.front(); + +#ifdef debug_mem_clusterer + cerr << "checking traceback of component starting at " << traceback_end.second.front() << endl; +#endif + // if this cluster does not look like it even affect the mapping quality of the top scoring + // cluster, don't bother forming it + if (traceback_end.first < suboptimal_score_cutoff) { +#ifdef debug_mem_clusterer + cerr << "skipping rest of components on account of low score of " << traceback_end.first << " compared to max score " << top_score << " and cutoff " << suboptimal_score_cutoff << endl; #endif - } - retaining_iter++; - } + //#pragma omp atomic + // PRUNE_COUNTER += component_traceback_ends.size() + 1; + break; + } + + // we're going to add this cluster to the return vector, take it off the heap + std::pop_heap(component_traceback_ends.begin(), component_traceback_ends.end()); + component_traceback_ends.pop_back(); + + // get the index of the node + vector& trace_stack = traceback_end.second; + + // traceback all optimal paths in this connected component + + // keep track of which indexes have already been added to the stack + unordered_set stacked{trace_stack.begin(), trace_stack.end()}; + + while (!trace_stack.empty()) { + size_t trace_idx = trace_stack.back(); + trace_stack.pop_back(); +#ifdef debug_mem_clusterer + cerr << "\ttracing back from " << trace_idx << " with DP score " << nodes[trace_idx].dp_score << " and node score " << nodes[trace_idx].score << endl; +#endif - // add the probes between the removing strands and clusters that had never been compared to the retaining strand - for (const pair& unseen_comparison : unseen_comparisons) { - num_infinite_dists[make_pair(unseen_comparison.first, strand_retaining)] = unseen_comparison.second; - num_infinite_dists[make_pair(strand_retaining, unseen_comparison.first)] = unseen_comparison.second; - - if (unseen_comparison.second >= current_max_num_probes) { - num_possible_merges_remaining -= (strand_retaining == strand_1 ? strand_size_1 : strand_size_2) * component_union_find.group_size(unseen_comparison.first); - -#ifdef debug_od_clusterer - cerr << "after merge, the total number of probes against strand " << unseen_comparison.first << " increased to " << unseen_comparison.second << ", above current max of " << current_max_num_probes << ", but the removing strand is already blocked, reducing possible merges by " << (strand_retaining == strand_1 ? 
strand_size_2 : strand_size_1) * component_union_find.group_size(unseen_comparison.first) << " to " << num_possible_merges_remaining << endl; + int32_t target_source_score = nodes[trace_idx].dp_score - nodes[trace_idx].score; + for (HitEdge& edge : nodes[trace_idx].edges_to) { +#ifdef debug_mem_clusterer + cerr << "\t\ttrace from " << edge.to_idx << " would have score " << nodes[edge.to_idx].dp_score + edge.weight + nodes[trace_idx].score << endl; +#endif + if (nodes[edge.to_idx].dp_score + edge.weight == target_source_score && !stacked.count(edge.to_idx)) { + trace_stack.push_back(edge.to_idx); + stacked.insert(edge.to_idx); +#ifdef debug_mem_clusterer + cerr << "\t\tidentifying this as a proper traceback that we have not yet traced" << endl; #endif } } - - // find the range containing the records with the removing strand again (it may have changed since we - // altered the map) - removing_iter = num_infinite_dists.lower_bound(make_pair(strand_removing, 0)); - removing_end = num_infinite_dists.upper_bound(make_pair(strand_removing, numeric_limits::max())); - if (removing_iter != removing_end) { - // move the end so that it is an inclusive range - removing_end--; - - // erase the range - if (removing_iter == removing_end) { - if (removing_iter->first.first != removing_iter->first.second) { - num_infinite_dists.erase(make_pair(removing_iter->first.second, removing_iter->first.first)); - } - num_infinite_dists.erase(removing_iter); - } - else { - // erase the previous position on each iteration so that we don't invalidate the iterator before - // we use it to move to the next position - auto removing_iter_prev = removing_iter; - removing_iter++; - while (removing_iter != removing_end) { - if (removing_iter_prev->first.first != removing_iter_prev->first.second) { - num_infinite_dists.erase(make_pair(removing_iter_prev->first.second, removing_iter_prev->first.first)); - } - num_infinite_dists.erase(removing_iter_prev); - removing_iter_prev = removing_iter; - removing_iter++; - } - if (removing_iter_prev->first.first != removing_iter_prev->first.second) { - num_infinite_dists.erase(make_pair(removing_iter_prev->first.second, removing_iter_prev->first.first)); - } - num_infinite_dists.erase(removing_iter_prev); - if (removing_iter->first.first != removing_iter->first.second) { - num_infinite_dists.erase(make_pair(removing_iter->first.second, removing_iter->first.first)); - } - num_infinite_dists.erase(removing_iter); - } - } + } + + // make a cluster + to_return.emplace_back(); + auto& cluster = to_return.back(); + for (size_t traced_idx : stacked) { + HitNode& node = nodes[traced_idx]; + cluster.first.emplace_back(node.mem, node.start_pos); + } + // it starts with multiplicity 1 be default + cluster.second = 1.0; + // keep track of its score for further multiplicity calculations + returned_cluster_scores.push_back(traceback_end.first); + + // put the cluster in order by read position + sort(cluster.first.begin(), cluster.first.end(), [](const hit_t& hit_1, const hit_t& hit_2) { + return hit_1.first->begin < hit_2.first->begin || + (hit_1.first->begin == hit_2.first->begin && hit_1.first->end < hit_2.first->end); + }); + } + + // find out how many of the remaining clusters had similar score to the final + // ones we're returning + int32_t tail_equiv_diff = round(aligner->mapping_quality_score_diff(cluster_multiplicity_diff)); + int32_t min_tail_score = returned_cluster_scores.back() - tail_equiv_diff; + int64_t num_tail_cutoff = 0; + while (!component_traceback_ends.empty() && + 
component_traceback_ends.front().first >= min_tail_score) { + // count it and remove it + ++num_tail_cutoff; + std::pop_heap(component_traceback_ends.begin(), component_traceback_ends.end()); + component_traceback_ends.pop_back(); + } + if (num_tail_cutoff > 0) { + // find out how many of the clusters we're returning also have similar score + int32_t max_tail_score = returned_cluster_scores.back() + tail_equiv_diff; + int64_t max_tail_idx = to_return.size() - 1; + while (max_tail_idx > 0 && returned_cluster_scores[max_tail_idx - 1] <= max_tail_score) { + --max_tail_idx; + } + + // assign the corresponding multiplicity to all of clusters we're returning with similar scores + double cluster_multiplicity = (double(to_return.size() - max_tail_idx + num_tail_cutoff) + / double(to_return.size() - max_tail_idx)); + for (int64_t i = max_tail_idx; i < to_return.size(); ++i) { + to_return[i].second = cluster_multiplicity; } } + + return to_return; +} + +vector MEMClusterer::clusters(const Alignment& alignment, + const vector& mems, + const GSSWAligner* aligner, + size_t min_mem_length , + int32_t max_qual_score, + int32_t log_likelihood_approx_factor, + size_t min_median_mem_coverage_for_split, + double suboptimal_edge_pruning_factor, + double cluster_multiplicity_diff, + const match_fanouts_t* fanouts) { + + HitGraph hit_graph = make_hit_graph(alignment, mems, aligner, min_mem_length, fanouts); + return hit_graph.clusters(alignment, aligner, max_qual_score, log_likelihood_approx_factor, + min_median_mem_coverage_for_split, suboptimal_edge_pruning_factor, + cluster_multiplicity_diff); + } -vector> OrientedDistanceClusterer::flatten_distance_tree(size_t num_items, - const unordered_map, int64_t>& recorded_finite_dists) { +MEMClusterer::HitGraph NullClusterer::make_hit_graph(const Alignment& alignment, const vector& mems, + const GSSWAligner* aligner, size_t min_mem_length, + const match_fanouts_t* fanouts) { + // intialize the hit nodes, but do not add any edges, and ignore the min mem length + return HitGraph(mems, alignment, aligner, 1, fanouts); +} + +vector, int64_t>> NullClusterer::pair_clusters(const Alignment& alignment_1, + const Alignment& alignment_2, + const vector& left_clusters, + const vector& right_clusters, + const vector>& left_alt_cluster_anchors, + const vector>& right_alt_cluster_anchors, + int64_t optimal_separation, + int64_t max_deviation) { + // do not cluster pairs. 
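// --------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): the tail-multiplicity
// bookkeeping at the end of HitGraph::clusters(), shown on plain score
// vectors. `returned_scores` holds the scores of the clusters we kept, in the
// order they came off the heap (non-increasing); `dropped_scores` holds the
// scores of the clusters left behind. Kept clusters whose scores lie within
// `equiv_diff` of the last kept score share the near-tied dropped clusters,
// so their multiplicity becomes (similar kept + similar dropped) / (similar
// kept). All names below are hypothetical.
#include <cstdint>
#include <vector>

inline void assign_tail_multiplicity(const std::vector<int32_t>& returned_scores,
                                     const std::vector<int32_t>& dropped_scores,
                                     int32_t equiv_diff,
                                     std::vector<double>& multiplicity_out) {
    multiplicity_out.assign(returned_scores.size(), 1.0);
    if (returned_scores.empty()) {
        return;
    }
    int32_t last_kept = returned_scores.back();
    // count dropped clusters that scored nearly as well as the last kept one
    int64_t num_tail_cutoff = 0;
    for (int32_t score : dropped_scores) {
        if (score >= last_kept - equiv_diff) {
            ++num_tail_cutoff;
        }
    }
    if (num_tail_cutoff == 0) {
        return;
    }
    // find the kept clusters whose scores fall in the same equivalence band
    size_t max_tail_idx = returned_scores.size() - 1;
    while (max_tail_idx > 0 && returned_scores[max_tail_idx - 1] <= last_kept + equiv_diff) {
        --max_tail_idx;
    }
    double num_similar_kept = double(returned_scores.size() - max_tail_idx);
    double multiplicity = (num_similar_kept + double(num_tail_cutoff)) / num_similar_kept;
    for (size_t i = max_tail_idx; i < returned_scores.size(); ++i) {
        multiplicity_out[i] = multiplicity;
    }
}
// --------------------------------------------------------------------------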
+ return vector, int64_t>>(); +} -#ifdef debug_od_clusterer - cerr << "constructing strand distance tree from " << num_items << " distances records:" << endl; - for (const auto& record : recorded_finite_dists) { - cerr << "\t" << record.first.first << "->" << record.first.second << ": " << record.second << endl; - } -#endif +PathOrientedDistanceMeasurer::PathOrientedDistanceMeasurer(const PathPositionHandleGraph* graph, + const PathComponentIndex* path_component_index) : + graph(graph), path_component_index(path_component_index) { - // build the graph of relative distances in adjacency list representation - // by construction each strand cluster will be an undirected, unrooted tree - vector> strand_distance_tree(num_items); - for (const auto& dist_record : recorded_finite_dists) { - strand_distance_tree[dist_record.first.first].push_back(dist_record.first.second); - strand_distance_tree[dist_record.first.second].push_back(dist_record.first.first); +} + +int64_t PathOrientedDistanceMeasurer::oriented_distance(const pos_t& pos_1, const pos_t& pos_2) { + + /* + * STEP 1: TRAVERSE OUTWARD FROM BOTH POSITIONS TO FIND A SHARED PATH STRAND + */ + + // maps of oriented paths to (handle, oriented distance) tuples + unordered_map, vector>> path_strand_dists_1; + unordered_map, vector>> path_strand_dists_2; + + unordered_set> shared_path_strands; + + // ensure that the paths of the start nodes are added, even if their ends are too far away + // from the positions for the search to explore + // TODO: this leaves the ambiguity that a node might occur multiple times on the same path, in which case + // the tie for closest traversal to the path is broken arbitrarily + handle_t handle_1 = graph->get_handle(id(pos_1), is_rev(pos_1)); + handle_t handle_2 = graph->get_handle(id(pos_2), is_rev(pos_2)); + + for (const step_handle_t& step : graph->steps_of_handle(handle_1)) { + pair path_occurrence(graph->get_path_handle_of_step(step), + graph->get_handle_of_step(step) != handle_1); + path_strand_dists_1[path_occurrence].emplace_back(step, -((int64_t) offset(pos_1))); + +#ifdef debug_algorithms + cerr << "[PathDistance] first position " << id(pos_1) << "[" << offset(pos_1) << "]" << (is_rev(pos_1) ? "-" : "+") << " has an initial path occurrence on " << as_integer(graph->get_path_handle_of_step(step)) << (graph->get_handle_of_step(step) != handle_1 ? "-" : "+") << endl; +#endif } - // now approximate the relative positions along the strand by traversing each tree and - // treating the distances we estimated as transitive - vector> strand_relative_position; - vector processed(num_items, false); - for (size_t i = 0; i < num_items; i++) { - if (processed[i]) { - continue; + for (const step_handle_t& step : graph->steps_of_handle(handle_2)) { + pair path_occurrence(graph->get_path_handle_of_step(step), + graph->get_handle_of_step(step) != handle_2); + path_strand_dists_2[path_occurrence].emplace_back(step, -((int64_t) offset(pos_2))); + +#ifdef debug_algorithms + cerr << "[PathDistance] second position " << id(pos_2) << "[" << offset(pos_2) << "]" << (is_rev(pos_2) ? "-" : "+") << " has an initial path occurrence on " << as_integer(graph->get_path_handle_of_step(step)) << (graph->get_handle_of_step(step) != handle_2 ? 
"-" : "+") << endl; +#endif + + if (path_strand_dists_1.count(path_occurrence)) { +#ifdef debug_algorithms + cerr << "[PathDistance] this occurrence is on a shared path" << endl; +#endif + shared_path_strands.insert(path_occurrence); } -#ifdef debug_od_clusterer - cerr << "beginning a distance tree traversal at item " << i << endl; + } + + + // if we already found shared paths on the start nodes, don't search anymore + if (shared_path_strands.empty() && max_walk > 0) { +#ifdef debug_algorithms + cerr << "[PathDistance] no shared paths detected, beginning traversals" << endl; #endif - strand_relative_position.emplace_back(); - unordered_map& relative_pos = strand_relative_position.back(); - // arbitrarily make this node the 0 point - relative_pos[i] = 0; - processed[i] = true; + // priority queues over traversals + // distance is measure at the end of the node, so it's actually the distance + // to the next nodes we will traverse to + // there is a separate queue for each of the positions + RankPairingHeap, int64_t, greater> queue_1, queue_2; - // traverse the strand's tree with DFS - list queue{i}; - while (!queue.empty()) { - size_t curr = queue.back(); - queue.pop_back(); + queue_1.push_or_reprioritize(make_pair(handle_1, true), offset(pos_1) - graph->get_length(handle_1)); + queue_1.push_or_reprioritize(make_pair(handle_1, false), -offset(pos_1)); + queue_2.push_or_reprioritize(make_pair(handle_2, true), offset(pos_2) - graph->get_length(handle_2)); + queue_2.push_or_reprioritize(make_pair(handle_2, false), -offset(pos_2)); + + while (!(queue_1.empty() && queue_2.empty()) && shared_path_strands.empty()) { - int64_t curr_pos = relative_pos[curr]; +#ifdef debug_algorithms + cerr << "[PathDistance] choosing queue for next traversal" << endl; +#endif + // we'll use whichever queue has the shortest traversal so far + auto curr_queue = &queue_1; + auto curr_path_strand_dists = &path_strand_dists_1; + auto other_path_strand_dists = &path_strand_dists_2; + if (queue_1.empty() ? true : (queue_2.empty() ? false : queue_1.top().second > queue_2.top().second)) { + curr_queue = &queue_2; + std::swap(curr_path_strand_dists, other_path_strand_dists); + } - for (size_t next : strand_distance_tree[curr]) { - if (processed[next]) { - continue; + auto trav = curr_queue->top(); + curr_queue->pop(); + +#ifdef debug_algorithms + cerr << "[PathDistance] traversing " << graph->get_id(trav.first.first) << (graph->get_is_reverse(trav.first.first) ? "-" : "+") << " in " << (trav.first.second ? "leftward" : "rightward") << " direction at distance " << trav.second << endl; +#endif + + // don't look any further if the next closest traversal is beyond the maximum distance + if (trav.second > (int64_t) max_walk) { + break; + } + + int64_t dist = trav.second + graph->get_length(trav.first.first); + + if (!(curr_queue == &queue_1 && trav.first.first == handle_1) + && !(curr_queue == &queue_2 && trav.first.first == handle_2)) { + // this is not one of the start positions, so it might have new paths on it + for (const step_handle_t& step : graph->steps_of_handle(trav.first.first)) { + + pair path_occurrence(graph->get_path_handle_of_step(step), + graph->get_handle_of_step(step) != trav.first.first); + +#ifdef debug_algorithms + cerr << "\ttrav is on path " << as_integer(path_occurrence.first) << " in " << (path_occurrence.second ? 
"reverse" : "forward") << " orientation" << endl; +#endif + + if (!curr_path_strand_dists->count(path_occurrence)) { + // record the oriented distance to the forward beginning of the node, relative to the start traversal + (*curr_path_strand_dists)[path_occurrence].emplace_back(step, trav.first.second ? -dist : trav.second); + // have we found nodes that share a path yet? + if (other_path_strand_dists->count(path_occurrence)) { + shared_path_strands.insert(path_occurrence); + } + } } + } + + graph->follow_edges(trav.first.first, trav.first.second, [&](const handle_t& next) { +#ifdef debug_algorithms + cerr << "\tfollowing edge to " << graph->get_id(next) << (graph->get_is_reverse(next) ? "-" : "+") << " at dist " << dist << endl; +#endif + curr_queue->push_or_reprioritize(make_pair(next, trav.first.second), dist); + }); + } + } + +#ifdef debug_algorithms + cerr << "[PathDistance] found a shared path or exhausted search distance" << endl; +#endif + + /* + * STEP 2: COMPUTE THE MINIMUM DISTANCE ALONG ANY SHARED PATH STRANDS DISCOVERED + */ + + // we will look for minimum absolute distance, so set it to the max to begin + int64_t approx_dist = std::numeric_limits::max(); + for (const pair& oriented_path : shared_path_strands) { + +#ifdef debug_algorithms + cerr << "[PathDistance] estimating distance with shared path " << as_integer(oriented_path.first) << (oriented_path.second ? "-" : "+") << endl; +#endif + + for (const pair& node_trav_1 : path_strand_dists_1[oriented_path]) { + for (const pair& node_trav_2 : path_strand_dists_2[oriented_path]) { - // invert the sign of the distance if we originally measured it in the other order - int64_t dist = recorded_finite_dists.count(make_pair(curr, next)) ? - recorded_finite_dists.at(make_pair(curr, next)) : - -recorded_finite_dists.at(make_pair(next, curr)); + // the net distance searched between the two points to get to nodes on the path + int64_t relative_offset = node_trav_1.second - node_trav_2.second; - // find the position relative to the previous node we just traversed - relative_pos[next] = curr_pos + dist; - processed[next] = true; +#ifdef debug_algorithms + cerr << "[PathDistance] search offset adds up to " << relative_offset << endl; +#endif - queue.push_back(next); + // add in the interval along the path + if (oriented_path.second) { + // the interval is on the reverse strand, so we need measure from the end of the node, + // which is also the start of the next node + relative_offset += (graph->get_position_of_step(graph->get_next_step(node_trav_1.first)) + - graph->get_position_of_step(graph->get_next_step(node_trav_2.first))); + } + else { + relative_offset += (graph->get_position_of_step(node_trav_2.first) + - graph->get_position_of_step(node_trav_1.first)); + } + +#ifdef debug_algorithms + cerr << "[PathDistance] estimating distance on path " << as_integer(oriented_path.first) << (oriented_path.second ? 
"-" : "+") << " at " << relative_offset << endl; +#endif + + // find the minimum absolute distance, but retain signing + if (abs(relative_offset) < abs(approx_dist)) { + approx_dist = relative_offset; + } } } } - return strand_relative_position; -} - -vector> OrientedDistanceClusterer::compute_tail_mem_coverage(const Alignment& alignment, - const vector& mems) { - - // include an index for the past-the-last position on the read - vector> mem_tail_coverage(alignment.sequence().size() + 1); +#ifdef debug_algorithms + cerr << "[PathDistance] minimum distance is estimated at " << approx_dist << endl; +#endif - if (mems.empty()) { - return mem_tail_coverage; - } + return approx_dist; +} - // convert the MEMs to the read interval they cover - vector> mem_intervals; - mem_intervals.reserve(mems.size()); - for (int64_t i = 0; i < mems.size(); i++) { - if (!mems[i].nodes.empty()) { - mem_intervals.emplace_back(mems[i].begin - alignment.sequence().begin(), - mems[i].end - alignment.sequence().begin()); - } - } +vector> PathOrientedDistanceMeasurer::get_buckets(const function& get_position, size_t num_items) { +#ifdef debug_mem_clusterer + cerr << "using paths to bucket distance comparisons" << endl; +#endif - // ensure that the intervals are sorted lexicographically - if (!std::is_sorted(mem_intervals.begin(), mem_intervals.end())) { - std::sort(mem_intervals.begin(), mem_intervals.end()); - } + // the return value + vector> buckets; - // find number of SMEM beginnings strictly to the left of each position + // we will associate each path strand with the index of a bucket + unordered_map, size_t> bucket_of_path_strand; - int64_t last_mem_idx = mem_intervals.size() - 1; - int64_t mem_idx = 0; - size_t smem_count = 0; + // we will also keep track of any hits that were not on any path + vector non_path_hits; - // iterate through any sub-MEMs contained in the SMEM that share its start position - int64_t curr_mem_begin = mem_intervals[mem_idx].first; - int64_t curr_mem_end = mem_intervals[mem_idx].second; - while (mem_idx < last_mem_idx ? mem_intervals[mem_idx + 1].first == curr_mem_begin : false) { - mem_idx++; - } - for (int64_t i = 0; i < mem_tail_coverage.size(); i++) { - - mem_tail_coverage[i].first = smem_count; + for (size_t i = 0; i < num_items; i++) { + pos_t pos = get_position(i); +#ifdef debug_mem_clusterer + cerr << "adding position " << pos << " to memo" << endl; +#endif - // are we encountering the start of another SMEM - if (mem_idx < mem_intervals.size() ? i == mem_intervals[mem_idx].first : false) { - smem_count++; - // iterate to the next MEM that contains some new sequence - curr_mem_end = mem_intervals[mem_idx].second; - mem_idx++; - while (mem_idx < mems.size() ? mem_intervals[mem_idx].second <= curr_mem_end : false) { - mem_idx++; + // iterate over the path steps that this node is on + bool on_path = false; + for (const step_handle_t& step : graph->steps_of_handle(graph->get_handle(id(pos)))) { + on_path = true; + + // key indicating a path and a strand + pair key(graph->get_path_handle_of_step(step), + graph->get_is_reverse(graph->get_handle_of_step(step)) != is_rev(pos)); + + size_t bucket; + if (!bucket_of_path_strand.count(key)) { + // add a new bucket + bucket_of_path_strand[key] = buckets.size(); + bucket = buckets.size(); + buckets.emplace_back(); } - // iterate through any sub-MEMs contained in the SMEM that share its start position - curr_mem_begin = mem_intervals[mem_idx].first; - while (mem_idx < last_mem_idx ? 
mem_intervals[mem_idx + 1].first == curr_mem_begin : false) { - mem_idx++; + else { + // access the old bucket + bucket = bucket_of_path_strand[key]; } + buckets[bucket].push_back(i); + } + + if (!on_path) { + // record that this hit was not on any paths + non_path_hits.push_back(i); } } - // now use insertion sort to switch the lexicographic ordering - for (int64_t i = 1; i < mem_intervals.size(); i++) { - int64_t j = i; - while (mem_intervals[j].second < mem_intervals[j - 1].second || - (mem_intervals[j].second == mem_intervals[j - 1].second && mem_intervals[j].first < mem_intervals[j - 1].first)) { - std::swap(mem_intervals[j], mem_intervals[j - 1]); - j--; - if (j == 0) { - break; - } + // check the nearest nodes to each non-path hit to see if we can use them to bucket the item + for (size_t non_path_hit : non_path_hits) { + + pos_t pos = get_position(non_path_hit); + + handle_t handle = graph->get_handle(id(pos), is_rev(pos)); + size_t right_dist = graph->get_length(handle) - offset(pos); + size_t trav_dist = min(offset(pos), right_dist); + if (trav_dist <= max_walk) { + // we want to consider neighbors out this far according to our walk parameter + + graph->follow_edges(handle, offset(pos) < right_dist, [&](const handle_t& neighbor) { + // check whether this neighbor is on any paths + for (const step_handle_t& step : graph->steps_of_handle(neighbor)) { + + // key indicating a path and a strand + pair key(graph->get_path_handle_of_step(step), + graph->get_is_reverse(graph->get_handle_of_step(step)) != is_rev(pos)); + + size_t bucket; + if (!bucket_of_path_strand.count(key)) { + // add a new bucket + bucket_of_path_strand[key] = buckets.size(); + bucket = buckets.size(); + buckets.emplace_back(); + } + else { + // access the old bucket + bucket = bucket_of_path_strand[key]; + } + buckets[bucket].push_back(non_path_hit); + // we can stop after this bucketing + return false; + } + return true; + }); } } -#ifdef debug_od_clusterer - cerr << "reversed lexicographic ordering of intervals" << endl; - for (auto interval : mem_intervals) { - cerr << "\t" << interval.first << " " << interval.second << endl; - } -#endif + return buckets; +} + +vector> PathOrientedDistanceMeasurer::exclude_merges(vector>& current_groups, + const function& get_position){ - // find number of SMEM ends strictly to the right of each position - mem_idx = last_mem_idx; - smem_count = 0; + // the pairs that we are going to exclude + vector> excludes; - // iterate through any sub-MEMs contained in the SMEM that share its end position - curr_mem_begin = mem_intervals[mem_idx].first; - curr_mem_end = mem_intervals[mem_idx].second; - while (mem_idx > 0 ? 
mem_intervals[mem_idx - 1].second == curr_mem_end : false) { - mem_idx--; + if (!path_component_index) { +#ifdef debug_mem_clusterer + cerr << "no path component index, skipping process of excluding merges" << endl; +#endif + return excludes; } - for (int64_t i = mem_tail_coverage.size() - 1; i >= 0; i--) { +#ifdef debug_mem_clusterer + cerr << "using path component index to exclude strand merges" << endl; +#endif + + // use the component path set index to exclude some distance measurements between groups we can tell are on separate + // strands a priori + // TODO: I wonder if there's a way to do this without the quadratic loop (although it's quadratic in number of connected + // components, so probably not all that bad) + +#ifdef debug_mem_clusterer + cerr << "groups: " << endl; + for (auto& group : current_groups) { + for (size_t idx : group) { + cerr << idx << " "; + } + cerr << endl; + } +#endif + + // returns the path and a bool indicating whether the search was successful + function(const vector&)> find_path_of_group = [&](const vector& group) { + // try to find a member of the group that is on a path + for (size_t i : group) { + handle_t handle = graph->get_handle(id(get_position(i))); + for (const step_handle_t& step : graph->steps_of_handle(handle)) { + return make_pair(graph->get_path_handle_of_step(step), true); + } + + } + // try to find a member whose neighbor is on a path + for (size_t i : group) { + pos_t pos = get_position(i); + handle_t handle = graph->get_handle(id(pos)); + size_t right_dist = graph->get_length(handle) - offset(pos); + size_t trav_dist = min(offset(pos), right_dist); + if (trav_dist <= max_walk) { + path_handle_t result; + bool not_found = graph->follow_edges(handle, offset(pos) < right_dist, [&](const handle_t& neighbor) { + for (const step_handle_t& step : graph->steps_of_handle(neighbor)) { + result = graph->get_path_handle_of_step(step); + return false; + } + return true; + }); + if (!not_found) { + return make_pair(result, true); + } + } + } - mem_tail_coverage[i].second = smem_count; + // we ran through every hit and did not find a path + return make_pair(handlegraph::as_path_handle(0), false); + }; + + for (size_t i = 1; i < current_groups.size(); i++) { + pair i_path = find_path_of_group(current_groups[i]); - if (mem_idx >= 0 ? i == mem_intervals[mem_idx].second : false) { - smem_count++; - // iterate to the next MEM that contains some new sequence - curr_mem_begin = mem_intervals[mem_idx].first; - mem_idx--; - while (mem_idx >= 0 ? mem_intervals[mem_idx].first >= curr_mem_begin : false) { - mem_idx--; + if (!i_path.second) { + continue; + } + + for (size_t j = 0; j < i; j++) { + pair j_path = find_path_of_group(current_groups[j]); + if (!j_path.second) { + continue; } - // iterate through any sub-MEMs contained in the SMEM that share its end position - curr_mem_end = mem_intervals[mem_idx].second; - while (mem_idx > 0 ? 
mem_intervals[mem_idx - 1].second == curr_mem_end : false) { - mem_idx--; + + // we can exclude any hits that are on separate connected components + if (!path_component_index->paths_on_same_component(i_path.first, j_path.first)) { + excludes.emplace_back(i, j); } } } -#ifdef debug_od_clusterer - cerr << "computed left MEM coverage" << endl; - for (auto pos : mem_tail_coverage) { - cerr << pos.first << " "; + return excludes; +} + +SnarlOrientedDistanceMeasurer::SnarlOrientedDistanceMeasurer(SnarlDistanceIndex* distance_index) : distance_index(distance_index) { + + // nothing to do +} + +int64_t SnarlOrientedDistanceMeasurer::oriented_distance(const pos_t& pos_1, const pos_t& pos_2) { + +#ifdef debug_mem_clusterer + cerr << "measuring distance between " << pos_1 << " and " << pos_2 << endl; +#endif + + size_t forward_dist = minimum_distance(*distance_index, pos_1, pos_2); + size_t backward_dist = minimum_distance(*distance_index, pos_2, pos_1); + + // -1 is the sentinel returned by the distance index if the distance is not measurable + if (forward_dist == std::numeric_limits::max() && backward_dist == std::numeric_limits::max()) { + // convert to the sentinel used by this interface + return numeric_limits::max(); } - cerr << endl; - cerr << "computed right MEM coverage" << endl; - for (auto pos : mem_tail_coverage) { - cerr << pos.second << " "; + else if (forward_dist == std::numeric_limits::max()) { + return -(int64_t)backward_dist; + } + else if (backward_dist == std::numeric_limits::max()) { + return forward_dist; + } + else { + return forward_dist < backward_dist ? forward_dist : -(int64_t)backward_dist; } - cerr << endl; -#endif - return mem_tail_coverage; } -void OrientedDistanceClusterer::topological_order(vector& order_out) { +vector> SnarlOrientedDistanceMeasurer::get_buckets(const function& get_position, size_t num_items) { + // we don't do bucketed distance measurements with this method, return it empty + return vector>(); +} + +vector> SnarlOrientedDistanceMeasurer::exclude_merges(vector>& current_groups, + const function& get_position) { + // we don't do merge exclusion with this method, return it empty + return vector>(); +} + + - // initialize return value - order_out.clear(); - order_out.resize(nodes.size()); - size_t order_idx = nodes.size() - 1; - // initialize iteration structures - vector enqueued(nodes.size(), false); - vector edge_index(nodes.size(), 0); - vector stack; +MEMClusterer::HitGraph OrientedDistanceClusterer::make_hit_graph(const Alignment& alignment, const vector& mems, + const GSSWAligner* aligner, size_t min_mem_length, + const match_fanouts_t* fanouts) { - // iterate through starting nodes - for (size_t init_node_idx = 0; init_node_idx < nodes.size(); init_node_idx++) { - if (enqueued[init_node_idx]) { - continue; + HitGraph hit_graph(mems, alignment, aligner, min_mem_length, false, fanouts); + + // Get all the distances between nodes, in a forrest of unrooted trees of + // nodes that we know are on a consistent strand. + unordered_map, int64_t> recorded_finite_dists = get_on_strand_distance_tree(hit_graph.nodes.size(), + [&](size_t node_number) { + return hit_graph.nodes[node_number].start_pos; + }, + [&](size_t node_number) { + return 0; + }); + + // Flatten the trees to maps of relative position by node ID. 
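// --------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): what "flattening" the
// distance forest means. Each recorded signed distance d(i, j) is treated as
// transitive, so a depth-first walk over the forest can assign every node a
// position relative to an arbitrary root of its tree, producing one
// coordinate map per strand. The helper names (PairHash, flatten_distances)
// are hypothetical; the logic mirrors the flatten_distance_tree() code being
// moved in this patch.
#include <cstdint>
#include <functional>
#include <unordered_map>
#include <utility>
#include <vector>

// hypothetical hash so pair<size_t, size_t> can key an unordered_map
struct PairHash {
    size_t operator()(const std::pair<size_t, size_t>& p) const {
        return std::hash<size_t>()(p.first) * 31u + std::hash<size_t>()(p.second);
    }
};

inline std::vector<std::unordered_map<size_t, int64_t>>
flatten_distances(size_t num_items,
                  const std::unordered_map<std::pair<size_t, size_t>, int64_t, PairHash>& dists) {
    // adjacency list of the undirected distance forest
    std::vector<std::vector<size_t>> adjacent(num_items);
    for (const auto& record : dists) {
        adjacent[record.first.first].push_back(record.first.second);
        adjacent[record.first.second].push_back(record.first.first);
    }
    std::vector<std::unordered_map<size_t, int64_t>> strand_relative_position;
    std::vector<bool> processed(num_items, false);
    for (size_t i = 0; i < num_items; ++i) {
        if (processed[i]) {
            continue;
        }
        strand_relative_position.emplace_back();
        auto& relative_pos = strand_relative_position.back();
        // arbitrarily make this node the 0 point of its strand
        relative_pos[i] = 0;
        processed[i] = true;
        std::vector<size_t> stack{i};
        while (!stack.empty()) {
            size_t curr = stack.back();
            stack.pop_back();
            for (size_t next : adjacent[curr]) {
                if (processed[next]) {
                    continue;
                }
                // invert the sign if the distance was measured in the other order
                auto it = dists.find(std::make_pair(curr, next));
                int64_t dist = it != dists.end() ? it->second : -dists.at(std::make_pair(next, curr));
                relative_pos[next] = relative_pos[curr] + dist;
                processed[next] = true;
                stack.push_back(next);
            }
        }
    }
    return strand_relative_position;
}
// --------------------------------------------------------------------------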
+ vector> strand_relative_position = flatten_distance_tree(hit_graph.nodes.size(), recorded_finite_dists); + +#ifdef debug_mem_clusterer + for (const auto& strand : strand_relative_position) { + cerr << "strand reconstruction: " << endl; + vector order; + for (const auto& record : strand) { + order.push_back(record.first); } - // navigate through graph with DFS - stack.push_back(init_node_idx); - enqueued[init_node_idx] = true; - while (!stack.empty()) { - size_t node_idx = stack.back(); - size_t& edge_idx = edge_index[node_idx]; - if (edge_idx < nodes[node_idx].edges_from.size()) { - size_t target_idx = nodes[node_idx].edges_from[edge_idx].to_idx; - if (enqueued[target_idx]) { - edge_index[node_idx]++; - } - else { - stack.push_back(target_idx); - enqueued[target_idx] = true; - } - } - else { - // add to topological order in reverse finishing order - stack.pop_back(); - order_out[order_idx] = node_idx; - order_idx--; - } + sort(order.begin(), order.end(), [&](size_t a, size_t b) {return strand.at(a) < strand.at(b);}); + for (const auto i : order) { + int64_t strand_pos = strand.at(i); + cerr << "\t" << i << ":\t" << strand_pos << "\t" << hit_graph.nodes[i].mem->sequence() << endl; } } -} +#endif -void OrientedDistanceClusterer::component_topological_order(const vector& component, - vector& order_out) const { - // initialize return value - order_out.clear(); - order_out.resize(component.size()); + // now we use the strand clusters and the estimated distances to make the DAG for the + // approximate MEM alignment - vector in_degree(component.size()); - vector stack; - unordered_map node_idx_to_component_idx; - for (size_t i = 0; i < component.size(); i++) { - in_degree[i] = nodes[component[i]].edges_to.size(); - if (in_degree[i] == 0) { - stack.push_back(i); - } - node_idx_to_component_idx[component[i]] = i; - } + int64_t gap_open_score = aligner->gap_open; + int64_t gap_extension_score = aligner->gap_extension; - size_t order_idx = 0; - while (!stack.empty()) { - size_t i = stack.back(); - stack.pop_back(); + int64_t forward_gap_length = min(aligner->longest_detectable_gap(alignment), max_gap) + max_expected_dist_approx_error; + for (const unordered_map& relative_pos : strand_relative_position) { - order_out[order_idx] = i; - order_idx++; + // sort the nodes by relative position + vector> sorted_pos; + for (const pair& pos_record : relative_pos) { + sorted_pos.emplace_back(pos_record.second, pos_record.first); + } + std::sort(sorted_pos.begin(), sorted_pos.end()); - for (const ODEdge& edge : nodes[component[i]].edges_from) { - size_t j = node_idx_to_component_idx[edge.to_idx]; - in_degree[j]--; - if (in_degree[j] == 0) { - stack.push_back(j); + // find edges within each strand cluster by first identifying the interval of MEMs that meets + // the graph distance constrant for each MEM and then checking for read colinearity and the + // reverse distance constraint + int64_t last_idx = sorted_pos.size() - 1; + int64_t low = 0, hi = 0; + for (int64_t i = 0; i < sorted_pos.size(); i++) { + + int64_t strand_pos = sorted_pos[i].first; + size_t pivot_idx = sorted_pos[i].second; + HitNode& pivot = hit_graph.nodes[pivot_idx]; + int64_t pivot_length = pivot.mem->end - pivot.mem->begin; + int64_t suffix_length = alignment.sequence().end() - pivot.mem->begin; + + // the limits of how far away we might detect edges to add to the clustering graph + int64_t target_low_pos = strand_pos - max_expected_dist_approx_error; + int64_t target_hi_pos = strand_pos + suffix_length + forward_gap_length; + + // move 
the lower boundary of the search interval to the lowest value inside the + // the target interval + if (sorted_pos[low].first > target_low_pos) { + while (low > 0 ? sorted_pos[low - 1].first > target_low_pos : false) { + low--; + } + } + else { + while (low < sorted_pos.size() ? sorted_pos[low].first < target_low_pos : false) { + low++; + } + } + + // move the upper boundary of the search interval to the highest value inside the + // the target interval + if (sorted_pos[hi].first > target_hi_pos) { + while (hi > 0 ? sorted_pos[hi].first > target_hi_pos : false) { + hi--; + } + } + else { + while (hi < last_idx ? sorted_pos[hi + 1].first <= target_hi_pos : false) { + hi++; + } + } + +#ifdef debug_mem_clusterer + cerr << "checking for possible edges from " << sorted_pos[i].second << " to MEMs between " << sorted_pos[low].first << "(" << sorted_pos[low].second << ") and " << sorted_pos[hi].first << "(" << sorted_pos[hi].second << "), which is inside the interval (" << target_low_pos << ", " << target_hi_pos << ")" << endl; +#endif + + for (int64_t j = low; j <= hi; j++) { + // don't make self edges + if (i == j) { + continue; + } + + int64_t next_idx = sorted_pos[j].second; + HitNode& next = hit_graph.nodes[next_idx]; + + // the estimated distance between the end of the pivot and the start of the next MEM in the graph + int64_t graph_dist = sorted_pos[j].first - strand_pos - pivot_length; + + if (next.mem->begin >= pivot.mem->begin && next.mem->end <= pivot.mem->end + && abs((sorted_pos[j].first - strand_pos) - (next.mem->begin - pivot.mem->begin)) <= 1) { + // this looks like a redundant sub-MEM + + // we add a dummy edge, but only to connect the nodes' components and join the clusters, + // not to actually use in dynamic programming (given arbitrary low weight that should not + // cause overflow) + hit_graph.add_edge(pivot_idx, next_idx, numeric_limits::lowest() / 2, graph_dist); + + continue; + } + else if (next.mem->begin <= pivot.mem->begin || next.mem->end <= pivot.mem->end) { + // these MEMs cannot be colinear along the read + + // note: we allow one of the start/end positions to be the same here even though they can't + // techinically overlap because it tends to soak up redundant sub-MEMs into the same connected + // component so that they don't get their own cluster + + continue; + } + + // add the edge in + int32_t edge_score = estimate_edge_score(pivot.mem, next.mem, graph_dist, aligner); + hit_graph.add_edge(pivot_idx, next_idx, edge_score, graph_dist); + +#ifdef debug_mem_clusterer + cerr << "adding edge to MEM " << sorted_pos[j].first << "(" << sorted_pos[j].second << ") with weight " << edge_score << endl; +#endif } } } + + return hit_graph; +} + +OrientedDistanceClusterer::OrientedDistanceClusterer(OrientedDistanceMeasurer& distance_measurer, + size_t max_expected_dist_approx_error) + : distance_measurer(distance_measurer), max_expected_dist_approx_error(max_expected_dist_approx_error) { + } -void OrientedDistanceClusterer::identify_sources_and_sinks(vector& sources_out, - vector& sinks_out) { +unordered_map, int64_t> OrientedDistanceClusterer::get_on_strand_distance_tree(size_t num_items, + const function& get_position, + const function& get_offset) { - sources_out.clear(); - sinks_out.clear(); + // for recording the distance of any pair that we check with a finite distance + unordered_map, int64_t> recorded_finite_dists; - vector is_source(nodes.size(), true); + // for recording the number of times elements of a strand cluster have been compared + // and found an infinite 
distance + map, size_t> num_infinite_dists; - for (size_t i = 0; i < nodes.size(); i++) { - if (nodes[i].edges_from.empty()) { - sinks_out.push_back(i); - } - - for (ODEdge& edge : nodes[i].edges_from) { - is_source[edge.to_idx] = false; - } - } + // we use a union find to keep track of which MEMs have been identified as being on the same strand + UnionFind component_union_find(num_items); - for (size_t i = 0; i < nodes.size(); i++) { - if (is_source[i]) { - sources_out.push_back(i); + size_t num_possible_merges_remaining = (num_items * (num_items - 1)) / 2; + + int64_t max_failed_distance_probes = 2; + + // an initial pass that only looks at easily identifiable buckets + extend_dist_tree_by_buckets(get_position, get_offset, num_items, recorded_finite_dists, + component_union_find, num_possible_merges_remaining); + + // another initial pass that tries to identify groups that cannot be merged + exclude_dist_tree_merges(get_position, num_infinite_dists, component_union_find, + num_possible_merges_remaining, max_failed_distance_probes); + + // TODO: permutations that try to assign singletons + + // a second pass that measures distances between randomly selected pairs + size_t nlogn = ceil(num_items * log(num_items)); + extend_dist_tree_by_permutations(get_position, get_offset, num_items, max_failed_distance_probes, nlogn, + recorded_finite_dists, num_infinite_dists, component_union_find, num_possible_merges_remaining); + + return recorded_finite_dists; +} + +void OrientedDistanceClusterer::exclude_dist_tree_merges(const function& get_position, + map, size_t>& num_infinite_dists, + UnionFind& component_union_find, + size_t& num_possible_merges_remaining, + int64_t max_failed_distance_probes) { + + // the current set of groups after bucketed merging + vector> current_groups = component_union_find.all_groups(); + + // pairs of groups that we can easily identify as being on separate connected components + vector> excluded_pairs = distance_measurer.exclude_merges(current_groups, get_position); + + // mark these pairs as unmergeable and update the accounting accordingly + for (const auto& excluded_pair : excluded_pairs) { + size_t group_1 = component_union_find.find_group(current_groups[excluded_pair.first].front()); + size_t group_2 = component_union_find.find_group(current_groups[excluded_pair.second].front()); + + num_infinite_dists[make_pair(group_1, group_2)] = max_failed_distance_probes + 1; + num_infinite_dists[make_pair(group_2, group_1)] = max_failed_distance_probes + 1; + + num_possible_merges_remaining -= component_union_find.group_size(group_1) * component_union_find.group_size(group_2); + } +} + +void OrientedDistanceClusterer::extend_dist_tree_by_buckets(const function& get_position, + const function& get_offset, + size_t num_items, + unordered_map, int64_t>& recorded_finite_dists, + UnionFind& component_union_find, + size_t& num_possible_merges_remaining) { + + vector> buckets = distance_measurer.get_buckets(get_position, num_items); + + // Ensure a deterministic, system independent ordering + for (vector& bucket : buckets) { + sort(bucket.begin(), bucket.end()); + } + sort(buckets.begin(), buckets.end()); + + // use the path strands to bucket distance measurements + for (vector& bucket : buckets) { + + for (size_t i = 1; i < bucket.size(); i++) { + size_t prev = bucket[i - 1]; + size_t here = bucket[i]; + + // have these items already been identified as on the same strand? 
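// --------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): the pair accounting shared by
// the bucket, exclusion, and permutation passes. With n items there are
// n*(n-1)/2 unordered pairs that could still trigger a merge; whenever two
// groups of sizes a and b are merged (or ruled out as unmergeable), exactly
// a*b of those candidate pairs disappear, so the running counter is reduced
// by a*b. TinyUnionFind below is a hypothetical stand-in, not vg's UnionFind.
#include <cstddef>
#include <numeric>
#include <utility>
#include <vector>

struct TinyUnionFind {
    std::vector<size_t> parent;
    std::vector<size_t> size;
    TinyUnionFind(size_t n) : parent(n), size(n, 1) {
        std::iota(parent.begin(), parent.end(), size_t(0));
    }
    size_t find(size_t x) {
        return parent[x] == x ? x : parent[x] = find(parent[x]);
    }
    size_t group_size(size_t x) {
        return size[find(x)];
    }
    void unite(size_t a, size_t b) {
        a = find(a);
        b = find(b);
        if (a == b) {
            return;
        }
        if (size[a] < size[b]) {
            std::swap(a, b);
        }
        parent[b] = a;
        size[a] += size[b];
    }
};

// Example: with 4 items, merging 0-1 and then 1-2 takes the number of
// candidate cross-group pairs from 6 to 5 to 3 ({0,3}, {1,3}, {2,3} remain).
inline size_t remaining_after_example_merges() {
    size_t n = 4;
    size_t num_possible_merges_remaining = n * (n - 1) / 2;                // 6
    TinyUnionFind uf(n);
    num_possible_merges_remaining -= uf.group_size(0) * uf.group_size(1);  // -1 -> 5
    uf.unite(0, 1);
    num_possible_merges_remaining -= uf.group_size(1) * uf.group_size(2);  // -2 -> 3
    uf.unite(1, 2);
    return num_possible_merges_remaining;
}
// --------------------------------------------------------------------------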
+ if (component_union_find.find_group(prev) == component_union_find.find_group(here)) { + continue; + } + + // estimate the distance + pos_t pos_prev = get_position(prev); + pos_t pos_here = get_position(here); + +#ifdef debug_mem_clusterer + cerr << "measuring distance between " << prev << " at " << pos_prev << " and " << here << " at " << pos_here << endl; +#endif + + int64_t dist = distance_measurer.oriented_distance(pos_prev, pos_here); + + + // did we get a successful estimation? + if (dist == numeric_limits::max()) { +#ifdef debug_mem_clusterer + cerr << "they don't appear to be have a measurable distance, skipping" << endl; +#endif + continue; + } + + // add the fixed offset from the hit position + dist += get_offset(here) - get_offset(prev); + +#ifdef debug_mem_clusterer + cerr << "recording distance at " << dist << endl; +#endif + + // merge them into a strand cluster + recorded_finite_dists[make_pair(prev, here)] = dist; + num_possible_merges_remaining -= component_union_find.group_size(prev) * component_union_find.group_size(here); + component_union_find.union_groups(prev, here); + } + } +} + +void OrientedDistanceClusterer::extend_dist_tree_by_permutations(const function& get_position, + const function& get_offset, + size_t num_items, + int64_t max_failed_distance_probes, + size_t decrement_frequency, + unordered_map, int64_t>& recorded_finite_dists, + map, size_t>& num_infinite_dists, + UnionFind& component_union_find, + size_t& num_possible_merges_remaining) { + + // We want to run through all possible pairsets of node numbers in a permuted order. + ShuffledPairs shuffled_pairs(num_items); + auto current_pair = shuffled_pairs.begin(); + size_t pairs_checked = 0; + + // a simulated annealing parameter loosely inspired by the cutoff for an Erdos-Renyi random graph + // to be connected with probability approaching 1 + size_t current_max_num_probes = max_failed_distance_probes; + + while (num_possible_merges_remaining > 0 && current_pair != shuffled_pairs.end() && current_max_num_probes > 0) { + // slowly lower the number of distances we need to check before we believe that two clusters are on + // separate strands +#ifdef debug_mem_clusterer + cerr << "checked " << pairs_checked << " pairs with max probes " << current_max_num_probes << ", decrement frequency " << decrement_frequency << ", merges remaining " << num_possible_merges_remaining << endl; +#endif + + if (pairs_checked % decrement_frequency == 0 && pairs_checked != 0) { + current_max_num_probes--; +#ifdef debug_mem_clusterer + cerr << "reducing the max number of probes to " << current_max_num_probes << endl; +#endif + for (const pair, size_t>& inf_dist_record : num_infinite_dists) { + // break symmetry so we don't repeat the operation twice + if (inf_dist_record.first.first < inf_dist_record.first.second && inf_dist_record.second == current_max_num_probes) { + // this merge just fell below the new maximum number of distance probes + size_t strand_size_1 = component_union_find.group_size(inf_dist_record.first.first); + size_t strand_size_2 = component_union_find.group_size(inf_dist_record.first.second); + num_possible_merges_remaining -= strand_size_1 * strand_size_2; +#ifdef debug_mem_clusterer + cerr << "after reduction, the total number of probes between strand " << inf_dist_record.first.first << " and " << inf_dist_record.first.second << " is above max, reducing possible merges by " << strand_size_1 * strand_size_2 << " to " << num_possible_merges_remaining << endl; +#endif + } + } + } + + + pair node_pair = 
*current_pair; + ++current_pair; + + pairs_checked++; + + size_t strand_1 = component_union_find.find_group(node_pair.first); + size_t strand_2 = component_union_find.find_group(node_pair.second); + +#ifdef debug_mem_clusterer + cerr << "checking MEMs " << node_pair.first << " and " << node_pair.second << " in cluster " << strand_1 << " and " << strand_2 << endl; +#endif + + if (strand_1 == strand_2) { + // these are already identified as on the same strand, don't need to do it again +#ifdef debug_mem_clusterer + cerr << "already on same strand" << endl; +#endif + continue; + } + + auto num_failed_probes = num_infinite_dists.find(make_pair(strand_1, strand_2)); + if (num_failed_probes == num_infinite_dists.end() ? false : num_failed_probes->second >= current_max_num_probes) { + // we've already checked multiple distances between these strand clusters and + // none have returned a finite distance, so we conclude that they are in fact + // on separate clusters and decline to check any more distances +#ifdef debug_mem_clusterer + cerr << "already have checked distance above maximum number of probes" << endl; +#endif + continue; + } + + const pos_t& pos_1 = get_position(node_pair.first); + const pos_t& pos_2 = get_position(node_pair.second); + + int64_t oriented_dist = distance_measurer.oriented_distance(pos_1, pos_2); + +#ifdef debug_mem_clusterer + cerr << "distance between " << pos_1 << " and " << pos_2 << " estimated at " << oriented_dist << endl; +#endif + + if (oriented_dist == std::numeric_limits::max()) { + // distance is estimated at infinity, so these are either on different strands + // or the path heuristic failed to find a shared path + + if (num_failed_probes == num_infinite_dists.end()) { + num_failed_probes = num_infinite_dists.insert(pair, size_t>(make_pair(strand_1, strand_2), 1)).first; + num_infinite_dists[make_pair(strand_2, strand_1)] = 1; + } + else { + num_failed_probes->second++; + num_infinite_dists[make_pair(strand_2, strand_1)]++; + } + + + // this infinite distance pushed the count over the maximum number of probes, so remove + // these merges from the pool of potential merges remaining + if (num_failed_probes->second >= current_max_num_probes) { + size_t strand_size_1 = component_union_find.group_size(strand_1); + size_t strand_size_2 = component_union_find.group_size(strand_2); + + num_possible_merges_remaining -= strand_size_1 * strand_size_2; + +#ifdef debug_mem_clusterer + cerr << "number of probes " << num_failed_probes->second << " crossed max threshold of " << current_max_num_probes << ", reducing possible merges by " << strand_size_1 * strand_size_2 << " to " << num_possible_merges_remaining << endl; +#endif + } + } + else { + // the distance is finite, so merge the strand clusters + + // add the fixed offset of the hit from the start position + oriented_dist += get_offset(node_pair.second) - get_offset(node_pair.first); + + recorded_finite_dists[node_pair] = oriented_dist; + + size_t strand_size_1 = component_union_find.group_size(strand_1); + size_t strand_size_2 = component_union_find.group_size(strand_2); + + component_union_find.union_groups(node_pair.first, node_pair.second); + + // remove these from the pool of remaining merges + num_possible_merges_remaining -= strand_size_1 * strand_size_2; + + size_t strand_retaining = component_union_find.find_group(node_pair.first); + size_t strand_removing = strand_retaining == strand_1 ? 
strand_2 : strand_1; + +#ifdef debug_mem_clusterer + cerr << "probe triggered group merge, reducing possible merges by " << strand_size_1 * strand_size_2 << " to " << num_possible_merges_remaining << " and retaining strand " << strand_retaining << endl; +#endif + + // get the ranges in the counter for failed distance probe records for both of the strands + auto removing_iter = num_infinite_dists.lower_bound(make_pair(strand_removing, 0)); + auto removing_end = num_infinite_dists.upper_bound(make_pair(strand_removing, numeric_limits::max())); + auto retaining_iter = num_infinite_dists.lower_bound(make_pair(strand_retaining, 0)); + auto retaining_end = num_infinite_dists.upper_bound(make_pair(strand_retaining, numeric_limits::max())); + + vector> unseen_comparisons; + while (removing_iter != removing_end && retaining_iter != retaining_end) { + if (removing_iter->first.second == retaining_iter->first.second) { + // both the removing and the retaining strand cluster have failed probes against this cluster so + // we need to combine the records + + // check if we've already marked some of these merges as off limits + bool retaining_already_blocked = retaining_iter->second >= current_max_num_probes; + bool removing_already_blocked = removing_iter->second >= current_max_num_probes; + + // add the counts together + retaining_iter->second += removing_iter->second; + num_infinite_dists[make_pair(retaining_iter->first.second, strand_retaining)] += removing_iter->second; + + // update the number of possible merges remaining + if (retaining_already_blocked && !removing_already_blocked) { + num_possible_merges_remaining -= (strand_retaining == strand_1 ? strand_size_2 : strand_size_1) * component_union_find.group_size(removing_iter->first.second); + +#ifdef debug_mem_clusterer + cerr << "after merge, the total number of probes against strand " << removing_iter->first.second << " increased to " << retaining_iter->second << ", above current max of " << current_max_num_probes << ", but the retaining strand is already blocked, reducing possible merges by " << (strand_retaining == strand_1 ? strand_size_2 : strand_size_1) * component_union_find.group_size(removing_iter->first.second) << " to " << num_possible_merges_remaining << endl; +#endif + } + else if (removing_already_blocked && !retaining_already_blocked) { + num_possible_merges_remaining -= (strand_retaining == strand_1 ? strand_size_1 : strand_size_2) * component_union_find.group_size(removing_iter->first.second); + +#ifdef debug_mem_clusterer + cerr << "after merge, the total number of probes against strand " << removing_iter->first.second << " increased to " << retaining_iter->second << ", above current max of " << current_max_num_probes << ", but the removing strand is already blocked, reducing possible merges by " << (strand_retaining == strand_1 ? 
strand_size_1 : strand_size_2) * component_union_find.group_size(removing_iter->first.second) << " to " << num_possible_merges_remaining << endl; +#endif + } + else if (!retaining_already_blocked && !removing_already_blocked && retaining_iter->second >= current_max_num_probes) { + num_possible_merges_remaining -= (strand_size_1 + strand_size_2) * component_union_find.group_size(removing_iter->first.second); + +#ifdef debug_mem_clusterer + cerr << "after merge, the total number of probes against strand " << removing_iter->first.second << " increased to " << retaining_iter->second << ", above current max of " << current_max_num_probes << ", reducing possible merges by " << (strand_size_1 + strand_size_2) * component_union_find.group_size(removing_iter->first.second) << " to " << num_possible_merges_remaining << endl; +#endif + + } + removing_iter++; + retaining_iter++; + } + else if (removing_iter->first.second < retaining_iter->first.second) { + // the strand being removed has probes against this strand cluster, but the strand being + // retained does not, mark this and save it for later so that we don't invalidate the range + unseen_comparisons.emplace_back(removing_iter->first.second, removing_iter->second); + removing_iter++; + } + else { + // the strand being retained has probes against this strand cluster, but the strand being + // removed does not, check if we need to add the removing strand to the remaining merges + // counter + if (retaining_iter->second >= current_max_num_probes) { + num_possible_merges_remaining -= (strand_retaining == strand_1 ? strand_size_2 : strand_size_1) * component_union_find.group_size(retaining_iter->first.second); + +#ifdef debug_mem_clusterer + cerr << "after merge, the total number of probes against strand " << retaining_iter->first.second << " increased to " << retaining_iter->second << ", above current max of " << current_max_num_probes << ", but the retaining strand is already blocked, reducing possible merges by " << (strand_retaining == strand_1 ? strand_size_2 : strand_size_1) * component_union_find.group_size(retaining_iter->first.second) << " to " << num_possible_merges_remaining << endl; +#endif + } + retaining_iter++; + } + } + + // finish off either range + while (removing_iter != removing_end) { + unseen_comparisons.emplace_back(removing_iter->first.second, removing_iter->second); + removing_iter++; + } + while (retaining_iter != retaining_end) { + if (retaining_iter->second >= current_max_num_probes) { + num_possible_merges_remaining -= (strand_retaining == strand_1 ? strand_size_2 : strand_size_1) * component_union_find.group_size(retaining_iter->first.second); + +#ifdef debug_mem_clusterer + cerr << "after merge, the total number of probes against strand " << retaining_iter->first.second << " increased to " << retaining_iter->second << ", above current max of " << current_max_num_probes << ", but the retaining strand is already blocked, reducing possible merges by " << (strand_retaining == strand_1 ? 
strand_size_2 : strand_size_1) * component_union_find.group_size(retaining_iter->first.second) << " to " << num_possible_merges_remaining << endl; +#endif + } + retaining_iter++; + } + + + // add the probes between the removing strands and clusters that had never been compared to the retaining strand + for (const pair& unseen_comparison : unseen_comparisons) { + num_infinite_dists[make_pair(unseen_comparison.first, strand_retaining)] = unseen_comparison.second; + num_infinite_dists[make_pair(strand_retaining, unseen_comparison.first)] = unseen_comparison.second; + + if (unseen_comparison.second >= current_max_num_probes) { + num_possible_merges_remaining -= (strand_retaining == strand_1 ? strand_size_1 : strand_size_2) * component_union_find.group_size(unseen_comparison.first); + +#ifdef debug_mem_clusterer + cerr << "after merge, the total number of probes against strand " << unseen_comparison.first << " increased to " << unseen_comparison.second << ", above current max of " << current_max_num_probes << ", but the removing strand is already blocked, reducing possible merges by " << (strand_retaining == strand_1 ? strand_size_2 : strand_size_1) * component_union_find.group_size(unseen_comparison.first) << " to " << num_possible_merges_remaining << endl; +#endif + } + } + + // find the range containing the records with the removing strand again (it may have changed since we + // altered the map) + removing_iter = num_infinite_dists.lower_bound(make_pair(strand_removing, 0)); + removing_end = num_infinite_dists.upper_bound(make_pair(strand_removing, numeric_limits::max())); + if (removing_iter != removing_end) { + // move the end so that it is an inclusive range + removing_end--; + + // erase the range + if (removing_iter == removing_end) { + if (removing_iter->first.first != removing_iter->first.second) { + num_infinite_dists.erase(make_pair(removing_iter->first.second, removing_iter->first.first)); + } + num_infinite_dists.erase(removing_iter); + } + else { + // erase the previous position on each iteration so that we don't invalidate the iterator before + // we use it to move to the next position + auto removing_iter_prev = removing_iter; + removing_iter++; + while (removing_iter != removing_end) { + if (removing_iter_prev->first.first != removing_iter_prev->first.second) { + num_infinite_dists.erase(make_pair(removing_iter_prev->first.second, removing_iter_prev->first.first)); + } + num_infinite_dists.erase(removing_iter_prev); + removing_iter_prev = removing_iter; + removing_iter++; + } + if (removing_iter_prev->first.first != removing_iter_prev->first.second) { + num_infinite_dists.erase(make_pair(removing_iter_prev->first.second, removing_iter_prev->first.first)); + } + num_infinite_dists.erase(removing_iter_prev); + if (removing_iter->first.first != removing_iter->first.second) { + num_infinite_dists.erase(make_pair(removing_iter->first.second, removing_iter->first.first)); + } + num_infinite_dists.erase(removing_iter); + } + } } } } -void OrientedDistanceClusterer::connected_components(vector>& components_out) { +vector> OrientedDistanceClusterer::flatten_distance_tree(size_t num_items, + const unordered_map, int64_t>& recorded_finite_dists) { - components_out.clear(); - vector enqueued(nodes.size()); +#ifdef debug_mem_clusterer + cerr << "constructing strand distance tree from " << num_items << " distances records:" << endl; + for (const auto& record : recorded_finite_dists) { + cerr << "\t" << record.first.first << "->" << record.first.second << ": " << record.second << endl; + } 
+#endif - // check each node in turn to find new components - for (size_t dfs_start_idx = 0; dfs_start_idx < nodes.size(); dfs_start_idx++) { - if (enqueued[dfs_start_idx]) { - // we've already found this node from some component + // build the graph of relative distances in adjacency list representation + // by construction each strand cluster will be an undirected, unrooted tree + vector> strand_distance_tree(num_items); + for (const auto& dist_record : recorded_finite_dists) { + strand_distance_tree[dist_record.first.first].push_back(dist_record.first.second); + strand_distance_tree[dist_record.first.second].push_back(dist_record.first.first); + } + + // now approximate the relative positions along the strand by traversing each tree and + // treating the distances we estimated as transitive + vector> strand_relative_position; + vector processed(num_items, false); + for (size_t i = 0; i < num_items; i++) { + if (processed[i]) { continue; } - // this node belongs to a component we haven't found yet, use DFS to find the rest - vector stack {dfs_start_idx}; - enqueued[dfs_start_idx] = true; - components_out.emplace_back(1, dfs_start_idx); +#ifdef debug_mem_clusterer + cerr << "beginning a distance tree traversal at item " << i << endl; +#endif + strand_relative_position.emplace_back(); + unordered_map& relative_pos = strand_relative_position.back(); - while (!stack.empty()) { - - ODNode& node = nodes[stack.back()]; - stack.pop_back(); + // arbitrarily make this node the 0 point + relative_pos[i] = 0; + processed[i] = true; + + // traverse the strand's tree with DFS + list queue{i}; + while (!queue.empty()) { + size_t curr = queue.back(); + queue.pop_back(); - // search in both forward and backward directions + int64_t curr_pos = relative_pos[curr]; - for (ODEdge& edge : node.edges_from) { - - if (!enqueued[edge.to_idx]) { - stack.push_back(edge.to_idx); - enqueued[edge.to_idx] = true; - components_out.back().push_back(edge.to_idx); + for (size_t next : strand_distance_tree[curr]) { + if (processed[next]) { + continue; } - } - - for (ODEdge& edge : node.edges_to) { - if (!enqueued[edge.to_idx]) { - stack.push_back(edge.to_idx); - enqueued[edge.to_idx] = true; - components_out.back().push_back(edge.to_idx); - } + // invert the sign of the distance if we originally measured it in the other order + int64_t dist = recorded_finite_dists.count(make_pair(curr, next)) ? 
+ recorded_finite_dists.at(make_pair(curr, next)) : + -recorded_finite_dists.at(make_pair(next, curr)); + + // find the position relative to the previous node we just traversed + relative_pos[next] = curr_pos + dist; + processed[next] = true; + + queue.push_back(next); } } } + + return strand_relative_position; } -void OrientedDistanceClusterer::perform_dp() { - - for (ODNode& node : nodes) { - // as in local alignment, minimum score is the score of node itself - node.dp_score = node.score; - } +vector> OrientedDistanceClusterer::compute_tail_mem_coverage(const Alignment& alignment, + const vector& mems) { -#ifdef debug_od_clusterer - cerr << "computing topological order for clustering DP" << endl; -#endif + // include an index for the past-the-last position on the read + vector> mem_tail_coverage(alignment.sequence().size() + 1); - vector order; - topological_order(order); + if (mems.empty()) { + return mem_tail_coverage; + } - for (size_t i : order) { - ODNode& node = nodes[i]; -#ifdef debug_od_clusterer - cerr << "at node " << i << " with DP score " << node.dp_score << " and node score " << node.score << endl; -#endif - // for each edge out of this node - for (ODEdge& edge : node.edges_from) { - - // check if the path through the node out of this edge increase score of target node - ODNode& target_node = nodes[edge.to_idx]; - int32_t extend_score = node.dp_score + edge.weight + target_node.score; - if (extend_score > target_node.dp_score) { -#ifdef debug_od_clusterer - cerr << "extending DP to node " << edge.to_idx << " with score " << extend_score << endl; -#endif - target_node.dp_score = extend_score; - } + // convert the MEMs to the read interval they cover + vector> mem_intervals; + mem_intervals.reserve(mems.size()); + for (int64_t i = 0; i < mems.size(); i++) { + if (!mems[i].nodes.empty()) { + mem_intervals.emplace_back(mems[i].begin - alignment.sequence().begin(), + mems[i].end - alignment.sequence().begin()); } } -} - -vector OrientedDistanceClusterer::clusters(const Alignment& alignment, - int32_t max_qual_score, - int32_t log_likelihood_approx_factor, - size_t min_median_mem_coverage_for_split, - double suboptimal_edge_pruning_factor) { - vector>> to_return; - if (nodes.size() == 0) { - // this should only happen if we have filtered out all MEMs, so there are none to cluster - return to_return; + // ensure that the intervals are sorted lexicographically + if (!std::is_sorted(mem_intervals.begin(), mem_intervals.end())) { + std::sort(mem_intervals.begin(), mem_intervals.end()); } -#ifdef debug_od_clusterer - cerr << "performing approximate DP across MEMs" << endl; -#endif - perform_dp(); + // find number of SMEM beginnings strictly to the left of each position -#ifdef debug_od_clusterer - cerr << "finding top tracebacks within connected components" << endl; -#endif - // find the weakly connected components, which should correspond to mappings - vector> components; - connected_components(components); + int64_t last_mem_idx = mem_intervals.size() - 1; + int64_t mem_idx = 0; + size_t smem_count = 0; -#ifdef debug_od_clusterer - cerr << "traceback returns the following components: " << endl; - for (size_t i = 0; i < components.size(); i++) { - vector& component = components[i]; - cerr << "\tcomponent " << i << ":" << endl; - for (size_t idx : component) { - cerr << "\t\t" << idx << " " << nodes[idx].start_pos << " "; - for (auto iter = nodes[idx].mem->begin; iter != nodes[idx].mem->end; iter++) { - cerr << *iter; + // iterate through any sub-MEMs contained in the SMEM that share 
its start position + int64_t curr_mem_begin = mem_intervals[mem_idx].first; + int64_t curr_mem_end = mem_intervals[mem_idx].second; + while (mem_idx < last_mem_idx ? mem_intervals[mem_idx + 1].first == curr_mem_begin : false) { + mem_idx++; + } + for (int64_t i = 0; i < mem_tail_coverage.size(); i++) { + + mem_tail_coverage[i].first = smem_count; + + // are we encountering the start of another SMEM + if (mem_idx < mem_intervals.size() ? i == mem_intervals[mem_idx].first : false) { + smem_count++; + // iterate to the next MEM that contains some new sequence + curr_mem_end = mem_intervals[mem_idx].second; + mem_idx++; + while (mem_idx < mems.size() ? mem_intervals[mem_idx].second <= curr_mem_end : false) { + mem_idx++; + } + // iterate through any sub-MEMs contained in the SMEM that share its start position + curr_mem_begin = mem_intervals[mem_idx].first; + while (mem_idx < last_mem_idx ? mem_intervals[mem_idx + 1].first == curr_mem_begin : false) { + mem_idx++; } - cerr << endl; } } -#endif - if (min_median_mem_coverage_for_split) { -#ifdef debug_od_clusterer - cerr << "looking for high coverage clusters to split" << endl; -#endif - size_t num_original_components = components.size(); - for (size_t i = 0; i < num_original_components; i++) { -#ifdef debug_od_clusterer - cerr << "component " << i << " has median coverage " << median_mem_coverage(components[i], alignment) << endl; -#endif - size_t curr_num_components = components.size(); - if (median_mem_coverage(components[i], alignment) >= min_median_mem_coverage_for_split) { -//#pragma omp atomic -// SPLIT_ATTEMPT_COUNTER++; -#ifdef debug_od_clusterer - cerr << "attempting to prune and split cluster" << endl; -#endif - - prune_low_scoring_edges(components, i, suboptimal_edge_pruning_factor); - - if (components.size() > curr_num_components) { -//#pragma omp atomic -// SUCCESSFUL_SPLIT_ATTEMPT_COUNTER++; - } + // now use insertion sort to switch the lexicographic ordering + for (int64_t i = 1; i < mem_intervals.size(); i++) { + int64_t j = i; + while (mem_intervals[j].second < mem_intervals[j - 1].second || + (mem_intervals[j].second == mem_intervals[j - 1].second && mem_intervals[j].first < mem_intervals[j - 1].first)) { + std::swap(mem_intervals[j], mem_intervals[j - 1]); + j--; + if (j == 0) { + break; } } -#ifdef debug_od_clusterer - vector> current_components; - connected_components(current_components); - cerr << "after splitting, from " << num_original_components << " to " << current_components.size() << " connected components" << endl; -#endif -//#pragma omp atomic -// PRE_SPLIT_CLUSTER_COUNTER += num_original_components; -//#pragma omp atomic -// POST_SPLIT_CLUSTER_COUNTER += components.size(); } - - // find the node with the highest DP score in each connected component - // each record is a pair of (score lower bound, node index) - vector>> component_traceback_ends(components.size(), - make_pair(numeric_limits::min(), vector())); - for (size_t i = 0; i < components.size(); i++) { - vector& component = components[i]; - pair>& traceback_end = component_traceback_ends[i]; - for (size_t j = 0; j < component.size(); j++) { - int32_t dp_score = nodes[component[j]].dp_score; - if (dp_score > traceback_end.first) { - // this is better than all previous scores, so throw anything we have away - traceback_end.first = dp_score; - traceback_end.second.clear(); - traceback_end.second.push_back(component[j]); - } - else if (dp_score == traceback_end.first) { - // this is equivalent to the current best, so hold onto both - 
traceback_end.second.push_back(component[j]); - } - } +#ifdef debug_mem_clusterer + cerr << "reversed lexicographic ordering of intervals" << endl; + for (auto interval : mem_intervals) { + cerr << "\t" << interval.first << " " << interval.second << endl; } - //#pragma omp atomic - // CLUSTER_TOTAL += component_traceback_ends.size(); +#endif - std::make_heap(component_traceback_ends.begin(), component_traceback_ends.end()); + // find number of SMEM ends strictly to the right of each position - // estimate the minimum score a cluster must obtain to even affect the mapping quality - // TODO: this approximation could break down sometimes, need to look into it - int32_t top_score = component_traceback_ends.front().first; - const BaseAligner* base_aligner = aligner ? (BaseAligner*) aligner : (BaseAligner*) qual_adj_aligner; - int32_t suboptimal_score_cutoff = top_score - log_likelihood_approx_factor * base_aligner->mapping_quality_score_diff(max_qual_score); + mem_idx = last_mem_idx; + smem_count = 0; - while (!component_traceback_ends.empty()) { - // get the next highest scoring traceback end(s) - auto traceback_end = component_traceback_ends.front(); - std::pop_heap(component_traceback_ends.begin(), component_traceback_ends.end()); - component_traceback_ends.pop_back(); - - // get the index of the node - vector& trace_stack = traceback_end.second; - -#ifdef debug_od_clusterer - cerr << "checking traceback of component starting at " << traceback_end.second.front() << endl; -#endif - // if this cluster does not look like it even affect the mapping quality of the top scoring - // cluster, don't bother forming it - if (traceback_end.first < suboptimal_score_cutoff) { -#ifdef debug_od_clusterer - cerr << "skipping rest of components on account of low score of " << traceback_end.first << " compared to max score " << top_score << " and cutoff " << suboptimal_score_cutoff << endl; -#endif - -//#pragma omp atomic -// PRUNE_COUNTER += component_traceback_ends.size() + 1; - break; - } - - // traceback all optimal paths in this connected component + // iterate through any sub-MEMs contained in the SMEM that share its end position + curr_mem_begin = mem_intervals[mem_idx].first; + curr_mem_end = mem_intervals[mem_idx].second; + while (mem_idx > 0 ? mem_intervals[mem_idx - 1].second == curr_mem_end : false) { + mem_idx--; + } + + for (int64_t i = mem_tail_coverage.size() - 1; i >= 0; i--) { - // keep track of which indexes have already been added to the stack - unordered_set stacked{trace_stack.begin(), trace_stack.end()}; + mem_tail_coverage[i].second = smem_count; - while (!trace_stack.empty()) { - size_t trace_idx = trace_stack.back(); - trace_stack.pop_back(); -#ifdef debug_od_clusterer - cerr << "\ttracing back from " << trace_idx << " with DP score " << nodes[trace_idx].dp_score << " and node score " << nodes[trace_idx].score << endl; -#endif - - int32_t target_source_score = nodes[trace_idx].dp_score - nodes[trace_idx].score; - for (ODEdge& edge : nodes[trace_idx].edges_to) { -#ifdef debug_od_clusterer - cerr << "\t\ttrace from " << edge.to_idx << " would have score " << nodes[edge.to_idx].dp_score + edge.weight + nodes[trace_idx].score << endl; -#endif - if (nodes[edge.to_idx].dp_score + edge.weight == target_source_score && !stacked.count(edge.to_idx)) { - trace_stack.push_back(edge.to_idx); - stacked.insert(edge.to_idx); -#ifdef debug_od_clusterer - cerr << "\t\tidentifying this as a proper traceback that we have not yet traced" << endl; -#endif - } + if (mem_idx >= 0 ? 
i == mem_intervals[mem_idx].second : false) { + smem_count++; + // iterate to the next MEM that contains some new sequence + curr_mem_begin = mem_intervals[mem_idx].first; + mem_idx--; + while (mem_idx >= 0 ? mem_intervals[mem_idx].first >= curr_mem_begin : false) { + mem_idx--; + } + // iterate through any sub-MEMs contained in the SMEM that share its end position + curr_mem_end = mem_intervals[mem_idx].second; + while (mem_idx > 0 ? mem_intervals[mem_idx - 1].second == curr_mem_end : false) { + mem_idx--; } } - - // make a cluster - to_return.emplace_back(); - auto& cluster = to_return.back(); - for (size_t traced_idx : stacked) { - ODNode& node = nodes[traced_idx]; - cluster.emplace_back(node.mem, node.start_pos); - } - - // put the cluster in order by read position - sort(cluster.begin(), cluster.end(), [](const hit_t& hit_1, const hit_t& hit_2) { - return hit_1.first->begin < hit_2.first->begin || - (hit_1.first->begin == hit_2.first->begin && hit_1.first->end < hit_2.first->end); - }); } - return std::move(to_return); +#ifdef debug_mem_clusterer + cerr << "computed left MEM coverage" << endl; + for (auto pos : mem_tail_coverage) { + cerr << pos.first << " "; + } + cerr << endl; + cerr << "computed right MEM coverage" << endl; + for (auto pos : mem_tail_coverage) { + cerr << pos.second << " "; + } + cerr << endl; +#endif + + return mem_tail_coverage; } vector, int64_t>> OrientedDistanceClusterer::pair_clusters(const Alignment& alignment_1, @@ -1968,17 +2457,12 @@ vector, int64_t>> OrientedDistanceClusterer::pair_clus const vector& right_clusters, const vector>& left_alt_cluster_anchors, const vector>& right_alt_cluster_anchors, - xg::XG* xgindex, - int64_t min_inter_cluster_distance, - int64_t max_inter_cluster_distance, - bool unstranded, - paths_of_node_memo_t* paths_of_node_memo, - oriented_occurences_memo_t* oriented_occurences_memo, - handle_memo_t* handle_memo) { - -#ifdef debug_od_clusterer + int64_t optimal_separation, + int64_t max_deviation) { + +#ifdef debug_mem_clusterer cerr << "beginning clustering of MEM cluster pairs for " << left_clusters.size() << " left clusters and " << right_clusters.size() << " right clusters" << endl; - cerr << "looking for pairs in the distance range of " << min_inter_cluster_distance << " to " << max_inter_cluster_distance << endl; + cerr << "looking for pairs with separation within of " << max_deviation << " from " << optimal_separation << endl; #endif // We will fill this in with all sufficiently close pairs of clusters from different reads. @@ -1994,551 +2478,1296 @@ vector, int64_t>> OrientedDistanceClusterer::pair_clus size_t total_cluster_positions = total_clusters + total_alt_anchors; // Compute distance trees for sets of clusters that are distance-able on consistent strands. - unordered_map, int64_t> distance_tree = get_on_strand_distance_tree(total_cluster_positions, unstranded, xgindex, + unordered_map, int64_t> distance_tree = get_on_strand_distance_tree(total_cluster_positions, [&](size_t cluster_num) { // Assumes the clusters are nonempty. if (cluster_num < left_clusters.size()) { // Grab the pos_t for the first hit in the cluster, which is sorted to be the largest one. 
- return left_clusters[cluster_num]->front().second; + return left_clusters[cluster_num]->first.front().second; } else if (cluster_num < total_clusters) { // Grab the pos_t for the largest hit from the other cluster - return right_clusters[cluster_num - left_clusters.size()]->front().second; + return right_clusters[cluster_num - left_clusters.size()]->first.front().second; } else if (cluster_num < total_clusters + left_alt_cluster_anchors.size()) { // Grab a lower pos_t in the list of hits according to the alt anchor const pair& alt_anchor = left_alt_cluster_anchors[cluster_num - total_clusters]; - return left_clusters[alt_anchor.first]->at(alt_anchor.second).second; + return left_clusters[alt_anchor.first]->first.at(alt_anchor.second).second; } else { // Grab an alternate pos_t for a right cluster const pair& alt_anchor = right_alt_cluster_anchors[cluster_num - total_clusters - left_alt_cluster_anchors.size()]; - return right_clusters[alt_anchor.first]->at(alt_anchor.second).second; + return right_clusters[alt_anchor.first]->first.at(alt_anchor.second).second; } }, [&](size_t cluster_num) { // Give the offset of the position we chose to either the start or end of the read if (cluster_num < left_clusters.size()) { - return alignment_1.sequence().begin() - left_clusters[cluster_num]->front().first->begin; + return alignment_1.sequence().begin() - left_clusters[cluster_num]->first.front().first->begin; } else if (cluster_num < total_clusters) { - return alignment_2.sequence().end() - right_clusters[cluster_num - left_clusters.size()]->front().first->begin; + return alignment_2.sequence().end() - right_clusters[cluster_num - left_clusters.size()]->first.front().first->begin; } else if (cluster_num < total_clusters + left_alt_cluster_anchors.size()) { const pair& alt_anchor = left_alt_cluster_anchors[cluster_num - total_clusters]; - return alignment_1.sequence().begin() - left_clusters[alt_anchor.first]->at(alt_anchor.second).first->begin; + return alignment_1.sequence().begin() - left_clusters[alt_anchor.first]->first.at(alt_anchor.second).first->begin; } else { const pair& alt_anchor = right_alt_cluster_anchors[cluster_num - total_clusters - left_alt_cluster_anchors.size()]; - return alignment_2.sequence().end() - right_clusters[alt_anchor.first]->at(alt_anchor.second).first->begin; + return alignment_2.sequence().end() - right_clusters[alt_anchor.first]->first.at(alt_anchor.second).first->begin; } - }, - paths_of_node_memo, oriented_occurences_memo, handle_memo); + }); // Flatten the distance tree to a set of linear spaces, one per tree. 
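The flattening step below turns the recorded pairwise distance records into one "linear space" of relative coordinates per connected tree, treating the measured distances as transitive and flipping the sign when a pair was recorded in the opposite order. A minimal, self-contained sketch of that traversal, with toy containers rather than vg's types:

#include <cstdint>
#include <iostream>
#include <map>
#include <unordered_map>
#include <utility>
#include <vector>

int main() {
    size_t num_items = 3;
    // distances recorded as (from, to) -> signed offset
    std::map<std::pair<size_t, size_t>, int64_t> dists = {{{0, 1}, 10}, {{2, 1}, 4}};

    // adjacency list of the (undirected, unrooted) distance tree
    std::vector<std::vector<size_t>> adj(num_items);
    for (const auto& d : dists) {
        adj[d.first.first].push_back(d.first.second);
        adj[d.first.second].push_back(d.first.first);
    }

    std::vector<bool> seen(num_items, false);
    std::vector<std::unordered_map<size_t, int64_t>> linear_spaces;
    for (size_t i = 0; i < num_items; ++i) {
        if (seen[i]) continue;
        linear_spaces.emplace_back();
        auto& rel = linear_spaces.back();
        rel[i] = 0; seen[i] = true;                 // arbitrary origin for this tree
        std::vector<size_t> stack = {i};
        while (!stack.empty()) {
            size_t cur = stack.back(); stack.pop_back();
            for (size_t nxt : adj[cur]) {
                if (seen[nxt]) continue;
                // flip the sign if the distance was measured in the other order
                int64_t d = dists.count({cur, nxt}) ? dists.at({cur, nxt}) : -dists.at({nxt, cur});
                rel[nxt] = rel[cur] + d;            // distances treated as transitive
                seen[nxt] = true;
                stack.push_back(nxt);
            }
        }
    }
    // prints items 0, 1, 2 with relative positions 0, 10, 6 (in some order)
    for (auto& p : linear_spaces.front()) std::cout << p.first << " -> " << p.second << "\n";
    return 0;
}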
vector> linear_spaces = flatten_distance_tree(total_cluster_positions, distance_tree); -#ifdef debug_od_clusterer +#ifdef debug_mem_clusterer for (const auto& strand : linear_spaces) { cerr << "strand reconstruction: " << endl; for (const auto& record : strand) { if (record.first < left_clusters.size()) { - cerr << "\t" << record.first << " left: " << record.second << "\t" << left_clusters[record.first]->front().second << endl; + cerr << "\t" << record.first << " left: " << record.second << "\t" << left_clusters[record.first]->first.front().second << endl; } else if (record.first < total_clusters) { - cerr << "\t" << record.first - left_clusters.size() << " right: " << record.second << "\t" << right_clusters[record.first - left_clusters.size()]->front().second << endl; + cerr << "\t" << record.first - left_clusters.size() << " right: " << record.second << "\t" << right_clusters[record.first - left_clusters.size()]->first.front().second << endl; } else if (record.first < total_clusters + left_alt_cluster_anchors.size()) { const pair& alt_anchor = left_alt_cluster_anchors[record.first - total_clusters]; - cerr << "\t" << alt_anchor.first << "(alt " << alt_anchor.second << ") left: " << record.second << "\t" << left_clusters[alt_anchor.first]->front().second << endl; + cerr << "\t" << alt_anchor.first << "(alt " << alt_anchor.second << ") left: " << record.second << "\t" << left_clusters[alt_anchor.first]->first.front().second << endl; } else { const pair& alt_anchor = right_alt_cluster_anchors[record.first - total_clusters - left_alt_cluster_anchors.size()]; - cerr << "\t" << alt_anchor.first << "(alt " << alt_anchor.second << ") right: " << record.second << "\t" << right_clusters[alt_anchor.first]->front().second << endl; + cerr << "\t" << alt_anchor.first << "(alt " << alt_anchor.second << ") right: " << record.second << "\t" << right_clusters[alt_anchor.first]->first.front().second << endl; + } + + } + } +#endif + + // choose bounds based on whether we're measuring stranded distances + int64_t max_inter_cluster_distance = optimal_separation + max_deviation; + int64_t min_inter_cluster_distance = optimal_separation - max_deviation; + + for (const unordered_map& linear_space : linear_spaces) { + // For each linear space + + // The linear space may run forward or reverse relative to our read. + + // This will hold pairs of relative position and cluster number + vector> sorted_pos; + for (auto& cluster_and_pos : linear_space) { + // Flip each pair around and put it in the list to sort. + sorted_pos.emplace_back(cluster_and_pos.second, cluster_and_pos.first); + } + // Sort the list ascending by the first item (relative position) + std::sort(sorted_pos.begin(), sorted_pos.end()); + + // Now scan for opposing pairs within the distance limit. + // TODO: this is going to be O(n^2) in the number of clusters in range. + // Note: but only if there are a lot of clusters within the range, if the + // clusters are distributed sparsely it will be approximately linear + + // Keep a cursor to the start of the window and the end of the window. + // When adding each new thing to the window, eject anything too far + // behind it, then compare it to everything that is left. 
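Below, the pairs within each linear space are found with a forward-moving window over the sorted relative positions. This is a compact, runnable stand-in for that scan (no alternate anchors, invented names); min_sep and max_sep correspond to optimal_separation minus and plus max_deviation.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

int main() {
    // (relative position, true if the item comes from the right/mate read)
    std::vector<std::pair<int64_t, bool>> items = {
        {0, false}, {120, true}, {480, true}, {510, false}, {950, true}};
    std::sort(items.begin(), items.end());

    int64_t min_sep = 100, max_sep = 600;
    size_t window_start = 0, window_last = 0;
    for (size_t i = 0; i < items.size(); ++i) {
        if (items[i].second) continue;                        // only start pairs from left items
        int64_t lo = items[i].first + min_sep, hi = items[i].first + max_sep;
        // advance the window so it covers relative positions in [lo, hi)
        while (window_start < items.size() && items[window_start].first < lo) ++window_start;
        while (window_last + 1 < items.size() && items[window_last + 1].first < hi) ++window_last;
        for (size_t j = window_start; j <= window_last && j < items.size(); ++j) {
            if (!items[j].second) continue;                   // skip items from the same read end
            std::cout << "pair " << i << "," << j << " separation "
                      << items[j].first - items[i].first << "\n";
        }
    }
    return 0;
}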
+ size_t window_start = 0; + size_t window_last = 0; + + for (size_t i = 0; i < sorted_pos.size(); i++) { + // we're looking for left to right connections, so don't start from the right + if ((sorted_pos[i].second >= left_clusters.size() && sorted_pos[i].second < total_clusters) + || (sorted_pos[i].second >= total_clusters + left_alt_cluster_anchors.size())) { + continue; + } + + size_t left_idx = (sorted_pos[i].second < total_clusters ? + sorted_pos[i].second : left_alt_cluster_anchors[sorted_pos[i].second - total_clusters].first); + + // the interval of linearized coordinates we want to form pairs to + int64_t coord_interval_start = sorted_pos[i].first + min_inter_cluster_distance; + int64_t coord_interval_end = sorted_pos[i].first + max_inter_cluster_distance; + +#ifdef debug_mem_clusterer + if (sorted_pos[i].second < total_clusters) { + cerr << "looking for clusters consistent with cluster that starts with " << left_clusters[sorted_pos[i].second]->first.front().second << " at relative position " << sorted_pos[i].first << " in coordinate window " << coord_interval_start << ":" << coord_interval_end << endl; + } + else { + const pair& alt_anchor = left_alt_cluster_anchors[sorted_pos[i].second - total_clusters]; + cerr << "looking for clusters consistent with (alt) cluster that starts with " << left_clusters[alt_anchor.first]->first.front().second << " at relative position " << sorted_pos[i].first << " in coordinate window " << coord_interval_start << ":" << coord_interval_end << endl; + } +#endif + + // move the window bounds forward until it's inside the coordinate interval + while (window_start < sorted_pos.size() ? sorted_pos[window_start].first < coord_interval_start : false) { + window_start++; +#ifdef debug_mem_clusterer + if (window_start == sorted_pos.size()) { + cerr << "window is beyond the end of the clusters" << endl; + } + else { + cerr << "moving window start to relative position " << sorted_pos[window_start].first << endl; + } +#endif + } + while (window_last + 1 < sorted_pos.size() ? sorted_pos[window_last + 1].first < coord_interval_end : false) { + window_last++; +#ifdef debug_mem_clusterer + cerr << "moving window end to relative position " << sorted_pos[window_last - 1].first << endl; +#endif + } + + // add each pair of clusters that's from the two read ends to the return value + for (size_t j = window_start; j <= window_last; j++) { + if (sorted_pos[j].second < left_clusters.size() + || (sorted_pos[j].second >= total_clusters && sorted_pos[j].second < total_clusters + left_alt_cluster_anchors.size())) { +#ifdef debug_mem_clusterer + size_t idx = sorted_pos[j].second < total_clusters ? 
sorted_pos[j].second : sorted_pos[j].second - total_clusters; + cerr << "cluster at relative position " << sorted_pos[idx].first << " is from the same end, skipping" << endl; +#endif + continue; + } + + size_t right_idx; + if (sorted_pos[j].second < total_clusters) { + right_idx = sorted_pos[j].second - left_clusters.size(); + } + else { + right_idx = right_alt_cluster_anchors[sorted_pos[j].second - total_clusters - left_alt_cluster_anchors.size()].first; + } + +#ifdef debug_mem_clusterer + cerr << "adding pair (" << left_idx << ", " << right_idx << ") with cluster relative position " << sorted_pos[j].first << " starting with " << right_clusters[right_idx]->first.front().second << endl; +#endif + + to_return.emplace_back(make_pair(left_idx, right_idx), + sorted_pos[j].first - sorted_pos[i].first); + } + } + } + + if (!left_alt_cluster_anchors.empty() || !right_alt_cluster_anchors.empty()) { + // get rid of extra copies of pairs due to alternate anchor positions + deduplicate_cluster_pairs(to_return, optimal_separation); + } + + return to_return; +} + +SnarlMinDistance::SnarlMinDistance(SnarlDistanceIndex& distance_index) : distance_index(distance_index) { + // nothing else to do +} + +int64_t SnarlMinDistance::operator()(const pos_t& pos_1, const pos_t& pos_2) { + size_t distance = minimum_distance(distance_index, pos_1, pos_2); + return distance == std::numeric_limits::max() ? -1 : (int64_t)distance; +} + +TipAnchoredMaxDistance::TipAnchoredMaxDistance(SnarlDistanceIndex& distance_index) : distance_index(distance_index) { + // nothing else to do +} + +int64_t TipAnchoredMaxDistance::operator()(const pos_t& pos_1, const pos_t& pos_2) { + return maximum_distance(distance_index, pos_1, pos_2); +} + +TargetValueSearch::TargetValueSearch(const HandleGraph& handle_graph, + DistanceHeuristic* upper_bound_heuristic, + DistanceHeuristic* lower_bound_heuristic) : + handle_graph(handle_graph), upper_bound_heuristic(upper_bound_heuristic), lower_bound_heuristic(lower_bound_heuristic) { + // nothing else to do +} + +bool TargetValueSearch::tv_path_exists(const pos_t& pos_1, const pos_t& pos_2, int64_t target_value, int64_t tolerance) { + return !tv_path(pos_1, pos_2, target_value, tolerance).empty(); +} + +vector TargetValueSearch::tv_path(const pos_t& pos_1, const pos_t& pos_2, int64_t target_value, int64_t tolerance) { + + //TODO: Doesn't work for cyclic graphs since max dist returns cap, not infinity + bool exact_min = true;//TODO: Put this somewhere else. 
True if the min heuristic is exact + DistanceHeuristic& min_distance = *lower_bound_heuristic; + DistanceHeuristic& max_distance = *upper_bound_heuristic; + + int64_t offset_1 = offset(pos_1); + int64_t offset_2 = offset(pos_2); + + //map each node to the target values from that node that are out of the min + //and max bounds for the node but within tolerance of the target + //Only keep the smaller/larger value that is closest to the target - maximum shorter distance + hash_map, int64_t> node_to_target_longer;//Too long + hash_map, int64_t> node_to_target_shorter;//Too short + + //Path that is closest to the target and difference from target + pair, int64_t>> next_best + (-1, make_pair(make_pair(0, false), -1)); + + //Best that is too long - use when min heuristic finds actual minimum + //difference between target and best dist, node, and target from that node + //difference between actual best path and target, the node itself, and the + // target from that node + pair, int64_t>> best_long + (-1, make_pair(make_pair(0, false), -1)); + + + //map each node and target for node to the node+target leading to it + //TODO: Maybe better to map each node to the actual path and remove old ones as necessary + hash_map, int64_t>, pair, int64_t>> + node_to_path; + + //reachable node + vector, int64_t>> next_nodes; //node and target + next_nodes.push_back(make_pair(make_pair(id(pos_1), is_rev(pos_1)), + target_value + offset_1)); + + handle_t h = handle_graph.get_handle(id(pos_1), is_rev(pos_1)); + + + + + //TODO: maybe move this somewhere else + auto get_min_path = [&](pair, int64_t> node) + -> vector { + /* Assuming that the path from node to pos_2 is the best path, + * find the path from pos_1 to pos_2 that passes through node with the + * given target value + */ + + //Get the path from pos_1 to node + list result; + auto prev = node_to_path.find(node); + handle_t curr_handle = handle_graph.get_handle(node.first.first, + node.first.second); + result.push_front(curr_handle); + while (prev != node_to_path.end()) { + pair, int64_t> prev_n = prev->second; + curr_handle = handle_graph.get_handle( + prev_n.first.first, prev_n.first.second); + result.push_front(curr_handle); + prev = node_to_path.find(prev_n); + + } + + vector path (result.begin(), result.end()); + //Path contains handles from pos_1 to node + + //Get the path from node to pos_2 + pos_t curr_pos = make_pos_t(node.first.first, node.first.second, 0); + int64_t dist = min_distance(curr_pos, pos_2); + while (id(curr_pos) != id(pos_2) || is_rev(curr_pos) != is_rev(pos_2)){ + + handle_t handle = handle_graph.get_handle(id(curr_pos), + is_rev(curr_pos)); + auto try_next = [&](const handle_t& h)-> bool { + curr_pos = make_pos_t(handle_graph.get_id(h), + handle_graph.get_is_reverse(h), 0); + + int64_t node_len = handle_graph.get_length(handle); + if (min_distance(curr_pos, pos_2) + node_len == dist) { + //If this node is on a minimum path + dist = dist - node_len; + path.push_back(h); + return false; + } else { + return true; + } + + }; + + handle_graph.follow_edges(handle, false, try_next); + + } + return path; + + }; + + int64_t min = min_distance(pos_1, pos_2); + if (min == -1 || target_value + tolerance < min) { + // The positions are too far apart, or are unreachable + return vector(); + } + + int64_t max = max_distance(pos_1, pos_2); + if (target_value - tolerance > max) { + // The positions are too close together + return vector(); + } + + //////////// Phase 1 of tsv search: get target for each reachable node + while (next_nodes.size() != 0) { 
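The loop below walks states of (node, remaining target) depth-first, subtracting each node's length as it is crossed and pruning any neighbor whose distance bounds to the destination cannot accommodate the remaining target. Here is a toy, self-contained sketch of that pruning idea; the graph, node lengths, and [min, max] bounds are hand-made for illustration and do not use vg's distance index.

#include <cstdint>
#include <iostream>
#include <map>
#include <utility>
#include <vector>

int main() {
    // toy graph: 0 -> 1 -> 3 and 0 -> 2 -> 3, with per-node sequence lengths
    std::map<int, std::vector<int>> edges = {{0, {1, 2}}, {1, {3}}, {2, {3}}};
    std::map<int, int64_t> node_len = {{0, 3}, {1, 2}, {2, 5}, {3, 1}};
    // hand-computed [min, max] distance from the start of each node to the start of node 3
    std::map<int, std::pair<int64_t, int64_t>> bounds = {{1, {2, 2}}, {2, {5, 5}}, {3, {0, 0}}};

    int64_t target = 5;                                        // matches the 0 -> 1 -> 3 route (3 + 2)
    std::vector<std::pair<int, int64_t>> stack = {{0, target}};
    while (!stack.empty()) {
        auto [node, remaining] = stack.back();
        stack.pop_back();
        if (node == 3 && remaining == 0) {
            std::cout << "hit the target exactly\n";
            continue;
        }
        for (int next : edges[node]) {
            int64_t next_remaining = remaining - node_len[node];  // crossing the current node
            auto [lo, hi] = bounds[next];
            if (next_remaining >= lo && next_remaining <= hi) {   // only expand states that can still work
                stack.push_back({next, next_remaining});
            }
        }
    }
    return 0;
}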
+ //Traverse graph in DFS order, find the target at each node + + pair, int64_t> next = next_nodes.back(); + next_nodes.pop_back(); + pair curr_node = next.first; + int64_t curr_target = next.second; + + + if (curr_node.first == id(pos_2) && curr_node.second == is_rev(pos_2)) { + //If this node is the end node + if (curr_target == offset_2) { + //If perfect path + list result; + auto prev = node_to_path.find(make_pair(curr_node, curr_target)); + handle_t handle = handle_graph.get_handle(curr_node.first, curr_node.second); + result.push_front(handle); + while (prev != node_to_path.end()) { + pair, int64_t> prev_n = prev->second; + handle = handle_graph.get_handle( + prev_n.first.first, prev_n.first.second); + result.push_front(handle); + prev = node_to_path.find(prev_n); + + } + return vector(result.begin(), result.end()); + } else { + int64_t diff = abs(curr_target-offset_2 ); + if (next_best.first == -1 || diff < next_best.first) { + next_best.first = diff; + next_best.second = make_pair(curr_node, curr_target); + } + } + } + + //If this is any other node or the target was not hit + + handle_t curr_handle = handle_graph.get_handle(curr_node.first, curr_node.second); + int64_t new_target = curr_target - handle_graph.get_length(curr_handle); + + vector best_path;//Use this if the best path can be found using min distance + + auto add_next = [&](const handle_t& h)-> bool { + //For each adjacent node, add it to next nodes if end node is + //reachable with target + + id_t id = handle_graph.get_id(h); + bool rev = handle_graph.get_is_reverse(h); + pos_t new_pos = make_pos_t(id, rev, 0); + + int64_t min_dist = min_distance(new_pos, pos_2); + int64_t max_dist = max_distance(new_pos, pos_2); + int64_t lower_target = std::max((int64_t)0, (new_target - tolerance)); + int64_t upper_target = new_target + tolerance; + + if (exact_min && min_dist != -1 && min_dist == new_target) { + //If the minimum path is the best path + node_to_path[make_pair(make_pair(id, rev), new_target)]= + make_pair(curr_node, curr_target); + best_path = get_min_path(make_pair(make_pair(id, rev), new_target)); + return false; + } + + if (min_dist != -1 && + min_dist <= new_target && new_target <= max_dist) { + //If the target is within the distance bounds + + auto prev = node_to_path.find(make_pair(make_pair(id, rev), + new_target)); + if (prev == node_to_path.end()) { + //If this node hasn't been seen before + node_to_path[make_pair(make_pair(id, rev), new_target)]= + make_pair(curr_node, curr_target); + next_nodes.emplace_back(make_pair(id, rev), new_target); + } + + } else if (min_dist != -1 && + ((lower_target <= min_dist && min_dist <= upper_target) || + (lower_target <= max_dist && max_dist <= upper_target))){ + + //If no path will hit the target but there are paths + //within tolerance, then save for later + //TODO: Could take a shortcut if we assume that the min dist is actual min dist + + auto prev_max_target = node_to_target_shorter.find( + make_pair(id, rev)); + auto prev_min_target = node_to_target_longer.find( + make_pair(id, rev)); + if (min_dist >= new_target) { + //All paths too long - want to minimize distance + + if (exact_min && (best_long.first == -1 || + min_dist - new_target < best_long.first)) { + //If the min heuristic is exact, only save one longer + //path + //If the min from here is better than previous longer path + best_long.first = min_dist - new_target; + best_long.second = make_pair(make_pair(id, rev), + new_target); + node_to_path[make_pair(make_pair(id, rev), + new_target)] = 
make_pair(curr_node, curr_target); + + } else if (!exact_min && + (prev_min_target == node_to_target_longer.end() || + new_target < prev_min_target->second)) { + //Target is better (smaller)than previous from this node + node_to_target_longer.erase(make_pair(id, rev)); + node_to_target_longer.emplace(make_pair(id, rev), + new_target); + node_to_path[make_pair(make_pair(id, rev), new_target)] + = make_pair(curr_node, curr_target); + } + } else if (max_dist <= new_target && + (prev_max_target == node_to_target_shorter.end() || + new_target > prev_max_target->second)){ + //All paths too short; + node_to_target_shorter.erase(make_pair(id, rev)); + node_to_target_shorter.emplace( + make_pair(id, rev), new_target); + node_to_path[make_pair(make_pair(id, rev), + new_target)] = make_pair(curr_node, curr_target); + } } + return true; + }; + if (!handle_graph.follow_edges(curr_handle, false, add_next)){ + + return best_path; + + } + + } + return tv_phase2(pos_1, pos_2, target_value, tolerance, node_to_target_shorter, node_to_target_longer, best_long, next_best, node_to_path); +} + +vector TargetValueSearch::tv_phase2(const pos_t& pos_1, const pos_t& pos_2, int64_t target_value, int64_t tolerance, + hash_map, int64_t>& node_to_target_shorter, + hash_map, int64_t>& node_to_target_longer, + pair, int64_t>>& best_long, + pair, int64_t>>& next_best, + hash_map, int64_t>, pair, int64_t>>& node_to_path) { +//TODO: Any path that has been found is probably still pretty good, could return it here + + DistanceHeuristic& min_distance = *lower_bound_heuristic; + DistanceHeuristic& max_distance = *upper_bound_heuristic; + int64_t offset_1 = offset(pos_1); + int64_t offset_2 = offset(pos_2); + auto get_min_path = [&](pair, int64_t> node) + -> vector { + /* Assuming that the path from node to pos_2 is the best path, + * find the path from pos_1 to pos_2 that passes through node with the + * given target value + */ + + //Get the path from pos_1 to node + list result; + auto prev = node_to_path.find(node); + handle_t curr_handle = handle_graph.get_handle(node.first.first, + node.first.second); + result.push_front(curr_handle); + while (prev != node_to_path.end()) { + pair, int64_t> prev_n = prev->second; + curr_handle = handle_graph.get_handle( + prev_n.first.first, prev_n.first.second); + result.push_front(curr_handle); + prev = node_to_path.find(prev_n); + + } + + vector path (result.begin(), result.end()); + //Path contains handles from pos_1 to node + + //Get the path from node to pos_2 + pos_t curr_pos = make_pos_t(node.first.first, node.first.second, 0); + int64_t dist = min_distance(curr_pos, pos_2); + while (id(curr_pos) != id(pos_2) || is_rev(curr_pos) != is_rev(pos_2)){ + + handle_t handle = handle_graph.get_handle(id(curr_pos), + is_rev(curr_pos)); + auto try_next = [&](const handle_t& h)-> bool { + curr_pos = make_pos_t(handle_graph.get_id(h), + handle_graph.get_is_reverse(h), 0); + + int64_t node_len = handle_graph.get_length(handle); + if (min_distance(curr_pos, pos_2) + node_len == dist) { + //If this node is on a minimum path + dist = dist - node_len; + path.push_back(h); + return false; + } else { + return true; + } + + }; + + handle_graph.follow_edges(handle, false, try_next); + + } + return path; + + }; + ///////// Phase 2 + //If there is no perfect path, look for ones still within tolerance + auto cmp = [] (pair, int64_t>, int64_t> x, + pair, int64_t>, int64_t> y) { + //Comparison function for priority queue + return (x.second > y.second); + }; + priority_queue, int64_t>, int64_t>, + vector, 
int64_t>, int64_t>>, + decltype(cmp)> reachable(cmp); + + //Put all nodes into + for (auto it : node_to_target_shorter) { + pair node = it.first; + int64_t target = it.second; + pos_t pos = make_pos_t(node.first, node.second, 0); + int64_t diff = target - max_distance(pos, pos_2) ; + reachable.push(make_pair(make_pair(node, target), diff)); + + } + for (auto it : node_to_target_longer) { + pair node = it.first; + int64_t target = it.second; + pos_t pos = make_pos_t(node.first, node.second, 0); + int64_t diff = min_distance(pos, pos_2) - target; + reachable.push(make_pair(make_pair(node, target), diff)); + + } + + while (reachable.size() != 0) { + //Continue A* search of nodes that cannot reach pos_2 with target length + + pair, int64_t>,int64_t> next = reachable.top(); + reachable.pop(); + pair curr_node = next.first.first; + int64_t curr_target = next.first.second; + + handle_t curr_handle = handle_graph.get_handle(curr_node.first, curr_node.second); + pair, int64_t> prev_node (curr_node, curr_target); + + if (curr_node.first == id(pos_2) && curr_node.second == is_rev(pos_2)) { + //If this node is the end node + int64_t diff = abs( curr_target - offset_2); + if (next_best.first == -1 || diff < next_best.first) { + next_best.first = diff; + next_best.second = prev_node; + } + } else { + + //If this is any other node + //TODO: Should be able to traverse the start node twice if this is a cyclic graph + + int64_t new_target = curr_target - + handle_graph.get_length(curr_handle); + auto add_next = [&](const handle_t& h)-> bool { + id_t id = handle_graph.get_id(h); + bool rev = handle_graph.get_is_reverse(h); + pos_t new_pos = make_pos_t(id, rev, 0); + + int64_t min_dist = min_distance(new_pos, pos_2); + int64_t max_dist = max_distance(new_pos, pos_2); + + if (min_dist != -1) { + auto prev_max_target = node_to_target_shorter.find( + make_pair(id, rev)); + auto prev_min_target = node_to_target_longer.find( + make_pair(id, rev)); + if (min_dist >= new_target) { + //If paths are too long + if ( min_dist <= new_target + tolerance && + (prev_min_target == node_to_target_longer.end() || + new_target < prev_min_target->second)) { + //If this target is better than last one + node_to_target_longer.erase(make_pair(id, rev)); + node_to_target_longer.emplace(make_pair(id, rev), + new_target); + node_to_path[make_pair(make_pair(id, rev), + new_target)] = prev_node; + int64_t diff = min_dist - new_target; + reachable.push(make_pair(make_pair( + make_pair(id, rev), new_target), diff)); + } + + } else if (max_dist <= new_target){ + //All paths too short + if ( max_dist >= new_target - tolerance && + (prev_max_target == node_to_target_shorter.end() || + new_target > prev_max_target->second)){ + + node_to_target_shorter.erase(make_pair(id, rev)); + node_to_target_shorter.emplace(make_pair(id, rev), + new_target); + node_to_path[make_pair(make_pair(id, rev), + new_target)] = prev_node; + int64_t diff = new_target - max_dist; + reachable.push(make_pair(make_pair(make_pair( + id, rev),new_target), diff)); + } + } else { + //Target is within bounds again + //TODO: Maybe keep track of whether the path is too long or too short + auto prev = node_to_path.find(make_pair(make_pair( + id, rev), new_target)); + if (prev == node_to_path.end()) { + //If this node hasn't been seen before + node_to_path[make_pair(make_pair(id, rev), + new_target)]= prev_node; + reachable.push(make_pair(make_pair(make_pair( + id, rev), new_target), 0)); + } + } + } + + return true; + }; + handle_graph.follow_edges(curr_handle, false, add_next); 
} } -#endif - - for (const unordered_map& linear_space : linear_spaces) { - // For each linear space - - // The linear space may run forward or reverse relative to our read. + + if (best_long.first != -1 && best_long.first <= tolerance && + (next_best.first == -1 || + best_long.first <= next_best.first)) { + //Get path for the best that is longer than the target + + return get_min_path(best_long.second); + + } else if (next_best.first != -1 && next_best.first <= tolerance) { - // This will hold pairs of relative position and cluster number - vector> sorted_pos; - for (auto& cluster_and_pos : linear_space) { - // Flip each pair around and put it in the list to sort. - sorted_pos.emplace_back(cluster_and_pos.second, cluster_and_pos.first); + //Backtrack to get path + list result; + auto prev = node_to_path.find(next_best.second); + handle_t handle = handle_graph.get_handle(next_best.second.first.first, + next_best.second.first.second); + result.push_front(handle); + while (prev != node_to_path.end()) { + pair, int64_t> prev_node = prev->second; + handle = handle_graph.get_handle(prev_node.first.first, + prev_node.first.second); + result.push_front(handle); + prev = node_to_path.find(prev_node); + } - // Sort the list ascending by the first item (relative position) - std::sort(sorted_pos.begin(), sorted_pos.end()); - - // Now scan for opposing pairs within the distance limit. - // TODO: this is going to be O(n^2) in the number of clusters in range. - // Note: but only if there are a lot of clusters within the range, if the - // clusters are distributed sparsely it will be approximately linear - - // Keep a cursor to the start of the window and the end of the window. - // When adding each new thing to the window, eject anything too far - // behind it, then compare it to everything that is left. - size_t window_start = 0; - size_t window_last = 0; + + return vector(result.begin(), result.end()); + } else { + + return vector(); + } +} + +int64_t TargetValueSearch::tv_path_length(const pos_t& pos_1, const pos_t& pos_2, int64_t target_value, int64_t tolerance) { + + vector path = tv_path(pos_1, pos_2, target_value, tolerance); + if (path.empty()) { + return numeric_limits::max(); + } + else { + // TODO: we should move tv_path into an internal function that also returns length, + // there shouldn't be any reason to recompute it here! 
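A short worked sketch of the length computed just below: the lengths of every handle on the path except the last, plus the end offset, minus the start offset. The numbers are purely illustrative.

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    std::vector<int64_t> handle_lengths = {5, 3, 4};    // nodes visited, in order
    int64_t offset_1 = 2;                               // offset of pos_1 into the first node
    int64_t offset_2 = 1;                               // offset of pos_2 into the last node

    int64_t length = offset_2 - offset_1;
    for (size_t i = 0; i + 1 < handle_lengths.size(); ++i) {
        length += handle_lengths[i];                    // all handles except the last
    }
    std::cout << length << std::endl;                   // (5 + 3) + 1 - 2 = 7
    return 0;
}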
+ int64_t length = offset(pos_2) - offset(pos_1); + for (size_t i = 0, end = path.size() - 1; i < end; i++) { + length += handle_graph.get_length(path[i]); + } + return length; + } +} + +TVSClusterer::TVSClusterer(const HandleGraph* handle_graph, SnarlDistanceIndex* distance_index) : + tvs(*handle_graph, new TipAnchoredMaxDistance(*distance_index), new SnarlMinDistance(*distance_index)) { + + // nothing else to do +} + +MEMClusterer::HitGraph TVSClusterer::make_hit_graph(const Alignment& alignment, const vector& mems, + const GSSWAligner* aligner, size_t min_mem_length, + const match_fanouts_t* fanouts) { + + + // intialize with nodes + HitGraph hit_graph(mems, alignment, aligner, min_mem_length, false, fanouts); + + // assumes that MEMs are given in lexicographic order by read interval + for (size_t i = 0; i < hit_graph.nodes.size(); i++) { + HitNode& hit_node_1 = hit_graph.nodes[i]; - for (size_t i = 0; i < sorted_pos.size(); i++) { - // we're looking for left to right connections, so don't start from the right - if ((sorted_pos[i].second >= left_clusters.size() && sorted_pos[i].second < total_clusters) - || (sorted_pos[i].second >= total_clusters + left_alt_cluster_anchors.size())) { + for (size_t j = i + 1; j < hit_graph.nodes.size(); j++){ + + HitNode& hit_node_2 = hit_graph.nodes[j]; + + if (hit_node_2.mem->begin <= hit_node_1.mem->begin + && hit_node_2.mem->end <= hit_node_1.mem->end) { + // this node is at the same place or earlier in the read, so they can't be colinear + +#ifdef debug_mem_clusterer + cerr << "nodes " << i << " (" << hit_node_1.start_pos << ") and " << j << " (" << hit_node_2.start_pos << ") are not read colinear" << endl; +#endif continue; } - size_t left_idx = (sorted_pos[i].second < total_clusters ? - sorted_pos[i].second : left_alt_cluster_anchors[sorted_pos[i].second - total_clusters].first); + // how far apart do we expect them to be based on the read? + int64_t read_separation = hit_node_2.mem->begin - hit_node_1.mem->begin; - // the interval of linearized coordinates we want to form pairs to - int64_t coord_interval_start = sorted_pos[i].first + min_inter_cluster_distance; - int64_t coord_interval_end = sorted_pos[i].first + max_inter_cluster_distance; + // how long of an insert/deletion could we detect based on the scoring parameters? 
+ size_t longest_gap = min(min(aligner->longest_detectable_gap(alignment, hit_node_1.mem->end), + aligner->longest_detectable_gap(alignment, hit_node_2.mem->begin)), + max_gap); -#ifdef debug_od_clusterer - if (sorted_pos[i].second < total_clusters) { - cerr << "looking for clusters consistent with cluster that starts with " << left_clusters[sorted_pos[i].second]->front().second << " at relative position " << sorted_pos[i].first << " in coordinate window " << coord_interval_start << ":" << coord_interval_end << endl; - } - else { - const pair& alt_anchor = left_alt_cluster_anchors[sorted_pos[i].second - total_clusters]; - cerr << "looking for clusters consistent with (alt) cluster that starts with " << left_clusters[alt_anchor.first]->front().second << " at relative position " << sorted_pos[i].first << " in coordinate window " << coord_interval_start << ":" << coord_interval_end << endl; - } +#ifdef debug_mem_clusterer + cerr << "estimating distance between " << i << " (pos " << hit_node_1.start_pos << ") and " << j << " (pos " << hit_node_2.start_pos << ") with target " << read_separation << " and tolerance " << longest_gap << endl; #endif - // move the window bounds forward until it's inside the coordinate interval - while (window_start < sorted_pos.size() ? sorted_pos[window_start].first < coord_interval_start : false) { - window_start++; -#ifdef debug_od_clusterer - if (window_start == sorted_pos.size()) { - cerr << "window is beyond the end of the clusters" << endl; - } - else { - cerr << "moving window start to relative position " << sorted_pos[window_start].first << endl; - } -#endif - } - while (window_last + 1 < sorted_pos.size() ? sorted_pos[window_last + 1].first < coord_interval_end : false) { - window_last++; -#ifdef debug_od_clusterer - cerr << "moving window end to relative position " << sorted_pos[window_last - 1].first << endl; -#endif - } + // how close can we get to the expected distance, restricting to detectable edits + int64_t tv_len = tvs.tv_path_length(hit_node_1.start_pos, hit_node_2.start_pos, read_separation, longest_gap); - // add each pair of clusters that's from the two read ends to the return value - for (size_t j = window_start; j <= window_last; j++) { - if (sorted_pos[j].second < left_clusters.size() - || (sorted_pos[j].second >= total_clusters && sorted_pos[j].second < total_clusters + left_alt_cluster_anchors.size())) { -#ifdef debug_od_clusterer - size_t idx = sorted_pos[j].second < total_clusters ? 
sorted_pos[j].second : sorted_pos[j].second - total_clusters; - cerr << "cluster at relative position " << sorted_pos[idx].first << " is from the same end, skipping" << endl; +#ifdef debug_mem_clusterer + cerr << "estimate distance at " << tv_len << endl; #endif - continue; - } + + if (tv_len == read_separation + && ((hit_node_2.mem->begin >= hit_node_1.mem->begin && hit_node_2.mem->end < hit_node_1.mem->end) + || (hit_node_2.mem->begin > hit_node_1.mem->begin && hit_node_2.mem->end <= hit_node_1.mem->end))) { + // this has the appearance of being a redundant hit of a sub-MEM, which we don't want to form + // a separate cluster - size_t right_idx; - if (sorted_pos[j].second < total_clusters) { - right_idx = sorted_pos[j].second - left_clusters.size(); - } - else { - right_idx = right_alt_cluster_anchors[sorted_pos[j].second - total_clusters - left_alt_cluster_anchors.size()].first; - } + // we add a dummy edge, but only to connect the nodes' components and join the clusters, + // not to actually use in dynamic programming (given arbitrary low weight that should not + // cause overflow) + hit_graph.add_edge(i, j, numeric_limits::lowest() / 2, tv_len); + } + else if (tv_len != numeric_limits::max() + && hit_node_2.mem->begin >= hit_node_1.mem->begin + && hit_node_2.mem->end >= hit_node_1.mem->end) { + // there's a path within in the limit, and these hits are read colinear -#ifdef debug_od_clusterer - cerr << "adding pair (" << left_idx << ", " << right_idx << ") with cluster relative position " << sorted_pos[j].first << " starting with " << right_clusters[right_idx]->front().second << endl; -#endif + // the distance from the end of the first hit to the beginning of the next + int64_t graph_dist = tv_len - (hit_node_1.mem->end - hit_node_1.mem->begin); + + // add the corresponding edge + hit_graph.add_edge(i, j, estimate_edge_score(hit_node_1.mem, hit_node_2.mem, graph_dist, aligner), graph_dist); - to_return.emplace_back(make_pair(left_idx, right_idx), - sorted_pos[j].first - sorted_pos[i].first); } } } - if (!left_alt_cluster_anchors.empty() || !right_alt_cluster_anchors.empty()) { - // We assume that we're looking for the middle of the distances - int64_t target_separation = (max_inter_cluster_distance + min_inter_cluster_distance) / 2; - - // sort so that pairs with same clusters are adjacent - sort(to_return.begin(), to_return.end()); - -#ifdef debug_od_clusterer - cerr << "pairs before deduplicating:" << endl; - for (const auto& pair_record : to_return) { - cerr << pair_record.first.first << ", " << pair_record.first.second << ": " << pair_record.second << endl; - } - cerr << "target separation " << target_separation << endl; + return hit_graph; +} + +vector, int64_t>> TVSClusterer::pair_clusters(const Alignment& alignment_1, + const Alignment& alignment_2, + const vector& left_clusters, + const vector& right_clusters, + const vector>& left_alt_cluster_anchors, + const vector>& right_alt_cluster_anchors, + int64_t optimal_separation, + int64_t max_deviation) { + +#ifdef debug_mem_clusterer + cerr << "clustering pairs of clusters" << endl; #endif + + vector, int64_t>> to_return; + + for (size_t i = 0, i_end = left_clusters.size() + left_alt_cluster_anchors.size(); i < i_end; i++) { - size_t removed_so_far = 0; + // choose the appropriate left cluster and assign it a position + size_t left_clust_idx; + hit_t left_clust_hit; + if (i < left_clusters.size()) { + left_clust_idx = i; + left_clust_hit = left_clusters[i]->first.front(); + } + else { + auto& alt_anchor = 
left_alt_cluster_anchors[i - left_clusters.size()]; + left_clust_idx = alt_anchor.first; + left_clust_hit = left_clusters[left_clust_idx]->first.at(alt_anchor.second); + } - for (size_t i = 0; i < to_return.size();) { - // find the range of values that have the same pair of indices - size_t range_end = i + 1; - while (range_end < to_return.size() ? to_return[i].first == to_return[range_end].first : false) { - range_end++; - } + for (size_t j = 0, j_end = right_clusters.size() + right_alt_cluster_anchors.size(); j < j_end; j++) { - // find the pair that is closest to the middle of the target interval - int64_t best_separation = to_return[i].second; - size_t best_idx = i; - for (size_t j = i + 1; j < range_end; j++) { - if (abs(to_return[j].second - target_separation) < abs(best_separation - target_separation)) { - best_separation = to_return[j].second; - best_idx = j; - } + // choose the appropriate right cluster and assign it a position + size_t right_clust_idx; + hit_t right_clust_hit; + if (j < right_clusters.size()) { + right_clust_idx = j; + right_clust_hit = right_clusters[j]->first.front(); + } + else { + auto& alt_anchor = right_alt_cluster_anchors[j - right_clusters.size()]; + right_clust_idx = alt_anchor.first; + right_clust_hit = right_clusters[right_clust_idx]->first.at(alt_anchor.second); } - // move the best pair with these indices into the part of the vector we will keep - to_return[i - removed_so_far] = to_return[best_idx]; + // adjust the target value by how far away we are from the ends of the fragment + int64_t left_clip = left_clust_hit.first->begin - alignment_1.sequence().begin(); + int64_t right_clip = alignment_2.sequence().end() - right_clust_hit.first->begin; + int64_t target_separation = optimal_separation - left_clip - right_clip; - // we remove the entire interval except for one - removed_so_far += range_end - i - 1; - i = range_end; - } - - // trim off the end of the vector, which now contains arbitrary values - to_return.resize(to_return.size() - removed_so_far); - -#ifdef debug_od_clusterer - cerr << "pairs after deduplicating:" << endl; - for (const auto& pair_record : to_return) { - cerr << pair_record.first.first << ", " << pair_record.first.second << ": " << pair_record.second << endl; - } +#ifdef debug_mem_clusterer + cerr << "measuring distance between cluster " << left_clust_idx << " (" << left_clust_hit.second << ") and " << right_clust_idx << " (" << right_clust_hit.second << ") with target of " << target_separation << " and max deviation " << max_deviation << endl; #endif - } - - return to_return; -} - -void OrientedDistanceClusterer::prune_low_scoring_edges(vector>& components, size_t component_idx, double score_factor) { - - vector& component = components[component_idx]; - - // get the topological order within this component (expressed in indexes into the component vector) - vector component_order; - component_topological_order(component, component_order); - -#ifdef debug_od_clusterer - cerr << "doing backwards DP" << endl; + + // find the closest distance to this in the path + int64_t tv_dist = tvs.tv_path_length(left_clust_hit.second, right_clust_hit.second, + target_separation, max_deviation); + +#ifdef debug_mem_clusterer + cerr << "estimate distance at " << tv_dist << endl; #endif - - vector backwards_dp_score(component.size()); - unordered_map node_idx_to_component_idx; - for (size_t i = 0; i < component.size(); i++) { - backwards_dp_score[i] = nodes[component[i]].score; - node_idx_to_component_idx[component[i]] = i; - } - - // do dynamic 
programming backwards within this component - for (int64_t i = component_order.size() - 1; i >= 0; i--) { - size_t idx = component_order[i]; - size_t node_idx = component[idx]; - for (ODEdge& edge : nodes[node_idx].edges_to) { - size_t local_to_idx = node_idx_to_component_idx[edge.to_idx]; - int32_t dp_score = backwards_dp_score[idx] + edge.weight; - if (dp_score > backwards_dp_score[local_to_idx]) { - backwards_dp_score[local_to_idx] = dp_score; + + if (tv_dist != numeric_limits::max()) { + // we found a suitable path, add it to the return vector + to_return.emplace_back(make_pair(left_clust_idx, right_clust_idx), + tv_dist + left_clip + right_clip); + } } } -#ifdef debug_od_clusterer - cerr << "backwards dp scores:" << endl; - for (size_t i = 0; i < component.size(); i++) { - cerr << "\t" << component[i] << ": " << backwards_dp_score[i] << endl; + if (!left_alt_cluster_anchors.empty() || !right_alt_cluster_anchors.empty()) { + // get rid of extra copies of pairs due to alternate anchor positions + deduplicate_cluster_pairs(to_return, optimal_separation); } -#endif - // the minimum score we will require each edge to be a part of - int32_t min_score = *max_element(backwards_dp_score.begin(), backwards_dp_score.end()) * score_factor; + return to_return; +} + +MinDistanceClusterer::MinDistanceClusterer(SnarlDistanceIndex* distance_index) : distance_index(distance_index) { + // nothing to do +} -#ifdef debug_od_clusterer - cerr << "looking for edges with max score less than " << min_score << endl; +vector, int64_t>> MinDistanceClusterer::pair_clusters(const Alignment& alignment_1, + const Alignment& alignment_2, + const vector& left_clusters, + const vector& right_clusters, + const vector>& left_alt_cluster_anchors, + const vector>& right_alt_cluster_anchors, + int64_t optimal_separation, + int64_t max_deviation) { +#ifdef debug_mem_clusterer + cerr << "clustering pairs of clusters" << endl; #endif - for (size_t i = 0; i < component.size(); i++) { - size_t node_idx = component[i]; - ODNode& node = nodes[node_idx]; - for (size_t j = 0; j < node.edges_from.size(); ) { - ODEdge& edge = node.edges_from[j]; + vector, int64_t>> to_return; + + for (size_t i = 0, i_end = left_clusters.size() + left_alt_cluster_anchors.size(); i < i_end; i++) { + + // choose the appropriate left cluster and assign it a position + size_t left_clust_idx; + hit_t left_clust_hit; + if (i < left_clusters.size()) { + left_clust_idx = i; + left_clust_hit = left_clusters[i]->first.front(); + } + else { + auto& alt_anchor = left_alt_cluster_anchors[i - left_clusters.size()]; + left_clust_idx = alt_anchor.first; + left_clust_hit = left_clusters[left_clust_idx]->first.at(alt_anchor.second); + } + + for (size_t j = 0, j_end = right_clusters.size() + right_alt_cluster_anchors.size(); j < j_end; j++) { - // don't remove edges that look nearly perfect (helps keep redundant sub-MEMs in the cluster with - // their parent so that they can be removed later) - if (abs((edge.distance + (node.mem->end - node.mem->begin)) - - (nodes[edge.to_idx].mem->begin - node.mem->begin)) <= 1) { -#ifdef debug_od_clusterer - cerr << "preserving edge because distance looks good" << endl; + // choose the appropriate right cluster and assign it a position + size_t right_clust_idx; + hit_t right_clust_hit; + if (j < right_clusters.size()) { + right_clust_idx = j; + right_clust_hit = right_clusters[j]->first.front(); + } + else { + auto& alt_anchor = right_alt_cluster_anchors[j - right_clusters.size()]; + right_clust_idx = alt_anchor.first; + right_clust_hit 
= right_clusters[right_clust_idx]->first.at(alt_anchor.second); + } + +#ifdef debug_mem_clusterer + cerr << "measuring distance between cluster " << left_clust_idx << " (" << left_clust_hit.second << ") and " << right_clust_idx << " (" << right_clust_hit.second << ") with target of " << optimal_separation << " and max deviation " << max_deviation << endl; #endif - j++; - continue; + + // what is the minimum distance between these hits? + int64_t min_dist = minimum_distance(*distance_index, left_clust_hit.second, right_clust_hit.second); + if (min_dist == std::numeric_limits::max()) { + size_t rev_min_dist = minimum_distance(*distance_index, left_clust_hit.second, right_clust_hit.second); + if (rev_min_dist == std::numeric_limits::max()) { + // these are not reachable, don't make a pair + continue; + } + else { + // this is reachable by traversing backwards, give it negative distance + min_dist = -(int64_t)rev_min_dist; + } } - // the forward-backward score of this edge - int32_t edge_score = node.dp_score + edge.weight + backwards_dp_score[node_idx_to_component_idx[edge.to_idx]]; - // is the max score across this edge too low? - if (edge_score < min_score) { - -#ifdef debug_od_clusterer - cerr << "removing edge " << node_idx << "->" << edge.to_idx << " with weight " << edge.weight << " and max score " << edge_score << endl; -#endif - - // remove it's reverse counterpart - ODNode& dest_node = nodes[edge.to_idx]; - for (size_t k = 0; k < dest_node.edges_to.size(); k++) { - if (dest_node.edges_to[k].to_idx == node_idx) { -#ifdef debug_od_clusterer - cerr << "removing bwd edge " << edge.to_idx << "->" << dest_node.edges_to[k].to_idx << " with weight " << dest_node.edges_to[k].weight << " and max score " << edge_score << endl; +#ifdef debug_mem_clusterer + cerr << "estimate distance at " << min_dist << endl; #endif - dest_node.edges_to[k] = dest_node.edges_to.back(); - dest_node.edges_to.pop_back(); - break; - } - } + + // adjust the distance by how far away we are from the ends of the fragment + int64_t left_clip = left_clust_hit.first->begin - alignment_1.sequence().begin(); + int64_t right_clip = alignment_2.sequence().end() - right_clust_hit.first->begin; + int64_t adjusted_dist = min_dist + left_clip + right_clip; + + if (adjusted_dist >= optimal_separation - max_deviation + && adjusted_dist <= optimal_separation + max_deviation) { + // we found a suitable path, add it to the return vector + to_return.emplace_back(make_pair(left_clust_idx, right_clust_idx), adjusted_dist); - // remove the edge - node.edges_from[j] = node.edges_from.back(); - node.edges_from.pop_back(); - } - else { - j++; } } } -#ifdef debug_od_clusterer - cerr << "reidentifying connected components" << endl; -#endif + if (!left_alt_cluster_anchors.empty() || !right_alt_cluster_anchors.empty()) { + // get rid of extra copies of pairs due to alternate anchor positions + deduplicate_cluster_pairs(to_return, optimal_separation); + } - // use DFS to identify the connected components again - vector> new_components; + return to_return; +} + +MEMClusterer::HitGraph MinDistanceClusterer::make_hit_graph(const Alignment& alignment, + const vector& mems, + const GSSWAligner* aligner, + size_t min_mem_length, + const match_fanouts_t* fanouts) { - vector enqueued(component.size(), false); - for (size_t i = 0; i < component.size(); i++) { - if (enqueued[i]) { - continue; + // intialize with nodes + HitGraph hit_graph(mems, alignment, aligner, min_mem_length, false, fanouts); + + // assumes that MEMs are given in lexicographic order by 
read interval + for (size_t i = 0, j_begin = 1; i < hit_graph.nodes.size(); ++i) { + + HitNode& hit_node_1 = hit_graph.nodes[i]; + + // start either at the first un-equal position or at the next position + j_begin = max(j_begin, i + 1); + + // skip measuring to any additional hits of the same read interval + while (j_begin < hit_graph.nodes.size() && + hit_graph.nodes[j_begin].mem->begin == hit_node_1.mem->begin && + hit_graph.nodes[j_begin].mem->end == hit_node_1.mem->end) { + + // this node is at the same place in the read, so they can't be colinear +#ifdef debug_mem_clusterer + cerr << "nodes " << i << " (" << hit_node_1.start_pos << ") and " << j_begin << " (" << hit_graph.nodes[j_begin].start_pos << ") are not read colinear" << endl; +#endif + ++j_begin; } - new_components.emplace_back(); - vector stack(1, component[i]); - enqueued[node_idx_to_component_idx[i]] = true; - while (!stack.empty()) { - size_t node_idx = stack.back(); - stack.pop_back(); + + for (size_t j = j_begin; j < hit_graph.nodes.size(); ++j){ + + HitNode& hit_node_2 = hit_graph.nodes[j]; + + // what is the minimum distance between these hits? + int64_t min_dist = minimum_distance(*distance_index, hit_node_1.start_pos, hit_node_2.start_pos); + if (min_dist == std::numeric_limits::max()) { + // these are not reachable, don't make an edge + continue; + } - new_components.back().push_back(node_idx); + // how far apart do we expect them to be based on the read? + int64_t read_separation = hit_node_2.mem->begin - hit_node_1.mem->begin; - for (ODEdge& edge : nodes[node_idx].edges_from) { - size_t local_idx = node_idx_to_component_idx[edge.to_idx]; - if (!enqueued[local_idx]) { - stack.push_back(edge.to_idx); - enqueued[local_idx] = true; - } + // how long of an insert/deletion could we detect based on the scoring parameters? + size_t longest_gap = min(min(aligner->longest_detectable_gap(alignment, hit_node_1.mem->end), + aligner->longest_detectable_gap(alignment, hit_node_2.mem->begin)), + max_gap); + + // is it possible that an alignment containing both could be detected with local alignment? + if (abs(read_separation - min_dist) > longest_gap) { + continue; } - for (ODEdge& edge : nodes[node_idx].edges_to) { - size_t local_idx = node_idx_to_component_idx[edge.to_idx]; - if (!enqueued[local_idx]) { - stack.push_back(edge.to_idx); - enqueued[local_idx] = true; - } + if (min_dist == read_separation && hit_node_2.mem->begin >= hit_node_1.mem->begin && hit_node_2.mem->end <= hit_node_1.mem->end) { + // this has the appearance of being a redundant hit of a sub-MEM, which we don't want to form + // a separate cluster + + // we add a dummy edge, but only to connect the nodes' components and join the clusters, + // not to actually use in dynamic programming (given arbitrary low weight that should not + // cause overflow) + hit_graph.add_edge(i, j, numeric_limits::lowest() / 2, min_dist); + } + else if (hit_node_2.mem->begin >= hit_node_1.mem->begin + && hit_node_2.mem->end >= hit_node_1.mem->end) { + // there's a path within in the limit, and these hits are read colinear + + // the distance from the end of the first hit to the beginning of the next + int64_t graph_dist = min_dist - (hit_node_1.mem->end - hit_node_1.mem->begin); + + // add the corresponding edge + hit_graph.add_edge(i, j, estimate_edge_score(hit_node_1.mem, hit_node_2.mem, graph_dist, aligner), graph_dist); + } } } - // did we break this connected component into multiple connected components? 
- if (new_components.size() > 1) { -#ifdef debug_od_clusterer - stringstream strm; - strm << "splitting cluster:" << endl; - for (auto& comp : new_components) { - for (size_t i : comp) { - strm << "\t" << i << " " << nodes[i].mem->sequence() << " " << nodes[i].start_pos << endl; - } - strm << endl; - } - cerr << strm.str(); -#endif - // the the original component - components[component_idx] = move(new_components[0]); - // add the remaining to the end - for (size_t i = 1; i < new_components.size(); i++) { - components.emplace_back(move(new_components[i])); - } - } + return hit_graph; } + +GreedyMinDistanceClusterer::GreedyMinDistanceClusterer(SnarlDistanceIndex* distance_index) : MinDistanceClusterer(distance_index) { + // nothing else to do +} + +MEMClusterer::HitGraph GreedyMinDistanceClusterer::make_hit_graph(const Alignment& alignment, const vector& mems, + const GSSWAligner* aligner, size_t min_mem_length, + const match_fanouts_t* fanouts) { -size_t OrientedDistanceClusterer::median_mem_coverage(const vector& component, const Alignment& aln) const { - - // express the MEMs as intervals along the read sequence - vector> mem_intervals; - for (size_t node_idx : component) { - mem_intervals.emplace_back(nodes[node_idx].mem->begin - aln.sequence().begin(), nodes[node_idx].mem->end - aln.sequence().begin()); - } + // init the hit graph's nodes + HitGraph hit_graph(mems, alignment, aligner, min_mem_length, false, fanouts); - // put the intervals in order by starting index and then descending by length - sort(mem_intervals.begin(), mem_intervals.end(), [](const pair& a, const pair& b) { - return a.first < b.first || (a.first == b.first && a.second > b.second); - }); + // we will initialize this with the next backward and forward comparisons for each hit node + vector> next_comparisons; + next_comparisons.reserve(2 * hit_graph.nodes.size()); -#ifdef debug_median_algorithm - cerr << "intervals:" << endl; - for (const auto& interval : mem_intervals) { - cerr << "\t[" << interval.first << ", " << interval.second << ")" << endl; + // assumes that MEMs are given in lexicographic order by read interval + for (size_t i = 0, j = 1; i < hit_graph.nodes.size(); ++i) { + + // where we expect to find the next match on the read + auto target = hit_graph.nodes[i].mem->end + expected_separation; + + // start either at the previous target or one past i + j = max(j, i + 1); + + // move forward until we are past the target + while (j < hit_graph.nodes.size() && hit_graph.nodes[j].mem->begin < target) { + ++j; + } + + // move backward until the next index is past the target + while (j > i + 1 && hit_graph.nodes[j - 1].mem->begin >= target) { + --j; + } + + // the next backwards comparison + if (j > i + 1 && target - hit_graph.nodes[j - 1].mem->begin >= min_separation) { + next_comparisons.emplace_back(i, j - 1); + } + // the backwards comparison + if (j < hit_graph.nodes.size() && target - hit_graph.nodes[j].mem->begin <= max_separation) { + next_comparisons.emplace_back(i, j); + } } -#endif - - unordered_map coverage_count; - // a pointer to the read index we're currently at - int64_t at = 0; - // to keep track of how many intervals cover the current segment - int64_t depth = 0; + // the iteration order is starting from distances near the expected distance first, but also + // favoring forward distances by some pre-determined multiplier + auto priority_cmp = [&](const pair& a, const pair& b) { + int64_t a_dist = (hit_graph.nodes[a.second].mem->begin - hit_graph.nodes[a.first].mem->end) - expected_separation; + 
int64_t b_dist = (hit_graph.nodes[b.second].mem->begin - hit_graph.nodes[b.first].mem->end) - expected_separation; + return (a_dist < 0 ? -a_dist * forward_multiplier : a_dist) > (b_dist < 0 ? -b_dist * forward_multiplier : b_dist); + }; - // we can keep track of the SMEM we're in by checking whether we've passed its final index - pair curr_smem(0, 0); - // and the number of hits of this SMEM we've seen - int64_t curr_smem_hit_count = 0; - // we will skip one copy of each sub-MEM (heurstically assuming it's redundant with the parent) - // per copy of the SMEM - unordered_map, int64_t> skipped_sub_mems; + // establish the initial heap ordering + make_heap(next_comparisons.begin(), next_comparisons.end(), priority_cmp); - // the sort order ensures we will encounter the interval starts in order, we use a priority queue - // to also ensure that we will encounter their ends in order - priority_queue, greater> ends; + // we will block off seeds as they become incorporated into clusters + // pairs indicate whether a node is blocked for edges (into, out of) it + vector> blocked(hit_graph.nodes.size(), pair(false, false)); - for (size_t i = 0; i < mem_intervals.size(); i++) { - pair& interval = mem_intervals[i]; + // iterate through the comparisons + while (!next_comparisons.empty()) { -#ifdef debug_median_algorithm - cerr << "iter for interval [" << interval.first << ", " << interval.second << "), starting at " << at << endl; -#endif + pop_heap(next_comparisons.begin(), next_comparisons.end(), priority_cmp); + auto comparison = next_comparisons.back(); + next_comparisons.pop_back(); - if (interval.second > curr_smem.second) { - // we're in a MEM that covers distinct sequence from the current SMEM, so this is - // a new SMEM (because of sort order) - curr_smem = interval; - curr_smem_hit_count = 1; -#ifdef debug_median_algorithm - cerr << "\tthis is a new SMEM" << endl; -#endif - } - else if (interval == curr_smem) { - // this is another hit of the same SMEM, increase the count - curr_smem_hit_count++; -#ifdef debug_median_algorithm - cerr << "\tthis is a repeat of the current SMEM" << endl; -#endif - } - else if (skipped_sub_mems[interval] < curr_smem_hit_count) { - // we're in a MEM that covers a strict subinterval of the current SMEM, so skip - // one sub-MEM per hit of the SMEM on the assumption that it's redundant - skipped_sub_mems[interval]++; -#ifdef debug_median_algorithm - cerr << "\tthis is a sub-MEM we must skip" << endl; +#ifdef debug_mem_clusterer + cerr << "greedy cluster comparing:" << endl; + cerr << "\t" << comparison.first << ": " << hit_graph.nodes[comparison.first].start_pos << " " << hit_graph.nodes[comparison.first].mem->sequence() << endl; + cerr << "\t" << comparison.second << ": " << hit_graph.nodes[comparison.second].start_pos << " " << hit_graph.nodes[comparison.second].mem->sequence() << endl; #endif + + if (blocked[comparison.first].second) { + // we've already greedily accumulated out of this match, so we don't + // need to look for more connections from it + + // TODO: this is actually not correct, we should try to add the next one here... continue; } - // add the coverage of any segments that come before the start of this interval - while (ends.empty() ? 
false : ends.top() <= interval.first) { -#ifdef debug_median_algorithm - cerr << "\ttraversing interval end at " << ends.top() << " adding " << ends.top() - at << " to depth " << depth << endl; + auto& hit_node_1 = hit_graph.nodes[comparison.first]; + auto& hit_node_2 = hit_graph.nodes[comparison.second]; + + int64_t read_dist = hit_node_2.mem->begin - hit_node_1.mem->end; + + if (!blocked[comparison.second].first) { + + // what is the minimum distance between these hits? + size_t min_dist = minimum_distance(*distance_index, hit_node_1.start_pos, hit_node_2.start_pos); + +#ifdef debug_mem_clusterer + cerr << "read dist: " << read_dist << ", min dist: " << min_dist << ", graph dist: " << min_dist - (hit_node_1.mem->end - hit_node_1.mem->begin) << endl; #endif - coverage_count[depth] += ends.top() - at; - at = ends.top(); - ends.pop(); - // an interval is leaving scope, decrement the depth - depth--; - } - - // if there's an initial interval of 0 depth, we ignore it (helps with read-end effects from sequencers) - if (at > 0 || depth > 0) { -#ifdef debug_median_algorithm - cerr << "\ttraversing pre-interval segment staring from " << at << " adding " << interval.first - at << " to depth " << depth << endl; + // TODO: i'm ignoring sub-matches here because it's intended to be used with the stripped + // algorithm. that might come back to haunt me later + + if (min_dist != std::numeric_limits::max()) { + // we were able to measure a distance + + // how long of an insert/deletion could we detect based on the scoring parameters? + int64_t longest_gap = min(min(aligner->longest_detectable_gap(alignment, hit_node_1.mem->end), + aligner->longest_detectable_gap(alignment, hit_node_2.mem->begin)), + max_gap); + + // the distance from the end of the first hit to the beginning of the next + int64_t graph_dist = (int64_t)min_dist - (hit_node_1.mem->end - hit_node_1.mem->begin); + + // is it possible that an alignment containing both could be detected with local alignment? 
+ if (abs(read_dist - graph_dist) < longest_gap) { + // there's a path within in the limit + +#ifdef debug_mem_clusterer + cerr << "found hit edge" << endl; #endif - coverage_count[depth] += interval.first - at; - } -#ifdef debug_median_algorithm - else { - cerr << "\tskipping an initial segment from " << at << " to " << interval.first << " with depth " << depth << endl; + + // add the corresponding edge + hit_graph.add_edge(comparison.first, comparison.second, + estimate_edge_score(hit_node_1.mem, hit_node_2.mem, graph_dist, aligner), + graph_dist); + + // we won't look for any more connections involving this end of these two + blocked[comparison.first].second = true; + blocked[comparison.second].first = true; + } + } } -#endif - - - at = interval.first; - // an interval is entering scope, increment the depth - depth++; - ends.push(interval.second); - } - - // run through the rest of the ends - while (!ends.empty()) { -#ifdef debug_median_algorithm - cerr << "\ttraversing interval end at " << ends.top() << " adding " << ends.top() - at << " to depth " << depth << endl; -#endif - coverage_count[depth] += ends.top() - at; - at = ends.top(); - ends.pop(); - // an interval is leaving scope, decrement the depth - depth--; + if (!blocked[comparison.first].second) { + + // TODO: this is actually not correct, we should try to continue until finding an unblocked + // node + + // we didn't just block off connections out of this match, + // so we can queue up the next one + if (read_dist >= 0 && comparison.second + 1 < hit_graph.nodes.size() && + hit_graph.nodes[comparison.second + 1].mem->begin - hit_node_1.mem->end <= max_separation) { + // the next in the forward direction + next_comparisons.emplace_back(comparison.first, comparison.second + 1); + push_heap(next_comparisons.begin(), next_comparisons.end(), priority_cmp); + } + else if (read_dist < 0 && hit_graph.nodes[comparison.second - 1].mem->begin > hit_node_1.mem->begin && + hit_graph.nodes[comparison.second - 1].mem->begin - hit_node_1.mem->end >= min_separation) { + // the next in the backward direction, requiring read colinearity + next_comparisons.emplace_back(comparison.first, comparison.second - 1); + push_heap(next_comparisons.begin(), next_comparisons.end(), priority_cmp); + } + } } - // NOTE: we used to count the final interval of depth 0 here, but now we ignore 0-depth terminal intervals - // because it seems to help with the read-end effects of sequencers (which can lead to match dropout) - //coverage_count[0] += aln.sequence().size() - at; + return hit_graph; +} + +ComponentMinDistanceClusterer::ComponentMinDistanceClusterer(SnarlDistanceIndex* distance_index) : MinDistanceClusterer(distance_index) { + // nothing else to do +} + +MEMClusterer::HitGraph ComponentMinDistanceClusterer::make_hit_graph(const Alignment& alignment, const vector& mems, + const GSSWAligner* aligner, size_t min_mem_length, + const match_fanouts_t* fanouts) { - // convert it into a CDF over read coverage - vector> cumul_coverage_count(coverage_count.begin(), coverage_count.end()); - sort(cumul_coverage_count.begin(), cumul_coverage_count.end()); + // init the hit graph's nodes + HitGraph hit_graph(mems, alignment, aligner, min_mem_length, false, fanouts); -#ifdef debug_median_algorithm - cerr << "\tcoverage distr is: " ; - for (const auto& record : cumul_coverage_count) { - cerr << record.first << ":" << record.second << " "; + // shim the hit graph nodes into the seed clusterer algorithm interface + vector positions(hit_graph.nodes.size()); + for (size_t i 
= 0; i < hit_graph.nodes.size(); ++i) { + positions[i].pos = hit_graph.nodes[i].start_pos; } - cerr << endl; + + typedef SnarlDistanceIndexClusterer::Cluster Cluster; + SnarlDistanceIndexClusterer seed_clusterer(*distance_index); + // TODO: magic number, want enough space for the max gap and the inter-seed distance but how to do this in + // a principled way? + std::vector distance_components = seed_clusterer.cluster_seeds(positions, 2 * max_gap); + + // these components are returned by the structures::UnionFind::all_groups() method, which + // always returns them in sorted order, so we can assume that they are still lexicographically + // ordered internally + for (Cluster& cluster : distance_components) { + std::vector& component = cluster.seeds; +#ifdef debug_mem_clusterer + cerr << "looking edges in distance component containing:" << endl; + for (size_t i : component) { + cerr << "\t" << i << " " << hit_graph.nodes[i].start_pos << " " << hit_graph.nodes[i].mem->sequence() << endl; + } #endif - - int64_t cumul = 0; - for (pair& coverage_record : cumul_coverage_count) { - coverage_record.second += cumul; - cumul = coverage_record.second; - } - - // bisect to find the median - int64_t target = aln.sequence().size() / 2; - if (target <= cumul_coverage_count[0].second) { - return cumul_coverage_count[0].first; - } - int64_t low = 0; - int64_t hi = cumul_coverage_count.size() - 1; - int64_t mid; - while (hi > low + 1) { - mid = (hi + low) / 2; - if (target <= cumul_coverage_count[mid].second) { - hi = mid; - } - else { - low = mid; + for (size_t i = 0, j_begin = 1; i < component.size(); ++i) { + + HitNode& hit_node_1 = hit_graph.nodes[component[i]]; + auto from = hit_node_1.mem->begin; + + // start either at the previous target or one past i + j_begin = max(j_begin, i + 1); + + // move forward until we are within the window + while (j_begin < component.size() && + hit_graph.nodes[component[j_begin]].mem->begin - from < min_read_separation) { + ++j_begin; + } + + // move backward until we are just within the window + while (j_begin > i + 1 && + hit_graph.nodes[component[j_begin - 1]].mem->begin - from >= min_read_separation) { + --j_begin; + } + + int64_t connections_made = 0; + + for (size_t j = j_begin; + j < component.size() && hit_graph.nodes[component[j]].mem->begin - from <= max_gap; ++j) { + + HitNode& hit_node_2 = hit_graph.nodes[component[j]]; + + // TODO: this code is getting repetitive, i should probably factor it into a MinDistanceClusterer method + + int64_t min_dist = minimum_distance(*distance_index, hit_node_1.start_pos, hit_node_2.start_pos); + if (min_dist != std::numeric_limits::max()) { + // how long of an insert/deletion could we detect based on the scoring parameters? 
+ int64_t longest_gap = min(min(aligner->longest_detectable_gap(alignment, hit_node_1.mem->end), + aligner->longest_detectable_gap(alignment, hit_node_2.mem->begin)), + max_gap); + + // the distance from the end of the first hit to the beginning of the next + int64_t graph_dist = (int64_t)min_dist - (hit_node_1.mem->end - hit_node_1.mem->begin); + + // the distance between the seeds on the read + int64_t read_dist = hit_node_2.mem->begin - hit_node_1.mem->end; + + if (min_dist == hit_node_2.mem->begin - hit_node_1.mem->begin && + ((hit_node_2.mem->begin >= hit_node_1.mem->begin && hit_node_2.mem->end <= hit_node_1.mem->end) + || (hit_node_1.mem->begin >= hit_node_2.mem->begin && hit_node_1.mem->end <= hit_node_2.mem->end))) { + // this has the appearance of being a redundant hit of a sub-MEM, which we don't want to form + // a separate cluster + +#ifdef debug_mem_clusterer + cerr << "adding dummy hit edge " << component[i] << " -> " << component[j] << " to join components" << endl; +#endif + + // we add a dummy edge, but only to connect the nodes' components and join the clusters, + // not to actually use in dynamic programming (given arbitrary low weight that should not + // cause overflow) + hit_graph.add_edge(component[i], component[j], numeric_limits::lowest() / 2, graph_dist); + } + else if (abs(read_dist - graph_dist) < longest_gap) { + // there's a path within the limit + +#ifdef debug_mem_clusterer + cerr << "adding hit edge " << component[i] << " -> " << component[j] << ", read dist " << read_dist << ", graph dist " << graph_dist << endl; +#endif + + // add the corresponding edge + hit_graph.add_edge(component[i], component[j], + estimate_edge_score(hit_node_1.mem, hit_node_2.mem, graph_dist, aligner), + graph_dist); + + // check if we've made enough connections to stop early (usually the very next + // match along the read is the true "next" match in the cluster) + ++connections_made; + if (early_stop_number && connections_made >= early_stop_number) { + break; + } + } + } + } } } -#ifdef debug_median_algorithm - cerr << "\tmedian is " << cumul_coverage_count[hi].first << endl; -#endif - return cumul_coverage_count[hi].first; -} + return hit_graph; +} // collect node starts to build out graph -vector > mem_node_start_positions(const xg::XG& xg, const vg::MaximalExactMatch& mem) { +vector > mem_node_start_positions(const HandleGraph& graph, const vg::MaximalExactMatch& mem) { // walk the match, getting all the nodes that it touches string mem_seq = mem.sequence(); vector > positions; @@ -2553,8 +3782,8 @@ vector > mem_node_start_positions(const xg::XG& xg auto& pos = h.first; size_t query_offset = h.second; // check if we match each node in next - auto handle = xg.get_handle(gcsa::Node::id(pos), gcsa::Node::rc(pos)); - string h_seq = xg.get_sequence(handle); + auto handle = graph.get_handle(gcsa::Node::id(pos), gcsa::Node::rc(pos)); + string h_seq = graph.get_sequence(handle); size_t mem_todo = mem_seq.size() - query_offset; size_t overlap = min((size_t)mem_todo, (size_t)(h_seq.size()-gcsa::Node::offset(pos))); /* @@ -2573,8 +3802,8 @@ vector > mem_node_start_positions(const xg::XG& xg // if we continue past this node, insert our next nodes into nexts if (mem_todo - overlap > 0) { size_t new_off = query_offset + overlap; - xg.follow_edges(handle, false, [&](const handle_t& next) { - todo.insert(make_pair(gcsa::Node::encode(xg.get_id(next), 0, xg.get_is_reverse(next)), new_off)); + graph.follow_edges(handle, false, [&](const handle_t& next) { +
todo.insert(make_pair(gcsa::Node::encode(graph.get_id(next), 0, graph.get_is_reverse(next)), new_off)); return true; }); } @@ -2599,75 +3828,74 @@ vector > mem_node_start_positions(const xg::XG& xg return positions; } -Graph cluster_subgraph_walk(const xg::XG& xg, const Alignment& aln, const vector& mems, double expansion) { +bdsg::HashGraph cluster_subgraph_containing(const HandleGraph& base, const Alignment& aln, const vector& cluster, const GSSWAligner* aligner) { + vector positions; + vector forward_max_dist; + vector backward_max_dist; + positions.reserve(cluster.size()); + forward_max_dist.reserve(cluster.size()); + backward_max_dist.reserve(cluster.size()); + for (auto& mem : cluster) { + // get the start position of the MEM + positions.push_back(make_pos_t(mem.nodes.front())); + // search far enough away to get any hit detectable without soft clipping + forward_max_dist.push_back(aligner->longest_detectable_gap(aln, mem.end) + + (aln.sequence().end() - mem.begin)); + backward_max_dist.push_back(aligner->longest_detectable_gap(aln, mem.begin) + + (mem.begin - aln.sequence().begin())); + } + auto cluster_graph = bdsg::HashGraph(); + algorithms::extract_containing_graph(&base, &cluster_graph, positions, forward_max_dist, backward_max_dist); + return cluster_graph; +} + +bdsg::HashGraph cluster_subgraph_walk(const HandleGraph& base, const Alignment& aln, const vector& mems, double expansion) { assert(mems.size()); auto& start_mem = mems.front(); auto start_pos = make_pos_t(start_mem.nodes.front()); - auto rev_start_pos = reverse(start_pos, xg.node_length(id(start_pos))); + auto rev_start_pos = reverse(start_pos, base.get_length(base.get_handle(id(start_pos)))); // Even if the MEM is right up against the start of the read, it may not be // part of the best alignment. Make sure to have some padding. // TODO: how much padding? - Graph graph; + bdsg::HashGraph graph; int inside_padding = max(1, (int)aln.sequence().size()/16); int end_padding = max(8, (int)aln.sequence().size()/8); int get_before = end_padding + (int)(expansion * (int)(start_mem.begin - aln.sequence().begin())); if (get_before) { - graph.MergeFrom(xg.graph_context_id(rev_start_pos, get_before)); + //algorithms::extract_context(base, graph, base.get_handle(id(rev_start_pos), is_rev(rev_start_pos)), offset(rev_start_pos), get_before, false, true); } //cerr << "======================================================" << endl; for (int i = 0; i < mems.size(); ++i) { auto& mem = mems[i]; //cerr << mem << endl; - vector > match_positions = mem_node_start_positions(xg, mem); + vector > match_positions = mem_node_start_positions(base, mem); if (!match_positions.size()) { // TODO XXX is MEM merging causing this to occur? match_positions.push_back(make_pair(mem.nodes.front(), mem.length())); } for (auto& p : match_positions) { - graph.MergeFrom(xg.node_subgraph_id(gcsa::Node::id(p.first))); + handle_t h = base.get_handle(gcsa::Node::id(p.first), gcsa::Node::rc(p.first)); + algorithms::extract_context(base, graph, h, gcsa::Node::offset(p.first), 0); } // extend after the last match node with the expansion auto& p = match_positions.back(); auto& pos = p.first; int mem_remainder = p.second; //cerr << p.first << " " << p.second << endl; - int get_after = xg.node_length(gcsa::Node::id(pos)) - + (i+1 == mems.size() ? + handle_t h = base.get_handle(gcsa::Node::id(pos)); + int get_after = //base.get_length(h); + (i+1 == mems.size() ? 
end_padding + expansion * ((int)(aln.sequence().end() - mem.end) + mem_remainder) : inside_padding + expansion * ((int)(mems[i+1].begin - mem.end) + mem_remainder)); - if (get_after > 0) graph.MergeFrom(xg.graph_context_id(make_pos_t(pos), get_after)); - } - xg.expand_context(graph, 1, false); - sort_by_id_dedup_and_clean(graph); - return graph; -} - -Graph cluster_subgraph(const xg::XG& xg, const Alignment& aln, const vector& mems, double expansion) { - assert(mems.size()); - auto& start_mem = mems.front(); - auto start_pos = make_pos_t(start_mem.nodes.front()); - auto rev_start_pos = reverse(start_pos, xg.node_length(id(start_pos))); - // Even if the MEM is right up against the start of the read, it may not be - // part of the best alignment. Make sure to have some padding. - // TODO: how much padding? - Graph graph; - int padding = 1; - int get_before = padding + (int)(expansion * (int)(start_mem.begin - aln.sequence().begin())); - if (get_before) { - graph.MergeFrom(xg.graph_context_id(rev_start_pos, get_before)); - } - for (int i = 0; i < mems.size(); ++i) { - auto& mem = mems[i]; - auto pos = make_pos_t(mem.nodes.front()); - int get_after = padding + (i+1 == mems.size() ? - expansion * (int)(aln.sequence().end() - mem.begin) - : expansion * max(mem.length(), (int)(mems[i+1].end - mem.begin))); - graph.MergeFrom(xg.graph_context_id(pos, get_after)); + if (get_after > 0) { + algorithms::extract_context(base, graph, h, gcsa::Node::offset(pos), get_after, true, false); + } } - sort_by_id_dedup_and_clean(graph); + //algorithms::expand_subgraph_by_steps(base, graph, 0); + algorithms::expand_subgraph_to_length(base, graph, aln.sequence().size() * expansion, false); return graph; } diff --git a/src/cluster.hpp b/src/cluster.hpp index 89b8a4eb1be..df997cc51c6 100644 --- a/src/cluster.hpp +++ b/src/cluster.hpp @@ -5,11 +5,13 @@ #include #include #include "position.hpp" -#include "gssw_aligner.hpp" -#include "utility.hpp" +#include "aligner.hpp" #include "mem.hpp" -#include "xg.hpp" #include "handle.hpp" +#include "snarl_distance_index.hpp" +#include "snarl_seed_clusterer.hpp" +#include "path_component_index.hpp" +#include "bdsg/hash_graph.hpp" #include #include @@ -75,7 +77,10 @@ class ShuffledPairs { // Default copy constructor and assignment operator. iterator(const iterator& other) = default; - iterator& operator=(const iterator& other) = default; + + // TODO: This gets implicitly deleted and generates warning because of the const reference + // member variable + //iterator& operator=(const iterator& other) = default; private: // What is the ordinal value of this element in the permutation? @@ -131,13 +136,13 @@ class MEMChainModelVertex { class MEMChainModel { public: vector model; - map::iterator> > > positions; + unordered_map::iterator> > > positions; set::iterator> redundant_vertexes; MEMChainModel( const vector& aln_lengths, const vector >& matches, const function& approx_position, - const function > >(pos_t)>& path_position, + const function > >(pos_t)>& path_position, const function& transition_weight, int band_width = 10, int position_depth = 1, @@ -146,78 +151,330 @@ class MEMChainModel { MEMChainModelVertex* max_vertex(void); vector > traceback(int alt_alns, bool paired, bool debug); void display(ostream& out); + void display_dot(ostream& out, vector vertex_trace); void clear_scores(void); }; - -class OrientedDistanceClusterer { + +/* + * A base class to hold some shared methods and data types between the TVS, + * oriented distance, and minimum distance clusterers. 
+ */ +class MEMClusterer { public: + MEMClusterer() = default; + virtual ~MEMClusterer() = default; /// Each hit contains a pointer to the original MEM and the position of that /// particular hit in the graph. using hit_t = pair; - /// Each cluster is a vector of hits. - using cluster_t = vector; - - /// A memo for the results of XG::paths_of_node - using paths_of_node_memo_t = unordered_map>; - - /// A memo for the results of XG::oriented_occurrences_on_path - using oriented_occurences_memo_t = unordered_map, vector>>; - - /// A memo for the results of XG::get_handle - using handle_memo_t = unordered_map, handle_t>; - - /// Constructor using QualAdjAligner, optionally memoizing succinct data structure operations - OrientedDistanceClusterer(const Alignment& alignment, - const vector& mems, - const QualAdjAligner& aligner, - xg::XG* xgindex, - size_t max_expected_dist_approx_error = 8, - size_t min_mem_length = 1, - bool unstranded = false, - paths_of_node_memo_t* paths_of_node_memo = nullptr, - oriented_occurences_memo_t* oriented_occurences_memo = nullptr, - handle_memo_t* handle_memo = nullptr); - - /// Constructor using Aligner, optionally memoizing succinct data structure operations - OrientedDistanceClusterer(const Alignment& alignment, - const vector& mems, - const Aligner& aligner, - xg::XG* xgindex, - size_t max_expected_dist_approx_error = 8, - size_t min_mem_length = 1, - bool unstranded = false, - paths_of_node_memo_t* paths_of_node_memo = nullptr, - oriented_occurences_memo_t* oriented_occurences_memo = nullptr, - handle_memo_t* handle_memo = nullptr); + /// Each cluster is a vector of hits and a paired multiplicity + using cluster_t = pair, double>; + + /// Represents the mismatches that were allowed in "MEMs" from the fanout + /// match algorithm + using match_fanouts_t = unordered_map>>; /// Returns a vector of clusters. Each cluster is represented a vector of MEM hits. Each hit /// contains a pointer to the original MEM and the position of that particular hit in the graph. vector clusters(const Alignment& alignment, + const vector& mems, + const GSSWAligner* Aligner, + size_t min_mem_length = 1, int32_t max_qual_score = 60, int32_t log_likelihood_approx_factor = 0, size_t min_median_mem_coverage_for_split = 0, - double suboptimal_edge_pruning_factor = .75); + double suboptimal_edge_pruning_factor = .75, + double cluster_multiplicity_diff = 10.0, + const match_fanouts_t* fanouts = nullptr); /** - * Given two vectors of clusters, an xg index, an bounds on the distance between clusters, + * Given two vectors of clusters and bounds on the distance between clusters, * returns a vector of pairs of cluster numbers (one in each vector) matched with the estimated - * distance + * distance. + * + * Clusters are assumed to be located at the position of the first MEM hit they contain. Optionally, + * additional MEMs may be identied as possible anchors for the cluster. Additional anchors are + * provided as pairs of (cluster index, MEM index within cluster). Only one result will be returned + * per pair of clusters regardless of how many alternate anchors are given. 
*/ - static vector, int64_t>> pair_clusters(const Alignment& alignment_1, - const Alignment& alignment_2, - const vector& left_clusters, - const vector& right_clusters, - const vector>& left_alt_cluster_anchors, - const vector>& right_alt_cluster_anchors, - xg::XG* xgindex, - int64_t min_inter_cluster_distance, - int64_t max_inter_cluster_distance, - bool unstranded, - paths_of_node_memo_t* paths_of_node_memo = nullptr, - oriented_occurences_memo_t* oriented_occurences_memo = nullptr, - handle_memo_t* handle_memo = nullptr); + virtual vector, int64_t>> pair_clusters(const Alignment& alignment_1, + const Alignment& alignment_2, + const vector& left_clusters, + const vector& right_clusters, + const vector>& left_alt_cluster_anchors, + const vector>& right_alt_cluster_anchors, + int64_t optimal_separation, + int64_t max_deviation) = 0; + + /// The largest discrepancy we will allow between the read-implied distances and the estimated gap distance + int64_t max_gap = numeric_limits::max(); + +protected: + + class HitNode; + class HitEdge; + class HitGraph; + class DPScoreComparator; + + /// Initializes a hit graph and adds edges to it; this must be implemented by any inheriting + /// class + virtual HitGraph make_hit_graph(const Alignment& alignment, const vector& mems, + const GSSWAligner* aligner, size_t min_mem_length, + const match_fanouts_t* fanouts) = 0; + + /// Once the distance between two hits has been estimated, estimate the score of the hit graph edge + /// connecting them + int32_t estimate_edge_score(const MaximalExactMatch* mem_1, const MaximalExactMatch* mem_2, int64_t graph_dist, + const GSSWAligner* aligner) const; + + /// Sorts cluster pairs and removes copies of the same cluster pair, choosing only the one whose distance + /// is closest to the optimal separation + void deduplicate_cluster_pairs(vector, int64_t>>& cluster_pairs, int64_t optimal_separation); +}; + +class MEMClusterer::HitGraph { +public: + + /// Initializes nodes in the hit graph, but does not add edges + HitGraph(const vector& mems, const Alignment& alignment, const GSSWAligner* aligner, + size_t min_mem_length = 1, bool track_components = false, const match_fanouts_t* fanouts = nullptr); + + /// Add an edge + void add_edge(size_t from, size_t to, int32_t weight, int64_t distance); + + /// Returns the top scoring connected components + vector clusters(const Alignment& alignment, + const GSSWAligner* aligner, + int32_t max_qual_score, + int32_t log_likelihood_approx_factor, + size_t min_median_mem_coverage_for_split, + double suboptimal_edge_pruning_factor, + double cluster_multiplicity_diff); + + vector nodes; + +private: + + /// Identify weakly connected components in the graph + void connected_components(vector>& components_out) const; + + /// Prune edges that are not on any traceback that scores highly compared to the best score in the component, + /// splits up the components (adding some to the end of the vector) if doing so splits a component + void prune_low_scoring_edges(vector>& components, size_t component_idx, double score_factor); + + /// Perform dynamic programming and store scores in nodes + void perform_dp(); + + /// Fills input vectors with indices of source and sink nodes + void identify_sources_and_sinks(vector& sources_out, vector& sinks_out) const; + + /// Fills the input vector with the indices of a topological sort + void topological_order(vector& order_out) const; + + /// Computes the topological order of the nodes within a single connected component + void component_topological_order(const vector& component, vector& order_out)
const; + + /// Returns the median coverage of bases in the reads by bases in the cluster, attempts to remove apparent + /// redundant sub-MEMs + size_t median_mem_coverage(const vector& component, const Alignment& aln) const; + + /// Should we actively keep track of connected components? + bool track_components; + + /// Keeps track of the connected components + UnionFind components; +}; + +class MEMClusterer::HitNode { +public: + HitNode(const MaximalExactMatch& mem, pos_t start_pos, int32_t score) : mem(&mem), start_pos(start_pos), score(score) { } + HitNode() = default; + ~HitNode() = default; + + const MaximalExactMatch* mem; + + /// Position of GCSA hit in the graph + pos_t start_pos; + + /// Score of the exact match this node represents + int32_t score; + + /// Score used in dynamic programming + int32_t dp_score; + + /// Edges from this node that are colinear with the read + vector edges_from; + + /// Edges to this node that are colinear with the read + vector edges_to; +}; + +class MEMClusterer::HitEdge { +public: + HitEdge(size_t to_idx, int32_t weight, int64_t distance) : to_idx(to_idx), weight(weight), distance(distance) {} + HitEdge() = default; + ~HitEdge() = default; + + /// Index of the node that the edge points to + size_t to_idx; + + /// Weight for dynamic programming + int32_t weight; + + /// Estimated distance + int64_t distance; +}; + +struct MEMClusterer::DPScoreComparator { +private: + const vector& nodes; +public: + DPScoreComparator() = delete; + DPScoreComparator(const vector& nodes) : nodes(nodes) {} + ~DPScoreComparator() {} + inline bool operator()(const size_t i, const size_t j) { + return nodes[i].dp_score < nodes[j].dp_score; + } +}; + +/* + * A clustering implementation that actually doesn't do any clustering + */ +class NullClusterer : public MEMClusterer { +public: + NullClusterer() = default; + virtual ~NullClusterer() = default; + + /// Concrete implementation of virtual method from MEMClusterer + vector, int64_t>> pair_clusters(const Alignment& alignment_1, + const Alignment& alignment_2, + const vector& left_clusters, + const vector& right_clusters, + const vector>& left_alt_cluster_anchors, + const vector>& right_alt_cluster_anchors, + int64_t optimal_separation, + int64_t max_deviation); + +protected: + + /// Concrete implementation of virtual method from MEMClusterer + /// Note: ignores the min_mem_length parameter + HitGraph make_hit_graph(const Alignment& alignment, const vector& mems, const GSSWAligner* aligner, + size_t min_mem_length, const match_fanouts_t* fanouts); +}; + + +/* + * An abstract class that provides distances to the oriented distance clusterer + */ +class OrientedDistanceMeasurer { +public: + virtual ~OrientedDistanceMeasurer() = default; + + /// Returns a signed distance, where positive indicates that pos_2 is to the right + /// of pos_1, and negative indicates to the left. If the distance is infinite or + /// can't be determined, returns numeric_limits::max(). + virtual int64_t oriented_distance(const pos_t& pos_1, const pos_t& pos_2) = 0; + + /// Return a vector of groups that we believe will have finite distances under this metric, + /// can be empty. + virtual vector> get_buckets(const function& get_position, size_t num_items) = 0; + + /// Return a vector of pairs of groups (referred to by indexes in the current_groups vector) + /// that cannot have finite distances between them (typically because they are on separate components). 
+ virtual vector> exclude_merges(vector>& current_groups, + const function& get_position) = 0; +}; + +/* + * A distance function that uses a graph's embedded paths to measure distances, either in a stranded + * or unstranded manner. + */ +class PathOrientedDistanceMeasurer : public OrientedDistanceMeasurer { + +public: + + /// Construct a distance service to measure distance along paths in this graph. Optionally + /// measures all distances on the forward strand of the paths. + PathOrientedDistanceMeasurer(const PathPositionHandleGraph* graph, + const PathComponentIndex* path_component_index = nullptr); + + /// Default destructor + ~PathOrientedDistanceMeasurer() = default; + + /// Returns a signed distance, where positive indicates that pos_2 is to the right + /// of pos_1, and negative indicates to the left. If the distance is infinite or + /// can't be determined, returns numeric_limits::max(). + int64_t oriented_distance(const pos_t& pos_1, const pos_t& pos_2); + + /// Return a vector of groups that we believe will have finite distances under this metric, + /// can be empty. + vector> get_buckets(const function& get_position, size_t num_items); + + /// Return a vector of pairs of groups (referred to by indexes in the current_groups vector) + /// that cannot have finite distances between them (typically because they are on separate components). + vector> exclude_merges(vector>& current_groups, + const function& get_position); + + /// The maximum distance we will walk trying to find a shared path + size_t max_walk = 50; + +private: + + const PathPositionHandleGraph* graph = nullptr; + const PathComponentIndex* path_component_index = nullptr; + +}; + +/* + * A distance function that uses the minimum distance function provided by the Snarl-based + * distance index + */ +class SnarlOrientedDistanceMeasurer : public OrientedDistanceMeasurer { + +public: + // Construct a distance service to measure distance as the minimum distance in the graph + SnarlOrientedDistanceMeasurer(SnarlDistanceIndex* distance_index); + + /// Default destructor + ~SnarlOrientedDistanceMeasurer() = default; + + /// Returns a signed distance, where positive indicates that pos_2 is to the right + /// of pos_1, and negative indicates to the left. If the distance is infinite or + /// can't be determined, returns numeric_limits::max(). + int64_t oriented_distance(const pos_t& pos_1, const pos_t& pos_2); + + /// Return a vector of groups that we believe will have finite distances under this metric, + /// can be empty. + vector> get_buckets(const function& get_position, size_t num_items); + + /// Return a vector of pairs of groups (referred to by indexes in the current_groups vector) + /// that cannot have finite distances between them (typically because they are on separate components).
+ vector> exclude_merges(vector>& current_groups, + const function& get_position); + +private: + + SnarlDistanceIndex* distance_index = nullptr; +}; + +class OrientedDistanceClusterer : public MEMClusterer { +public: + + /// Constructor + OrientedDistanceClusterer(OrientedDistanceMeasurer& distance_measurer, + size_t max_expected_dist_approx_error = 8); + + /// Concrete implementation of virtual method from MEMClusterer + vector, int64_t>> pair_clusters(const Alignment& alignment_1, + const Alignment& alignment_2, + const vector& left_clusters, + const vector& right_clusters, + const vector>& left_alt_cluster_anchors, + const vector>& right_alt_cluster_anchors, + int64_t optimal_separation, + int64_t max_deviation); //static size_t PRUNE_COUNTER; //static size_t CLUSTER_TOTAL; @@ -228,24 +485,8 @@ class OrientedDistanceClusterer { //static size_t SUCCESSFUL_SPLIT_ATTEMPT_COUNTER; //static size_t POST_SPLIT_CLUSTER_COUNTER; -private: - class ODNode; - class ODEdge; - struct DPScoreComparator; - - /// Internal constructor that public constructors filter into - OrientedDistanceClusterer(const Alignment& alignment, - const vector& mems, - const Aligner* aligner, - const QualAdjAligner* qual_adj_aligner, - xg::XG* xgindex, - size_t max_expected_dist_approx_error, - size_t min_mem_length, - bool unstranded, - paths_of_node_memo_t* paths_of_node_memo, - oriented_occurences_memo_t* oriented_occurences_memo, - handle_memo_t* handle_memo); - +protected: + /** * Given a certain number of items, and a callback to get each item's * position, and a callback to a fixed offset from that position @@ -253,88 +494,51 @@ class OrientedDistanceClusterer { * verify are on the same strand of the same molecule. * * We use the distance approximation to cluster the MEM hits according to - * the strand they fall on using the oriented distance estimation function - * in xg. + * the strand they fall on using the oriented distance estimation function. * * Returns a map from item pair (lower number first) to distance (which may * be negative) from the first to the second along the items' forward * strand. 
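The map described in the comment above (item pair, lower index first, to signed distance along the forward strand) is just a sparse distance tree; turning it into per-item coordinates, as flatten_distance_tree does further down, amounts to walking the implied graph and accumulating offsets. A minimal sketch of that flattening step, assuming the distances are transitive as the comment notes; the item indices and toy input are hypothetical.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <map>
#include <queue>
#include <utility>
#include <vector>

// Flatten pairwise signed distances (keyed by item pair, lower index first)
// into groups of (item, relative offset), one group per connected component.
std::vector<std::vector<std::pair<size_t, int64_t>>>
flatten_toy_distance_tree(size_t num_items,
                          const std::map<std::pair<size_t, size_t>, int64_t>& dists) {
    // Build a signed adjacency list: going i -> j adds +d, j -> i adds -d.
    std::vector<std::vector<std::pair<size_t, int64_t>>> adj(num_items);
    for (const auto& kv : dists) {
        adj[kv.first.first].emplace_back(kv.first.second, kv.second);
        adj[kv.first.second].emplace_back(kv.first.first, -kv.second);
    }
    std::vector<bool> seen(num_items, false);
    std::vector<std::vector<std::pair<size_t, int64_t>>> groups;
    for (size_t start = 0; start < num_items; start++) {
        if (seen[start]) continue;
        // BFS, accumulating offsets relative to the component's first item.
        groups.emplace_back();
        std::queue<std::pair<size_t, int64_t>> frontier;
        frontier.push({start, 0});
        seen[start] = true;
        while (!frontier.empty()) {
            auto [item, offset] = frontier.front();
            frontier.pop();
            groups.back().emplace_back(item, offset);
            for (const auto& edge : adj[item]) {
                if (!seen[edge.first]) {
                    seen[edge.first] = true;
                    frontier.push({edge.first, offset + edge.second});
                }
            }
        }
    }
    return groups;
}

int main() {
    // Items 0, 1, 2 share a strand; item 3 has no recorded finite distances.
    std::map<std::pair<size_t, size_t>, int64_t> dists{{{0, 1}, 150}, {{1, 2}, -30}};
    for (const auto& group : flatten_toy_distance_tree(4, dists)) {
        for (const auto& [item, offset] : group) {
            std::cout << item << "@" << offset << " ";
        }
        std::cout << "\n";   // "0@0 1@150 2@120" then "3@0"
    }
    return 0;
}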
*/ - static unordered_map, int64_t> get_on_strand_distance_tree(size_t num_items, bool unstranded, xg::XG* xgindex, - const function& get_position, - const function& get_offset, - paths_of_node_memo_t* paths_of_node_memo, - oriented_occurences_memo_t* oriented_occurences_memo, - handle_memo_t* handle_memo); + unordered_map, int64_t> get_on_strand_distance_tree(size_t num_items, + const function& get_position, + const function& get_offset); /** * Adds edges into the distance tree by estimating the distance between pairs * generated by a high entropy deterministic permutation */ - static void extend_dist_tree_by_permutations(int64_t max_failed_distance_probes, - int64_t max_search_distance_to_path, - size_t decrement_frequency, - size_t& num_possible_merges_remaining, - UnionFind& component_union_find, - unordered_map, int64_t>& recorded_finite_dists, - map, size_t>& num_infinite_dists, - bool unstranded, - size_t num_items, - xg::XG* xgindex, - const function& get_position, - const function& get_offset, - paths_of_node_memo_t* paths_of_node_memo, - oriented_occurences_memo_t* oriented_occurences_memo, - handle_memo_t* handle_memo); + void extend_dist_tree_by_permutations(const function& get_position, + const function& get_offset, + size_t num_items, + int64_t max_failed_distance_probes, + size_t decrement_frequency, + unordered_map, int64_t>& recorded_finite_dists, + map, size_t>& num_infinite_dists, + UnionFind& component_union_find, + size_t& num_possible_merges_remaining); - - /** - * Adds edges into the distance tree by estimating the distance only between pairs - * of items that can be directly inferred to share a strand of a path based on the memo of - * node occurrences on paths - */ - static void extend_dist_tree_by_strand_buckets(int64_t max_failed_distance_probes, - size_t& num_possible_merges_remaining, - UnionFind& component_union_find, - unordered_map, int64_t>& recorded_finite_dists, - map, size_t>& num_infinite_dists, - size_t num_items, - xg::XG* xgindex, - const function& get_position, - const function& get_offset, - paths_of_node_memo_t* paths_of_node_memo, - oriented_occurences_memo_t* oriented_occurences_memo, - handle_memo_t* handle_memo); /** * Adds edges into the distance tree by estimating the distance only between pairs - * of items that can be directly inferred to share a path based on the memo of + * of items that can be easily identified as having a finite distance (e.g. 
by sharing + * a path) */ - static void extend_dist_tree_by_path_buckets(int64_t max_failed_distance_probes, - size_t& num_possible_merges_remaining, - UnionFind& component_union_find, - unordered_map, int64_t>& recorded_finite_dists, - map, size_t>& num_infinite_dists, - size_t num_items, - xg::XG* xgindex, - const function& get_position, - const function& get_offset, - paths_of_node_memo_t* paths_of_node_memo, - oriented_occurences_memo_t* oriented_occurences_memo, - handle_memo_t* handle_memo); + void extend_dist_tree_by_buckets(const function& get_position, + const function& get_offset, + size_t num_items, + unordered_map, int64_t>& recorded_finite_dists, + UnionFind& component_union_find, + size_t& num_possible_merges_remaining); /** * Automatically blocks off merges in the distance tree between groups that can be inferred - * to be on separate components based on the paths they overlap + * to be on separate components */ - static void exclude_dist_tree_merges_by_components(int64_t max_failed_distance_probes, - size_t& num_possible_merges_remaining, - UnionFind& component_union_find, - map, size_t>& num_infinite_dists, - unordered_map neighbors_on_paths, - xg::XG* xgindex, - const function& get_position, - const function& get_offset, - paths_of_node_memo_t* paths_of_node_memo); + void exclude_dist_tree_merges(const function& get_position, + map, size_t>& num_infinite_dists, + UnionFind& component_union_find, + size_t& num_possible_merges_remaining, + int64_t max_failed_distance_probes); /** * Given a number of nodes, and a map from node pair to signed relative @@ -347,102 +551,222 @@ class OrientedDistanceClusterer { * Assumes all the distances are transitive, even though this isn't quite * true in graph space. */ - static vector> flatten_distance_tree(size_t num_items, - const unordered_map, int64_t>& recorded_finite_dists); + vector> flatten_distance_tree(size_t num_items, + const unordered_map, int64_t>& recorded_finite_dists); /// Returns a vector containing the number of SMEM beginnings to the left and the number of SMEM /// endings to the right of each read position vector> compute_tail_mem_coverage(const Alignment& alignment, const vector& mems); - /// Fills input vectors with indices of source and sink nodes - void identify_sources_and_sinks(vector& sources_out, vector& sinks_out); + /// Concrete implementation of virtual method from MEMClusterer + HitGraph make_hit_graph(const Alignment& alignment, const vector& mems, const GSSWAligner* aligner, + size_t min_mem_length, const match_fanouts_t* fanouts); - /// Identify weakly connected components in the graph - void connected_components(vector>& components_out); + OrientedDistanceMeasurer& distance_measurer; + size_t max_expected_dist_approx_error; + bool unstranded; - /// Fills the input vector with the indices of a topological sort - void topological_order(vector& order_out); +}; - /// Perform dynamic programming and store scores in nodes - void perform_dp(); +/* + * An abtract class that provides a heuristic distance between two positions. The semantics are + * unspecified for what manner of "approximate" the heuristic is (upperbound, lowerbound, etc.). 
+ */ +class DistanceHeuristic { +public: + virtual ~DistanceHeuristic() = default; - /// Returns the median coverage of bases in the reads by bases in the cluster, attempts to remove apparent - /// redundant sub-MEMs - size_t median_mem_coverage(const vector& component, const Alignment& aln) const; + virtual int64_t operator()(const pos_t& pos_1, const pos_t& pos_2) = 0; +}; - /// Prune edges that are not on any traceback that scores highly compared to the best score in the component, - /// splits up the components (adding some to the end of the vector) if doing so splits a component - void prune_low_scoring_edges(vector>& components, size_t component_idx, double score_factor); +/* + * An exact computation of the minimum distance between two positions using the snarl + * decomposition + */ +class SnarlMinDistance : public DistanceHeuristic { +public: + SnarlMinDistance() = delete; + SnarlMinDistance(SnarlDistanceIndex& distance_index); + ~SnarlMinDistance() = default; - /// Computes the topological order of - void component_topological_order(const vector& component, vector& order_out) const; + int64_t operator()(const pos_t& pos_1, const pos_t& pos_2); +private: + SnarlDistanceIndex& distance_index; +}; + +/* + * An upperbound on the distance between two positions computed using the distance + * between those positions and tips. Strict upperbound in DAGs, only an upperbound + * among a subset of paths in cyclic graphs (as distance is unbounded above). + */ +class TipAnchoredMaxDistance : public DistanceHeuristic { +public: + TipAnchoredMaxDistance() = delete; + TipAnchoredMaxDistance(SnarlDistanceIndex& distance_index); + ~TipAnchoredMaxDistance() = default; - vector nodes; + int64_t operator()(const pos_t& pos_1, const pos_t& pos_2); +private: + SnarlDistanceIndex& distance_index; +}; + +/* + * Implements the heuristic solution to the Target Value Search problem described + * in Kuhn, et al. (2008). + */ +class TargetValueSearch { +public: + TargetValueSearch() = delete; + TargetValueSearch(const HandleGraph& handle_graph, + DistanceHeuristic* upper_bound_heuristic, + DistanceHeuristic* lower_bound_heuristic); + ~TargetValueSearch() = default; + + /// Does a path exist from pos_1 to pos_2 with length within the tolerance from the target value? + bool tv_path_exists(const pos_t& pos_1, const pos_t& pos_2, int64_t target_value, int64_t tolerance); + + /// Returns the length of path from pos_1 to pos_2 with length closest to the target value. If there + /// is no such path within the tolerance of the target value, returns numeric_limits::max(). + int64_t tv_path_length(const pos_t& pos_1, const pos_t& pos_2, int64_t target_value, int64_t tolerance); + + /// Returns a path from pos_1 to pos_2 with length closest to the target value. If there is no such + /// path within the tolerance of the target value, returns an empty vector. + vector tv_path(const pos_t& pos_1, const pos_t& pos_2, int64_t target_value, int64_t tolerance); + +protected: + + vector tv_phase2(const pos_t& pos_1, + const pos_t& pos_2, + int64_t target_value, + int64_t tolerance, + hash_map,int64_t>& node_to_target_shorter, + hash_map, int64_t>& node_to_target_longer, + pair,int64_t>>& best_lng, + pair, int64_t>>& next_best, + hash_map, int64_t>, pair, int64_t>>& node_to_path); + + const HandleGraph& handle_graph; + unique_ptr upper_bound_heuristic; + unique_ptr lower_bound_heuristic; +}; - const Aligner* aligner; - const QualAdjAligner* qual_adj_aligner; +/* + * A MEM clusterer built around the Target Value Search problem. 
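The Target Value Search problem that TargetValueSearch solves is: find a path from one position to another whose length is as close as possible to a target value, within a tolerance. The brute-force sketch below only illustrates that problem statement on a hypothetical adjacency-list graph with positive edge lengths; it does not attempt to reproduce the heuristic-guided algorithm of Kuhn et al. used by the real class.

#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <limits>
#include <utility>
#include <vector>

// Toy directed graph: adj[v] = list of (neighbor, edge length > 0).
using ToyGraph = std::vector<std::vector<std::pair<int, int64_t>>>;

// Enumerate path lengths from cur to goal, pruning anything already longer
// than target + tolerance (positive edge lengths keep this finite), and
// remember the length closest to the target.
void search(const ToyGraph& g, int cur, int goal, int64_t len,
            int64_t target, int64_t tolerance, int64_t& best) {
    if (len > target + tolerance) return;
    if (cur == goal && std::llabs(len - target) < std::llabs(best - target)) {
        best = len;
    }
    for (const auto& [next, edge_len] : g[cur]) {
        search(g, next, goal, len + edge_len, target, tolerance, best);
    }
}

// Returns the path length closest to target within tolerance, or max() if none.
int64_t toy_tv_path_length(const ToyGraph& g, int src, int dst,
                           int64_t target, int64_t tolerance) {
    int64_t best = std::numeric_limits<int64_t>::max();
    search(g, src, dst, 0, target, tolerance, best);
    if (best == std::numeric_limits<int64_t>::max() ||
        std::llabs(best - target) > tolerance) {
        return std::numeric_limits<int64_t>::max();
    }
    return best;
}

int main() {
    // Paths 0 -> 1 -> 3 (length 12) and 0 -> 2 -> 3 (length 20).
    ToyGraph g(4);
    g[0] = {{1, 5}, {2, 8}};
    g[1] = {{3, 7}};
    g[2] = {{3, 12}};
    std::cout << toy_tv_path_length(g, 0, 3, 15, 5) << "\n";  // 12: closest to 15
    std::cout << (toy_tv_path_length(g, 0, 3, 100, 5) == std::numeric_limits<int64_t>::max()) << "\n"; // 1
    return 0;
}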
+ */ +class TVSClusterer : public MEMClusterer { +public: + TVSClusterer(const HandleGraph* handle_graph, SnarlDistanceIndex* distance_index); + ~TVSClusterer() = default; + + /// Concrete implementation of virtual method from MEMClusterer + vector, int64_t>> pair_clusters(const Alignment& alignment_1, + const Alignment& alignment_2, + const vector& left_clusters, + const vector& right_clusters, + const vector>& left_alt_cluster_anchors, + const vector>& right_alt_cluster_anchors, + int64_t optimal_separation, + int64_t max_deviation); + +protected: + + /// Concrete implementation of virtual method from MEMClusterer + HitGraph make_hit_graph(const Alignment& alignment, const vector& mems, const GSSWAligner* aligner, + size_t min_mem_length, const match_fanouts_t* fanouts); + + TargetValueSearch tvs; +}; + +/* + * A MEM clusterer based on finding the minimum distance between all pairs of seeds or clusters + */ +class MinDistanceClusterer : public MEMClusterer { +public: + MinDistanceClusterer(SnarlDistanceIndex* distance_index); + virtual ~MinDistanceClusterer() = default; + + /// Concrete implementation of virtual method from MEMClusterer + vector, int64_t>> pair_clusters(const Alignment& alignment_1, + const Alignment& alignment_2, + const vector& left_clusters, + const vector& right_clusters, + const vector>& left_alt_cluster_anchors, + const vector>& right_alt_cluster_anchors, + int64_t optimal_separation, + int64_t max_deviation); + +protected: + + /// Concrete implementation of virtual method from MEMClusterer + virtual HitGraph make_hit_graph(const Alignment& alignment, const vector& mems, const GSSWAligner* aligner, + size_t min_mem_length, const match_fanouts_t* fanouts); + + const HandleGraph* handle_graph; + SnarlDistanceIndex* distance_index; }; -class OrientedDistanceClusterer::ODNode { +/* + * A version of the MinDistanceClusterer that greedily agglomerates seeds into connected components + * based on minimum distance, iterating over pairs in a sensible order + */ +class GreedyMinDistanceClusterer : public MinDistanceClusterer { public: - ODNode(const MaximalExactMatch& mem, pos_t start_pos, int32_t score) : - mem(&mem), start_pos(start_pos), score(score) {} - ODNode() = default; - ~ODNode() = default; + GreedyMinDistanceClusterer(SnarlDistanceIndex* distance_index); + ~GreedyMinDistanceClusterer() = default; - const MaximalExactMatch* mem; +protected: - /// Position of GCSA hit in the graph - pos_t start_pos; + /// Concrete implementation of virtual method from MEMClusterer, overides the inherited one from MinDistanceClusterer + HitGraph make_hit_graph(const Alignment& alignment, const vector& mems, const GSSWAligner* aligner, + size_t min_mem_length, const match_fanouts_t* fanouts); - /// Score of the exact match this node represents - int32_t score; - /// Score used in dynamic programming - int32_t dp_score; + /// How far apart do we expect the seeds to be on the read? + const int64_t expected_separation = 20; - /// Edges from this node that are colinear with the read - vector edges_from; + /// How more bases would we search forward to find the next seed before we think + /// it's worth searching 1 base backward? 
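The expected_separation, forward_multiplier, and min/max separation knobs documented just below suggest an asymmetric preference when ordering seed pairs: being somewhat further apart than expected is cheaper than being closer together. The scoring below is purely a hypothetical illustration of such an asymmetric deviation cost, not the clusterer's actual pair ordering.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical asymmetric "surprise" for a pair of seeds separated by
// read_separation bases, when we expect them about expected_separation apart
// and treat each extra base of forward distance as forward_multiplier times
// more tolerable than a base of backward distance.
int64_t toy_pair_priority(int64_t read_separation,
                          int64_t expected_separation = 20,
                          int64_t forward_multiplier = 3) {
    int64_t deviation = read_separation - expected_separation;
    return deviation >= 0 ? deviation : -deviation * forward_multiplier;
}

int main() {
    std::vector<int64_t> separations{5, 18, 20, 35, 80};
    std::sort(separations.begin(), separations.end(),
              [](int64_t a, int64_t b) { return toy_pair_priority(a) < toy_pair_priority(b); });
    for (int64_t s : separations) std::cout << s << " ";  // 20 18 35 5 80
    std::cout << "\n";
    return 0;
}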
+ const int64_t forward_multiplier = 3; + + /// Minimum distance between two seeds on the read + const int64_t min_separation = -10; + + /// Maximum distance between two seeds on the read + const int64_t max_separation = 250; - /// Edges to this node that are colinear with the read - vector edges_to; }; -class OrientedDistanceClusterer::ODEdge { +/* + * A version of the MinDistanceClusterer that uses the SeedClusterer to partition reads + * into nearby clusters and only measures distances within clusters + */ +class ComponentMinDistanceClusterer : public MinDistanceClusterer { public: - ODEdge(size_t to_idx, int32_t weight, int64_t distance) : to_idx(to_idx), weight(weight), distance(distance) {} - ODEdge() = default; - ~ODEdge() = default; + ComponentMinDistanceClusterer(SnarlDistanceIndex* distance_index); + ~ComponentMinDistanceClusterer() = default; - /// Index of the node that the edge points to - size_t to_idx; +protected: - /// Weight for dynamic programming - int32_t weight; + /// Concrete implementation of virtual method from MEMClusterer, overides the inherited one from MinDistanceClusterer + HitGraph make_hit_graph(const Alignment& alignment, const vector& mems, const GSSWAligner* aligner, + size_t min_mem_length, const match_fanouts_t* fanouts); - /// Estimated distance - int64_t distance; -}; - -struct OrientedDistanceClusterer::DPScoreComparator { -private: - const vector& nodes; -public: - DPScoreComparator() = delete; - DPScoreComparator(const vector& nodes) : nodes(nodes) {} - ~DPScoreComparator() {} - inline bool operator()(const size_t i, const size_t j) { - return nodes[i].dp_score < nodes[j].dp_score; - } + + /// Minimum distance between two seeds on the read + const int64_t min_read_separation = 0; + + /// The number of connections from one hit in a component to another that we will consider (0 for no maximum) + const int64_t early_stop_number = 2; }; /// get the handles that a mem covers -vector > mem_node_start_positions(const xg::XG& xg, const vg::MaximalExactMatch& mem); +vector > mem_node_start_positions(const HandleGraph& graph, const vg::MaximalExactMatch& mem); +/// return a containing subgraph connecting the mems +bdsg::HashGraph cluster_subgraph_containing(const HandleGraph& base, const Alignment& aln, const vector& cluster, const GSSWAligner* aligner); +/// return a subgraph for a cluster of MEMs from the given alignment /// use walking to get the hits -Graph cluster_subgraph_walk(const xg::XG& xg, const Alignment& aln, const vector& mems, double expansion); -/// return a subgraph form an xg for a cluster of MEMs from the given alignment -Graph cluster_subgraph(const xg::XG& xg, const Alignment& aln, const vector& mems, double expansion); +bdsg::HashGraph cluster_subgraph_walk(const HandleGraph& base, const Alignment& aln, const vector& mems, double expansion); } diff --git a/src/colors.hpp b/src/colors.hpp index 37ade757a5c..c886dc50c01 100644 --- a/src/colors.hpp +++ b/src/colors.hpp @@ -1,6 +1,12 @@ #ifndef VG_COLORS_HPP_INCLUDED #define VG_COLORS_HPP_INCLUDED +/** + * \file colors.hpp + * Includes facilities for generating random GraphViz colors seeded by strings. + * Used for colorizing paths in visualizations. + */ + #include #include @@ -8,6 +14,10 @@ namespace vg { using namespace std; +/** + * Generates random GraphViz colors seeded by strings. + * Used for colorizing paths in visualizations. 
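The Colors class documented above hands out GraphViz colors deterministically from string seeds. A minimal sketch of that idea, seeding an mt19937 from a hash of the string and picking from a small palette; the palette and hashing choice here are illustrative, not the class's actual implementation.

#include <functional>
#include <iostream>
#include <random>
#include <string>
#include <vector>

// Deterministically map a path name to a GraphViz color name by seeding an RNG
// with a hash of the string, so the same name always gets the same color.
std::string toy_color_for(const std::string& name) {
    static const std::vector<std::string> palette{
        "red", "blue", "darkgreen", "orange", "purple", "brown", "cadetblue"};
    std::mt19937 rng(static_cast<std::mt19937::result_type>(std::hash<std::string>{}(name)));
    std::uniform_int_distribution<size_t> pick(0, palette.size() - 1);
    return palette[pick(rng)];
}

int main() {
    std::cout << toy_color_for("chr1") << "\n";
    std::cout << toy_color_for("chr1") << "\n";  // same color as above
    std::cout << toy_color_for("chr2") << "\n";  // usually a different color
    return 0;
}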
+ */ class Colors { mt19937 rng; public: diff --git a/src/config/allocator_config.hpp b/src/config/allocator_config.hpp new file mode 100644 index 00000000000..c646ac99b67 --- /dev/null +++ b/src/config/allocator_config.hpp @@ -0,0 +1,20 @@ +#ifndef VG_ALLOCATOR_CONFIG_HPP_INCLUDED +#define VG_ALLOCATOR_CONFIG_HPP_INCLUDED + +/** + * \file + * Allocator configuration header. Used with either + * allocator_config_jemalloc.cpp or allocator_config_system.cpp as appropriate + * for the build. + */ + +namespace vg { + +/** + * If using a non-system memory allocator, initialize it to a safe configuration in this runtime environment. + */ +void configure_memory_allocator(); + +} + +#endif diff --git a/src/config/allocator_config_jemalloc.cpp b/src/config/allocator_config_jemalloc.cpp new file mode 100644 index 00000000000..50de9dbf043 --- /dev/null +++ b/src/config/allocator_config_jemalloc.cpp @@ -0,0 +1,112 @@ +/** + * \file + * Allocator configuration procedure for jemalloc. + */ + +#include "allocator_config.hpp" + +#include +#include +#include + +#include + +extern "C" { + // Hackily define symbols that jemalloc actually exports. + // Somehow it gets a "je_" prefix on these relative to what's in it's + // source. + // They're also all "local" symbols in the dynamic jemalloc library, + // meaning we can't link them form outside the library; we need to use + // static jemalloc if we intend to access these from here. + + // We use int here but really this takes an enum type. + bool je_extent_dss_prec_set(int dss_prec); + + // This is really the last enum value + int dss_prec_limit = 3; + + // These are the globals used to store the human-readable dss priority in + // addition to what the function controls. + extern const char *je_opt_dss; + extern const char *je_dss_prec_names[]; + + extern bool je_opt_retain; +} + +// Stringifier we need for jemalloc from its docs +#define STRINGIFY_HELPER(x) #x +#define STRINGIFY(x) STRINGIFY_HELPER(x) + +namespace vg { + +using namespace std; + +void configure_memory_allocator() { + // TODO: this is going to allocate when we don't really maybe want to. But + // the dynamic linker also allocated; we have to hope we don't upset any + // existing jemalloc stuff. + ifstream procfile("/proc/sys/vm/overcommit_memory"); + if (procfile) { + // We're actually on a Linux system with an overcommit setting. + // TODO: Can it be changed on Mac? + + // We need to work around jemalloc's propensity to run out of memory + // mappings and fail to allocate, when overcommit is disabled and the + // number of distinct mappings is capped. See + + // Read the setting + char overcommit; + procfile >> overcommit; + + if (overcommit == '2') { + // It is the never-overcommit value. + + // Complain to the user + cerr << "vg [warning]: System's vm.overcommit_memory setting is 2 (never overcommit). " + << "vg does not work well under these conditions; you may appear to run out of memory with plenty of memory left. " + << "Attempting to unsafely reconfigure jemalloc to deal better with this situation." << endl; + + // Try some stuff that may help + + // Configure the allocator to prefer sbrk() if it can because memory mapping will cause trouble + const char* dss_str = "primary"; + size_t dss_str_len = strlen(dss_str); + + bool match = false; + // Redo the dss_prec loop from jemalloc: + // This should cover newly created arenas. 
+ for (int i = 0; i < dss_prec_limit; i++) { + if (strncmp(je_dss_prec_names[i], dss_str, dss_str_len) == 0) { + if (je_extent_dss_prec_set(i)) { + cerr << "Could not reconfigure jemalloc dss_prec" << endl; + exit(1); + } else { + je_opt_dss = je_dss_prec_names[i]; + match = true; + break; + } + } + } + if (!match) { + cerr << "Could not find jemalloc dss_prec of " << dss_str << endl; + exit(1); + } + // Then fix up all existing arenas (without allocating?) + // To write these string parameters we need to copy a pointer into place, not a value + const char** dss_str_location = &dss_str; + auto mallctl_result = mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".dss", nullptr, nullptr, (void*) dss_str_location, sizeof(dss_str_location)); + if (mallctl_result) { + cerr << "Could not set dss priority on existing jemalloc arenas: " << strerror(mallctl_result) << endl; + exit(1); + } + + // Finally, make the opt_retain flag be off. + // This seems most likely to upset jemalloc because it changes the semantics of some of its internal fields. + je_opt_retain = false; + } + + } +} + +} + diff --git a/src/config/allocator_config_system.cpp b/src/config/allocator_config_system.cpp new file mode 100644 index 00000000000..bf5ee16e119 --- /dev/null +++ b/src/config/allocator_config_system.cpp @@ -0,0 +1,19 @@ +/** + * \file + * Allocator configuration procedure for the system allocator. + */ + +#include "allocator_config.hpp" + +namespace vg { + +using namespace std; + +void configure_memory_allocator() { + // Nothing to do! The system allocator may be slow or not, depending on the + // system, but it isn't really configurable in any meaningful way. +} + +} + + diff --git a/src/constructor.cpp b/src/constructor.cpp index ed3f829cedd..f5f4653b921 100644 --- a/src/constructor.cpp +++ b/src/constructor.cpp @@ -2,10 +2,13 @@ * \file * constructor.cpp: contains implementations for vg construction functions. */ - - -#include "vg.hpp" + #include "constructor.hpp" +#include "utility.hpp" +#include "crash.hpp" +#include "io/load_proto_to_graph.hpp" + +#include #include #include @@ -23,6 +26,7 @@ namespace vg { void Constructor::trim_to_variable(vector>& parsed_alleles) { #ifdef debug + cerr << "Before trimming to variable region:" << endl; for (auto& allele : parsed_alleles) { cerr << "Allele:" << endl; @@ -63,6 +67,11 @@ namespace vg { for(size_t front_match_count = get_match_count(true); front_match_count > 0; front_match_count = get_match_count(true)) { // While we have shared matches at the front + +#ifdef debug + cerr << "Edits at the front share " << front_match_count << " match bases and need to be trimmed down" << endl; +#endif + for (auto& allele : parsed_alleles) { // Trim each allele if (allele.front().ref.size() > front_match_count) { @@ -74,6 +83,9 @@ namespace vg { #endif allele.front().ref = new_match_string; allele.front().alt = new_match_string; + + // Since we're trimming off the front we need to bump the position up. 
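The front trimming above removes reference bases that every allele matches perfectly, and each surviving edit's position has to be bumped up by the same amount. This standalone sketch only counts shared leading matches over plain ref/alt strings, a simplification of the edit-level bookkeeping the real trim_to_variable does.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

// Count how many leading reference bases every alt allele matches perfectly;
// those bases carry no variation, so they can be trimmed off the front and
// each remaining edit's position advanced by the returned count.
size_t toy_shared_front_matches(const std::string& ref, const std::vector<std::string>& alts) {
    size_t shared = ref.size();
    for (const std::string& alt : alts) {
        size_t i = 0;
        while (i < alt.size() && i < ref.size() && alt[i] == ref[i]) {
            i++;
        }
        shared = std::min(shared, i);
    }
    return shared;
}

int main() {
    // Both alts match the first three reference bases exactly.
    std::string ref = "ACGTT";
    std::vector<std::string> alts{"ACGAT", "ACGT"};
    std::cout << toy_shared_front_matches(ref, alts) << "\n";  // 3
    return 0;
}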
+ allele.front().position += front_match_count; } else { // This perfect match can be completely eliminated #ifdef debug @@ -87,6 +99,11 @@ namespace vg { for(size_t back_match_count = get_match_count(false); back_match_count > 0; back_match_count = get_match_count(false)) { // While we have shared matches at the back + +#ifdef debug + cerr << "Edits at the back share " << back_match_count << " match bases and need to be trimmed down" << endl; +#endif + for (auto& allele : parsed_alleles) { // Trim each allele if (allele.back().ref.size() > back_match_count) { @@ -112,6 +129,7 @@ namespace vg { } #ifdef debug + cerr << "After trimming to variable region:" << endl; for (auto& allele : parsed_alleles) { cerr << "Allele: " << endl; for (auto& edit : allele) { @@ -147,16 +165,19 @@ namespace vg { } } - pair Constructor::get_bounds(vcflib::Variant var){ - int64_t start = numeric_limits::max(); - - start = min(start, (int64_t) var.zeroBasedPosition()); - int64_t end = -1; - - - end = var.getMaxReferenceLength(); + pair Constructor::get_symbolic_bounds(vcflib::Variant var) { + // TODO: We assume that the variant actually has at least one symbolic alt allele like . + // If that is the case, the base at POS must be an anchoring, unmodified base. + // But you can also have SV tags on something like a CCATG->G right-anchored deletion as long as + // none of the alleles are symbolic. + // We really should be calling this on variants that *were* symbolic before canonicalization. + + // Move the start 1 base right to account for the required anchor base. + // This may make us start after the end. + int64_t start = (int64_t) var.zeroBasedPosition() + 1; + int64_t end = var.getMaxReferencePos(); - return std::make_pair( start, end); + return std::make_pair(start, end); } @@ -182,31 +203,135 @@ namespace vg { return make_pair(variable_start, variable_stop); } - - ConstructedChunk Constructor::construct_chunk(string reference_sequence, string reference_path_name, - vector variants, size_t chunk_offset) const { - - #ifdef debug - cerr << "constructing chunk " << reference_path_name << ":" << chunk_offset << " length " << reference_sequence.size() << endl; - #endif + + bool Constructor::sanitize_sequence_in_place(string& sequence, const string* sequence_name, size_t sequence_start_offset, const vcflib::Variant* variant) const { + + bool made_change = false; // Make sure the input sequence is upper-case - string uppercase_sequence = toUppercase(reference_sequence); + string uppercase_sequence = toUppercase(sequence); + if (uppercase_sequence != sequence) { + // We had to make a change + if (warn_on_lowercase) { + if (variant) { + // We are warning about a variant (alt) + if (!lowercase_warned_alt) { + #pragma omp critical (cerr) + { + cerr << "warning:[vg::Constructor] Lowercase characters found in " + << "variant; coercing to uppercase:\n" << *const_cast(variant) << endl; + lowercase_warned_alt = true; + } + } + } else { + // What sequence are we complaining about? + string name_to_warn = sequence_name ? *sequence_name : "DNA sequence"; + #pragma omp critical (cerr) + { + // Note that the pragma also protects this mutable map that we update + if (!lowercase_warned_sequences.count(name_to_warn)) { + // We haven't warned about this sequence yet + cerr << "warning:[vg::Constructor] Lowercase characters found in " + << name_to_warn << "; coercing to uppercase." 
<< endl; + lowercase_warned_sequences.insert(name_to_warn); + } + } + } + } + // Replace the original + sequence = std::move(uppercase_sequence); + made_change = true; + } + + // Make sure all IUPAC codes are Ns + string n_sequence = allAmbiguousToN(sequence); + if (n_sequence != sequence) { + // We had to make a change + if (warn_on_ambiguous) { + if (variant) { + // We are warning about a variant (alt). + // TODO: We used to always bail for IUPAC codes in a + // variant allele; do we really want to not? + #pragma omp critical (cerr) + { + cerr << "warning:[vg::Constructor] Unsupported IUPAC ambiguity codes found in " + << "variant; coercing to N:\n" << *const_cast(variant) << endl; + } + } else { + // What sequence are we complaining about? + string name_to_warn = sequence_name ? *sequence_name : "DNA sequence"; + #pragma omp critical (cerr) + { + // Note that the pragma also protects this mutable map that we update + if (!ambiguous_warned_sequences.count(name_to_warn)) { + // We haven't warned about this sequence yet + cerr << "warning:[vg::Constructor] Unsupported IUPAC ambiguity codes found in " + << name_to_warn << "; coercing to N." << endl; + ambiguous_warned_sequences.insert(name_to_warn); + } + } + } + } + // Replace the original + sequence = std::move(n_sequence); + made_change = true; + } - if (uppercase_sequence != reference_sequence && warn_on_lowercase) { + // TODO: this is like the forth scan of the whole string we do; can we + // condense this all into one pass? + if (!allATGCN(sequence)) { + // We don't know what to do with gaps, and we want to catch + // complete garbage. + + // We would like an example. + auto it = sequence.begin(); + while (it != sequence.end() && (*it == 'A' || *it == 'T' || *it == 'G' || *it == 'C' || *it == 'N')) { + ++it; + } + #pragma omp critical (cerr) { - // Note that the pragma also protects this mutable map that we update - if (!warned_sequences.count(reference_path_name)) { - // We haven't warned about this sequence yet - cerr << "warning:[vg::Constructor] Lowercase characters found in " - << reference_path_name << "; coercing to uppercase." << endl; - warned_sequences.insert(reference_path_name); - } + cerr << "error:[vg::Constructor] unacceptable character "; + if (it != sequence.end()) { + cerr << "\"" << *it << "\" "; + } + cerr << "found in "; + if (sequence_name) { + cerr << *sequence_name; + } else { + cerr << "DNA sequence"; + } + if (it != sequence.end()) { + cerr << " at index " << (it - sequence.begin() + sequence_start_offset); + } + if (variant) { + cerr << " in variant:\n" << *const_cast(variant); + } else { + cerr << "."; + } + cerr << endl; + exit(1); } } - swap(reference_sequence, uppercase_sequence); + + return made_change; + } + ConstructedChunk Constructor::construct_chunk(string reference_sequence, string reference_path_name, + vector variants, size_t chunk_offset) const { + + std::stringstream status_stream; + status_stream << "constructing chunk " << reference_path_name << ":" << chunk_offset << " length " << reference_sequence.size(); + + set_crash_context(status_stream.str()); + + #ifdef debug + cerr << status_stream.str() << endl; + #endif + + // Make sure the input sequence is upper-case and all IUPAC codes are Ns + sanitize_sequence_in_place(reference_sequence, &reference_path_name); + // Construct a chunk for this sequence with these variants. ConstructedChunk to_return; @@ -228,15 +353,28 @@ namespace vg { // And nodes starting at these reference positions that haven't yet all been // wired up. 
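The sanitize_sequence_in_place routine above normalizes input to uppercase, converts IUPAC ambiguity codes to N, and rejects anything that still is not A, T, G, C, or N. A self-contained sketch of that normalization policy; the helper name and single-pass character handling are illustrative, whereas the real code uses vg's toUppercase, allAmbiguousToN, and allATGCN utilities and reports errors with positional context.

#include <cctype>
#include <iostream>
#include <stdexcept>
#include <string>

// Normalize a DNA sequence: uppercase it, turn IUPAC ambiguity codes into N,
// and throw on anything that is not A, T, G, C, or N afterwards.
std::string toy_sanitize(std::string sequence) {
    const std::string ambiguity_codes = "RYSWKMBDHV";
    for (char& c : sequence) {
        c = static_cast<char>(std::toupper(static_cast<unsigned char>(c)));
        if (ambiguity_codes.find(c) != std::string::npos) {
            c = 'N';
        }
        if (c != 'A' && c != 'T' && c != 'G' && c != 'C' && c != 'N') {
            throw std::runtime_error(std::string("unacceptable character ") + c);
        }
    }
    return sequence;
}

int main() {
    std::cout << toy_sanitize("acgTRyn") << "\n";  // ACGTNNN
    try {
        toy_sanitize("ACG-T");
    } catch (const std::exception& e) {
        std::cout << e.what() << "\n";             // unacceptable character -
    }
    return 0;
}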
map> nodes_starting_at; - + + // We also keep separate maps for reference nodes only, for tracing + // back through inversions. Since when we trace through an inversion we + // need to visit every node in a run, we don't just care about the + // bounding IDs. So we store entire copies of the runs. But since the + // inversions always go backward, we only need them by their end. + // These are also on-the-end and not past-the-end positions, matching + // nodes_ending_at. + map> ref_runs_by_end; + // We don't want to wire inserts to each other, so we have a set of all // insert endpoints. set inserts; - // We need to wire up our inversions super specially. - map > inversion_starts; - map > inversion_ends; - + // We need to wire up our inversions super specially. These hold the + // end positions for each inversion anchored at the key, for + // inversions_starting, or visa-versa, for inversions_ending. In other + // words, the start position is exclusive and the end position is + // inclusive. + map> inversions_starting; + map> inversions_ending; + // Here we remember deletions that end at paritcular positions in the // reference, which are the positions of the last deleted bases. We map from // last deleted base to last non-deleted base before the deletion, so we can @@ -245,12 +383,10 @@ namespace vg { // We also need to track all points at which deletions start, so we can // search for the next one when deciding where to break the reference. + // We store the last NON-deleted base; the deletion arc attaches to the + // right side of this base! This base is *not* deleted. set deletion_starts; - // We need to track where the alt paths of deletions start/end (and maintain their ids) - // so that we can put them in the graph. - map deletion_start_to_alt_name; - // We use this to get the next variant auto next_variant = variants.begin(); @@ -267,10 +403,16 @@ namespace vg { // We have a utility function to tack a full length perfect match onto a // path. We need the node so we can get its length. // Automatically fills in rank, starting from 1. - auto add_match = [&](Path* path, Node* node) { + auto add_match = [&](Path* path, Node* node, bool is_reverse = false) { + #ifdef debug + cerr << "Add node " << node->id() << " orientation " << is_reverse + << " length " << node->sequence().size() << " to path " << path->name() << endl; + #endif + // Make a mapping for it auto* mapping = path->add_mapping(); mapping->mutable_position()->set_node_id(node->id()); + mapping->mutable_position()->set_is_reverse(is_reverse); // Set the rank to the next available rank in the path. 
mapping->set_rank(++max_rank[path]); @@ -345,13 +487,25 @@ namespace vg { auto add_reference_nodes_until = [&](size_t target_position) { #ifdef debug - cerr << "Create reference from cursor at " << reference_cursor << " out to " + cerr << "Create reference from cursor at " << reference_cursor << " out up to before " << target_position << "/" << reference_sequence.size() << endl; #endif // Don't get out of the chunk - assert(target_position <= reference_sequence.size()); - assert(reference_cursor <= reference_sequence.size()); + if (target_position > reference_sequence.size()) { + #pragma omp critical (cerr) + cerr << "error:[vg::Constructor] On " << reference_path_name + << ", attempted to add reference nodes until position " << target_position + << " but reference is only " << reference_sequence.size() << " long" << endl; + exit(1); + } + if (reference_cursor > reference_sequence.size()) { + #pragma omp critical (cerr) + cerr << "error:[vg::Constructor] On " << reference_path_name + << ", reference cursor is at " << reference_cursor + << " but reference is only " << reference_sequence.size() << " long" << endl; + exit(1); + } if (target_position < reference_cursor) { // TODO: should this ever happen? Should we be asked to go backward? @@ -375,7 +529,6 @@ namespace vg { // Remember where it starts and ends along the reference path nodes_starting_at[reference_cursor].insert(new_nodes.front()->id()); - for (Node* node : new_nodes) { // Add matches on the reference path for all the new nodes add_match(ref_path, node); @@ -386,6 +539,12 @@ namespace vg { // Add the end node to the ending at map. nodes_ending_at[reference_cursor + seen_bases - 1].insert(new_nodes.back()->id()); + + // Save the whole run for inversion tracing + #ifdef debug + cerr << "Create ref run ending at " << reference_cursor + seen_bases - 1 << endl; + #endif + ref_runs_by_end[reference_cursor + seen_bases - 1] = std::move(new_nodes); } @@ -393,11 +552,16 @@ namespace vg { reference_cursor = target_position; #ifdef debug - cerr << "Advanced reference cursor to " << reference_cursor << "/" << reference_sequence.size() << endl; + cerr << "Advanced reference cursor for next unmade base to " << reference_cursor << "/" << reference_sequence.size() << endl; #endif - assert(reference_cursor <= reference_sequence.size()); - + if (reference_cursor > reference_sequence.size()) { + #pragma omp critical (cerr) + cerr << "error:[vg::Constructor] On " << reference_path_name + << ", after adding reference nodes, reference cursor is at " << reference_cursor + << " but reference is only " << reference_sequence.size() << " long" << endl; + exit(1); + } }; while (next_variant != variants.end() || !clump.empty()) { @@ -405,10 +569,8 @@ namespace vg { // Group variants into clumps of overlapping variants. if (clump.empty() || - (next_variant != variants.end() && clump_end > next_variant->zeroBasedPosition() - chunk_offset)) { - // Either there are no variants in the clump, or this variant // overlaps the clump. It belongs in the clump clump.push_back(&(*next_variant)); @@ -416,7 +578,6 @@ namespace vg { // TODO: make sure long SVs don't fall outside chunk clump_end = max(clump_end, next_variant->zeroBasedPosition() + next_variant->ref.size() - chunk_offset); - // Try the variant after that next_variant++; } else { @@ -424,7 +585,7 @@ namespace vg { // Handle the clump. 
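A clump, as the loop above builds it, is a maximal run of variants whose reference spans overlap: a variant joins the open clump whenever it starts before the clump's running end. A small sketch of that grouping over hypothetical (zero-based start, ref length) records sorted by start.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

// Group variants, given as (zero-based start, ref length) sorted by start,
// into clumps of overlapping records: a record joins the open clump if it
// starts before the clump's current end, otherwise it opens a new clump.
std::vector<std::vector<size_t>> toy_clump(const std::vector<std::pair<size_t, size_t>>& variants) {
    std::vector<std::vector<size_t>> clumps;
    size_t clump_end = 0;
    for (size_t i = 0; i < variants.size(); i++) {
        if (clumps.empty() || variants[i].first >= clump_end) {
            clumps.emplace_back();  // start a new clump
        }
        clumps.back().push_back(i);
        clump_end = std::max(clump_end, variants[i].first + variants[i].second);
    }
    return clumps;
}

int main() {
    // Variants at 10 (len 5), 12 (len 1), 30 (len 2): the first two overlap.
    std::vector<std::pair<size_t, size_t>> variants{{10, 5}, {12, 1}, {30, 2}};
    for (const auto& clump : toy_clump(variants)) {
        for (size_t i : clump) std::cout << i << " ";
        std::cout << "\n";  // "0 1" then "2"
    }
    return 0;
}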
#ifdef debug - cerr << "Handling clump of " << clump.size() << " variants" << endl; + cerr << "Handling clump of " << clump.size() << " variants up to " << (clump_end + chunk_offset) << endl; #endif // Parse all the variants into VariantAllele edits @@ -442,38 +603,96 @@ namespace vg { // This holds the min and max values for the starts and ends of // edits in each variant that are actual change-making edits. These // are in chunk coordinates. They are only populated if a variant - // has a variable region. They can enclose a 0-length variable - // region by having the end before the beginning. - map> variable_bounds; + // has a variable region. Equal start and end indicate a 1-base region. + vector::interval> variable_intervals; // This holds the min and max values for starts and ends of edits // not removed from the clump. These are in chunk coordinates. int64_t first_edit_start = numeric_limits::max(); int64_t last_edit_end = -1; - // We'll fill this with any duplicate variants that should be - // ignored, out of the clump. This is better than erasing out of a + // We'll fill this with any variants that should be ignored, + // out of the clump. This is better than erasing out of a // vector. - set duplicates; - - for (vcflib::Variant* variant : clump) { + set skipped; + + for (size_t var_num = 0; var_num < clump.size(); var_num++) { + // For each variant in the clump + vcflib::Variant* variant = clump[var_num]; +#ifdef debug + cerr << "Handling clump variant " << var_num << "/" << clump.size() << " @ " << variant->zeroBasedPosition() << endl; +#endif + + // Since we make the fasta reference uppercase, we do the VCF too (otherwise vcflib gets mad). + // We set this if we modify the variant and vcflib needs to reindex it. + bool reindex = false; + // We set this if we skipped the variant + bool skip_variant = false; + for (size_t i = 0; i < variant->alt.size(); i++) { + auto& alt = variant->alt[i]; + // Process all the alts and not the ref + if (alt == "*") { + // This is a newer VCF feature we don't support, + // but not a broken file. + #pragma omp critical (cerr) + { + cerr << "warning:[vg::Constructor] Unsupported allele \"*\" found in " + << "variant, skipping variant:\n" << *variant << endl; + } + skipped.insert(variant); + skip_variant = true; + break; + } + // Sanitize the alt of Ns and lower case characters, + // and ensure what remains is something we can use, not + // a symbolic SV. 
+ bool modified = sanitize_sequence_in_place(alt, nullptr, 0, variant); + if (modified) { + // Also update the copy in alleles + variant->alleles[i + 1] = alt; + } + reindex |= modified; + } + if (skip_variant) { + // Move to the next variant + continue; + } + // Also process the reference, but blame problems on the reference + if (sanitize_sequence_in_place(variant->ref, &reference_path_name, variant->zeroBasedPosition())) { + // Also update the copy in alleles + variant->alleles[0] = variant->ref; + reindex = true; + } + if (reindex) { + // Redo the indexing + variant->updateAlleleIndexes(); + } // Check the variant's reference sequence to catch bad VCF/FASTA pairings - - if (!variant->is_symbolic_sv()){ - auto expected_ref = reference_sequence.substr(variant->zeroBasedPosition() - chunk_offset, variant->ref.size()); - - if(variant->ref != expected_ref) { - // TODO: report error to caller somehow + auto expected_ref = reference_sequence.substr(variant->zeroBasedPosition() - chunk_offset, variant->ref.size()); + if(variant->ref != expected_ref) { + // TODO: report error to caller somehow #pragma omp critical (cerr) - cerr << "error:[vg::Constructor] Variant/reference sequence mismatch: " << variant->ref - << " vs pos: " << variant->position << ": " << expected_ref << "; do your VCF and FASTA coordinates match?"<< endl - << "Variant: " << *variant << endl; - cerr << "zero ind: " << variant->zeroBasedPosition() << " 1-indexed: " << variant->position << endl; - exit(1); - } + cerr << "error:[vg::Constructor] Variant/reference sequence mismatch: " << variant->ref + << " vs pos: " << variant->position << ": " << expected_ref << "; do your VCF and FASTA coordinates match?"<< endl + << "Variant: " << *variant << endl; + cerr << "zero ind: " << variant->zeroBasedPosition() << " 1-indexed: " << variant->position << endl; + exit(1); } + // No variants should still be symbolic at this point. + // Either we canonicalized them into base-level sequence, or we rejected them when making the clump. + // If they had IUPAC codes in them we should have fixed that already too. + if (variant->isSymbolicSV()) { + #pragma omp critical (cerr) + { + cerr << "error:[vg::Constructor] On " << reference_path_name << " @ " << variant->zeroBasedPosition() + << ", variant appears to be a symbolic SV, but all variants should have already been converted to explicit sequence edits." << endl; + cerr << "error:[vg::Constructor] Offending variant: " << *variant << endl; + } + exit(1); + } + // If variants have SVTYPE set, though, we will still use that info instead of the base-level sequence. // Name the variant and place it in the order that we'll // actually construct nodes in (see utility.hpp) @@ -485,7 +704,7 @@ namespace vg { #pragma omp critical (cerr) cerr << "warning:[vg::Constructor] Skipping duplicate variant with hash " << variant_name << " at " << variant->sequenceName << ":" << variant->position << endl; - duplicates.insert(variant); + skipped.insert(variant); continue; } @@ -499,35 +718,76 @@ namespace vg { // reference allele of the variant won't appear here. map> alternates; - if (flat) { + + // Decide if we should parse (i.e. align) the variant alts. + // We don';t want to do it if the alignment would be too big. + bool can_parse = !flat; + if (can_parse) { + if (variant->isSymbolicSV()) { + // All the variants are probably canonicalized by now and + // probably should not look like SVs. And vcflib is + // smart enough to give us flat alts when we ask to + // parse something that still does look like an SV. 
+ // But we still probably want the flat alt + // postprocessing, so go with flat alts here. + can_parse = false; + } else { + // We (no longer?) have symbolic alleles, so just + // bail if any allele is too long. Only ref and alt + // fields will be filled in with sequence; alleles + // field may still be symbolic. + if (variant->ref.size() > max_parsed_variant_size) { + // Ref is too long. Handle as flat. + can_parse = false; + } else { + for (auto& a : variant->alt) { + if (a.size() > max_parsed_variant_size) { + // This alt is too long. Handle as flat. + can_parse = false; + break; + } + } + } + } + } + + + if (can_parse) { + // Do alignments to parse the alleles + alternates = variant->parsedAlternates(); + } else { alternates = variant->flatAlternates(); // if we can, remove the 1bp "standard" base that's added at the beginning of indels - for (auto& v : alternates) { - for (auto& a : v.second) { - if (a.ref[0] == a.alt[0]) { - a.ref = a.ref.substr(1); - a.alt = a.alt.substr(1); - ++a.position; + if (this->trim_indels){ + for (auto& v : alternates) { + for (auto& a : v.second) { + if (a.ref[0] == a.alt[0]) { + a.ref = a.ref.substr(1); + a.alt = a.alt.substr(1); + ++a.position; + } } } } - //} else if (!variant->has_sv_tags()) { - } else { - alternates = variant->parsedAlternates(); } - if (!variant->is_symbolic_sv()){ - - //map>> parsed_clump; - //auto alternates = use_flat_alts ? variant.flatAlternates() : variant.parsedAlternates(); - for (auto &kv : alternates){ - // For each alt in the variant - - if (kv.first == variant->ref) - { - // Skip the ref, because we can't make any ref nodes - // until all the edits for the clump are known. - continue; - } + + // Get the variable bounds in VCF space for all the trimmed alts of this variant + // Note: we still want bounds for SVs, we just have to get them differently + std::pair bounds; + + if (!variant->canonical){ + // The variant did not have to be canonicalized. + // We will process the variant as a normal variant, based on its ref and alt sequences. + + for (auto &kv : alternates) { + // For each alt in the variant + + if (kv.first == variant->ref) + { + // Skip the ref, because we can't make any ref nodes + // until all the edits for the clump are known. + continue; + } // With 0 being the first non-ref allele, which alt are we? @@ -558,33 +818,46 @@ namespace vg { // Trim the alts down to the variant's (possibly empty) variable // region trim_to_variable(parsed_clump[variant]); + + // The bounds are determined from that + bounds = get_bounds(parsed_clump[variant]); } else { - cerr << "Is symbolic: " << *variant << endl; + // We are going to make the edit specified by the SV + // tags, instead of whatever edits are implied by + // aligning the ref and alt sequences. + #ifdef debug + cerr << "Use SV tags to define edit for: " << *variant << endl; + #endif // For now, only permit one allele for SVs // in the future, we'll build out VCF lib to fix this. // TODO build out vcflib to fix this. + // TODO: Warn if we are lopping anything off! // We need to make sure parsed_clump[variant] has an entry for each allele. // But the contents won't matter since this is an SV. 
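The trim_indels handling above drops the shared leading padding base that VCF puts on flat indel records, bumping the position so the edit starts on the first base that actually changes. A tiny sketch of that adjustment with a hypothetical allele record; field names are simplified relative to vcflib's VariantAllele.

#include <cstdint>
#include <iostream>
#include <string>

// Hypothetical flat allele: 1-based position, ref string, alt string, the way
// VCF writes indels with a shared leading padding base (e.g. pos 100, CCATG -> C).
struct ToyAllele {
    int64_t position;
    std::string ref;
    std::string alt;
};

// Drop the shared leading padding base, as the flat-alt path above does when
// trim_indels is set, so the edit begins where ref and alt first differ.
void toy_trim_padding_base(ToyAllele& a) {
    if (!a.ref.empty() && !a.alt.empty() && a.ref[0] == a.alt[0]) {
        a.ref = a.ref.substr(1);
        a.alt = a.alt.substr(1);
        ++a.position;
    }
}

int main() {
    ToyAllele del{100, "CCATG", "C"};  // a 4 bp deletion anchored on C
    toy_trim_padding_base(del);
    std::cout << del.position << " " << del.ref << " -> \"" << del.alt << "\"\n";  // 101 CATG -> ""
    return 0;
}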
parsed_clump[variant].resize(variant->alt.size()); + + // The bounds are determined symbolically + bounds = get_symbolic_bounds(*variant); } - // Get the variable bounds in VCF space for all the trimmed alts of this variant - // Note: we still want bounds for SVs, we just have to get them differently - std::pair bounds; - bounds = get_bounds(parsed_clump[variant]); - if (variant->is_symbolic_sv()){ - bounds = get_bounds(*variant); - } - - if (bounds.first != numeric_limits::max() || bounds.second != -1) { // There's a (possibly 0-length) variable region bounds.first -= chunk_offset; bounds.second -= chunk_offset; - // Save the bounds for making reference node path visits - // inside the ref allele of the variable region. - variable_bounds[variant] = bounds; + + if (alt_paths && bounds.second >= bounds.first) { + // The variant covers a nonempty part of the + // reference, and we will need to find it for alt + // path generation. + + // Save the bounds for making reference node path visits + // inside the ref allele of the variable region. + #ifdef debug + cerr << "Record ref interval of " << bounds.first << " to " << bounds.second << " for " << *variant << endl; + #endif + variable_intervals.push_back(IntervalTree::interval(bounds.first, bounds.second, variant)); + } #ifdef debug if (bounds.first < first_edit_start) { @@ -598,18 +871,45 @@ namespace vg { // Expand bounds for the variable region of the chunk as a whole first_edit_start = min(first_edit_start, bounds.first); last_edit_end = max(last_edit_end, bounds.second); - } + } else { + // There's no variation in this variant. + #pragma omp critical (cerr) + cerr << "warning:[vg::Constructor] Skipping variant with no sequence changes at " + << variant->sequenceName << ":" << variant->position << endl; + skipped.insert(variant);} + } + + if (skipped.size() == clump.size()) { + // We skipped all the variants in the clump. Kick back up + // to clump building. + clump.clear(); + clump_end = 0; + continue; } - // We have to have some non-ref material in the clump, even if it - // occupies 0 reference space. - assert(last_edit_end != -1); - assert(first_edit_start != numeric_limits::max()); + // Otherwise, we have to have some non-ref material in the + // clump, even if it occupies 0 reference space. + if (first_edit_start == numeric_limits::max() || last_edit_end == -1) { + // Sometimes we still see this, so make a report of the offending variants. + #pragma omp critical (cerr) + { + cerr << "error:[vg::Constructor] got improperly bounded region " << first_edit_start << " to " << last_edit_end << " for edits of clump of " << clump.size() << " variants, of which " << skipped.size() << " were skipped." << endl; + for (vcflib::Variant* v : clump) { + if (!skipped.count(v)) { + cerr << "Unskipped variant: " << *v << endl; + } + } + } + exit(1); + } #ifdef debug cerr << "edits run between " << first_edit_start << " and " << last_edit_end << endl; #endif + // Index the variants in the clump by the reference region they overlap + IntervalTree variable_interval_tree(std::move(variable_intervals)); + // Create ref nodes from the end of the last clump (where the cursor // is) to the start of this clump's interior non-ref content. add_reference_nodes_until(first_edit_start); @@ -625,7 +925,14 @@ namespace vg { // This holds on to variant ref paths, which we can't actually fill // in until all the variants in the clump have had their non-ref // paths done. 
- map variant_ref_paths; + unordered_map variant_ref_paths; + + // This holds alt Path pointers and the inversions (start, end) + // that they need to trace through in their inverted + // orientations. They can't be traced until the corresponding + // reference nodes have been made. This can be resolved at the + // clump level. + vector> inversion_trace_queue; for (auto& kv : variants_by_name) { // For each variant in the clump, sorted by name @@ -633,7 +940,8 @@ namespace vg { auto* variant = kv.second; #ifdef debug - cerr << "Process variant " << variant_name << " with " << parsed_clump[variant].size() << " alts" << endl; + cerr << "Process variant " << variant_name << " @ " << variant->zeroBasedPosition() + << " with " << parsed_clump[variant].size() << " alts" << endl; #endif if (alt_paths) { @@ -657,26 +965,47 @@ namespace vg { alt_path->set_name(alt_name); } - // SV HAX - if (this->do_svs && variant->has_sv_tags() && variant->canonical){ - - auto e_start = variant->zeroBasedPosition() - chunk_offset; - // TODO check index here, may or may not need to subtract 1 - auto e_end = variant->zeroBasedPosition() + abs(std::stol(variant->info.at("SVLEN")[0])) - chunk_offset - 1; - - // Make in between nodes by grabbing our sequence from the fasta(s), - // either from the reference (deletions) or from insertion sequences. - auto key = make_tuple(variant->zeroBasedPosition() - chunk_offset, variant->info.at("SVTYPE")[0], ""); - - string sv_type = variant->info.at("SVTYPE")[0]; - - if (variant->info.at("SVTYPE")[0] == "INS"){ + // SV HAX + if (this->do_svs && variant->hasSVTags() && variant->canonical) { + // This is an SV + + #ifdef debug + cerr << "Process alt " << (alt_index + 1) << " of variant " << variant_name << " as an SV" << endl; + #endif + + string sv_type = variant->info.at("SVTYPE").at(0); - // Create insertion sequence nodes - if (created_nodes.count(key) == 0){ - vector node_run = create_nodes(variant->info.at("SEQ")[0]); + if (sv_type == "INS") { + + // For an insertion, the created node will + // start at the next base after the base used + // to position the variant, just like we would + // do for a non-SV insertion with the same POS + // anchored on that base. + auto e_start = variant->zeroBasedPosition() - chunk_offset + 1; + + // The insertion will "end" at the position + // *before* that, so the things it is getting + // inserted before can link up with it. TODO: + // Respect the END tag if it says something + // different. + auto e_end = e_start - 1; + + auto inserted_sequence = variant->info.at("SEQ").at(alt_index); + + // Identify our created node run with a key, in + // case it exists already somehow. 
+ auto key = make_tuple(e_start, "", inserted_sequence); + if (created_nodes.count(key) == 0) { + // Create insertion sequence nodes + vector node_run = create_nodes(inserted_sequence); + #ifdef debug + cerr << "Inserted node " << node_run.front()->id() << " starts at " << e_start << endl; + cerr << "Inserted node " << node_run.back()->id() << " ends at " << e_end << endl; + #endif + nodes_starting_at[e_start].insert(node_run.front()->id()); nodes_ending_at[e_end].insert(node_run.back()->id()); @@ -694,41 +1023,63 @@ namespace vg { } } } - } - - else if (sv_type == "DEL"){ - if (created_nodes.count(key) == 0){ - - size_t arc_end = variant->zeroBasedPosition() - chunk_offset + std::stol(variant->info.at("SVLEN")[0]); + } else if (sv_type == "DEL") { + // Deletions also start after the base used + // to anchor them, so you can keep the same + // POS between SV and non-SV + // representations. The END is inclusive. int64_t arc_start = (int64_t) variant->zeroBasedPosition() - chunk_offset; + size_t arc_end = std::stol(variant->info.at("END").at(alt_index)) - chunk_offset - 1; + + #ifdef debug + cerr << "Deletion arc runs " << arc_start << " to " << arc_end << endl; + #endif deletions_ending_at[arc_end].insert(arc_start); deletion_starts.insert(arc_start); + + // No alt path mappings necessary for the + // deletion; the existence of the empty + // path is sufficient. + + } else if (sv_type == "INV"){ + // Handle inversions + // We only need reference nodes, plus two arcs + // one from the inverted sequence's beginning to the sequence following + // its last node and + // one from the end of the sequence preceding the inversion to the back + // of the inverted sequence's last node. + + // Inversions also require a left anchoring base, according to the spec. + // + // "If any of the ALT alleles is a symbolic + // allele (an angle-bracketed ID String “â€) + // then the padding base is required and POS + // denotes the coordinate of the base preceding + // the polymorphism." + // + + // The END is still inclusive. + + // Start at the anchoring base + int64_t inv_start = (int64_t) variant->zeroBasedPosition() - chunk_offset; + size_t inv_end = std::stol(variant->info.at("END").at(alt_index)) - chunk_offset - 1; + #ifdef debug + cerr << "Inversion arcs connect right of " << inv_start << " and right of " << inv_end << endl; + #endif + inversions_starting[inv_start].insert(inv_end); + inversions_ending[inv_end].insert(inv_start); + if (alt_paths) { - deletion_start_to_alt_name[arc_start] = alt_name; + // We need to make alt path entries through this inverted sequence, backward. + // But we don't have the reference nodes created yet. + // So we queue them up + inversion_trace_queue.emplace_back(alt_path, inv_start, inv_end); } - } - } - else if (sv_type == "INV"){ - // Handle inversions - // We only need reference nodes, plus two arcs - // one from the inverted sequence's beginning to the sequence following - // its last node and - // one from the end of the sequence preceding the inversion to the back - // of the inverted sequence's last node. 
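The INS, DEL, and INV handling in this hunk all rest on the VCF padding-base convention: POS names an untouched anchor base, the edited material starts one base later, and END is the last affected base. A hedged sketch of that coordinate arithmetic in chunk coordinates; the function and field names are simplified stand-ins for reading SVTYPE, SEQ, and END off the vcflib record.

#include <cstdint>
#include <iostream>
#include <string>

// Where a symbolic SV's edit attaches, in chunk coordinates, following the
// anchoring convention described above: zero_based_pos is the untouched anchor
// base and one_based_end is the last affected base (inclusive).
struct ToyArc {
    int64_t start;  // last untouched base on the left; the arc leaves its right side
    int64_t end;    // for DEL/INV: last affected base; for INS: start - 1
};

ToyArc toy_sv_coordinates(const std::string& svtype, int64_t zero_based_pos,
                          int64_t one_based_end, int64_t chunk_offset) {
    ToyArc arc;
    if (svtype == "INS") {
        // Inserted nodes begin right after the anchor base and "end" just
        // before it, so the following sequence can attach around them.
        arc.start = zero_based_pos - chunk_offset + 1;
        arc.end = arc.start - 1;
    } else {  // DEL or INV
        arc.start = zero_based_pos - chunk_offset;
        arc.end = one_based_end - chunk_offset - 1;
    }
    return arc;
}

int main() {
    // A deletion anchored at zero-based 99 (VCF POS 100) removing bases 100..104.
    ToyArc del = toy_sv_coordinates("DEL", 99, 105, 0);
    std::cout << del.start << " " << del.end << "\n";  // 99 104
    ToyArc ins = toy_sv_coordinates("INS", 99, 0, 0);
    std::cout << ins.start << " " << ins.end << "\n";  // 100 99
    return 0;
}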
- - size_t inv_end = variant->zeroBasedPosition() - chunk_offset + std::stol(variant->info.at("SVLEN")[0]); - int64_t inv_start = (int64_t) variant->zeroBasedPosition() - chunk_offset; - // inversion_starts[inv_start - 1].insert(inv_end); - // inversion_ends[inv_end + 1].insert(inv_start); - - inversion_starts[inv_start].insert(inv_end); - inversion_ends[inv_end].insert(inv_start); - } - else { - // Unknown or unsupported SV type + } else { + // Unknown or unsupported SV type cerr << "warning:[vg::Constructor]: unrecognized SV type " << sv_type << endl; } } else { @@ -765,6 +1116,16 @@ namespace vg { nodes_starting_at[edit_start].insert(node_run.front()->id()); nodes_ending_at[edit_end].insert(node_run.back()->id()); + if (edit.ref == edit.alt) { + // This edit is a no-op and so the node we just created is a reference run. + // These can be necessary if insertions and deletions are part of the same record. + // Remember the whole node run for inversion tracing + #ifdef debug + cerr << "Create ref run ending at " << edit_end << endl; + #endif + ref_runs_by_end[edit_end] = node_run; + } + // Save it in case any other alts also have this edit. created_nodes[key] = node_run; @@ -798,9 +1159,11 @@ namespace vg { // It's a deletion (and not a weird ""->"" edit). // Add an entry to the deletion arcs - // What is the past-the-end position (first non-deleted) - size_t arc_end = edit.position - 1 - chunk_offset + edit.ref.size(); - // What is the before-the-beginning position (last non-deleted, may be -1) + // What is the end position (last deleted) + // Take the 0-based edit position, remove the chunk offset, + // advance the ref bases, and then back up 1 base to the last deleted ref base. + size_t arc_end = (edit.position - 1) - chunk_offset + edit.ref.size() - 1; + // What is the before-the-beginning anchoring position (last non-deleted, may be -1) int64_t arc_start = (int64_t) edit.position - 1 - chunk_offset - 1; @@ -817,6 +1180,9 @@ namespace vg { // Remember that an arc comes from this base deletion_starts.insert(arc_start); + + // No alt path mappings necessary for the + // deletion } } @@ -830,24 +1196,38 @@ namespace vg { // come in or out. // We need a function to work that out - auto next_breakpoint_after = [&](size_t position) -> size_t { - // This returns the position of the base to the left of the next - // required breakpoint within this clump, after the given - // position, given created nodes and deletions that already - // exist. + /** + * Find the next breakpoint. + * + * Takes in the search position, like the position of the next + * un-made reference base. So searches from the left edge of + * the passed inclusive position. + * + * Finds the next required breakpoint within this clump, after + * the given position, given created nodes and deletions that + * already exist. + * + * Returns the inclusive position of the base to the left of + * this breakpoint, so the breakpoint is immediately to the + * right of the base at the returned position. This means that + * sometimes, such as if the next piece of the reference would + * be 1 bp long, this function will return the same value it + * was passed. + */ + auto next_breakpoint_after = [&](size_t position) -> size_t { // If nothing else, we're going to break at the end of the last // edit in the clump. 
size_t to_return = last_edit_end; #ifdef debug - cerr << "Next breakpoint must be at or before " << last_edit_end << endl; + cerr << "Next breakpoint after " << position << " must be at or before " << last_edit_end << endl; #endif // See if any nodes are registered as starting after our // position. They'll all start before the end of the clump, and // we don't care if they start at our position since that - // breakpoint already happened. + // breakpoint would be to the left and already happened. auto next_start_iter = nodes_starting_at.upper_bound(position); if(next_start_iter != nodes_starting_at.end()) { @@ -861,7 +1241,8 @@ namespace vg { // See if any nodes are registered as ending at or after our // position. We do care if they end at our position, since that - // means we need to break right here. + // means we need to break right here because that + // breakpoint would be to the right. auto next_end_iter = nodes_ending_at.lower_bound(position); if(next_end_iter != nodes_ending_at.end()) { @@ -873,24 +1254,27 @@ namespace vg { #endif } - // See if any deletions are registered as ending after here. - // Deletions break the reference before their past-the-end base, - // so we don't care about deletions ending here exactly. - auto deletion_end_iter = deletions_ending_at.upper_bound(position); + // See if any deletions are registered as ending at or + // after here. If the deletion ends here, this is the last + // base deleted, and that creates a breeakpoint to our + // right. + auto deletion_end_iter = deletions_ending_at.lower_bound(position); if(deletion_end_iter != deletions_ending_at.end()) { // If we found something, walk back where the breakpoint // needs to be so we break before the node after the // deletion starts. - to_return = min(to_return, deletion_end_iter->first - 1); + to_return = min(to_return, deletion_end_iter->first); #ifdef debug - cerr << "Next deletion ends at " << deletion_end_iter->first - 1 << endl; + cerr << "Next deletion ends by deleting " << deletion_end_iter->first << endl; #endif } - + // See if any deletions are known to start at or after this // base. We care about exact hits now, because deletions break - // after the base they start at. + // to the right of the base they start at, since we are + // storing the base that the deletion arc attaches to the + // right side of. auto deletion_start_iter = deletion_starts.lower_bound(position); // We don't need to worry about -1s here. They won't be found // with lower_bound on a size_t. @@ -901,25 +1285,32 @@ namespace vg { // needs to leave from. to_return = min(to_return, (size_t)*deletion_start_iter); #ifdef debug - cerr << "Next deletion starts at " << *deletion_start_iter << endl; + cerr << "Next deletion starts after " << *deletion_start_iter << endl; #endif } - // Check to see if any inversions happen past this point + // Check to see if any inversions' last (largest + // coordinate) inverted bases are at or after this point // Inversions break the reference twice, much like deletions. - auto inv_end_iter = inversion_ends.upper_bound(position); - if (inv_end_iter != inversion_ends.end()){ - to_return = min(to_return, (size_t) inv_end_iter->first - 1); + // Since we store the final base that is inverted, and the + // inversion creates a breakpoint on the right side of that + // base, we care about exact hits. 
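// Illustrative sketch (not in the original patch): an inversion record
// anchored at base 10 that inverts bases 11..20 was recorded above as
// inversions_starting[10] containing 20 and inversions_ending[20] containing
// 10, so it contributes candidate breakpoints just to the right of base 10
// and just to the right of base 20, which the lookups below find with
// lower_bound because exact hits matter.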
+ auto inv_end_iter = inversions_ending.lower_bound(position); + if (inv_end_iter != inversions_ending.end()){ + to_return = min(to_return, (size_t) inv_end_iter->first); #ifdef debug - cerr << "Next inversion ends at " << inv_end_iter->first - 1 << endl; + cerr << "Next inversion ends after inverting " << inv_end_iter->first << endl; #endif } - auto inv_start_iter = inversion_starts.lower_bound(position); - if (inv_start_iter != inversion_starts.end()){ + // Also look at inversions' first (smallest coordinate) bases. + // Inversions break just after the anchor the base they start at, + // so we care about exact hits and use lower_bound. + auto inv_start_iter = inversions_starting.lower_bound(position); + if (inv_start_iter != inversions_starting.end()){ to_return = min(to_return, (size_t) inv_start_iter->first); #ifdef debug - cerr << "Next inversion starts at " << inv_start_iter->first << endl; + cerr << "Next inversion starts after " << inv_start_iter->first << endl; #endif } @@ -948,8 +1339,20 @@ namespace vg { << next_end << "/" << reference_sequence.size() << endl; #endif - assert(reference_cursor <= reference_sequence.size()); - assert(next_end <= reference_sequence.size()); + if (reference_cursor > reference_sequence.size()) { + #pragma omp critical (cerr) + cerr << "error:[vg::Constructor] On " << reference_path_name + << ", adding reference to last edit end, reference cursor is at " << reference_cursor + << " but reference is only " << reference_sequence.size() << " long" << endl; + exit(1); + } + if (next_end > reference_sequence.size()) { + #pragma omp critical (cerr) + cerr << "error:[vg::Constructor] On " << reference_path_name + << ", adding reference to last edit end, next end is at " << next_end + << " but reference is only " << reference_sequence.size() << " long" << endl; + exit(1); + } // We need to have a reference node/run of nodes (which may have // already been created by a reference match) between where the @@ -959,48 +1362,64 @@ namespace vg { // We need a key to see if a node (run) has been made for this sequece already auto key = make_tuple(reference_cursor, run_sequence, run_sequence); - - if (created_nodes.count(key) == 0) { + auto representative_nodes = created_nodes.find(key); + if (representative_nodes == created_nodes.end()) { // We don't have a run of ref nodes up to the next break, so make one vector node_run = create_nodes(run_sequence); // Remember where the first one starts and the last one ends, for wiring up later. nodes_starting_at[reference_cursor].insert(node_run.front()->id()); nodes_ending_at[next_end].insert(node_run.back()->id()); + + // Remember the whole node run for inversion tracing + #ifdef debug + cerr << "Create ref run ending at " << next_end << endl; + #endif + ref_runs_by_end[next_end] = node_run; + #ifdef debug cerr << "Created reference nodes running " << reference_cursor << " to " << next_end << endl; #endif // Save it in case any other alts also have this edit. 
- created_nodes[key] = node_run; + representative_nodes = created_nodes.insert(representative_nodes, {key, node_run}); + } else { +#ifdef debug + cerr << "Reference nodes at " << reference_cursor << " for constant " << run_sequence.size() << " bp sequence " << run_sequence << " already exist" << endl; +#endif } - for (Node* node : created_nodes[key]) { + for (Node* node : representative_nodes->second) { // Add a reference visit to each node we created/found add_match(ref_path, node); + } - if (alt_paths) { - for (vcflib::Variant* variant : clump) { + if (!representative_nodes->second.empty() && alt_paths) { + // Ref paths from other variants may need to visit these new nodes. + auto overlapping_intervals = variable_interval_tree.findOverlapping(reference_cursor, reference_cursor); + #ifdef debug + cerr << "Found " << overlapping_intervals.size() << " potential overlapping variants in clump at " << reference_cursor << endl; + #endif + for (auto& interval : overlapping_intervals) { + if (interval.start <= reference_cursor && interval.stop >= reference_cursor && !skipped.count(interval.value)) { // For each variant we might also be part of the ref allele of - if (!duplicates.count(variant) && - variable_bounds.count(variant) && - reference_cursor >= variable_bounds[variant].first && - reference_cursor <= variable_bounds[variant].second) { - // For unique variants that actually differ from reference, - // if this run of nodes starts within the variant's variable region... - // (We know if it starts in the variable region it has to - // end in the variant, because the variable region ends with - // a node break) - - if (variant_ref_paths.count(variant) == 0) { - // All unique variants ought to have a ref path created - cerr << "error:[vg::Constructor] no ref path for " << *variant << endl; - exit(1); - } + // For unique variants that actually differ from reference, + // if this run of nodes starts within the variant's variable region... + // (We know if it starts in the variable region it has to + // end in the variant, because the variable region ends with + // a node break) + + if (variant_ref_paths.count(interval.value) == 0) { + // All unique variants ought to have a ref path created + cerr << "error:[vg::Constructor] no ref path for " << *interval.value << endl; + exit(1); + } + + for (Node* node : representative_nodes->second) { // Add a match along the variant's ref allele path - add_match(variant_ref_paths[variant], node); + add_match(variant_ref_paths[interval.value], node); } } } @@ -1009,13 +1428,72 @@ namespace vg { // Advance the reference cursor to after this run of reference nodes reference_cursor = next_end + 1; - assert(reference_cursor <= reference_sequence.size()); - + if (reference_cursor > reference_sequence.size()) { + #pragma omp critical (cerr) + cerr << "error:[vg::Constructor] On " << reference_path_name + << ", after adding reference to last edit end, reference cursor is at " << reference_cursor + << " but reference is only " << reference_sequence.size() << " long" << endl; + exit(1); + } + // Keep going until we have created reference nodes through to // the end of the clump's interior edits. } // Now we have gotten through all the places where nodes start, before the end of the clump. + + for (auto& to_trace : inversion_trace_queue) { + // Now that all the ref nodes exist, create the path entries for inversion alt paths. 
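// Illustrative sketch (not in the original patch; the coordinates are made
// up): if the inversion is anchored at 10 and inverts bases 11..20, and the
// reference there was built as one run of nodes covering 11..16 (ending at
// 16) and another covering 17..20 (ending at 20), the cursor starts at 20,
// walks the second run's nodes in reverse adding reverse-orientation matches,
// drops to 16, walks the first run in reverse, and stops at 10 == inv_start,
// which is what the consistency check at the end of this loop verifies.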
+ auto& alt_path = get<0>(to_trace); + auto& inv_start = get<1>(to_trace); + auto& inv_end = get<2>(to_trace); + + // We will walk this cursor back from the end of the + // inversion to the start, going backward through runs of + // reference nodes that end here. + // It will track the first base from right to left that we have yet to cover with our path. + // Our inversion end is inclusive and that base is inverted, so start past there. + int64_t inv_end_cursor = inv_end; + + + while (inv_end_cursor > inv_start) { + #ifdef debug + cerr << "Inversion cursor at " << inv_end_cursor << endl; + #endif + + // Get the next ref run on the right that the inversion has to visit + auto& trailing_run = ref_runs_by_end.at(inv_end_cursor); + + for (auto it = trailing_run.rbegin(); it != trailing_run.rend(); it++) { + // For each node in the run in reverse order + Node* node = *it; + + #ifdef debug + cerr << "Reverse node " << node->id() << endl; + #endif + + // Add a match to this node in its reverse orientation, since we are inverting. + add_match(alt_path, node, true); + + // Advance the cursor left after visiting the node + inv_end_cursor -= node->sequence().size(); + } + } + + #ifdef debug + cerr << "Added inversion alt path from " << inv_end << " back to " << inv_start << " and arrived at " + << inv_end_cursor << endl; + #endif + + if (inv_end_cursor != inv_start) { + // Make sure we did it right + #pragma omp critical (cerr) + cerr << "error:[vg::Constructor] On " << reference_path_name << " near " << reference_cursor + << ", inversion end cursor " << inv_end_cursor << " did not reach inversion start " << inv_start << endl; + exit(1); + } + + } // Now the clump is handled clump.clear(); @@ -1040,6 +1518,10 @@ namespace vg { // These are nodes that start somewhere else. for (auto& right_node : kv.second) { // For every node that could occur here + +#ifdef debug + cerr << "Node " << right_node << " can start at " << kv.first << endl; +#endif for (auto& left_node : nodes_ending_at[kv.first - 1]) { // For every node that could come before these nodes @@ -1072,7 +1554,7 @@ namespace vg { // We also keep a list of unexplored deletion end points to chain from. list possible_ends; - possible_ends.push_back(kv.first); + possible_ends.push_back(kv.first - 1); // And a set of explored ones set explored_ends; @@ -1083,6 +1565,10 @@ namespace vg { int64_t deletion_end = possible_ends.front(); possible_ends.pop_front(); +#ifdef debug + cerr << deletions_ending_at[deletion_end].size() << " deletions end by deleting " << deletion_end << endl; +#endif + for (auto& deletion_start : deletions_ending_at[deletion_end]) { // For every deletion start that can end there. @@ -1090,9 +1576,9 @@ namespace vg { // transitive deletions. possible_starts.insert(deletion_start); - // We can daisy chain from deletions that end at the - // base after this deletion starts. - int64_t possible_end = deletion_start + 1; + // We can daisy chain from deletions that end by + // deleting the anchor base that this deletion starts at. + int64_t possible_end = deletion_start; if(chain_deletions && possible_end > 0 && !explored_ends.count(possible_end)) { // Queue it up if not already queued. If we aren't @@ -1116,7 +1602,9 @@ namespace vg { // but actually starts at a place where there are nodes. 
for (auto& left_node : nodes_ending_at[deletion_start]) { - // For every node that the deletion could start with + // For every node that the deletion could + // anchor from (because they end exactly where + // it starts with its anchor) if (inserts.count(left_node)) { // Don't let an inserted node happen just before a deletion. @@ -1138,28 +1626,86 @@ namespace vg { } } } + + #ifdef debug + for (auto& kv2 : inversions_starting) { + cerr << "Inversion can start after " << kv2.first << endl; + } + for (auto& kv2 : inversions_ending) { + cerr << "Inversion can end by inverting " << kv2.first << endl; + } + #endif - for (auto& inv_end : inversion_starts[kv.first]){ - for (auto& n : nodes_starting_at[inv_end]){ - auto* e = to_return.graph.add_edge(); - e->set_from(right_node); - e->set_to(n); - e->set_from_start(true); - e->set_to_end(false); + // Now do the inversions. + + // What do we hook up to the start of right_node, which starts at kv.first? + // For any inversions that end by inverting kv.first - 1, we hook up the starts of anything where the inversion started. + if (inversions_ending.count(kv.first - 1)) { + for (auto& inv_start : inversions_ending[kv.first - 1]) { + // For each inversion start position corresponding to inversions ending by inverting this base + +#ifdef debug + cerr << "Inversion ending by inverting " << kv.first - 1 << " is anchored at " << inv_start + << " after which " << nodes_starting_at[inv_start + 1].size() << " nodes start" << endl; +#endif + + for (auto& n : nodes_starting_at[inv_start + 1]) { +#ifdef debug + cerr << "Node " << n << " can start at " << (inv_start + 1) << " where inversion first inverts. " + << "So link its start to our right_node's start." << endl; +#endif + + // For each node that starts at the inversion start position, link it up inverted. + auto* e = to_return.graph.add_edge(); + e->set_from(n); + e->set_from_start(true); + e->set_to(right_node); + e->set_to_end(false); + +#ifdef debug + cerr << "Invert " << n << " to " << right_node << endl; +#endif + } } } - for (auto& inv_start : inversion_ends[kv.first]){ - for (auto& n : nodes_ending_at[inv_start]){ - auto* e = to_return.graph.add_edge(); - e->set_from(n); - e->set_to(right_node); - e->set_to_end(true); - e->set_from_start(false); + + + } + + // Inversions continue with another loop over the left nodes + for (auto& left_node : nodes_ending_at[kv.first - 1]) { + + // What do we hook up to the end of left_node, which ends right before kv.first? + // For any inversions anchoring where this node ends, we hook up the ends of everything that is at where the inversion ends. + + if (inversions_starting.count(kv.first - 1)) { + for (auto& inv_end : inversions_starting[kv.first - 1]) { + // For each inversion end position corresponding to inversions starting by inverting this base + +#ifdef debug + cerr << "Inversion starting by inverting " << kv.first << " and anchored at " << (kv.first - 1) + << " can end at " << inv_end << " where " << nodes_ending_at[inv_end].size() << " nodes end" << endl; +#endif + + for (auto& n : nodes_ending_at[inv_end]) { +#ifdef debug + cerr << "Node " << n << " can end at " << inv_end << " where inversion does. " + << "So link its end to " << left_node << "'s end at anchor point " << (kv.first - 1) << endl; +#endif + + // For each node that ends at that inversion end position, link it up inverted. 
+ auto* e = to_return.graph.add_edge(); + e->set_from(left_node); + e->set_from_start(false); + e->set_to(n); + e->set_to_end(true); + +#ifdef debug + cerr << "Invert " << left_node << " to " << n << endl; +#endif + } } } - - - } } } @@ -1171,7 +1717,7 @@ namespace vg { to_return.right_ends.insert(node_id); } - for(auto& deletion_start : deletions_ending_at[reference_sequence.size()]) { + for(auto& deletion_start : deletions_ending_at[reference_sequence.size() - 1]) { // Also add in nodes at the starts of deletions that go to the end of the chunk if(deletion_start == -1) { @@ -1190,12 +1736,31 @@ namespace vg { // Remember to tell the caller how many IDs we used to_return.max_id = next_id - 1; - + + // Filter out any empty variant Paths. + // First stop pointing to them. + max_rank.clear(); + // We have this many nonempty paths at the start of the collection at + // the start of every loop. + size_t nonempty_paths = 0; + while (nonempty_paths < to_return.graph.path_size()) { + if (to_return.graph.path(nonempty_paths).mapping_size() == 0) { + // This is empty of mappings so swap it to the end and remove it. + to_return.graph.mutable_path()->SwapElements(nonempty_paths, to_return.graph.path_size() - 1); + to_return.graph.mutable_path()->RemoveLast(); + // Leave our cursor where it is; we have to check the element we swapped to here. + } else { + // This is nonempty so advance the cursor. + nonempty_paths++; + } + } + + clear_crash_context(); return to_return; } void Constructor::construct_graph(string vcf_contig, FastaReference& reference, VcfBuffer& variant_source, - const vector& insertions, function callback) { + const vector& insertions, const function& callback) { // Our caller will set up indexing. We just work with the buffered source that we pull variants from. @@ -1232,13 +1797,13 @@ namespace vg { while(variant_source.get() && (variant_source.get()->sequenceName != vcf_contig || variant_source.get()->zeroBasedPosition() < leading_offset || variant_source.get()->zeroBasedPosition() + variant_source.get()->ref.size() > reference_end)) { - // This variant comes before our region + // This variant comes before or ends after our region - // Discard variants that come out that are before our region + // Discard variants that come out that are outside our region variant_source.handle_buffer(); variant_source.fill_buffer(); } - + // Now we're on the variants we actually want. // This is where the next chunk will start in the reference sequence. @@ -1258,17 +1823,24 @@ namespace vg { // max rank used? size_t max_ref_rank = 0; - // Whenever a chunk ends with a single node, we separate it out and buffer - // it here, because we may need to glue it together with subsequent leading - // nodes that were broken by a chunk boundary. + // Whenever a chunk ends with a single node, with no edges to the end + // or non-reference paths, we separate it out and buffer it here, + // because we may need to glue it together with subsequent leading + // nodes with no edges to the start or non-reference paths, to + // eliminate spurious node breaks at chunk boundaries. 
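// Illustrative note (not in the original patch): for example, if one chunk
// ends in a plain reference node and the next chunk begins with one, the two
// would otherwise be emitted as separate nodes purely because of where the
// chunk boundary fell; buffering the trailing node here lets wire_and_emit()
// below merge it with the next chunk's single head node when neither side has
// extra edges or non-reference path visits.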
Node last_node_buffer; // Sometimes we need to emit single node reference chunks gluing things // together auto emit_reference_node = [&](Node& node) { - // Don't emit nonexistent nodes - assert(node.id() != 0); + if (node.id() == 0) { + // Don't emit nonexistent nodes + #pragma omp critical (cerr) + cerr << "error:[vg::Constructor] On " << vcf_contig << " near " << chunk_start + << ", tried to produce a reference node without an ID" << endl; + exit(1); + } // Make a single node chunk for the node Graph chunk; @@ -1296,10 +1868,87 @@ namespace vg { // Modifies the chunk in place. auto wire_and_emit = [&](ConstructedChunk& chunk) { // When each chunk comes back: + + // If we have a single head or tail with no outer edges or + // non-reference paths, we will fill in the ID here. + nid_t head_id = 0; + nid_t tail_id = 0; + + if (last_node_buffer.id() != 0 && chunk.left_ends.size() == 1) { + // We have a single dangling node buffered that we can rewrite. + // So we also want to know if we have a head to combine it with. + // And it looks like we might. + head_id = *chunk.left_ends.begin(); + #ifdef debug + cerr << "Node " << head_id << " might be mergable with buffered node " << last_node_buffer.id() << endl; + #endif + } - if (chunk.left_ends.size() == 1 && last_node_buffer.id() != 0) { + if (chunk.right_ends.size() == 1) { + // We always need to know if we can buffer the tail. + // Name this node a candidate tail. + tail_id = *chunk.right_ends.begin(); + #ifdef debug + cerr << "Node " << tail_id << " might be a tail we can buffer" << endl; + #endif + } + + for (auto& edge : chunk.graph.edge()) { + // Go through all edges and kick out head and tail candidates if they have any on the outside. + if (head_id && ((edge.from() == head_id && edge.from_start()) || (edge.to() == head_id && !edge.to_end()))) { + // Edge connects to the start of the head candidate, so it fails. + #ifdef debug + cerr << "Node " << head_id << " has an edge to its left and so can't merge" << endl; + #endif + head_id = 0; + } + if (tail_id && ((edge.from() == tail_id && !edge.from_start()) || (edge.to() == tail_id && edge.to_end()))) { + // Edge connects to the end of the tail candidate, so it fails. + #ifdef debug + cerr << "Node " << tail_id << " has an edge to its right and so can't merge" << endl; + #endif + tail_id = 0; + } + } + + for (size_t i = 1; (head_id != 0 || tail_id != 0) && i < chunk.graph.path_size(); i++) { + // Go through all paths other than the reference (which is 0) + auto& path = chunk.graph.path(i); + + // Check the first and last steps on the path to see if they + // touch our head/tail nodes. Other steps can't touch them + // because of the edge restrictions we already checked. + auto check_mapping = [&](size_t mapping_index) { + nid_t touched_node = path.mapping(mapping_index).position().node_id(); + if (touched_node == head_id) { + #ifdef debug + cerr << "Node " << head_id << " is visited by path " << path.name() << " and so can't merge" << endl; + #endif + head_id = 0; + } + if (touched_node == tail_id) { + #ifdef debug + cerr << "Node " << tail_id << " is visited by path " << path.name() << " and so can't merge" << endl; + #endif + tail_id = 0; + } + }; + + // Sometimes the first and last step are the same step! 
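// Note: with a single-mapping path, only check_mapping(0) runs below, so the
// shared first-and-last step is checked once rather than twice.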
+ if (path.mapping_size() > 0) { + // We have a first step + check_mapping(0); + if (path.mapping_size() > 1) { + // We have a distinct last step + check_mapping(path.mapping_size() - 1); + } + } + } + + + if (last_node_buffer.id() != 0 && head_id != 0) { // We have a last node from the last chunk that we want to glom onto - // this chunk. + // this chunk, and we have a node to do it with. // We want to merge it with the single source node for this // chunk. But depending on the variant structure it may not be @@ -1312,23 +1961,25 @@ namespace vg { // graph size anyway, and we never have to scan through more // than a variant's worth of nodes. - // This is the node we want - auto wanted_id = *chunk.left_ends.begin(); - // We will fill this in Node* mutable_first_node = nullptr; for (size_t i = 0; i < chunk.graph.node_size(); i++) { // Look at each node in turn mutable_first_node = chunk.graph.mutable_node(i); - if (mutable_first_node->id() == wanted_id) { + if (mutable_first_node->id() == head_id) { // We found the left end we want break; } } - // Make sure we found it - assert(mutable_first_node != nullptr && mutable_first_node->id() == wanted_id); + if (mutable_first_node == nullptr || mutable_first_node->id() != head_id) { + // Make sure we found it + #pragma omp critical (cerr) + cerr << "error:[vg::Constructor] On " << reference_contig + << ", could not find node " << head_id << endl; + exit(1); + } // Combine the sequences for the two nodes string combined_sequence = last_node_buffer.sequence() + mutable_first_node->sequence(); @@ -1366,19 +2017,37 @@ namespace vg { // Update the mapping lengths on the mutable first node. // First we find the primary path Path* path = chunk.graph.mutable_path(0); - assert(path->name() == reference_contig); + if (path->name() != reference_contig) { + #pragma omp critical (cerr) + cerr << "error:[vg::Constructor] Expected path " << reference_contig + << " but found path " << path->name() << endl; + exit(1); + } // Then the first mapping Mapping* mapping = path->mutable_mapping(0); - assert(mapping->position().node_id() == mutable_first_node->id()); - assert(mapping->edit_size() == 1); + if (mapping->position().node_id() != mutable_first_node->id()) { + #pragma omp critical (cerr) + cerr << "error:[vg::Constructor] On " << reference_contig + << ", expected node " << mutable_first_node->id() + << " but found node " << mapping->position().node_id() << endl; + exit(1); + } + if (mapping->edit_size() != 1) { + #pragma omp critical (cerr) + cerr << "error:[vg::Constructor] On " << reference_contig + << " at node " << mapping->position().node_id() + << ", expected 1 edit but found " << mapping->edit_size() << endl; + exit(1); + } // Then the only edit Edit* edit = mapping->mutable_edit(0); // Correct its length edit->set_from_length(mutable_first_node->sequence().size()); edit->set_to_length(mutable_first_node->sequence().size()); } else if (last_node_buffer.id() != 0) { - // There's no single leading node on this next chunk, but we still - // have a single trailing node to emit. + // There's no single leading node on this next chunk that we + // are free to rewrite, but we still have a single trailing + // node to emit. // Emit it emit_reference_node(last_node_buffer); @@ -1386,29 +2055,53 @@ namespace vg { last_node_buffer = Node(); } - if (chunk.right_ends.size() == 1) { + if (tail_id != 0) { // We need to pull out the last node in the chunk. Note that it may // also be the first node in the chunk... 
// We know it's the last node in the graph last_node_buffer = chunk.graph.node(chunk.graph.node_size() - 1); - - - assert(chunk.right_ends.count(last_node_buffer.id())); + + if (last_node_buffer.id() != tail_id) { + #pragma omp critical (cerr) + cerr << "error:[vg::Constructor] On " << reference_contig + << ", could not find right end for node " << last_node_buffer.id() << endl; + exit(1); + } // Remove it chunk.graph.mutable_node()->RemoveLast(); // Find the primary path Path* path = chunk.graph.mutable_path(0); - assert(path->name() == reference_contig); + if (path->name() != reference_contig) { + #pragma omp critical (cerr) + cerr << "error:[vg::Constructor] Expected path " << reference_contig + << " but found path " << path->name() << endl; + exit(1); + } // Then drop last mapping, which has to be to this node - assert(path->mapping_size() > 0); - assert(path->mapping(path->mapping_size() - 1).position().node_id() == last_node_buffer.id()); + if (path->mapping_size() == 0) { + #pragma omp critical (cerr) + cerr << "error:[vg::Constructor] On " << reference_contig + << ", found empty path" << endl; + exit(1); + } + if (path->mapping(path->mapping_size() - 1).position().node_id() != last_node_buffer.id()) { + #pragma omp critical (cerr) + cerr << "error:[vg::Constructor] On " << reference_contig + << ", expected last node" << last_node_buffer.id() + << " but found " << path->mapping(path->mapping_size() - 1).position().node_id() << endl; + exit(1); + } path->mutable_mapping()->RemoveLast(); // Update its ID separately, since it's no longer in the graph. last_node_buffer.set_id(last_node_buffer.id() + max_id); + + #ifdef debug + cerr << "Buffered final node becomes: " << last_node_buffer.id() << endl; + #endif } // Up all the IDs in the graph @@ -1486,7 +2179,8 @@ namespace vg { // if we have more than one insertion fasta file, we can pull // sequences from the vcf:fasta pair (i.e. the same index in the vectors). do_external_insertions = true; - cerr << "Passing multiple insertion files not implemented yet." << endl << "Please try combining all of your insertions fastas into one file." << endl; + cerr << "Passing multiple insertion files not implemented yet." << endl + << "Please try combining all of your insertions fastas into one file." << endl; exit(1); } else{ @@ -1494,76 +2188,134 @@ namespace vg { // those with seqs in the vcf. } + + #ifdef debug + cerr << "Handling run of variants starting in region..." << endl; + #endif while (variant_source.get() && variant_source.get()->sequenceName == vcf_contig && - variant_source.get()->zeroBasedPosition() >= leading_offset && - variant_source.get()->zeroBasedPosition() + variant_source.get()->ref.size() <= reference_end) { - + variant_source.get()->zeroBasedPosition() >= leading_offset && + variant_source.get()->zeroBasedPosition() <= reference_end) { + + // For each variant that begins inside our region + + // Skip variants that don't fit in our range + // (maybe there's one that does fit after, so we continue checking) + if (variant_source.get()->zeroBasedPosition() + variant_source.get()->ref.size() > reference_end) { + variant_source.handle_buffer(); + variant_source.fill_buffer(); + continue; + } + // While we have variants we want to include auto vvar = variant_source.get(); + // Fix the variant's canonical flag being uninitialized. + vvar->canonical = false; // We need to decide if we want to use this variant. By default we will use all variants. 
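// Summary of the checks that follow (illustrative, not in the original
// patch): the variant is skipped if it has no alt alleles, if any alt is ".",
// if it is a symbolic SV while SV handling (do_svs) is off, if it is a
// multiallelic symbolic SV, if vcflib cannot canonicalize it to base-level
// sequence, or if its symbolic END lands more than one base before its start
// (inserts legitimately have the end one base before the start).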
bool variant_acceptable = true; - if (vvar->is_symbolic_sv() && this->do_svs) { - // Canonicalize the variant and see if that disqualifies it. - // This also takes care of setting the variant's insertion sequences. - variant_acceptable = vvar->canonicalize(reference, insertions, true); - - if (variant_acceptable) { - // Worth checking for multiple alts. - if (vvar->alt.size() > 1) { - // We can't handle multiallelic SVs yet. - #pragma omp critical (cerr) - cerr << "warning:[vg::Constructor] Unsupported multiallelic SV being skipped: " << *vvar << endl; - variant_acceptable = false; - } - } - - if (variant_acceptable) { - // Worth checking for bounds problems. - // We have seen VCFs where the variant positions are on GRCh38 but the END INFO tags are on GRCh37. - auto bounds = get_bounds(*vvar); - if (bounds.second < bounds.first) { - #pragma omp critical (cerr) - cerr << "warning:[vg::Constructor] SV with end position before start being skipped (check liftover?): " - << *vvar << endl; + if (vvar->alt.empty()) { + // Variants with no alts are unimportant + variant_acceptable = false; + } else { + for (string& alt : vvar->alt) { + // Variants with "." alts can't be processsed. + // TODO: handle remaining alts. + if (alt == ".") { variant_acceptable = false; + if (vvar->alt.size() > 1) { + // Warn if there are more alts we will miss because of skipping + #pragma omp critical (cerr) + cerr << "warning:[vg::Constructor] Variant with '.' among multiple alts being skipped: " + << *vvar << endl; + } + break; } } } - for (string& alt : vvar->alt) { - // Validate each alt of the variant - - if(!allATGCN(alt)) { - // It may be a symbolic allele or something. Skip this variant. - variant_acceptable = false; - if (this->do_svs && vvar->is_symbolic_sv() && vvar->canonicalizable()){ - // Only try to normalize SVs if we want to handle SVs, - // the variant is symbolic (i.e. no ref/alts) and the variant - // can be canonicalized (it has at least a type and a length) - variant_acceptable = vvar->canonicalize(reference, insertions, true); - } - else{ - #pragma omp critical (cerr) - { - bool warn = true; - if (!alt.empty() && alt[0] == '<' && alt[alt.size()-1] == '>') { - if (symbolic_allele_warnings.find(alt) != symbolic_allele_warnings.end()) { - warn = false; - } else { - symbolic_allele_warnings.insert(alt); + if (variant_acceptable) { + if (vvar->isSymbolicSV()) { + // We have a symbolic not-all-filled-in alt. + // We need to be processed as a symbolic SV + // It also might just have IUPAC codes in it and an SVTYPE and thus look symbolic. + + if (this->do_svs) { + // We are actually going to try to handle this SV. + + if (vvar->alt.size() > 1) { + // vcflib will refuse to canonicalize multi-alt SVs. + #pragma omp critical (cerr) + { + if (!multiallelic_sv_warned) { + cerr << "warning:[vg::Constructor] Multiallelic SVs cannot be canonicalized by vcflib; skipping variants like: " << *vvar << endl; + multiallelic_sv_warned = true; } } - if (warn) { - cerr << "warning:[vg::Constructor] Unsupported variant allele \"" << alt << "\"; Skipping variant(s) " << *vvar <<" !" << endl; + variant_acceptable = false; + } + + if (variant_acceptable) { + // Canonicalize the variant and see if that disqualifies it. + // This also takes care of setting the variant's alt sequences. 
+ variant_acceptable = vvar->canonicalizable() && vvar->canonicalize(reference, insertions, true); + if (!variant_acceptable) { + #pragma omp critical (cerr) + { + if (!uncanonicalizable_sv_warned) { + cerr << "warning:[vg::Constructor] vcflib could not canonicalize some SVs to base-level sequence; skipping variants like: " << *vvar << endl; + uncanonicalizable_sv_warned = true; + } + } + } + } + + if (variant_acceptable) { + // Worth checking for bounds problems. + // We have seen VCFs where the variant positions are on GRCh38 but the END INFO tags are on GRCh37. + // But for inserts the bounds will have the end right before the start, so we have to allow for those. + auto bounds = get_symbolic_bounds(*vvar); + if (bounds.second + 1 < bounds.first) { + #pragma omp critical (cerr) + cerr << "warning:[vg::Constructor] SV with end position " << bounds.second + << " significantly before start " << bounds.first << " being skipped (check liftover?): " + << *vvar << endl; + variant_acceptable = false; + } + } + } else { + // SV handling is off. + variant_acceptable = false; + + // Figure out exactly what to complain about. + for (string& alt : vvar->alt) { + // Validate each alt of the variant + + if(!allATGCN(alt)) { + // This is our problem alt here. + // Either it's a symbolic alt or it is somehow lower case or something. + // It could be an IUPAC code, which we can't handle usually. + #pragma omp critical (cerr) + { + bool warn = true; + if (!alt.empty() && alt[0] == '<' && alt[alt.size()-1] == '>') { + if (symbolic_allele_warnings.find(alt) != symbolic_allele_warnings.end()) { + warn = false; + } else { + symbolic_allele_warnings.insert(alt); + } + } + if (warn) { + cerr << "warning:[vg::Constructor] Unsupported variant allele \"" + << alt << "\"; skipping variants like: " << *vvar <<" !" << endl; + } + } + break; } } - break; } - } } @@ -1631,6 +2383,10 @@ namespace vg { } } + #ifdef debug + cerr << "Variants in region depleted, which we know because we found a starting-after-region variant." << endl; + #endif + // We ran out of variants, so finish this chunk and all the others after it // without looking for variants. // TODO: unify with above loop? @@ -1675,14 +2431,18 @@ namespace vg { void Constructor::construct_graph(const vector& references, const vector& variant_files, const vector& insertions, - function callback) { + const function& callback) { // Make a map from contig name to fasta reference containing it. 
map reference_for; for (size_t i = 0; i < references.size(); i++) { // For every FASTA reference, make sure it has an index auto* reference = references[i]; - assert(reference->index); + if (!reference->index) { + #pragma omp critical (cerr) + cerr << "error:[vg::Constructor] Reference #" << i << " is missing its index" << endl; + exit(1); + } for (auto& kv : *(reference->index)) { // For every sequence name and index entry, point to this reference reference_for[kv.first] = reference; @@ -1718,7 +2478,10 @@ namespace vg { #endif // Also the FASTA reference that has that sequence - assert(reference_for.count(fasta_name)); + if (!reference_for.count(fasta_name)) { + cerr << "[vg::Constructor] Error: \"" << fasta_name << "\" not found in fasta file" <& reference_filenames, const vector& variant_filenames, + const vector& insertion_filenames, const function& callback) { + + vector> references; + for (auto& fasta_filename : reference_filenames) { + // Open each FASTA file + FastaReference* reference = new FastaReference(); + references.emplace_back(reference); + reference->open(fasta_filename); + } + + vector> variant_files; + for (auto& vcf_filename : variant_filenames) { + // Make sure each VCF file exists. Otherwise Tabix++ may exit with a non- + // helpful message. + + // We can't invoke stat woithout a place for it to write. But all we + // really want is its return value. + struct stat temp; + if(stat(vcf_filename.c_str(), &temp)) { + cerr << "error:[Constructor::construct_graph] file \"" << vcf_filename << "\" not found" << endl; + exit(1); + } + vcflib::VariantCallFile* variant_file = new vcflib::VariantCallFile(); + variant_file->parseSamples = false; // Major speedup if there are many samples. + variant_files.emplace_back(variant_file); + // TODO: vcflib needs a non-const string for the filename for some reason. Fix that. + string mutable_filename = vcf_filename; + variant_file->open(mutable_filename); + if (!variant_file->is_open()) { + cerr << "error:[Constructor::construct_graph] could not open" << vcf_filename << endl; + exit(1); + } + } + + vector> insertions; + for (auto& insertion_filename : insertion_filenames){ + // Open up those insertion files + FastaReference* insertion = new FastaReference(); + insertions.emplace_back(insertion); + insertion->open(insertion_filename); + } + + // Make vectors of just bare pointers + vector vcf_pointers; + for(auto& vcf : variant_files) { + vcf_pointers.push_back(vcf.get()); + } + vector fasta_pointers; + for(auto& fasta : references) { + fasta_pointers.push_back(fasta.get()); + } + vector ins_pointers; + for (auto& ins : insertions){ + ins_pointers.push_back(ins.get()); + } + + // Construct the graph. + construct_graph(fasta_pointers, vcf_pointers, ins_pointers, callback); + } + + void Constructor::construct_graph(const vector& references, + const vector& variant_files, const vector& insertions, + MutablePathMutableHandleGraph* destination) { + + vg::io::load_proto_to_graph(destination, [&](const function& callback) { + // Start a load of a stream of Protobuf Graphs, and when we get the + // callback to handle them, construct into it. + construct_graph(references, variant_files, insertions, callback); + }); + + // Now we did the construction and all the Graph chunks have been saved. + // TODO: Refactor everything to not go through Graph chunks? 
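// Illustrative usage sketch (not in the original patch; bdsg::HashGraph is
// only an example of a MutablePathMutableHandleGraph implementation): the
// overloads added here allow building straight from files into a handle
// graph, roughly:
//   bdsg::HashGraph graph;
//   Constructor constructor;
//   constructor.construct_graph({"ref.fa"}, {"variants.vcf.gz"}, {}, &graph);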
+ } + + void Constructor::construct_graph(const vector& reference_filenames, const vector& variant_filenames, + const vector& insertion_filenames, MutablePathMutableHandleGraph* destination) { + + vg::io::load_proto_to_graph(destination, [&](const function& callback) { + // Start a load of a stream of Protobuf Graphs, and when we get the + // callback to handle them, construct into it. + construct_graph(reference_filenames, variant_filenames, insertion_filenames, callback); + }); + + // Now we did the construction and all the Graph chunks have been saved. + // TODO: Refactor everything to not go through Graph chunks? + + // TODO: Deduplicate with the version that takes already-opened files somehow... + } } diff --git a/src/constructor.hpp b/src/constructor.hpp index 6573ff20de3..ede856cf46a 100644 --- a/src/constructor.hpp +++ b/src/constructor.hpp @@ -15,23 +15,25 @@ #include "types.hpp" #include "progressive.hpp" +#include "vcf_buffer.hpp" +#include "name_mapper.hpp" +#include "handle.hpp" -#include "vg.pb.h" +#include // We need vcflib -#include "Variant.h" +#include // And fastahack -#include "Fasta.h" +#include + -#include "vcf_buffer.hpp" -#include "name_mapper.hpp" namespace vg { using namespace std; /** - * Represents a constructed region of the graph alogn a single linear sequence. + * Represents a constructed region of the graph along a single linear sequence. * Contains the protobuf Graph holding all the created components (which may be * too large to serialize), a set of node IDs whose left sides need to be * connected to when you connect to the start of the chunk, and a set of node @@ -70,6 +72,10 @@ class Constructor : public Progressive, public NameMapper { // reference by vcflib (true)? bool flat = false; + // In non-flat mode, how big can the longest allele of a variant be before + // we fall back to flat mode anyway? + size_t max_parsed_variant_size = 100; + // Should we add paths for the different alts of variants, like // _alt_6079b4a76d0ddd6b4b44aeb14d738509e266961c_0 and // _alt_6079b4a76d0ddd6b4b44aeb14d738509e266961c_1? @@ -79,6 +85,10 @@ class Constructor : public Progressive, public NameMapper { // or at least the ones we know how to? bool do_svs = false; + // Should we trim the 1bp reference sequence that by default is placed + // on indel variants? + bool trim_indels = true; + // Should we also store the alt_paths as loci? // e.g. // Locus{ @@ -102,6 +112,10 @@ class Constructor : public Progressive, public NameMapper { // Should we warn if lowercase characters are encountered in each input sequence? bool warn_on_lowercase = true; + // Should we warn if IUPAC ambiguity codes (other than N) are encountered + // in each input sequence? + bool warn_on_ambiguous = true; + // What's the maximum node size we should allow? size_t max_node_size = 1000; @@ -143,6 +157,9 @@ class Constructor : public Progressive, public NameMapper { * * Variants in the vector may not use symbolic alleles. * + * All variants must have has their canonical field set, either manually to + * false or by canonicalize() to true. + * * chunk_offset gives the global 0-based position at which this chunk starts * in the reference contig it is part of, which is used to correctly place * variants. @@ -158,9 +175,15 @@ class Constructor : public Progressive, public NameMapper { * Doesn't handle any of the setup for VCF indexing. Just scans all the * variants that can come out of the buffer, so make sure indexing is set on * the file first before passing it in. 
+ * + * insertion contains FASTAs containing serquences for resolving symbolic + * insert alleles in the VCF. + * + * Calls the given callback with constructed graph chunks, in a single + * thread. Chunks may contain dangling edges into the next chunk. */ void construct_graph(string vcf_contig, FastaReference& reference, VcfBuffer& variant_source, - const vector& insertion, function callback); + const vector& insertion, const function& callback); /** * Construct a graph using the given FASTA references and VCFlib VCF files. @@ -168,9 +191,65 @@ class Constructor : public Progressive, public NameMapper { * position within the contig, such that each contig is present in only one * file. If multiple FASTAs are used, each contig must be present in only * one FASTA file. Reference and VCF vectors may not contain nulls. + * + * insertions contains FASTAs containing serquences for resolving symbolic + * insert alleles in the VCFs. + * + * Calls the given callback with constructed graph chunks, eventually + * (hopefully) in multiple threads. Chunks may contain dangling edges into + * the next chunk. */ void construct_graph(const vector& references, const vector& variant_files, - const vector& insertions, function callback); + const vector& insertions, const function& callback); + + /** + * Construct a graph using the given FASTA references and VCF files on disk. + * The VCF files are assumed to be grouped by contig and then sorted by + * position within the contig, such that each contig is present in only one + * file. If multiple FASTAs are used, each contig must be present in only + * one FASTA file. + * + * insertions contains FASTA filenames containing serquences for resolving + * symbolic insert alleles in the VCFs. + * + * Calls the given callback with constructed graph chunks, eventually + * (hopefully) in multiple threads. Chunks may contain dangling edges into + * the next chunk. + */ + void construct_graph(const vector& reference_filenames, const vector& variant_filenames, + const vector& insertion_filenames, const function& callback); + + /** + * Construct a graph using the given FASTA references and VCFlib VCF files. + * The VCF files are assumed to be grouped by contig and then sorted by + * position within the contig, such that each contig is present in only one + * file. If multiple FASTAs are used, each contig must be present in only + * one FASTA file. Reference and VCF vectors may not contain nulls. + * + * insertions contains FASTAs containing serquences for resolving symbolic + * insert alleles in the VCFs. + * + * Builds the graph into the given mutable graph object, which may not be + * thread safe. + */ + void construct_graph(const vector& references, const vector& variant_files, + const vector& insertions, MutablePathMutableHandleGraph* destination); + + /** + * Construct a graph using the given FASTA references and VCF files on disk. + * The VCF files are assumed to be grouped by contig and then sorted by + * position within the contig, such that each contig is present in only one + * file. If multiple FASTAs are used, each contig must be present in only + * one FASTA file. + * + * insertions contains FASTA filenames containing serquences for resolving + * symbolic insert alleles in the VCFs. + * + * Builds the graph into the given mutable graph object, which may not be + * thread safe. 
+ */ + void construct_graph(const vector& reference_filenames, const vector& variant_filenames, + const vector& insertion_filenames, MutablePathMutableHandleGraph* destination); protected: @@ -195,6 +274,8 @@ class Constructor : public Progressive, public NameMapper { * * Postcondition: either all lists of VariantAlleles are empty, or at least * one begins with a non-match and at least one ends with a non-match. + * Adjacent edits in the list abut; there are no uncovered gaps in the edits. + * This means that *internal* perfect match edits will be preserved. */ static void trim_to_variable(vector>& parsed_alleles); @@ -213,14 +294,47 @@ class Constructor : public Progressive, public NameMapper { * the base after it and the base before it. */ static pair get_bounds(const vector>& trimmed_variant); + /** - * Given a variant, check its bounds and return them. - * This function handles SVs properly, since they won't - * always have their ref and alt fields put in. + * Given a symbolic variant, check its bounds and return them. This + * function is needed to handle SVs properly, since they won't always have + * their ref and alt fields put in. Note that insertions may have an end + * bound before their start, because the anchoring base isn't included. */ - static pair get_bounds(vcflib::Variant var); + static pair get_symbolic_bounds(vcflib::Variant var); + + /** + * Given a sequence, get rid of all the lowercase characters and all the + * ambiguity codes. Warn if configured, and the sequence has a name + * assigned, and no warning has yet been issued for that name, or if a + * variant is specified. + * + * Will error if this results in a string with anything other than A, C, G, + * T, and N. + * + * sequence_start_offset can be set to produce useful messages if the + * sequence we are looking at is an excerpt from a longer sequence. + * + * Santitizing may move the stored string data in memory. + * + * Returns true if the string was modified. + * + * We need this as a function because vcflib reaches back and reads the + * FASTA files directly, so we can't *just* preprocess the reference and we + * need to constantly clean up the variants. + */ + bool sanitize_sequence_in_place(string& sequence, const string* sequence_name = nullptr, size_t sequence_start_offset = 0, const vcflib::Variant* variant = nullptr) const; + /// What sequences have we warned about containing lowercase characters? - mutable unordered_set warned_sequences; + mutable unordered_set lowercase_warned_sequences; + /// Have we given a warning yet about lowercase alt alleles? + mutable bool lowercase_warned_alt = false; + /// Have we given a warning yet about multiallelic SVs? + mutable bool multiallelic_sv_warned = false; + /// Have we given a warning yet about uncanonicalizable SVs? + mutable bool uncanonicalizable_sv_warned = false; + /// What sequences have we warned about containing unsupported ambiguity codes? 
+ mutable unordered_set ambiguous_warned_sequences; }; diff --git a/src/contracting_graph.cpp b/src/contracting_graph.cpp new file mode 100644 index 00000000000..a023ea8edf3 --- /dev/null +++ b/src/contracting_graph.cpp @@ -0,0 +1,114 @@ +#include "contracting_graph.hpp" +#include +#include + + +// #define debug +namespace vg{ + using namespace std; + using namespace structures; + + ContractingGraph::ContractingGraph(Graph graph) + :graph(graph),node_ids(graph.get_node_ids()){ + +#ifdef debug + cout << "the number of nodes in the UnionFind " << uf.size()<> all = uf.all_groups(); + cout << "all groups size " << all.size() < ContractingGraph::get_edges(size_t group_num){ + + //create container to keep group and edge_totals + unordered_map group_edges; + + //get contents of group + vector group_nodes = uf.group(group_num); +#ifdef debug + cout << "============================================================================= " << endl; + cout << "group num " << group_num << endl; +#endif + + //if an adj_node exists in group_nodes then it is a contracted edge, we treat it as a special case + for(size_t i = 0; i"<< graph.get_node_by_id(member).edges[j].other << endl; +#endif + size_t connecting_node = graph.get_node_by_id(member).edges[j].other; + + // check if the connecting node is contracted with other nodes + size_t connecting_node_group_id = uf.find_group(connecting_node); + + // avoid double counting edges btween members of group num aka contracted edge + if(connecting_node_group_id == group_num){ +#ifdef debug + cout << "continue" < ContractingGraph::get_nodes(){ + + //holds indices of nodes that are heads in the contracting graph + vector heads; + + //loop through graph nodes and determine which nodes are heads of the group + for(int i =0; i < graph.get_node_ids().size(); i++){ + vector node_ids = graph.get_node_ids(); + size_t group_head = uf.find_group(node_ids[i]); + if(node_ids[i] == group_head){ + heads.push_back(node_ids[i]); + + } + } + return heads; + } + + vector> ContractingGraph::get_disjoint_sets(){ + vector> to_return; + to_return = uf.all_groups(); + return to_return; + } + + + +} \ No newline at end of file diff --git a/src/contracting_graph.hpp b/src/contracting_graph.hpp new file mode 100644 index 00000000000..51b654107d4 --- /dev/null +++ b/src/contracting_graph.hpp @@ -0,0 +1,48 @@ +#ifndef CONTRACTINGGRAPH_HPP +#define CONTRACTINGGRAPH_HPP + +#include "algorithms/min_cut_graph.hpp" +#include +#include "sparse_union_find.hpp" + +namespace vg{ + using namespace std; + using vg::algorithms::Graph; + using namespace structures; + +class ContractingGraph{ + + + +public: + vector node_ids; + Graph graph; + SparseUnionFind uf = SparseUnionFind(true, node_ids); + + ContractingGraph(Graph graph); + + void contract(size_t random_node, size_t other_node); + + unordered_map get_edges(size_t group_num); + + + vector get_nodes(); + //looping through original nodes and sending them to find_group() and determining which nodes are heads + //return all heads + //if node = find_group(node) then its a head otherwise not + vector> get_disjoint_sets(); + + + + + + +}; + + +} + + + + +#endif \ No newline at end of file diff --git a/src/crash.cpp b/src/crash.cpp index e679bed2e38..b74c22eac41 100644 --- a/src/crash.cpp +++ b/src/crash.cpp @@ -23,7 +23,7 @@ #include // Needed to hack contexts for signal traces - #include +#include // Needed to generate stacktraces ourselves #include @@ -31,6 +31,9 @@ // We need strcmp #include +// We need realpath stuff +#include + #include #include #include 
@@ -54,6 +57,80 @@ const char* var = "VG_FULL_TRACEBACK"; // fullTrace = false means env var was not set bool fullTrace = false; +// Where vg issues should be reported +const char* ISSUE_URL = "https://github.com/vgteam/vg/issues/new/choose"; + +/// Make a horizontal line for delimiting error info. +static void draw_br() { + for (size_t i = 0; i < 20; i++) { + std::cerr << "â”"; + } + std::cerr << std::endl; +} + +/// Print an OSC-8 link start sequence to standard error, if it is a terminal, +/// pointing to the given vg source file path. +static void start_vg_link(const std::string& file_path, int line) { + if (!isatty(fileno(stderr))) { + return; + } + + std::string url_protocol = "file"; + std::string url_host; + std::string url_path; + + char real_path_buffer[PATH_MAX + 1]; + char* abspath = realpath(file_path.c_str(), real_path_buffer); + if (abspath != nullptr) { + // File exists to link to! + url_path = abspath; + + size_t host_length_limit; + #if defined(HOST_NAME_MAX) + host_length_limit = HOST_NAME_MAX; + #elif defined(_POSIX_HOST_NAME_MAX) + host_length_limit = _POSIX_HOST_NAME_MAX; + #else + host_length_limit = 256; + #endif + + // The link probably needs a hostname + char host_buffer[host_length_limit + 1]; + if (gethostname(host_buffer, host_length_limit) == 0) { + url_host = host_buffer; + } + // And we have to pick a protocol depending on if we are local or not + if (getenv("SSH_TTY") != nullptr) { + // We are probably developing over SSH, so link files over SFTP. + url_protocol = "sftp"; + } + } else { + // File doesn't exist relative to here. Link to Github. + url_path = "/vgteam/vg/blob/" + Version::get_version() + "/" + file_path + "#L" + std::to_string(line); + url_protocol = "https"; + url_host = "github.com"; + } + std::cerr << "\e]8;;" << url_protocol << "://" << url_host << url_path << "\e\\"; +} + +/// Print an OSC-8 link start sequence to standard error, if it is a terminal, +/// pointing to the given URL. +static void start_link(const std::string& url) { + if (!isatty(fileno(stderr))) { + return; + } + + std::cerr << "\e]8;;" << url << "\e\\"; +} + +/// Print an OSC-8 link end sequence to standard error, if it is a terminal. +static void stop_link() { + if (!isatty(fileno(stderr))) { + return; + } + std::cerr << "\e]8;;\e\\"; +} + void stacktrace_manually(ostream& out, int signalNumber, void* ip, void** bp) { // Now we compute our own stack trace, because backtrace() isn't so good on OS X. 
// We operate on the same principles as @@ -142,13 +219,23 @@ void emit_stacktrace(int signalNumber, siginfo_t *signalInfo, void *signalContex if (fullTrace == true) { out = &cerr; } else { - char temp[] = "/tmp/vg_crash_XXXXXX"; - char* tempDir = mkdtemp(temp); + // Determine where to save trace files + std::string temp; + char* tmpdir = getenv("TMPDIR"); + if (tmpdir) { + temp += tmpdir; + } else { + temp += "/tmp"; + } + temp += "/vg_crash_XXXXXX"; + char* tempDir = mkdtemp((char*)temp.c_str()); dirName = tempDir; tempStream.open(dirName+ "/stacktrace.txt"); out = &tempStream; } + draw_br(); + *out << "Crash report for vg " << Version::get_short() << endl; // This holds the context that the signal came from, including registers and stuff @@ -162,13 +249,17 @@ void emit_stacktrace(int signalNumber, siginfo_t *signalInfo, void *signalContex void** bp; #ifdef __APPLE__ - // OS X 64 bit does it this way - ip = (void*)context->uc_mcontext->__ss.__rip; - bp = (void**)context->uc_mcontext->__ss.__rbp; - *out << "Caught signal " << signalNumber << " raised at address " << ip << endl; - // Do our own tracing because backtrace doesn't really work on all platforms. - stacktrace_manually(*out, signalNumber, ip, bp); - #else + #if (defined(__arm64__) || defined(__aarch64__)) + *out << "Stack traces are not supported on ARM Macs yet" << endl; + #else + // macOS does it this way on x86-64 + ip = (void*)context->uc_mcontext->__ss.__rip; + bp = (void**)context->uc_mcontext->__ss.__rbp; + *out << "Caught signal " << signalNumber << " raised at address " << ip << endl; + // Do our own tracing because backtrace doesn't really work on all platforms. + stacktrace_manually(*out, signalNumber, ip, bp); + #endif + #elif __x86_64__ // Linux 64 bit does it this way ip = (void*)context->uc_mcontext.gregs[REG_RIP]; bp = (void**)context->uc_mcontext.gregs[REG_RBP]; @@ -183,11 +274,22 @@ void emit_stacktrace(int signalNumber, siginfo_t *signalInfo, void *signalContex tempStream.close(); #endif - if (fullTrace == false) { - cerr << "ERROR: Signal "<< signalNumber << " occurred. VG has crashed. Run 'vg bugs --new' to report a bug." << endl; + // Use OSC-8 to link the user to their destination. + cerr << "ERROR: Signal "<< signalNumber << " occurred. VG has crashed. "; + start_link(ISSUE_URL); + cerr << "Visit "; + cerr << ISSUE_URL; + cerr << " to report a bug."; + stop_link(); + cerr << endl; + if (fullTrace) { + cerr << "Please include this entire error log in your bug report!" << endl; + } else { // Print path for stack trace file cerr << "Stack trace path: "<< dirName << "/stacktrace.txt" << endl; + cerr << "Please include the stack trace file in your bug report!" << endl; } + draw_br(); // Make sure to exit with the right code exit(signalNumber + 128); } @@ -195,15 +297,19 @@ void emit_stacktrace(int signalNumber, siginfo_t *signalInfo, void *signalContex void enable_crash_handling() { // Set up stack trace support if (getenv(var) != nullptr) { - if (strcmp(getenv(var), "1") == 0) { - // if VG_FULL_TRACEBACK env var is set + if (strcmp(getenv(var), "0") == 0) { + // if VG_FULL_TRACEBACK env var is set to 0 + fullTrace = false; + } else { + // if VG_FULL_TRACEBACK env var is set to anything else fullTrace = true; } } else { // if VG_FULL_TRACEBACK env var is not set - fullTrace = false; + fullTrace = true; } + // backtrace() doesn't work in our Mac builds, and backward-cpp uses backtrace(). // Do this the old-fashioned way. 
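// Illustrative note (not in the original patch): the OSC-8 sequences emitted
// by start_link()/stop_link() above wrap text in a clickable hyperlink on
// supporting terminals, e.g.
//   std::cerr << "\e]8;;https://example.org\e\\" << "link text" << "\e]8;;\e\\";
// prints "link text" linked to https://example.org; output that is not a
// terminal is left untouched because the helpers check isatty(fileno(stderr))
// first.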
@@ -223,4 +329,50 @@ void enable_crash_handling() {
     // library's message about what the exception was.
 }
 
+thread_local std::string stored_crash_context;
+
+void set_crash_context(const std::string& message) {
+    stored_crash_context = message;
+}
+
+void clear_crash_context() {
+    stored_crash_context.clear();
+}
+
+void with_exception_handling(const std::function<void(void)>& body) {
+    try {
+        body();
+    } catch(const std::exception& ex) {
+        report_exception(ex);
+    }
+}
+
+void report_exception(const std::exception& ex) {
+    std::cerr << "Unhandled exception: " << ex.what() << std::endl;
+    if (!stored_crash_context.empty()) {
+        std::cerr << "Exception context: " << stored_crash_context << std::endl;
+    }
+    abort();
+}
+
+void crash_unless_impl(bool condition, const std::string& condition_string, const std::string& file, int line, const std::string& function) {
+    if (condition) {
+        // Nothing is wrong!
+        return;
+    }
+    std::cerr << std::endl << std::endl;
+    draw_br();
+    std::cerr << "VG has crashed because " << condition_string << " is false." << std::endl;
+    std::cerr << "Problem is at ";
+    // Use OSC-8 to link our files if we can.
+    start_vg_link(file, line);
+    std::cerr << file;
+    stop_link();
+    std::cerr << ":" << line << " in " << function << "." << std::endl;
+    if (!stored_crash_context.empty()) {
+        std::cerr << "This is in the context of: " << stored_crash_context << std::endl;
+    }
+    abort();
+}
+
 }
diff --git a/src/crash.hpp b/src/crash.hpp
index 3ceae7402b2..fc13936919c 100644
--- a/src/crash.hpp
+++ b/src/crash.hpp
@@ -6,18 +6,39 @@
  *
  * Implementation for crash handling to create a stack trace when VG crashes.
  * To use the crash handling system, call enable_crash_handling() early on in the program.
- * When a crash occurs, you will recieve an error message with the path to the
- * stack trace file. To get the full stack trace on standard error, you need to
- * set the environment variable 'VG_FULL_TRACEBACK=1'.
+ * When a crash occurs, you will receive an error message with the stack trace.
+ * To get just a filename, you need to set the environment variable
+ * 'VG_FULL_TRACEBACK=0'.
  *
  */
 
-namespace vg {
+#include <string>
+#include <functional>
 
-using namespace std;
+namespace vg {
 
 /// Main should call this to turn on our stack tracing support.
 void enable_crash_handling();
 
+/// User code should call this when it has context for a failure in its thread.
+void set_crash_context(const std::string& message);
+
+/// User code should call this when it wants to clear context for a failure in its thread.
+void clear_crash_context();
+
+/// User code should call this to get all its exceptions handled.
+void with_exception_handling(const std::function<void(void)>& body);
+
+/// User code should call this if it catches an exception it doesn't know what
+/// to do with.
+void report_exception(const std::exception& ex);
+
+/// User code should call this instead of assert
+#define crash_unless(condition) crash_unless_impl((condition), #condition, __FILE__, __LINE__, __func__);
+
+/// crash_unless calls into this function for a real implementation.
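To make the intended call pattern of this new API concrete, here is a small hypothetical usage sketch; parse_record and its contents are invented for illustration, while enable_crash_handling, set_crash_context, clear_crash_context, with_exception_handling, and crash_unless are the functions and macro declared above.

#include "crash.hpp"
#include <string>
#include <vector>

using namespace vg;

// Hypothetical worker function, for illustration only.
void parse_record(const std::string& record) {
    // Tag this thread's work so a crash or failed check names the record.
    set_crash_context("parsing record " + record);
    // crash_unless replaces assert(): on failure it prints the stringified
    // condition, the file:line, and the current crash context, then aborts.
    crash_unless(!record.empty());
    // ... real work would go here ...
    clear_crash_context();
}

int main() {
    enable_crash_handling();
    // Any exception escaping the body is routed through report_exception(),
    // which also prints the crash context before aborting.
    with_exception_handling([]() {
        std::vector<std::string> records = {"rec1", "rec2"};
        for (const std::string& r : records) {
            parse_record(r);
        }
    });
    return 0;
}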
+void crash_unless_impl(bool condition, const std::string& condition_string, const std::string& file, int line, const std::string& function); + + } #endif diff --git a/src/dagified_graph.cpp b/src/dagified_graph.cpp new file mode 100644 index 00000000000..183e02b3429 --- /dev/null +++ b/src/dagified_graph.cpp @@ -0,0 +1,396 @@ +/** + * \file dagified_graph.cpp: contains the implementation of DagifiedGraph + */ + + +#include "dagified_graph.hpp" + +//#define debug_dagify + +namespace vg { + +using namespace std; + + DagifiedGraph::DagifiedGraph(const HandleGraph* graph, size_t min_preserved_path_length, + size_t max_num_duplications) : graph(graph) { + +#ifdef debug_dagify + cerr << "constructing dagified graph" << endl; +#endif + + // find the numeric range of handles in the underlying graph (needed for later bookkeeping) + uint64_t max_handle = std::numeric_limits::min(); + graph->for_each_handle([&](const handle_t& handle) { + for (handle_t h : {handle, graph->flip(handle)}) { + uint64_t integer_handle = handlegraph::as_integer(h); + min_handle = min(integer_handle, min_handle); + max_handle = max(integer_handle, max_handle); + } + }); + handle_val_range = max_handle - min_handle + 1; + +#ifdef debug_dagify + cerr << "graph has handle range " << handle_val_range << ", min handle " << min_handle << ", min ID " << graph->min_node_id() << ", and ID range " << (graph->max_node_id() - graph->min_node_id() + 1) << endl; + cerr << "preserving walks up to length " << min_preserved_path_length << endl; +#endif + + // now we begin the dagify algorithm + + vector> strong_components; + { + // get a low-FAS layout with a canonical orientation for each handle + vector layout = handlealgs::eades_algorithm(graph); + + // invert the mapping for the layout + layout_order.reserve(layout.size()); + for (size_t i = 0; i < layout.size(); ++i) { + layout_order[layout[i]] = i; + } + + + // identify the SCCs and build the reverse SCC mapping + // TODO: annoying that we have to work with this return type for strongly_connected_components + scc_of_handle.resize(layout.size()); + size_t scc_idx = 0; + for (const unordered_set& scc : handlealgs::strongly_connected_components(graph)) { + // init new component + strong_components.emplace_back(); + auto& component = strong_components.back(); + component.reserve(scc.size()); + + // build the reverse mapping for this SCC + for (const id_t& node_id : scc) { + handle_t handle = graph->get_handle(node_id); + auto iter = layout_order.find(handle); + if (iter == layout_order.end()) { + iter = layout_order.find(graph->flip(handle)); + } + scc_of_handle[iter->second] = scc_idx; + } + ++scc_idx; + } + + // build the SCC components in layout order + for (const handle_t& handle : layout) { + strong_components[scc_of_handle[layout_order[handle]]].push_back(handle); + } + + // let the layout fall out of scope + } +#ifdef debug_dagify + cerr << "identified " << strong_components.size() << " strongly connected components out of " << graph->get_node_count() << " nodes" << endl; +#endif + + // identify how many times each SCC needs to be duplicated + scc_copy_count.resize(strong_components.size()); + for (size_t scc_idx = 0; scc_idx < strong_components.size(); ++scc_idx) { +#ifdef debug_dagify + cerr << "BEGIN NEW SCC " << scc_idx << endl; +#endif + const vector& component = strong_components[scc_idx]; + + // record the ordering of the layout so we can build adjacency lists + unordered_map ordering; + for (size_t i = 0; i < component.size(); i++) { + ordering[component[i]] = i; + } + + 
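As a concrete illustration of the copying scheme the rest of this constructor implements (the component, edges, and counts here are hypothetical, not taken from the code):

            // Suppose one SCC is laid out as A, B, C with edges A->B, B->C and C->A.
            // A->B and B->C point forward in the layout, so they stay within a single
            // copy of the component; C->A is a feedback arc, so in the dagified graph
            // it is redirected from copy k of C to copy k+1 of A. Each additional copy
            // therefore lets a walk cross the feedback arcs one more time, and the loop
            // below keeps adding copies until walks of min_preserved_path_length are
            // preserved or max_num_duplications is reached.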
// mark the edges as either forward or backward relative to the layout + vector> forward_edges(component.size()); + vector> backward_edges; + for (size_t i = 0; i < component.size(); ++i) { + graph->follow_edges(component[i], false, [&](const handle_t& next) { + if (scc_of_handle[layout_order[next]] == scc_idx) { + // this edge is internal to a strongly connected component + size_t j = ordering[next]; + if (i < j) { + // non feedback arc + forward_edges[i].push_back(j); + } + else { + // feedback arc + backward_edges.emplace_back(i, j); + } + } + }); + } +#ifdef debug_dagify + cerr << "feedforward graph:" << endl; + for (size_t i = 0; i < component.size(); ++i) { + cerr << graph->get_id(component[i]) << ":"; + for (auto j : forward_edges[i]) { + cerr << " " << graph->get_id(component[j]); + } + cerr << endl; + } + cerr << "feedback edges:" << endl; + for (auto edge : backward_edges) { + cerr << graph->get_id(component[edge.first]) << " -> " << graph->get_id(component[edge.second]) << endl; + } +#endif + + // check for each node whether we've duplicated the component enough times + // to preserve its cycles + + // dynamic progamming structures that represent distances within the current + // copy of the SCC and the next copy + vector distances(component.size(), numeric_limits::max()); + + // init the distances so that we are measuring from the end of the heads of + // backward edges (which cross to the next copy of the SCC) + for (const pair& bwd_edge : backward_edges) { + distances[bwd_edge.first] = -graph->get_length(component[bwd_edge.first]); + } + + // init the tracker that we use for the bail-out condition + int64_t min_relaxed_dist = -1; + + // keep track of how many times we've implicitly copied + uint64_t copy_num = 0; + for (; min_relaxed_dist < int64_t(min_preserved_path_length) && copy_num < max_num_duplications; copy_num++) { + +#ifdef debug_dagify + cerr << "making " << copy_num << "-th copy of SCC with incoming min relaxed distance " << min_relaxed_dist << endl; +#endif + + // the distances in the next copy unit + vector next_distances(component.size(), numeric_limits::max()); + + // find the shortest path to the nodes, staying within this copy of the SCC + for (size_t i = 0; i < distances.size(); i++) { + // skip infinity to avoid overflow + if (distances[i] == numeric_limits::max()) { + continue; + } + + int64_t dist_thru = distances[i] + graph->get_length(component[i]); + for (const size_t& j : forward_edges[i]) { + distances[j] = min(distances[j], dist_thru); + } + } + + // now find the minimum distance to nodes in the next copy of the SCC + min_relaxed_dist = numeric_limits::max(); + for (const pair& bwd_edge : backward_edges) { + // skip infinity to avoid overflow + if (distances[bwd_edge.first] == numeric_limits::max()) { + continue; + } + + int64_t dist_thru = distances[bwd_edge.first] + graph->get_length(component[bwd_edge.first]); + if (dist_thru < next_distances[bwd_edge.second]) { + next_distances[bwd_edge.second] = dist_thru; + // keep track of the shortest distance to the next copy + min_relaxed_dist = min(min_relaxed_dist, dist_thru); + } + } + +#ifdef debug_dagify + cerr << "distances within component" << endl; + for (size_t i = 0; i < distances.size(); i++) { + cerr << "\t" << graph->get_id(component[i]) << (graph->get_is_reverse(component[i]) ? 
"-" : "+") << " "; + if (distances[i] != numeric_limits::max()) { + cerr << distances[i]; + } + else { + cerr << "."; + } + cerr << endl; + } + cerr << "distances to next component" << endl; + for (size_t i = 0; i < next_distances.size(); i++) { + cerr << "\t" << graph->get_id(component[i]) << (graph->get_is_reverse(component[i]) ? "-" : "+") << " "; + if (distances[i] != numeric_limits::max()) { + cerr << distances[i]; + } + else { + cerr << "."; + } + cerr << endl; + } +#endif + + // initialize the DP structures for the next iteration + distances = move(next_distances); + } + + // now we know that the copy count needs to be, so we can record the information we need + // from this component + + // record the copy count + scc_copy_count[scc_idx] = copy_num; + + // add the number of nodes to the total count + node_count += component.size() * copy_num; + + // find the maximum projected node ID in this component + id_t comp_max_id = numeric_limits::min(); + for (const handle_t& handle : component) { + comp_max_id = max(comp_max_id, graph->get_id(handle)); + } + max_id = max(max_id, comp_max_id + (copy_num - 1) * (graph->max_node_id() - graph->min_node_id() + 1)); + } + } + + bool DagifiedGraph::has_node(id_t node_id) const { + id_t original_id = get_underlying_id(node_id); + bool exists = graph->has_node(original_id); + if (exists) { + // was the node duplicated enough times to have created this node ID? + exists = scc_copy_of_node_id(node_id) < scc_copy_count.at(scc_of_handle.at(layout_order_of_handle(graph->get_handle(original_id)))); + } + return exists; + } + + handle_t DagifiedGraph::get_handle(const id_t& node_id, bool is_reverse) const { + return nth_copy_of_handle(graph->get_handle(get_underlying_id(node_id), is_reverse), scc_copy_of_node_id(node_id)); + } + + id_t DagifiedGraph::get_id(const handle_t& handle) const { + return (graph->get_id(get_underlying_handle(handle)) + + scc_copy_of_handle(handle) * (graph->max_node_id() - graph->min_node_id() + 1)); + } + + bool DagifiedGraph::get_is_reverse(const handle_t& handle) const { + return graph->get_is_reverse(get_underlying_handle(handle)); + } + + handle_t DagifiedGraph::flip(const handle_t& handle) const { + return nth_copy_of_handle(graph->flip(get_underlying_handle(handle)), scc_copy_of_handle(handle)); + } + + size_t DagifiedGraph::get_length(const handle_t& handle) const { + return graph->get_length(get_underlying_handle(handle)); + } + + string DagifiedGraph::get_sequence(const handle_t& handle) const { + return graph->get_sequence(get_underlying_handle(handle)); + } + + bool DagifiedGraph::follow_edges_impl(const handle_t& handle, bool go_left, + const function& iteratee) const { + + // this is the complicated part where we have to induce an edge structure that is a DAG + handle_t underlying = get_underlying_handle(handle); + uint64_t scc_copy = scc_copy_of_handle(handle); + + // does the handle match the canonical orientation of the layout we computed? + bool matches_layout = layout_order.count(underlying); + + // are we traversing this node forward along the canonical orientation? 
+ bool canonical_fwd = matches_layout != go_left; + + size_t layout_index = layout_order_of_handle(underlying); + uint64_t scc_id = scc_of_handle.at(layout_index); + + return graph->follow_edges(underlying, go_left, [&](const handle_t& next_underlying) { + + size_t next_layout_index = layout_order_of_handle(next_underlying); + uint64_t next_scc_id = scc_of_handle.at(next_layout_index); + + bool keep_going = true; + if (next_scc_id != scc_id) { + // this is over an edge that's between two strongly connected component + uint64_t next_scc_count = scc_copy_count.at(next_scc_id); + if (canonical_fwd) { + // we are traveling in the canonically forward direction + if (scc_copy + 1 == scc_copy_count.at(scc_id)) { + // only the last copy of a handle is allowed to extend to the next strongly + // connected component, and it connects to all copies in the next one + for (size_t i = 0; i < next_scc_count && keep_going; ++i) { + keep_going = iteratee(nth_copy_of_handle(next_underlying, i)); + } + } + } + else { + // we are going in the reverse direction of the canonical orientation, so we + // can only connect to the last copy of the next handle + keep_going = iteratee(nth_copy_of_handle(next_underlying, next_scc_count - 1)); + } + } + else { + // this edge is internal to a strongly connected component + if (canonical_fwd) { + // we are traversing in the direction of the canonical orientation + if (next_layout_index > layout_index) { + // we are not taking a reversing edge, so we stay in the same copy unit + keep_going = iteratee(nth_copy_of_handle(next_underlying, scc_copy)); + } + else if (scc_copy + 1 < scc_copy_count.at(scc_id)) { + // we are taking a reversing edge, and there are still copy units ahead + keep_going = iteratee(nth_copy_of_handle(next_underlying, scc_copy + 1)); + } + } + else { + // we are traversing against the direction of the canonical orientation + if (next_layout_index < layout_index) { + // we are moving backwards over a forward edge, stay in the same copy unit + keep_going = iteratee(nth_copy_of_handle(next_underlying, scc_copy)); + } + else if (scc_copy > 0) { + // we are moving backwards over a reversing edge, and there are still copy units before this one + keep_going = iteratee(nth_copy_of_handle(next_underlying, scc_copy - 1)); + } + } + } + + return keep_going; + }); + } + + bool DagifiedGraph::for_each_handle_impl(const function& iteratee, + bool parallel) const { + return graph->for_each_handle([&](const handle_t& underlying) { + // iterate over however many copies of the handle there are + size_t copy_count = scc_copy_count.at(scc_of_handle.at(layout_order_of_handle(underlying))); + bool keep_going = true; + for (size_t i = 0; i < copy_count && keep_going; ++i) { + keep_going = iteratee(nth_copy_of_handle(underlying, i)); + } + return keep_going; + }, parallel); + } + + size_t DagifiedGraph::get_node_count() const { + return node_count; + } + + id_t DagifiedGraph::min_node_id() const { + // duplicated handles only increase in ID, so the original minimum doesn't change + return graph->min_node_id(); + } + + id_t DagifiedGraph::max_node_id() const { + return max_id; + } + + handle_t DagifiedGraph::get_underlying_handle(const handle_t& handle) const { + return handlegraph::as_handle(((uint64_t(handlegraph::as_integer(handle)) - min_handle) % handle_val_range) + min_handle); + } + + uint64_t DagifiedGraph::scc_copy_of_handle(const handle_t& handle) const { + return (uint64_t(handlegraph::as_integer(handle)) - min_handle) / handle_val_range; + } + + uint64_t 
DagifiedGraph::scc_copy_of_node_id(const id_t& node_id) const { + return (node_id - graph->min_node_id()) / (graph->max_node_id() - graph->min_node_id() + 1); + } + + size_t DagifiedGraph::layout_order_of_handle(const handle_t& handle) const { + auto iter = layout_order.find(handle); + if (iter == layout_order.end()) { + iter = layout_order.find(graph->flip(handle)); + } + return iter->second; + } + + + id_t DagifiedGraph::get_underlying_id(const id_t& node_id) const { + return ((node_id - graph->min_node_id()) % (graph->max_node_id() - graph->min_node_id() + 1)) + graph->min_node_id(); + } + + handle_t DagifiedGraph::nth_copy_of_handle(const handle_t& handle, const uint64_t& n) const { + return handlegraph::as_handle(uint64_t(handlegraph::as_integer(handle)) + n * handle_val_range); + } +} + diff --git a/src/dagified_graph.hpp b/src/dagified_graph.hpp new file mode 100644 index 00000000000..0a3a72e65af --- /dev/null +++ b/src/dagified_graph.hpp @@ -0,0 +1,150 @@ +#ifndef VG_DAGIFIED_GRAPH_HPP_INCLUDED +#define VG_DAGIFIED_GRAPH_HPP_INCLUDED + +/** \file + * dagified_graph.hpp: defines a handle graph overlay implementation transforms cyclic graphs into DAGs + */ + +#include "handle.hpp" + +namespace vg { + +using namespace std; + + /** + * A HandleGraph implementation that wraps some other handle graph and converts it into a + * DAG, preserving all paths up a a minimum length. + */ + class DagifiedGraph : public ExpandingOverlayGraph { + public: + + /// Expand a single-stranded graph into a DAG, preserving all walks up to the minimum length. + /// If max duplications is provided, limits the number of times any node is copied. + DagifiedGraph(const HandleGraph* graph, size_t min_preserved_path_length, + size_t max_num_duplications = std::numeric_limits::max()); + + /// Default constructor -- not actually functional + DagifiedGraph() = default; + + /// Default destructor + ~DagifiedGraph() = default; + + ////////////////////////// + /// HandleGraph interface + ////////////////////////// + + // Method to check if a node exists by ID + bool has_node(id_t node_id) const; + + /// Look up the handle for the node with the given ID in the given orientation + handle_t get_handle(const id_t& node_id, bool is_reverse = false) const; + + /// Get the ID from a handle + id_t get_id(const handle_t& handle) const; + + /// Get the orientation of a handle + bool get_is_reverse(const handle_t& handle) const; + + /// Invert the orientation of a handle (potentially without getting its ID) + handle_t flip(const handle_t& handle) const; + + /// Get the length of a node + size_t get_length(const handle_t& handle) const; + + /// Get the sequence of a node, presented in the handle's local forward + /// orientation. + string get_sequence(const handle_t& handle) const; + + /// Loop over all the handles to next/previous (right/left) nodes. Passes + /// them to a callback which returns false to stop iterating and true to + /// continue. Returns true if we finished and false if we stopped early. + bool follow_edges_impl(const handle_t& handle, bool go_left, const function& iteratee) const; + + /// Loop over all the nodes in the graph in their local forward + /// orientations, in their internal stored order. Stop if the iteratee + /// returns false. Can be told to run in parallel, in which case stopping + /// after a false return value is on a best-effort basis and iteration + /// order is not defined. 
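A brief illustration of the handle and ID scheme used by the helpers declared below (the numbers are hypothetical):

        // Example: if the underlying graph has node IDs 1..100 (an ID range of 100)
        // and node 7 belongs to an SCC that is duplicated three times, its copies get
        // IDs 7, 107 and 207 in the dagified graph. Handles are offset the same way by
        // multiples of handle_val_range, which is what scc_copy_of_handle() and
        // nth_copy_of_handle() rely on to map between the overlay and the underlying graph.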
+ bool for_each_handle_impl(const function& iteratee, bool parallel = false) const; + + /// Return the number of nodes in the graph + size_t get_node_count() const; + + /// Return the smallest ID in the graph, or some smaller number if the + /// smallest ID is unavailable. Return value is unspecified if the graph is empty. + id_t min_node_id() const; + + /// Return the largest ID in the graph, or some larger number if the + /// largest ID is unavailable. Return value is unspecified if the graph is empty. + id_t max_node_id() const; + + + /////////////////////////////////// + /// ExpandingOverlayGraph interface + /////////////////////////////////// + + /** + * Returns the handle in the underlying graph that corresponds to a handle in the + * overlay + */ + handle_t get_underlying_handle(const handle_t& handle) const; + + protected: + + /* + * Helper methods + */ + + /// Helper function to identify which ordinal copy of a strongly connected component + /// in the underlying graph the handle belongs to + uint64_t scc_copy_of_handle(const handle_t& handle) const; + + /// Helper function to identify which ordinal copy of a strongly connected component + /// in the underlying graph the node ID belongs to + uint64_t scc_copy_of_node_id(const id_t& node_id) const; + + /// Helper function that returns the layout order of a handle from the underlying graph + size_t layout_order_of_handle(const handle_t& handle) const; + + /// Helper function to identify the node ID in the original graph that corresponds to a + /// node in the dagified graph + id_t get_underlying_id(const id_t& node_id) const; + + /// Helper function, returns the n-th copy in the dagified graph of a handle in the underlying graph + handle_t nth_copy_of_handle(const handle_t& handle, const uint64_t& n) const; + + /* + * Member variables + */ + + /// The underlying graph we're dagifiying + const HandleGraph* graph = nullptr; + + /// Map from a canonical orientation of underlying handles (not necessarily forward!) + /// to the ordinal position of the node in a low-FAS layout + unordered_map layout_order; + + /// The ID of the strongly connected component that the handle at each layout position + /// belongs to + vector scc_of_handle; + + /// The number of times each strongly connected component is duplicated in the + /// dagified graph + vector scc_copy_count; + + /// The minimum value of a handle in the underlying graph, interpreted as an integer + uint64_t min_handle = std::numeric_limits::max(); + + /// The width of the range of values that handles in the underlying graph take + uint64_t handle_val_range = 0; + + /// The number of nodes including duplicated nodes, computed during construction + size_t node_count = 0; + + /// The maximum ID of the graph, computed during construction + id_t max_id = std::numeric_limits::min(); + + }; +} + +#endif diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index 95c0d84bafe..118d9df45a5 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -1,213 +1,1215 @@ #include "deconstructor.hpp" #include "traversal_finder.hpp" +#include + +//#define debug using namespace std; namespace vg { - Deconstructor::Deconstructor(){ - - } - Deconstructor::~Deconstructor(){ - } - - /** - * Takes in a vector of snarltraversals - * returns their sequences as a vector - * returns a boolean hasRef - * if a reference path is present, hasRef is set to true and the first - * string in the vector is the reference allele - * otherwise, hasRef is set to false and all strings are alt alleles. 
- */ - pair > Deconstructor::get_alleles(vector travs, string refpath, vg::VG* graph){ - vector ret; - vector ordered_traversals; - bool hasRef = false; - - bool normalize_indels = false; - - // Check if we have a PathIndex for this path - bool path_indexed = pindexes.find(refpath) != pindexes.end(); - for (auto t : travs){ - stringstream t_allele; - - // Get ref path index - if (path_indexed){ - PathIndex* pind = pindexes[refpath]; - // Check nodes of traversals Visits - // if they're all on the ref path, - // then this Snarltraversal is the ref allele. - bool is_ref = true; - - // Get the middle of the traversal that doesn't include the - // boundary nodes - auto iter = t.visit().begin(); - iter++; - auto end = t.visit().end(); - end--; - for (; iter != end; iter++){ - auto v = *iter; - if (!pind->path_contains_node(v.node_id())){ - is_ref = false; - } - if (v.node_id() == 0){ - continue; +Deconstructor::Deconstructor() : VCFOutputCaller(""), + exhaustive_jaccard_warning(false){ +} +Deconstructor::~Deconstructor(){ +} + +/** + * Takes in a vector of snarltraversals, an index of the ref path among them, a + * vector of flags for traversals to actually use, the character before all the + * traversals, and a flag for whether the start should be used (???). + * + * Returns a vector where entires are which allele number a traversal in travs + * ought to become in the VCF. If a traversal is flagged off, it gets a -1. + */ +vector Deconstructor::get_alleles(vcflib::Variant& v, + const pair, + vector>>& path_travs, + int ref_path_idx, + const vector& use_trav, + char prev_char, bool use_start) const { + + auto& travs = path_travs.first; + assert(ref_path_idx >=0 && ref_path_idx < travs.size()); + + // map strings to allele numbers (and their traversal) + // (we are using the traversal finder in such a way that duplicate alleles can get returned + // in order to be able to preserve the path names) + map> allele_idx; + size_t cur_alt = 1; + + // go from traversals number (offset in travs) to allele number + vector trav_to_allele(travs.size()); + + // compute the allele as a string + auto trav_to_string = [&](const SnarlTraversal& trav) { + string allele; + // we skip the snarl endpoints + for (int j = 1; j < trav.visit_size() - 1; ++j) { + const string& node_sequence = graph->get_sequence(graph->get_handle(trav.visit(j).node_id())); + allele += trav.visit(j).backward() ? reverse_complement(node_sequence) : node_sequence; + } + return toUppercase(allele); + }; + + // set the reference allele + string ref_allele = trav_to_string(travs.at(ref_path_idx)); + allele_idx[ref_allele] = make_pair(0, ref_path_idx); + trav_to_allele[ref_path_idx] = 0; + bool substitution = true; + + // set the other alleles (they can end up as 0 alleles too if their strings match the reference) + for (int i = 0; i < travs.size(); ++i) { + if (i != ref_path_idx) { + if (use_trav[i]) { + string allele = trav_to_string(travs[i]); + auto ai_it = allele_idx.find(allele); + if (ai_it == allele_idx.end()) { + // make a new allele for this string + allele_idx[allele] = make_pair(cur_alt, i); + trav_to_allele.at(i) = cur_alt; + ++cur_alt; + substitution = substitution && allele.size() == ref_allele.size(); + } else { + // allele string has been seen, map this traversal to it + trav_to_allele.at(i) = ai_it->second.first; + } + } else { + trav_to_allele.at(i) = -1; // HACK! 
negative allele indexes are ignored + } + } + } + + // fill in the variant + v.alleles.resize(allele_idx.size()); + assert(allele_idx.size() > 0); + v.alt.resize(allele_idx.size() - 1); + + // if we should flip the traversals + bool reversed = !use_start; + //if (reversed) { + // cerr << "it's reversed!!!" << endl; + //} + + // we're going to go through the allele traversals + // for each, we should find the set of supporting paths + // we will use them to untangle the reference positions + // using the same context mapping typically used for record positions + // AP field (allele positions) + // same pattern as AT except the steps are listed [>|<][id]_[start]_[end]+ + // id is the node id and start/end give the 1-based, half-open reference coordinates of the node + // we establish these by making a reference context per node in the reference allele through the site + // for each step where the reference touches the node more than once + // (in cases where it's only once, this is trivial) + // then we compare each reference matching step inside + // (todo: this should be done only for top level bubbles) + + vector allele_idx_unfolded(allele_idx.size()); + for (auto& ai_pair : allele_idx) { + int allele_no = ai_pair.second.first; + int allele_trav_no = ai_pair.second.second; + allele_idx_unfolded[allele_no] = allele_trav_no; + } + + // record the alleles in the VCF record + for (auto ai_pair : allele_idx) { + string allele_string = ai_pair.first; + int allele_no = ai_pair.second.first; + int allele_trav_no = ai_pair.second.second; + if (reversed) { + reverse_complement_in_place(allele_string); + } + if (!substitution) { + allele_string = string(1, prev_char) + allele_string; + } + v.alleles[allele_no] = allele_string; + if (allele_no > 0) { + v.alt[allele_no - 1] = allele_string; + } else { + v.ref = allele_string; + } + } + + if (untangle_allele_traversals) { + + // set up for reference position context mapping across allele traversals + path_handle_t ref_path = graph->get_path_handle_of_step(path_travs.second.at(ref_path_idx).first); + unordered_map>>> ref_dup_nodes; + unordered_map ref_simple_pos; + { + auto& trav = travs.at(ref_path_idx); + for (size_t i = 0; i < trav.visit_size(); ++i) { + size_t j = !reversed ? 
i : trav.visit_size() - 1 - i; + const Visit& visit = trav.visit(j); + nid_t node_id = visit.node_id(); + if (ref_simple_pos.find(node_id) != ref_simple_pos.end()) continue; + if (ref_dup_nodes.find(node_id) != ref_dup_nodes.end()) continue; + handle_t h = graph->get_handle(node_id); + // count reference occurrences on node + step_handle_t ref_step; + uint64_t ref_step_count = 0; + graph->for_each_step_on_handle( + h, [&](const step_handle_t& step) { + auto p = graph->get_path_handle_of_step(step); + if (p == ref_path) { + ++ref_step_count; + ref_step = step; + } + }); + if (ref_step_count > 1) { + //ref_dup_nodes[node_id] = make_pair(ref_context); + ref_dup_nodes[node_id] = {}; + auto& contexts = ref_dup_nodes[node_id]; + //vector>> contexts; + graph->for_each_step_on_handle( + h, [&](const step_handle_t& step) { + auto p = graph->get_path_handle_of_step(step); + if (p == ref_path) { + contexts.emplace_back(); + auto& c = contexts.back(); + c.first = graph->get_position_of_step(step); + c.second = get_context(step, step); + } + }); + // + } else if (ref_step_count == 1) { + auto pos = graph->get_position_of_step(ref_step) + 1; + auto len = graph->get_length(graph->get_handle_of_step(ref_step)); + ref_simple_pos[node_id] = pos; + } + } + } + + // set up the UT field for our reference-relative traversal position untangling + auto& ut_field = v.info["UT"]; + ut_field.resize(allele_idx.size()); + +#pragma omp parallel for schedule(dynamic,1) + for (size_t i = 0; i < allele_idx_unfolded.size(); i++) { + int allele_no = i; + int allele_trav_no = allele_idx_unfolded[i]; + auto start_step = path_travs.second.at(allele_trav_no).first; + auto end_step = path_travs.second.at(allele_trav_no).second; + auto start_pos = graph->get_position_of_step(start_step); + auto end_pos = graph->get_position_of_step(end_step); + bool flip_path = start_pos > end_pos; + if (flip_path) { + std::swap(start_step, end_step); + } + path_handle_t path = graph->get_path_handle_of_step(start_step); + std::vector steps; + for (auto s = start_step; ; + s = graph->get_next_step(s)) { + steps.push_back(s); + if (s == end_step) break; + if (!graph->has_next_step(s)) break; + } + if (steps.front() != start_step || steps.back() != end_step) { + //cerr << "warning!" << endl; + // something went wrong + ut_field[allele_no] = "."; + continue; + } + if (flip_path) { + std::reverse(steps.begin(), steps.end()); + } + // update the traversal info + stringstream trav_pos_info; + //string& trav_pos_info = ut_field[allele_no]; + for (auto& step : steps) { + handle_t h = graph->get_handle_of_step(step); + nid_t node_id = graph->get_id(h); + bool step_rev = graph->get_is_reverse(h) != reversed; + trav_pos_info << (reversed ? 
"<" : ">") << node_id << "_"; + if (allele_no == 0) { // special case the reference allele + auto pos = graph->get_position_of_step(step) + 1; + auto len = graph->get_length(graph->get_handle_of_step(step)); + trav_pos_info << pos << "_" << pos+len; + } else { // for non-reference alleles + auto f = ref_simple_pos.find(node_id); + if (f != ref_simple_pos.end()) { + // we have a single reference position at this node + auto pstart = f->second + 1; + auto pend = pstart + graph->get_length(h); + trav_pos_info << pstart << "_" << pend; + } else { + auto d = ref_dup_nodes.find(node_id); + if (d == ref_dup_nodes.end()) { + // no reference position at this node + trav_pos_info << "._."; + } else { + // multiple reference positions at this node + // compare the reference contexts of each step to our + // path context to determine reference position assignment + // ... first we get our path's context + auto path_context = get_context(step, step); + auto& ref_contexts = d->second; + //cerr << "path context size " << path_context.size() << endl; + // check vs. the contexts + double best_jaccard = -1; + uint64_t best_pos = 0; + for (auto& c : ref_contexts) { + auto& ref_context = c.second; + auto& ref_pos = c.first; + double j = context_jaccard(ref_context, path_context); + if (j > best_jaccard) { + best_jaccard = j; + best_pos = ref_pos; + } + } + trav_pos_info << best_pos+1 << "_" << best_pos+1+graph->get_length(h); + } } - t_allele << graph->get_node(v.node_id())->sequence(); } + } + // save the untangled traversal field + ut_field[allele_no] = trav_pos_info.str(); + } + } else { + // init the traversal info + v.info["AT"].resize(allele_idx.size()); + for (auto ai_pair : allele_idx) { + string allele_string = ai_pair.first; + int allele_no = ai_pair.second.first; + int allele_trav_no = ai_pair.second.second; + // update the traversal info + add_allele_path_to_info(v, allele_no, travs[allele_trav_no], reversed, !substitution); + } + } + + // shift our variant back if it's an indel + if (!substitution) { + assert(v.position >= 2); + --v.position; + } + + v.updateAlleleIndexes(); - string t_str = t_allele.str(); - if (t_str == ""){ - normalize_indels = true; + return trav_to_allele; +} + +void Deconstructor::get_genotypes(vcflib::Variant& v, const vector& names, + const vector& trav_to_allele) const { + assert(names.size() == trav_to_allele.size()); + // set up our variant fields + v.format.push_back("GT"); + if (show_path_info && path_to_sample_phase && path_restricted) { + v.format.push_back("PI"); + } + + // get a list of traversals for every vcf sample + // (this will be 1:1 unless we're using the path_to_sample name map) + map > sample_to_traversals; + // phasing information from the gbwt where applicable + vector gbwt_phases(trav_to_allele.size(), -1); + for (int i = 0; i < names.size(); ++i) { + string sample_name = PathMetadata::parse_sample_name(names[i]); + // for backward compatibility + if (sample_name.empty()) { + sample_name = names[i]; + } + auto phase = PathMetadata::parse_haplotype(names[i]); + if (!sample_name.empty() && phase == PathMetadata::NO_HAPLOTYPE) { + // THis probably won't fit in an int. Use 0 instead. 
+ phase = 0; + } + gbwt_phases[i] = (int)phase; + if (sample_names.count(sample_name)) { + sample_to_traversals[sample_name].push_back(i); + } + } + + // write out the genotype for each sample + // if we're mapping a vg path name to its prefix for the sample name, we stick some information about the full + // path name in the PI part of format + set conflicts; + for (const auto& sample_name : sample_names) { + if (sample_to_traversals.count(sample_name)) { + const vector& travs = sample_to_traversals[sample_name]; + assert(!travs.empty()); + vector chosen_travs; + bool conflict; + std::tie(chosen_travs, conflict) = choose_traversals(sample_name, travs, trav_to_allele, names, gbwt_phases); + if (conflict) { +#ifdef debug +#pragma omp critical (cerr) + cerr << "Conflict for sample " << sample_name << endl; +#endif + conflicts.insert(sample_name); + } + string genotype; + for (int i = 0; i < chosen_travs.size(); ++i) { + if (i > 0) { + // TODO check flag for phasing + genotype += (path_to_sample_phase || gbwt_trav_finder.get()) ? "|" : "/"; } - if (is_ref){ - ret.insert(ret.begin(), t_str); - ordered_traversals.insert(ordered_traversals.begin(), t); - hasRef = true; + genotype += (chosen_travs[i] != -1 && (!conflict || keep_conflicted_genotypes)) + ? std::to_string(trav_to_allele[chosen_travs[i]]) : "."; + } + v.samples[sample_name]["GT"] = {genotype}; + if (show_path_info && path_to_sample_phase) { + for (auto trav : travs) { + auto allele = trav_to_allele[trav]; + if (allele != -1) { + v.samples[sample_name]["PI"].push_back(names[trav] + "=" + std::to_string(allele)); + } } - else{ - ret.push_back(t_str); - ordered_traversals.push_back(t); + } + } else { + string blank_gt = "."; + if (gbwt_sample_to_phase_range.count(sample_name)) { + auto& phase_range = gbwt_sample_to_phase_range.at(sample_name); + for (int phase = phase_range.first + 1; phase <= phase_range.second; ++phase) { + blank_gt += "|."; } - } + v.samples[sample_name]["GT"] = {blank_gt}; + if (show_path_info && path_to_sample_phase && path_restricted) { + v.samples[sample_name]["PI"] = {blank_gt}; + } + } + } + for (auto& conflict_sample : conflicts) { + v.info["CONFLICT"].push_back(conflict_sample); + } +} - else{ - // All alleles are alt alleles - // Just make our strings and push them back. - - // Get the middle of the traversal that doesn't include the - // boundary nodes - auto iter = t.visit().begin(); - iter++; - auto end = t.visit().end(); - end--; - for (; iter != end; iter++){ - auto v = *iter; - t_allele << graph->get_node(v.node_id())->sequence(); - } - ret.push_back(t_allele.str()); - ordered_traversals.push_back(t); - } - } - // If we haev indels to normalize, loop over our alleles - // normalize each string to VCF-friendly format (i.e. clip one ref base - // on the left side and put it in the ref field and the alt field). - if (normalize_indels){ - for (int i = 0; i < ret.size(); ++i){ - // Get the reference base to the left of the variant. - // If our empty allele is the reference (and we have a reference), - // put our new-found ref base in the 0th index of alleles vector. - // Then, prepend that base to each allele in our alleles vector. - SnarlTraversal t = ordered_traversals[i]; - id_t start_id = t.visit(0).node_id(); - id_t end_id = t.visit(t.visit_size() - 1).node_id(); - pair pos_orientation_start = pindexes[refpath]->by_id[start_id]; - pair pos_orientation_end = pindexes[refpath]->by_id[end_id]; - bool use_start = pos_orientation_start.first < pos_orientation_end.first; - bool rev = use_start ? 
pos_orientation_start.second : pos_orientation_end.second; - string pre_node_seq = use_start ? graph->get_node(start_id)->sequence() : - graph->get_node(end_id)->sequence(); - string pre_variant_base = rev ? string(1, pre_node_seq[0]) : string(1, pre_node_seq[pre_node_seq.length() - 1]); - ret[i].insert(0, pre_variant_base); - } - } - return make_pair(hasRef, ret); - - } - - void Deconstructor::deconstruct(string refpath, vg::VG* graph){ - - - - // Create path index for the contig if we don't have one. - if (pindexes.find(refpath) == pindexes.end()){ - pindexes[refpath] = new PathIndex(*graph, refpath, false); - } - - // Spit header - // Set contig to refpath - // Set program field - // Set the version and data - // Set info field, if needed - // Make the header line - // open a VCF file - - if (!headered){ - vcflib::VariantCallFile outvcf; - stringstream stream; - stream << "##fileformat=VCFv4.2" << endl; - stream << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" << "Sample" << endl; +pair, bool> Deconstructor::choose_traversals(const string& sample_name, + const vector& travs, const vector& trav_to_allele, + const vector& trav_to_name, + const vector& gbwt_phases) const { + + assert(trav_to_name.size() == trav_to_allele.size()); + assert(gbwt_phases.size() == trav_to_name.size()); + assert(!travs.empty()); + // count the number of times each allele comes up in a traversal + vector allele_frequencies(std::max(0, *max_element(trav_to_allele.begin(), trav_to_allele.end()) + 1), 0); + for (auto trav : travs) { + // we always want to choose alt over ref when possible in sorting logic below, so + // cap ref frequency at 1 + int allele = trav_to_allele.at(trav); + // avoid counting filtered alleles (== -1) + if (allele >= 0 && (allele > 0 || allele_frequencies[allele] == 0)) { + ++allele_frequencies[allele]; + } + } + // sort on frquency + function comp = [&] (int trav1, int trav2) { + auto& trav1_allele = trav_to_allele[trav1]; + auto& trav2_allele = trav_to_allele[trav2]; + // avoid the filtered allele state (== -1) + auto trav1_af = trav1_allele >= 0 ? allele_frequencies[trav1_allele] : 0; + auto trav2_af = trav2_allele >= 0 ? allele_frequencies[trav2_allele] : 0; + if (trav1_af < trav2_af) { + return true; + } else if (trav1_af == trav2_af) { + // prefer non-ref when possible + if (trav1_allele < trav2_allele) { + return true; + } else if (trav1_allele == trav2_allele) { + return trav_to_name[trav1] < trav_to_name[trav2]; + } + } + return false; + }; + vector sorted_travs = travs; + std::sort(sorted_travs.begin(), sorted_travs.end(), comp); + // find the most frequent traversals + vector most_frequent_travs; + + // try to pull out unique phases if available + bool has_phasing = gbwt_sample_to_phase_range.count(sample_name) && + std::any_of(gbwt_phases.begin(), gbwt_phases.end(), [](int i) { return i >= 0; }); + //|| path_to_sample_phase; + bool phasing_conflict = false; + int sample_ploidy = ploidy; + int min_phase = 1; + int max_phase = ploidy; + if (has_phasing || path_to_sample_phase) { + if (has_phasing) { + // override ploidy with information about all phases found in input + std::tie(min_phase, max_phase) = gbwt_sample_to_phase_range.at(sample_name); + // shift left by 1 unless min phase is 0 + sample_ploidy = min_phase == 0 ? 
max_phase + 1 : max_phase; + assert(sample_ploidy > 0); + } else { + sample_ploidy = sample_ploidys.at(sample_name); + } + set used_phases; + for (int i = sorted_travs.size() - 1; i >= 0 && most_frequent_travs.size() < sample_ploidy; --i) { + int phase = gbwt_phases.at(sorted_travs[i]); + if (!used_phases.count(phase)) { + if (trav_to_allele[sorted_travs[i]] >= 0) { + most_frequent_travs.push_back(sorted_travs[i]); + used_phases.insert(phase); + } + } else if (strict_conflict_checking) { + phasing_conflict = true; + } + } + } else { + for (int i = sorted_travs.size() - 1; i >= 0 && most_frequent_travs.size() < sample_ploidy; --i) { + if (trav_to_allele[sorted_travs[i]] >= 0) { + most_frequent_travs.push_back(sorted_travs[i]); + } + } + } + + // sort by phase + if (has_phasing) { + std::sort(most_frequent_travs.begin(), most_frequent_travs.end(), + [&](int t1, int t2) {return gbwt_phases.at(t1) < gbwt_phases.at(t2);}); + if (max_phase > 0) { + // pad out by phase + assert(most_frequent_travs.empty() || gbwt_phases.at(most_frequent_travs.back()) <= max_phase); + assert(max_phase < 1000); + // we normally expect to have phases 1,2,3, ... + // in this case, we shift them all back, otherwise leave 0-based + int offset = min_phase != 0 ? -1 : 0; + vector padded_travs(max_phase + 1 + offset, -1); + for (auto ft : most_frequent_travs) { + int phase = gbwt_phases.at(ft) + offset; + padded_travs.at(phase) = ft; + } + swap(padded_travs, most_frequent_travs); + } + } else if (path_to_sample_phase) { + std::sort(most_frequent_travs.begin(), most_frequent_travs.end(), + [&](int t1, int t2) {return gbwt_phases.at(t1) < gbwt_phases.at(t2);}); + vector padded_travs(sample_ploidy, -1); + for (auto ft : most_frequent_travs) { + int phase = gbwt_phases.at(ft); + //std::cerr << "on phase " << phase << std::endl; + padded_travs.at(phase) = ft; + } + swap(padded_travs, most_frequent_travs); + } + // check if there's a conflict + size_t zero_count = std::count(allele_frequencies.begin(), allele_frequencies.end(), 0); + bool conflict = phasing_conflict || allele_frequencies.size() - zero_count > sample_ploidy; + return make_pair(most_frequent_travs, conflict); +} + + +// todo refactor if we need to reuse elsewhere in vg +// implemented inline for development +// assumes sorted input +double Deconstructor::context_jaccard( + const vector& target, + const vector& query) const { + size_t node_isec = 0; + std::set_intersection(target.begin(), target.end(), + query.begin(), query.end(), + count_back_inserter(node_isec)); + size_t node_union = 0; + std::set_union(target.begin(), target.end(), + query.begin(), query.end(), + count_back_inserter(node_union)); + return (double)node_isec / (double)node_union; +} + +double Deconstructor::context_jaccard( + const dac_vector<>& target, + const vector& query) const { + size_t node_isec = 0; + std::set_intersection(target.begin(), target.end(), + query.begin(), query.end(), + count_back_inserter(node_isec)); + size_t node_union = 0; + std::set_union(target.begin(), target.end(), + query.begin(), query.end(), + count_back_inserter(node_union)); + return (double)node_isec / (double)node_union; +} + +vector Deconstructor::get_context( + step_handle_t start_step, + step_handle_t end_step) const { + if (graph->get_position_of_step(start_step) + > graph->get_position_of_step(end_step)) { + std::swap(start_step, end_step); + } + // by definition, our start and end are shared among all traversals + // we establish a graph context sketch including the nodes traversed in the bubble + // 
and flanks upstream and downstream of path_jaccard_window bp + vector context; + step_handle_t curr = start_step; + const int check_distance = this->path_jaccard_window; // how far we look in each direction + int distance_checked = 0; + while (distance_checked < check_distance && graph->has_previous_step(curr)) { + curr = graph->get_previous_step(curr); + auto h = graph->get_handle_of_step(curr); + context.push_back(graph->get_id(h)); + distance_checked += graph->get_length(h); + } + // add the nodes in the bubble + if (start_step != end_step) { + curr = start_step; + context.push_back(graph->get_id(graph->get_handle_of_step(curr))); + while (graph->has_next_step(curr) && + curr != end_step) { + curr = graph->get_next_step(curr); + context.push_back(graph->get_id(graph->get_handle_of_step(curr))); + } + } + distance_checked = 0; + curr = end_step; + while (distance_checked < check_distance && graph->has_next_step(curr)) { + curr = graph->get_next_step(curr); + auto h = graph->get_handle_of_step(curr); + context.push_back(graph->get_id(h)); + distance_checked += graph->get_length(h); + } + std::sort(context.begin(), context.end()); + return context; +} + +vector Deconstructor::get_context( + const pair, + vector>>& path_travs, + const int& trav_idx) const { + step_handle_t start_step = path_travs.second[trav_idx].first; + step_handle_t end_step = path_travs.second[trav_idx].second; + return get_context(start_step, end_step); +} + +bool Deconstructor::deconstruct_site(const Snarl* snarl) const { + + auto contents = snarl_manager->shallow_contents(snarl, *graph, false); + if (contents.first.empty()) { + // Nothing but the boundary nodes in this snarl +#ifdef debug +#pragma omp critical (cerr) + cerr << "Skipping empty site " << pb2json(*snarl) << endl; +#endif + return false; + } +#ifdef debug +#pragma omp crtiical (cerr) + cerr << "Computing traversals of site " << pb2json(*snarl) << endl; +#endif + + // find every traversal that runs through a path in the graph + pair, vector > > path_travs; + path_travs = path_trav_finder->find_path_traversals(*snarl); + vector path_trav_names; + for (const pair& trav_ends : path_travs.second) { + path_trav_names.push_back(graph->get_path_name(graph->get_path_handle_of_step(trav_ends.first))); + } + + // pick out the traversal corresponding to a reference path, breaking ties consistently + string ref_trav_name; + for (int i = 0; i < path_travs.first.size(); ++i) { + const string& path_trav_name = path_trav_names.at(i); +#ifdef debug +#pragma omp critical (cerr) + { + cerr << "Traversal " << i << ": name=" << path_trav_name << ", size=" << path_travs.first[i].visit_size() + << ", start=" << graph->get_position_of_step(path_travs.second[i].first) + << ", end=" << graph->get_position_of_step(path_travs.second[i].second) << endl + << " trav=" << pb2json(path_travs.first[i]) << endl; + } +#endif + if (ref_paths.count(path_trav_name) && + (ref_trav_name.empty() || path_trav_name < ref_trav_name)) { + ref_trav_name = path_trav_name; +#ifdef debug +#pragma omp critical (cerr) + cerr << "Setting ref_trav_name " << ref_trav_name << endl; +#endif + } + } + + // add in the gbwt traversals + // after this, all traversals are treated the same, with metadata embedded in their names + int64_t first_gbwt_trav_idx = path_trav_names.size(); + vector gbwt_path_ids; + if (gbwt_trav_finder.get() != nullptr) { + const gbwt::GBWT& gbwt_index = gbwt_trav_finder->get_gbwt(); + pair, vector> thread_travs = gbwt_trav_finder->find_path_traversals(*snarl); + for (int i = 0; i < 
thread_travs.first.size(); ++i) { + // We need to get a bunch of metadata about the path, but the GBWT + // we have might not even have structured path names stored. + gbwt::size_type path_id = gbwt::Path::id(thread_travs.second[i]); + if (!gbwt_index.hasMetadata() || !gbwt_index.metadata.hasPathNames() || path_id >= gbwt_index.metadata.paths()) { + continue; + } - string hstr = stream.str(); - assert(outvcf.openForOutput(hstr)); - cout << outvcf.header << endl; - this->headered = true; - } - - // Find snarls - // Snarls are variant sites ("bubbles") - SnarlFinder* snarl_finder = new CactusSnarlFinder(*graph, refpath); - SnarlManager snarl_manager = snarl_finder->find_snarls(); - vector snarl_roots = snarl_manager.top_level_snarls(); - TraversalFinder* trav_finder = new ExhaustiveTraversalFinder(*graph, snarl_manager); - for (const Snarl* snarl: snarl_roots){ - // For each top level snarl + gbwt_path_ids.push_back(path_id); + PathSense sense = gbwtgraph::get_path_sense(gbwt_index, path_id, gbwt_reference_samples); - // Except the trivial ones - if (snarl->type() == ULTRABUBBLE) { - auto contents = snarl_manager.shallow_contents(snarl, *graph, false); - if (contents.first.empty()) { - // Nothing but the boundary nodes in this snarl - continue; + if (sense == PathSense::HAPLOTYPE) { + // we count on convention of reference as embedded path above, so only use haplotype paths here. + // todo: would be nice to be more flexible... + string path_name = PathMetadata::create_path_name( + sense, + gbwtgraph::get_path_sample_name(gbwt_index, path_id, sense), + gbwtgraph::get_path_locus_name(gbwt_index, path_id, sense), + gbwtgraph::get_path_haplotype(gbwt_index, path_id, sense), + gbwtgraph::get_path_phase_block(gbwt_index, path_id, sense), + gbwtgraph::get_path_subrange(gbwt_index, path_id, sense)); + path_trav_names.push_back(path_name); + path_travs.first.push_back(thread_travs.first[i]); + // dummy handles so we can use the same code as the named path traversals above + path_travs.second.push_back(make_pair(step_handle_t(), step_handle_t())); + } + } + } + + // remember all the reference traversals (there can be more than one only in the case of a + // cycle in the reference path + + // in case of cycles, we need our allele traversals to be associated to the correct reference position + // this is done with the path jaccard metric over all overlapping reference paths the given path_jaccard_window size + + vector ref_travs; + // hacky subpath support -- gets added to variant on output + vector ref_offsets; + if (!ref_trav_name.empty()) { + for (int i = 0; i < path_travs.first.size(); ++i) { + const string& path_trav_name = path_trav_names.at(i); + subrange_t subrange ; + Paths::strip_subrange(path_trav_name, &subrange); + int64_t sub_offset = subrange == PathMetadata::NO_SUBRANGE ? 0 : subrange.first; + if (path_trav_name == ref_trav_name) { + ref_travs.push_back(i); + ref_offsets.push_back(sub_offset); +#ifdef debug +#pragma omp critical (cerr) + cerr << "Adding ref_tav idx=" << i << " offset=" << sub_offset << " because " << path_trav_name << " == " << ref_trav_name << endl; +#endif + } + } + } + + // there's no reference path through the snarl, so we can't make a variant + // (todo: should we try to detect this before computing traversals?) 
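The multi-reference handling below relies on get_context() and context_jaccard() defined earlier: each allele traversal is assigned to the reference traversal whose surrounding window of node IDs it overlaps best. A minimal standalone sketch of that scoring follows (the jaccard helper and the node IDs are hypothetical; vg's count_back_inserter is replaced by ordinary vectors):

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <iterator>
#include <vector>

// Jaccard similarity of two windows of node IDs, as used to match an alt
// traversal to the nearest reference traversal copy.
double jaccard(std::vector<uint64_t> a, std::vector<uint64_t> b) {
    // context_jaccard() assumes sorted input; sort defensively here.
    std::sort(a.begin(), a.end());
    std::sort(b.begin(), b.end());
    std::vector<uint64_t> isec, uni;
    std::set_intersection(a.begin(), a.end(), b.begin(), b.end(), std::back_inserter(isec));
    std::set_union(a.begin(), a.end(), b.begin(), b.end(), std::back_inserter(uni));
    return uni.empty() ? 0.0 : (double)isec.size() / (double)uni.size();
}

int main() {
    // Hypothetical windows around a reference copy and an alt traversal:
    // they share 3 of 5 distinct node IDs, so the score is 0.6. The alt
    // traversal is assigned to whichever reference copy scores highest.
    std::vector<uint64_t> ref_context = {10, 11, 12, 13};
    std::vector<uint64_t> alt_context = {10, 12, 13, 14};
    std::cout << jaccard(ref_context, alt_context) << std::endl;
    return 0;
}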
+ if (ref_travs.empty()) { +#ifdef debug +#pragma omp critical (cerr) + cerr << "Skipping site because no reference traversal was found " << pb2json(*snarl) << endl; +#endif + return false; + } + + bool exhaustive = !path_restricted && gbwt_trav_finder.get() == nullptr; + if (exhaustive) { + // add in the exhaustive traversals + vector additional_travs; + + // exhaustive traversal can't do all snarls + if (snarl->type() != ULTRABUBBLE) { + return false; + } + if (!check_max_nodes(snarl)) { +#pragma omp critical (cerr) + cerr << "Warning: Skipping site because it is too complex for exhaustive traversal enumeration: " << pb2json(*snarl) << endl << " Consider using -e to traverse embedded paths" << endl; + return false; + } + additional_travs = explicit_exhaustive_traversals(snarl); + + // happens when there was a nested non-ultrabubble snarl + if (additional_travs.empty()) { + return false; + } + path_travs.first.insert(path_travs.first.end(), additional_travs.begin(), additional_travs.end()); + for (int i = 0; i < additional_travs.size(); ++i) { + // dummy names so we can use the same code as the named path traversals above + path_trav_names.push_back(" >>" + std::to_string(i)); + // dummy handles so we can use the same code as the named path traversals above + path_travs.second.push_back(make_pair(step_handle_t(), step_handle_t())); + } + + } + + // there's not alt path through the snarl, so we can't make an interesting variant + if (path_travs.first.size() < 2) { +#ifdef debug +#pragma omp critical (cerr) + cerr << "Skipping site because to alt traversal was found " << pb2json(*snarl) << endl; +#endif + return false; + } + + // XXX CHECKME this assumes there is only one reference path here, and that multiple traversals are due to cycles + + // we collect windows around the reference traversals + // to compare with equivalent windows from the alternate allele paths + // we will associate these 1:1 with reference traversals + + // remember that path_travs := pair, vector > > path_travs; + + // map from each path_trav index to the ref_trav index it best maps to + vector path_trav_to_ref_trav; + + if (ref_travs.size() > 1 && this->path_jaccard_window && exhaustive && !exhaustive_jaccard_warning) { +#pragma omp critical (cerr) + cerr << "warning [vg deconstruct]: Conext Jaccard logic for multiple references disabled with exhaustive traversals. Use -e, -g or GBZ input to switch to path-based traversals only (recommended)." << endl; + exhaustive_jaccard_warning = true; + } + if (ref_travs.size() > 1 && this->path_jaccard_window && !exhaustive) { + path_trav_to_ref_trav.resize(path_travs.first.size()); +#ifdef debug +#pragma omp critical (cerr) + cerr << "Multiple ref traversals!" 
<< endl; +#endif + { + vector> ref_contexts(ref_travs.size()); +#pragma omp parallel for schedule(dynamic,1) + for (size_t i = 0; i < ref_travs.size(); ++i) { + auto& trav_id = ref_travs[i]; + ref_contexts[i] = get_context(path_travs, trav_id); + } + // now for each traversal, we compute and equivalent context and match it to a ref context + // using a jaccard metric over node ids +#pragma omp parallel for schedule(dynamic,1) + for (size_t i = 0; i < path_travs.first.size(); ++i) { + vector context = get_context(path_travs, i); + // map jaccard metric to the index of the ref_trav + vector> ref_mappings; + for (uint64_t j = 0; j < ref_travs.size(); ++j) { + ref_mappings.push_back(make_pair( + context_jaccard( + ref_contexts[j], + context), + ref_travs[j])); } + std::sort(ref_mappings.begin(), ref_mappings.end()); + // the best is the last, which has the highest jaccard + path_trav_to_ref_trav[i] = ref_mappings.back().second; } - + } + } + + // we write a variant for every reference traversal + // (optionally) selecting the subset of path traversals that are 1:1 +//#pragma omp parallel for + for (size_t i = 0; i < ref_travs.size(); ++i) { +//#pragma omp task firstprivate(i) + { + auto& ref_trav_idx = ref_travs[i]; + auto& ref_trav_offset = ref_offsets[i]; + + const SnarlTraversal& ref_trav = path_travs.first[ref_trav_idx]; + vcflib::Variant v; - // SnarlTraversals are the (possible) alleles of our variant site. - vector travs = trav_finder->find_traversals(*snarl); + v.quality = 60; + + // in VCF we usually just want the contig + string contig_name = PathMetadata::parse_locus_name(ref_trav_name); + if (contig_name == PathMetadata::NO_LOCUS_NAME) { + contig_name = ref_trav_name; + } else if (long_ref_contig) { + // the sample name isn't unique enough, so put a full ugly name in the vcf + if (PathMetadata::parse_sense(ref_trav_name) == PathSense::GENERIC) { + contig_name = ref_trav_name; + } else { + contig_name = PathMetadata::create_path_name(PathSense::REFERENCE, + PathMetadata::parse_sample_name(ref_trav_name), + contig_name, + PathMetadata::parse_haplotype(ref_trav_name), + PathMetadata::NO_PHASE_BLOCK, + PathMetadata::NO_SUBRANGE); + } + } + // write variant's sequenceName (VCF contig) - v.sequenceName = refpath; - // Set position based on the lowest position in the snarl. - pair pos_orientation_start = pindexes[refpath]->by_id[snarl->start().node_id()]; - pair pos_orientation_end = pindexes[refpath]->by_id[snarl->end().node_id()]; - bool use_start = pos_orientation_start.first < pos_orientation_end.first; - size_t node_pos = (use_start ? pos_orientation_start.first : pos_orientation_end.first); - v.position = node_pos +(use_start ? 
graph->get_node(snarl->start().node_id())->sequence().length() : graph->get_node(snarl->end().node_id())->sequence().length()); - std::pair > t_alleles = get_alleles(travs, refpath, graph); - if (t_alleles.first){ - v.alleles.insert(v.alleles.begin(), t_alleles.second[0]); - v.ref = t_alleles.second[0]; - for (int i = 1; i < t_alleles.second.size(); i++){ - v.alleles.push_back(t_alleles.second[i]); - v.alt.push_back(t_alleles.second[i]); - } - } - else{ - cerr << "NO REFERENCE ALLELE FOUND" << endl; - v.alleles.insert(v.alleles.begin(), "."); - for (int i = 0; i < t_alleles.second.size(); i++){ - v.alleles.push_back(t_alleles.second[i]); - v.alt.push_back(t_alleles.second[i]); - } - } - v.updateAlleleIndexes(); - cout << v << endl; + v.sequenceName = contig_name; + + // Map our snarl endpoints to oriented positions in the embedded path in the graph + handle_t first_path_handle; + size_t first_path_pos; + bool use_start; + assert(ref_trav_idx < first_gbwt_trav_idx); + step_handle_t start_step = path_travs.second[ref_trav_idx].first; + step_handle_t end_step = path_travs.second[ref_trav_idx].second; + handle_t start_handle = graph->get_handle_of_step(start_step); + handle_t end_handle = graph->get_handle_of_step(end_step); + size_t start_pos = graph->get_position_of_step(start_step); + size_t end_pos = graph->get_position_of_step(end_step); + use_start = start_pos < end_pos; + first_path_handle = use_start ? start_handle : end_handle; + first_path_pos = use_start ? start_pos : end_pos; + + // Get the first visit of our snarl traversal + const Visit& first_trav_visit = use_start ? ref_trav.visit(0) : ref_trav.visit(ref_trav.visit_size() - 1); + + char prev_char; + if ((use_start && first_trav_visit.backward() == graph->get_is_reverse(first_path_handle)) || + (!use_start && first_trav_visit.backward() != graph->get_is_reverse(first_path_handle))) { + // Our path and traversal have consistent orientation. leave off the end of the start node going forward + first_path_pos += graph->get_length(first_path_handle); + prev_char = ::toupper(graph->get_sequence(first_path_handle)[graph->get_length(first_path_handle) - 1]); + } else { + // They are flipped: leave off the beginning of the start node going backward + prev_char = reverse_complement(::toupper(graph->get_sequence(first_path_handle)[0])); + } + + // shift from 0-based to 1-based for VCF + first_path_pos += 1; + v.position = first_path_pos + ref_trav_offset; + + v.id = print_snarl(*snarl); + + // Convert the snarl traversals to strings and add them to the variant + vector use_trav(path_travs.first.size()); + if (path_trav_to_ref_trav.size()) { + for (uint64_t i = 0; i < use_trav.size(); ++i) { + use_trav[i] = (ref_trav_idx == path_trav_to_ref_trav[i]); + } + } else { + for (uint64_t i = 0; i < use_trav.size(); ++i) { + use_trav[i] = true; + } + } + + vector trav_to_allele = get_alleles(v, path_travs, + ref_trav_idx, + use_trav, + prev_char, use_start); + + // Fill in the genotypes + if (path_restricted || gbwt_trav_finder.get()) { + get_genotypes(v, path_trav_names, trav_to_allele); + } + + // we only bother printing out sites with at least 1 non-reference allele + if (!std::all_of(trav_to_allele.begin(), trav_to_allele.end(), [](int i) { return (i == 0 || i == -1); })) { + if (path_restricted || gbwt_trav_finder.get()) { + // run vcffixup to add some basic INFO like AC + vcf_fixup(v); + } + add_variant(v); + } } + } +//#pragma omp taskwait + return true; +} + +/** + * Convenience wrapper function for deconstruction of multiple paths. 
+ */ +void Deconstructor::deconstruct(vector ref_paths, const PathPositionHandleGraph* graph, SnarlManager* snarl_manager, + bool path_restricted_traversals, + int ploidy, + bool include_nested, + int context_jaccard_window, + bool untangle_traversals, + bool keep_conflicted, + bool strict_conflicts, + bool long_ref_contig, + gbwt::GBWT* gbwt) { + + this->graph = graph; + this->snarl_manager = snarl_manager; + this->path_restricted = path_restricted_traversals; + this->ploidy = ploidy; + this->ref_paths = set(ref_paths.begin(), ref_paths.end()); + this->include_nested = include_nested; + this->path_jaccard_window = context_jaccard_window; + this->untangle_allele_traversals = untangle_traversals; + this->keep_conflicted_genotypes = keep_conflicted; + this->strict_conflict_checking = strict_conflicts; + if (gbwt) { + this->gbwt_reference_samples = gbwtgraph::parse_reference_samples_tag(*gbwt); + } + + // the need to use nesting is due to a problem with omp tasks and shared state + // which results in extremely high memory costs (ex. ~10x RAM for 2 threads vs. 1) + omp_set_nested(1); + omp_set_max_active_levels(3); + + // Keep track of the non-reference paths in the graph. They'll be our sample names + ref_samples.clear(); + set ref_haplotypes; + for (const string& ref_path_name : ref_paths) { + ref_samples.insert(PathMetadata::parse_sample_name(ref_path_name)); + ref_haplotypes.insert(PathMetadata::parse_haplotype(ref_path_name)); + } + if (!long_ref_contig) { + long_ref_contig = ref_samples.size() > 1 || ref_haplotypes.size() > 1; + } + this->long_ref_contig = long_ref_contig; + sample_names.clear(); + unordered_map> sample_to_haps; + + // find sample names from non-reference paths + graph->for_each_path_handle([&](const path_handle_t& path_handle) { + string path_name = graph->get_path_name(path_handle); + if (!this->ref_paths.count(path_name)) { + string sample_name = graph->get_sample_name(path_handle); + // for backward compatibility + if (sample_name == PathMetadata::NO_SAMPLE_NAME) { + sample_name = path_name; + } + if (!ref_samples.count(sample_name)) { + size_t haplotype = graph->get_haplotype(path_handle); + if (haplotype == PathMetadata::NO_HAPLOTYPE) { + haplotype = 0; + } + sample_to_haps[sample_name].insert((int)haplotype); + sample_names.insert(sample_name); + } + } + }); + + // add in the GBWT sample names + if (gbwt) { + // add in sample names from the gbwt + for (size_t i = 0; i < gbwt->metadata.paths(); i++) { + PathSense sense = gbwtgraph::get_path_sense(*gbwt, i, gbwt_reference_samples); + if (sense == PathSense::HAPLOTYPE) { + string path_name = PathMetadata::create_path_name( + sense, + gbwtgraph::get_path_sample_name(*gbwt, i, sense), + gbwtgraph::get_path_locus_name(*gbwt, i, sense), + gbwtgraph::get_path_haplotype(*gbwt, i, sense), + gbwtgraph::get_path_phase_block(*gbwt, i, sense), + gbwtgraph::get_path_subrange(*gbwt, i, sense)); + if (!this->ref_paths.count(path_name)) { + string sample_name = gbwtgraph::get_path_sample_name(*gbwt, i, sense); + if (!ref_samples.count(sample_name)) { + auto phase = gbwtgraph::get_path_haplotype(*gbwt, i, sense); + if (phase == PathMetadata::NO_HAPLOTYPE) { + // Default to 0. 
+ phase = 0; + } + sample_to_haps[sample_name].insert((int)phase); + sample_names.insert(sample_name); + } + } + } + } + } + + // find some stats about the haplotypes for each sample + gbwt_sample_to_phase_range.clear(); + sample_ploidys.clear(); + for (auto& sample_haps : sample_to_haps) { + sample_ploidys[sample_haps.first] = sample_haps.second.size(); + gbwt_sample_to_phase_range[sample_haps.first] = make_pair(*sample_haps.second.begin(), *sample_haps.second.rbegin()); + } + // print the VCF header + stringstream stream; + stream << "##fileformat=VCFv4.2" << endl; + if (path_restricted || gbwt) { + stream << "##FORMAT=" << endl; + } + if (show_path_info && path_to_sample_phase && path_restricted) { + stream << "##FORMAT=" << endl; + } + if (path_to_sample_phase || gbwt) { + stream << "##INFO=" << endl; + } + if (path_restricted || gbwt) { + stream << "##INFO=" << endl; + stream << "##INFO=" << endl; + stream << "##INFO=" << endl; + stream << "##INFO=" << endl; + } + if (include_nested) { + stream << "##INFO=" << endl; + stream << "##INFO=" << endl; + } + if (untangle_allele_traversals) { + stream << "##INFO=|<][id]_[start|.]_[end|.], with '.' indicating non-reference nodes.\">" << endl; + } else { + stream << "##INFO=" << endl; + } + set gbwt_ref_paths; + map ref_path_to_length; + for(auto& refpath : ref_paths) { + if (graph->has_path(refpath)) { + int64_t path_len = 0; + path_handle_t path_handle = graph->get_path_handle(refpath); + for (handle_t handle : graph->scan_path(path_handle)) { + path_len += graph->get_length(handle); + } + string locus_name = graph->get_locus_name(path_handle); + if (locus_name == PathMetadata::NO_LOCUS_NAME) { + locus_name = refpath; + } else if (long_ref_contig) { + // the sample name isn't unique enough, so put a full ugly name in the vcf + if (graph->get_sense(path_handle) == PathSense::GENERIC) { + locus_name = graph->get_path_name(path_handle); + } else { + locus_name = PathMetadata::create_path_name(PathSense::REFERENCE, + graph->get_sample_name(path_handle), + locus_name, + graph->get_haplotype(path_handle), + PathMetadata::NO_PHASE_BLOCK, + PathMetadata::NO_SUBRANGE); + } + } + subrange_t subrange = graph->get_subrange(path_handle); + int64_t offset = subrange == PathMetadata::NO_SUBRANGE ? 0 : subrange.first; + ref_path_to_length[locus_name] = std::max(ref_path_to_length[locus_name], path_len + offset); + } else { + gbwt_ref_paths.insert(refpath); + } + } + for (auto& ref_path_len : ref_path_to_length) { + stream << "##contig=" << endl; } + if (!gbwt_ref_paths.empty()) { + unordered_map> gbwt_name_to_ids; + for (size_t i = 0; i < gbwt->metadata.paths(); i++) { + // Collect all the GBWT path IDs for each sample and contig. + gbwt_name_to_ids[compose_short_path_name(*gbwt, i)].push_back(i); + } + for (const string& refpath : gbwt_ref_paths) { + // For each sample and contig name that is a GBWT ref path + vector& thread_ids = gbwt_name_to_ids.at(refpath); + size_t path_len = 0; + for (gbwt::size_type thread_id : thread_ids) { + // For each actual path in the GBWT for that sample-and-contig, + // we need to see how long it extends the space of the sample + // and contig. + + // TODO: These are probably all guaranteed to be haplotype sense? + PathSense sense = gbwtgraph::get_path_sense(*gbwt, thread_id, gbwt_reference_samples); + subrange_t subrange = gbwtgraph::get_path_subrange(*gbwt, thread_id, sense); + + // TODO: when importing GFAs we might cram the start of a walk + // into the GBWT count field. 
But we don't ever guarantee that + // we've done that so it might not be visible as a subrange + // here. Fix that somehow??? + size_t offset = subrange == PathMetadata::NO_SUBRANGE ? 0 : subrange.first; + size_t len = path_to_length(extract_gbwt_path(*graph, *gbwt, thread_id)); + path_len = std::max(path_len, offset + len); + } + stream << "##contig=" << endl; + } + } + + stream << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"; + if (path_restricted || gbwt) { + for (auto& sample_name : sample_names) { + stream << "\t" << sample_name; + } + } + stream << endl; + + string hstr = stream.str(); + assert(output_vcf.openForOutput(hstr)); + cout << output_vcf.header << endl; - /** - * Convenience wrapper function for deconstruction of multiple paths. - */ - void Deconstructor::deconstruct(vector ref_paths, vg::VG* graph){ + // create the traversal finder + map reads_by_name; + path_trav_finder = unique_ptr(new PathTraversalFinder(*graph, + *snarl_manager)); + + if (!path_restricted && !gbwt) { + trav_finder = unique_ptr(new ExhaustiveTraversalFinder(*graph, + *snarl_manager, + true)); - for (auto path : ref_paths){ - deconstruct(path, graph); + } + + if (gbwt != nullptr) { + gbwt_trav_finder = unique_ptr(new GBWTTraversalFinder(*graph, *gbwt)); + } + + vector snarls_todo; + // Do the top-level snarls in parallel + snarl_manager->for_each_top_level_snarl([&](const Snarl* snarl) { + vector todo(1, snarl); + vector next; + while (!todo.empty()) { + for (auto next_snarl : todo) { + // if we can't make a variant from the snarl due to not finding + // paths through it, we try again on the children + // note: we may want to push the parallelism down a bit +#pragma omp critical (snarls_todo) + snarls_todo.push_back(next_snarl); + if (include_nested) { + // n.b. we no longer attempt to deconstruct the site to determine if we nest + const vector& children = snarl_manager->children_of(next_snarl); + next.insert(next.end(), children.begin(), children.end()); + } + } + swap(todo, next); + next.clear(); + } + }); + +//#pragma omp parallel +//#pragma omp single + { +#pragma omp parallel for schedule(dynamic,1) + for (size_t i = 0; i < snarls_todo.size(); i++) { +//#pragma omp task firstprivate(i) + { + auto& snarl = snarls_todo[i]; + deconstruct_site(snarl); + } } + } +//#pragma omp taskwait + + // write variants in sorted order + write_variants(cout, snarl_manager); +} +bool Deconstructor::check_max_nodes(const Snarl* snarl) const { + unordered_set nodeset = snarl_manager->deep_contents(snarl, *graph, false).first; + int node_count = 0; + for (auto node_id : nodeset) { + handle_t node = graph->get_handle(node_id); + if (graph->get_degree(node, true) > 1 || graph->get_degree(node, false) > 1) { + ++node_count; + if (node_count > max_nodes_for_exhaustive) { + return false; + } + } } + return true; +}; + +vector Deconstructor::explicit_exhaustive_traversals(const Snarl* snarl) const { + vector out_travs; + bool ultra_all_the_way_down = true; + function extend_trav = + [&](const SnarlTraversal& trav, const Snarl& nested_snarl) { + // exhaustive traversal finder is limited. 
if we find something
+        // that's not an ultrabubble, not much we can do
+        if (nested_snarl.type() != ULTRABUBBLE) {
+            ultra_all_the_way_down = false;
+            return;
+        }
+        vector nested_travs = trav_finder->find_traversals(nested_snarl);
+        for (auto& nested_trav : nested_travs) {
+            SnarlTraversal extended_trav = trav;
+            bool is_explicit = true;
+            for (int i = 0; i < nested_trav.visit_size(); ++i) {
+                if (nested_trav.visit(i).node_id() != 0) {
+                    Visit* visit = extended_trav.add_visit();
+                    *visit = nested_trav.visit(i);
+                } else {
+                    extend_trav(extended_trav, nested_trav.visit(i).snarl());
+                    is_explicit = false;
+                }
+            }
+            if (is_explicit) {
+                out_travs.push_back(extended_trav);
+            }
+        }
+    };
+    SnarlTraversal trav;
+    extend_trav(trav, *snarl);
+    if (!ultra_all_the_way_down) {
+        out_travs.clear();
+    }
+    return out_travs;
+}
+
 }
diff --git a/src/deconstructor.hpp b/src/deconstructor.hpp
index 354b3af889c..a8c66e369bb 100644
--- a/src/deconstructor.hpp
+++ b/src/deconstructor.hpp
@@ -2,17 +2,16 @@
 #define VG_DECONSTRUCTOR_HPP_INCLUDED
 #include
 #include
-#include
 #include
 #include
+#include
+#include
 #include "genotypekit.hpp"
-#include "path_index.hpp"
 #include "Variant.h"
-#include "path.hpp"
-#include "vg.hpp"
-#include "genotypekit.hpp"
-#include "vg.pb.h"
-#include "Fasta.h"
+#include "handle.hpp"
+#include "traversal_finder.hpp"
+#include "graph_caller.hpp"
+#include "lru_cache.h"
 /** \file
 * Deconstruct is getting rewritten.
@@ -26,20 +25,161 @@ ** "Linear-Time Superbubble Identification Algorithm for Genome Assembly"
 */
 namespace vg{
-    using namespace std;
-    class Deconstructor{
-        public:
+using namespace std;
+
+// note: added VCFOutputCaller parent class from vg call to bring in sorted vcf output. it would
+// be nice to re-use more of the VCFOutputCaller code, much of which is still duplicated in
+// Deconstructor
+class Deconstructor : public VCFOutputCaller {
+public:
+
+    Deconstructor();
+    ~Deconstructor();
+
+    // deconstruct the entire graph to cout.
+    // Not even a little bit thread safe.
+    void deconstruct(vector refpaths, const PathPositionHandleGraph* graph, SnarlManager* snarl_manager,
+                     bool path_restricted_traversals,
+                     int ploidy,
+                     bool include_nested,
+                     int context_jaccard_window,
+                     bool untangle_traversals,
+                     bool keep_conflicted,
+                     bool strict_conflicts,
+                     bool long_ref_contig,
+                     gbwt::GBWT* gbwt = nullptr);
+
+private:
+
+    // write a vcf record for the given site. returns true if a record was written
+    // (need to have a path going through the site)
+    bool deconstruct_site(const Snarl* site) const;
+
+    // convert traversals to strings. returns mapping of traversal (offset in travs) to allele
+    vector get_alleles(vcflib::Variant& v,
+                       const pair,
+                       vector>>& path_travs,
+                       int ref_path_idx,
+                       const vector& use_trav,
+                       char prev_char, bool use_start) const;
+
+    // write traversal path names as genotypes
+    void get_genotypes(vcflib::Variant& v, const vector& names, const vector& trav_to_allele) const;
+
+    // given a set of traversals associated with a particular sample, select a set of size for the VCF
+    // the highest-frequency ALT traversal is chosen
+    // the bool returned is true if multiple traversals map to different alleles, more than ploidy.
+ pair, bool> choose_traversals(const string& sample_name, + const vector& travs, const vector& trav_to_allele, + const vector& trav_to_name, + const vector& gbwt_phases) const; + + // check to see if a snarl is too big to exhaustively traverse + bool check_max_nodes(const Snarl* snarl) const; + + // get traversals from the exhaustive finder. if they have nested visits, fill them in (exhaustively) + // with node visits + vector explicit_exhaustive_traversals(const Snarl* snarl) const; + + // gets a sorted node id context for a given path + vector get_context( + const pair, + vector>>& path_travs, + const int& trav_idx) const; + + // the underlying context-getter + vector get_context( + step_handle_t start_step, + step_handle_t end_step) const; + + // compares node contexts + double context_jaccard(const vector& target, + const vector& query) const; + + // specialization for enc_vectors + double context_jaccard( + const dac_vector<>& target, + const vector& query) const; + + // toggle between exhaustive and path restricted traversal finder + bool path_restricted = false; + + // the max ploidy we expect. + int ploidy; + + // the graph + const PathPositionHandleGraph* graph; + + // the snarl manager + SnarlManager* snarl_manager; + + // the traversal finders. we always use a path traversal finder to get the reference path + unique_ptr path_trav_finder; + // we optionally use another (exhaustive for now) traversal finder if we don't want to rely on paths + unique_ptr trav_finder; + // we can also use a gbwt for traversals + unique_ptr gbwt_trav_finder; + // When using the gbwt we need some precomputed information to ask about stored paths. + unordered_set gbwt_reference_samples; + + // infer ploidys from gbwt when possible + unordered_map> gbwt_sample_to_phase_range; + + // the ref paths + set ref_paths; + + // keep track of reference samples + set ref_samples; + + // do we need to write metadata for reference contigs + bool long_ref_contig = false; + + // keep track of the non-ref paths as they will be our samples + set sample_names; + + // map the path name to the sample in the vcf + const unordered_map>* path_to_sample_phase; + + // the sample ploidys given in the phases in our path names + unordered_map sample_ploidys; + + // upper limit of degree-2+ nodes for exhaustive traversal + int max_nodes_for_exhaustive = 100; + + // target window size for determining the correct reference position for allele traversals with path jaccard + int path_jaccard_window = 10000; + + // should we add positional untangling of traversals in the AP field + bool untangle_allele_traversals = false; + + // should we be strict about flagging and removing conflicted phases? 
+    bool strict_conflict_checking = false;
+
+    // show path info mapping paths to genotypes (very verbose)
+    bool show_path_info = false;
+
+    // should we keep conflicted genotypes or not
+    bool keep_conflicted_genotypes = false;
-        Deconstructor();
-        ~Deconstructor();
-        pair > get_alleles(vector travs, string refpath, vg::VG* graph);
+    // warn about context jaccard not working with exhaustive traversals
+    mutable atomic exhaustive_jaccard_warning;
+};
-        void deconstruct(string refpath, vg::VG* graph);
-        void deconstruct(vector refpaths, vg::VG* graph);
-        map pindexes;
+// helper for measuring set intersection and union size
+template
+class count_back_inserter {
+    size_t &count;
+public:
+    typedef void value_type;
+    typedef void difference_type;
+    typedef void pointer;
+    typedef void reference;
+    typedef std::output_iterator_tag iterator_category;
+    count_back_inserter(size_t &count) : count(count) {};
+    void operator=(const T &){ ++count; }
+    count_back_inserter &operator *(){ return *this; }
+    count_back_inserter &operator++(){ return *this; }
+};
-        private:
-            bool headered = false;
-    };
 }
 #endif
diff --git a/src/deletion_aligner.cpp b/src/deletion_aligner.cpp
new file mode 100644
index 00000000000..88293f1c182
--- /dev/null
+++ b/src/deletion_aligner.cpp
@@ -0,0 +1,250 @@
+/**
+ * \file deletion_aligner.cpp
+ *
+ * Implements an aligner for global deletions
+ *
+ */
+
+#include "deletion_aligner.hpp"
+
+//#define debug_deletion_aligner
+
+namespace vg {
+
+DeletionAligner::DeletionAligner(int8_t gap_open, int8_t gap_extension)
+    : gap_open(gap_open), gap_extension(gap_extension)
+{
+
+}
+
+void DeletionAligner::align(Alignment& aln, const HandleGraph& graph) const {
+    if (!aln.sequence().empty()) {
+        cerr << "error: DeletionAligner can only be used for alignments of empty strings" << endl;
+        exit(1);
+    }
+    auto traces = run_dp(graph, 1);
+    trace_to_alignment(aln, traces.at(0), graph);
+}
+
+void DeletionAligner::align_multi(Alignment& aln, vector& alt_alns,
+                                  const HandleGraph& graph, int32_t max_alt_alns) const {
+    if (!aln.sequence().empty()) {
+        cerr << "error: DeletionAligner can only be used for alignments of empty strings" << endl;
+        exit(1);
+    }
+    auto traces = run_dp(graph, max_alt_alns);
+    alt_alns.resize(traces.size());
+    for (size_t i = 0; i < traces.size(); ++i) {
+        alt_alns[i].set_sequence(aln.sequence());
+        alt_alns[i].set_quality(aln.quality());
+        trace_to_alignment(alt_alns[i], traces[i], graph);
+    }
+    *aln.mutable_path() = alt_alns.front().path();
+    aln.set_score(alt_alns.front().score());
+}
+
+vector> DeletionAligner::run_dp(const HandleGraph& graph,
+                                 int32_t max_tracebacks) const {
+#ifdef debug_deletion_aligner
+    cerr << "aligning deletions with " << max_tracebacks << " tracebacks" << endl;
+#endif
+    auto order = handlealgs::lazier_topological_order(&graph);
+
+    if (order.empty() && max_tracebacks > 0) {
+        // Turns out the graph is empty.
+        // We need to produce one traceback of visiting nothing.
+ return {{}}; + } + + unordered_map index_of; + index_of.reserve(order.size()); + for (size_t i = 0; i < order.size(); ++i) { + index_of[order[i]] = i; + } + vector dists; + vector> sinks; + tie(dists, sinks) = min_dists(order, index_of, graph); + return traceback(order, index_of, graph, dists, sinks, max_tracebacks); +} + +pair, vector>> DeletionAligner::min_dists( + const vector& order, + const unordered_map& index_of, + const HandleGraph& graph) const +{ +#ifdef debug_deletion_aligner + cerr << "finding min dists among " << order.size() << " handles" << endl; +#endif + // use dynamic programming to compute the minimum distance from a source + vector dists(order.size(), numeric_limits::max()); + vector> sinks; + for (size_t i = 0; i < order.size(); ++i) { + if (dists[i] == numeric_limits::max()) { + // nothing has replaced the starting value, must be a source + dists[i] = 0; +#ifdef debug_deletion_aligner + cerr << "Declare a source at " << graph.get_id(order[i]) << (graph.get_is_reverse(order[i]) ? '-' : '+') << endl; +#endif + } + size_t length_thru = dists[i] + graph.get_length(order[i]); + bool is_sink = true;; + graph.follow_edges(order[i], false, [&](const handle_t& next) { + size_t j = index_of.at(next); + dists[j] = min(dists[j], length_thru); +#ifdef debug_deletion_aligner + cerr << "Edge from " << graph.get_id(order[i]) << (graph.get_is_reverse(order[i]) ? '-' : '+') + << " to " << graph.get_id(next) << (graph.get_is_reverse(next) ? '-' : '+') + << " assigns distance " << dists[j] << endl; +#endif + is_sink = false; + }); + if (is_sink) { + // didn't find any edges forward + sinks.emplace_back(i, length_thru); +#ifdef debug_deletion_aligner + cerr << "Declare a sink at " << graph.get_id(order[i]) << (graph.get_is_reverse(order[i]) ? '-' : '+') << " with distance " << length_thru << endl; +#endif + } + } +#ifdef debug_deletion_aligner + cerr << "min dist results:" << endl; + for (size_t i = 0; i < order.size(); ++i) { + cerr << "\t" << i << " (node " << graph.get_id(order[i]) << "): " << dists[i] << endl; + } + cerr << "sinks:" << endl; + for (auto sink : sinks) { + cerr << "\t" << sink.first << " " << sink.second << endl; + } +#endif + return make_pair(dists, sinks); +} + +vector> DeletionAligner::traceback(const vector& order, + const unordered_map& index_of, + const HandleGraph& graph, + const vector& dists, + const vector>& sinks, + size_t max_tracebacks) const { + +#ifdef debug_deletion_aligner + cerr << "beginning multi-traceback" << endl; +#endif + // records of (distance, deflections (from, to)) + structures::MinMaxHeap>>> heap; + vector> traces; + + // check if we want to take this deviation from the optimal traceback next time + auto propose_deflection = [&](size_t from, size_t to, size_t dist, + const vector>& curr_deflections) { +#ifdef debug_deletion_aligner + cerr << "proposing deflection from " << from << " to " << to << " with dist " << dist << endl; +#endif + if (heap.size() + traces.size() < max_tracebacks || + (!heap.empty() && heap.max().first > dist)) { + // we've used all of the current deflections (so we can now propose more) + // and we either haven't fully populated the heap, or this is better than + // the worst +#ifdef debug_deletion_aligner + cerr << "accepted deflection" << endl; +#endif + vector> deflections = curr_deflections; + deflections.emplace_back(from, to); + heap.emplace(dist, move(deflections)); + if (heap.size() + traces.size() > max_tracebacks) { +#ifdef debug_deletion_aligner + cerr << "ejecting traceback with dist " << 
heap.max().first << endl; +#endif + heap.pop_max(); + } + } + }; + + // get the next trace, either by taking a deflection or by doing traceback, + // also propose deflections as needed + auto get_next = [&](size_t at, size_t& deflxn, size_t curr_dist, + const vector>& curr_deflections) { + if (deflxn < curr_deflections.size() + && at == curr_deflections[deflxn].first) { + return curr_deflections[deflxn++].second; + } + else { + size_t next = numeric_limits::max(); + size_t dist_here = dists[at]; + graph.follow_edges(order[at], true, [&](const handle_t& prev) { + size_t idx = index_of.at(prev); + size_t dist_thru = dists[idx] + graph.get_length(prev); + if (next == numeric_limits::max() && dist_thru == dist_here) { + next = idx; + } + else if (deflxn == curr_deflections.size()) { + propose_deflection(at, idx, curr_dist - dist_here + dist_thru, + curr_deflections); + } + // keep looking if we haven't found the trace or we're looking + // for deflections + return (next == numeric_limits::max() || + deflxn == curr_deflections.size()); + }); + return next; + } + }; + + // the first deflection is from a sentinel at the end to whichever + // sink node we'll start from, propose these deflections to init the heap + vector> deflections; + for (auto& sink : sinks) { + propose_deflection(order.size(), sink.first, sink.second, + deflections); + } + + traces.reserve(max_tracebacks); + while (!heap.empty()) { + // init the next traceback + traces.emplace_back(); + auto& trace = traces.back(); + size_t trace_dist; + tie(trace_dist, deflections) = heap.min(); + heap.pop_min(); + +#ifdef debug_deletion_aligner + cerr << "beginning trace of dist " << trace_dist << " with deflections:" << endl; + for (auto d : deflections) { + cerr << "\t" << d.first << " -> " << d.second << endl; + } +#endif + + // start by taking deflection from the beginning sentinel + size_t deflxn = 0; + size_t tracer = get_next(order.size(), deflxn, trace_dist, + deflections); + while (tracer != numeric_limits::max()) { +#ifdef debug_deletion_aligner + cerr << "doing trace on " << tracer << endl; +#endif + trace.push_back(order[tracer]); + tracer = get_next(tracer, deflxn, trace_dist, deflections); + } + } + return traces; +} + +void DeletionAligner::trace_to_alignment(Alignment& aln, const vector& trace, + const HandleGraph& graph) const { + int64_t total_dist = 0; + auto path = aln.mutable_path(); + // traces are constructed in reverse + for (auto it = trace.rbegin(); it != trace.rend(); ++it) { + handle_t handle = *it; + auto mapping = path->add_mapping(); + auto position = mapping->mutable_position(); + position->set_node_id(graph.get_id(handle)); + position->set_is_reverse(graph.get_is_reverse(handle)); + auto edit = mapping->add_edit(); + edit->set_from_length(graph.get_length(handle)); + total_dist += edit->from_length(); + } + // TODO: ideally scoring would live in the Aligner, but i guess it's okay + aln.set_score(total_dist ? -gap_open - (total_dist - 1) * gap_extension : 0); +} + +} diff --git a/src/deletion_aligner.hpp b/src/deletion_aligner.hpp new file mode 100644 index 00000000000..db23018510a --- /dev/null +++ b/src/deletion_aligner.hpp @@ -0,0 +1,72 @@ +/** + * \file deletion_aligner.hpp + * + * Defines an aligner for global deletions + * + */ +#ifndef VG_DELETION_ALIGNER_GRAPH_HPP_INCLUDED +#define VG_DELETION_ALIGNER_GRAPH_HPP_INCLUDED + +#include +#include +#include + +#include "handle.hpp" + +namespace vg { + +using namespace std; + +/* + * An aligner for global deletions. 
Can only produce alignments + * for empty sequences + */ +class DeletionAligner { +public: + DeletionAligner(int8_t gap_open, int8_t gap_extension); + DeletionAligner() = default; + ~DeletionAligner() = default; + + // store a global alignment of an empty sequence to this graph in the alignment + // crashes if alignment sequence is not empty + void align(Alignment& aln, const HandleGraph& graph) const; + + // store a global alignment of an empty sequence to this graph in the alignment + // also store the next highest-scoring alignments in the vector + // crashes if alignment sequence is not empty + void align_multi(Alignment& aln, vector& alt_alns, + const HandleGraph& graph, int32_t max_alt_alns) const; + + +private: + + // do the dynamic programming and return the trace backs + vector> run_dp(const HandleGraph& graph, + int32_t max_tracebacks) const; + + // calculate min distance with DP. returns the DP table and + // the sink nodes in the graph, with their full DP value + pair, vector>> min_dists(const vector& order, + const unordered_map& index_of, + const HandleGraph& graph) const; + + // compute tracebacks using the DP table + vector> traceback(const vector& order, + const unordered_map& index_of, + const HandleGraph& graph, + const vector& dists, + const vector>& sinks, + size_t max_tracebacks) const; + + // convert a trace into an alignment path + void trace_to_alignment(Alignment& aln, const vector& trace, + const HandleGraph& graph) const; + + int32_t gap_open; + int32_t gap_extension; + +}; + +} + +#endif diff --git a/src/dinucleotide_machine.cpp b/src/dinucleotide_machine.cpp new file mode 100644 index 00000000000..2bbc1545bfc --- /dev/null +++ b/src/dinucleotide_machine.cpp @@ -0,0 +1,107 @@ +/** + * \file dinucleotide_machine.cpp + * + * Implements DinucleotideMachine + * + */ + +#include "dinucleotide_machine.hpp" + +//#define debug_machine + +#ifdef debug_machine +#include +#endif + +namespace vg { + +DinucleotideMachine::DinucleotideMachine() { + // build the transition table for the state machine + for (size_t i = 0; i < 16; ++i) { + uint16_t base = 0; + for (size_t j = 0; j < 4; ++j) { + if (i & (1 << j)) { + base |= 1 << (4 * j); + } + } + for (size_t j = 0; j < 4; ++j) { + // the dinucleotide set ending in j + transition_table[4 * i + j] = base << j; + // the dinucleotide set ending in j including Nj + transition_table[4 * i + j + 64] = (base << j) | (1 << (16 + j)); + } + } + // handle transitions to the XN state (the lookup value for N indexes + // past the rest of the transition table) + for (size_t i = 128; i < 256; ++i) { + transition_table[i] = init_state(); + } + +#ifdef debug_machine + cerr << "constructed transition table:" << endl; + for (size_t i = 0; i < 32; ++i) { + cerr << i << "\t" << bitset<8>(i); + for (size_t j = 0; j < 4; ++j) { + cerr << "\t" << bitset<32>(transition_table[4 * i + j]); + } + cerr << endl; + } +#endif + + // build a translation table for ASCII nucleotides + for (size_t i = 0; i < 256; ++i) { + switch (i) { + case 'a': + case 'A': + nt_table[i] = 0; + break; + case 'c': + case 'C': + nt_table[i] = 1; + break; + case 'g': + case 'G': + nt_table[i] = 2; + break; + case 't': + case 'T': + nt_table[i] = 3; + break; + default: + // this will cause us to index past the entire table into + // the XN state + nt_table[i] = 128; + break; + } + } +} + +uint32_t DinucleotideMachine::init_state() const { + // start in the XN state + return 1 << 20; +} + +uint32_t DinucleotideMachine::update_state(uint32_t state, char next) const { + // merge the 
dinucleotide set according to its final base from positions [15,0]
+    uint32_t transition_row = state | (state >> 4);
+    transition_row |= (transition_row >> 8);
+    // merge in the XN and NA...NT states from positions [20,16]
+    transition_row = (transition_row & 0xf) | (state >> 16);
+    // do the transitions
+    return transition_table[(transition_row << 2) | nt_table[next]];
+}
+
+
+uint32_t DinucleotideMachine::merge_state(uint32_t state_1, uint32_t state_2) const {
+    return state_1 | state_2;
+}
+
+bool DinucleotideMachine::matches(uint32_t state, const char* dinucleotide) const {
+    return state & (1 << ((nt_table[dinucleotide[0]] << 2) | nt_table[dinucleotide[1]]));
+}
+
+bool DinucleotideMachine::matches(uint32_t state, const string& dinucleotide) const {
+    return matches(state, dinucleotide.c_str());
+}
+
+}
diff --git a/src/dinucleotide_machine.hpp b/src/dinucleotide_machine.hpp
new file mode 100644
index 00000000000..101c4ac733d
--- /dev/null
+++ b/src/dinucleotide_machine.hpp
@@ -0,0 +1,54 @@
+/**
+ * \file dinucleotide_machine.hpp
+ *
+ * Defines a nondeterministic finite automaton over dinucleotides
+ *
+ */
+#ifndef VG_DINUCLEOTIDE_MACHINE_GRAPH_HPP_INCLUDED
+#define VG_DINUCLEOTIDE_MACHINE_GRAPH_HPP_INCLUDED
+
+#include
+#include
+#include
+#include
+
+namespace vg {
+
+using namespace std;
+
+/*
+ * Represents a non-deterministic finite automaton whose states
+ * correspond to dinucleotides
+ */
+class DinucleotideMachine {
+public:
+    DinucleotideMachine();
+    ~DinucleotideMachine() = default;
+
+    /// Return an empty dinucleotide set
+    uint32_t init_state() const;
+
+    /// Get the dinucleotide set that results from extending the set by the given character
+    /// Ns are valid, but will result in no matches
+    uint32_t update_state(uint32_t state, char next) const;
+
+    /// Get the union of two dinucleotide sets
+    uint32_t merge_state(uint32_t state_1, uint32_t state_2) const;
+
+    /// Return true if the set includes this dinucleotide. Only valid for dinucleotides of ACGT (never N).
+ bool matches(uint32_t state, const char* dinucleotide) const; + + /// Same semantics as above + bool matches(uint32_t state, const string& dinucleotide) const; + +private: + + // lookup table for transitions + uint32_t transition_table[256]; + // ASCII-indexed conversion from char to table index + uint32_t nt_table[256]; +}; + +} + +#endif diff --git a/src/distance.cpp b/src/distance.cpp deleted file mode 100644 index 9e81e152be4..00000000000 --- a/src/distance.cpp +++ /dev/null @@ -1,3496 +0,0 @@ -//#define indexTraverse -//#define printDistances - -#include "distance.hpp" - -using namespace std; -namespace vg { - -/*TODO: - * Fix documentation - * Make chain distance API based on the direction the node is traversed - * Make snarls/chains represented by the node id in netgraph - * Max distance probably hits long loops - */ - -int64_t DistanceIndex::sizeOf() { -//TODO: Delete this - //Estimate of the size of the object in memory - - int64_t total = 0; - - int64_t numSnarls = snarlDistances.size(); - - int64_t snarlDists = 0; - int64_t snarlNodes = 0; //# node ids + direction - - for (auto x : snarlDistances) { - //Add size of each SnarlIndex object - SnarlIndex sd = x.second; - int64_t numNodes = sd.visitToIndex.size(); - - snarlNodes += numNodes; - snarlDists += ((numNodes + 1) * numNodes) / 2; - - total += numNodes * 17; //Add all elements in visitToIndex - total += sd.distances.capacity() / 8; - - total += 3 * sizeof(pair); - total += sizeof(hash_map, int64_t>); - - - } - - int64_t chainDists = 0; - int64_t chainNodes = 0; - - int64_t numChains = chainDistances.size(); - - for (auto x : chainDistances) { - ChainIndex cd = x.second; - int64_t numNodes = cd.snarlToIndex.size(); - - chainDists += numNodes*3; - chainNodes += numNodes; - - total += numNodes * 16; //Add all elements in snarlToIndex - total += cd.prefixSum.capacity() / 8; - total += cd.loopFd.capacity() / 8; - total += cd.loopRev.capacity() / 8; - total += sizeof(id_t) + sizeof(hash_map); - } - - total += nodeToSnarl.size() * 8;//TODO: ??? - - - cerr << numSnarls << " snarls containing " << snarlNodes << " nodes" << endl; - cerr << numChains << " chains containing " << chainNodes << " nodes" << endl; - cerr << "Total: " << total << " bytes??" 
<< endl; - return total; - - -} - - -DistanceIndex::DistanceIndex(HandleGraph* vg, SnarlManager* snarlManager, uint64_t cap){ - /*Constructor for the distance index given a VG and snarl manager - cap is the largest distance that the maximum distance estimation will be - accurate to - */ - - if (snarlManager->top_level_snarls().size() == 0 && maxNodeID > 1) { - - throw runtime_error("Snarl manager is empty"); - } - minNodeID = -1; - maxNodeID = -1; - - - graph = vg; - sm = snarlManager; - - #ifdef indexTraverse - cerr << endl << "Creating distance index"<< endl; - #endif - - //Calculate minimum distance index - const vector topSnarls = sm->top_level_snarls(); - - unordered_set seenSnarls; - for (const Snarl* snarl : topSnarls) { - if (seenSnarls.count(snarl) == 0){ - if (sm->in_nontrivial_chain(snarl)){ - const Chain* chain = sm->chain_of(snarl); - calculateMinIndex(chain); - for (auto s : *chain) { - seenSnarls.insert(s.first); - } - } else { - Chain currChain; - currChain.emplace_back(snarl, false); - calculateMinIndex(&currChain); - seenSnarls.insert(snarl); - } - - } - - } - nodeToSnarl = calculateNodeToSnarl(snarlManager); - //TODO: Cap should be given -// maxIndex = MaxDistanceIndex (this, topSnarls, cap); - -}; - - -DistanceIndex::DistanceIndex(HandleGraph* vg, SnarlManager* snarlManager, istream& in) { - - /*Constructor for the distance index given a VG, snarl manager, and a vector - of ints from serialization - */ - - //TODO: These need to change but shouldn't be needed for min index - minNodeID = 0; - maxNodeID = 0; - graph = vg; - sm = snarlManager; - load(in); - - for ( auto x : snarlDistances ) { - //Check that vg and snarl manager match the distance index - - pair node = x.first; -/* TODO: Make test that uses handle graph - if (!graph->has_node(node.first)) { - - throw runtime_error("Distance index does not match vg"); - - } else */if (sm->into_which_snarl(node.first, node.second) == NULL) { - - throw runtime_error("Distance index does not match snarl manager"); - } - - } - - uint64_t cap = 20; - const vector topSnarls = sm->top_level_snarls(); -//TODO: Serialize this too -// maxIndex = MaxDistanceIndex (this, topSnarls, cap); -} -void DistanceIndex::load(istream& in){ - //Load serialized index from an istream - auto toInt = [] (uint64_t uval) { - /*convert unsigned representation of signed int back to int64_t*/ - int64_t val = uval / 2; - if (uval % 2 == 1) {val = -val;} - return val; - }; - - int_vector<> d1; - int_vector<> d2; - int_vector<> d3; - int_vector<> d4; - - d1.load(in); - d2.load(in); - d3.load(in); - d4.load(in); - nodeToSnarl.load(in); - - vector snarlNodes(d1.size(), 0); - vector snarlVector(d2.size(), 0); - vector chainNodes(d3.size(), 0); - vector chainVector(d4.size(), 0); - - for (size_t i = 0; i < d1.size(); i++) { - uint64_t uval = d1[i]; - int64_t val = toInt(uval); - snarlNodes[i] = val; - } - - for (size_t i = 0; i < d2.size(); i++) { - snarlVector[i] = toInt(d2[i]); - } - for (size_t i = 0; i < d3.size(); i++) { - chainNodes[i] = toInt(d3[i]); - } - for (size_t i = 0; i < d4.size(); i++) { - chainVector[i] = toInt(d4[i]); - } - - - //Construct snarl index - size_t snarlI = 0;//Index into snarlVector - for (size_t i = 0; i < snarlNodes.size()/2; i++) { - int64_t snarlInt = snarlNodes[2*i]; - - pair node = snarlInt < 0 ? 
- make_pair( (id_t) abs(snarlInt), true) : - make_pair( (id_t) abs(snarlInt), false); - size_t nextIndex = snarlI + snarlNodes[2*i+1]; - - - vector snarlv;//get subvector - snarlv.resize(nextIndex - snarlI); - for (size_t j = 0; j < nextIndex-snarlI; j++) { - snarlv[j] = snarlVector[snarlI + j]; - } - - - //Create SnarlIndex object from vector and assign - snarlDistances.insert(make_pair(node, SnarlIndex (this, snarlv))); - - snarlI = nextIndex; - - } - - - size_t chainI = 0; //Index into chainVector - //Construct chain index - for (size_t i = 0; i < chainNodes.size()/2; i++) { - id_t chainID = (id_t) chainNodes[2*i]; - - size_t nextIndex = chainI + chainNodes[2*i + 1]; - - vector chainv; - chainv.resize(nextIndex - chainI); - for ( size_t j = 0; j < nextIndex - chainI; j++) { - - chainv[j] = chainVector[chainI + j]; - } - - //Create chaindistances object and assign in index - chainDistances.insert(make_pair(chainID, ChainIndex(chainv))); - - chainI = nextIndex; - - } -}; - -void DistanceIndex::serialize(ostream& out) { - - auto toUint = [](int64_t val) { - /* convert signed integer into unsigned representation where last bit - represents sign*/ - uint64_t uval= abs(val) * 2; - if (val < 0) { uval += 1; } - return uval; - }; - - vector d1; //Snarl nodes as a vector [node, length, node, ...] - vector d2; //Snarl distances as a vector - vector d3; //Chain nodes as a vector - vector d4; //Chain distances as a vector - - size_t snarlNodesI = 0; - size_t snarlVectorI = 0; - //Serialize snarls - d1.resize(2*snarlDistances.size()); - - for (pair, SnarlIndex> snarlPair : snarlDistances) { - int64_t nodeInt = snarlPair.first.second ? - - (int64_t) snarlPair.first.first : (int64_t) snarlPair.first.first; - vector currVector = snarlPair.second.toVector(); - - d1[snarlNodesI++] = nodeInt; - d1[snarlNodesI++] = (int64_t) currVector.size(); - - d2.resize(d2.size() + currVector.size()); - //Concatenate new vector onto whole snarl vector - for (auto x : currVector) { - d2[snarlVectorI++] = x; - } - } - - size_t chainNodesI = 0; - size_t chainVectorI = 0; - //Serialize chains - for (pair chainP: chainDistances) { - vector currVector = chainP.second.toVector(); - - d3.resize(d3.size() + 2 ); - d3[chainNodesI++] = (int64_t) chainP.first; - d3[chainNodesI++] = (int64_t) currVector.size(); - - d4.resize(d4.size() + currVector.size()); - for (auto x : currVector) { - d4[chainVectorI ++ ] = x; - } - } - - //Convert vectors of ints to int_vector TODO: Start with int_vector - int_vector<> snarlNodes; - int_vector<> snarlVector; - int_vector<> chainNodes; - int_vector<> chainVector; - - util::assign(snarlNodes, int_vector<>(d1.size())); - util::assign(snarlVector, int_vector<>(d2.size())); - util::assign(chainNodes, int_vector<>(d3.size())); - util::assign(chainVector, int_vector<>(d4.size())); - - //Copy vector into int_vector - for (size_t i = 0; i < d1.size(); i++) { - snarlNodes[i] = toUint(d1[i]); - } - - for (size_t i = 0; i < d2.size(); i++) { - snarlVector[i] = toUint(d2[i]); - } - for (size_t i = 0; i < d3.size(); i++) { - chainNodes[i] = toUint(d3[i]); - } - for (size_t i = 0; i < d4.size(); i++) { - chainVector[i] = toUint(d4[i]); - } - - util::bit_compress(snarlNodes); - util::bit_compress(snarlVector); - util::bit_compress(chainNodes); - util::bit_compress(chainVector); - - //Serialize - - snarlNodes.serialize(out, NULL, "snarl_nodes"); - snarlVector.serialize(out, NULL, "snarl_vector"); - chainNodes.serialize(out, NULL, "chain_nodes"); - chainVector.serialize(out, NULL, "chain_vector"); - 
nodeToSnarl.serialize(out, NULL, "node_to_snarl"); -} - - -int_vector<> DistanceIndex::calculateNodeToSnarl(SnarlManager* sm){ - - auto toUint = [](int64_t val) { - /* convert signed integer into unsigned representation where last bit - represents sign*/ - uint64_t uval= abs(val) * 2; - if (val < 0) { uval += 1; } - return uval; - }; - - int_vector<> result(maxNodeID - minNodeID + 1, 0); - - const vector topSnarls = sm->top_level_snarls(); - vector allSnarls(topSnarls.begin(), topSnarls.end()); - - while (allSnarls.size() > 0) { - const Snarl* snarl = allSnarls.back(); - allSnarls.pop_back(); - - int64_t currSnarlID = snarl->start().backward() ? - -snarl->start().node_id() : snarl->start().node_id(); - - NetGraph ng = NetGraph(snarl->start(), snarl->end(), sm->chains_of(snarl), graph); - - //Get all the nodes in the snarl - - vector allNodes; - - auto addNode = [&](const handle_t& h)-> bool { - allNodes.push_back(ng.get_id(h)); - return true; - - }; - ng.for_each_handle(addNode); - - for ( id_t nodeID : allNodes) { - - const Snarl* tempSnarl = sm->into_which_snarl(nodeID, true); - const Snarl* nextSnarl = tempSnarl == NULL ? - sm->into_which_snarl(nodeID, false) : tempSnarl; - - if (nodeID != snarl->start().node_id() && - nodeID != snarl->end().node_id() && nextSnarl != NULL) { - //If this node is a child snarl - if (sm->in_nontrivial_chain(nextSnarl)) { - const Chain* chain = sm->chain_of(nextSnarl); - for (auto s : *chain) { - allSnarls.push_back(s.first); - } - } else { - allSnarls.push_back(nextSnarl); - } - - } else { - - //If this node is just a node - result[nodeID - minNodeID] = toUint(currSnarlID); - - } - } - } - util::bit_compress(result); - return result; - -} - - - -///////////////////////// MINIMUM INDEX /////////////////////////////// - - - -int64_t DistanceIndex::calculateMinIndex(const Chain* chain) { - /*Populate the DistanceIndex - Add chain to distance index and recursively add all distances in - snarls contained within chain - */ - auto cmp = [] (pair,int64_t> x, - pair,int64_t> y) { - //Comparison function for the priority of a pair of handle, distance - return (x.second > y.second); - }; - - //Vectors for chain distance index - //initialize chain prefix sum to [0, len of first node in chain] - vector chainPrefixSum (1,0); - vector chainLoopFd; - vector chainLoopRev; - handle_t firstNode = graph->get_handle(get_start_of(*chain)); - - #ifdef indexTraverse - cerr << "Prefix sum before chain initial node: " << chainPrefixSum.back() << endl; - #endif - chainPrefixSum.push_back(graph->get_length(firstNode)); - #ifdef indexTraverse - cerr << "Prefix sum after chain initial node: " << chainPrefixSum.back() << endl; - #endif - - hash_map snarlToIndex; - snarlToIndex[get_start_of(*chain).node_id()] = 0; - #ifdef indexTraverse - cerr << "Node " << get_start_of(*chain).node_id() << " represents snarl at index " - << snarlToIndex[get_start_of(*chain).node_id()] << endl; - #endif - - ChainIterator chainEnd = chain_end(*chain); - for (ChainIterator c = chain_begin(*chain); c != chainEnd; ++c) { - //for each snarl in the chain - - - const Snarl* snarl = c->first; - bool snarlRevInChain = c->second; - - id_t snarlStartID = snarl->start().node_id(); - bool snarlStartRev = snarl->start().backward(); //into snarl - id_t snarlEndID = snarl->end().node_id(); - bool snarlEndRev = snarl->end().backward(); //pointing out - - -/*TODO: Make a test that uses handle graph - if (!( graph->has_node(snarlStartID) && graph->has_node(snarlEndID))) { - //Make sure that vg contains the boundary nodes 
of this snarl - throw runtime_error("Snarl manager does not match vg"); - } -*/ - - if (snarlToIndex.find(snarlEndID) == snarlToIndex.end()){ - //Store the index of the start of the snarl only if it hasn't - //already been seen (if the chain loops) - size_t nextIndex = snarlToIndex.size(); - snarlToIndex[snarlEndID] = nextIndex; - - #ifdef indexTraverse - cerr << "Node " << snarlEndID << " represents snarl at index " - << snarlToIndex[snarlEndID] << endl; - #endif - } else { - #ifdef indexTraverse - cerr << "Node " << snarlEndID << " already represents a snarl, at index " << snarlToIndex[snarlEndID] << endl; - #endif - } - - - NetGraph ng = - NetGraph(snarl->start(), snarl->end(), sm->chains_of(snarl), graph); - - - - //Get all the nodes in the snarl - - unordered_set> allNodes; - - auto addNode = [&](const handle_t& h)-> bool { - allNodes.insert(make_pair(ng.get_id(h), ng.get_is_reverse(h))); - allNodes.insert(make_pair(ng.get_id(h), !ng.get_is_reverse(h))); - return true; - - }; - ng.for_each_handle(addNode); - - //Create snarl distance obj for current snarl and add to distance index - snarlDistances.insert(make_pair( make_pair(snarlStartID,snarlStartRev), - SnarlIndex(this,allNodes, make_pair(snarlStartID,snarlStartRev), - make_pair(snarlEndID, snarlEndRev)))); - - SnarlIndex& sd = snarlDistances.at(make_pair(snarlStartID, snarlStartRev)); - - #ifdef indexTraverse - cerr << "Snarl at " << snarl->start() << " -> " << snarl->end() << endl; - cerr << " Contains nodes : "; - { - unordered_set reported; - for (pair node: allNodes) { - if (!reported.count(node.first)) { - cerr << node.first << " "; - reported.insert(node.first); - } - } - } - cerr << endl; - #endif - - for (pair startID : allNodes){ - //Use each node in the snarl as start of djikstra search - - - minNodeID = minNodeID == -1 ? startID.first : - min(minNodeID, startID.first); - maxNodeID = max(maxNodeID, startID.first); - - handle_t startHandle = - graph->get_handle(startID.first, startID.second); - //Priority queue of reachable nodes (pair of node id and direction) - priority_queue< pair, int64_t>, - vector, int64_t>>, - decltype(cmp)> reachable(cmp); - reachable.push(make_pair(startID, 0)); - - #ifdef indexTraverse - cerr << " Start Node: " << startID.first << "," - << startID.second << endl; - #endif - bool firstLoop = true; - unordered_set> seenNodes; - - while (reachable.size() > 0) { - pair, int64_t> next = reachable.top(); - reachable.pop(); - pair currID= next.first; - handle_t currHandle = graph->get_handle(currID.first, - currID.second); - int64_t currDist = next.second; - if ( seenNodes.count(currID) == 0) { - //If node has not already been found: - - //Save distance from start to current node - if (!firstLoop) { - - sd.insertDistance(startID, currID, currDist); - seenNodes.insert(currID); - - } - - - - int64_t nodeLen; //length of the current node - - int64_t loopDist = -1; - //Dist to enter curr node then exit at same side - - //Get the snarl that the node represents, if any - const Snarl* tempSnarl = sm->into_which_snarl( - currID.first, currID.second); - const Snarl* currSnarl = tempSnarl == NULL ? 
- sm->into_which_snarl(currID.first, !currID.second) : - tempSnarl; - - if (currID.first != snarlStartID && - currID.first != snarlEndID && - currSnarl != NULL) { - //If current node is a child snarl/chain - - - if (sm->in_nontrivial_chain(currSnarl)) { - //The node is a chain - - const Chain* currChain= sm->chain_of(currSnarl); - auto chainDists = chainDistances.find( get_start_of( - *currChain).node_id()); - - if (chainDists != chainDistances.end()) { - //Length of chain has already been found - - //Get the length of the node (chain) - nodeLen = chainDists->second.chainLength(); - - //Get loop dist- enter and exit chain at same side - if (get_start_of( - *currChain).backward() == currID.second) { - //If traversing snarl forward in chain - - loopDist = chainDists->second.loopFd[0] - 1; - - if (loopDist != -1) { - loopDist = loopDist + graph->get_length( - currHandle) ; - } - - } else { - loopDist = chainDists->second.loopRev[ - chainDists->second.loopRev.size()-1] - 1; - handle_t tempHandle = graph->get_handle( - get_end_of(*currChain)); - - if (loopDist != -1) { - loopDist = loopDist + - graph->get_length(tempHandle); - } - } - - } else {//haven't recursed on this chain yet - #ifdef indexTraverse - cerr << " recurse" << endl; - #endif - nodeLen = calculateMinIndex(currChain); - - ChainIndex& currChainDists = - chainDistances.at( get_start_of( - *currChain).node_id()); - if (get_start_of( - *currChain).backward() == currID.second) { - //If traversing snarl forward in chain - - loopDist = currChainDists.loopFd[0] - 1; - - if (loopDist != -1) { - loopDist = loopDist + graph->get_length( - currHandle); - } - } else { - - loopDist = currChainDists.loopRev[ - currChainDists.loopRev.size()-1] - 1; - - if (loopDist != -1) { - handle_t tempHandle = graph->get_handle( - get_end_of(*currChain)); - loopDist = loopDist + - graph->get_length(tempHandle); - } - } - } - } else {//Snarl - - id_t snarlID = currSnarl->start().node_id(); - bool snarlRev = currSnarl->start().backward(); - id_t endID = currSnarl->end().node_id(); - bool endRev = currSnarl->end().backward(); - - auto snarlDists = snarlDistances.find(make_pair( - snarlID, snarlRev)); - - if (snarlDists != snarlDistances.end()) {//Already found - nodeLen = snarlDists->second.snarlLength( - graph, &ng); - - //Find the distance to enter and exit snarl - //at the same side - if (currID.second == snarlRev) { - //If traversing snarl forward - loopDist = snarlDists->second.snarlDistance( - graph, &ng, - make_pair(snarlID, snarlRev), - make_pair(snarlID, !snarlRev)); - - if (loopDist != -1) { - loopDist = loopDist+ graph->get_length( - currHandle); - } - } else { - loopDist = snarlDists->second.snarlDistance( - graph, &ng, - make_pair(endID, !endRev), - make_pair(endID, endRev)); - - if (loopDist != -1) { - handle_t tempHandle = - graph->get_handle(currSnarl->end()); - loopDist = loopDist + - graph->get_length(tempHandle); - } - } - } else {//Haven't recursed on snarl yet - #ifdef indexTraverse - cerr << " recurse" << endl; - #endif - - //Create chain to recurse on and recurse - Chain currChain; - - currChain.emplace_back(currSnarl, false); - calculateMinIndex(&currChain); - - SnarlIndex& currSnarlDists = - snarlDistances.at(make_pair(snarlID,snarlRev)); - - nodeLen = currSnarlDists.snarlLength(graph, &ng); - - //Find the distance to enter and exit snarl - //at the same side - if (currID.second == snarlRev) { - - loopDist = currSnarlDists.snarlDistance( - graph, &ng, - make_pair(snarlID, snarlRev), - make_pair(snarlID, !snarlRev)); - - if (loopDist != 
-1) { - loopDist = loopDist +graph->get_length( - currHandle); - } - - } else { - - loopDist = currSnarlDists.snarlDistance( - graph, &ng, - make_pair(endID, !endRev), - make_pair(endID, endRev)); - - if (loopDist != -1) { - handle_t tempHandle = graph->get_handle - (currSnarl->end()); - loopDist = loopDist+ - graph->get_length(tempHandle); - } - } - } - - } - } else { //Node is just a node - nodeLen = graph->get_length(currHandle); - } - - - const handle_t currHandle = ng.get_handle(currID.first, - currID.second); - - - if (loopDist != -1 && !firstLoop) { - /*If there is a path within the current node that loops - to enter the node and exit it at the same side - add - reachable nodes from current node in reverse - Do not add this distance if the current node is the - starting node */ - - handle_t revHandle = ng.get_handle( - ng.get_id(currHandle), - !ng.get_is_reverse(currHandle)); - - - auto addRevHandle = [&](const handle_t& h)-> bool { - pair node = make_pair( - ng.get_id(h), ng.get_is_reverse(h)); - reachable.push(make_pair(node, - currDist + loopDist)); - - - return true; - }; - - ng.follow_edges(revHandle, false, addRevHandle); - } - - //Add reachable nodes to priority queue - auto addHandle = [&](const handle_t& h)-> bool { - pair node = make_pair( - ng.get_id(h), ng.get_is_reverse(h)); - if (nodeLen != -1) { - reachable.push(make_pair(node, currDist + nodeLen)); - } - - #ifdef indexTraverse - cerr << node.first << " " << node.second << ", "; - #endif - return true; - }; - //Add reachable nodes to priority queue for unary snarl that doesn't loop - 0 distance - auto addHandle0 = [&](const handle_t& h)-> bool { - pair node = make_pair( - ng.get_id(h), ng.get_is_reverse(h)); - reachable.push(make_pair(node, 0)); - - #ifdef indexTraverse - cerr << node.first << " " << node.second << ", "; - #endif - return true; - }; - - - if ( (nodeLen == -1 && firstLoop) || currID == startID) { - //If the nodeLen is -1 then node is a unary snarl that doesn't have a path from start to end. If this is the start of the distance calculation then add subsequent nodes assuming that the node length was 0 - //Or if this is the starting node - - #ifdef indexTraverse - - cerr << " From start node " << startID.first << " " << startID.second - << " in snarl " << snarl->start() << " -> " << snarl->end() - << " at " << ng.get_id(currHandle) << " " << ng.get_is_reverse(currHandle) << endl; - cerr << " Adding next nodes: "; - #endif - ng.follow_edges(currHandle, false, addHandle0); - - } else { - - #ifdef indexTraverse - cerr << " From start node " << startID.first << " " << startID.second<< " in snarl " << snarl->start().node_id() << " at " << ng.get_id(currHandle) << " " << ng.get_is_reverse(currHandle) << endl; - cerr << " Adding next nodes: "; - #endif - ng.follow_edges(currHandle, false, addHandle); - } - - if (nodeLen != -1) { - ng.follow_edges(currHandle, false, addHandle); - } else if (firstLoop) { - //If the nodeLen is -1 then node is a unary snarl that - //doesn't have a path from start to end. If this is the - //start of the distance calculation then add subsequent - //nodes assuming that the node length was 0 - ng.follow_edges(currHandle, false, addHandle0); - } - - - //Add edges between the boundary nodes that are not in - //the net graph - int64_t nextDist = currID == startID ? 
0 : currDist+nodeLen; - - if ((currID.first == snarlStartID && - currID.second != snarlStartRev) || - ( currID.first == snarlEndID && - currID.second == snarlEndRev ) ) { - - //If currently leaving start of snarl - auto addHandleEnd = [&](const handle_t& h)-> bool { - pair node = make_pair( - ng.get_id(h), ng.get_is_reverse(h)); - if ( node.first == snarlStartID || - node.first == snarlEndID ) { - reachable.push(make_pair(node, nextDist)); - } - return true; - }; - graph->follow_edges(currHandle, false, addHandleEnd); - - } - #ifdef indexTraverse - cerr << " prev dist: " << currDist << "+ new dist " << nodeLen << endl; - #endif - } - firstLoop = false; - }//End while loop - }//End for loop over starting node/directions in a snarl - - #ifdef indexTraverse - cerr << "End snarl " << snarl->start() << " -> " << snarl->end() << endl; - #endif - - /*Add to prefix sum the distance to the beginning and end of the last - node in the current snarl - */ - - int64_t dist; - if (snarlRevInChain) { - //If traversing snarl backwards - dist = sd.snarlDistance( graph, &ng, - make_pair (snarlEndID, !snarlEndRev) , - make_pair (snarlStartID, !snarlStartRev)); - - chainPrefixSum.push_back(chainPrefixSum[chainPrefixSum.size()-2]+ - dist); - - #ifdef indexTraverse - cerr << "Prefix sum before snarl reverse start: " << chainPrefixSum.back() << endl; - #endif - - handle_t tempHandle = graph->get_handle(snarlStartID, false); - chainPrefixSum.push_back(chainPrefixSum[chainPrefixSum.size()-1] + - graph->get_length(tempHandle)); - - #ifdef indexTraverse - cerr << "Prefix sum after snarl reverse start: " << chainPrefixSum.back() << endl; - #endif - - } else { - dist = sd.snarlDistance( graph, &ng, - make_pair (snarlStartID, snarlStartRev), - make_pair (snarlEndID, snarlEndRev) ); - - chainPrefixSum.push_back(chainPrefixSum[chainPrefixSum.size()-2]+ - dist); - #ifdef indexTraverse - cerr << "Prefix sum before snarl end: " << chainPrefixSum.back() << endl; - #endif - handle_t tempHandle = graph->get_handle(snarlEndID, false); - chainPrefixSum.push_back(chainPrefixSum[chainPrefixSum.size()-1] + - graph->get_length(tempHandle)); - #ifdef indexTraverse - cerr << "Prefix sum after snarl end: " << chainPrefixSum.back() << endl; - #endif - } - - //Bit compress distance matrix of snarl index - util::bit_compress(sd.distances); - - }//End for loop over snarls in chain - - - //Add reverse loop distances - - - for (ChainIterator c = chain_begin(*chain); c != chainEnd; ++c) { - //Loop through the chain in reverse - const Snarl* snarl = c->first; - bool snarlRevInChain = c->second; - id_t snarlStartID = snarl->start().node_id(); - bool snarlStartRev = snarl->start().backward(); - id_t snarlEndID = snarl->end().node_id(); - bool snarlEndRev = snarl->end().backward(); - auto& sd = snarlDistances.at(make_pair(snarlStartID, snarlStartRev)); - NetGraph ng (snarl->start(), snarl->end(),sm->chains_of(snarl), graph); - //Add reverse loop distances- from start node rev to start node fd - - - - if ( chainLoopRev.size() == 0) { - int64_t firstRevDist; - if (snarlRevInChain){ - //If this is the first snarl in the chain - firstRevDist = sd.snarlDistance( graph, &ng, - make_pair(snarlEndID, snarlEndRev), - make_pair(snarlEndID, !snarlEndRev)); - } else { - firstRevDist = sd.snarlDistance( graph, &ng, - make_pair (snarlStartID, !snarlStartRev), - make_pair (snarlStartID, snarlStartRev)); - } - - - if (snarlToIndex.size() == (chainPrefixSum.size()/2) -1) { - //If the chain loops, might need distance from last snarl - ChainIterator chainEndR = 
chain_rbegin(*chain); - const Snarl* lastSnarl = chainEndR->first; - bool lastRev = chainEndR->second; - - id_t lastStartID = lastSnarl->start().node_id(); - bool lastStartRev = lastSnarl->start().backward(); - id_t lastEndID = lastSnarl->end().node_id(); - bool lastEndRev = lastSnarl->end().backward(); - auto& sdLast = snarlDistances.at(make_pair( - lastStartID, lastStartRev)); - - if (lastRev) { - firstRevDist = minPos({firstRevDist, - sdLast.snarlDistance(graph, &ng, - make_pair(lastStartID, lastStartRev), - make_pair(lastStartID, !lastStartRev)) }); - - - } else { - firstRevDist = minPos({firstRevDist, - sdLast.snarlDistance(graph, &ng, - make_pair(lastEndID, !lastEndRev), - make_pair(lastEndID, lastEndRev)) }); - - } - } - chainLoopRev.push_back(firstRevDist); - } - int64_t revLoopDist; - - if ( snarlRevInChain ) { - - revLoopDist = sd.snarlDistance(graph, &ng, - make_pair (snarlStartID, snarlStartRev), - make_pair (snarlStartID, !snarlStartRev)); - } else { - revLoopDist = sd.snarlDistance( graph, &ng, - make_pair (snarlEndID, !snarlEndRev), - make_pair (snarlEndID, snarlEndRev)); - } - - - - int64_t lastLoop = chainLoopRev.back(); - - if (lastLoop == -1) { - - chainLoopRev.push_back(revLoopDist); - - } else { - - //Push the minimum of the loop distance of the current snarl and - //the loop distance of the previous snarl + dist to and from loop - int64_t loopDistance = minPos({revLoopDist, lastLoop + - sd.snarlDistance(graph, &ng, - make_pair (snarlEndID, !snarlEndRev), - make_pair (snarlStartID, !snarlStartRev)) - + - sd.snarlDistance(graph, &ng, - make_pair (snarlStartID, snarlStartRev), - make_pair (snarlEndID, snarlEndRev))}); - chainLoopRev.push_back(loopDistance); - } - } - //Add forward loop distances - - //Check if there is an edge traversing last node in chain fd -> rev - - ChainIterator chainStartR = chain_rend(*chain); - for (ChainIterator c = chain_rbegin(*chain); c != chainStartR; ++c) { - //Loop through the chain in reverse - const Snarl* snarl = c->first; - bool snarlRevInChain = c->second; - id_t snarlStartID = snarl->start().node_id(); - bool snarlStartRev = snarl->start().backward(); - id_t snarlEndID = snarl->end().node_id(); - bool snarlEndRev = snarl->end().backward(); - auto& sd = snarlDistances.at(make_pair(snarlStartID, snarlStartRev)); - NetGraph ng (snarl->start(), snarl->end(),sm->chains_of(snarl), graph); - - - - if (c == chain_rbegin(*chain)) { - //If this is the last snarl in the chain, push loop for last node - - int64_t loopDistLast; - if (snarlRevInChain) { - - loopDistLast = sd.snarlDistance( graph, &ng, - make_pair(snarlStartID, !snarlStartRev), - make_pair(snarlStartID, snarlStartRev)); - } else { - - loopDistLast = sd.snarlDistance(graph, &ng, - make_pair(snarlEndID, snarlEndRev), - make_pair(snarlEndID, !snarlEndRev)); - } - - if (snarlToIndex.size() == (chainPrefixSum.size()/2) -1) { - //If the chain loops, might need distance from first snarl - ChainIterator chainStart = chain_begin(*chain); - const Snarl* firstSnarl = chainStart->first; - bool firstSnarlRev = chainStart->second; - - id_t firstStartID = firstSnarl->start().node_id(); - bool firstStartRev = firstSnarl->start().backward(); - id_t firstEndID = firstSnarl->end().node_id(); - bool firstEndRev = firstSnarl->end().backward(); - auto& sdFirst = snarlDistances.at(make_pair( - firstStartID, firstStartRev)); - if (firstSnarlRev) { - loopDistLast = minPos({loopDistLast, - sdFirst.snarlDistance(graph, &ng, - make_pair(firstEndID, !firstEndRev), - make_pair(firstEndID, firstEndRev)) }); - } 
else { - loopDistLast = minPos({loopDistLast, - sdFirst.snarlDistance(graph, &ng, - make_pair(firstStartID, firstStartRev), - make_pair(firstStartID, !firstStartRev)) }); - - } - - } - chainLoopFd.push_back(loopDistLast); - } - - int64_t fdLoopDist; - - - if (snarlRevInChain) { - //If the snarl is reversed in the chain - fdLoopDist = sd.snarlDistance(graph, &ng, - make_pair (snarlEndID, !snarlEndRev), - make_pair (snarlEndID, snarlEndRev)); - } else { - fdLoopDist = sd.snarlDistance(graph, &ng, - make_pair (snarlStartID, snarlStartRev), - make_pair (snarlStartID, !snarlStartRev)); - } - - int64_t lastLoop = chainLoopFd.back(); - - if (lastLoop == -1) { - - chainLoopFd.push_back(fdLoopDist); - - } else { - //push dist to end of snarl + loop dist + dist to start of snarl - - int64_t loopDistance = minPos({fdLoopDist, lastLoop + - sd.snarlDistance(graph, &ng, - make_pair(snarlEndID, !snarlEndRev), - make_pair(snarlStartID, !snarlStartRev)) + - sd.snarlDistance(graph, &ng, - make_pair(snarlStartID, snarlStartRev), - make_pair(snarlEndID, snarlEndRev))}); - chainLoopFd.push_back(loopDistance); - } - - } - reverse(chainLoopFd.begin(), chainLoopFd.end()); - - if (chainPrefixSum.size() > 4) { //If chain and not just one snarl - chainDistances.insert(make_pair(get_start_of(*chain).node_id(), - ChainIndex(snarlToIndex, chainPrefixSum, chainLoopFd, - chainLoopRev))); - } - return chainPrefixSum.back();//return length of entire chain -}; - - - -////////////////// Distance Calculations - -int64_t DistanceIndex::maxDistance(pos_t pos1, pos_t pos2) { - //Get the upper bound of the distance between two positions - -/* TODO: Make test that uses handle graph - if (!(graph->has_node(get_id(pos1)) && graph->has_node(get_id(pos2)))) { - throw runtime_error("Node not in graph"); - } -*/ - int64_t minDist = minDistance(pos1, pos2); -/* - if (minDist == -1) { - return -1; - } else if (minDist >= maxIndex.cap) { - return minDist; - } - - return maxIndex.maxDistance(pos1, pos2); -TODO */ - return minDist; - -} - -int64_t DistanceIndex::minDistance(pos_t pos1, pos_t pos2) { - const Snarl* snarl1 = snarlOf(get_id(pos1)); - const Snarl* snarl2 = snarlOf(get_id(pos2)); - return minDistance(snarl1, snarl2, pos1,pos2); -} -int64_t DistanceIndex::minDistance(const Snarl* snarl1, const Snarl* snarl2, - pos_t pos1, pos_t pos2) { - /*Find the shortest distance between two positions - pos1 and pos2 must be on nodes contained in snarl1/snarl2 */ - - int64_t shortestDistance = -1; - - if (get_id(pos1) == get_id(pos2)) { //if positions are on the same node - int64_t nodeSize = graph->get_length(graph->get_handle(get_id(pos1), - false)); - int64_t offset1; - if (is_rev(pos1)) { - offset1 = nodeSize -get_offset(pos1) - 1;//Len of node - offset - } else { - offset1 = get_offset(pos1); - } - - int64_t offset2; - if (is_rev(pos2)) { - offset2 = nodeSize - get_offset(pos2) - 1; - } else { - offset2 = get_offset(pos2); - } - -/*TODO: Might need this still - if (graph->has_edge(node_start(get_id(pos1)), node_end(get_id(pos1)))){ - //If there is an edge from start to end of node - - shortestDistance = min( abs(offset1-offset2)+1, - nodeSize - abs(offset1-offset2) + 1 ); - - } else { -*/ - - shortestDistance = abs(offset1-offset2)+1; //+1 to be consistent - -// } - } - - id_t nodeID1 = get_id(pos1); - bool nodeRev1 = false; - id_t nodeID2 = get_id(pos2); - bool nodeRev2 = false; - - - const Snarl* commonAncestor = NULL; - - -#ifdef printDistances - cerr << endl << "Start distance calculation from " << nodeID1 << "->" << - nodeID2 << endl; 
- - cerr << "Shortes distance within same node: " << shortestDistance<< endl; - - cerr << "Find common ancestor" << endl; -#endif - - - //// Find common ancestor of the two snarls - unordered_set> ancestors1;//set of all ancestor snarls of node1 - const Snarl* ancestor1 = snarl1; - -#ifdef printDistances - cerr << "Ancestors of 1: "; -#endif - - - while (ancestor1 != NULL) { -#ifdef printDistances - cerr << ancestor1->start().node_id() << " "; -#endif - ancestors1.emplace(make_pair(ancestor1->start().node_id(), - ancestor1->start().backward())); - ancestor1 = sm->parent_of(ancestor1); - } - - -#ifdef printDistances - cerr << endl << "ancestors of 2: "; -#endif - - - const Snarl* ancestor2 = snarl2; - while (ancestor2 != NULL) { - - -#ifdef printDistances - cerr << ancestor2->start().node_id() << " "; -#endif - - - if (ancestors1.count(make_pair(ancestor2->start().node_id(), - ancestor2->start().backward())) > 0) { - commonAncestor = ancestor2; - break; - } - ancestor2 = sm->parent_of(ancestor2); - } - -#ifdef printDistances - cerr << endl; - if (commonAncestor == NULL) { - cerr << "common ancestor found: NULL" << endl; - } else { - cerr << "common ancestor found: " << - commonAncestor->start().node_id()<< endl; - } - - cerr << " Snarl1: " << snarl1->start().node_id() << " Snarl2: " - << snarl2->start().node_id() << endl; -#endif - - - //Find distances from pos1 and pos2 to ends of child snarls of ancestor - pair, const Snarl*> p1 = - distToCommonAncestor(snarl1, commonAncestor, pos1); - pair temp1 = p1.first; - snarl1 = p1.second; - if (snarl1 != commonAncestor) { - nodeID1 = snarl1->start().node_id(); - nodeRev1 = snarl1->start().backward(); - } - int64_t distL1 = temp1.first; int64_t distR1 = temp1.second; - - pair, const Snarl*> p2 = - distToCommonAncestor(snarl2, commonAncestor, pos2); - pair temp3 = p2.first; - snarl2 = p2.second; - if (snarl2 != commonAncestor) { - nodeID2 = snarl2->start().node_id(); - nodeRev2 = snarl2->start().backward(); - } - int64_t distL2 = temp3.first; int64_t distR2 = temp3.second; - - - id_t endID1 = snarl1->end().node_id(); - bool endRev1 = snarl1->end().backward(); - id_t endID2 = snarl2->end().node_id(); - bool endRev2 = snarl2->end().backward(); - - //Snarl1 and snarl2 are children of common ancestor or common ancestor - -#ifdef printDistances - cerr << "Distances to snarl in common ancestor: " << distL1 << ", " << - distR1 << " " << distL2 << ", " << distR2 << endl; -#endif - int64_t chainDist = -1; - - //Find shortest distance between boundary nodes of snarls containing pos - // within the common ancestor snarl - - if (snarl1 != commonAncestor && snarl2 != commonAncestor && - sm->in_nontrivial_chain(snarl1) && sm->in_nontrivial_chain(snarl2) - && sm->chain_of(snarl1) == sm->chain_of(snarl2)) { - - //If positions are in the same chain within common ancestor - - const Chain* chain = sm->chain_of(snarl1); - id_t chainStartID = get_start_of(*chain).node_id(); - - - ChainIndex& chainDists = chainDistances.at( chainStartID); - bool snarlRev1 = sm->chain_orientation_of(snarl1); - bool snarlRev2 = sm->chain_orientation_of(snarl2); - - //Distance from left of s1 (reverse), left of s2 (forward) - int64_t d1 = chainDists.chainDistanceShort(graph, - make_pair(nodeID1, !snarlRev1), - make_pair(nodeID2, snarlRev2)); - d1 = (distL1 == -1 || distL2 == -1 || d1 == -1) ? 
-1 : - distL1 + distL2 + d1; - - //Distance from left of s1 (reverse) to right of s2 (reverse) - int64_t d2 = chainDists.chainDistanceShort(graph, - make_pair(nodeID1, !snarlRev1), - make_pair(endID2, !snarlRev2)); - if (nodeID1 == endID2) { - //If snarls share a node, chainDistanceShort returns length of - //shared node - d2 = (distL1 == -1 || distR2 == -1 || d2 == -1) ? -1 : - distL1 + distR2 - d2; - } else { - d2 = (distL1 == -1 || distR2 == -1 || d2 == -1) ? -1 : - distL1 + distR2 + d2; - } - - //Distance from right of s1 (fd) to left of s2 (fd) - int64_t d3 = chainDists.chainDistanceShort(graph, - make_pair(endID1, snarlRev1), - make_pair(nodeID2, snarlRev2)); - if (endID1 == nodeID2) { - d3 = (distR1 == -1 || distL2 == -1 || d3 == -1) ? -1 : - distR1 + distL2 - d3; - } else { - d3 = (distR1 == -1 || distL2 == -1 || d3 == -1) ? -1 : - distR1 + distL2 + d3; - } - - //Distance from right of s1 (fd) to right of s2 (rev) - int64_t d4 = chainDists.chainDistanceShort(graph, - make_pair(endID1, snarlRev1), - make_pair(endID2, !snarlRev2)); - d4 = (distR1 == -1 || distR2 == -1 || d4 == -1) ? -1 : - distR1 + distR2 + d4; - - - chainDist = minPos({d1, d2, d3, d4}); - -#ifdef printDistances - cerr << " Possible distances within chain: " << d1 << " " << d2 - << " " << d3 << " " << d4 << endl; - cerr << "Chain distance in common ancestor: " << chainDist << endl; - -#endif - - } - if (commonAncestor == NULL) { - return minPos({chainDist, shortestDistance}); - } - - //Get dist from pos1 to ends of its chain - if (snarl1 != commonAncestor && sm->in_nontrivial_chain(snarl1)) { - const Chain* chain = sm->chain_of(snarl1); - - id_t chainStartID = get_start_of(*chain).node_id(); - - ChainIndex& chainDists = chainDistances.at( chainStartID); - bool snarlRev = sm->chain_orientation_of(snarl1); - pair endDists = chainDists.distToEnds( - make_pair(nodeID1, snarlRev), distL1, distR1); - - distL1 = endDists.first; - distR1 = endDists.second; - - nodeID1 = chainStartID; - nodeRev1 = get_start_of(*chain).backward(); - } - //Get dist from pos2 to ends of its chain - if (snarl2 != commonAncestor && sm->in_nontrivial_chain(snarl2)) { - const Chain* chain = sm->chain_of(snarl2); - id_t chainStartID = get_start_of(*chain).node_id(); - - ChainIndex& chainDists = chainDistances.at( chainStartID); - bool snarlRev = sm->chain_orientation_of(snarl2); - - pair endDists = chainDists.distToEnds( - make_pair(nodeID2, snarlRev), distL2, distR2); - distL2 = endDists.first; - distR2 = endDists.second; - - nodeID2 = chainStartID; - nodeRev2 = get_start_of(*chain).backward(); - } - - - -#ifdef printDistances - cerr << "Distances to node in common ancestor: " << distL1 << ", " << distR1 - << " " << distL2 << ", " << distR2 << endl; -#endif - //Both nodes are nodes in common ancestor - - //Get distance between ends of nodes in common ancestor snarl - NetGraph ng = NetGraph(commonAncestor->start(), - commonAncestor->end(),sm->chains_of(commonAncestor), graph); - - - auto snarlDistsTmp = snarlDistances.find(make_pair( - commonAncestor->start().node_id(), - commonAncestor->start().backward())); - if (snarlDistsTmp == snarlDistances.end()) { - snarlDistsTmp = snarlDistances.find(make_pair( - commonAncestor->end().node_id(), - !commonAncestor->end().backward())); - } - SnarlIndex& snarlDists = snarlDistsTmp->second; - - - int64_t d1 = snarlDists.snarlDistanceShort( - make_pair(nodeID1, nodeRev1), make_pair(nodeID2, nodeRev2)); - d1 = (distR1 == -1 || distL2 == -1 || d1 == -1) ? 
-1 : - distR1 + distL2 + d1; - - int64_t d2 = snarlDists.snarlDistanceShort( - make_pair(nodeID1, nodeRev1), make_pair(nodeID2, !nodeRev2)); - - d2 = (distR1 == -1 || distR2 == -1 || d2 == -1) ? -1 : - distR1 + distR2 + d2; - int64_t d3 = snarlDists.snarlDistanceShort( - make_pair(nodeID1, !nodeRev1), make_pair(nodeID2, nodeRev2)); - d3 = (distL1 == -1 || distL2 == -1 || d3 == -1) ? -1 : - distL1 + distL2 + d3; - int64_t d4 = snarlDists.snarlDistanceShort( - make_pair(nodeID1, !nodeRev1), make_pair(nodeID2, !nodeRev2)); - d4 = (distL1 == -1 || distR2 == -1 || d4 == -1) ? -1 : - distL1 + distR2 + d4; - - shortestDistance = minPos({d1, d2, d3, d4, chainDist, shortestDistance}); - -#ifdef printDistances - cerr << "Distances within common ancestor: " << d1 << ", " << d2 - << ", " << d3 << ", " << d4 << endl; - cerr << "Shortest dist only up to common ancestor: " << shortestDistance - << endl; -#endif - - - //Find distances to the ends of the common ancestor snarl - pair endDists = snarlDists.distToEnds(graph, &ng, nodeID1, nodeRev1, distL1, distR1); - distL1 = endDists.first; - distR1 = endDists.second; - - endDists = snarlDists.distToEnds(graph, &ng, nodeID2, nodeRev2, distL2, distR2); - distL2 = endDists.first; - distR2 = endDists.second; - -#ifdef printDistances - cerr << "Distances to ends of common ancestor: " << distL1 << " " << distR1 - << " " << distL2 << " " << distR2 - << endl; -#endif - - const Snarl* currSnarl = commonAncestor; - const Snarl* parentSnarl = sm->parent_of(currSnarl); - id_t startID = currSnarl->start().node_id(); - id_t startRev = currSnarl->start().backward(); //pointing into snarl - id_t endID = currSnarl->end().node_id(); - id_t endRev = currSnarl->end().backward(); //pointing out - - /*shortestDistance is now the shortest distance only traversing up to the - most recent common ancestor. - - currSnarl is the common ancestor, start/end ID are a node in the - common ancestor, distances are up to a node in the common ancestor - Traverse up to root and check for path at each level - */ - - while ( currSnarl != NULL) { - - - if (sm->in_nontrivial_chain(currSnarl)) { - //Find paths between ends of current chain - - const Chain* currChain= sm->chain_of(currSnarl); - ChainIndex& chainDists = chainDistances.at( - get_start_of(*currChain).node_id()); - bool snarlRev = sm->chain_orientation_of(currSnarl); - - //Distance from start (reverse) to start (forward) - int64_t d1 = chainDists.chainDistanceShort(graph, - make_pair(startID, !snarlRev), - make_pair(startID, snarlRev)); - d1 = (distL1 == -1 || distL2 == -1 || d1 == -1) ? -1 : - distL1 + distL2 + d1; - - //Distance from start (reverse) to end (reverse) - int64_t d = chainDists.chainDistanceShort(graph, - make_pair(startID, !snarlRev), - make_pair(endID, !snarlRev)); - - d2 = (distL1 == -1 || distR2 == -1 || d == -1) ? -1 : - distL1 + distR2 + d; - d3 = (distR1 == -1 || distL2 == -1 || d == -1) ? -1 : - distR1 + distL2 + d; - - //Distance from end (fd) to end (rev) - int64_t d4 = chainDists.chainDistanceShort(graph, - make_pair(endID, snarlRev), - make_pair(endID, !snarlRev)); - d4 = (distR1 == -1 || distR2 == -1 || d4 == -1) ? 
-1 : - distR1 + distR2 + d4; - - - shortestDistance = minPos({shortestDistance, d1, d2, d3, d4}); - - //Find distances to ends of the current chain - //TODO: clean this up a bit - - - size_t startI = chainDists.snarlToIndex[startID]; - pair startFdP = make_pair(startI, snarlRev); - pair startRevP; - if (snarlRev) { - if (startI == 0) { - startRevP = make_pair(startI, !snarlRev); - } else { - startRevP = make_pair(startI-1, !snarlRev); - } - } else { - startRevP = make_pair(startI + 1, !snarlRev); - } - - int64_t dsl = chainDists.chainDistanceHelper(make_pair(0,false), startFdP); - int64_t dsr = chainDists.chainDistanceHelper(make_pair(0,false), startRevP); - int64_t der = chainDists.chainDistanceHelper(make_pair(chainDists.loopFd.size()-1,true),startRevP); - int64_t del = chainDists.chainDistanceHelper(make_pair(chainDists.loopFd.size()-1,true), startFdP); - - - if (dsl == -1) {distL1 = dsr == -1 || distR1 == -1? -1 : distR1 + dsr;} - else if (dsr ==-1) {distL1 = dsl == -1 || distL1 == -1? -1 : distL1 + dsl;} - else {distL1 = min(distL1 = dsr == -1 || distR1 == -1? -1 : distR1 + dsr, distL1 = dsl == -1 || distL1 == -1? -1 : distL1 + dsl);} - - if (del == -1) {distR1 = der == -1 || distR1 == -1? -1 : distR1 + der;} - else if (der ==-1) {distR1 = del == -1 || distL1 == -1? -1 : distL1 + del;} - else {distR1 = min(der == -1 || distR1 == -1? -1 : distR1 + der, del == -1 || distL1 == -1? -1 : distL1 + del);} - - - if (dsl == -1) {distL2 = dsr == -1 || distR2 == -1? -1 : distR2 + dsr;} - else if (dsr ==-1) {distL2 = dsl == -1 || distL2 == -1? -1 : distL2 + dsl;} - else {distL2 = min(dsr == -1 || distR2 == -1? -1 : distR2 + dsr, dsl == -1 || distL2 == -1? -1 : distL2 + dsl);} - - if (del == -1) {distR2 = der == -1 || distR2 == -1? -1 : distR2 + der;} - else if (der ==-1) {distR2 = del == -1 || distL2 == -1? -1 : distL2 + del;} - else {distR2 = min(der == -1 || distR2 == -1? -1 : distR2 + der, del == -1 || distL2 == -1? -1 : distL2 + del);} - - - - startID = get_start_of(*currChain).node_id(); - startRev = get_start_of(*currChain).backward(); - endID = get_end_of(*currChain).node_id(); - endRev = get_end_of(*currChain).backward(); - -#ifdef printDistances - cerr << "At chain " << startID << " dists to ends: " << distL1 << " " << - distR1 << " " << distL2 << " " << distR2 << endl; - cerr << "distances: " << d1 << " " << d2 << " " << d3 << " " << d4 << endl; - cerr << " Shortest distance : " - << shortestDistance << endl; -#endif - } - - if (parentSnarl == NULL) {break;} - - auto snarlDistsTmp = snarlDistances.find( - make_pair(parentSnarl->start().node_id(), - parentSnarl->start().backward())); - if (snarlDistsTmp == snarlDistances.end()) { - snarlDistsTmp = snarlDistances.find( - make_pair(parentSnarl->end().node_id(), - ! parentSnarl->end().backward())); - } - SnarlIndex& snarlDists = snarlDistsTmp->second; - - - NetGraph ng = NetGraph(parentSnarl->start(), - parentSnarl->end(),sm->chains_of(parentSnarl), graph); - - //Find the shortest distance within the snarl - - //Dist from start to start - d1 = snarlDists.snarlDistanceShort( - make_pair(startID, !startRev), make_pair(startID, startRev)); - d1 = (distL1 == -1 || distL2 == -1 || d1 == -1) ? -1 : - distL1 + distL2 + d1; - - //Dist from end to end - d2 = snarlDists.snarlDistanceShort( - make_pair(startID, startRev), make_pair(startID, !startRev)); - - d2 = (distR1 == -1 || distR2 == -1 || d2 == -1) ? 
-1 : - distR1 + distR2 + d2; - //Dist from start to end - int64_t dtemp = snarlDists.snarlDistanceShort( - make_pair(startID, startRev), make_pair(startID, startRev)); - d3 = (distL1 == -1 || distR2 == -1 || dtemp == -1) ? -1 : - distL1 + distR2 + dtemp; - d4 = (distR1 == -1 || distL2 == -1 || dtemp == -1) ? -1 : - distR1 + distL2 + dtemp; - - - shortestDistance = minPos({d1, d2, d3, d4, shortestDistance}); - - - //Find the distances to ends of the snarl - pair endDists1 = snarlDists.distToEnds(graph, &ng, startID, - startRev, distL1, distR1); - distL1= endDists1.first; distR1= endDists1.second; - - pair endDists2 = snarlDists.distToEnds(graph, &ng, startID, - startRev, distL2, distR2); - distL2= endDists2.first; distR2= endDists2.second; - - startID = parentSnarl->start().node_id(); - startRev = parentSnarl->start().backward(); - endID = parentSnarl->end().node_id(); - endRev = parentSnarl->end().backward(); - - -#ifdef printDistances - cerr << "At snarl " << startID << " dists to ends: " << distL1 << " " << - distR1 << " " << distL2 << " " << distR2 << " Shortest distance : " - << shortestDistance << endl; -#endif - currSnarl = parentSnarl; - parentSnarl = sm->parent_of(currSnarl); - } - - return shortestDistance; - -}; - - -pair, const Snarl*> DistanceIndex::distToCommonAncestor( - const Snarl* snarl, const Snarl* commonAncestor, pos_t& pos){ - - /* Find the distance from pos to either end of a snarl node in - commonAncestor. Doesn't find the distance to ends of a chain child of - common ancestor. - Return the two distances and the Snarl whose parent is the is - commonAncestor or commonAncestor if the position is on a node (not a - snarl) in commonAncestor - */ - - int64_t distL; //Dist from pos1 to boundaries of curr snarl - int64_t distR; //To start and end of snarls, not necessarily left/right - id_t nodeID = get_id(pos); - - int64_t offset = get_offset(pos); - #ifdef printDistances - cerr << "Dist to common ancestor" << "node " << get_id(pos) << " offset " << offset <<" reversed " << is_rev(pos) - << " in snarl " << snarl->start().node_id() << endl; - - #endif - if (is_rev(pos)) {//Get distance to ends of current node - distL = graph->get_length(graph->get_handle(get_id(pos), false))-offset; - distR = offset + 1; - } else { - distR = graph->get_length(graph->get_handle(get_id(pos), false))-offset; - distL = offset + 1; - } - - #ifdef printDistances - cerr << "start pos: " << get_offset(pos) << "-> start: " << distL << - ", end: " << distR << endl; - #endif - - if (commonAncestor != NULL && - snarl->start().node_id() == commonAncestor->start().node_id() && - snarl->start().backward() == commonAncestor->start().backward()) { - /*If the node is a node in commonAncestor, return the distances to - the ends of the node - */ - return make_pair(make_pair(distL, distR), snarl); - } - - id_t startID = snarl->start().node_id(); - bool startRev = snarl->start().backward(); - - auto snarlDistsTmp = snarlDistances.find(make_pair(startID, startRev)); - if (snarlDistsTmp == snarlDistances.end()) { - snarlDistsTmp = snarlDistances.find(make_pair(startID, !startRev)); - } - SnarlIndex& snarlDists = snarlDistsTmp->second; - - - NetGraph ng (snarl->start(), snarl->end(), sm->chains_of(snarl), graph); - - pair endDists = snarlDists.distToEnds(graph, &ng, - nodeID, false, distL, distR); - distL = endDists.first; - distR = endDists.second; - - #ifdef printDistances - cerr << nodeID << "->" << startID << ": " << distL << ", " << distR << endl; - #endif - - nodeID = startID; - bool nodeRev = startRev; - - 
while ((sm->parent_of(snarl) != NULL && commonAncestor == NULL) || - (commonAncestor != NULL && - !(sm->parent_of(snarl)->start().node_id() == commonAncestor->start().node_id() && - sm->parent_of(snarl)->start().backward() == commonAncestor->start().backward()))) { - //While snarl's parent doesn't equal common ancestor - - int64_t dsl; int64_t dsr; int64_t der; int64_t del; - - if (sm->in_nontrivial_chain(snarl)) { - //Get distances to ends of chain - - const Chain* chain = sm->chain_of(snarl); - id_t chainStartID = get_start_of(*chain).node_id(); - - ChainIndex& chainDists = chainDistances.at(chainStartID); - bool snarlRev = sm->chain_orientation_of(snarl); - - pair endDists = chainDists.distToEnds( - make_pair(nodeID, snarlRev), distL, distR); - - distL = endDists.first; - distR = endDists.second; - - nodeID = chainStartID; - nodeRev = get_start_of(*chain).backward(); - #ifdef printDistances - cerr << nodeID << "->" << chainStartID << ": " << distL << ", " << distR - << endl; - #endif - } - - //Get distances to ends of parent snarl - snarl = sm->parent_of(snarl); - id_t startNodeID = snarl->start().node_id(); - id_t startNodeRev = snarl->start().backward(); - - auto snarlDistsTmp = snarlDistances.find( - make_pair(startNodeID, startNodeRev)); - if (snarlDistsTmp == snarlDistances.end()) { - snarlDistsTmp = snarlDistances.find( - make_pair(startNodeID, !startNodeRev)); - } - SnarlIndex& snarlDists = snarlDistsTmp->second; - - pair endDists = snarlDists.distToEnds( - graph, &ng, nodeID, nodeRev, distL, distR); - - distL = endDists.first; - distR = endDists.second; - #ifdef printDistances - cerr << nodeID << "->" << startNodeID << ": " << distL << ", " << distR - << endl; - #endif - nodeID = startNodeID; - nodeRev = startNodeRev; - } - return make_pair(make_pair(distL, distR), snarl); -}; - -int64_t DistanceIndex::minPos (vector vals) { - /*return the minimum value in vals that is not -1, returns -1 if all - values are -1 */ - return accumulate(vals.begin(), vals.end(), -1, - [](int x, int y) {if (x==-1) {return y;} - else if (y == -1) {return x;} - else {return min(x, y);}} - ); - -}; - - -const Snarl* DistanceIndex::snarlOf (id_t nodeID) { - /*Given a node id, return the snarl that contains the node*/ - - int64_t uintSID = nodeToSnarl[nodeID - minNodeID]; - const Snarl* s = sm->into_which_snarl(uintSID>>1, (uintSID % 2 == 1)); - return s; - - -} - - -void DistanceIndex::printSelf() { - cerr << "Nodes : Snarls" << endl; - for (size_t i = 0 ; i < nodeToSnarl.size() ; i++) { - cerr << i << " " << nodeToSnarl[i] << endl; - } - cerr << "Snarls: " << endl; - for (auto snarls : snarlDistances) { - snarls.second.printSelf(); - } - cerr << endl << "Chains:" << endl; - for (auto chains : chainDistances) { - chains.second.printSelf(); - } - cerr << endl << "Maximum distances" << endl; -// maxIndex.printSelf(); -} - - -DistanceIndex::SnarlIndex::SnarlIndex(DistanceIndex* di, - unordered_set>& - allNodes, pair start, pair end) { - /*Constructor for SnarlIndex object that stores distances between - nodes in a snarl */ - distIndex = di; - - //Assign all nodes+direction in snarl to an index - size_t snarlDistances = 0; - for (pair node: allNodes) { - visitToIndex[node] = snarlDistances++; - } - - int size = visitToIndex.size(); - //Initialize all distances to 0 (representing -1) - util::assign(distances, int_vector<>(((size+1)*size)/2, 0)); - snarlStart = start; - snarlEnd = end; - -} - -DistanceIndex::SnarlIndex::SnarlIndex(DistanceIndex* di, - vector v) { - /*Constructor for SnarlIndex object given 
vector from serialization */ - - distIndex = di; - int64_t numNodes = v[0]; - int64_t start = v[1]; - snarlStart = (start < 0) ? make_pair( (id_t) abs(start), true) : - make_pair( (id_t) abs(start), false); - - int64_t end = v[2]; - snarlEnd = (end < 0) ? make_pair( (id_t) abs(end), true) : - make_pair( (id_t) abs(end), false); - - //Get visitToIndex - for (size_t i = 0; i < numNodes; i ++ ) { - - int64_t n = v[i + 3]; //Node - pair node = (n < 0) ? make_pair( (id_t) abs(n), true) : - make_pair( (id_t) abs(n), false); - visitToIndex[node] = i; - } - - //Get distance vector - distances.resize(((numNodes+1) *numNodes) / 2); - size_t j = 0; - for (size_t i = numNodes + 3; i < v.size(); i++) { - - distances[j++] = v[i]; - - } - util::bit_compress(distances); - -} - - vectorDistanceIndex::SnarlIndex::toVector() { - /*Convert contents of object to vector for serialization - Vector contains a header of four ints: #nodes, start node, end node,length - a vector representing visitToIndex [node1, node2, ...] where the nodes are ordered by the index they map to - a vector representing distances*/ - - vector v;// v (1, 0, sizeof(int64_t)); - size_t numNodes = visitToIndex.size();//number of node+directions - v.resize(numNodes + distances.size() + 3); //store map, distances, header - - v[0] = (int64_t) numNodes; - v[1] = snarlStart.second ? -(int64_t) snarlStart.first : - (int64_t) snarlStart.first; - v[2] = snarlEnd.second ? -(int64_t) snarlEnd.first : - (int64_t) snarlEnd.first; - - for (pair, size_t> p : visitToIndex) { - pair node = p.first; - int64_t index = (int64_t) p.second; - v[3 + index] = node.second ? -(int64_t) node.first : - (int64_t) node.first; - } - - - size_t i = 3 + numNodes; - for (int64_t d : distances) { - v[i++] = d; - } - return v; - -} - - -size_t DistanceIndex::SnarlIndex::index(pair start, - pair end) { - /*Get the index of dist from start to end in a snarl distance matrix - given the node ids + direction */ - size_t length = visitToIndex.size(); - size_t i1 = visitToIndex.at(start); - size_t i2 = visitToIndex.at(make_pair(end.first, !end.second)); - if (i1 > i2) { - //Reverse order of nodes - i1 = visitToIndex.at(make_pair(end.first, !end.second)); - i2 = visitToIndex.at(make_pair(start.first, start.second)); - } - - size_t k = length - i1; - return ( ((length + 1) * length ) / 2 ) - ( ((k + 1) * k ) / 2 ) + i2-i1; -} - -void DistanceIndex::SnarlIndex::insertDistance(pair start, - pair end, int64_t dist) { - //Assign distance between start and end - size_t i = index(start, end); - - distances[i] = dist + 1; -} - -int64_t DistanceIndex::SnarlIndex::snarlDistance(HandleGraph* graph, - NetGraph* ng, pair start, pair end) { - /*Distance between beginnings of two nodes n1 and n2 in snarl - */ - size_t i = index(start, end); - int64_t dist = int64_t(distances[i])-1; - return dist == -1 ? -1 : dist + nodeLength(graph, ng,start.first); -} - -int64_t DistanceIndex::SnarlIndex::snarlDistanceShort(pair start, pair end) { - /*Distance between end of node n1 and beginning of node n2 in snarl - */ - size_t i = index(start, end); - return int64_t(distances[i]) - 1; -} -int64_t DistanceIndex::SnarlIndex::nodeLength(HandleGraph* graph, - NetGraph* ng, id_t node){ - - //Get the length of the node. 
- - handle_t handle = ng->get_handle(node, false); -//TODO: Probably bad to do distIndex->sm-> -//TODO: Should be able to use is_child - //Get the snarl that the node represents, if any - const Snarl* tempSnarl = distIndex->sm->into_which_snarl( - node, false); - const Snarl* currSnarl = tempSnarl == NULL ? - distIndex->sm->into_which_snarl(node, true) : - tempSnarl; - - if (node!= snarlStart.first && node!= snarlEnd.first && - currSnarl != NULL) { - //If node represents a chain or snarl - auto chainDists = distIndex->chainDistances.find(node); - - if (chainDists != distIndex->chainDistances.end()) { - //If chain - return chainDists->second.chainLength(); - } else { - //If snarl - auto snarlDists = distIndex->snarlDistances.find(make_pair(node, false)); - auto snarlDists1 = distIndex->snarlDistances.find(make_pair(node, - true)); - if (snarlDists != distIndex->snarlDistances.end()) { - return snarlDists->second.snarlLength(graph, ng); - } else { - return snarlDists1->second.snarlLength(graph, ng); - } - } - - } else { - return graph->get_length(graph->get_handle(node, false)); - } - -} - -int64_t DistanceIndex::SnarlIndex::snarlLength(HandleGraph* graph, NetGraph* ng) { - //Return the length of the snarl- dist from beginning of start to end of end - int64_t dist = snarlDistance(graph, ng, snarlStart, snarlEnd); - - //length of snarl - if (dist == -1) { - return -1; - } else { - int64_t nodeLen = graph->get_length(graph->get_handle(snarlEnd.first, - snarlEnd.second)); - return dist + nodeLen; - } - -} - -pair DistanceIndex::SnarlIndex::distToEnds(HandleGraph* graph, - NetGraph* ng, id_t node, bool rev, int64_t distL, int64_t distR) { - /* Given the distances to either end of a node, find the distances to - either end of the snarl - Rev is true if the node is reversed in the snarl - */ - if (rev) { - int64_t temp = distL; - distL = distR; - distR = temp; - } - - pair snarlEndRev = make_pair(snarlEnd.first, !snarlEnd.second); - int64_t dsl = snarlDistance(graph, ng, snarlStart, make_pair(node, false)); - - int64_t dsr = snarlDistance( graph, ng, snarlStart, make_pair(node, true)); - - int64_t der = snarlDistance( graph, ng, snarlEndRev, make_pair(node, true)); - - int64_t del = snarlDistance(graph, ng, snarlEndRev, make_pair(node, false)); - - //If the current node is already the start or end position of the snarl - //then there may be no path between them in the index but the distance is 0 - if (node == snarlStart.first) { - if( rev == snarlStart.second) { - dsl = 0; - } else { - dsr = 0; - } - } - - if (node == snarlEnd.first) { - if (rev == !snarlEnd.second) {//node is snarl end pointing in - del = 0; - } else { - der = 0; - } - } - - dsl = dsl == -1 || distL == -1? -1 : distL + dsl; - dsr = dsr == -1 || distR == -1? -1 : distR + dsr; - der = der == -1 || distR == -1? -1 : distR + der; - del = del == -1 || distL == -1? 
-1 : distL + del; - - int64_t distStart; - if (dsl == -1) {distStart = dsr;} - else if (dsr ==-1) {distStart = dsl;} - else {distStart = min(dsr, dsl);} - - int64_t distEnd; - if (del == -1) {distEnd = der;} - else if (der ==-1) {distEnd = del;} - else {distEnd = min(der, del);} - - return make_pair(distStart, distEnd); -} - -void DistanceIndex::SnarlIndex::printSelf() { - //Print the nodes contained in SnarlDistance - cerr << endl; - - cerr << "Snarl Distances for snarl starting at " << snarlStart.first; - if (snarlStart.second) {cerr << " reverse and ending at ";} - else { cerr << " forward and ending at ";} - cerr << snarlEnd.first; - if (snarlEnd.second) {cerr << " reverse";} - else {cerr << " forward";} - - cerr << endl << "Indices:" << endl; - - for (auto n : visitToIndex) { - cerr << n.first.first << ", " << n.first.second << ": " << n.second << endl; - } - cerr << "Distances:" << endl; - cerr << " "; - for (auto n : visitToIndex) { - cerr << n.first.first; - if (n.first.second) { - cerr << "r "; - } else { - cerr << "f "; - } - } - cerr << endl; - for (auto n1 : visitToIndex) { - if (n1.first.second) { - cerr << n1.first.first << "r "; - } else { - cerr << n1.first.first << "f "; - } - for (auto n2 : visitToIndex) { - size_t length = visitToIndex.size(); - size_t i1 = visitToIndex.at(n1.first); - size_t i2 = visitToIndex.at(n2.first); - size_t k = length - i1; - - size_t i = ( ((length + 1) * length ) / 2 ) - ( ((k + 1) * k ) / 2 ) + i2-i1; -// if (i1 <= i2) { - cerr << snarlDistanceShort(n1.first, n2.first) << " "; -// } else { -// cerr << "- "; -// } - } - cerr << endl; - } - cerr << endl; -} - -//ChainDistance methods -DistanceIndex::ChainIndex::ChainIndex(hash_map s, - vector p, vector fd, vector rev) { - - snarlToIndex = move(s); - util::assign(prefixSum, int_vector<>(p.size())); - util::assign(loopFd, int_vector<>(fd.size())); - util::assign(loopRev, int_vector<>(rev.size())); - - for (size_t i = 0; i < p.size(); i++) { - prefixSum[i] = p[i] + 1; - } - - - for (size_t i = 0; i < fd.size(); i++) { - loopFd[i] = fd[i] + 1; - } - - - for (size_t i = 0; i < rev.size(); i++) { - loopRev[i] = rev[i] + 1; - } - - util::bit_compress(prefixSum); - util::bit_compress(loopFd); - util::bit_compress(loopRev); - -} - -DistanceIndex::ChainIndex::ChainIndex(vector v) { - //Constructor given vector of ints from serialization - - size_t numNodes = v.size() / 5; - - prefixSum.resize(numNodes * 2); - loopFd.resize(numNodes); - loopRev.resize(numNodes); - - for (size_t i = 0; i < numNodes; i ++ ) { - id_t node = (id_t) v[i*5]; - if (snarlToIndex.find(node) == snarlToIndex.end()) { - snarlToIndex[node] = i; - } - - prefixSum[2*i] = v[i*5 + 1]; - prefixSum[2*i+1] = v[i*5 + 2 ]; - loopFd[i] = v[i*5 + 3]; - loopRev[i] = v[i*5 + 4]; - - } -} - -vector DistanceIndex::ChainIndex::toVector() { - /*Convert contents into vector of ints for serialization - Stored as [node_id, prefix sum1, prefix sum2, loopfd,loop rev, node_id2...] 
- */ - - int64_t numNodes = snarlToIndex.size(); - bool loops = numNodes == prefixSum.size() / 2 - 1; - if (loops) { numNodes = numNodes + 1; } - - vector v;// int_vector<> v (1, 0, sizeof(int64_t)); - v.resize(numNodes * 5); - - for (int i = 0 ; i < numNodes ; i++) { - v[5*i + 1] = prefixSum[2*i]; - v[5*i + 2] = prefixSum[2*i + 1]; - v[5*i + 3] = loopFd[i]; - v[5*i + 4] = loopRev[i]; - } - - for (pair p : snarlToIndex) { - v[p.second * 5] = (int64_t) p.first; - } - if (loops) {v[(numNodes-1) * 5] = v[0];} //Last node id is first - - return v; - -} -int64_t DistanceIndex::ChainIndex::chainDistance(pair start, - pair end) { - - /* - * Return the distance between the given node sides, except node side is - * specified relative to the reading orientation of the chain that the - * nodes are in. - */ - size_t i1 = snarlToIndex.at(start.first); - size_t i2 = snarlToIndex.at(end.first); - - return chainDistanceHelper(make_pair(i1, start.second), - make_pair(i2, end.second)); -} - -int64_t DistanceIndex::ChainIndex::chainDistanceHelper( - pair start, pair end, bool recurse) { - - /*Return the distance from the index start to end. Same as chainDistance - but given the index of the node in the chain not node id */ - - size_t i1 = start.first; - size_t i2 = end.first; - bool rev1 = start.second;//TODO: Change - bool rev2 = end.second; - int64_t loopDist = -1; - - if (snarlToIndex.size() == (prefixSum.size()/2) -1 && i1 != i2 && recurse) { - //If the chain loops - - size_t size = snarlToIndex.size(); - if (i1 == 0) { - - loopDist = chainDistanceHelper(make_pair(size, rev1), - end, false); - - } else if (i2 == 0) { - - loopDist = chainDistanceHelper(start, - make_pair(size, rev2), false); - - } else if (i1 < i2 && start.second) { - //If path could pass through first node in reverse - - loopDist = - chainDistanceHelper(start, make_pair(0, rev1), false) - + chainDistanceHelper(make_pair(size, rev1), end, false); - - } else if (i1 > i2 && !rev1) { - - loopDist = - chainDistanceHelper(start, make_pair(size, rev1), false) - + chainDistanceHelper(make_pair(0, rev1), end, false); - } - - } - - if ((!rev1 && !rev2)) { - //If start and end are facing forward relative to the start of the chain - if (i1 <= i2) { - int64_t dNoRev = prefixSum[2*i2] - prefixSum[2*i1] ; - return minPos({loopDist, dNoRev}); - } else { - int64_t revID1 = loopFd[i1] - 1; - int64_t revID2 = loopRev[i2] - 1; - int64_t chainDist = prefixSum[2*i1+1] - prefixSum[2*i2+1]; - return minPos({loopDist, (revID1 == -1 || revID2 == -1) ? -1 : - chainDist + revID1 + revID2}); - } - - } else if (rev1 && rev2 ){ - //If start and end are both reversed relative to the start of the chain - if (i1 >= i2) { - int64_t dNoRev = prefixSum[2*i1+1] - prefixSum[2*i2+1] ; - return minPos({loopDist, dNoRev}); - } else { - int64_t revID1 = loopRev[i1] - 1; - int64_t revID2 = loopFd[i2] - 1; - int64_t chainDist = prefixSum[2*i2] - prefixSum[2*i1]; - return minPos({loopDist, ((revID1 == -1 || revID2 == -1) ? -1 : - chainDist+ revID1 + revID2)}); - } - } else if (!rev1 && rev2) { - //Start is forward, end is reversed - if (i1 <= i2) { - int64_t rev = loopFd[i2] - 1; - int64_t chainDist = prefixSum[2*i2]- prefixSum[2*i1]; - return minPos({loopDist, ((rev == -1) ? -1 : rev + chainDist )}); - } else { - int64_t rev = loopFd[i1] - 1; - int64_t chainDist = prefixSum[2*i1+1] - prefixSum[2*i2+1]; - return minPos({loopDist, ((rev == -1) ? 
-1 : rev + chainDist )}); - } - - } else { - //start is reverse, end is forward - if (i1 <= i2) { - int64_t rev = loopRev[i1] - 1; - int64_t chainDist = prefixSum[2*i2] - prefixSum[2*i1]; - return minPos({loopDist, (rev == -1 ? -1 : rev + chainDist )}); - - - } else { - int64_t rev = loopRev[i2] - 1; - int64_t chainDist = prefixSum[2*i1+1] - prefixSum[2*i2+1]; - return minPos({loopDist, ((rev == -1) ? -1 : rev + chainDist )}); - } - } -} - -int64_t DistanceIndex::ChainIndex::chainDistanceShort(HandleGraph* graph, - pair start, pair end) { - /*Distance between end of start node to beginning of end node in chain - or the distance from the end of the end node to the start of the start - node - If start and end are the same node, then return the length of that node - because the length is needed for the distance calculation and a negative - distance would indicate no path. - */ - int64_t d1 = chainDistance(start, end); - int64_t d2 = chainDistance(make_pair(end.first, !end.second), - make_pair(start.first, !start.second)); - if (start == end) { - //If two positions are on different snarls that share a node - return graph->get_length(graph->get_handle(start.first, start.second)); - - } - if (d1 == -1 && d2 == -1) { - return -1; - } else if (d2 == -1) { - return d1 - graph->get_length(graph->get_handle(start.first, - start.second)); - } else if (d1 == -1) { - return d2 - graph->get_length(graph->get_handle(end.first, end.second)); - } else { - return min(d1 - graph->get_length(graph->get_handle(start.first, - start.second)), - d2 - graph->get_length(graph->get_handle(end.first, end.second))); - } -} - -int64_t DistanceIndex::ChainIndex::chainLength() { - - //Get the length of a chain including length of last node - return prefixSum[prefixSum.size()-1] - 1; -} - -pair DistanceIndex::ChainIndex::distToEnds( - pair start, int64_t distL, int64_t distR) { - /*Given the distance to either end of snarl starting at start, find the - distance to either end of the chain*/ - - size_t startI = snarlToIndex[start.first]; - pair startFd = make_pair(startI, start.second); - pair startRev; - if (start.second) { - - if (startI == 0) { - startRev = make_pair(startI, !start.second); - } else { - startRev = make_pair(startI-1, !start.second); - } - - } else { - - startRev = make_pair(startI+1, !start.second); - - } - - int64_t dsl = chainDistanceHelper(make_pair(0,false), startFd); - int64_t dsr = chainDistanceHelper(make_pair(0,false), startRev); - int64_t der = chainDistanceHelper(make_pair(loopFd.size()-1,true),startRev); - int64_t del = chainDistanceHelper(make_pair(loopFd.size()-1,true), startFd); - - dsl = dsl == -1 || distL == -1? -1 : distL + dsl; - dsr = dsr == -1 || distR == -1? -1 : distR + dsr; - der = der == -1 || distR == -1? -1 : distR + der; - del = del == -1 || distL == -1? 
-1 : distL + del; - - int64_t distStart; - if (dsl == -1) {distStart = dsr;} - else if (dsr ==-1) {distStart = dsl;} - else {distStart = min(dsr, dsl);} - - int64_t distEnd; - if (del == -1) {distEnd = der;} - else if (der ==-1) {distEnd = del;} - else {distEnd = min(der, del);} - return make_pair(distStart, distEnd); -} - -void DistanceIndex::ChainIndex::printSelf() { - //Print the contenst of ChainDistance - - cerr << "ChainDistance Indices:" << endl; - - for (auto n : snarlToIndex) { - cerr << n.first << ": " << n.second << endl; - } - cerr << "Distances:" << endl; - cerr << endl; - for (auto n : prefixSum) { - cerr << n << " "; - } - cerr << endl; - cerr << "Loop Forward:" << endl; - cerr << endl; - for (auto n : loopFd) { - cerr << n << " "; - } - cerr << endl; - cerr << "Loop Reverse:" << endl; - cerr << endl; - for (auto n : loopRev) { - cerr << n << " "; - } - cerr << endl; -} - - - - -/////////////////////// MAXIMUM DISTANCE /////////////////////////////////// - - -DistanceIndex::MaxDistanceIndex::MaxDistanceIndex() { -//TODO: Need this to compile for some reason? -} -DistanceIndex::MaxDistanceIndex::MaxDistanceIndex(DistanceIndex* di, const vector chain, uint64_t c) { - - //Calculate maximum distance index - - //TODO: Try different bit vectors -/* -TODO: Probably don't need this but it might be faster - bit_vector inCycle(graph->max_node_id() - minNodeID+1); - //Flag each node that is in a cycle of length < cap - for (const Snarl* snarl : *chain) { - flagCycles(snarl, inCycle, cap); - } -*/ -/* -cerr << "Nodes in cycles: " << endl; -for (auto x : inCycle) { - if (x.second) { - cerr << x.first << " " ; - } -} -cerr << endl; -*/ - - /////// DFS to get connected componpents that are in cycles - distIndex = di; - cap = c; - int64_t maxNodeID = distIndex->maxNodeID; - int64_t minNodeID = distIndex->minNodeID; - int_vector<> n(maxNodeID- minNodeID + 1, 0); - nodeToComponent = n; -//TODO: All distances in these will be +1 - int_vector<> max(maxNodeID - minNodeID + 1, 0); - int_vector<> minFd(maxNodeID - minNodeID + 1, 0); - int_vector<> minRev(maxNodeID - minNodeID + 1, 0); - - numCycles= findComponents(nodeToComponent, max, minFd, minRev, 0, true); - - //Find connected components of nodes not in cycles - numComponents = findComponents(nodeToComponent, max, minFd, minRev, - numCycles, false); - - maxDistances = max; - - minDistances.resize(maxNodeID - minNodeID + 1); - for (size_t i = 0; i < minDistances.size(); i++) { - uint64_t d1 = minFd[i]; - uint64_t d2 = minRev[i]; - uint64_t d; - if (d1 == 0) { - d = d2; - } else if (d2 == 0) { - d = d1; - } else { - d = min(d1, d2); - } - minDistances[i] = d; - } - -} - -int64_t DistanceIndex::MaxDistanceIndex::maxDistance(pos_t pos1, pos_t pos2) { - //Upper bound of distance between two positions -//TODO: Might incorporate directions- node could be very big - id_t node1 = get_id(pos1); - HandleGraph* graph = distIndex->graph; - int64_t len1 = max(get_offset(pos1), - graph->get_length(graph->get_handle(node1, false)) - - get_offset(pos1)) + 1; - - id_t node2 = get_id(pos2); - int64_t len2 = max(get_offset(pos2), - graph->get_length(graph->get_handle(node2, false)) - - get_offset(pos2)) + 1; - - int64_t minNodeID = distIndex->minNodeID; - - //Return the max distance between nodes plus maximum length of nodes - //TODO: Add just one node len or both? 
- uint64_t comp1 = nodeToComponent[node1-minNodeID]; - uint64_t comp2 = nodeToComponent[node2-minNodeID]; - if (comp1 != comp2 || comp1 <= numCycles) { - //If they are in separate components or both in a cyclic component - return cap; - - } - - uint64_t max1 = maxDistances[node1-minNodeID]; - uint64_t max2 = maxDistances[node2-minNodeID]; - uint64_t min1 = minDistances[node1-minNodeID]; - uint64_t min2 = minDistances[node2-minNodeID]; - uint64_t d1 = max1 > min2 ? max1-min2 : 0; - uint64_t d2 = max2 > min1 ? max2-min1 : 0; - -cerr << "MAX DIST BETWEEN: " << node1 << " " << node2 << endl; -cerr << comp1 << " " << maxDistances[node1-minNodeID] << " " << -minDistances[node1-minNodeID] << " " << len1 << " AND " << comp2 << " " << -maxDistances[node2-minNodeID] << " " << minDistances[node2-minNodeID] << " " << len2 << endl; -cerr << " IS: " << len1 + len2 + max(d1, d2) << endl; - return len1 + len2 + max(d1, d2); -} - -uint64_t DistanceIndex::MaxDistanceIndex::findComponents( - int_vector<>& nodeToComponent, int_vector<>& maxDists, - int_vector<>& minDistsFd, int_vector<>& minDistsRev, - uint64_t currComponent, bool onlyCycles ){ - - /*If onlyCycles, assign all nodes to a component of connected cycles - if in a cycle, 0 otherwise - If not onlyCycles, assign all unassigned nodes to a connected component - - Returns the maximum component number, the number of connected components - */ - -cerr << "NEW COMPONENT " << endl; - int64_t minNodeID = distIndex->minNodeID; - HandleGraph* graph = distIndex->graph; - int64_t maxNodeID = distIndex->maxNodeID; - hash_set> seen; - for (id_t i = minNodeID ; i <= maxNodeID ; i ++ ) { - if (/*TODO: Maybe still need this check graph->has_node(i) && */nodeToComponent[i - minNodeID] == 0) { - - bool loops = distIndex->loopDistance(make_pair(i, false), - make_pair(i, false)) > -1; - if (onlyCycles == loops) { - //If this node hasn't been seen before and if only counting cycles, - - currComponent++; - //Next nodes to look at; going forward at the end, remove from end - list, bool>> nextNodes; - - //Arbitrarily assign direction for DAG - nextNodes.push_back(make_pair(make_pair(i, true), true)); - nextNodes.push_back(make_pair(make_pair(i, false), false)); - unordered_set> sinkNodes;//Sink nodes of DAG - pair currNode; - - - while (nextNodes.size() > 0) { - //For each reachable node - - //Traverse going forward first - pair, bool> next = nextNodes.back(); - nextNodes.pop_back(); - - currNode = next.first; - bool forward = next.second; - - if (seen.count(currNode) == 0) { - //That hasn't been seen before - - seen.insert(currNode); - bool added = false; - - auto addNextNodes = [&](const handle_t& h)-> bool { - //Helper fn to get adjacent nodes - - pair node = make_pair( - graph->get_id(h), graph->get_is_reverse(h)); - int64_t edgeLoop = distIndex->loopDistance( - currNode, node) > -1; - int64_t nodeLoop = distIndex->loopDistance( - node, node) > -1; - - if ( - ((onlyCycles && edgeLoop && nodeLoop) || - (!onlyCycles && !edgeLoop && !nodeLoop)) ){ - //Add nodes whose edges are in loops - - added = true; - if (seen.count(node) == 0 ) { - if (forward) { - nextNodes.push_back(make_pair(node, - forward)); - } else { - nextNodes.push_front(make_pair(node, - forward)); - } - - if (seen.count(make_pair(node.first, - !node.second)) - == 0) { - if (forward) { - nextNodes.push_front(make_pair( - make_pair( node.first, !node.second), - !forward)); - } else { - nextNodes.push_back(make_pair( - make_pair( node.first, !node.second), - !forward)); - } - } - } - } - return true; - }; - - - 
nodeToComponent[currNode.first-minNodeID] = currComponent; - if (onlyCycles) { -//TODO: Doing this twice - //Save the loop distance as maxDist of cycle - int64_t nodeLoop = - distIndex->loopDistance(currNode, currNode); - maxDists[currNode.first-minNodeID] = nodeLoop; - } - - handle_t handle =graph->get_handle(currNode.first, - currNode.second); - - //Add nodes that are connected by edges in loops - graph->follow_edges(handle, false, addNextNodes); - - if (!added && forward) { - //If there were no outgoing edges and this was a sink - sinkNodes.insert(currNode); - } - - } - } - //Found all nodes in current component - if (!onlyCycles) { - calculateMaxDistances(sinkNodes, nodeToComponent, maxDists, - minDistsFd, minDistsRev); - } - } - } - } - return currComponent; -} - - -void DistanceIndex::MaxDistanceIndex::calculateMaxDistances( - unordered_set>& sinkNodes, - int_vector<>& nodeToComponent, - int_vector<>& maxDists, int_vector<>& minDistsFd, - int_vector<>& minDistsRev ){ - - /*Given all nodes in a connected component and a set of source/sink nodes - (pointing out),get the max and min distances from each node to a sink node - */ - //TODO: This is arbitrarily breaking long loops - //TODO: I think not using directions in nodes will still produce an upper bound, just not as tight - -cerr << " NEW MAX" << endl; - HandleGraph* graph = distIndex->graph; - int64_t minNodeID = distIndex->minNodeID; - - for (pair sink : sinkNodes) { - //Sink nodes are pointing out of the DAG - - pair currNode = make_pair(sink.first, !sink.second); - list, pair>> nextNodes; - nextNodes.push_back(make_pair(currNode, make_pair(1, 1))); - uint64_t len = graph->get_length(graph->get_handle(sink.first, - sink.second)); - nextNodes.push_front(make_pair(sink, make_pair(0, len+1))); - uint64_t maxMin = 0; // Largest min distance - uint64_t currComp = nodeToComponent[currNode.first-minNodeID]; - - hash_set> exitNodes;//Nodes traversed to exit component - hash_set> seenLoops;//Nodes in loops that have been seen- only traverse each loop at most once and only until the maximum distance found does not exceed cap - - while (nextNodes.size() != 0) { - //Traverse graph from one sink node - - pair, pair> next = - nextNodes.back(); - nextNodes.pop_back(); - currNode = next.first; - - - uint64_t minDist = next.second.first; - uint64_t maxDist = next.second.second; - bool loopSeen = false; - if (nodeToComponent[currNode.first-minNodeID] == currComp) { - //If in the same component - update distances - - uint64_t oldMin; - if (currNode.second) { - oldMin = minDistsFd[currNode.first-minNodeID]; - minDist = oldMin == 0 ? minDist : min(oldMin, minDist); - if (minDist != 0) { - minDistsFd[currNode.first-minNodeID] = minDist; - } - } else { - oldMin = minDistsRev[currNode.first-minNodeID]; - minDist = oldMin == 0 ? minDist : min(oldMin, minDist); - if (minDist != 0) { - minDistsRev[currNode.first-minNodeID] = minDist; - } - } - - if (minDist == 0) { - - - int64_t nodeLen = graph->get_length(graph->get_handle( - currNode.first, currNode.second)); - uint64_t minDist = currNode.second ? - minDistsFd[currNode.first-minNodeID] : - minDistsRev[currNode.first-minNodeID]; - uint64_t oldMax = maxDists[currNode.first-minNodeID]; - maxDist += 2*(minDist + nodeLen); - maxDist = oldMax == 0 ? maxDist : max(oldMax, maxDist); - - maxDists[currNode.first-minNodeID] = maxDist; - } else { - uint64_t oldMax = maxDists[currNode.first-minNodeID]; - maxDist = oldMax == 0 ? 
maxDist : max(oldMax, maxDist); - maxDists[currNode.first-minNodeID] = maxDist; - } - - maxMin = max(maxMin, minDist); - - } else if (nodeToComponent[currNode.first-minNodeID] <= numCycles) { - //If current node is in a loop - - if (seenLoops.count(currNode) > 0 ) { - loopSeen = true; //True if already been through this loop - } else { - seenLoops.insert(currNode); - } - - } - - int64_t nodeLen = graph->get_length(graph->get_handle( - currNode.first, currNode.second)); - - auto addNextNodes = [&](const handle_t& h)-> bool { - //Helper fn to get adjacent nodes - - pair node = make_pair( - graph->get_id(h), graph->get_is_reverse(h)); - - uint64_t nodeComp = nodeToComponent[node.first - minNodeID]; - if ( nodeComp == currComp ) { -//TODO: This might loop - //If next node is in the same component and either haven't - //visited this node before or distance is less than cap - if ( nodeToComponent[currNode.first - minNodeID] != currComp - && exitNodes.count(make_pair(node.first, !node.second)) - > 0){ - //If this node is re-entering component from the same node exited - nextNodes.push_back(make_pair(node, - make_pair(0, maxDist+nodeLen))); - } else { - - - nextNodes.push_back(make_pair(node, - make_pair(minDist+nodeLen, maxDist+nodeLen))); - } - - } else if ( maxDist < maxMin + cap && !loopSeen) { - //If the max distance that could be found is less than cap - // and the node didn't loop - - if ( nodeToComponent[currNode.first - minNodeID] == currComp ){ - //If leaving component for the first time - exitNodes.insert(currNode); - } - nextNodes.push_front(make_pair(node, - make_pair(minDist+nodeLen, maxDist+nodeLen))); - - } - return true; - }; - - - handle_t handle = graph->get_handle(currNode.first, - currNode.second); - - graph->follow_edges(handle, false, addNextNodes); - - - } - } - - -} - -void DistanceIndex::MaxDistanceIndex::printSelf() { - - cerr << "Number of cyclic components: " << numCycles << endl - << "Number of components: " << numComponents << endl - << "Components: " << endl; - for (auto x : nodeToComponent) {cerr << x << " ";} - cerr << endl - << "Min distances: " << endl; - for (auto x : minDistances) {cerr << x << " " ;} - cerr << endl - << "Max distances: " << endl; - for (auto x : maxDistances) {cerr << x << " " ;} - cerr << endl << endl; -} -void DistanceIndex::flagCycles(const Snarl* snarl, - bit_vector& inCycle, uint64_t cap){ - -//TODO: May not actually use this - //Flag each node with true if it is in a cycle shorter than cap - - auto flagNode = [&](const handle_t& h)-> bool { - - - //Get the snarl that the node represents, if any - const Snarl* currSnarl = sm->into_which_snarl( - graph->get_id(h), graph->get_is_reverse(h)); - - - - if (currSnarl != NULL && - currSnarl->start().node_id() != snarl->start().node_id() && - currSnarl->start().node_id() != snarl->end().node_id() && - snarl->start().node_id() != currSnarl->end().node_id() ) { - //If the node is a snarl/chain - - if (sm->in_nontrivial_chain(currSnarl)) { - //The node is a chain - const Chain* currChain= sm->chain_of(currSnarl); - for (auto s : *currChain) { - flagCycles(s.first, inCycle, cap); - } - } else { - //The node is a snarl - flagCycles(currSnarl, inCycle, cap); - } - } else { - //If the node is really just a node - - - pair node (graph->get_id(h), false); - - int64_t loopDist = loopDistance(snarl, snarl, node, node); - if (loopDist != -1 && loopDist <= cap) { - //If the min cycle dist is less thatn cap - inCycle[node.first - minNodeID] = true; - } else { - inCycle[node.first - minNodeID] = false; - } - } 
- return true; - }; - - NetGraph ng = NetGraph(snarl->start(), - snarl->end(),sm->chains_of(snarl), graph); - ng.for_each_handle(flagNode); - -} - -int64_t DistanceIndex::loopDistance( - pair node1, pair node2) { - const Snarl* snarl1 = snarlOf(node1.first); - const Snarl* snarl2 = snarlOf(node2.first); - return loopDistance(snarl1, snarl2, node1, node2); -} - -int64_t DistanceIndex::loopDistance(const Snarl* snarl1,const Snarl* snarl2, - pair node1, pair node2) { - /*Find the minimum distance to loop through the given edge or, if node1 and - node2 are the same, to loop through that node */ - -/*TODO: make a test using handle graphs - if (node1 != node2 && !graph->has_edge( - NodeSide(node1.first, !node1.second), - NodeSide(node2.first, node2.second))){ - //Edge must exist - throw runtime_error("Edge does not exist"); - - } -*/ - -#ifdef indexTraverse -cerr << endl << " NEW LOOP CALCULATION: " << node1.first << " TO " << node2.first << endl; -#endif - - int64_t minLoop = -1; - - int64_t distSRev = 0; //Dist to start of snarl traversing node backward - int64_t distSFd = -1; // not including the length of the node - int64_t distERev = -1; - int64_t distEFd = 0; - int64_t distERev1 = -1; - int64_t distSFd2 = -1; - - - const Snarl* snarl; - - - //Length of current node passing through original node - int64_t nodeLen; - if (node1 == node2) { //Same node - look for loop through the node - - nodeLen = graph->get_length(graph->get_handle(node1.first, false)); - - } else { //Look for loop that uses given edge - - nodeLen = graph->get_length(graph->get_handle(node1.first, false)) + - graph->get_length(graph->get_handle(node2.first, false)); - - } - - const Snarl* snarl1Rev = node1.first == snarl1->start().node_id() ? - sm->into_which_snarl(node1.first, !snarl1->start().backward()) : - sm->into_which_snarl(node1.first, snarl1->end().backward()); - - const Snarl* snarl2Rev = node2.first == snarl2->start().node_id() ? 
- sm->into_which_snarl(node2.first, !snarl2->start().backward()) : - sm->into_which_snarl(node2.first, snarl2->end().backward()); - - if (snarl1 == snarl2) { - - snarl = snarl1; - - } else if (sm->chain_of(snarl1) == sm->chain_of(snarl2)) { - //If the two snarls are on the same chain - - const Chain* chain = sm->chain_of(snarl1); - if ((node1.first == get_start_of(*chain).node_id() && - node2.first == get_end_of(*chain).node_id()) - || - (node2.first == get_start_of(*chain).node_id() && - node1.first == get_end_of(*chain).node_id())) { - /*If the nodes are on opposite sides of the chain, then the edge is - part of a loop through the whole chain */ - auto chainDists = chainDistances.at(get_start_of(*chain).node_id()); - - return chainDists.chainLength(); - } - - //At least one node must be the boundary node of a snarl - if (node1.first == snarl1->start().node_id() || - node1.first == snarl1->end().node_id()) { - - snarl = sm->into_which_snarl(node1.first, node1.second); - - } else if (node2.first == snarl2->start().node_id() || - node2.first == snarl2->end().node_id()){ - - snarl = sm->into_which_snarl(node2.first, !node2.second); - - } - - - } else if (sm->parent_of(snarl1) == sm->parent_of(snarl2)) { - //Snarls share a common parent snarl but aren't on the same chain - - int64_t length1 = 0; //Size of the snarl or chain of node1 - if (sm->in_nontrivial_chain(snarl1)) { - //If chain, node is already a boundary node of snarl in chain - - const Chain* chain = sm->chain_of(snarl1); - id_t chainStartID = get_start_of(*chain).node_id(); - - ChainIndex& chainDists = chainDistances.at(chainStartID); - - bool chainRev = sm->chain_orientation_of(snarl1); - - pair bound; - if (node1.first == chainStartID) { - bound = make_pair(get_end_of(*chain).node_id(), true); - } else { - bound = make_pair(chainStartID, false); - } - - bool rev1 = chainRev != snarl1->start().backward() ? 
!node1.second : node1.second; - distSRev = chainDists.chainDistance(bound, make_pair(node1.first, - rev1)); - length1 = chainDists.chainLength(); - - distERev = chainDists.chainDistance( - make_pair(node1.first, !rev1), - make_pair(node1.first, rev1)); - distERev1 = distERev; - node1 = make_pair(chainStartID, node1.second); - -#ifdef indexTraverse -cerr << "DISTANCES TO ENDS OF CHAIN OF NODE 1: " << distSRev << " " << distSFd - << " " << distERev << " " << distEFd << endl; -#endif - - } else { - //Node 1 is in a snarl - SnarlIndex& snarlDists = snarlDistances.at(make_pair( - snarl1->start().node_id(),snarl1->start().backward())); - - NetGraph ng (snarl1->start(), snarl1->end(), - sm->chains_of(snarl1), graph); - - pair bound; - if (node1.first == snarl1->start().node_id()) { - bound = make_pair(snarl1->end().node_id(), - !snarl1->end().backward()); - } else { - bound = make_pair(snarl1->start().node_id(), - snarl1->start().backward()); - } - distSRev = snarlDists.snarlDistance(graph, &ng, bound, node1); - length1 = snarlDists.snarlLength(graph, &ng); - - distERev = snarlDists.snarlDistance(graph, &ng, - make_pair(node1.first, !node1.second), node1); - distERev1 = distERev; - - node1 = make_pair(snarl1->start().node_id(), node1.second); - -#ifdef indexTraverse -cerr << "DISTANCES TO ENDS OF SNARL OF NODE 1: " << distSRev << " " << distSFd - << " " << distERev << " " << distEFd << endl; -#endif - - - } - - - int64_t length2 = 0; //Size of the snarl or chain of node1 - if (sm->in_nontrivial_chain(snarl2)) { - //If chain, node is already a boundary node of snarl in chain - - const Chain* chain = sm->chain_of(snarl2); - id_t chainStartID = get_start_of(*chain).node_id(); - - ChainIndex& chainDists = chainDistances.at(chainStartID); - - bool chainRev = sm->chain_orientation_of(snarl2); - - pair bound; - if (node2.first == chainStartID) { - bound = make_pair(get_end_of(*chain).node_id(), true); - } else { - bound = make_pair(chainStartID, false); - } - - bool rev2 = chainRev != snarl2->start().backward()? !node2.second : node2.second; - - distEFd = chainDists.chainDistance(bound, - make_pair(node2.first, !rev2)); - length2 = chainDists.chainLength(); - - - distSFd = chainDists.chainDistance( - make_pair(node2.first, rev2), - make_pair(node2.first, !rev2)); - distSFd2 = distSFd; - - node2 = make_pair(chainStartID, node2.second); - -#ifdef indexTraverse -cerr << "DISTANCES TO ENDS OF CHAIN OF NODE 2: " << distSRev << " " << distSFd - << " " << distERev << " " << distEFd << endl; -#endif - - - } else { - //Node 2 is in a snarl - SnarlIndex& snarlDists = snarlDistances.at(make_pair( - snarl2->start().node_id(),snarl2->start().backward())); - - NetGraph ng (snarl2->start(), snarl2->end(), - sm->chains_of(snarl2), graph); - - pair bound; - if (node2.first == snarl2->start().node_id()) { - bound = make_pair(snarl2->end().node_id(), - !snarl2->end().backward()); - } else { - bound = make_pair(snarl2->start().node_id(), - snarl2->start().backward()); - } - distEFd = snarlDists.snarlDistance(graph, &ng, bound, - make_pair(node2.first, !node2.second)); - length2 = snarlDists.snarlLength(graph, &ng); - - distSFd = snarlDists.snarlDistance(graph, &ng, node2, - make_pair(node2.first, !node2.second)); - distSFd2 = distSFd; - node2 = make_pair(snarl2->start().node_id(), node2.second); - -#ifdef indexTraverse -cerr << "DISTANCES TO ENDS OF SNARL OF NODE 2: " << distSRev << " " << distSFd - << " " << distERev << " " << distEFd << endl; -#endif - - - } - - distSFd = distSFd == -1 ? 
-1 : distSFd + length1; - distERev = distERev == -1 ? -1 : distERev + length2; - - - snarl = sm->parent_of(snarl1); - -#ifdef indexTraverse -cerr << "DISTANCES: " << distSRev << " " << distSFd << " " << distERev << " " << distEFd << endl; -#endif - - } else { - //One snarl must be the parent of the other - - if (snarl1Rev != NULL && sm->parent_of(snarl2) == snarl1Rev) { - //Snarl1 is in a chain, adjacent snarl contains snarl 2 - snarl1 = snarl1Rev; - } else if (snarl2Rev != NULL && sm->parent_of(snarl1) == snarl2Rev) { - snarl2 = snarl2Rev; - } - if (sm->parent_of(snarl1) == snarl2) { - - //Snarl1 is start or end of child snarl in snarl2 - //Switch the orientation of the edge and continue to next condition - - pair node1Rev = make_pair(node1.first, !node1.second); - pair node2Rev = make_pair(node2.first, !node2.second); - node1 = node2Rev; - node2 = node1Rev; - const Snarl* temp = snarl1; - snarl1 = snarl2; - snarl2 = temp; - - - - - } - if (sm->parent_of(snarl2) == snarl1) { - //Snarl2 is start or end of child snarl in snarl1 - if (sm->in_nontrivial_chain(snarl2)) { - //If chain, node is already a boundary node of snarl in chain - - - const Chain* chain = sm->chain_of(snarl2); - - - id_t chainStartID = get_start_of(*chain).node_id(); - - ChainIndex& chainDists = chainDistances.at(chainStartID); - - bool chainRev = sm->chain_orientation_of(snarl2); - - pair chainStart (chainStartID, chainRev); - pair chainEnd (get_end_of(*chain).node_id(), !chainRev); - - pair node2Rev = make_pair(node2.first, !chainRev); - if (chainStartID != node2.first) { - //Assume start of chain is the side node was on - chainEnd = make_pair(chainStartID, chainRev); - chainStart = make_pair(node2.first, !chainRev); - node2Rev = make_pair(node2.first, chainRev); - } - - - distSFd = chainDists.chainDistance(chainStart, node2Rev); - distERev = chainDists.chainDistance(node2Rev, chainEnd); - distEFd = chainDists.chainDistance(chainEnd, node2Rev); - -#ifdef indexTraverse -cerr << "DISTANCES IN CHILD CHAIN: " << distSRev << " " << distSFd << " " << distERev << " " << distEFd << endl; -#endif - node2 = make_pair(chainStartID, node2.second); - - } else { - //If only snarl - - SnarlIndex& snarlDists = snarlDistances.at(make_pair( - snarl2->start().node_id(),snarl2->start().backward())); - - pair snarlStart = snarlDists.snarlStart; - pair snarlEnd = snarlDists.snarlEnd; - snarlEnd = make_pair(snarlEnd.first, !snarlEnd.second); - - NetGraph ng (snarl2->start(), snarl2->end(), - sm->chains_of(snarl2), graph); - - if (node2.first != snarlStart.first) { - auto temp = snarlStart; - snarlStart = snarlEnd; - snarlEnd = temp; - } - - pair node2Rev = make_pair(node2.first, - !node2.second); - - distSFd = snarlDists.snarlDistance(graph, &ng, - snarlStart, node2Rev); - - distEFd = snarlDists.snarlDistance(graph, &ng, snarlEnd, - node2Rev); - - -/*TODO - if (node2.first != snarl2->start().node_id()) { - node2 = make_pair( snarl2->start().node_id(), - node2.second ); - } - if (snarl2->start().node_id() == snarl2->end().node_id()) { - node2 = make_pair(node2.first, snarl2->end().backward()); - } -*/ -node2 = node2.first == snarl2->start().node_id() ? 
- make_pair(snarl2->start().node_id(), snarl2->start().backward()) - : make_pair(snarl2->start().node_id(), !snarl2->start().backward()); - -#ifdef indexTraverse -cerr << "DISTANCES IN CHILD SNARL " << snarl2->start().node_id() << " : " << distSRev << " " << distSFd << " " << distERev << " " << distEFd << endl; -#endif - } - - snarl = snarl1; - - auto snarlDists = snarlDistances.at(make_pair( - snarl->start().node_id(),snarl->start().backward())); - - NetGraph ng = NetGraph(snarl->start(), - snarl->end(),sm->chains_of(snarl), graph); - - pair node1Rev = make_pair(node1.first, !node1.second); - pair node2Rev = make_pair(node2.first, !node2.second); - //Update snarl, node, and node length - - int64_t distSL = snarlDists.snarlDistanceShort(node2Rev, node1Rev); - int64_t distEL = snarlDists.snarlDistanceShort(node2Rev, node1); - int64_t distSR = snarlDists.snarlDistanceShort(node2, node1Rev); - int64_t distER = snarlDists.snarlDistanceShort(node2, node1); - - int64_t distSFdTemp = minPos({ -//TODO -// (distEFd == -1 || distSR == -1) ? -1 : distEFd + distSR, - (distSFd == -1 || distSL == -1) ? -1 : distSFd + distSL}); - - int64_t distERevTemp = minPos({ -// (distSRev == -1 || distER == -1) ? -1 : distSRev + distER, - (distERev == -1 || distEL == -1) ? -1 : distERev + distEL}); - - distSFd2 = distSFdTemp; - distSRev = 0; - - distSFd = distSFdTemp == -1 ? -1 : distSFdTemp + - snarlDists.nodeLength(graph, &ng, node1.first); - distERev = distERevTemp == -1 ? -1 : distERevTemp + - snarlDists.nodeLength(graph, &ng, node2.first); - -#ifdef indexTraverse -cerr << "DISTANCES: " << distSRev << " " << distSFd << " " << distERev << " " << distEFd << endl; -#endif - } - - - } - - - while (snarl != NULL) { - //Check each ancestor snarl for a loop - -#ifdef indexTraverse -cerr << "SNARL: " << snarl->start() << endl; -#endif - NetGraph ng = NetGraph(snarl->start(), - snarl->end(),sm->chains_of(snarl), graph); - - SnarlIndex& snarlDists = snarlDistances.at(make_pair( - snarl->start().node_id(),snarl->start().backward())); - - pair node1Rev = make_pair(node1.first, !node1.second); - pair node2Rev = make_pair(node2.first, !node2.second); - - int64_t loop = minPos({ - snarlDists.snarlDistanceShort(node2, node1), - snarlDists.snarlDistanceShort(node1Rev, node2Rev)}); - - int64_t loopL = snarlDists.snarlDistanceShort(node1Rev, node2); - int64_t loopR = snarlDists.snarlDistanceShort(node2, node1Rev); -#ifdef indexTraverse -cerr << "SNARL LOOPS: " << loop << " " << loopL << " " << loopR << endl; - #endif - int64_t loop1 = loop == -1 || distSRev == -1 || distEFd == -1 ? -1 : - loop + distSRev + distEFd + nodeLen; - int64_t loop2 = loop == -1 || distSFd == -1 || distERev == -1 ? -1 : - loop + distSFd + distERev + nodeLen; - int64_t loop3 = -1; - if (node1 == node2) { - - loopL = loopL == -1 || distSFd == -1 || distSRev == -1 ? -1 : - loopL + distSFd + distSRev + nodeLen; - loopR = loopR == -1 || distEFd == -1 || distERev == -1 ? -1 : - loopR + distEFd + distERev + nodeLen; - } else { - - loopL = loopL == -1 || distSFd2 == -1 || distSRev == -1 ? -1 : - loopL + distSFd2 + distSRev + nodeLen; - loopR = loopR == -1 || distEFd == -1 || distERev1 == -1 ? -1 : - loopR + distEFd + distERev1 + nodeLen; - loop3 = distSFd2 == -1 || distERev1 == -1 ? 
-1 : distSFd2 + distERev1 + nodeLen; - } - - -#ifdef indexTraverse -cerr << " LOOP DISTANCES: " << loop3 << " " << loop1 << " " << loop2 << " " << loopL << " " << loopR << endl; -#endif - minLoop = minPos({minLoop, loop1, loop2, loop3, loopL, loopR}); - - - //Update snarl, node, and node length - int64_t distSL = (node1 == snarlDists.snarlStart) ? 0 : - snarlDists.snarlDistance(graph, &ng, make_pair( - snarl->start().node_id(), snarl->start().backward()), node1); - int64_t distSR = (node2.first == snarlDists.snarlStart.first && - node2.second != snarlDists.snarlStart.second) ? 0 : - snarlDists.snarlDistance(graph, &ng, make_pair( - snarl->start().node_id(), snarl->start().backward()), node2Rev); - int64_t distEL = (node1.first == snarlDists.snarlEnd.first && - node1.second != snarlDists.snarlEnd.second) ? 0 : - snarlDists.snarlDistance(graph, &ng, make_pair( - snarl->end().node_id(), !snarl->end().backward()), node1); - int64_t distER = (node2 == snarlDists.snarlEnd) ? 0 : - snarlDists.snarlDistance(graph, &ng, make_pair( - snarl->end().node_id(), !snarl->end().backward()), node2Rev); - -#ifdef indexTraverse -cerr << "DISTANCES IN SNARL " << snarl->start().node_id() << " : " << distSL << " " << distSR << " " << distEL << " " << distER << endl; -#endif - int64_t distSRevTemp = minPos({ - ((distSRev == -1 || distSL == -1) ? -1 : distSRev + distSL), - ((distERev == -1 || distSR == -1) ? -1 : distERev + distSR)}); - - int64_t distSFdTemp = minPos({ - ((distSFd == -1 || distSL == -1) ? -1 : distSFd + distSL), - ((distEFd == -1 || distSR == -1) ? -1 : distEFd + distSR) }); - - int64_t distERevTemp = minPos({ - ((distSRev == -1 || distEL == -1) ? -1 : distSRev + distEL), - ((distERev == -1 || distER == -1) ? -1 : distERev + distER) }); - - int64_t distEFdTemp = minPos({ - ((distSFd == -1 || distEL == -1) ? -1 : distSFd + distEL), - ((distEFd == -1 || distER == -1) ? -1 : distEFd + distER) }); - - if (node1 != node2) { - int64_t distSL2 = snarlDists.snarlDistance(graph, &ng, make_pair( - snarl->start().node_id(), snarl->start().backward()), node2); - - int64_t distSR1 = snarlDists.snarlDistance(graph, &ng, make_pair( - snarl->start().node_id(), snarl->start().backward()), node1Rev); - - int64_t distEL2 = snarlDists.snarlDistance(graph, &ng, make_pair( - snarl->end().node_id(), !snarl->end().backward()), node2); - int64_t distER1 = snarlDists.snarlDistance(graph, &ng, make_pair( - snarl->end().node_id(), !snarl->end().backward()), node1Rev); - - distSRevTemp = minPos({distSRevTemp, - ((distERev1 == -1 || distSR1 == -1) ? -1 : distERev1 + distSR1)}); - - distSFdTemp = minPos({distSFdTemp, - ((distSFd2 == -1 || distSL2 == -1) ? -1 : distSFd2 + distSL2)}); - - distERevTemp = minPos({distERevTemp, - ((distERev1 == -1 || distER1 == -1) ? -1 : distERev1 + distER1) }); - - distEFdTemp = minPos({distEFdTemp, - ((distSFd2 == -1 || distEL2 == -1) ? 
-1 : distSFd2 + distEL2) }); - } - distSRev = distSRevTemp; - distSFd = distSFdTemp; - distERev = distERevTemp; - distEFd = distEFdTemp; - -#ifdef indexTraverse -cerr << "DISTANCES AFTER SNARL: " << distSRev << " " << distSFd << " " << distERev << " " << distEFd << endl; -#endif - node1 = snarlDists.snarlStart; - - node2 = node1; - - if (sm->in_nontrivial_chain(snarl)) { - //Loop distance through a chain - - node2 = snarlDists.snarlEnd; - - const Chain* chain = sm->chain_of(snarl); - - id_t chainStartID = get_start_of(*chain).node_id(); - id_t chainEndID = get_end_of(*chain).node_id(); - ChainIndex& chainDists = chainDistances.at(chainStartID); - bool chainRev = sm->chain_orientation_of(snarl); - - int64_t loopChain = chainDists.chainDistanceShort(graph, - make_pair(snarl->end().node_id(), chainRev), - make_pair(snarl->start().node_id(), chainRev)); - - int64_t loopL = chainDists.chainDistanceShort(graph, - make_pair(snarl->start().node_id(), !chainRev), - make_pair(snarl->start().node_id(), chainRev)); - int64_t loopR = chainDists.chainDistanceShort(graph, - make_pair(snarl->end().node_id(), chainRev), - make_pair(snarl->end().node_id(), !chainRev)); - -#ifdef indexTraverse -cerr << "LOOP DISTANCES IN CHAIN " << chainStartID << " from node " << snarl->start().node_id() << " to " << snarl->end().node_id() << " : " << loopChain << " " << loopL << " " << loopR << endl; -#endif - pair node1Chain = make_pair(node1.first, chainRev); - pair node2Rev = make_pair(node2.first, !chainRev); - - int64_t loop1 = loopChain == -1 || distSRev == -1 || distEFd == -1 ? -1 : - loopChain + distSRev + distEFd + nodeLen; - int64_t loop2 = loopChain == -1 || distSFd == -1 || distERev == -1 ? -1 : - loopChain + distSFd + distERev + nodeLen; - loopL = loopL == -1 || distSFd == -1 || distSRev == -1 ? -1 : - loopL + distSFd + distSRev + nodeLen; - loopR = loopR == -1 || distEFd == -1 || distERev == -1 ? -1 : - loopR + distEFd + distERev + nodeLen; - minLoop = minPos({minLoop, loop1, loop2, loopL, loopR }); - -#ifdef indexTraverse -cerr << " CHAIN LOOPS " << chainStartID << " : " << loop1 << " " << loop2 << " " << loopL << " " << loopR << endl; -#endif - - //Get distance to ends of the chain - int64_t distSL = chainDists.chainDistance(make_pair( - chainStartID, chainRev), node1Chain); - int64_t distSR = chainDists.chainDistance(make_pair( - chainStartID, chainRev), node2Rev); - int64_t distEL = chainDists.chainDistance(make_pair( - chainEndID, !chainRev), node1Chain); - int64_t distER = chainDists.chainDistance(make_pair( - chainEndID, !chainRev), node2Rev); - - - int64_t distSRevTemp = minPos({ - ((distSRev == -1 || distSL == -1) ? -1 : distSRev + distSL), - ((distERev == -1 || distSR == -1) ? -1 : distERev + distSR)}); - - int64_t distSFdTemp = minPos({ - ((distSFd == -1 || distSL == -1) ? -1 : distSFd + distSL), - ((distEFd == -1 || distSR == -1) ? -1 : distEFd + distSR) }); - - int64_t distERevTemp = minPos({ - ((distSRev == -1 || distEL == -1) ? -1 : distSRev + distEL), - ((distERev == -1 || distER == -1) ? -1 : distERev + distER) }); - - int64_t distEFdTemp = minPos({ - ((distSFd == -1 || distEL == -1) ? -1 : distSFd + distEL), - ((distEFd == -1 || distER == -1) ? -1 : distEFd + distER) }); - - distSRev = distSRevTemp; - distSFd = distSFdTemp; - distERev = distERevTemp; - distEFd = distEFdTemp; - -#ifdef indexTraverse - -cerr << "DISTANCES chain? 
: " << distSL << " " << distSR << " " << distEL << " " << distER << endl; -cerr << "DISTANCES TO ENDS OF CHAIN: " << distSRev << " " << distSFd << " " << distERev << " " << distEFd << endl; -#endif - bool rev1 = node1.first == chainEndID ? !get_start_of(*chain).backward() : get_start_of(*chain).backward(); - node1 = make_pair(chainStartID, rev1); - node2 = node1; - - - } - snarl = sm->parent_of(snarl); - - } - - return minLoop; - -} - - - - - -//////////////////////Methods for testing -int64_t DistanceIndex::checkChainDist(id_t snarl, size_t index) { - return chainDistances.at(snarl).prefixSum[index] - 1; -} -int64_t DistanceIndex::checkChainLoopFd(id_t snarl, size_t index) { - return chainDistances.at(snarl).loopFd[index] - 1; -} -int64_t DistanceIndex::checkChainLoopRev(id_t snarl, size_t index) { - return chainDistances.at(snarl).loopRev[index] - 1; -} -} diff --git a/src/distance.hpp b/src/distance.hpp deleted file mode 100644 index 9dd0978df90..00000000000 --- a/src/distance.hpp +++ /dev/null @@ -1,324 +0,0 @@ -#include "snarls.hpp" -#include "hash_map.hpp" -using namespace sdsl; -namespace vg { - -class DistanceIndex { - - /*The distance index. Used for calculation of the minimum distance between - two positions and for a maximum distance estimation. The maximum distance - estimation is at least as large as the maximum distance between two - positions up to a specified cap*/ - - public: - //Constructor - DistanceIndex (HandleGraph* vg, SnarlManager* snarlManager, uint64_t cap); - - //Constructor to load index from serialization - DistanceIndex (HandleGraph* vg, SnarlManager* snarlManager, istream& in); - - //Serialize object into out - void serialize(ostream& out); - - //Load serialized object from in - void load(istream& in); - - - /*Get the minimum distance between two positions - pos1 must be on a node contained in snarl1 and not on any children of - snarl1. The same for pos2 and snarl2 - */ - int64_t minDistance( pos_t pos1, pos_t pos2); - int64_t minDistance( - const Snarl* snarl1, const Snarl* snarl2, pos_t pos1, pos_t pos2); - - /*Get an upper bound of the distance between two positions */ - int64_t maxDistance(pos_t pos1, pos_t pos2); - - //Helper function to find the minimum value that is not -1 - static int64_t minPos(vector vals); - - - //Given a node, find the snarl containing it - const Snarl* snarlOf(id_t nodeID); - - protected: - void printSelf(); - class SnarlIndex { - - /* Stores distance information for nodes in a snarl. - visitToIndex maps each visit (node_id, negative if reverse) to an int - distances stores all the distance bewteen each pair of visits in a - snarl - */ - - public: - - //Constructor - SnarlIndex(DistanceIndex* di, - unordered_set>& allNodes, - pair start, pair end); - - //Construct from vector - inverse of toVector - SnarlIndex(DistanceIndex* di, vector v); - - /*Store contents of object as a vector of ints for serialization - Stored as [# nodes, start node id, end node id, snarl length] + - [visit to index as list of node ids in order of index] + - [distances] - */ - vector toVector(); - - //Distance between beginning of node start and beginning of node end - //Only works for nodes heading their chains (which represent the chains), or snarl boundaries. - int64_t snarlDistance(HandleGraph* graph,NetGraph* ng,pair start, - pair end); - - - //Distance between end of node start and beginning of node end - //Only works for nodes heading their chains (which represent the chains), or snarl boundaries. 
- int64_t snarlDistanceShort(pair start, - pair end); - - //Add the distance from start to end to the index - void insertDistance(pair start, pair curr, - int64_t dist); - - //Length of a node - int64_t nodeLength(HandleGraph*graph, NetGraph* ng, id_t node); - - //Total length of the snarl - int64_t snarlLength(HandleGraph* graph, NetGraph* ng); - - /*Given distances from a position to either end of a node, find the - shortest distance from that position to the start and end nodes of - the snarl - */ - pair distToEnds(HandleGraph* graph, NetGraph* ng, - id_t node, bool rev, int64_t distL, int64_t distR); - - void printSelf(); - - protected: - - //Maps node to index to get its distance - hash_map< pair, size_t> visitToIndex; - - /*Store the distance between every pair nodes, -1 indicates no path - For child snarls that are unary or only connected to one node - in the snarl, distances between that node leaving the snarl - and any other node is -1 - */ - int_vector<> distances; - - //ID of the first node in the snarl, also key for distance index - pair snarlStart; - - //End facing out of snarl - pair snarlEnd; - - //The index into distances for distance start->end - size_t index(pair start, pair end); - - private: - DistanceIndex* distIndex; - - - - friend class DistanceIndex; - friend class TestDistanceIndex; - }; - - class ChainIndex { - /*Stores distances between snarls in a chain*/ - - public: - - //Constructor - ChainIndex(hash_map s, vector p, - vector fd, vector rev ); - - //Constructor from vector of ints after serialization - ChainIndex(vector v); - - /*Convert contents into vector of ints for serialization - stored as [node_id1, prefixsum1 start, prefixsum1 end, - loopfd1, loopfd2, node_id2, ...] - */ - vector toVector(); - - /** - * Distance between two node sides in a chain. id_t values specify - * the nodes, and bool values specify the sides. Side orientations - * are relative to the node's orientation *in the chain*, so if - * reading through the chain in its forward orientation you - * encounter the node in reverse, then true is the *left* side of - * the node and false is the *right* side. - */ - int64_t chainDistance(pair start, pair end); - - /** - * Takes the graph and two node sides, with orientations specified - * relative to the nodes' orientation in their chain (i.e. nodes - * backward in the chain have false represent the *end* of the - * node). - * - * Returns the distance from the **opposite** side of the start - * node to the specified side of the end node. 
- */ - int64_t chainDistanceShort(HandleGraph* graph, pair start, - pair end); - //Length of entire chain - int64_t chainLength(); - - /* Given the distance from a position to either end of a snarl in - the chain, find the shortest distance from the position to - either end of the chain - */ - pair distToEnds(pair start, - int64_t distL, int64_t distR); - void printSelf(); - - protected: - - hash_map snarlToIndex; - - /*Dist from start of chain to start and end of each boundary node of - all snarls in the chain*/ - int_vector<> prefixSum; - - /*For each boundary node of snarls in the chain, the distance - from the start of the node traversing forward to the end of - the same node traversing backwards*/ - int_vector<> loopFd; - - /*For each boundary node of snarls in the chain, the distance - from the end of the node traversing backward to the start of - the same node traversing forward*/ - - int_vector<> loopRev; - - - - /*Helper function for finding distances*/ - int64_t chainDistanceHelper(pair start, - pair end, bool recurse = true); - - friend class DistanceIndex; - friend class TestDistanceIndex; - }; - - class MaxDistanceIndex { - //Index for calculating the maximum distance between two points - public: - - //Constructor - MaxDistanceIndex(); - MaxDistanceIndex(DistanceIndex* di, const vector chain, - uint64_t cap); - - //Actual distance function for finding upper bound for distance - int64_t maxDistance( pos_t pos1, pos_t pos2); - -//TODO: Finish testing protected: - int_vector<> nodeToComponent;//Maps each node to its connected component - int_vector<> minDistances; //Min and max distances to - int_vector<> maxDistances; //sink nodes in topological order - uint64_t numCycles; //Number of cyclic connected components - uint64_t numComponents; //Number of connected components - uint64_t cap; //Maximum distance to be considered - - void printSelf(); - - private: - //Helper functions for constructor - //Assign each node to a connected component of cycles - DistanceIndex* distIndex; - uint64_t findComponents( int_vector<>& nodeToComponent, - int_vector<>& maxDists, int_vector<>& minDistsFd, - int_vector<>& minDistsRev, - uint64_t currComponent, bool onlyCycles ); - //Populate minDistances and maxDistances - void calculateMaxDistances(unordered_set>& sinkNodes, - int_vector<>& nodeToComponent,int_vector<>& maxDists, - int_vector<>& minDistsFd, int_vector<>& minDistsRev); - - friend class DistanceIndex; - friend class TestDistanceIndex; - }; - - int64_t sizeOf(); - ///////// Data members of overall index - - //map each node to connected component for max distance estimation - hash_map nodeToCycles; - - //map from start node of a snarl to its index - unordered_map, SnarlIndex> snarlDistances; - - //map from node id of first node in snarl to that chain's index - hash_map chainDistances; - - //Graph and snarl manager for this index - HandleGraph* graph; - - SnarlManager* sm; - - - /*Index to find the snarl containing a node - The start node id of the snarl containing each node - negative if - the start node is reverse - TODO: Maybe put this somewhere else*/ - dac_vector<> nodeToSnarl; - id_t minNodeID; //minimum node id of the graph - id_t maxNodeID; //maximum node id of the graph - -// MaxDistanceIndex maxIndex; - - - - ////// Private helper functions - - - - - - //Helper function for constructor - populate the minimum distance index - int64_t calculateMinIndex(const Chain* chain); - - - //Helper function for constructor - populate node to snarl - int_vector<> 
calculateNodeToSnarl(SnarlManager* sm); - - //Flag each node with true if it is in a cycle that has minimum length - //smaller than cap - void flagCycles(const Snarl* snarl, bit_vector& inCycle, - uint64_t cap); - - //Minimum distance of a loop that involves node - int64_t loopDistance(pair node1, pair node2); - int64_t loopDistance( const Snarl* snarl1, const Snarl* snarl2, - pair node1, pair node2); - - /*Helper function for distance calculation - Returns the distance to the start of and end of the child snarl of - common ancestor containing snarl, commonAncestor if snarl is - the common ancestor - */ - pair, const Snarl*> distToCommonAncestor( - const Snarl* snarl, const Snarl* commonAncestor, pos_t& pos); - - - - // Methods for testing - int64_t checkChainDist(id_t snarl, size_t index); - int64_t checkChainLoopFd(id_t snarl, size_t index); - int64_t checkChainLoopRev(id_t snarl, size_t index); - friend class SnarlIndex; - friend class ChainIndex; - friend class TestDistanceIndex; - - -}; - -} diff --git a/src/dozeu_interface.cpp b/src/dozeu_interface.cpp new file mode 100644 index 00000000000..41a3900db3d --- /dev/null +++ b/src/dozeu_interface.cpp @@ -0,0 +1,766 @@ +/** + * @file dozeu_interface.hpp + * @author Hajime Suzuki + * @date 2018/03/23 + */ +#include +#include +#include + +#include "dozeu_interface.hpp" + +// Configure dozeu: +// We want the full length bonus included +#ifndef DZ_FULL_LENGTH_BONUS +#define DZ_FULL_LENGTH_BONUS +#endif +// We want the non-qual versions of functions +#ifdef DZ_QUAL_ADJ +#undef DZ_QUAL_ADJ +#endif +// We require these particular values for this enum because we index arrays with it. +enum { MISMATCH = 1, MATCH = 2, INS = 3, DEL = 4 }; +// Set dozeu's CIGAR codes to match our enum +#ifndef DZ_CIGAR_OP +#define DZ_CIGAR_OP 0x04030201 +#endif + + +// To turn on debugging: +//#define DEBUG +//#define DZ_PRINT_VECTOR + +#include + +#ifdef DEBUG +#include +#endif + + +using namespace vg; + +DozeuInterface::OrderedGraph::OrderedGraph(const HandleGraph& graph, const vector& order) : graph(graph), order(order) { + for (size_t i = 0; i < order.size(); ++i) { + index_of[order[i]] = i; + } +} + +void DozeuInterface::OrderedGraph::for_each_neighbor(const size_t i, bool go_left, + const function& lambda) const { + graph.follow_edges(order[i], go_left, [&](const handle_t& pred) { + auto it = index_of.find(pred); + if (it != index_of.end()) { + lambda(it->second); + } + return true; + }); +} + +size_t DozeuInterface::OrderedGraph::size() const { + return order.size(); +} + +static inline char comp(char x) +{ + switch(x) { + case 'a': case 'A': return('T'); + case 'c': case 'C': return('G'); + case 'g': case 'G': return('C'); + case 't': case 'T': return('A'); + default: return('N'); + } +} + + +DozeuInterface::graph_pos_s DozeuInterface::calculate_seed_position(const OrderedGraph& graph, const vector& mems, + size_t query_length, bool direction) const +{ + /* + * seed selection: + * the most upstream for forward one, the most downstream one for reverse one. + * FIXME: better selection strategy (or multiple trial) would be possible. + * TODO: longest MEM is a better heuristic, should just take a position, vector is unnecessary + * + * node_id extraction: + * use the most downstrem one for forward mapping to avoid any *artifact* + * caused by seed position. 
(redundant multiple mapping may reported due to redundant seeds; + * having almost the same position, length, and sequence but pass slightly different paths) + * GSSW aligner avoids such artifacts because it does not depends on any seed positions on + * computing the DP matrix (it is true local alignment). + */ + const MaximalExactMatch& seed = direction ? mems.back() : mems.front(); // here mems is never empty + auto seed_pos = direction ? seed.nodes.front() : seed.nodes.back(); + + graph_pos_s pos; + + // get node index + pos.node_index = graph.index_of.at(graph.graph.get_handle(gcsa::Node::id(seed_pos), gcsa::Node::rc(seed_pos))); + + // calc ref_offset + pos.ref_offset = direction ? (graph.graph.get_length(graph.order[pos.node_index]) - gcsa::Node::offset(seed_pos)) + : gcsa::Node::offset(seed_pos); + + // calc query_offset (FIXME: is there O(1) solution?) + // TODO: i think we need access to the original Alignment to make this O(1), but the one available in + // the parent function seems to not be the one that these MEMs are from + pos.query_offset = query_length - seed.length(); + for (auto p = seed.end; *p != '\0'; p++) { + pos.query_offset--; + } + pos.query_offset = direction ? query_length - pos.query_offset : pos.query_offset; + + // fprintf(stderr, "calc_seed_pos, direction(%d), rpos(%lu, %u), qpos(%u), len(%lu)\n", direction, pos.node_index, pos.ref_offset, pos.query_offset, seed.end - seed.begin); + return pos; +} + +DozeuInterface::graph_pos_s DozeuInterface::calculate_max_position(const OrderedGraph& graph, const graph_pos_s& seed_pos, size_t max_node_index, + bool direction, const vector& forefronts) +{ + // save node id + graph_pos_s pos; + pos.node_index = max_node_index; + + // Find the node + handle_t n = graph.order[pos.node_index]; + + assert(forefronts.at(max_node_index)->mcap != nullptr); + + // calc max position on the node + uint64_t max_pos = (uint64_t) dz_calc_max_pos(forefronts[max_node_index]); + + // ref-side offset fixup + int32_t rpos = (int32_t)(max_pos>>32); + + pos.ref_offset = direction ? -rpos : (graph.graph.get_length(n) - rpos); + + // query-side offset fixup + int32_t qpos = max_pos & 0xffffffff; + pos.query_offset = seed_pos.query_offset + (direction ? -qpos : qpos); + // fprintf(stderr, "calc_max_pos, rpos(%lu, %u), qpos(%u, %u)\n", pos.node_index, pos.ref_offset, seed_pos.query_offset, pos.query_offset); + return pos; +} + +pair DozeuInterface::scan_seed_position(const OrderedGraph& graph, const Alignment& alignment, + bool direction, vector& forefronts, + int8_t full_length_bonus, uint16_t max_gap_length) +{ + const string& query_seq = alignment.sequence(); + const string& query_qual = alignment.quality(); + + const uint64_t qlen = query_seq.length(), scan_len = qlen < 15 ? qlen : 15; // FIXME: scan_len should be variable + + const char* pack_seq = direction ? query_seq.c_str() : query_seq.c_str() + (qlen - scan_len); + const uint8_t* pack_qual = nullptr; + if (!alignment.quality().empty()) { + pack_qual = (const uint8_t*) (direction ? query_qual.c_str() : query_qual.c_str() + (qlen - scan_len)); + } + + const dz_query_s* packed_query = (direction + ? pack_query_reverse(pack_seq, pack_qual, full_length_bonus, scan_len) + : pack_query_forward(pack_seq, pack_qual, full_length_bonus, scan_len) + ); + + // make a root forefront + dz_alignment_init_s aln_init = dz_align_init(dz, max_gap_length); + + int64_t inc = direction ? -1 : 1; + int64_t max_idx = direction ? 
graph.order.size() - 1 : 0; + for (int64_t i = max_idx; i >= 0 && i < graph.order.size(); i += inc) { + + vector incoming_forefronts; + graph.for_each_neighbor(i, !direction, [&](size_t j){ + const dz_forefront_s* inc_ff = forefronts[j]; + incoming_forefronts.push_back(inc_ff); + }); + + auto seq = graph.graph.get_sequence(graph.order[i]); + if (incoming_forefronts.empty()) { + forefronts[i] = scan(packed_query, &aln_init.root, 1, + &seq.c_str()[direction ? seq.size() : 0], + direction ? -seq.size() : seq.size(), i, aln_init.xt); + } + else { + forefronts[i] = scan(packed_query, incoming_forefronts.data(), incoming_forefronts.size(), + &seq.c_str()[direction ? seq.size() : 0], + direction ? -seq.size() : seq.size(), i, aln_init.xt); + } + + if(forefronts[i]->max + (direction & dz_geq(forefronts[i])) > forefronts[max_idx]->max) { + max_idx = i; + } + } + + if (forefronts[max_idx]->mcap == nullptr) { + // the scan failed find a positive scoring seed alignment, we will return a placeholder + // and a flag that indicates the failure + return make_pair(graph_pos_s(), false); + } + else { + // find the maximum scoring position and return a success + graph_pos_s pos; + pos.node_index = 0; + pos.ref_offset = 0; + pos.query_offset = direction ? scan_len : qlen - scan_len; + graph_pos_s p = calculate_max_position(graph, pos, max_idx, direction, forefronts); + debug("node_index(%lu), ref_offset(%d), query_offset(%d), max(%d)", p.node_index, p.ref_offset, p.query_offset, forefronts[max_idx]->max); + return make_pair(p, true); + } +} + +size_t DozeuInterface::do_poa(const OrderedGraph& graph, const dz_query_s* packed_query, + const vector& seed_positions, bool right_to_left, + vector& forefronts, uint16_t max_gap_length) +{ + // seed_offset: 0-------->L for both forward and reverse + // right_to_left: true for a right-to-left pass with left-to-right traceback, false otherwise + + // ensure that the forefronts are reset + for (size_t i = 0; i < forefronts.size(); ++i) { + forefronts[i] = nullptr; + } + + // how far into the topological order we can start + size_t start_idx = right_to_left ? 0 : graph.order.size(); + + // initialze an alignment + dz_alignment_init_s aln_init = dz_align_init(dz, max_gap_length); + + debug("extend pass: %s over %lu forefronts", right_to_left ? "right-to-left" : "left-to-right", forefronts.size()); + + // seed an alignment at each of the seed positions + for (const graph_pos_s& seed_pos : seed_positions) { + + // get root node + auto root_seq = graph.graph.get_sequence(graph.order[seed_pos.node_index]); + + // load position and length + int64_t rlen = (right_to_left ? 0 : root_seq.size()) - seed_pos.ref_offset; + + + debug("seed rpos(%lu), rlen(%ld), nid(%ld), rseq(%s)", seed_pos.ref_offset, rlen, + graph.graph.get_id(graph.order[seed_pos.node_index]), root_seq.c_str()); + forefronts[seed_pos.node_index] = extend(packed_query, &aln_init.root, 1, + root_seq.c_str() + seed_pos.ref_offset, + rlen, seed_pos.node_index, aln_init.xt); + + // push the start index out as far as we can + if (right_to_left) { + start_idx = max(start_idx, seed_pos.node_index); + } + else { + start_idx = min(start_idx, seed_pos.node_index); + } + } + + size_t max_idx = start_idx; + //debug("root: node_index(%lu, %ld), ptr(%p), score(%d)", start_idx, graph.graph.get_id(graph.order[start_idx]), forefronts[start_idx], forefronts[start_idx]->max); + + int64_t inc = right_to_left ? 
-1 : 1; + for (int64_t i = start_idx + inc; i < graph.order.size() && i >= 0; i += inc) { + + vector incoming_forefronts; + graph.for_each_neighbor(i, !right_to_left, [&](size_t j) { + const dz_forefront_s* inc_ff = forefronts[j]; + if (inc_ff) { + incoming_forefronts.push_back(inc_ff); + } + }); + + if (!incoming_forefronts.empty()) { + + // TODO: if there were multiple seed positions and we didn't choose head nodes, we + // can end up clobbering them here, seems like it might be fragile if anyone develops this again... + + auto ref_seq = graph.graph.get_sequence(graph.order[i]); + + debug("extend rlen(%ld), nid(%ld), rseq(%s)", ref_seq.size(), + graph.graph.get_id(graph.order[i]), ref_seq.c_str()); + + forefronts[i] = extend(packed_query, incoming_forefronts.data(), incoming_forefronts.size(), + &ref_seq.c_str()[right_to_left ? ref_seq.length() : 0], + right_to_left ? -ref_seq.length() : ref_seq.length(), i, aln_init.xt); + } + + if (forefronts[i] != nullptr) { + if (forefronts[i]->max + (right_to_left & dz_geq(forefronts[i])) > forefronts[max_idx]->max) { + max_idx = i; + } + } + } + + // Get max query pos + assert(max_idx <= forefronts.size()); + assert(forefronts[max_idx] != nullptr); + +#ifdef DEBUG + if (forefronts[max_idx]->mcap != nullptr) { + + uint64_t query_max_pos = dz_calc_max_qpos(forefronts[max_idx]); + uint64_t ref_node_max_pos = dz_calc_max_rpos(forefronts[max_idx]); + + debug("max(%p), score(%d), qpos(%ld), rpos(%ld)", forefronts[max_idx], forefronts[max_idx]->max, query_max_pos, ref_node_max_pos); + } +#endif + return max_idx; +} + +// append an edit at the end of the current mapping array, returns forwarded length on the query +size_t DozeuInterface::push_edit(Mapping *mapping, uint8_t op, char const *alt, size_t len) const +{ + /* see aligner.cpp:gssw_mapping_to_alignment */ + #define _add_edit(_from_len, _to_len, _subseq) { \ + Edit *e = mapping->add_edit(); \ + e->set_from_length((_from_len)); \ + e->set_to_length((_to_len)); \ + /* expect a branch dependent on a compile-time NULL will be eliminated */ \ + if((_subseq) != nullptr) { e->set_sequence((char const *)(_subseq), (size_t)(_to_len)); } \ + } + + debug("push_edit: %lu%c in %s", len, "-XMID"[op], pb2json(mapping->position()).c_str()); + if(op == MISMATCH) { + // break down into multiple SNVs + // TODO: why not make this one substitution? + for(size_t i = 0; i < len; i++) { _add_edit(1, 1, &alt[i]); } + } else if (len > 0) { + // Only add an edit if the operation has nonzero length + alt = (op == INS) ? alt : nullptr; + size_t rlen = (op & 0x01) ? 0 : len; + size_t qlen = (op & 0x02) ? len : 0; + _add_edit(rlen, qlen, alt); len = qlen; + } + return(len); + + #undef _add_edit +} + +void DozeuInterface::calculate_and_save_alignment(Alignment &alignment, const OrderedGraph& graph, const vector& head_positions, + size_t tail_node_index, bool left_to_right, const vector& forefronts) +{ + // clear existing alignment (no matter if any significant path is not obtained) + alignment.clear_path(); + alignment.set_score(forefronts.at(tail_node_index)->max); + if(forefronts.at(tail_node_index)->max == 0) { + // No alignment scoring anything other than 0, or not safe to try and trace back. 
+ // Emit a full-length insertion + debug("no alignment; emit full length insertion"); + Mapping* m = alignment.mutable_path()->add_mapping(); + handle_t start = graph.order[head_positions.front().node_index]; + m->mutable_position()->set_node_id(graph.graph.get_id(start)); + m->mutable_position()->set_is_reverse(graph.graph.get_is_reverse(start)); + m->mutable_position()->set_offset(head_positions.front().ref_offset); + m->set_rank(1); + Edit* e = m->add_edit(); + e->set_from_length(0); + e->set_to_length(alignment.sequence().size()); + e->set_sequence(alignment.sequence()); + return; + } + + // If we have a nonzero score we should have a nonempty mcap on the winning forefront + assert(forefronts.at(tail_node_index)->mcap != nullptr); + + // traceback. This produces an alignment in the same order as we traversed the nodes when filling them in. + const dz_alignment_s* aln = trace(forefronts.at(tail_node_index)); + + if(aln == nullptr || aln->path_length == 0) { + // No alignment actually computed. + // Emit a full-length insertion + debug("no traceback; emit full length insertion"); + Mapping* m = alignment.mutable_path()->add_mapping(); + handle_t start = graph.order[head_positions.front().node_index]; + m->mutable_position()->set_node_id(graph.graph.get_id(start)); + m->mutable_position()->set_is_reverse(graph.graph.get_is_reverse(start)); + m->mutable_position()->set_offset(head_positions.front().ref_offset); + m->set_rank(1); + Edit* e = m->add_edit(); + e->set_from_length(0); + e->set_to_length(alignment.sequence().size()); + e->set_sequence(alignment.sequence()); + return; + } + + #ifdef DEBUG + // Dump the Dozeu alignment + // Make sure to translate CIGAR numbers to characters + string translated_path; + for (auto* op = aln->path; *op != 0; ++op) { + translated_path.push_back("-XMID"[*op]); + } + debug("ref_length(%u), query_length(%u), span_length(%d), path_length(%d), score(%d), path(%s)", aln->ref_length, aln->query_length, aln->span_length, aln->path_length, aln->score, translated_path.c_str()); + debug("matches(%u), mismatches(%u), inserts(%u), deletes(%u)", aln->match_count, aln->mismatch_count, aln->ins_count, aln->del_count); + for(size_t i = 0; i < aln->span_length; i++) { + dz_path_span_s const *s = &aln->span[i]; + + translated_path.clear(); + for (auto* op = &aln->path[s->offset]; op != &aln->path[s->offset] + (s[1].offset - s[0].offset); ++op) { + translated_path.push_back("-XMID"[*op]); + } + + debug("node_id(%u), subpath_length(%u:%u-%u), subpath(%s)", + s->id, + s[1].offset - s[0].offset, + s[0].offset, s[1].offset, + translated_path.c_str() + ); + } + #endif + + // Make sure it ends where we started traceback from. + assert(aln->span[aln->span_length - 1].id == tail_node_index); + + // Check the length and make sure it is right. + if (aln->query_length > alignment.sequence().size()) { + cerr << "[vg xdrop_aligner.cpp] Error: dozeu alignment query_length longer than sequence" << endl; + exit(1); + } + // aln->query_length can be shorter than alignment.sequence().size() if we + // didn't traceback from the very last base of the query, or if we didn't + // pack the whole query because of an offset. 
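+    // Note on the edit-packing convention used by the macros below: `state`
+    // keeps the pending run of CIGAR operations, with the operation code in
+    // its low 8 bits and the run length in the bits above, i.e.
+    // state = op | (run_length << 8). _append_op extends the run by adding
+    // 0x100 while the operation stays the same and, when the operation
+    // changes, flushes the pending run via _push_op/push_edit before starting
+    // a new run; _flush_op emits whatever run is still pending. For example,
+    // a run of three MATCH ops accumulates as MATCH | (3 << 8) and is emitted
+    // as one 3 bp match edit when the next different operation arrives.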
+ + #define _push_mapping(_id) ({ \ + handle_t n = graph.order[(_id)]; \ + Mapping *mapping = path->add_mapping(); \ + mapping->set_rank(path->mapping_size()); \ + Position *position = mapping->mutable_position(); \ + position->set_node_id(graph.graph.get_id(n)); \ + position->set_is_reverse(graph.graph.get_is_reverse(n)); \ + position->set_offset(ref_offset); ref_offset = 0; \ + mapping; \ + }) + #define _push_op(_m, _op, _len) { \ + query_offset += push_edit(_m, (_op), &query[query_offset], (_len)); \ + } + #define _append_op(_m, _op, _init) { \ + if((state & 0xff) == (_op)) { state += 0x100; } \ + else { _push_op(_m, state & 0xff, state>>8); state = (_op) | ((_init)<<8); } \ + } + #define _flush_op(_m, _next_op) { \ + _push_op(_m, state & 0xff, state>>8); state = (_next_op); \ + } + + // figure out which of the head positions we ended up using + graph_pos_s head_pos; + for (const graph_pos_s& pos : head_positions) { + if (pos.node_index == aln->span[0].id) { + head_pos = pos; + break; + } + } + + // Work out what region of the unpacked query sequence has a real alignment. + // query_min_pos is first, query_max_pos is past end + // It will be surrounded by inserts of the rest of the query. + // + // Account for the head_pos.query_offset. + // If left_to_right, it is number of leading bases in alignment.sequence() not packed and not visible to dozeu. + // Else, it is number of leading bases packd and visible to dozeu. + // Also, if not left_to_right, alignment.sequence() and dozeu's sequence run in opposite directions. + uint64_t query_max_pos; + uint64_t query_min_pos; + if (left_to_right) { + // We end where we end in the Dozeu sequence, plus the offset from the real sequence. + query_max_pos = aln->query_length + head_pos.query_offset; + // We begin the distance before there accounted for by the alignment + query_min_pos = query_max_pos - aln->match_count - aln->mismatch_count - aln->ins_count; + } else { + // Total packed length minus where we end in the Dozeu sequence counts from the left edge of aln.sequence + query_min_pos = head_pos.query_offset - aln->query_length; + // Then we advance by the number of query bases used + query_max_pos = query_min_pos + aln->match_count + aln->mismatch_count + aln->ins_count; + } + debug("aligned query region: %lu-%lu", query_min_pos, query_max_pos); + + + // extract query (again). + // We're going to go through it in alignment.sequence() order, no matter the order Dozeu ran in. + const string& query_seq = alignment.sequence(); + const char* query = query_seq.c_str(); + // Start a cursor at 0. If there is a leading insert, we will emit it and add the length into the cursor. + size_t query_offset = 0; + + // set score and pos + alignment.set_score(aln->score); + alignment.set_identity((double)aln->match_count / (double)query_seq.length()); + alignment.set_query_position(0); // always zero? + + // convert the result to protobuf object + Path *path = alignment.mutable_path(); + Mapping *m = nullptr; + if(left_to_right) { + // The order that Dozeu gave us the alignment in (and in which we + // filled the nodes) is left to right (i.e. 
the order we want to emit + // the alignment in) + uint64_t ref_offset = head_pos.ref_offset; + if (query_min_pos != 0) { + debug("leading insert of %ld bp will be required", query_min_pos); + } + uint64_t state = query_min_pos<<8; + + handle_t n = graph.order[aln->span[aln->span_length - 1].id]; + + debug("rid(%u, %ld), ref_length(%lu), ref_offset(%lu), query_length(%u), query_init_length(%lu)", aln->span[aln->span_length - 1].id, graph.graph.get_id(n), graph.graph.get_length(n), ref_offset, aln->query_length, state>>8); + + state |= state == 0 ? MATCH : INS; + for(size_t i = 0, path_offset = aln->span[0].offset; i < aln->span_length; i++) { + debug("accounted for query up to %lu/%lu", query_offset, query_max_pos); + dz_path_span_s const *span = &aln->span[i]; + debug("i(%lu), rid(%u, %ld), ref_length(%lu), path_offset(%lu), span->offset(%lu)", i, span->id, graph.graph.get_id(graph.order[aln->span[i].id]), graph.graph.get_length(graph.order[aln->span[i].id]), (uint64_t)path_offset, (uint64_t)span->offset); + + for(m = _push_mapping(span->id); path_offset < span[1].offset; path_offset++) { + _append_op(m, aln->path[path_offset], 1); + } + _flush_op(m, aln->path[path_offset]); + } + // We should have made it all the way through our region of the query. + debug("accounted for query up to %lu/%lu", query_offset, query_max_pos); + if(m != nullptr && query_seq.length() != query_offset) { + // We have extra query sequence that dozeu didn't actually align. + // Treat it as a trailing insert. + // TODO: how do we know it should be trailing? + debug("trailing insert of %ld bp to make up length difference", query_seq.length() - query_offset); + _push_op(m, INS, query_seq.length() - query_offset); + } + debug("rv: (%ld, %u) -> (%ld, %u), score(%d), %s\n", graph.graph.get_id(graph.order[aln->span[aln->span_length - 1].id]), head_pos.ref_offset, graph.graph.get_id(graph.order[aln->span[0].id]), aln->span[1].offset, aln->score, alignment.sequence().c_str()); + } else { + // The order that Dozeu gave us the alignment in (and in which we + // filled the nodes) is right to left (i.e. backwards). We have to flip + // it. Also we packed the query sequence in backward, so to follow + // along it forward, we go through the Dozeu alignment backward. + + uint64_t ref_offset = -((int32_t)aln->rrem); + if (query_min_pos != 0) { + debug("leading insert of %ld bp will be required", query_min_pos); + } + uint64_t state = query_min_pos<<8; + + handle_t n = graph.order[aln->span[aln->span_length - 1].id]; + + debug("rid(%u, %ld), ref_length(%lu), ref_offset(%lu), query_length(%lu), query_aln_length(%u), query_init_length(%lu)", aln->span[aln->span_length - 1].id, graph.graph.get_id(n), graph.graph.get_length(n), ref_offset, query_seq.length(), aln->query_length, state>>8); + + state |= state == 0 ? MATCH : INS; + for(size_t i = aln->span_length, path_offset = aln->path_length; i > 0; i--) { + debug("accounted for query up to %lu/%lu", query_offset, query_max_pos); + dz_path_span_s const *span = &aln->span[i - 1]; + debug("i(%lu), rid(%u, %ld), ref_length(%lu), path_offset(%lu), span->offset(%lu)", i, span->id, graph.graph.get_id(graph.order[aln->span[i - 1].id]), graph.graph.get_length(graph.order[aln->span[i - 1].id]), (uint64_t)path_offset, (uint64_t)span->offset); + + for(m = _push_mapping(span->id); path_offset > span->offset; path_offset--) { + _append_op(m, aln->path[path_offset - 1], 1); + } + _flush_op(m, aln->path[path_offset - 1]); + } + // We should have made it all the way through our region of the query. 
+ debug("accounted for query up to %lu/%lu", query_offset, query_max_pos); + if(m != nullptr && query_seq.length() != query_offset) { + // We have extra query sequence that dozeu didn't actually align. + // Treat it as a trailing insert. + // TODO: how do we know it should be trailing? + debug("trailing insert of %ld bp to make up length difference", query_seq.length() - query_offset); + _push_op(m, INS, query_seq.length() - query_offset); + } + debug("fw: (%ld, %u) -> (%ld, %u), score(%d), %s", graph.graph.get_id(graph.order[aln->span[aln->span_length - 1].id]), -((int32_t)aln->rrem), graph.graph.get_id(graph.order[aln->span[0].id]), aln->span[1].offset, aln->score, alignment.sequence().c_str()); + } + return; + + #undef _push_mapping + #undef _push_op + #undef _append_op + #undef _flush_op +} + +#if 0 +void DozeuInterface::debug_print(const Alignment& alignment, const OrderedGraph& graph, const MaximalExactMatch& seed, bool reverse_complemented) const +{ + uint64_t seed_pos = gcsa::Node::offset(seed.nodes.front()); + uint64_t rlen = graph.graph.get_length(graph.order[graph.index_of.at(gcsa::Node::id(seed_pos))]; + auto sequence = graph.graph.get_sequence(graph.order[graph.index_of(gcsa::Node::id(seed_pos))]); + char const *rseq = sequence.c_str(); + uint64_t qlen = alignment.sequence().length(), qpos = calculate_query_seed_pos(alignment, seed); + char const *qseq = alignment.sequence().c_str(); + fprintf(stderr, "xdrop_aligner::align, rev(%d), ptr(%p, %p), (%u, %u, %lu), (%d, %d), %s\n", + reverse_complemented, + &(*seed.begin), &(*alignment.sequence().begin()), + qpos, seed.end - seed.begin, rlen, + gcsa::Node::id(seed_pos), gcsa::Node::offset(seed_pos), + alignment.sequence().c_str()); + + if(reverse_complemented) { + for(uint64_t i = qpos; i > 0; i--) { fprintf(stderr, "%c", comp(qseq[i - 1])); } fprintf(stderr, "\n"); + for(uint64_t i = rlen - gcsa::Node::offset(seed_pos); i > 0; i--) { fprintf(stderr, "%c", comp(rseq[i - 1])); } fprintf(stderr, "\n"); + } else { + for(uint64_t i = qpos; i < qlen; i++) { fprintf(stderr, "%c", qseq[i]); } fprintf(stderr, "\n"); + for(uint64_t i = gcsa::Node::offset(seed_pos); i < rlen; i++) { fprintf(stderr, "%c", rseq[i]); } fprintf(stderr, "\n"); + } + return; +} +#endif + +/* + * align query: forward-backward banded alignment + * + * First we find a "head" position, on the upstream side of the backing graph. If we have MEMs we do it by extending an alignment back from the most backing-downstream MEM; if we don't have MEMs then we hunt for a good match see dourdelves. + * + * Then we extend the head seed backing-downstream, and trace that back to find the optimal alignment. 
+ */ +void DozeuInterface::align(Alignment& alignment, const HandleGraph& graph, const vector& mems, + bool reverse_complemented, int8_t full_length_bonus, uint16_t max_gap_length) +{ + vector topological_order = handlealgs::lazy_topological_order(&graph); + return align(alignment, graph, topological_order, mems, reverse_complemented, max_gap_length); +} + +void DozeuInterface::align(Alignment& alignment, const HandleGraph& graph, const vector& order, + const vector& mems, bool reverse_complemented, + int8_t full_length_bonus, uint16_t max_gap_length) +{ + + const OrderedGraph ordered_graph(graph, order); + + // debug_print(alignment, graph, mems[0], reverse_complemented); + + // compute direction (currently just copied), FIXME: direction (and position) may contradict the MEMs when the function is called via the unfold -> dagify path + bool direction = reverse_complemented; + + // extract query + const string& query_seq = alignment.sequence(); + const string& query_qual = alignment.quality(); + + // construct node_id -> index mapping table + vector forefronts(ordered_graph.size(), nullptr); + + // extract seed node + graph_pos_s head_pos; + if(mems.empty()) { + // seeds are not available here; probably called from mate_rescue + + // scan seed position mems is empty + bool scan_success; + tie(head_pos, scan_success) = scan_seed_position(ordered_graph, alignment, direction, forefronts, + full_length_bonus, max_gap_length); + if (!scan_success) { + // we failed to find a seed, so we will not attempt an alignment + // clear the path just in case we're realigning a GAM + alignment.clear_path(); + return; + } + } + else { + // ordinary extension DP + + // we need seed to build edge table (for semi-global extension) + graph_pos_s seed_pos = calculate_seed_position(ordered_graph, mems, query_seq.size(), direction); + + const char* pack_seq = direction ? query_seq.c_str() : query_seq.c_str() + seed_pos.query_offset; + const uint8_t* pack_qual = nullptr; + if (!alignment.quality().empty()) { + pack_qual = (const uint8_t*) (direction ? query_qual.c_str() : query_qual.c_str() + seed_pos.query_offset); + } + + // pack query (upward) + const dz_query_s* packed_query_seq_up = (direction + ? pack_query_reverse(pack_seq, pack_qual, full_length_bonus, seed_pos.query_offset) + : pack_query_forward(pack_seq, pack_qual, full_length_bonus, query_seq.size() - seed_pos.query_offset) + ); + // upward extension + head_pos = calculate_max_position(ordered_graph, seed_pos, + do_poa(ordered_graph, packed_query_seq_up, + {seed_pos}, direction, forefronts, max_gap_length), + direction, forefronts); + } + // fprintf(stderr, "head_node_index(%lu), rpos(%lu, %u), qpos(%u), direction(%d)\n", head_pos.node_index, head_pos.node_index, head_pos.ref_offset, head_pos.query_offset, direction); + + // Now that we have determined head_pos, do the downward alignment from there, and the traceback. 
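+    // Note: do_poa() (called inside align_downward) clears `forefronts`
+    // before filling the graph again, so the forefronts filled in above by
+    // the seed scan or the upward extension are discarded here, not reused.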
+ align_downward(alignment, ordered_graph, {head_pos}, reverse_complemented, forefronts, full_length_bonus, max_gap_length); + + #ifdef DEBUG + if (mems.empty()) { + fprintf(stderr, "rescue: score(%d)\n", alignment.score()); + } + #endif + + // bench_end(bench); +} + +void DozeuInterface::align_downward(Alignment& alignment, const OrderedGraph& graph, const vector& head_positions, + bool left_to_right, vector& forefronts, + int8_t full_length_bonus, uint16_t max_gap_length) +{ + + // we're now allowing multiple graph start positions, but not multiple read start positions + for (size_t i = 1; i < head_positions.size(); ++i) { + assert(head_positions.at(i).query_offset == head_positions.front().query_offset); + } + + // extract query + const string& query_seq = alignment.sequence(); + const string& query_qual = alignment.quality(); + const uint64_t qlen = query_seq.length(); + + const char* pack_seq = left_to_right ? query_seq.c_str() + head_positions.front().query_offset : query_seq.c_str(); + const uint8_t* pack_qual = nullptr; + if (!alignment.quality().empty()) { + pack_qual = (const uint8_t*) (left_to_right ? query_qual.c_str() + head_positions.front().query_offset : query_qual.c_str()); + } + + // pack query (downward) + const dz_query_s* packed_query_seq_dn = (left_to_right + ? pack_query_forward(pack_seq, pack_qual, full_length_bonus, qlen - head_positions.front().query_offset) + : pack_query_reverse(pack_seq, pack_qual, full_length_bonus, head_positions.front().query_offset) + ); + + // downward extension + calculate_and_save_alignment(alignment, graph, head_positions, + do_poa(graph, packed_query_seq_dn, head_positions, !left_to_right, + forefronts, max_gap_length), + left_to_right, forefronts); + + // clear the memory + flush(); +} + +void DozeuInterface::align_pinned(Alignment& alignment, const HandleGraph& g, bool pin_left, + int8_t full_length_bonus, uint16_t max_gap_length) +{ + // Compute our own topological order + vector order = handlealgs::lazy_topological_order(&g); + + if (order.empty()) { + // Can't do anything with no nodes in the graph. + return; + } + + // Dozeu needs a seed position to start at, but that position doesn't necessarily actually become a match. + + // Find all of the tips that we'd want to pin at + vector head_positions; + for (size_t i = 0; i < order.size(); ++i) { + handle_t handle = order[i]; + // check if this is a tip in the correct direction so that we'd want to pin on it + bool do_pinning = g.follow_edges(handle, pin_left, [](const handle_t& neighbor) { return false; }); + if (do_pinning) { + head_positions.emplace_back(); + head_positions.back().node_index = i; + if (pin_left) { + head_positions.back().ref_offset = 0; + head_positions.back().query_offset = 0; + } + else { + head_positions.back().ref_offset = g.get_length(handle); + head_positions.back().query_offset = alignment.sequence().size(); + } + } + } + + + // Attach order to graph + OrderedGraph ordered(g, order); + + // construct node_id -> index mapping table + vector forefronts(ordered.order.size(), nullptr); + + // Do the left-to-right alignment from the fixed head_pos seed, and then do the traceback. 
+ align_downward(alignment, ordered, head_positions, pin_left, forefronts, full_length_bonus, max_gap_length); +} + +/** + * end of dozeu_interface.cpp + */ diff --git a/src/dozeu_interface.hpp b/src/dozeu_interface.hpp new file mode 100644 index 00000000000..a751d4d39f4 --- /dev/null +++ b/src/dozeu_interface.hpp @@ -0,0 +1,342 @@ +/** + * @file dozeu_interface.hpp + * @author Hajime Suzuki + * @date 2018/03/23 + */ +#ifndef VG_DOZEU_INTERFACE_HPP_INCLUDED +#define VG_DOZEU_INTERFACE_HPP_INCLUDED + +#include +#include +#include +#include +#include + +#include +#include "types.hpp" +#include "handle.hpp" +#include "mem.hpp" + +// #define BENCH +// #include "bench.h" + +// forward declarations of dozeu structs +struct dz_s; +struct dz_forefront_s; +struct dz_query_s; +struct dz_alignment_s; + +namespace vg { + +static constexpr uint16_t default_xdrop_max_gap_length = 40; + +/** + * Align to a graph using the xdrop algorithm, as implemented in dozeu. + * + * The underlying Dozeu library is fundamentally based around semi-global + * alignment: extending an alignment from a known matching position (what + * in other parts of vg we call "pinned" alignment). + * + * To simulate non-pinned alignment, we align in two passes in different + * directions. One from a guess of a pinning position, to get a more + * accurate "head" pinning position for the other end, and once back from + * where the previous pass ended up, to get an overall hopefully-optimal + * alignment. + * + * If the input graph is not reverse-complemented, direction = false + * (reverse, right to left) on the first pass, and direction = true + * (forward, left to right) on the second. If it is reverse complemented, + * we flip them. + * + * This won't actually work in theory to get the optimal local alignment in + * all cases, but it works well in practice. + * + * This class maintains an internal dz_s, which is *NOT THREADSAFE*, + * and non-const during alignments. However, it may be reused for + * subsequent alignments. + */ +class DozeuInterface { + +public: + + virtual ~DozeuInterface() = default; + /** + * align query: forward-backward banded alignment + * + * Compute an alignment of the given Alignment's sequence against the + * given DAG, using (one of) the given MEMs to seed the alignment. + * + * reverse_complemented is true if the topologically sorted graph we + * have was reverse-complemented when extracted from a larger + * containing graph, and false if it is in the same orientation as it + * exists in the larger containing graph. The MEMs and the Alignment + * are interpreted as being against the forward strand of the passed + * subgraph no matter the value of this setting. + * + * reverse_complemented true means we will compute the alignment + * forward in the topologically-sorted order of the given graph + * (anchoring to the first node if no MEMs are provided) and false if + * we want to compute the alignment backward in the topological order + * (anchoring to the last node). + * + * First the head (the most upstream) seed in MEMs is selected and + * extended downward to detect the downstream breakpoint. Next the + * alignment path is generated by second upward extension from the + * downstream breakpoint. + * + * The MEM list may be empty. If MEMs are provided, uses only the + * begin, end, and nodes fields of the MaximalExactMatch objects. It + * uses the first occurrence of the last MEM if reverse_complemented is + * true, and the last occurrence of the first MEM otherwise. 
+ */ + void align(Alignment& alignment, const HandleGraph& graph, const vector& mems, + bool reverse_complemented, int8_t full_length_bonus, + uint16_t max_gap_length = default_xdrop_max_gap_length); + + /** + * Same as above except using a precomputed topological order, which + * need not include all handles in the graph, and which may contain both + * orientations of a handle. + */ + void align(Alignment& alignment, const HandleGraph& graph, const vector& order, + const vector& mems, bool reverse_complemented, + int8_t full_length_bonus, uint16_t max_gap_length = default_xdrop_max_gap_length); + + /** + * Compute a pinned alignment, where the start (pin_left=true) or end + * (pin_left=false) end of the Alignment sequence is pinned to the + * start of the first (pin_left=true) or end of the last + * (pin_left=false) node in the graph's topological order. + * + * Does not account for multiple sources/sinks in the topological + * order; whichever comes first/last ends up being used for the pin. + */ + void align_pinned(Alignment& alignment, const HandleGraph& g, bool pin_left, + int8_t full_length_bonus, uint16_t max_gap_length = default_xdrop_max_gap_length); + +protected: + /** + * Represents a correspondance between a position in the subgraph we are + * mapping to and a position in the read we are mapping. + */ + struct graph_pos_s { + /// What index in the node list of our extracted subgraph is our node at? + size_t node_index; + /// What is the offset in the node? Note that we only think about the forward strand. + uint32_t ref_offset; + /// What is the correspondign offset in the query sequence? + uint32_t query_offset; + }; + + /** + * Represents a HandleGraph with a defined (topological) order calculated for it. + */ + struct OrderedGraph { + OrderedGraph(const HandleGraph& graph, const vector& order); + void for_each_neighbor(const size_t i, bool go_left, const function& lambda) const; + size_t size() const; + + const HandleGraph& graph; + const vector& order; + unordered_map index_of; + }; + + // wrappers for dozeu functions that can be used to toggle between between quality + // adjusted and standard alignments + virtual dz_query_s* pack_query_forward(const char* seq, const uint8_t* qual, + int8_t full_length_bonus, size_t len) = 0; + virtual dz_query_s* pack_query_reverse(const char* seq, const uint8_t* qual, + int8_t full_length_bonus, size_t len) = 0; + virtual const dz_forefront_s* scan(const dz_query_s* query, const dz_forefront_s** forefronts, + size_t n_forefronts, const char* ref, int32_t rlen, uint32_t rid, + uint16_t xt) = 0; + virtual const dz_forefront_s* extend(const dz_query_s* query, const dz_forefront_s** forefronts, + size_t n_forefronts, const char* ref, int32_t rlen, + uint32_t rid, uint16_t xt) = 0; + virtual dz_alignment_s* trace(const dz_forefront_s* forefront) = 0; + virtual void flush() = 0; + + /// Given the subgraph we are aligning to, the MEM hist against it, the + /// length of the query, and the direction we are aligning the query in + /// (true = forward), select a single anchoring match between the graph + /// and the query to align out from. + /// + /// This replaces scan_seed_position for the case where we have MEMs. + graph_pos_s calculate_seed_position(const OrderedGraph& graph, const vector& mems, + size_t query_length, bool direction) const; + /// Given the index of the node at which the winning score occurs, find + /// the position in the node and read sequence at which the winning + /// match is found. 
+ graph_pos_s calculate_max_position(const OrderedGraph& graph, const graph_pos_s& seed_pos, + size_t max_node_index, bool direction, + const vector& forefronts); + + /// If no seeds are provided as alignment input, we need to compute our own starting anchor position. This function does that. + /// Takes the topologically-sorted graph, the query sequence, and the direction. + /// If direction is false, finds a seed hit on the first node of the graph. If it is true, finds a hit on the last node. + /// + /// This replaces calculate_seed_position for the case where we have no MEMs. + /// + /// The bool return with the position indicates whether the scan succeeded or failed. + /// If the scan failed, then the alignment should not be attempted. + pair scan_seed_position(const OrderedGraph& graph, const Alignment& alignment, + bool direction, vector& forefronts, + int8_t full_length_bonus, uint16_t max_gap_length); + + /// Append an edit at the end of the current mapping array. + /// Returns the length passed in. + size_t push_edit(Mapping *mapping, uint8_t op, const char* alt, size_t len) const; + + /// Do alignment. Takes the graph, the sorted packed edges in + /// ascending order for a forward pass or descending order for a + /// reverse pass, the packed query sequence, the index of the seed node + /// in the graph, the offset (TODO: in the read?) of the seed position, + /// and the direction to traverse the graph topological order. + /// + /// Note that we take our direction as right_to_left, whole many other + /// functions take it as left_to_right. + /// + /// If a MEM seed is provided, this is run in two passes. The first is + /// left to right (right_to_left = false) if align did not have + /// reverse_complement set and the second is right to left (right_to_left = + /// true). + /// + /// If we have no MEM seed, we only run one pass (the second one). + /// + /// Returns the index in the topological order of the node with the + /// highest scoring alignment. + /// + /// Note that if no non-empty local alignment is found, it may not be + /// safe to call dz_calc_max_qpos on the associated forefront! + size_t do_poa(const OrderedGraph& graph, const dz_query_s* packed_query, + const vector& seed_positions, bool right_to_left, + vector& forefronts, uint16_t); + + /** + * After all the alignment work has been done, do the traceback and + * save into the given Alignment object. + * + * If left_to_right is true, the nodes were filled left to right, and + * the internal traceback will come out in left to right order, so we + * can emit it as is. If it is false, the nodes were filled right to + * left, and the internal traceback comes out in right to left order, + * so we need to flip it. + */ + void calculate_and_save_alignment(Alignment& alignment, const OrderedGraph& graph, + const vector& head_positions, + size_t tail_node_index, bool left_to_right, + const vector& forefronts); + + // void debug_print(Alignment const &alignment, OrderedGraph const &graph, MaximalExactMatch const &seed, bool reverse_complemented); + // bench_t bench; + + /// After doing the upward pass and finding head_pos to anchor from, do + /// the downward alignment pass and traceback. If left_to_right is + /// set, goes left to right and traces back the other way. If it is + /// unset, goes right to left and traces back the other way. 
+ void align_downward(Alignment &alignment, const OrderedGraph& graph, + const vector& head_positions, + bool left_to_right, vector& forefronts, + int8_t full_length_bonus, uint16_t max_gap_length); + + + /// The core dozeu class, which does the alignments + dz_s* dz = nullptr; +}; + +/* + * A dozeu-backed X-drop aligner that does not use base qualities + * to adjust alignment scores. + */ +class XdropAligner : public DozeuInterface { +public: + + /// Main constructor. Expects a 4 x 4 score matrix. + XdropAligner(const int8_t* _score_matrix, + int8_t _gap_open, + int8_t _gap_extension); + + // see DozeuInterface::align and DozeuInterface::align_pinned below for alignment + // interface + +private: + + // implementations of virtual functions from DozeuInterface + dz_query_s* pack_query_forward(const char* seq, const uint8_t* qual, int8_t full_length_bonus, size_t len); + dz_query_s* pack_query_reverse(const char* seq, const uint8_t* qual, int8_t full_length_bonus, size_t len); + const dz_forefront_s* scan(const dz_query_s* query, const dz_forefront_s** forefronts, + size_t n_forefronts, const char* ref, int32_t rlen, uint32_t rid, + uint16_t xt); + const dz_forefront_s* extend(const dz_query_s* query, const dz_forefront_s** forefronts, + size_t n_forefronts, const char* ref, int32_t rlen, + uint32_t rid, uint16_t xt); + dz_alignment_s* trace(const dz_forefront_s* forefront); + void flush(); + +public: + + XdropAligner() = default; + ~XdropAligner(void); + + /// Copy constructor + XdropAligner(const XdropAligner& other); + /// Copy assignment + XdropAligner& operator=(const XdropAligner& other); + /// Move constructor + XdropAligner(XdropAligner&& other); + /// Move assignment + XdropAligner& operator=(XdropAligner&& other); +}; + +/* + * A dozeu-backed X-drop aligner that uses base qualities to adjust + * alignment scores. + */ +class QualAdjXdropAligner : public DozeuInterface { +public: + + /// Main constructor. 
Expects a 4 x 4 score matrix and a 4 x 4 x 64 quality adjusted matrix + QualAdjXdropAligner(const int8_t* _score_matrix, + const int8_t* _qual_adj_score_matrix, + int8_t _gap_open, + int8_t _gap_extension); + + + // see DozeuInterface::align and DozeuInterface::align_pinned below for alignment + // interface + +private: + + // implementations of virtual functions from DozeuInterface + dz_query_s* pack_query_forward(const char* seq, const uint8_t* qual, int8_t full_length_bonus, size_t len); + dz_query_s* pack_query_reverse(const char* seq, const uint8_t* qual, int8_t full_length_bonus, size_t len); + const dz_forefront_s* scan(const dz_query_s* query, const dz_forefront_s** forefronts, + size_t n_forefronts, const char* ref, int32_t rlen, uint32_t rid, + uint16_t xt); + const dz_forefront_s* extend(const dz_query_s* query, const dz_forefront_s** forefronts, + size_t n_forefronts, const char* ref, int32_t rlen, + uint32_t rid, uint16_t xt); + dz_alignment_s* trace(const dz_forefront_s* forefront); + void flush(); + +public: + + QualAdjXdropAligner() = default; + ~QualAdjXdropAligner(void); + + /// Copy constructor + QualAdjXdropAligner(const QualAdjXdropAligner& other); + /// Copy assignment + QualAdjXdropAligner& operator=(const QualAdjXdropAligner& other); + /// Move constructor + QualAdjXdropAligner(QualAdjXdropAligner&& other); + /// Move assignment + QualAdjXdropAligner& operator=(QualAdjXdropAligner&& other); +}; + +} // end of namespace vg + +#endif // VG_DOZEU_INTERFACE_HPP_INCLUDED +/** + * end of dozeu_interface.hpp + */ diff --git a/src/dozeu_pinning_overlay.cpp b/src/dozeu_pinning_overlay.cpp new file mode 100644 index 00000000000..ff3f96e1714 --- /dev/null +++ b/src/dozeu_pinning_overlay.cpp @@ -0,0 +1,222 @@ +/** + * \file dozeu_pinning_overlay.cpp: contains the implementation of DozeuPinningOverlay + */ + + +#include "dozeu_pinning_overlay.hpp" + + +namespace vg { + +using namespace std; + +DozeuPinningOverlay::DozeuPinningOverlay(const HandleGraph* graph, bool preserve_sinks) : graph(graph), preserve_sinks(preserve_sinks) { + + unordered_set empty_nodes; + + // find the numeric range of handles in the underlying graph (needed for later bookkeeping) + // and all nodes with no sequence + uint64_t min_handle = std::numeric_limits::max(); + graph->for_each_handle([&](const handle_t& handle) { + + min_handle = min(handlegraph::as_integer(handle), min_handle); + min_handle = min(handlegraph::as_integer(graph->flip(handle)), min_handle); + max_handle = max(handlegraph::as_integer(handle), max_handle); + max_handle = max(handlegraph::as_integer(graph->flip(handle)), max_handle); + + if (graph->get_length(handle) == 0) { + empty_nodes.insert(handle); + } + }); + // keep track of these values + num_null_nodes = empty_nodes.size(); + handle_val_range = max_handle - min_handle + 1; + + for (const handle_t& empty : empty_nodes) { + + // is this empty node a source/sink (depending on what we're preserving)? 
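+        // follow_edges() returns true only if the iteratee never asked to stop; with an
+        // iteratee that always returns false, that can only happen when there are no
+        // edges on that side, i.e. the node really is a source/sink in that direction.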
+ bool should_preserve = graph->follow_edges(empty, !preserve_sinks, [&](const handle_t& next) { return false; }); + + if (should_preserve) { + // check the neighbors of the empty source/sink + graph->follow_edges(empty, preserve_sinks, [&](const handle_t& next) { + // walk back in the direction of the empty node and check if there are any different neighbors + bool must_duplicate = !graph->follow_edges(next, !preserve_sinks, [&](const handle_t& prev) { return prev == empty; }); + + if (must_duplicate) { + // this node has a length 0 path from a source/sink, but it will not once we mask out the + // length 0 nodes, so we have to duplicate it so that it retains its ability to be used as + // a pinning point + duplicated_handles.insert(next); + } + }); + } + } + num_null_nodes = empty_nodes.size(); +} + +bool DozeuPinningOverlay::performed_duplications() const { + return !duplicated_handles.empty(); +} + +bool DozeuPinningOverlay::has_node(id_t node_id) const { + if (is_a_duplicate_id(node_id)) { + id_t under_id = get_underlying_id(node_id); + if (graph->has_node(under_id)) { + handle_t handle = graph->get_handle(under_id); + return duplicated_handles.count(handle); + } + } + else { + if (graph->has_node(node_id)) { + handle_t handle = graph->get_handle(node_id); + return graph->get_length(handle) != 0; + } + } + return false; +} + +handle_t DozeuPinningOverlay::get_handle(const id_t& node_id, bool is_reverse) const { + if (is_a_duplicate_id(node_id)) { + return get_duplicate_handle(graph->get_handle(get_underlying_id(node_id), is_reverse)); + } + else { + return graph->get_handle(node_id, is_reverse); + } +} + +id_t DozeuPinningOverlay::get_id(const handle_t& handle) const { + if (is_a_duplicate_handle(handle)) { + return graph->get_id(get_underlying_handle(handle)) + (graph->max_node_id() - graph->min_node_id() + 1); + } + else { + return graph->get_id(handle); + } +} + +bool DozeuPinningOverlay::get_is_reverse(const handle_t& handle) const { + if (is_a_duplicate_handle(handle)) { + return graph->get_is_reverse(get_underlying_handle(handle)); + } + else { + return graph->get_is_reverse(handle); + } +} + +handle_t DozeuPinningOverlay::flip(const handle_t& handle) const { + if (is_a_duplicate_handle(handle)) { + return get_duplicate_handle(graph->flip(get_underlying_handle(handle))); + } + else { + return graph->flip(handle); + } +} + +size_t DozeuPinningOverlay::get_length(const handle_t& handle) const { + if (is_a_duplicate_handle(handle)) { + return graph->get_length(get_underlying_handle(handle)); + } + else { + return graph->get_length(handle); + } +} + +string DozeuPinningOverlay::get_sequence(const handle_t& handle) const { + if (is_a_duplicate_handle(handle)) { + return graph->get_sequence(get_underlying_handle(handle)); + } + else { + return graph->get_sequence(handle); + } +} + +bool DozeuPinningOverlay::follow_edges_impl(const handle_t& handle, bool go_left, + const function& iteratee) const { + handle_t to_iterate = handle; + if (is_a_duplicate_handle(handle)) { + // this is a duplicate that we made to preserve sources/sinks + if (preserve_sinks != (go_left != graph->get_is_reverse(handle))) { + // we are going in the direction where the duplicate has no edges + return true; + } + to_iterate = get_underlying_handle(handle); + } + + return graph->follow_edges(to_iterate, go_left, [&](const handle_t& next) { + bool keep_going = true; + if (graph->get_length(next) > 0) { + // the node is non-empty, so it hasn't been removed + keep_going = iteratee(next); + if (keep_going && 
duplicated_handles.count(graph->forward(next))) { + // the node has a duplicate + if (preserve_sinks != (go_left != graph->get_is_reverse(next))) { + // we arrived at this node over an edge that the duplicate also shares + keep_going = iteratee(get_duplicate_handle(next)); + } + } + } + return keep_going; + }); +} + +bool DozeuPinningOverlay::for_each_handle_impl(const function& iteratee, bool parallel) const { + // iterate over the original non empty nodes + bool keep_going = graph->for_each_handle([&](const handle_t& handle) { + if (graph->get_length(handle) > 0) { + return iteratee(handle); + } + else { + return true; + } + }, parallel); + // iterate over the duplicates + for (auto it = duplicated_handles.begin(), end = duplicated_handles.end(); it != end && keep_going; ++it) { + keep_going = iteratee(get_duplicate_handle(*it)); + } + return keep_going; +} + +size_t DozeuPinningOverlay::get_node_count() const { + return graph->get_node_count() - num_null_nodes + duplicated_handles.size(); +} + +id_t DozeuPinningOverlay::min_node_id() const { + return graph->min_node_id(); +} + +id_t DozeuPinningOverlay::max_node_id() const { + id_t max_id = graph->max_node_id(); + for (const handle_t& handle : duplicated_handles) { + max_id = max(max_id, get_id(get_duplicate_handle(handle))); + } + return max_id; +} + +bool DozeuPinningOverlay::is_a_duplicate_handle(const handle_t& handle) const { + return max_handle < (uint64_t) handlegraph::as_integer(handle); +} + +bool DozeuPinningOverlay::is_a_duplicate_id(const id_t& node_id) const { + return node_id > graph->max_node_id(); +} + + +handle_t DozeuPinningOverlay::get_underlying_handle(const handle_t& handle) const { + if (!is_a_duplicate_handle(handle)) { + return handle; + } + else { + return handlegraph::as_handle(uint64_t(handlegraph::as_integer(handle)) - handle_val_range); + } +} + +id_t DozeuPinningOverlay::get_underlying_id(const id_t& node_id) const { + return node_id - (graph->max_node_id() - graph->min_node_id() + 1); +} + + +handle_t DozeuPinningOverlay::get_duplicate_handle(const handle_t& handle) const { + return handlegraph::as_handle(uint64_t(handlegraph::as_integer(handle)) + handle_val_range); +} + +} diff --git a/src/dozeu_pinning_overlay.hpp b/src/dozeu_pinning_overlay.hpp new file mode 100644 index 00000000000..dd40b07d7b0 --- /dev/null +++ b/src/dozeu_pinning_overlay.hpp @@ -0,0 +1,126 @@ +#ifndef VG_DOZEU_PINNING_OVERLAY_HPP_INCLUDED +#define VG_DOZEU_PINNING_OVERLAY_HPP_INCLUDED + +/** \file + * dozeu_pinning_overlay.hpp: defines a handle graph implementation that handles nodes + * with no sequence in a way that plays well with dozeu pinned alignment + */ + +#include "handle.hpp" + +namespace vg { + +using namespace std; + +/* + * An overlay that 1) masks source/sink nodes that have no sequence + * associated with them and 2) duplicates any nodes necessary to + * ensure that their neighbors are still sourc/sink nodes. This + * transformation is invalid on graphs that have source-to-sink path + * that consists entirely of null nodes. Assumes graph is single- + * stranded. + */ +class DozeuPinningOverlay : public ExpandingOverlayGraph { +public: + + /// Initialize with the graph we want to do pinned alignment to. Boolean flag indicates + /// whether sources or sinks should be preserved (for pinning right or left respectively). 
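+    /// A minimal usage sketch (variable names here are illustrative, not part of the API):
+    ///
+    ///     DozeuPinningOverlay overlay(&graph, /*preserve_sinks=*/false);
+    ///     // ... run pinned alignment against overlay instead of graph ...
+    ///     if (overlay.performed_duplications()) {
+    ///         // IDs greater than graph.max_node_id() belong to duplicated nodes;
+    ///         // get_underlying_handle() translates their handles back to the backing graph.
+    ///     }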
+ DozeuPinningOverlay(const HandleGraph* graph, bool preserve_sinks); + + /// Default constructor -- not actually functional + DozeuPinningOverlay() = default; + + /// Default destructor + ~DozeuPinningOverlay() = default; + + /// Returns true if any node has been duplicated in the overlay, in which case it may + /// be necessary to translate between the IDs of this overlay and the underlying graph. + bool performed_duplications() const; + + ////////////////////////// + /// HandleGraph interface + ////////////////////////// + + // Method to check if a node exists by ID + bool has_node(id_t node_id) const; + + /// Look up the handle for the node with the given ID in the given orientation + handle_t get_handle(const id_t& node_id, bool is_reverse = false) const; + + /// Get the ID from a handle + id_t get_id(const handle_t& handle) const; + + /// Get the orientation of a handle + bool get_is_reverse(const handle_t& handle) const; + + /// Invert the orientation of a handle (potentially without getting its ID) + handle_t flip(const handle_t& handle) const; + + /// Get the length of a node + size_t get_length(const handle_t& handle) const; + + /// Get the sequence of a node, presented in the handle's local forward + /// orientation. + string get_sequence(const handle_t& handle) const; + + /// Loop over all the handles to next/previous (right/left) nodes. Passes + /// them to a callback which returns false to stop iterating and true to + /// continue. Returns true if we finished and false if we stopped early. + bool follow_edges_impl(const handle_t& handle, bool go_left, const function& iteratee) const; + + /// Loop over all the nodes in the graph in their local forward + /// orientations, in their internal stored order. Stop if the iteratee + /// returns false. Can be told to run in parallel, in which case stopping + /// after a false return value is on a best-effort basis and iteration + /// order is not defined. + bool for_each_handle_impl(const function& iteratee, bool parallel = false) const; + + /// Return the number of nodes in the graph. + size_t get_node_count() const; + + /// Return the smallest ID in the graph, or some smaller number if the + /// smallest ID is unavailable. Return value is unspecified if the graph is empty. + id_t min_node_id() const; + + /// Return the largest ID in the graph, or some larger number if the + /// largest ID is unavailable. Return value is unspecified if the graph is empty. + id_t max_node_id() const; + + /////////////////////////////////// + /// ExpandingOverlayGraph interface + /////////////////////////////////// + + /// Returns the handle in the underlying graph that corresponds to a handle in the + /// overlay + handle_t get_underlying_handle(const handle_t& handle) const; + +private: + + bool is_a_duplicate_handle(const handle_t& handle) const; + + bool is_a_duplicate_id(const id_t& node_id) const; + + id_t get_underlying_id(const id_t& node_id) const; + + handle_t get_duplicate_handle(const handle_t& handle) const; + + /// The graph we're masking empty nodes in + const HandleGraph* graph = nullptr; + + /// Are we duplicating to preserve sources or sinks? 
+ bool preserve_sinks; + + /// The total number of null nodes + size_t num_null_nodes = 0; + + /// The minimum numeric value of a handle + uint64_t max_handle = numeric_limits::min(); + + /// The length of the interval between the max and min numeric values of handles + uint64_t handle_val_range = 0; + + unordered_set duplicated_handles; +}; +} + +#endif // VG_DOZEU_PINNING_OVERLAY_HPP_INCLUDED diff --git a/src/edit.cpp b/src/edit.cpp deleted file mode 100644 index a30f979cdb7..00000000000 --- a/src/edit.cpp +++ /dev/null @@ -1,106 +0,0 @@ -#include "edit.hpp" -#include "utility.hpp" - -namespace vg { - -bool edit_is_match(const Edit& e) { - return e.from_length() == e.to_length() && e.sequence().empty(); -} - -bool edit_is_sub(const Edit& e) { - return e.from_length() == e.to_length() && !e.sequence().empty(); -} - -bool edit_is_insertion(const Edit& e) { - return e.from_length() == 0 && e.to_length() > 0 && !e.sequence().empty(); -} - -bool edit_is_deletion(const Edit& e) { - return e.from_length() > 0 && e.to_length() == 0; -} - -bool edit_is_empty(const Edit& e) { - return e.to_length() == 0 && e.from_length() == 0 && e.sequence().empty(); -} - -pair cut_edit_at_to(const Edit& e, size_t to_off) { - Edit left, right; - if (to_off > e.to_length()) { - return make_pair(e, right); - } - // to-length of left portion - size_t l = e.to_length() - to_off; - // to-length of right portion - size_t r = e.to_length() - l; - if (l > e.to_length()) { - left = e; - } else if (edit_is_match(e)) { - left.set_from_length(l); - left.set_to_length(l); - right.set_from_length(r); - right.set_to_length(r); - } else if (edit_is_sub(e)) { - left.set_from_length(l); - left.set_to_length(l); - left.set_sequence(e.sequence().substr(0, l)); - right.set_from_length(r); - right.set_to_length(r); - right.set_sequence(e.sequence().substr(l)); - } else if (edit_is_insertion(e)) { - left.set_to_length(l); - left.set_sequence(e.sequence().substr(0, l)); - right.set_to_length(r); - right.set_sequence(e.sequence().substr(l)); - } else if (edit_is_deletion(e)) { - left = e; - } - return make_pair(left, right); -} - -pair cut_edit_at_from(const Edit& e, size_t from_off) { - Edit left, right; - if (from_off > e.from_length()) { - return make_pair(e, right); - } - // from-length of left portion - size_t l = e.from_length() - from_off; - // from-length of right portion - size_t r = e.from_length() - l; - if (edit_is_match(e)) { - left.set_from_length(l); - left.set_to_length(l); - right.set_from_length(r); - right.set_to_length(r); - } else if (edit_is_sub(e)) { - left.set_from_length(l); - left.set_to_length(l); - left.set_sequence(e.sequence().substr(0, l)); - right.set_from_length(r); - right.set_to_length(r); - right.set_sequence(e.sequence().substr(l)); - } else if (edit_is_insertion(e)) { - left = e; - } else if (edit_is_deletion(e)) { - left.set_from_length(l); - right.set_from_length(r); - } - return make_pair(left, right); -} - -Edit reverse_complement_edit(const Edit& e) { - // Make a reversed copy - Edit reversed = e; - - // All we have to do is flip the sequence - reversed.set_sequence(reverse_complement(e.sequence())); - - return reversed; -} - -bool operator==(const Edit& e1, const Edit& e2) { - return (e1.to_length() == e2.to_length()) - && (e1.from_length() == e2.from_length()) - && (e1.sequence() == e2.sequence()); -} - -} diff --git a/src/edit.hpp b/src/edit.hpp deleted file mode 100644 index e2e45406fc0..00000000000 --- a/src/edit.hpp +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef VG_EDIT_HPP_INCLUDED -#define 
VG_EDIT_HPP_INCLUDED - -#include "vg.pb.h" -#include -#include // for debugging -#include "json2pb.h" - -namespace vg { - -using namespace std; - -bool edit_is_match(const Edit& e); -bool edit_is_sub(const Edit& e); -bool edit_is_insertion(const Edit& e); -bool edit_is_deletion(const Edit& e); -bool edit_is_empty(const Edit& e); -pair cut_edit_at_to(const Edit& e, size_t to_off); -pair cut_edit_at_from(const Edit& e, size_t from_off); -// Reverse an edit and reverse complement any embedded sequence -Edit reverse_complement_edit(const Edit& e); -bool operator==(const Edit& e1, const Edit& e2); - -} - -#endif diff --git a/src/endianness.hpp b/src/endianness.hpp new file mode 100644 index 00000000000..ec676e48dae --- /dev/null +++ b/src/endianness.hpp @@ -0,0 +1,89 @@ +#ifndef VG_ENDIANNESS_HPP_INCLUDED +#define VG_ENDIANNESS_HPP_INCLUDED + +/** \file endianness.hpp + * Methods for converting endianness in integers + */ + +#include +#include + +namespace vg { + + /** + * A struct namespace for methods to handle endianness in integer values. + */ + template + struct endianness { + public: + + /// Converts from an integer in the native representation in the machine + /// architecture to a big-endian representation + static inline IntType to_big_endian(IntType value); + + /// Converts from a big-endian integer to the native representation in + /// the machine architecture + static inline IntType from_big_endian(IntType value); + + private: + + /// Returns the integer in the opposite endianness representation it currently + /// has + static inline IntType swap_endianness(IntType value); + + /// Returns true if the architecture is big-endian, otherwise false + static inline bool arch_is_big_endian(); + + static_assert(std::is_integral::value, + "Endianness only applies to integer data types"); + }; + + //////////////////////////// + /// Template implementations + //////////////////////////// + + + template + inline IntType endianness::to_big_endian(IntType value) { + return arch_is_big_endian() ? value : swap_endianness(value); + } + + template + inline IntType endianness::from_big_endian(IntType value) { + // these turn out to be identical functions, but having both aliases still + // seems cognitively useful + return to_big_endian(value); + } + + template + inline IntType endianness::swap_endianness(IntType value) { + + IntType swapped; + + uint8_t* from = (uint8_t*) &value; + uint8_t* to = (uint8_t*) &swapped; + + for (int i = 0; i < sizeof(IntType); ++i) { + to[i] = from[sizeof(IntType) - i - 1]; + } + + return swapped; + } + + // TODO: this method will not detect endianness correctly on 1-byte + // integers, but endianness is irrelevant for them anyway... + template + inline bool endianness::arch_is_big_endian() { + + // mark volatile so the compiler won't optimize it away + volatile IntType val = 1; + + uint8_t* bytes = (uint8_t*) &val; + + // the 1 is only set at the lowest memory address if the architecture + // is little-endian + return !bytes[0]; + } +} + +#endif diff --git a/src/explainer.cpp b/src/explainer.cpp new file mode 100644 index 00000000000..cddd68bf55c --- /dev/null +++ b/src/explainer.cpp @@ -0,0 +1,373 @@ +/** + * \file + * Implementations for algorithm explanation utilities. + */ + +#include "explainer.hpp" + +#include + +#include + +namespace vg { + +std::atomic Explainer::next_explanation_number {0}; + +bool Explainer::save_explanations = false; + +Explainer::Explainer() : explanation_number(Explainer::next_explanation_number++) { + // Nothing to do! 
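+    // Every concrete explainer is gated on the static save_explanations flag and gets its
+    // own explanation_number, so output files from different objects never collide. As an
+    // illustration only (the variable names are placeholders), a debugging session might
+    // drive the JSON dumper like this:
+    //
+    //     vg::Explainer::save_explanations = true;
+    //     vg::ProblemDumpExplainer dump("my_problem");   // writes my_problem<N>.json
+    //     dump.object_start();
+    //     dump.key("graph");
+    //     dump.value(my_graph);                          // any HandleGraph
+    //     dump.object_end();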
+} + +Explainer::~Explainer() { + // Nothing to do! +} + +ProblemDumpExplainer::ProblemDumpExplainer(const std::string& name) : Explainer() { + if (!Explainer::save_explanations) { + return; + } + out.open(name + std::to_string(explanation_number) + ".json"); +} + +ProblemDumpExplainer::~ProblemDumpExplainer() { + // Nothing to do! +} + +void ProblemDumpExplainer::object_start() { + if (!Explainer::save_explanations) { + return; + } + comma(); + out << "{"; +} + +void ProblemDumpExplainer::object_end() { + if (!Explainer::save_explanations) { + return; + } + out << "}"; + need_comma = true; +} + +void ProblemDumpExplainer::array_start() { + if (!Explainer::save_explanations) { + return; + } + comma(); + out << "["; +} + +void ProblemDumpExplainer::array_end() { + if (!Explainer::save_explanations) { + return; + } + out << "]"; + need_comma = true; +} + +void ProblemDumpExplainer::key(const std::string& k) { + if (!Explainer::save_explanations) { + return; + } + comma(); + out << "\"" << k << "\":"; +} + +void ProblemDumpExplainer::value(const std::string& v) { + if (!Explainer::save_explanations) { + return; + } + comma(); + out << "\"" << v << "\""; + need_comma = true; +} + +void ProblemDumpExplainer::value(double v) { + if (!Explainer::save_explanations) { + return; + } + comma(); + out << v; + need_comma = true; +} + +void ProblemDumpExplainer::value(size_t v) { + if (!Explainer::save_explanations) { + return; + } + comma(); + out << "\"" << v << "\""; + need_comma = true; +} + +void ProblemDumpExplainer::value(int v) { + if (!Explainer::save_explanations) { + return; + } + comma(); + out << "\"" << v << "\""; + need_comma = true; +} + +void ProblemDumpExplainer::value(bool v) { + if (!Explainer::save_explanations) { + return; + } + comma(); + out << (v ? "true" : "false"); + need_comma = true; +} + +void ProblemDumpExplainer::value(vg::id_t v) { + if (!Explainer::save_explanations) { + return; + } + comma(); + out << "\"" << v << "\""; + need_comma = true; +} + +void ProblemDumpExplainer::value(const pos_t& v) { + if (!Explainer::save_explanations) { + return; + } + object_start(); + key("node_id"); + value(id(v)); + if (offset(v) != 0) { + key("offset"); + value(offset(v)); + } + if (is_rev(v)) { + key("is_reverse"); + value(true); + } + object_end(); +} + +void ProblemDumpExplainer::value(const HandleGraph& v) { + if (!Explainer::save_explanations) { + return; + } + object_start(); + key("node"); + array_start(); + v.for_each_handle([&](const handle_t& h) { + // Put all the nodes in the node array + object_start(); + key("id"); + value(v.get_id(h)); + key("sequence"); + value(v.get_sequence(h)); + object_end(); + }); + array_end(); + key("edge"); + array_start(); + v.for_each_edge([&](const edge_t& e) { + // Put all the edges in the edge array + object_start(); + key("from"); + value(v.get_id(e.first)); + if (v.get_is_reverse(e.first)) { + key("from_start"); + value(true); + } + key("to"); + value(v.get_id(e.second)); + if (v.get_is_reverse(e.second)) { + key("to_end"); + value(true); + } + object_end(); + }); + array_end(); + object_end(); +} + +void ProblemDumpExplainer::value(const handle_t& v, const HandleGraph& context) { + if (!Explainer::save_explanations) { + return; + } + // Implement via pos_t serialization. + this->value(make_pos_t(context.get_id(v), context.get_is_reverse(v), 0)); +} + +const size_t DiagramExplainer::MAX_DISPLAYED_SUGGESTIONS_PER_CATEGORY {5}; + +DiagramExplainer::DiagramExplainer() : Explainer() { + // Nothing to do! 
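+    // Building a diagram is purely additive; the GraphViz output (one graph<N>-<M>.dot
+    // file per connected component) is only written when the explainer is destroyed. A
+    // sketch of typical use, with illustrative IDs and annotations:
+    //
+    //     DiagramExplainer diagram;
+    //     diagram.add_globals({{"rankdir", "LR"}});
+    //     diagram.ensure_node("read", {{"label", "read 1"}});
+    //     diagram.ensure_node("node5", {{"label", "graph node 5"}});
+    //     diagram.add_edge("read", "node5", {{"label", "seed"}});
+    //     diagram.suggest_edge("read", "node5", "alternatives", 0.5);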
+} + +DiagramExplainer::~DiagramExplainer() { + if (!Explainer::save_explanations) { + return; + } + write_connected_components(); +} + +void DiagramExplainer::add_globals(const annotation_t& annotations) { + if (!Explainer::save_explanations) { + return; + } + std::copy(annotations.begin(), annotations.end(), std::back_inserter(globals)); +} + +void DiagramExplainer::add_node(const std::string& id, const annotation_t& annotations) { + if (!Explainer::save_explanations) { + return; + } + nodes.emplace(id, annotations); +} + +void DiagramExplainer::ensure_node(const std::string& id, const annotation_t& annotations) { + if (!Explainer::save_explanations) { + return; + } + auto found = nodes.find(id); + if (found == nodes.end()) { + nodes.emplace_hint(found, id, annotations); + } +} + +void DiagramExplainer::add_edge(const std::string& a_id, const std::string& b_id, const annotation_t& annotations) { + if (!Explainer::save_explanations) { + return; + } + edges.emplace(std::make_pair(a_id, b_id), annotations); +} + +void DiagramExplainer::ensure_edge(const std::string& a_id, const std::string& b_id, const annotation_t& annotations) { + if (!Explainer::save_explanations) { + return; + } + auto key = std::make_pair(a_id, b_id); + auto found = edges.find(key); + if (found == edges.end()) { + edges.emplace_hint(found, std::move(key), annotations); + } +} + +void DiagramExplainer::suggest_edge(const std::string& a_id, const std::string& b_id, const std::string& category, double importance, const annotation_t& annotations) { + if (!Explainer::save_explanations) { + return; + } + + // Find the heap it goes in + auto& heap = suggested_edges[category]; + + // And make a comparator + std::greater comp; + + // Put the suggestion in + heap.emplace_back(importance, std::make_tuple(a_id, b_id, annotations)); + std::push_heap(heap.begin(), heap.end(), comp); + while (heap.size() > DiagramExplainer::MAX_DISPLAYED_SUGGESTIONS_PER_CATEGORY) { + // Clamp to size limit + std::pop_heap(heap.begin(), heap.end(), comp); + heap.pop_back(); + } +} + +void DiagramExplainer::for_each_edge(const std::function& iteratee) const { + for (auto& kv : edges) { + // Do all the required edges + iteratee(edge_ref_t(kv.first.first, kv.first.second, kv.second)); + } + + for (auto& kv : suggested_edges) { + for (auto& suggestion : kv.second) { + const stored_edge_t& edge = suggestion.second; + // Do all the surviving suggested edges + iteratee(edge_ref_t(std::get<0>(edge), std::get<1>(edge), std::get<2>(edge))); + } + } +} + +void DiagramExplainer::write_annotations(std::ostream& out, const annotation_t& annotations) const { + if (!annotations.empty()) { + out << " ["; + for (auto& annotation : annotations) { + // Add all the annotations to the thing, in the brackets that we need + out << annotation.first << "=\"" << annotation.second << "\", "; + } + out << "]"; + } +} + +void DiagramExplainer::write_node(std::ostream& out, const std::string& id, const annotation_t& annotations) const { + out << id; + write_annotations(out, annotations); + out << ";" << std::endl; +} + +void DiagramExplainer::write_edge(std::ostream& out, const std::string& a_id, const std::string& b_id, const annotation_t& annotations) const { + out << a_id << " -> " << b_id; + write_annotations(out, annotations); + out << ";" << std::endl; +} + +void DiagramExplainer::write_globals(std::ostream& out, const annotation_t& annotations) const { + for (auto& kv : annotations) { + // Add all the globals, which have a different syntax than item annotations. 
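+        // e.g. the global {"rankdir", "LR"} is emitted as the bare statement rankdir="LR";
+        // whereas node and edge annotations are emitted as bracketed attribute lists.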
+ out << kv.first << "=\"" << kv.second << "\";" << std::endl; + } +} + +void DiagramExplainer::write_connected_components() const { + // Number all the nodes + std::vector node_order; + // TODO: Use a symbol-registering widget so we don't need to keep both these tables. + std::unordered_map id_to_index; + node_order.reserve(nodes.size()); + for (auto it = nodes.begin(); it != nodes.end(); ++it) { + id_to_index.emplace(it->first, node_order.size()); + node_order.push_back(it); + } + + // Compose connected components + structures::UnionFind components(node_order.size()); + + for_each_edge([&](const edge_ref_t& edge) { + // Connect connected components for each edge + components.union_groups(id_to_index.at(std::get<0>(edge)), id_to_index.at(std::get<1>(edge))); + }); + + std::unordered_map files_by_group; + for (size_t i = 0; i < node_order.size(); i++) { + // For each node + + // Make sure we have a file for the connected component it goes in + size_t group = components.find_group(i); + auto file_it = files_by_group.find(group); + if (file_it == files_by_group.end()) { + // We need to open and set up a new file + std::stringstream name_stream; + name_stream << "graph" << explanation_number << "-" << files_by_group.size() << ".dot"; + file_it = files_by_group.emplace_hint(file_it, group, name_stream.str()); + + // Start off with the heading + file_it->second << "digraph explanation {" << std::endl; + // And any globals + write_globals(file_it->second, globals); + } + + // Write the node + write_node(file_it->second, node_order[i]->first, node_order[i]->second); + } + + for_each_edge([&](const edge_ref_t& edge) { + // Add each edge to the file for its group + size_t group = components.find_group(id_to_index.at(std::get<0>(edge))); + write_edge(files_by_group.at(group), std::get<0>(edge), std::get<1>(edge), std::get<2>(edge)); + }); + + for (auto& kv : files_by_group) { + // Close out all the files + kv.second << "}" << std::endl; + kv.second.close(); + } +} + +} diff --git a/src/explainer.hpp b/src/explainer.hpp new file mode 100644 index 00000000000..0de6b03fb83 --- /dev/null +++ b/src/explainer.hpp @@ -0,0 +1,217 @@ +#ifndef VG_EXPLAINER_HPP_INCLUDED +#define VG_EXPLAINER_HPP_INCLUDED + +/** + * \file + * Contains utility classes for producing algorithms which can explain + * themselves and capture monitoring statistics in an efficient way. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// For pair hash overload +#include "hash_map.hpp" + +#include "types.hpp" +#include "handle.hpp" + +namespace vg { + +/** + * Base explainer class. Handles making sure each explanation has a different unique number. + */ +class Explainer { +public: + /// Determine if explanations should be generated. + static bool save_explanations; + + /// Construct an Explainer that will save to one or more files + Explainer(); + + /// Close out the files being explained to + virtual ~Explainer(); + +protected: + /// What number explanation are we? Distinguishes different objects. + size_t explanation_number; + + /// Counter used to give different explanations their own unique filenames. + static std::atomic next_explanation_number; +}; + +/** + * Widget to serialize somewhat structured logs. + */ +class ProblemDumpExplainer : public Explainer { +public: + /// Construct a ProblemDumpExplainer that will save a dump of a problem to a file. 
+ ProblemDumpExplainer(const std::string& name = "problem"); + /// Close out the file being explained to + ~ProblemDumpExplainer(); + + // We think in JSON, but with support for vg types. + + /// Begin an object in a value context. + void object_start(); + /// End an object after its last value. + void object_end(); + /// Begin an array in a value context. + void array_start(); + /// End an array after its last value. + void array_end(); + + /// Put the key for a value, inside an object + void key(const std::string& k); + + /// Put a value after a key or in an array. + /// Assumes the string is pre-escaped. + void value(const std::string& v); + /// Put a value after a key or in an array. + void value(double v); + /// Put a value after a key or in an array. + void value(size_t v); + /// Put a value after a key or in an array. + void value(int v); + /// Put a value after a key or in an array. + void value(bool v); + /// Put a value after a key or in an array. + void value(vg::id_t v); + /// Put a value after a key or in an array. + /// Represents the position as a vg Protobuf Position. + void value(const pos_t& v); + /// Put a value after a key or in an array. + /// Represents the graph as a single chunk vg Protobuf Graph. + void value(const HandleGraph& v); + /// Put a value after a key or in an array. + /// Represents a handle as a vg Protobuf Position. + void value(const handle_t& v, const HandleGraph& context); + +protected: + /// Stream being written to + ofstream out; + /// Whether we need a comma before the next key or value. + bool need_comma = false; + + /// Write a separating comma if needed. + inline void comma() { + if (need_comma) { + out << ","; + need_comma = false; + } + } +}; + +/** + * Widget to log statistics to a GraphViz file. + */ +class DiagramExplainer : public Explainer { +public: + // We define a type for miscelaneous annotations, since we don't have kwargs + using annotation_t = std::vector>; + + /// Construct a DiagramExplainer that will save a diagram to one or more files. + DiagramExplainer(); + /// Close out the files being explained to + ~DiagramExplainer(); + + /// Add global annotations (like rankdir) + void add_globals(const annotation_t& annotations); + + /// Add a node. Optionally give it the given annotation, which must be pre-escaped. + /// The node is assumed not to exist already + void add_node(const std::string& id, const annotation_t& annotations = {}); + + /// Add a node. Optionally give it the given annotation, which must be pre-escaped. + /// Deduplicates multiple calls with the same ID. + void ensure_node(const std::string& id, const annotation_t& annotations = {}); + + /// Add an edge. Optionally give it the given annotation, which must be + /// pre-escaped. The edge is assumed not to exist already. + void add_edge(const std::string& a_id, const std::string& b_id, const annotation_t& annotations = {}); + + /// Add an edge. Optionally give it the given annotation, which must be + /// pre-escaped. Deduplicates multiple calls with the same IDs in the same order. + void ensure_edge(const std::string& a_id, const std::string& b_id, const annotation_t& annotations = {}); + + /// Add an optional edge. Optionally give it the given annotation, which must be pre-escaped. 
+ /// Only the k most important edges in each category will actually render + void suggest_edge(const std::string& a_id, const std::string& b_id, const std::string& category, double importance, const annotation_t& annotations = {}); + +protected: + /// Collection of all global diagram key-value pairs (like rankdir) + annotation_t globals; + + /// Collection of all nodes, by ID + std::unordered_map nodes; + + /// We will need to store edges + using stored_edge_t = std::tuple; + /// And show them to people + using edge_ref_t = std::tuple; + + /// Collection of all required edges + std::unordered_map, annotation_t> edges; + + using suggested_edge_t = std::pair; + + /// Top k most important edges for each suggested edge category + std::unordered_map> suggested_edges; + + /// Limit on suggested edges + static const size_t MAX_DISPLAYED_SUGGESTIONS_PER_CATEGORY; + + /// Loop over all the edges, across all kinds of storage + void for_each_edge(const std::function& iteratee) const; + + /// Save the annotations for a node or edge, if any. + void write_annotations(std::ostream& out, const annotation_t& annotations) const; + + /// Write out a node + void write_node(std::ostream& out, const std::string& id, const annotation_t& annotations) const; + + /// Write out an edge + void write_edge(std::ostream& out, const std::string& a_id, const std::string& b_id, const annotation_t& annotations) const; + + /// Write out globals + void write_globals(std::ostream& out, const annotation_t& annotations) const; + + /// Write each connected component to a different file + void write_connected_components() const; + +}; + +/** + * Explainer that can dump anything that has a: + * void to_dot(ostream& out) const; + * method, such as a Funnel. + */ +template +class DotDumpExplainer : public Explainer { +public: + /// Construct a DotDumpExplainer that will save a diagram to a file + DotDumpExplainer(const T& to_dump); +}; + +template +DotDumpExplainer::DotDumpExplainer(const T& to_dump) : Explainer() { + if (!Explainer::save_explanations) { + return; + } + // Open the dot file + std::ofstream out("dotdump" + std::to_string(explanation_number) + ".dot"); + // And dump to it + to_dump.to_dot(out); +} + + +} + +#endif diff --git a/src/extra_node_graph.cpp b/src/extra_node_graph.cpp new file mode 100644 index 00000000000..615933f17e3 --- /dev/null +++ b/src/extra_node_graph.cpp @@ -0,0 +1,231 @@ +#include "extra_node_graph.hpp" +#include "utility.hpp" + +#include + +//#define debug + +namespace vg { + +using namespace std; +using namespace handlegraph; + +ExtraNodeGraph::ExtraNodeGraph( + const HandleGraph* backing, + const string& sequence, + const vector& edges_in, + const vector& edges_out) : + backing(backing), + in_from(edges_in.begin(), edges_in.end()), + out_to(edges_out.begin(), edges_out.end()), + added_id(backing->max_node_id() + 1), + sequence(sequence) { + + // Nothing to do! +} + +handle_t ExtraNodeGraph::get_created_handle() const { + return added_fwd; +} + +bool ExtraNodeGraph::has_node(id_t node_id) const { + return node_id == added_id || backing->has_node(node_id); +} + +handle_t ExtraNodeGraph::get_handle(const id_t& node_id, bool is_reverse) const { + if (node_id == added_id) { + // They asked for the added node + return is_reverse ? 
added_rev : added_fwd; + } else { + // Otherwise they asked for something in the backing graph + handle_t backing_handle = backing->get_handle(node_id, is_reverse); + + // Budge up to make room for the added node in each orientation + return from_backing(backing_handle); + } +} + +id_t ExtraNodeGraph::get_id(const handle_t& handle) const { + if (handle == added_fwd || handle == added_rev) { + return added_id; + } else { + return backing->get_id(to_backing(handle)); + } +} + +bool ExtraNodeGraph::get_is_reverse(const handle_t& handle) const { + if (handle == added_fwd) { + return false; + } else if (handle == added_rev) { + return true; + } else { + return backing->get_is_reverse(to_backing(handle)); + } +} + +handle_t ExtraNodeGraph::flip(const handle_t& handle) const { + if (is_ours(handle)) { + // In our block of handles, orientation is the low bit + return as_handle(as_integer(handle) ^ 1); + } else { + // Make the backing graph flip it + return from_backing(backing->flip(to_backing(handle))); + } +} + +size_t ExtraNodeGraph::get_length(const handle_t& handle) const { + if (is_ours(handle)) { + // Our node is the same length in any orientation + return sequence.length(); + } else { + return backing->get_length(to_backing(handle)); + } +} + +string ExtraNodeGraph::get_sequence(const handle_t& handle) const { + if (handle == added_fwd) { + return sequence; + } else if (handle == added_rev) { + return reverse_complement(sequence); + } else { + assert(!is_ours(handle)); + return backing->get_sequence(to_backing(handle)); + } +} + +bool ExtraNodeGraph::follow_edges_impl(const handle_t& handle, bool go_left, const function& iteratee) const { + if (is_ours(handle)) { + // This is some orientation of our added node + + if (handle == added_fwd) { + // We are in the forward orientation + if (go_left) { + // We are going left, so get everywhere we come from + for (auto& prev : in_from) { + if (!iteratee(from_backing(prev))) { + return false; + } + } + } else { + // We are going right, so get everything we go to + for (auto& next : out_to) { + if (!iteratee(from_backing(next))) { + return false; + } + } + } + } else if (handle == added_rev) { + // We are going in the reverse orientation + if (go_left) { + // We actually want where we go to, backward + for (auto& next : out_to) { + if (!iteratee(flip(from_backing(next)))) { + return false; + } + } + } else { + // We actually want where we come from, backward + for (auto& prev : in_from) { + if (!iteratee(flip(from_backing(prev)))) { + return false; + } + } + } + } + + return true; + } else { + // The handle refers to a node in the backing graph + auto backing_handle = to_backing(handle); + + // If we get through those, do the actual edges in the backing graph + bool keep_going = backing->follow_edges(backing_handle, go_left, [&](const handle_t& found) -> bool { + return iteratee(from_backing(found)); + }); + + // Also handle edges to/from our added node. 
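+        // The backing graph knows nothing about the added node, so those edges are
+        // synthesized here: if this backing handle appears in in_from or out_to (in the
+        // orientation that matches the direction of travel), we also visit the added
+        // node, forward or reverse as appropriate.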
+ + if (keep_going && + ((go_left && out_to.count(backing_handle)) || + (!go_left && in_from.count(backing_handle)))) { + // Visit it forward + keep_going &= iteratee(added_fwd); + } + + if (keep_going && + ((go_left && out_to.count(backing->flip(backing_handle))) || + (!go_left && in_from.count(backing->flip(backing_handle))))) { + // Visit it reverse + keep_going &= iteratee(added_rev); + } + + return keep_going; + + } +} + +bool ExtraNodeGraph::for_each_handle_impl(const function& iteratee, bool parallel) const { + + // First do the node we added + if (!iteratee(added_fwd)) { + return false; + } + +#ifdef debug + cerr << "Try backing graph " << (parallel ? "in parallel" : "") << endl; +#endif + return backing->for_each_handle([&](const handle_t& backing_handle) -> bool { + // Now do each backing node, possibly in parallel. +#ifdef debug + cerr << "Invoke iteratee on " << backing->get_id(backing_handle) << endl; +#endif + return iteratee(from_backing(backing_handle)); + }, parallel); +} + +size_t ExtraNodeGraph::get_node_count() const { + return backing->get_node_count() + 1; +} + +id_t ExtraNodeGraph::min_node_id() const { + return min(backing->min_node_id(), added_id); +} + +id_t ExtraNodeGraph::max_node_id() const { + return max(backing->max_node_id(), added_id); +} + +size_t ExtraNodeGraph::get_degree(const handle_t& handle, bool go_left) const { + if (is_ours(handle)) { + if ((handle == added_fwd && !go_left) || (handle == added_rev && go_left)) { + // Edges out of the added node + return out_to.size(); + } else if ((handle == added_fwd && go_left) || (handle == added_rev && !go_left)) { + // Edges into the added node + return in_from.size(); + } + } else { + // We need to find the backing graph degree and possibly adjust it if this is a head or tail + handle_t backing_handle = to_backing(handle); + + size_t degree = backing->get_degree(backing_handle, go_left); + + if ((go_left && out_to.count(backing_handle)) || (!go_left && in_from.count(backing_handle))) { + // Forward version of this handle connects to the added node on this side. + degree++; + } + + if ((go_left && in_from.count(backing->flip(backing_handle))) || (!go_left && out_to.count(backing->flip(backing_handle)))) { + // Reverse version of this handle connects to added node on this side. + degree++; + } + + return degree; + } + + // We must return from one of the other branches + throw runtime_error("Did not hit a return statement that should have been hit"); +} + + +} diff --git a/src/extra_node_graph.hpp b/src/extra_node_graph.hpp new file mode 100644 index 00000000000..30d01cb140d --- /dev/null +++ b/src/extra_node_graph.hpp @@ -0,0 +1,137 @@ +#ifndef VG_EXTRA_NODE_GRAPH_HPP_INCLUDED +#define VG_EXTRA_NODE_GRAPH_HPP_INCLUDED + +/** + * \file extra_node_graph.hpp + * + * Provides a HandleGraph implementation that can add an extra node and edges + * to/from it on top of a backing HandleGraph. + * + */ + + +#include "handle.hpp" + +#include + + +namespace vg { + +using namespace handlegraph; + +/** + * Present a HandleGraph that is a backing HandleGraph with an additional node + * and edges to/from it added. + */ +class ExtraNodeGraph : public HandleGraph { + +public: + /** + * Make a new ExtraNodeGraph. The backing graph must not be modified + * while the overlay exists. + * + * Creates a new handle with the given sequence, with edges from all the + * edges_in handles to it, and edges from it to all the edges_out handles. + * + * Self loops on the new node are not supported. 
+ */ + ExtraNodeGraph(const HandleGraph* backing, const string& sequence, const vector& edges_in, const vector& edges_out); + + /// Expose the handle to the new extra node + handle_t get_created_handle() const; + + //////////////////////////////////////////////////////////////////////////// + // Handle-based interface + //////////////////////////////////////////////////////////////////////////// + + /// Check if a node exists by ID + virtual bool has_node(id_t node_id) const; + + /// Look up the handle for the node with the given ID in the given orientation + virtual handle_t get_handle(const id_t& node_id, bool is_reverse) const; + + /// Get the ID from a handle + virtual id_t get_id(const handle_t& handle) const; + + /// Get the orientation of a handle + virtual bool get_is_reverse(const handle_t& handle) const; + + /// Invert the orientation of a handle (potentially without getting its ID) + virtual handle_t flip(const handle_t& handle) const; + + /// Get the length of a node + virtual size_t get_length(const handle_t& handle) const; + + /// Get the sequence of a node, presented in the handle's local forward + /// orientation. + virtual string get_sequence(const handle_t& handle) const; + + /// Loop over all the handles to next/previous (right/left) nodes. Passes + /// them to a callback which returns false to stop iterating and true to + /// continue. Returns true if we finished and false if we stopped early. + virtual bool follow_edges_impl(const handle_t& handle, bool go_left, const function& iteratee) const; + + /// Loop over all the nodes in the graph in their local forward + /// orientations, in their internal stored order. Stop if the iteratee returns false. + virtual bool for_each_handle_impl(const function& iteratee, bool parallel = false) const; + + /// Return the number of nodes in the graph + virtual size_t get_node_count() const; + + /// Return the smallest ID in the graph. + virtual id_t min_node_id() const; + + /// Return the largest ID in the graph. + virtual id_t max_node_id() const; + + /// Compute the degree of one side of a handle in O(1) time, if the backing + /// graph also provides this facility in O(1) time. Takes O(n) time + /// otherwise in the returned degree. + virtual size_t get_degree(const handle_t& handle, bool go_left) const; + + //////////////////////////////////////////////////////////////////////////// + // (Future) Overlay Interface + //////////////////////////////////////////////////////////////////////////// + + /// Convert a backing graph handle to our handle to the same node + inline handle_t from_backing(const handle_t& backing_handle) const { + return as_handle(as_integer(backing_handle) + 2); + } + +protected: + + // TODO: a lot of this code can be unified with SourceSinkOverlay + + /// What backing graph do we overlay? + const HandleGraph* backing; + + /// What are the handles in the backing graph that read into our new node forward? + unordered_set in_from; + /// And where do we go after our new node forward? + unordered_set out_to; + + /// What is our projected node ID? + id_t added_id; + // And sequence? + string sequence; + + // We reserve the 2 low numbers of the handles for our new node, and shift everything else up. 
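For orientation, a self-contained sketch of what this numbering scheme amounts to, written with plain integers in place of handle_t (which is just a bit-cast integer in libhandlegraph); the concrete values are invented:

```
#include <cassert>
#include <cstdint>

int main() {
    const uint64_t added_fwd = 0, added_rev = 1; // the two reserved orientations of the extra node
    uint64_t backing = 7;                        // some handle number from the backing graph
    uint64_t ours = backing + 2;                 // from_backing(): shift past the reserved pair
    assert(ours >= 2);                           // so is_ours() stays false for backing nodes
    assert(ours - 2 == backing);                 // to_backing() inverts the shift
    assert((added_fwd ^ 1) == added_rev);        // flip() on the extra node toggles the low bit
    assert((added_rev ^ 1) == added_fwd);
    return 0;
}
```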
+ const handle_t added_fwd = as_handle(0); + const handle_t added_rev = as_handle(1); + + /// Convert our handle to a backing graph node into a backing graph handle to the same node + inline handle_t to_backing(const handle_t& our_handle) const { + return as_handle(as_integer(our_handle) - 2); + } + + /// Determine if a handle points to an overlay-added node or not + inline bool is_ours(const handle_t& our_handle) const { + return ((uint64_t) as_integer(our_handle)) < 2; + } + +}; + + +} + +#endif diff --git a/src/filter.cpp b/src/filter.cpp index 4d6bc25380a..7a194db1107 100644 --- a/src/filter.cpp +++ b/src/filter.cpp @@ -309,8 +309,8 @@ namespace vg{ my_vg = vg; } - void Filter::set_my_xg_idx(xg::XG* idx){ - my_xg_index = idx; + void Filter::set_my_path_position_graph(PathPositionHandleGraph* graph){ + my_path_position_graph = graph; } void Filter::set_inverse(bool do_inv){ @@ -318,12 +318,12 @@ namespace vg{ } void Filter::init_mapper(){ - if (my_xg_index == NULL || gcsa_ind == NULL || lcp_ind == NULL){ + if (my_path_position_graph == NULL || gcsa_ind == NULL || lcp_ind == NULL){ cerr << "Must provide an xg and gcsa to inititiate mapper for split read mapping." << endl; exit(1); } - my_mapper = new Mapper(my_xg_index, gcsa_ind, lcp_ind); + my_mapper = new Mapper(my_path_position_graph, gcsa_ind, lcp_ind); } bool Filter::perfect_filter(Alignment& aln){ @@ -678,24 +678,38 @@ namespace vg{ * Inverse behavior: if the Alignment is path divergent, return aln, else return an empty Alignment */ Alignment Filter::path_divergence_filter(Alignment& aln){ - Path path = aln.path(); - for (int i = 1; i < path.mapping_size(); i++){ - Mapping mapping = path.mapping(i); - Position pos = mapping.position(); - id_t current_node = pos.node_id(); - id_t prev_node = path.mapping(i - 1).position().node_id(); - bool paths_match = false; - vector paths_of_prev = my_xg_index->paths_of_node(prev_node); - for (int i = 0; i < paths_of_prev.size(); i++){ - string p_name = my_xg_index->path_name(paths_of_prev[i]); - if (my_xg_index->path_contains_node(p_name, current_node)){ - paths_match = true; + const Path& path = aln.path(); + if (path.mapping_size() > 0) { + + handle_t first_handle = my_path_position_graph->get_handle(path.mapping(0).position().node_id()); + unordered_set prev_node_paths; + my_path_position_graph->for_each_step_on_handle(first_handle, [&](const step_handle_t& step) { + prev_node_paths.insert(my_path_position_graph->get_path_handle_of_step(step)); + }); + + for (int i = 1; i < path.mapping_size(); i++){ + + handle_t handle = my_path_position_graph->get_handle(path.mapping(i).position().node_id()); + unordered_set curr_node_paths; + my_path_position_graph->for_each_step_on_handle(handle, [&](const step_handle_t& step) { + curr_node_paths.insert(my_path_position_graph->get_path_handle_of_step(step)); + }); + + bool paths_match = false; + + for (const path_handle_t& path : curr_node_paths) { + if (prev_node_paths.count(path)) { + paths_match = true; + break; + } } + + if (!paths_match){ + return inverse ? aln : Alignment(); + } + + prev_node_paths = move(curr_node_paths); } - if (!paths_match){ - return inverse ? aln : Alignment(); - } - } return inverse ? Alignment() : aln; } @@ -856,7 +870,7 @@ namespace vg{ vector Filter::remap(Alignment& aln){ - if (this->my_xg_index == NULL || this->gcsa_ind == NULL || this->my_mapper == NULL){ + if (this->my_path_position_graph == NULL || this->gcsa_ind == NULL || this->my_mapper == NULL){ cerr << "An XG and GCSA are required for remapping." 
<< endl; exit(1337); } @@ -866,7 +880,7 @@ namespace vg{ } vector Filter::remap(string seq){ - if (this->my_xg_index == NULL || this->gcsa_ind == NULL){ + if (this->my_path_position_graph == NULL || this->gcsa_ind == NULL){ cerr << "An XG and GCSA are required for remapping." << endl; exit(1337); } @@ -890,7 +904,7 @@ namespace vg{ */ bool Filter::split_read_filter(Alignment& aln){ - if (this->my_xg_index == NULL || this->gcsa_ind == NULL){ + if (this->my_path_position_graph == NULL || this->gcsa_ind == NULL){ cerr << "An XG and GCSA are required for split read processing." << endl; exit(1337); } diff --git a/src/filter.hpp b/src/filter.hpp index 680d213491a..250f28eea0b 100644 --- a/src/filter.hpp +++ b/src/filter.hpp @@ -10,8 +10,7 @@ #include #include "vg.hpp" #include "mapper.hpp" -#include "xg.hpp" -#include "vg.pb.h" +#include /** \file * Provides a way to filter Edits contained @@ -22,6 +21,56 @@ */ namespace vg{ +struct BREAKPOINT{ + string name; + Position position; + vector mates; + + string contig; + int64_t start = -1; + int64_t upper_bound = 100; + int64_t lower_bound = 100; + + // Does the breakpoint point this way --->> + // or this way <<--- + bool isForward; + // 0: Unset, 1: INS, 2: DEL, 3: INV, 4: DUP + int SV_TYPE = 0; + // + + int normal_supports = 0; + int tumor_supports = 0; + + int fragl_supports = 0; + int split_supports = 0; + int other_supports = 0; + + inline int total_supports(){ + return fragl_supports + split_supports + other_supports; + } + inline bool overlap(BREAKPOINT p, int dist){ + + if (start > -1 ){ + if ( abs(start - p.start) < dist){ + return true; + } + } + else{ + if (position.node_id() == p.position.node_id() && abs(position.offset() - p.position.offset()) < dist){ + return true; + } + } + + return false; + } + inline string to_string(){ + stringstream x; + x << "Pos: " << start << " u: " << upper_bound << " l: " << lower_bound << " s: " << total_supports(); + return x.str(); + } + +}; + class Filter{ public: Filter(); @@ -96,13 +145,13 @@ class Filter{ void set_split_read_limit(int split_limit); void set_window_length(int window_length); void set_my_vg(vg::VG* vg); - void set_my_xg_idx(xg::XG* xg_idx); + void set_my_path_position_graph(PathPositionHandleGraph* graph); void set_inverse(bool do_inv); void init_mapper(); vg::VG* my_vg = NULL; - xg::XG* my_xg_index = NULL; + PathPositionHandleGraph* my_path_position_graph = NULL; gcsa::GCSA* gcsa_ind; gcsa::LCPArray * lcp_ind; Mapper* my_mapper; diff --git a/src/flat_file_back_translation.cpp b/src/flat_file_back_translation.cpp new file mode 100644 index 00000000000..4977abf69ff --- /dev/null +++ b/src/flat_file_back_translation.cpp @@ -0,0 +1,79 @@ +/** + * \file flat_file_back_translation.cpp + * Implementation for flat-file-backed named-node back-translation. + */ + +#include "flat_file_back_translation.hpp" +#include "utility.hpp" + +namespace vg { + +FlatFileBackTranslation::FlatFileBackTranslation(std::istream& stream) { + if (!stream) { + // File didn't open properly or something. + throw std::runtime_error("Could not read translation from stream"); + } + while (stream) { + // Get each line. + std::string line; + std::getline(stream, line); + if (line.empty()) { + // Skip blank lines. + continue; + } + // Split on tabs. 
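To make the flat format concrete before the parsing code below, here is a hypothetical translation file and what this loader would make of it; the segment name, numbers, and offsets are invented for illustration:

```
#include "flat_file_back_translation.hpp"
#include <sstream>
#include <cassert>

void flat_file_translation_example() {
    // Hypothetical input: segment "chr1" was assigned number 1 and was split
    // into node 10 (forward offset 0, reverse offset 5) and node 11 (forward
    // offset 5, reverse offset 0).
    std::istringstream in(
        "T\tchr1\t1\n"
        "K\t1\t0\t5\t10\n"
        "K\t1\t5\t0\t11\n");
    vg::FlatFileBackTranslation trans(in);

    // Named segments report their stored names; others fall back to the number.
    assert(trans.get_back_graph_node_name(1) == "chr1");

    // Bases 2..4 on the forward strand of node 11 come from segment 1,
    // starting at offset 5 + 2 = 7 along its forward strand.
    handlegraph::oriented_node_range_t query(11, false, 2, 3);
    assert(trans.translate_back(query).size() == 1);
}
```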
+ auto parts = split_delims(line, "\t"); + if (parts[0] == "T") { + // This is T segment name segment number + if (parts.size() != 3) { + throw std::runtime_error("Encountered unparseable T line: " + line); + } + segment_to_name[parse(parts[2])] = parts[1]; + } else if (parts[0] == "K") { + // This is K old ID forward offset reverse offset new ID + if (parts.size() != 5) { + throw std::runtime_error("Encountered unparseable K line: " + line); + } + // Save, under the new node ID, the old node ID and the offsets. + node_to_segment_and_offsets[parse(parts[4])] = {parse(parts[1]), parse(parts[2]), parse(parts[3])}; + } else { + // This shouldn't be here. + throw std::runtime_error("Encountered unrecognized line: " + line); + } + } +} + +std::vector FlatFileBackTranslation::translate_back(const oriented_node_range_t& range) const { + // Look up the node ID + auto it = node_to_segment_and_offsets.find(std::get<0>(range)); + + if (it == node_to_segment_and_offsets.end()) { + // This doesn't have to go anywhere else. + return {range}; + } + + // Otherwise this goes somewhere else. + return {{ + // The destination segment + std::get<0>(it->second), + // In the requested orientation + std::get<1>(range), + // Starting at the correct offset along that orientation of the segment + std::get<2>(range) + (std::get<1>(range) ? std::get<2>(it->second) : std::get<1>(it->second)), + // And running for the specified length + std::get<3>(range) + }}; +} + +std::string FlatFileBackTranslation::get_back_graph_node_name(const nid_t& back_node_id) const { + auto it = segment_to_name.find(back_node_id); + if (it == segment_to_name.end()) { + // There's no non-default name for this segment. + return std::to_string(back_node_id); + } + + // Otherwise, we found a name. + return it->second; +} + +} diff --git a/src/flat_file_back_translation.hpp b/src/flat_file_back_translation.hpp new file mode 100644 index 00000000000..9e535c8c1a8 --- /dev/null +++ b/src/flat_file_back_translation.hpp @@ -0,0 +1,77 @@ +/** + * \file flat_file_back_translation.hpp + * Defines a back-translation from graph node ID space to named node space, + * backed by a flat text file. + */ + +#ifndef VG_FLAT_FILE_BACK_TRANSLATION_HPP_INCLUDED +#define VG_FLAT_FILE_BACK_TRANSLATION_HPP_INCLUDED + +#include "handle.hpp" + +#include +#include +#include + +namespace vg { + +/** + * A NamedNodeBackTranslation loadable from a text file. + * + * The file is a GFA-like tab-separated file with types of lines identified by + * a letter in the first field. It consists of 0 or more T lines, each giving a + * segment name and an assigned number for it. This is followed by 0 or more K + * lines, each giving a segment number, offsets along that segment in forward + * and then reverse orientations, and then the graph node ID that begins at + * that offset. + * + * Note that an empty file is allowed, and that this format can only represent + * translations where nodes are broken up (and not merged) and where + * orientation does not change. + * + * Many applications (such as loading the translation into a GBWTGraph) will + * expect the graph node IDs alogn a segment to be dense, contiguous, and + * increasing. + */ +class FlatFileBackTranslation : public NamedNodeBackTranslation { + +public: + /** + * Create a FlatFileBackTranslation by reading it from an open file. 
+ */ + FlatFileBackTranslation(std::istream& stream); + + virtual ~FlatFileBackTranslation() = default; + + /** + * Translate the given range of bases on the given orientation of the given + * node in the current graph, to zero or more ranges on orientations of + * nodes in some prior graph. + */ + virtual std::vector translate_back(const oriented_node_range_t& range) const; + + /** + * Get the name of a node in the graph that translate_back() translates + * into, given its number. + */ + virtual std::string get_back_graph_node_name(const nid_t& back_node_id) const; + +protected: + /** + * This holds, for each node ID, the segment number and starting offset, if + * it is not offset 0, on each orientation of the segment with the same + * number as the node ID. + */ + std::unordered_map> node_to_segment_and_offsets; + + /** + * This holds, for each segment ID, the segment name, if it is not the + * string version of the segment number. + */ + std::unordered_map segment_to_name; + +}; + +} + +#endif diff --git a/src/flow_sort.cpp b/src/flow_sort.cpp index 59efc44a91f..72bb2e600a2 100644 --- a/src/flow_sort.cpp +++ b/src/flow_sort.cpp @@ -1,7 +1,7 @@ #include "vg.hpp" -#include "stream.hpp" -#include "gssw_aligner.hpp" -#include "vg.pb.h" +#include +#include "aligner.hpp" +#include #include "flow_sort.hpp" #include diff --git a/src/flow_sort.hpp b/src/flow_sort.hpp index cca16ff6af8..367537a85b0 100644 --- a/src/flow_sort.hpp +++ b/src/flow_sort.hpp @@ -1,7 +1,7 @@ #ifndef VG_FLOW_SORT_HPP_INCLUDED #define VG_FLOW_SORT_HPP_INCLUDED -#include "vg.pb.h" +#include namespace vg { diff --git a/src/funnel.cpp b/src/funnel.cpp new file mode 100644 index 00000000000..2da613598a3 --- /dev/null +++ b/src/funnel.cpp @@ -0,0 +1,705 @@ +#include "funnel.hpp" + +#include +#include + +/** + * \file funnel.hpp: implementation of the Funnel class + */ + +namespace vg { +using namespace std; + +void Funnel::PaintableSpace::paint(size_t start, size_t length) { + // Find the last interval starting strictly before start + auto predecessor = regions.lower_bound(start); + if (predecessor != regions.begin()) { + --predecessor; + // We have one. + + if (predecessor->first + predecessor->second >= start) { + // If its length is long enough to abut or cover start + + if (predecessor->first + predecessor->second > start + length) { + // It completely encompasses us, so nothing to do! + return; + } + + // Budge start back and increase length + length += (start - predecessor->first); + start = predecessor->first; + + // And remove it + regions.erase(predecessor); + // TODO: Can we fix it up? 
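The interval-merging rule implemented here is easiest to see with a worked example. The following standalone sketch restates the same rule over a start-to-length map and checks a few cases; it is illustrative only and not the class's own code:

```
#include <cassert>
#include <cstddef>
#include <iterator>
#include <map>

// Regions are kept as a start -> length map; each paint() call absorbs every
// region it touches or abuts, so the map always holds disjoint, maximal intervals.
static std::map<size_t, size_t> regions;

void paint(size_t start, size_t length) {
    size_t end = start + length;
    // Merge with a predecessor that reaches start or beyond.
    auto it = regions.lower_bound(start);
    if (it != regions.begin()) {
        auto prev = std::prev(it);
        if (prev->first + prev->second >= start) {
            start = prev->first;
            end = std::max(end, prev->first + prev->second);
            regions.erase(prev);
        }
    }
    // Absorb every region starting at or before the (possibly grown) end.
    for (it = regions.lower_bound(start); it != regions.end() && it->first <= end; ) {
        end = std::max(end, it->first + it->second);
        it = regions.erase(it);
    }
    regions.emplace(start, end - start);
}

int main() {
    paint(10, 5);   // 10..15
    paint(15, 5);   // abuts, merges to 10..20
    paint(30, 5);   // separate region 30..35
    paint(14, 18);  // bridges both, everything collapses to 10..35
    assert(regions.size() == 1 && regions.begin()->first == 10 && regions.begin()->second == 25);
    return 0;
}
```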
+ } + } + + // Find the first interval starting at or after start + auto successor = regions.upper_bound(start); + auto range_first = regions.end(); + auto range_last = regions.end(); + while (successor != regions.end() && successor->first <= start + length) { + // For each from there that starts at or before start + length + // Increase length to cover up to its end + length = std::max(successor->first + successor->second, start + length) - start; + // And remember to remove it it + if (range_first == regions.end()) { + range_first = successor; + } + // Check the next thing + ++successor; + // Which provides the removal past-end + range_last = successor; + } + + // Remove the covered intervals + regions.erase(range_first, range_last); + + // Add the new interval + regions.emplace(start, length); +} + +bool Funnel::PaintableSpace::is_any_painted(size_t start, size_t length) const { + // Find the last interval starting strictly before start + auto predecessor = regions.lower_bound(start); + if (predecessor != regions.begin()) { + --predecessor; + // We have one. + if (predecessor->first + predecessor->second > start) { + // It covers our start, so we overlap + return true; + } + } + + auto successor = regions.upper_bound(start); + if (successor != regions.end()) { + // There's something starting at or after us + if (start + length > successor->first) { + // And we overlap it + return true; + } + } + + // We can't overlap anything + return false; +} + +void Funnel::start(const string& name) { + assert(!name.empty()); + + // (Re)start the funnel. + funnel_name = name; + start_time = clock::now(); + + // Clear out old data + stage_name.clear(); + substage_name.clear(); + stages.clear(); +} + +void Funnel::stop() { + // Stop any lingering stage (which takes care of substages) + stage_stop(); + // Stop the funnel overall + stop_time = clock::now(); +} + +void Funnel::stage(const string& name) { + assert(!funnel_name.empty()); + assert(!name.empty()); + + // Stop the previous stage if any. + stage_stop(); + + // Allocate new stage structures. + stages.emplace_back(); + stages.back().name = name; + + // Save the name + stage_name = name; + + // Record the start time + stage_start_time = clock::now(); +} + +void Funnel::stage_stop() { + if (!stage_name.empty()) { + // A stage was running. + + // Stop any substage. + substage_stop(); + // Stop any process/produce + processed_input(); + produced_output(); + + // Say the stage is stopped + stage_name.clear(); + + // Record the duration in seconds + auto stage_stop_time = clock::now(); + stages.back().duration = chrono::duration_cast>(stage_stop_time - stage_start_time).count(); + } +} + +void Funnel::substage(const string& name) { + assert(!funnel_name.empty()); + assert(!stage_name.empty()); + assert(!name.empty()); + + // Stop previous substage if any. + substage_stop(); + + // Substages don't bound produce/process. + + // Save the name + substage_name = name; +} + +void Funnel::substage_stop() { + if (!substage_name.empty()) { + // A substage was running. + + // Substages don't bound produce/process. 
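Taken together, the stage and substage calls are meant to be driven roughly like this (a hedged sketch; the stage and substage names are invented):

```
#include "funnel.hpp"

// Starting a new stage or substage implicitly stops the previous one, so
// explicit *_stop() calls are only needed at the very end.
void stage_lifecycle_sketch(vg::Funnel& funnel) {
    funnel.start("read-123");

    funnel.stage("extend");
    funnel.substage("gapless");
    // ... gapless extension work ...
    funnel.substage("gapped");      // stops "gapless"
    // ... gapped extension work ...

    funnel.stage("align");          // stops "gapped" and "extend", records the "extend" duration
    // ... alignment work ...

    funnel.stop();                  // stops "align" and the funnel as a whole
}
```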
+ + // Say the stage is stopped + substage_name.clear(); + } +} + +void Funnel::processing_input(size_t prev_stage_item) { + // We can only take input from previous stages, in a stage + assert(!stage_name.empty()); + assert(stages.size() > 1); + assert(prev_stage_item != numeric_limits::max()); + assert(stages[stages.size() - 2].items.size() > prev_stage_item); + + // Stop any previous input processing + processed_input(); + + // Start this one + input_in_progress = prev_stage_item; +} + +void Funnel::processed_input() { + if (input_in_progress != numeric_limits::max()) { + // We were processing an input + + // Say we're done with the input. + input_in_progress = numeric_limits::max(); + } +} + +void Funnel::producing_output(size_t item) { + // We can only produce output in a stage + assert(!stage_name.empty()); + assert(!stages.empty()); + assert(item != numeric_limits::max()); + + // Stop any previous input processing + produced_output(); + + // Start this one + output_in_progress = item; +} + +void Funnel::produced_output() { + if (output_in_progress != numeric_limits::max()) { + // We were producing an output + + // Say we're done with the output. + output_in_progress = numeric_limits::max(); + } +} + +void Funnel::introduce(size_t count) { + // Create that many new items + for (size_t i = 0; i < count; i++) { + create_item(); + } +} + +void Funnel::expand(size_t prev_stage_item, size_t count) { + for (size_t i = 0; i < count; i++) { + // Create the requested number of items + project(prev_stage_item); + } +} + +void Funnel::project(size_t prev_stage_item) { + // There must be a prev stage to project from + assert(stages.size() > 1); + auto& prev_stage = stages[stages.size() - 2]; + + // Make one new item + size_t index = create_item(); + + // Record the ancestry + get_item(index).prev_stage_items.push_back(prev_stage_item); + + auto& old = prev_stage.items[prev_stage_item]; + + if (old.tag != State::NONE) { + // Tag the new item if it came from something tagged + tag(index, old.tag, old.tag_start, old.tag_length); + } +} + +void Funnel::project_group(size_t prev_stage_item, size_t group_size) { + // Project the item + project(prev_stage_item); + // Save the group size + get_item(latest()).group_size = group_size; +} + +void Funnel::also_relevant(size_t earlier_stage_lookback, size_t earlier_stage_item) { + assert(earlier_stage_lookback > 0); + assert(stages.size() > earlier_stage_lookback); + auto& earlier_stage = stages[stages.size() - 1 - earlier_stage_lookback]; + assert(earlier_stage.items.size() > earlier_stage_item); + auto& item = get_item(latest()); + if (earlier_stage_lookback == 1) { + // References to the immediately preceeding stage are special + item.prev_stage_items.push_back(earlier_stage_item); + } else { + // References to earlier stages include the stage offset back + item.earlier_stage_items.emplace_back(earlier_stage_lookback, earlier_stage_item); + } +} + +void Funnel::fail(const char* filter, size_t prev_stage_item, double statistic) { + // There must be a prev stage to project from + assert(stages.size() > 1); + auto& prev_stage = stages[stages.size() - 2]; + + // Record the item as having failed this filter + prev_stage.items[prev_stage_item].failed_filter = filter; + prev_stage.items[prev_stage_item].failed_statistic = statistic; +} + +void Funnel::pass(const char* filter, size_t prev_stage_item, double statistic) { + // There must be a prev stage to project from + assert(stages.size() > 1); + auto& prev_stage = stages[stages.size() - 2]; + + // Record the 
item as having passed this filter + prev_stage.items[prev_stage_item].passed_filters.emplace_back(filter); + prev_stage.items[prev_stage_item].passed_statistics.emplace_back(statistic); +} + +void Funnel::score(size_t item, double score) { + get_item(item).score = score; +} + +void Funnel::tag(size_t item, State state, size_t tag_start, size_t tag_length) { + +#ifdef debug + std::cerr << "Tag item " << item << " stage " << stages.back().name << " as " << state << " on " << tag_start << "-" << tag_start + tag_length << std::endl; +#endif + + // Say the item is tagged + auto& to_mark = get_item(item); + to_mark.tag = std::max(to_mark.tag, state); + + if (to_mark.tag_start == std::numeric_limits::max() && to_mark.tag_length == 0) { + // Item hasn't been tagged before, so we can just adopt the passed range. + to_mark.tag_start = tag_start; + to_mark.tag_length = tag_length; + } else { + // We need to find the enclosing range of the existing range and the new range. + size_t correct_end = std::max(to_mark.tag_start + to_mark.tag_length, tag_start + tag_length); + to_mark.tag_start = std::min(to_mark.tag_start, tag_start); + to_mark.tag_length = correct_end - to_mark.tag_start; + } + +#ifdef debug + std::cerr << "\tNow tagged over " << to_mark.tag_start << "-" << to_mark.tag_start + to_mark.tag_length << std::endl; +#endif + + // TODO: Allow different tags to cover different ranges? + // TODO: Allow per-item gapped range tracking? + + // Say the stage has tag over this interval. + stages.back().tag = std::max(stages.back().tag, state); + stages.back().tag_space.paint(tag_start, tag_length); +} + +void Funnel::tag_correct(size_t item, size_t tag_start, size_t tag_length) { + tag(item, State::CORRECT, tag_start, tag_length); +} + +bool Funnel::is_correct(size_t item) const { + return stages.back().items[item].tag >= State::CORRECT; +} + +bool Funnel::was_correct(size_t prev_stage_item) const { + assert(stages.size() > 1); + auto& prev_stage = stages[stages.size() - 2]; + return prev_stage.items[prev_stage_item].tag >= State::CORRECT; +} + +bool Funnel::was_correct(size_t prev_stage_index, const string& prev_stage_name, size_t prev_stage_item) const { + assert(stages.size() > prev_stage_index); + auto& prev_stage = stages[prev_stage_index]; + assert(prev_stage.name == prev_stage_name); + return prev_stage.items.at(prev_stage_item).tag >= State::CORRECT; +} + +string Funnel::last_tagged_stage(State tag, size_t tag_start, size_t tag_length) const { + // Just do a linear scan backward through stages + for (auto it = stages.rbegin(); it != stages.rend(); ++it) { + if (it->tag >= tag && it->tag_space.is_any_painted(tag_start, tag_length)) { + // If we are tagged good enough and have a tag in part of that + // area, then we are a matching stage. + return it->name; + } + } + return "none"; +} + +string Funnel::last_correct_stage(size_t tag_start, size_t tag_length) const { + return last_tagged_stage(State::CORRECT, tag_start, tag_length); +} + +size_t Funnel::latest() const { + assert(!stages.empty()); + assert(!stages.back().items.empty()); + return stages.back().items.size() - 1; +} + +void Funnel::for_each_stage(const function&, const double&)>& callback) const { + for (auto& stage : stages) { + // Make a vector of item sizes + vector item_sizes; + item_sizes.reserve(stage.items.size()); + for (auto& item : stage.items) { + item_sizes.push_back(item.group_size); + } + // Report the name and item count of each stage. 
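A typical consumer of this callback looks like the following sketch (the function name and output format are invented):

```
#include "funnel.hpp"
#include <iostream>

// Summarize how many items survived each stage and how long the stage took.
void print_funnel_summary(const vg::Funnel& funnel) {
    funnel.for_each_stage([&](const std::string& stage,
                              const std::vector<size_t>& item_sizes,
                              const double& seconds) {
        std::cerr << stage << ": " << item_sizes.size() << " items in " << seconds << " s" << std::endl;
    });
}
```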
+ callback(stage.name, item_sizes, stage.duration); + } +} + +void Funnel::for_each_filter(const function&, const vector&)>& callback) const { + + for (auto& stage : stages) { + // Hold the names of all filters encountered + vector filter_names; + // And the by-item and by-size performance stats. + vector> filter_performances; + // And the correct and not-known-correct filter statistic values + vector, vector>> filter_statistics; + + for (auto& item : stage.items) { + // For each item + size_t filter_index; + for (filter_index = 0; filter_index < item.passed_filters.size(); filter_index++) { + // For each filter it passed + if (filter_index >= filter_names.size()) { + // If it is new + + // Remember its name in the list of filters + filter_names.push_back(item.passed_filters[filter_index]); + + // And give it an empty report + filter_performances.emplace_back(); + filter_statistics.emplace_back(); + } else { + // Make sure the name is correct + // TODO: can we justy match on pointer value and not string value? + assert(strcmp(filter_names[filter_index], item.passed_filters[filter_index]) == 0); + } + + // Record passing + filter_performances[filter_index].first.passing++; + filter_performances[filter_index].first.passing_correct += item.tag >= State::CORRECT; + + filter_performances[filter_index].second.passing += item.group_size; + filter_performances[filter_index].second.passing_correct += item.tag >= State::CORRECT ? item.group_size : 0; + + if (item.tag >= State::CORRECT) { + // Record this statistic value as belonging to a correct item + filter_statistics[filter_index].first.push_back(item.passed_statistics[filter_index]); + } else { + // Record this statistic value as belonging to a not necessarily correct item + filter_statistics[filter_index].second.push_back(item.passed_statistics[filter_index]); + } + } + + if (item.failed_filter != nullptr) { + // For the final, failed filter, if any + + if (filter_index >= filter_names.size()) { + // If it is new + + // Remember its name in the list of filters + filter_names.push_back(item.failed_filter); + + // And give it an empty report + filter_performances.emplace_back(); + filter_statistics.emplace_back(); + } else { + // Make sure the name is correct + // TODO: can we justy match on pointer value and not string value? + assert(strcmp(filter_names[filter_index], item.failed_filter) == 0); + } + + // Record failing + filter_performances[filter_index].first.failing++; + filter_performances[filter_index].first.failing_correct += (item.tag >= State::CORRECT) ? 1 : 0; + + filter_performances[filter_index].second.failing += item.group_size; + filter_performances[filter_index].second.failing_correct += (item.tag >= State::CORRECT) ? item.group_size : 0; + + if (item.tag >= State::CORRECT) { + // Record this statistic value as belonging to a correct item + filter_statistics[filter_index].first.push_back(item.failed_statistic); + } else { + // Record this statistic value as belonging to a not necessarily correct item + filter_statistics[filter_index].second.push_back(item.failed_statistic); + } + } + } + + // Now we have gone through the filters for this stage for every item. + + for (size_t i = 0; i < filter_names.size(); i++) { + // For each filter + + // Report the results tabulated across items. 
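And a corresponding consumer sketch for the per-filter report produced below (again with an invented function name and output format):

```
#include "funnel.hpp"
#include <iostream>

// Report pass/fail tallies per filter per stage. The two FilterPerformance
// reports distinguish counting items from summing their group sizes.
void print_filter_summary(const vg::Funnel& funnel) {
    funnel.for_each_filter([&](const std::string& stage, const std::string& filter,
                               const vg::Funnel::FilterPerformance& by_count,
                               const vg::Funnel::FilterPerformance& by_size,
                               const std::vector<double>& correct_stats,
                               const std::vector<double>& other_stats) {
        std::cerr << stage << "/" << filter << ": "
                  << by_count.passing << " passed, " << by_count.failing << " failed ("
                  << by_size.passing << " vs " << by_size.failing << " by size)" << std::endl;
    });
}
```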
+ callback(stage.name, filter_names[i], + filter_performances[i].first, filter_performances[i].second, + filter_statistics[i].first, filter_statistics[i].second); + } + } +} + +void Funnel::to_dot(ostream& out) const { + out << "digraph graphname {" << endl; + out << "rankdir=\"TB\";" << endl; + + for (size_t s = 0; s < stages.size(); s++) { + // For each stage in order + auto& stage = stages[s]; + + // Compute a GraphViz ID part for the stage + string stage_id = "s" + to_string(s); + + // Start a subgraph. + // Prepend cluster so it draws as a box. + out << "subgraph cluster_" << stage_id << " {" << endl; + out << "label = \"" << stage.name; + if (stage.tag != State::NONE) { + // Put in if it is tagged and over what area + out << ", " << stage.tag; + size_t range_min = std::numeric_limits::max(); + size_t range_max = 0; + for (auto& region : stage.tag_space.regions) { + if (region.first != 0 || region.second != std::numeric_limits::max()) { + // Expand bounds by the bounds of this region, to summarize + range_min = std::min(range_min, region.first); + range_max = std::max(range_max, region.first + region.second); + } + } + if (range_min != 0 && range_max != std::numeric_limits::max()) { + out << " " << range_min << "-" << range_max << " in " << stage.tag_space.regions.size() << " regions"; + } + } + out << "\";" << endl; + out << "graph[style=solid];" << endl; + out << "rank=same;" << endl; + + for (size_t i = 0; i < stage.items.size(); i++) { + // For each item in the stage + auto& item = stage.items[i]; + + // Compute a GraphViz ID + string item_id = stage_id + "i" + to_string(i); + + // Emit a node + out << item_id << "[label=\"" << i << "\" shape=circle tooltip=\""; + if (item.group_size != 0) { + out << "size " << item.group_size; + } + if (item.score != 0) { + if (item.group_size != 0) { + out << ", "; + } + out << "score " << item.score; + } + if (item.tag != State::NONE && (item.tag_start != std::numeric_limits::max() || item.tag_length != 0)) { + if (item.group_size != 0 || item.score != 0) { + out << ", "; + } + out << "tagged " << item.tag_start << " - " << item.tag_start + item.tag_length; + } + out << "\""; + if (item.tag >= State::CORRECT) { + // Make it green if it is correct + out << " color=green"; + } else if (item.tag >= State::PLACED) { + out << " color=blue"; + } + out << "];" << endl; + + if (s > 0) { + // There is a previous stage, so we can draw edges from it. 
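Rendering this provenance graph is just a matter of sending the dot output somewhere; a small sketch (the file name is arbitrary):

```
#include "funnel.hpp"
#include <fstream>

// Dump the provenance graph for offline inspection; render it with e.g.
// `dot -Tsvg funnel.dot > funnel.svg`.
void dump_funnel_dot(const vg::Funnel& funnel) {
    std::ofstream out("funnel.dot");
    funnel.to_dot(out);
}
```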
+ for (auto& p : item.prev_stage_items) { + // Connect everything from the previous stage to it + auto& prev_item = stages[s - 1].items.at(p); + + out << "s" << (s - 1) << "i" << p << " -> " << item_id << "["; + if (item.tag >= State::CORRECT && prev_item.tag >= State::CORRECT) { + // Correctness came this way + out << "color=green"; + } else if (item.tag >= State::PLACED && prev_item.tag >= State::PLACED) { + // Placedness came this way + out << "color=blue"; + } + out << "];" << endl; + } + if (s > 1) { + // And there are other stages before that + for (auto& p : item.earlier_stage_items) { + // Connect everything from the earlier stages to it + assert(p.first > 1); + auto& prev_item = stages.at(s - p.first).items.at(p.second); + + out << "s" << (s - p.first) << "i" << p.second << " -> " << item_id << "[constraint=false"; + if (item.tag >= State::CORRECT && prev_item.tag >= State::CORRECT) { + // Correctness came this way + out << ",color=green"; + } else if (item.tag >= State::PLACED && prev_item.tag >= State::PLACED) { + // Placedness came this way + out << ",color=blue"; + } + out << "];" << endl; + } + } + } + + } + + out << "}" << endl; + } + + out << "}" << endl; +} +void Funnel::annotate_mapped_alignment(Alignment& aln, bool annotate_correctness) const { + // Save the total duration in the field set asside for it + aln.set_time_used(chrono::duration_cast>(stop_time - start_time).count()); + + for_each_stage([&](const string& stage, const vector& result_sizes, const double& duration) { + // Save the number of items + set_annotation(aln, "stage_" + stage + "_results", (double)result_sizes.size()); + // And the per-stage duration + set_annotation(aln, "stage_" + stage + "_time", duration); + }); + + set_annotation(aln, "last_placed_stage", last_tagged_stage(State::PLACED)); + for (size_t i = 0; i < aln.sequence().size(); i += 500) { + // For each 500 bp window, annotate with the last stage that had something placed in or spanning the window. + // TODO: This is terrible, use an array or something. 
+ set_annotation(aln, "last_placed_stage_" + std::to_string(i) + "bp", last_tagged_stage(State::PLACED, i, 500)); + } + + if (annotate_correctness) { + // And with the last stage at which we had any descendants of the correct seed hit locations + set_annotation(aln, "last_correct_stage", last_correct_stage()); + } + + // Annotate with the performances of all the filters + // We need to track filter number + size_t filter_num = 0; + for_each_filter([&](const string& stage, const string& filter, + const Funnel::FilterPerformance& by_count, const Funnel::FilterPerformance& by_size, + const vector& filter_statistics_correct, const vector& filter_statistics_non_correct) { + + string filter_id = to_string(filter_num) + "_" + filter + "_" + stage; + + // Save the stats + set_annotation(aln, "filter_" + filter_id + "_passed_count_total", (double) by_count.passing); + set_annotation(aln, "filter_" + filter_id + "_failed_count_total", (double) by_count.failing); + set_annotation(aln, "filter_" + filter_id + "_passed_size_total", (double) by_size.passing); + set_annotation(aln, "filter_" + filter_id + "_failed_size_total", (double) by_size.failing); + + if (annotate_correctness) { + set_annotation(aln, "filter_" + filter_id + "_passed_count_correct", (double) by_count.passing_correct); + set_annotation(aln, "filter_" + filter_id + "_failed_count_correct", (double) by_count.failing_correct); + set_annotation(aln, "filter_" + filter_id + "_passed_size_correct", (double) by_size.passing_correct); + set_annotation(aln, "filter_" + filter_id + "_failed_size_correct", (double) by_size.failing_correct); + } + + // Save the correct and non-correct filter statistics, even if + // everything is non-correct because correctness isn't computed + bool all_nan = true; + for (auto& v : filter_statistics_correct) { + if (!isnan(v)) { + all_nan = false; + break; + } + } + if (all_nan) { + // Elide all-nan vector + set_annotation(aln, "filterstats_" + filter_id + "_correct", std::vector()); + } else { + set_annotation(aln, "filterstats_" + filter_id + "_correct", filter_statistics_correct); + } + all_nan = true; + for (auto& v : filter_statistics_non_correct) { + if (!isnan(v)) { + all_nan = false; + break; + } + } + if (all_nan) { + // Elide all-nan vector + set_annotation(aln, "filterstats_" + filter_id + "_noncorrect", std::vector()); + } else { + set_annotation(aln, "filterstats_" + filter_id + "_noncorrect", filter_statistics_non_correct); + } + filter_num++; + }); +} + +Funnel::Item& Funnel::get_item(size_t index) { + assert(!stages.empty()); + if (index >= stages.back().items.size()) { + // Allocate up through here + stages.back().items.resize(index + 1); + } + return stages.back().items[index]; +} + +size_t Funnel::create_item() { + assert(!stages.empty()); + + // Work out where to put it + size_t next_index = stages.back().projected_count; + // Make sure the item slot exists + get_item(next_index); + // Record the item's creation + stages.back().projected_count++; + + // Return the index used + return next_index; +} + + + + +} + + + + + + + + + + + + + diff --git a/src/funnel.hpp b/src/funnel.hpp new file mode 100644 index 00000000000..69219ff3cd9 --- /dev/null +++ b/src/funnel.hpp @@ -0,0 +1,462 @@ +#ifndef VG_FUNNEL_HPP_INCLUDED +#define VG_FUNNEL_HPP_INCLUDED + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "annotation.hpp" + + +/** + * \file funnel.hpp + * Contains the Funnel class, for recording the history of complex multi-stage 
transformations of sets of results. + */ + +namespace vg { + +using namespace std; + +/** + * Represents a record of an invocation of a pipeline for an input. + * + * Tracks the history of "lines" of data "item" provenance through a series of + * "stages", containing a series of "filters". + * + * Lines are "introduced", and "project" from earlier stages to later stages, + * possibly "expanding" or "merging", until they "fail" a filter or reach the + * final stage. At each stage, items occur in a linear order and are identified + * by index. + * + * An item may be a "group", with a certain size. + * + * We also can assign "scores" or correctness/placed-ness "tags" to items at a + * stage. Tags can cover a region of a linear read space. + */ +class Funnel { + +public: + /// Start processing the given named input. + /// Name must not be empty. + /// No stage or substage will be active. + void start(const string& name); + + /// Stop processing the given named input. + /// All stages and substages are stopped. + void stop(); + + /// Start the given stage, and end all previous stages and substages. + /// Name must not be empty. + /// Multiple stages with the same name will be coalesced. + void stage(const string& name); + + /// Stop the current stage. + void stage_stop(); + + /// Start the given substage, nested insude the current stage. End all previous substages. + /// Substages within a stage may repeat and are coalesced. + /// Name must not be empty. + void substage(const string& name); + + /// Stop the current substage. + void substage_stop(); + + /// Start processing the given item coming from the previous stage. + void processing_input(size_t prev_stage_item); + + /// Stop processing an item from the previous stage. + void processed_input(); + + /// Start producing the given output item, whether it has been projected yet or not. + void producing_output(size_t item); + + /// Stop producing an output item. + void produced_output(); + + /// Introduce the given number of new items, starting their own lines of provenance (default 1). + void introduce(size_t count = 1); + + /// Expand the given item from the previous stage into the given number of new items at this stage. + void expand(size_t prev_stage_item, size_t count); + + /// Merge all the given item indexes from the previous stage into a new item at this stage. + /// The new item will be a group, sized according to the number of previous items merged. + template + void merge_group(Iterator prev_stage_items_begin, Iterator prev_stage_items_end); + + /// Merge all the given item indexes from the previous stage into a new item at this stage. + /// The new item will be a group, sized according to the total size of + /// previous groups, with non-groups counting as size 1. + template + void merge_groups(Iterator prev_stage_items_begin, Iterator prev_stage_items_end); + + /// Merge all the given item indexes from the previous stage into a new item at this stage. + /// The new item will be a single item. + template + void merge(Iterator prev_stage_items_begin, Iterator prev_stage_items_end); + + /// Record extra provenance relationships where the latest current-stage + /// item came from the given previous-stage items. Increases the + /// current-stage item group size by the number of previous-stage items + /// added. + /// + /// Propagates tagging. 
+ template + void also_merge_group(Iterator prev_stage_items_begin, Iterator prev_stage_items_end); + + /// Record extra provenance relationships where the latest current-stage + /// item came from the given earlier-stage items. Increases the + /// current-stage item group size by the number of previous-stage items + /// added. + /// + /// Propagates tagging. + /// + /// earlier_stage_lookback determines how many stages to look back and must be + /// 1 or more. + template + void also_merge_group(size_t earlier_stage_lookback, Iterator earlier_stage_items_begin, Iterator earlier_stage_items_end); + + /// Record an extra provenance relationship where the latest current-stage + /// item came from the given previous-stage item, the given number of + /// stages ago (min 1). + /// + /// Does not adjust group size or propagate tagging. + void also_relevant(size_t earlier_stage_lookback, size_t earlier_stage_item); + + /// Project a single item from the previous stage to a single non-group item at this stage. + void project(size_t prev_stage_item); + + /// Project a single item from the previous stage to a new group item at the current stage, with the given size. + void project_group(size_t prev_stage_item, size_t group_size); + + /// Fail the given item from the previous stage on the given filter and do not project it through to this stage. + /// Items which do not fail a filter must pass the filter and be projected to something. + /// The filter name must survive the funnel, because a pointer to it will be stored. + /// Allows a statistic for the filtered-on value for the failing item to be recorded. + void fail(const char* filter, size_t prev_stage_item, double statistic = nan("")); + + /// Pass the given item from the previous stage through the given filter at this stage. + /// Items which do not pass a filter must fail it. + /// All items which pass filters must do so in the same order. + /// The filter name must survive the funnel, because a pointer to it will be stored. + /// Allows a statistic for the filtered-on value for the passing item to be recorded. + void pass(const char* filter, size_t prev_stage_item, double statistic = nan("")); + + /// Assign the given score to the given item at the current stage. + void score(size_t item, double score); + + /// We can tag items as having one of these states. + enum class State { + NONE = 0, + PLACED = 1, + CORRECT = 2 + }; + + /// Tag the given item as being in the given state at the current stage. + /// Future items that derive from it will inherit these tags. Optionally + /// allows specifying that the state extends over a range in read space. + void tag(size_t item, State state, size_t tag_start = 0, size_t tag_length = std::numeric_limits::max()); + + /// Tag the given item as "correct" at the current stage. Future items that + /// derive from it will also be tagged as correct. + /// Optionally allows specifying that the correctness extends over a range + /// in read space, so correctness can be tracked as a property of regions + /// of the read, rather than the whole read. + /// If called multiple times, with different bounds, the correct region + /// will enclose all the correct regions provided in the different calls. + void tag_correct(size_t item, size_t tag_start = 0, size_t tag_length = std::numeric_limits::max()); + + /// Return true if the given item at this stage is tagged correct, or + /// descends from an item that was tagged correct. 
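Putting the item-level calls above together, a hypothetical per-read driver might look like this; the stage names, filter names, and all numbers are invented:

```
#include "funnel.hpp"

void funnel_item_sketch(vg::Funnel& funnel) {
    funnel.start("read-1");

    funnel.stage("seed");
    funnel.introduce(3);                    // three seeds start their own provenance lines

    funnel.stage("extend");
    funnel.pass("max-seeds", 0, 1.0);       // seed 0 survives a filter...
    funnel.project(0);                      // ...and becomes extension 0
    funnel.fail("max-seeds", 1, 0.0);       // seed 1 is dropped here
    funnel.pass("max-seeds", 2, 1.0);
    funnel.project_group(2, 4);             // seed 2 becomes a group of size 4

    funnel.stage("align");
    funnel.project(0);                      // extension 0 becomes alignment 0
    funnel.score(funnel.latest(), 60.0);
    // Mark it correct over read bases 0..100; repeated calls widen the range
    // to the enclosing interval.
    funnel.tag_correct(funnel.latest(), 0, 100);

    funnel.stop();
}
```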
+ bool is_correct(size_t item) const; + + /// Return true if the given item at the previous stage is tagged correct, or + /// descends from an item that was tagged correct. + bool was_correct(size_t prev_stage_item) const; + + /// Return true if the given item at the given named previous stage is + /// tagged correct, or descends from an item that was tagged correct. + /// Needs a hint about what number the stage was in the order, to make + /// lookup fast. + bool was_correct(size_t prev_stage_index, const string& prev_stage_name, size_t prev_stage_item) const; + + /// Get the name of the most recent stage that had a correct-tagged item + /// survive into it, or "none" if no items were ever tagged correct. + /// Optionally allows specifying a read space interval to intersect with + /// items, so the query returns the last stage that had a correct item + /// intersecting that range. + string last_correct_stage(size_t tag_start = 0, size_t tag_length = std::numeric_limits::max()) const; + + /// Get the name of the most recent stage that had a n item tagged with the + /// given tag or better survive into it, or "none" if no items were ever + /// tagged that good. Optionally allows specifying a read space interval to + /// intersect with items, so the query returns the last stage that had an + /// item intersecting that range and also an item witht hat tag or better. + /// + /// TODO: Make worse tag ranges not match queries for better tags! + string last_tagged_stage(State tag, size_t tag_start = 0, size_t tag_length = std::numeric_limits::max()) const; + + /// Get the index of the most recent item created in the current stage. + size_t latest() const; + + /// Call the given callback with stage name, and vector of result item + /// sizes at that stage, and a duration in seconds, for each stage. + void for_each_stage(const function&, const double&)>& callback) const; + + /// Represents the performance of a filter, for either item counts or total item sizes. + /// Note that passing_correct and failing_correct will always be 0 if nothing is tagged correct. + struct FilterPerformance { + size_t passing = 0; + size_t failing = 0; + size_t passing_correct = 0; + size_t failing_correct = 0; + }; + + /// Call the given callback with stage name, filter name, performance + /// report for items, performance report for total size of items, values + /// for correct items for the filter statistic, and values for incorrect + /// (or merely not known-correct) items for the filter statistic. + /// Runs the callback for each stage and filter, in order. Only includes + /// filters that were actually passed or failed by any items. + void for_each_filter(const function&, const vector&)>& callback) const; + + /// Dump information from the Funnel as a dot-format Graphviz graph to the given stream. + /// Illustrates stages and provenance. + void to_dot(ostream& out) const; + + /// Set an alignments annotations with the number of results at each stage + /// if annotate_correctness is true, also annotate the alignment with the + /// number of correct results at each stage. This assumes that we've been + /// tracking correctness all along + void annotate_mapped_alignment(Alignment& aln, bool annotate_correctness) const; + +protected: + + /// Pick a clock to use for measuring stage duration + using clock = std::chrono::high_resolution_clock; + /// And a type to represent stage transition times + using time_point = clock::time_point; + + /// What's the name of the funnel we start()-ed. Will be empty if nothing is running. 
+ string funnel_name; + + /// At what time did we start() + time_point start_time; + + /// At what time did we stop() + time_point stop_time; + + /// What's the name of the current stage? Will be empty if no stage is running. + string stage_name; + + /// At what time did the stage start? + time_point stage_start_time; + + /// What's the name of the current substage? Will be empty if no substage is running. + string substage_name; + + /// What's the current prev-stage input we are processing? + /// Will be numeric_limits::max() if none. + size_t input_in_progress = numeric_limits::max(); + + /// what's the current current-stage output we are generating? + /// Will be numeric_limits::max() if none. + size_t output_in_progress = numeric_limits::max(); + + // Now members we need for provenance tracking + + /// Represents a flag vector over positions via a sorted interval list. + /// Allows setting flags in a range. + struct PaintableSpace { + /// Mark a range as painted + void paint(size_t start, size_t length); + /// Check if any position in the given range is painted + bool is_any_painted(size_t start, size_t length) const; + + /// Store start position and length for all painted intervals. + std::map regions; + }; + + /// Represents an Item whose provenance we track + struct Item { + size_t group_size = 0; + double score = 0; + /// Is this item tagged with a state, or a descendant of a tagged item? + State tag = State::NONE; + /// If the item is tagged, over what interval is it tagged? + /// When projecting, intervals are combined by min/maxing the bounds. + size_t tag_start = std::numeric_limits::max(); + size_t tag_length = 0; + /// What previous stage items were combined to make this one, if any? + vector prev_stage_items = {}; + /// And what items from stages before that? Recorded as (stage offset, + /// item number) pairs; all the offsets will be >=2. + vector> earlier_stage_items = {}; + /// What filters did the item pass at this stage, if any? + vector passed_filters = {}; + /// And what statistics did they have (or NaN)? + vector passed_statistics = {}; + /// What filter did the item finally fail at at this stage, if any? + const char* failed_filter = nullptr; + /// And what statistic did it fail with (or NaN)? + double failed_statistic = nan(""); + }; + + /// Represents a Stage which is a series of Items, which track their own provenance. + struct Stage { + string name; + vector items; + /// How long did the stage last, in seconds? + float duration; + /// How many of the items were actually projected? + /// Needed because items may need to expand to hold information for items that have not been projected yet. + size_t projected_count = 0; + /// What's the best tag of anything at this stage? + State tag = State::NONE; + /// Where are tags applied? + PaintableSpace tag_space; + }; + + /// Ensure an item with the given index exists in the current stage and return a reference to it. + /// We need to do it this way because we might save a production duration before an item is really projected. + /// The items of the current stage should only be modified through this. + /// Note that you do *not* need to create an item in order to get it. + Item& get_item(size_t index); + + /// Create a new item in the current stage and get its index. + /// Advances the projected count counter. + size_t create_item(); + + /// Rercord all the stages, including their names and item provenance. + /// Handles repeated stages. 
+ vector stages; +}; + +inline std::ostream& operator<<(std::ostream& out, const Funnel::State& state) { + switch (state) { + case Funnel::State::NONE: + return out << "NONE"; + case Funnel::State::PLACED: + return out << "PLACED"; + case Funnel::State::CORRECT: + return out << "CORRECT"; + default: + return out << "UNKNOWN"; + } +} + +template +void Funnel::merge_group(Iterator prev_stage_items_begin, Iterator prev_stage_items_end) { + // Do a non-group merge + merge(prev_stage_items_begin, prev_stage_items_end); + + // Find it + size_t index = latest(); + + // Update its size + get_item(index).group_size = get_item(index).prev_stage_items.size(); +} + +template +void Funnel::merge_groups(Iterator prev_stage_items_begin, Iterator prev_stage_items_end) { + // Do a non-group merge + merge(prev_stage_items_begin, prev_stage_items_end); + + // Find it + size_t index = latest(); + + // Compute the total size it should have + auto& prev_stage = stages[stages.size() - 2]; + size_t total_size = 0; + for (auto& prev : get_item(index).prev_stage_items) { + size_t prev_group_size = prev_stage.items[prev].group_size; + if (prev_group_size == 0) { + // Non-groups count as size 1 + prev_group_size = 1; + } + total_size += prev_group_size; + } + + // Update its size + get_item(index).group_size = total_size; +} + +template +void Funnel::merge(Iterator prev_stage_items_begin, Iterator prev_stage_items_end) { + // There must be a prev stage to merge from + assert(stages.size() > 1); + auto& prev_stage = stages[stages.size() - 2]; + + // Make a new item to combine all the given items. + size_t index = create_item(); + + for (Iterator& it = prev_stage_items_begin; it != prev_stage_items_end; ++it) { + // For each prev stage item + size_t prev_stage_item = *it; + + // Make sure it existed + assert(prev_stage.items.size() > prev_stage_item); + + // Record the dependency + get_item(index).prev_stage_items.push_back(prev_stage_item); + + // Propagate tags + auto& old = prev_stage.items[prev_stage_item]; + if (old.tag != State::NONE) { + // Tag the new item if it came from something tagged. + tag(index, old.tag, old.tag_start, old.tag_length); + } + } +} + +template +void Funnel::also_merge_group(Iterator prev_stage_items_begin, Iterator prev_stage_items_end) { + also_merge_group(1, prev_stage_items_begin, prev_stage_items_end); +} + +template +void Funnel::also_merge_group(size_t earlier_stage_lookback, Iterator earlier_stage_items_begin, Iterator earlier_stage_items_end) { + assert(earlier_stage_lookback > 0); + assert(stages.size() > earlier_stage_lookback); + auto& earlier_stage = stages[stages.size() - 1 - earlier_stage_lookback]; + auto& item = get_item(latest()); + + for (Iterator& it = earlier_stage_items_begin; it != earlier_stage_items_end; ++it) { + // For each earlier stage item + size_t earlier_stage_item = *it; + + // Make sure it existed + assert(earlier_stage.items.size() > earlier_stage_item); + + // Record the dependency + if (earlier_stage_lookback == 1) { + // References to the immediately preceeding stage are special + item.prev_stage_items.push_back(earlier_stage_item); + } else { + // References to earlier stages include the stage offset back + item.earlier_stage_items.emplace_back(earlier_stage_lookback, earlier_stage_item); + } + + // Increase group size + item.group_size += 1; + + // Propagate tags + auto& old = earlier_stage.items[earlier_stage_item]; + if (old.tag != State::NONE) { + // Tag the new item if it came from something tagged. 
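For completeness, a small hedged sketch of the merge path implemented here (the item indexes are invented):

```
#include "funnel.hpp"
#include <vector>

// Several previous-stage items collapse into one current-stage group.
void merge_sketch(vg::Funnel& funnel) {
    std::vector<size_t> to_merge {0, 2, 5};
    funnel.merge_group(to_merge.begin(), to_merge.end());
    // The new item has group size 3, and any correctness/placement tags on
    // items 0, 2, or 5 in the previous stage carry over to it.
}
```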
+ tag(latest(), old.tag, old.tag_start, old.tag_length); + } + } +} + +} + +#endif diff --git a/src/gam_index.cpp b/src/gam_index.cpp deleted file mode 100644 index c7aad4c445b..00000000000 --- a/src/gam_index.cpp +++ /dev/null @@ -1,800 +0,0 @@ -#include "gam_index.hpp" - -#include - -namespace vg { - -using namespace std; - -auto GAMIndex::bins_of_id(id_t id) -> vector { - vector to_return; - - // How many bits are in the number? We get a bin per bit. - auto number_bits = CHAR_BIT * sizeof(bin_t); - - // We need to keep in mind that shifting *all* the bits out of a number in - // one go is undefined behavior. - - // Bin number consists of an index which is a prefix of the number - bin_t bin_index = ((bin_t)id) >> 1; - // And an offset to keep from colliding with other bins. - bin_t bin_offset = ((bin_t)~0) >> 1; - - for (int i = 0; i < number_bits; i++) { - -#ifdef debug - cerr << hex << id << " " << i << " " << bin_index << " " << bin_offset << " " << (bin_index + bin_offset) << endl; -#endif - - to_return.push_back(bin_index + bin_offset); - bin_index = bin_index >> 1; - bin_offset = bin_offset >> 1; - } - - return to_return; -} - -auto GAMIndex::bins_of_range(id_t min_id, id_t max_id) -> vector { - // We can just get the two bin vectors for the ending bin, and generate an inclusive range of bins at each level. - - vector to_return; - - auto min_bins = bins_of_id(min_id); - auto max_bins = bins_of_id(max_id); - - assert(min_bins.size() == max_bins.size()); - - for (size_t i = 0; i < min_bins.size(); i++) { - // For each specificity level - for (bin_t bin = min_bins[i]; bin != max_bins[i] + 1; bin++) { - // For each bin in the inclusive range, emit it - to_return.push_back(bin); - } - } - - return to_return; - -} - -auto GAMIndex::common_bin(id_t a, id_t b) -> bin_t { - // Convert to unsigned numbers - bin_t a_bin = a; - bin_t b_bin = b; - - // Define the offset for the bin - bin_t offset = ((bin_t)~0); - - // We're just going to pop off bits until we find the common prefix. - // Always pop off one bit, even if we are binning a number and itself. - // TODO: Find a faster way to do this with the appropriate instruction intrinsics. - do { - a_bin = a_bin >> 1; - b_bin = b_bin >> 1; - offset = offset >> 1; - } while(a_bin != b_bin); - return a_bin + offset; -} - -auto GAMIndex::window_of_id(id_t id) -> window_t { - return id >> WINDOW_SHIFT; -} - -auto GAMIndex::add_group(id_t min_id, id_t max_id, int64_t virtual_start, int64_t virtual_past_end) -> void { - - if (min_id < last_group_min_id) { - // Someone is trying to index an unsorted GAM. - // This is probably user error, so complain appropriately: - cerr << "error [vg::GAMIndex]: GAM data being indexed is not sorted. Sort with vg gamsort." << endl; - exit(1); - } - last_group_min_id = min_id; - - // Find the bin for the run - bin_t bin = common_bin(min_id, max_id); - -#ifdef debug - cerr << "Group spanning " << min_id << "-" << max_id << " at " - << virtual_start << "-" << virtual_past_end << " lands in bin " << bin << endl; -#endif - - // Find the existing ranges in the bin. - // We know the previous one, if present, must end at or before this one's start. - auto& ranges = bin_to_ranges[bin]; - - if (!ranges.empty() && ranges.back().second == virtual_start) { - // We fit right after the last range. 
- ranges.back().second = virtual_past_end; -#ifdef debug - cerr << "Extend existing range to " << ranges.back().first << "-" << ranges.back().second << endl; -#endif - } else { - // We need a new range - bin_to_ranges[bin].emplace_back(virtual_start, virtual_past_end); - } - - for (window_t w = window_of_id(min_id); w <= window_of_id(max_id); w++) { - // For each window that this group overlaps - - if (!window_to_start.count(w)) { - // If it is the first group we encounter in the window, it must also be the earliest-staring group in the window. - - // This is the earliest virtual offset to overlap that window - window_to_start[w] = virtual_start; - -#ifdef debug - cerr << "Start window " << w << endl; -#endif - } - } -} - -auto GAMIndex::find(id_t node_id) const -> vector> { - vector> to_return; - - find(node_id, [&](int64_t run_start, int64_t run_past_end) -> bool { - // For each run we find, remember it - to_return.emplace_back(run_start, run_past_end); - // Keep getting runs until we run out in the index. We can't actually scan the data. - return true; - }); - - return to_return; -} - -auto GAMIndex::find(id_t node_id, const function scan_callback) const -> void { - // Look for a single-node inclusive range - find(node_id, node_id, std::move(scan_callback)); -} - -auto GAMIndex::find(id_t min_node, id_t max_node, const function scan_callback) const -> void { - -#ifdef debug - cerr << "Query for node range " << min_node << "-" << max_node << endl; -#endif - - // Find the window that gives us a lower bound on the virtual offset we - // need to be at to find things that touch this node ID. - window_t min_window = window_of_id(min_node); - window_t max_window = window_of_id(max_node); - -#ifdef debug - cerr << "Looking for first filled window of " << min_window << "-" << max_window << endl; -#endif - - // Find the minimum virtual offset we need to consider - int64_t min_vo; - // It will be for the first occupied window at or after the min window but not greater than the max window. - auto found = window_to_start.lower_bound(min_window); - if (found != window_to_start.end() && found->first <= max_window) { - // Some groups overlapped this window, and they started here. - min_vo = found->second; - -#ifdef debug - cerr << "First occupied window is " << found->first << " at offset " << min_vo << endl; -#endif - } else { - // No groups overlapped any window within the range, so don't iterate anything. - -#ifdef debug - cerr << "No windows occupied; range is empty" << endl; -#endif - - return; - } - - // Find the bins that any of the nodes in the range can be in - auto bin_numbers = bins_of_range(min_node, max_node); - - // Filter down to bins that actually have vectors in the index - vector used_bins; - for (auto& bin_number : bin_numbers) { - auto found = bin_to_ranges.find(bin_number); - - if (found != bin_to_ranges.end()) { - used_bins.push_back(found); - } - } - - // Set up a cursor in each bin - // TODO: Could we do one cursor per specificity level instead? This way might be introducing some n^2 stuff in the range length. - vector>::const_iterator> cursors; - for (auto& bin : used_bins) { - cursors.push_back(bin->second.begin()); -#ifdef debug - cerr << "Bin " << bin->first << " overlaps the query and is nonempty" << endl; -#endif - } - - while (true) { - // Loop until the user asks us to stop or we run out of things to give them. 
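The linear-index lookup above only needs the first occupied window at or after the query's minimum window; every group before that virtual offset can be skipped outright. A small self-contained sketch of that lower-bound probe, with `window_to_start` as a plain `std::map` and made-up window and offset values:

```
#include <cstdint>
#include <iostream>
#include <map>

int main() {
    // Linear index: window number -> earliest virtual offset of any group
    // overlapping that window (toy values).
    std::map<std::uint64_t, std::int64_t> window_to_start = {
        {2, 1000}, {3, 2500}, {7, 9000}
    };

    std::uint64_t min_window = 3, max_window = 6;  // query covers windows 3..6

    auto found = window_to_start.lower_bound(min_window);
    if (found != window_to_start.end() && found->first <= max_window) {
        // Some group overlaps a window in range; scanning can start here.
        std::cout << "start scanning at virtual offset " << found->second << "\n";
    } else {
        // No occupied window in the range: the query is empty.
        std::cout << "nothing to scan\n";
    }
}
```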
- -#ifdef debug - cerr << "Find earliest-starting run in any bin ending after " << min_vo << endl; -#endif - - // This tracks which of the cursors points to the run that starts earliest, or the max value if no candidate runs exist. - size_t starts_earliest = numeric_limits::max(); - - for(size_t i = 0; i < used_bins.size(); i++) { - // Advance each cursor to the earliest-starting window that ends after the min_vo, by linear scan - auto& bin_ranges = used_bins[i]->second; - auto& cursor = cursors[i]; - - while (cursor != bin_ranges.end() && cursor->second <= min_vo) { - // This run ends too early, so keep advancing. - ++cursor; - } - - if (cursor != bin_ranges.end()) { - // We actually have a candidate run - if (starts_earliest == numeric_limits::max() || cursor->first < cursors[starts_earliest]->first) { - // This candidate run starts earlier than the earliest candidate run from other bins. - - // Rememebr it. - starts_earliest = i; - } - - } - } - - if (starts_earliest == numeric_limits::max()) { - // We are all out of runs in any of the bins. We are done! -#ifdef debug - cerr << "Out of runs in bins" << endl; -#endif - return; - } - -#ifdef debug - cerr << "Found run " << cursors[starts_earliest]->first << "-" << cursors[starts_earliest]->second - << " from bin " << used_bins[starts_earliest]->first << endl; -#endif - - // Call the callback with the range max(min_vo, that run's start) to that run's end. - bool keep_going = scan_callback(max(min_vo, cursors[starts_earliest]->first), cursors[starts_earliest]->second); - - if (!keep_going) { - // The user is done with runs. They must have found a group that has an out-of-range minimum node ID. - // We are done! - return; - } - - // Look for the next run continuing after here. - min_vo = cursors[starts_earliest]->second; - } -} - -auto GAMIndex::add_group(const vector& alns, int64_t virtual_start, int64_t virtual_past_end) -> void { - // Find the min and max ID visited by any of the alignments - id_t min_id = numeric_limits::max(); - id_t max_id = numeric_limits::min(); - - for (auto& aln : alns) { - // For each alignment - if (aln.path().mapping_size() == 0) { - // The read is unmapped, so it belongs to node ID 0 - min_id = min(min_id, (id_t)0); - max_id = max(max_id, (id_t)0); - } else { - for (auto& mapping : aln.path().mapping()) { - // For each mapping in it, min/max in the ID - auto id = mapping.position().node_id(); - min_id = min(min_id, id); - max_id = max(max_id, id); - } - } - } - - add_group(min_id, max_id, virtual_start, virtual_past_end); -} - -auto GAMIndex::index(cursor_t& cursor) -> void { - // Keep track of what group we are in - int64_t group_vo = cursor.tell_group(); - // And load all its alignments - vector group; - - // We need to have seek support - assert(group_vo != -1); - - while (cursor.has_next()) { - // For each alignment - - // Work out what group it is in - int64_t alignment_group_vo = cursor.tell_group(); - - if (alignment_group_vo != group_vo) { - // This is the start of a new group - - // Record the old group as being up to here - add_group(group, group_vo, alignment_group_vo); - - // Set up for the new group - group.clear(); - group_vo = alignment_group_vo; - } - - // Add the alignment to the group and move on - group.emplace_back(std::move(cursor.take())); - } - - if (!group.empty()) { - // Record the final group. Use wherever the cursor landed at the end as its final virtual offset. 
- add_group(group, group_vo, cursor.tell_raw()); - } -} - -auto GAMIndex::find(cursor_t& cursor, id_t min_node, id_t max_node, - const function handle_result) const -> void { - - find(cursor, vector>{{min_node, max_node}}, handle_result); - -} - -/// Return true if the given ID is in any of the sorted, coalesced, inclusive ranges in the vector, and false otherwise. -/// TODO: Is repeated binary search on the ranges going to be better than an unordered_set of all the individual IDs? -static bool is_in_range(const vector>& ranges, id_t id) { - // Use a binary search - size_t left = 0; - size_t past_right = ranges.size(); - - while (past_right >= left + 1) { - // We have a nonempty interval - - // Find the middle - size_t center = (left + past_right) / 2; - assert(center < ranges.size()); - - // Look at the range there - auto& range = ranges[center]; - - if (id < range.first) { - // If we're before it, go left - past_right = center; - } else if (id > range.second) { - // If we're after it, go right - left = center + 1; - } else { - // If we're in it, return true - return true; - } - } - - // If we get here, it wasn't in any range - return false; - -} - -auto GAMIndex::find(cursor_t& cursor, const vector>& ranges, - const function handle_result, bool only_fully_contained) const -> void { - -#ifdef debug - cerr << "Begin a find query on ranges:" << endl; - for (auto& range : ranges) { - cerr << "\t" << range.first << "-" << range.second << endl; - } -#endif - - // We need seek support - assert(cursor.tell_raw() != -1); - - // Because a node in a later range may appear earlier in the file than a - // node in an earlier range (but in a high-in-the-hierarchy bin), in - // general we need to jump around in the file. TODO: Use a processed_up_to - // counter to constrain us to one sweep in the only_fully_contained case. - - // To prevent us from scanning groups multiple times over, we keep a map - // from already-processed group start VO to the VO of the next group (or - // EOF). We can ride down chains in this map whenever we hit somewhere we - // have already been, instead of actually re-reading anything. - unordered_map next_unprocessed; - - // We access it with this accessor function. It returns the given address - // if the group there has not been read, or the next unprocessed VO (or EOF - // VO) if it has. - auto get_next_unprocessed = [&](int64_t currently_at) { - // If we have to chain through multiple VOs to find the final one, we store them here. - vector chain; - -#ifdef debug - cerr << "Find next unprocessed group after " << currently_at << endl; -#endif - - auto found = next_unprocessed.find(currently_at); - while(found != next_unprocessed.end()) { - // We have a place to go. - - // Remember this place as a place that needs to go to the final place we find. - chain.push_back(currently_at); - -#ifdef debug - cerr << currently_at << " chains to " << found->second << endl; -#endif - - // Advance to the place we found. - currently_at = found->second; - found = next_unprocessed.find(currently_at); - } - - // Now we hit the end. 
Save the final answer back to the map for - // everything but the last item, so we never need to scan it again - for (size_t i = 0; i + 1 < chain.size(); i++) { - next_unprocessed[chain[i]] = currently_at; - } - -#ifdef debug - cerr << "It is " << currently_at << endl; -#endif - - return currently_at; - }; - - // And this accessor marks a group as processed - auto mark_processed = [&](int64_t start_vo, int64_t past_end_vo) { - -#ifdef debug - cerr << "Mark group " << start_vo << " to " << past_end_vo << " as processed" << endl; -#endif - - next_unprocessed[start_vo] = past_end_vo; - }; - - for (auto& range : ranges) { - // For each range of IDs to look up - -#ifdef debug - cerr << "Look up range " << range.first << "-" << range.second << endl; -#endif - - find(range.first, range.second, [&](int64_t start_vo, int64_t past_end_vo) -> bool { - // For each matching range of virtual offsets in the index - -#ifdef debug - cerr << "Look at VOs " << start_vo << "-" << past_end_vo << endl; -#endif - - // Warp the start past any already-processed groups we know about - start_vo = get_next_unprocessed(start_vo); - if (start_vo >= past_end_vo) { - // Skip this whole range and look at the next one - -#ifdef debug - cerr << "The VO range has already been processed." << endl; -#endif - - return true; - } - - // Now the range starts with a group we have never seen before. - - // Seek the cursor, even if we are already at the group in question. - // TODO: We don't have a good way to tell if we are at the beginning of a group or not. - -#ifdef debug - cerr << "Seek cursor to " << start_vo << endl; -#endif - - cursor.seek_group(start_vo); - - // We need to track each group we encounter, so we can tell when an - // entire group is past the top end of the ID range we are - // currently looking up. - int64_t group_vo = cursor.tell_group(); - id_t group_min_id = numeric_limits::max(); - while (cursor.has_next() && cursor.tell_group() < past_end_vo) { - // Read each alignment until we find a group that starts out of range - - // Which group is this alignment in? - auto alignment_group_vo = cursor.tell_group(); - - if (alignment_group_vo != group_vo) { - // We finished the previous group. - -#ifdef debug - cerr << "Finished group " << group_vo << endl; -#endif - - // Record the group as processed - mark_processed(group_vo, alignment_group_vo); - - if (group_min_id != numeric_limits::max() && group_min_id > range.second) { - // Everything in the (non-empty) previous group was too high. We don't care about this group; our iteration is over. - -#ifdef debug - cerr << "Group was out of bounds for its range with min id " << group_min_id << " > " << range.second << endl; - cerr << "Move on to next range" << endl; -#endif - - // Stop early. Don't finish this run and don't look at the next runs for this query range. - return false; - } - - // Otherwise we need to start a new group - group_min_id = numeric_limits::max(); - - // Zip the group VO ahead to the next unprocessed group (which may be here, or at EOF) - group_vo = get_next_unprocessed(alignment_group_vo); - if (group_vo != alignment_group_vo) { - // We want to go to a different group next. - if (group_vo >= past_end_vo) { - // But it's out of range for this range. Don't go there. -#ifdef debug - cerr << "Next unprocessed VO is out of range" << endl; -#endif - break; - } else { - // Seek there and restart the loop to see if we found anything good. 
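To avoid rescanning groups, the code above records, for every group it has finished, where the next unread group begins, and it collapses chains of such records whenever it follows them, much like path compression. A minimal sketch of that idea on plain integer offsets (hypothetical values, standing in for GAM virtual offsets):

```
#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <vector>

// Maps a processed region's start to the start of the next unprocessed region.
std::unordered_map<std::int64_t, std::int64_t> next_unprocessed;

// Follow the chain of processed regions, then collapse it so later lookups
// jump straight to the end (path-compression style).
std::int64_t get_next_unprocessed(std::int64_t at) {
    std::vector<std::int64_t> chain;
    auto found = next_unprocessed.find(at);
    while (found != next_unprocessed.end()) {
        chain.push_back(at);
        at = found->second;
        found = next_unprocessed.find(at);
    }
    for (std::int64_t link : chain) {
        next_unprocessed[link] = at;
    }
    return at;
}

void mark_processed(std::int64_t start, std::int64_t past_end) {
    next_unprocessed[start] = past_end;
}

int main() {
    mark_processed(0, 100);
    mark_processed(100, 250);
    mark_processed(250, 400);
    // One query jumps over all three processed regions.
    std::cout << get_next_unprocessed(0) << "\n";  // prints 400
}
```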
-#ifdef debug - cerr << "Seek to next unprocessed VO at " << group_vo << endl; -#endif - cursor.seek_group(group_vo); - continue; - } - } else { - // Otherwise, we are continuing with this group we just found. -#ifdef debug - cerr << "Next unprocessed VO is right here." << endl; -#endif - } - } - - // Filter the alignment by the query and yield it if it matches - const auto& alignment = *cursor; - bool alignment_match = false; - - if (alignment.path().mapping_size() == 0) { - // This read is unmapped, so count it as node 0. - group_min_id = min(group_min_id, (id_t)0); - if (is_in_range(ranges, 0)) { - // We want unmapped reads. - alignment_match = true; - } - } else { - // The read has mappings - for (const auto& mapping : alignment.path().mapping()) { - // Look at each node that is visited - auto visited = mapping.position().node_id(); - group_min_id = min(group_min_id, visited); - if (is_in_range(ranges, visited)) { - // We want this node. - alignment_match = true; - if (!only_fully_contained) { - // All we care about is that any of the nodes match - break; - } - } else if (only_fully_contained) { - // We need *all* of the nodes to match, and this one didn't. - alignment_match = false; - break; - } - } - } - - if (alignment_match) { - // This alignment is one that matches the query. Yield it. - handle_result(alignment); - } - - // Look for the next alignment - cursor.get_next(); - - } - - if (group_vo < past_end_vo) { - // We finished a final group, from group_vo to past_end_vo -#ifdef debug - cerr << "Finished last group " << group_vo << endl; -#endif - - // Mark it finished - mark_processed(group_vo, past_end_vo); - - if (group_min_id != numeric_limits::max() && group_min_id > range.second) { - // If the (non-empty) last group had all its node IDs past the max - // node ID, we know nothing after it can possibly match, so stop - // iteration. - -#ifdef debug - cerr << "Group was out of bounds with min id " << group_min_id << " > " << range.second << endl; - cerr << "Move on to next range" << endl; -#endif - - return false; - } - } - - // Otherwise, the last group we looked at was not yet out of bounds, so get another range to look at, if one exists. - return true; - }); - - } -} - -auto GAMIndex::find(cursor_t& cursor, id_t node_id, const function handle_result) const -> void { - find(cursor, node_id, node_id, std::move(handle_result)); -} - -const string GAMIndex::MAGIC_BYTES = "GAI!"; - -auto GAMIndex::save(ostream& to) const -> void { - // We aren't going to save as Protobuf messages; we're going to save as a bunch of varints. - - // Format is - // Magic bytes - // Index version (varint32) - // Bin count (varint64) - // For each bin: - // Bin number (varint64) - // Run count (varint64) - // For each run: - // Start (varint64) - // Past-end (varint64) - // And then window count (varint64) - // And for each window: - // Window number (varint64) - // Window start (varint64) - - // All the integers are Protobuf variable-length values. - // The result is gzip-compressed. 
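Concretely, an index holding a single bin with one run and one window entry would serialize (before gzip compression) as the following field sequence, everything after the magic written as a Protobuf varint; the numbers here are made up for illustration:

```
"GAI!"    magic bytes
1         index version (varint32)
1         bin count
5         bin number
1         run count for bin 5
1000      run start (virtual offset)
2500      run past-end (virtual offset)
1         window count
3         window number
40000     window start (virtual offset)
```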
- - ::google::protobuf::io::OstreamOutputStream raw_out(&to); - ::google::protobuf::io::GzipOutputStream gzip_out(&raw_out); - ::google::protobuf::io::CodedOutputStream coded_out(&gzip_out); - - // Save the magic bytes - coded_out.WriteRaw((void*)MAGIC_BYTES.c_str(), MAGIC_BYTES.size()); - - // Save the version - coded_out.WriteVarint32(OUTPUT_VERSION); - - // Save the bin count - coded_out.WriteVarint64(bin_to_ranges.size()); - for (auto& kv : bin_to_ranges) { - // For each bin, save the number - coded_out.WriteVarint64(kv.first); - // And the number of runs - coded_out.WriteVarint64(kv.second.size()); - - for (auto& run : kv.second) { - // For each run, write the VO range - coded_out.WriteVarint64(run.first); - coded_out.WriteVarint64(run.second); - } - } - - // Save the window count - coded_out.WriteVarint64(window_to_start.size()); - for (auto& kv : window_to_start) { - // Save each window's number and start - coded_out.WriteVarint64(kv.first); - coded_out.WriteVarint64(kv.second); - } - -} - -auto GAMIndex::load(istream& from) -> void { - - ::google::protobuf::io::IstreamInputStream raw_in(&from); - ::google::protobuf::io::GzipInputStream gzip_in(&raw_in); - - - bin_to_ranges.clear(); - window_to_start.clear(); - - // Define an error handling function - auto handle = [](bool ok) { - if (!ok) throw std::runtime_error("GAMIndex::load detected corrupt index file"); - }; - - // Look for the magic value - - // First read a bit of data - char* buffer; - int buffer_size = 0; - while (buffer_size == 0) { - // We must retry until we get some data, accoridng to the ZeroCopyInputStream spec - handle(gzip_in.Next((const void**)&buffer, &buffer_size)); - } - - // TODO: In theory, we might have arbitrarily small buffers given to us. - // We assume that the buffers are always big enough to actually peek the magic value and back up. - assert(buffer_size >= MAGIC_BYTES.size()); - - // We will fill this in with the version if we find it - uint32_t input_version = 0; - - // Check to see if the magic bytes are there - if (std::equal(MAGIC_BYTES.begin(), MAGIC_BYTES.end(), buffer)) { - // We found the magic bytes! We know this is a versioned GAM index file. - - // Roll back to just after them - gzip_in.BackUp(buffer_size - MAGIC_BYTES.size()); - - // Read the input version - { - ::google::protobuf::io::CodedInputStream coded_in(&gzip_in); - handle(coded_in.ReadVarint32(&input_version)); - } - } else { - // No magic bytes means input version 0 - // Roll back everything - gzip_in.BackUp(buffer_size); - } - - if (input_version > MAX_INPUT_VERSION) { - throw std::runtime_error("GAMIndex::load can understand only up to index version " + to_string(MAX_INPUT_VERSION) + - " and file is version " + to_string(input_version)); - } - - switch (input_version) { - case 0: - case 1: - // Read the number of bins that are used - uint64_t bin_count; - { - // TODO: To avoid hitting the coded input stream's byte limit (why is - // it even at this level?) we destory and recreate it for every - // semantic group. 
- ::google::protobuf::io::CodedInputStream coded_in(&gzip_in); - handle(coded_in.ReadVarint64(&bin_count)); - } - - for (size_t i = 0; i < bin_count; i++) { - // Read the bin number and run count for each bin - uint64_t bin_number; - uint64_t run_count; - { - ::google::protobuf::io::CodedInputStream coded_in(&gzip_in); - handle(coded_in.ReadVarint64(&bin_number)); - handle(coded_in.ReadVarint64(&run_count)); - } - - // Create the empty bin - auto& runs = bin_to_ranges[bin_number]; - - for (size_t j = 0; j < run_count; j++) { - // Load each run - uint64_t run_start; - uint64_t run_end; - - { - ::google::protobuf::io::CodedInputStream coded_in(&gzip_in); - handle(coded_in.ReadVarint64(&run_start)); - handle(coded_in.ReadVarint64(&run_end)); - } - - runs.emplace_back(run_start, run_end); - - } - - } - - - // Now count the number of windows - uint64_t window_count; - { - ::google::protobuf::io::CodedInputStream coded_in(&gzip_in); - handle(coded_in.ReadVarint64(&window_count)); - } - - for (size_t i = 0; i < window_count; i++) { - // Load each window - uint64_t window_number; - uint64_t window_start; - - { - ::google::protobuf::io::CodedInputStream coded_in(&gzip_in); - handle(coded_in.ReadVarint64(&window_number)); - handle(coded_in.ReadVarint64(&window_start)); - } - - window_to_start[window_number] = window_start; - } - break; - default: - throw std::runtime_error("Unimplemented GAM index version " + to_string(input_version)); - } - -} - -} diff --git a/src/gam_index.hpp b/src/gam_index.hpp deleted file mode 100644 index 805ea025f6d..00000000000 --- a/src/gam_index.hpp +++ /dev/null @@ -1,212 +0,0 @@ -#ifndef VG_GAM_INDEX_HPP_INCLUDED -#define VG_GAM_INDEX_HPP_INCLUDED - -/** - * \file gam_index.hpp - * Contains the GAMIndex class, which allows retrieving reads from a (blocked) BAM file for certain queries. - */ - -#include -#include -#include -#include -#include - -#include "types.hpp" -#include "vg.pb.h" -#include "stream.hpp" - -namespace vg { - -using namespace std; - -/** - * - * An index for a node-ID-sorted GAM file. Reads are sorted by lowest visited - * node ID, then by highest visited node ID. - * - * Works on a BAI-like concept of bins partitioning node ID space. - * - * GAM files are serialized as count-prefixed groups of reads, which are the - * smallest unit that can be deserialized. - * - * Every *group* of reads gets assigned to a bin which is the longest bin that - * completely contains the ID range used in the group. - * - * We define *runs* of adjacent groups of reads which have the same bin, which - * are the basic subject of the index. - * - * We then store an index from bin to the virtual offset ranges (start and - * past-the-end), in order, of runs that are assigned to the bin. - * - * You will get non-contiguous virtual offset ranges for a node ID range when - * some reads run into the range from the left, then reads that start later - * don't, and then reads that start even later do again. - * - * We also have a BAI-style linear index, mapping from tiling windows in node - * ID space to the lowest virtual offset of a group that overlaps the window. - * - * The bin structure is that we partition all of node ID space into bins of - * power-of-2 size, starting with size 2 nodes. We number the bins such that 0 - * is the whole-ID-space bin, divided into 1 and 2, then into 3, 4, 5, and 6, - * and so on. - * - * The tiling windows are just the node IDs down-shifted by a few bits. - * - * Unmapped reads are considered to visit node ID 0. 
The maximum and minimum - * id_t values are used as sentinels, so they can't be real nodes. - * - * All find operations are thread-safe with respect to each other. Simultaneous - * adds or finds and ads are prohibited. - * - */ -class GAMIndex { -public: - GAMIndex() = default; - - // Methods that actually go get reads for you are going to need a cursor on an open, seekable GAM file. - using cursor_t = stream::ProtobufIterator; - - // Bins are identified of unsigned integers of the same width as node IDs. - using bin_t = make_unsigned::type; - - // So are windows, but we give them their own semantic type - using window_t = make_unsigned::type; - - /// Load a GAMIndex from a file. - /// File holds the index, not the GAM. - void load(istream& from); - - /// Save a GAMIndex to a file. - void save(ostream& to) const; - - // Like the XG we support versioning. - - /// What's the maximum GAM index version number we can read with this code? - const static uint32_t MAX_INPUT_VERSION = 1; - /// What's the version we serialize? - const static uint32_t OUTPUT_VERSION = 1; - /// What magic value do we embed in the compressed gam index data? - const static string MAGIC_BYTES; - - - /////////////////// - // Top-level Alignment-based interface - /////////////////// - - /// Call the given callback with all Alignments in the index that visit the given node. - void find(cursor_t& cursor, id_t node_id, const function handle_result) const; - - /// Call the given callback with all Alignments in the index that visit a node in the given inclusive range. - void find(cursor_t& cursor, id_t min_node, id_t max_node, const function handle_result) const; - - /// Call the given callback with all the Alignments in the index that visit - /// a node in any of the given sorted, coalesced inclusive ranges. - /// Emits each alignment at most once. - /// If only_fully_contained is set, only Alignments where *all* the mappings are to nodes in one of the ranges will match. - void find(cursor_t& cursor, const vector>& ranges, const function handle_result, - bool only_fully_contained = false) const; - - /// Given a cursor at the beginning of a sorted, readable file, index the file. - void index(cursor_t& cursor); - - /// Add a group articulated as a vector of alignments, between the given virtual offsets. - /// Must be called in virtual offset order for successive groups. - void add_group(const vector& alns, int64_t virtual_start, int64_t virtual_past_end); - - /////////////////// - // Lower-level virtual-offset-based interface - /////////////////// - - // Note that retrieving all the runs overlapping a node ID or node ID range - // isn't possible. We can use the index to look up addresses to start at, - // but the only way to know when to stop scanning groups is when you find a - // group in the file with a minimum node ID that is too large. Then you - // know to jump to the next start address. - - /// Find all the ranges of run virtual offsets from the first position that - /// might be relevant for the given node ID to the ends of all the bins it - /// is in. Trims ranges by the linear index on the low end, and returns a - /// series of potentially abutting but non-overlapping virtual offset - /// ranges. Does not stop early (because it has no access to the actual - /// reads to tell when it should stop looking at runs in a bin). So you - /// will get ranges covering all runs in a bin that follow the runs you are - /// interested in as well. 
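Putting the top-level interface above together, building and querying an index over a node-ID-sorted GAM could look roughly like the sketch below. The stripped template arguments are assumed to be `Alignment` (for `cursor_t`) and `void(const Alignment&)` (for the callback), and the file names are placeholders; treat this as an illustration of the declared API rather than a verified build.

```
#include <fstream>
#include <iostream>

#include "gam_index.hpp"

using namespace vg;

int main() {
    // Index a node-ID-sorted GAM (e.g. produced by vg gamsort).
    std::ifstream gam_in("reads.sorted.gam");
    GAMIndex::cursor_t cursor(gam_in);

    GAMIndex index;
    index.index(cursor);

    // Persist the index next to the GAM.
    std::ofstream index_out("reads.sorted.gam.gai");
    index.save(index_out);

    // Re-open the GAM and pull out every read touching nodes 100..200.
    std::ifstream query_in("reads.sorted.gam");
    GAMIndex::cursor_t query_cursor(query_in);
    size_t count = 0;
    index.find(query_cursor, 100, 200, [&](const Alignment& aln) {
        count++;
    });
    std::cout << count << " reads touch nodes 100-200" << std::endl;
}
```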
- vector> find(id_t node_id) const; - - /// Find all the ranges of run virtual offsets to check for reads visiting - /// the given node ID. Relies on a scanning callback, which will be called - /// repeatedly with the start and past-the-end virtual offsets of runs - /// which may contain groups touching the given node ID. When called, the - /// callback should scan the run and return either true if it wants the - /// next run, or false if it encountered a group with an out-of-range start - /// and wants to stop iteration. Runs will be emitted in order, and - /// truncated on the left to either the appropriate lower bound from the - /// linear index, or the past-the-end of the previous run scanned. - void find(id_t node_id, const function scan_callback) const; - - // Find all the ranges of run virtual offsets to check for reads visiting - /// the given inclusive node ID range. Relies on a scanning callback, which - /// will be called repeatedly with the start and past-the-end virtual - /// offsets of runs which may contain groups touching the given node ID. - /// When called, the callback should scan the run and return either true if - /// it wants the next run, or false if it encountered a group with an - /// out-of-range start and wants to stop iteration. Runs will be emitted in - /// order, and truncated on the left to either the appropriate lower bound - /// from the linear index, or the past-the-end of the previous run scanned. - void find(id_t min_node, id_t max_node, const function scan_callback) const; - - /// Add a group into the index, based on its minimum and maximum - /// (inclusive) used node IDs. Must be called for all groups in virtual - /// offset order. - void add_group(id_t min_id, id_t max_id, int64_t virtual_start, int64_t virtual_past_end); - - /////////////////// - // Lowest-level functions for thinking about bins and windows. - /////////////////// - - /// Compute the bins, from most to least specific, that a node ID occurs in. - static vector bins_of_id(id_t id); - - /// Compute the bins, from most to least specific, that any of the node IDs - /// in the given inclusive range occur in. There may be multiple bins at a - /// given level of specificity; they will appear in numerical order. - static vector bins_of_range(id_t min_id, id_t max_id); - - /// Get the most specific bin that contains both of the given node IDs. - static bin_t common_bin(id_t a, id_t b); - - /// Get the linear index window that the given node ID falls in. The window - /// range for a group is its min nodes' window through its max node's - /// window. - static window_t window_of_id(id_t id); - - -protected: - - // How many bits of a node ID do we truncate to get its linear index window? - const static size_t WINDOW_SHIFT = 8; - - /// Maps from bin number to all the ranges of virtual offsets, in order, for runs that land in the given bin. - /// A run lands in a bin if that bin is the most specific bin that includes both its lowest and highest nodes it uses. - unordered_map>> bin_to_ranges; - - /// Maps from linear index window to the virtual offset of the first group - /// that overlaps that window (taking the group as a min-to-max node - /// range). If you are looking for reads that visit a node, they can't - /// possibly occur in a group before the first offset stored for the node's - /// window (or any greater window). TODO: Should we make this a vector - /// instead and hope nobody uses high/sparse node IDs? - map window_to_start; - - /// What was the minimum node ID of the last group added? 
- /// If this isn't strictly increasing, we're trying to idnex data that is not sorted. - id_t last_group_min_id = numeric_limits::min(); - -}; - -} - -#endif - - diff --git a/src/gamsorter.cpp b/src/gamsorter.cpp deleted file mode 100644 index 83d289ea8ab..00000000000 --- a/src/gamsorter.cpp +++ /dev/null @@ -1,525 +0,0 @@ -#include "gamsorter.hpp" -#include "utility.hpp" -#include "json2pb.h" -#include "position.hpp" -#include "gam_index.hpp" - -#include -#include - -/** - * \file gamsorter.cpp - * GAMSorter: sort a gam by position and offset. - * Store unmapped reads at node 0. - */ - -using namespace std; -using namespace vg; - -GAMSorter::GAMSorter(bool show_progress) { - this->show_progress = show_progress; - - // We would like this many FDs max, if not limited below that. - max_fan_in = 2048; - // We need at least this many to sort practically. - int min_fan_in = 100; - - // We need this many extra FDs not used for fan-in - int extra_fds = 10; - - // Work out how many FDs we are allowed - struct rlimit fd_limit; - if (getrlimit(RLIMIT_NOFILE, &fd_limit) != 0) { - // We don't know; choose a conservative default. - max_fan_in = min_fan_in; - cerr << "warning:[vg gamsort]: Cannot determine file descriptor limits; using " << max_fan_in << " temp file fan-in" << endl; - } else { - // We read the limit - if (fd_limit.rlim_cur != RLIM_INFINITY && fd_limit.rlim_cur < max_fan_in + extra_fds) { - // Max out our FD limit - fd_limit.rlim_cur = min(max_fan_in + extra_fds, fd_limit.rlim_max); - - if (setrlimit(RLIMIT_NOFILE, &fd_limit) != 0) { - // We asked for a value in bound sso we should have succeeded - throw runtime_error("Error adjusting file descriptor limit to " + to_string(fd_limit.rlim_cur) - + " / " + to_string(fd_limit.rlim_max)); - } - } - - if (fd_limit.rlim_cur != RLIM_INFINITY && fd_limit.rlim_cur < max_fan_in + extra_fds) { - // We need to limit ourselves to under the max FD limit - if (fd_limit.rlim_cur < extra_fds + min_fan_in) { - // If we can't at least do a fan-in of 10 we have a big problem. - cerr << "error:[vg gamsort]: Open file limit very low (" << fd_limit.rlim_cur << "); we need " << (extra_fds + min_fan_in) << endl; - exit(1); - } - - // Set the max fan in to be subject to the limit - max_fan_in = min((size_t)(fd_limit.rlim_cur - extra_fds), max_fan_in); - } - } -} - -void GAMSorter::sort(vector& alns) const { - std::sort(alns.begin(), alns.end(), [&](const Alignment& a, const Alignment& b) { - return this->less_than(a, b); - }); -} - -void GAMSorter::dumb_sort(istream& gam_in, ostream& gam_out, GAMIndex* index_to) { - std::vector sort_buffer; - - stream::for_each(gam_in, [&](Alignment &aln) { - sort_buffer.push_back(aln); - }); - - this->sort(sort_buffer); - - // Write the output in non-enormous chunks, so indexing is actually useful - vector out_buffer; - - // Make an output emitter - stream::ProtobufEmitter emitter(gam_out); - - if (index_to != nullptr) { - emitter.on_group([&index_to](const vector& group, int64_t start_vo, int64_t past_end_vo) { - // Whenever it emits a group, index it. - // Make sure to only capture things that will outlive the emitter - index_to->add_group(group, start_vo, past_end_vo); - }); - } - - for (auto& aln : sort_buffer) { - // Feed in all the sorted alignments - emitter.write(std::move(aln)); - } - - // Emitter destruction will terminate the file with an EOF marker -} - - - -void GAMSorter::stream_sort(istream& gam_in, ostream& gam_out, GAMIndex* index_to) { - - // We want to work out the file size, if we can. 
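As a concrete instance of the descriptor-limit handling in the constructor above: with a typical soft limit of 1024 open files and a hard limit that will not go higher, the sorter cannot reach its preferred 2048-way fan-in plus 10 reserved descriptors, so it settles for min(1024 - 10, 2048) = 1014 temporary files per merge pass; only if fewer than 100 + 10 descriptors are available does it refuse to run.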
- size_t file_size = 0; - { - // Save our position - auto here = gam_in.tellg(); - // Go to the end - gam_in.seekg(0, gam_in.end); - // Get its position - auto there = gam_in.tellg(); - // Go back to where we were - gam_in.seekg(here); - - if (gam_in.good()) { - // We can seek in this stream. So how far until the end? - file_size = there - here; - } else { - // It's entirely possible that none of that worked. So clear the error flags and leave the size at 0. - gam_in.clear(); - } - } - - - // Don't give an actual 0 to the progress code or it will NaN - create_progress("break into sorted chunks", file_size == 0 ? 1 : file_size); - - // Eventually we put sorted chunks of data in temp files and put their names here - vector outstanding_temp_files; - - // This tracks the number of reads in each file, by file name - unordered_map reads_per_file; - // This tracks the total reads observed on input - size_t total_reads_read = 0; - - // This cursor will read in the input file. - cursor_t input_cursor(gam_in); - - #pragma omp parallel shared(gam_in, input_cursor, outstanding_temp_files, reads_per_file, total_reads_read) - { - - while(true) { - - vector thread_buffer; - - #pragma omp critical (input_cursor) - { - // Each thread fights for the file and the winner reads some data - size_t buffered_message_bytes = 0; - while (input_cursor.has_next() && buffered_message_bytes < max_buf_size) { - // Until we run out of input alignments or space, buffer each, recording its size. - buffered_message_bytes += input_cursor.get_item_size(); - thread_buffer.emplace_back(std::move(input_cursor.take())); - } - - // Update the progress bar - update_progress(gam_in.tellg()); - } - - if (thread_buffer.empty()) { - // No data was found - break; - } - - // Do a sort of the data we grabbed - this->sort(thread_buffer); - - // Save it to a temp file. - string temp_name = temp_file::create(); - ofstream temp_stream(temp_name); - // OK to save as one massive group here. - // TODO: This write could also be in a thread. - stream::write_buffered(temp_stream, thread_buffer, 0); - - #pragma omp critical (outstanding_temp_files) - { - // Remember the temp file name - outstanding_temp_files.push_back(temp_name); - // Remember the reads in the file, for progress purposes - reads_per_file[temp_name] = thread_buffer.size(); - // Remember how many reads we found in the total - total_reads_read += thread_buffer.size(); - } - } - } - - // Now we know the reader threads have taken care of the input, and all the data is in temp files. - - destroy_progress(); - - while (outstanding_temp_files.size() > max_fan_in) { - // We can't merge them all at once, so merge subsets of them. - outstanding_temp_files = streaming_merge(outstanding_temp_files, &reads_per_file); - } - - // Now we can merge (and maybe index) the final layer of the tree. - - // Open up cursors into all the files. - list temp_ifstreams; - list temp_cursors; - open_all(outstanding_temp_files, temp_ifstreams, temp_cursors); - - // Make an output emitter - emitter_t emitter(gam_out); - - if (index_to != nullptr) { - emitter.on_group([&index_to](const vector& group, int64_t start_vo, int64_t past_end_vo) { - // Whenever it emits a group, index it. 
- // Make sure to only capture things that will outlive the emitter - index_to->add_group(group, start_vo, past_end_vo); - }); - } - - // Merge the cursors into the emitter - streaming_merge(temp_cursors, emitter, total_reads_read); - - // Clean up - temp_cursors.clear(); - temp_ifstreams.clear(); - for (auto& filename : outstanding_temp_files) { - temp_file::remove(filename); - } - -} - -void GAMSorter::open_all(const vector& filenames, list& streams, list& cursors) { - // The open files need to live in a collection; the cursors don't own them. - // They also can't be allowed to move since we reference them. - // The cursors also need to live in a collection, because we don't want to be - // moving/copying them and their internal buffers and streams. - // And they can't move after creation either. - - // So everything lives in caller-passed lists. - - for (auto& filename : filenames) { - // Open each file - streams.emplace_back(); - streams.back().open(filename); - // Make a cursor for it - cursors.emplace_back(streams.back()); - } - -} - -void GAMSorter::streaming_merge(list& cursors, emitter_t& emitter, size_t expected_reads) { - - create_progress("merge " + to_string(cursors.size()) + " files", expected_reads == 0 ? 1 : expected_reads); - // Count the reads we actually see - size_t observed_reads = 0; - - // Put all the files in a priority queue based on which has an alignment that comes first. - // We work with pointers to cursors because we don't want to be copying the actual cursors around the heap. - // We also *reverse* the order, because priority queues put the "greatest" element forts - auto cursor_order = [&](cursor_t*& a, cursor_t*& b) { - if (b->has_next()) { - if(!a->has_next()) { - // Cursors that aren't empty come first - return true; - } - return less_than(*(*b), *(*a)); - } - return false; - }; - priority_queue, decltype(cursor_order)> cursor_queue(cursor_order); - - for (auto& cursor : cursors) { - // Put the cursor pointers in the queue - cursor_queue.push(&cursor); - } - - while(!cursor_queue.empty() && cursor_queue.top()->has_next()) { - // Until we have run out of data in all the temp files - - // Pop off the winning cursor - cursor_t* winner = cursor_queue.top(); - cursor_queue.pop(); - - // Grab and emit its alignment, and advance it - emitter.write(std::move(winner->take())); - - // Put it back in the heap if it is not depleted - if (winner->has_next()) { - cursor_queue.push(winner); - } - // TODO: Maybe keep it off the heap for the next loop somehow if it still wins - - observed_reads++; - if (expected_reads != 0) { - update_progress(observed_reads); - } - } - - // We finished the files, so say we're done. - // TODO: Should we warn/fail if we expected the wrong number of reads? - update_progress(expected_reads == 0 ? 1 : expected_reads); - destroy_progress(); - -} - -vector GAMSorter::streaming_merge(const vector& temp_files_in, unordered_map* reads_per_file) { - - // What are the names of the merged files we create? - vector temp_files_out; - - // We don't do this loop in parallel because the point of looping is to limit the total currently open files. - for (size_t start_file = 0; start_file < temp_files_in.size(); start_file += max_fan_in) { - // For each range of sufficiently few files, starting at start_file and running for file_count - size_t file_count = min(max_fan_in, temp_files_in.size() - start_file); - - // Open up cursors into all the files. 
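The merge above is a standard k-way merge: every non-exhausted cursor sits in a priority queue keyed by its current front element, with the comparison reversed because `std::priority_queue` surfaces the "greatest" element first. A minimal sketch of the same pattern over plain sorted vectors (toy data standing in for the GAM cursors):

```
#include <iostream>
#include <queue>
#include <vector>

struct Cursor {
    const std::vector<int>* data;
    size_t pos = 0;
    bool has_next() const { return pos < data->size(); }
    int peek() const { return (*data)[pos]; }
    int take() { return (*data)[pos++]; }
};

int main() {
    std::vector<int> a = {1, 4, 9}, b = {2, 3, 10}, c = {5, 6, 7};
    std::vector<Cursor> cursors = {{&a}, {&b}, {&c}};

    // Reverse the comparison: the queue pops its "greatest" element, so
    // "greater" must mean "comes later in the merged output".
    auto cursor_order = [](Cursor* x, Cursor* y) {
        if (!y->has_next()) return false;  // exhausted cursors sink
        if (!x->has_next()) return true;
        return y->peek() < x->peek();
    };
    std::priority_queue<Cursor*, std::vector<Cursor*>, decltype(cursor_order)> queue(cursor_order);
    for (auto& cursor : cursors) queue.push(&cursor);

    while (!queue.empty() && queue.top()->has_next()) {
        Cursor* winner = queue.top();
        queue.pop();
        std::cout << winner->take() << " ";
        if (winner->has_next()) queue.push(winner);
    }
    std::cout << "\n";  // 1 2 3 4 5 6 7 9 10
}
```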
- list temp_ifstreams; - list temp_cursors; - open_all(vector(&temp_files_out[start_file], &temp_files_out[start_file + file_count]), temp_ifstreams, temp_cursors); - - // Work out how many reads to expect - size_t expected_reads = 0; - if (reads_per_file != nullptr) { - for (size_t i = start_file; i < start_file + file_count; i++) { - expected_reads += reads_per_file->at(temp_files_in.at(i)); - } - } - - // Open an output file - string out_file_name = temp_file::create(); - ofstream out_stream(out_file_name); - temp_files_out.push_back(out_file_name); - - // Make an output emitter - emitter_t emitter(out_stream); - - // Merge the cursors into the emitter - streaming_merge(temp_cursors, emitter, expected_reads); - - // The output file will be flushed and finished automatically when the emitter goes away. - - // Clean up the input files we used - temp_cursors.clear(); - temp_ifstreams.clear(); - for (size_t i = start_file; i < file_count; i++) { - temp_file::remove(temp_files_in.at(i)); - } - - if (reads_per_file != nullptr) { - // Save the total reads that should be in the created file, in case we need to do another pass - (*reads_per_file)[out_file_name] = expected_reads; - } - } - - return temp_files_out; - -} - -void GAMSorter::benedict_sort(istream& gam_in, ostream& gam_out, GAMIndex* index_to) { - // Go to the end of the file - gam_in.seekg(0, gam_in.end); - // Get its position - auto file_end = gam_in.tellg(); - // Go to the start - gam_in.seekg(0); - - // This will have all the item VOs and let us sort them by position - vector> pos_to_vo; - - stream::ProtobufIterator cursor(gam_in); - - if (cursor.tell_raw() == -1) { - // This will catch non-blocked gzip files, as well as streaming streams. - cerr << "error:[vg gamsort]: Cannot sort an unseekable GAM" << endl; - exit(1); - } - - // Make a progress bar - create_progress("load positions", file_end); - - // Count reads seen so we can only update our progress bar sometimes - size_t seen = 0; - - while(cursor.has_next()) { - // Get the min position of each alignment - pos_t min_pos = make_pos_t(get_min_position(*cursor)); - - // Save it with the alignment-s virtual offset - pos_to_vo.emplace_back(min_pos, cursor.tell_item()); - - cursor.get_next(); - - if (seen % 1000 == 0) { - update_progress(gam_in.tellg()); - } - seen++; - } - - update_progress(gam_in.tellg()); - destroy_progress(); - create_progress("sort positions", 1); - - // Sort everything by pos_t key - std::sort(pos_to_vo.begin(), pos_to_vo.end(), [&](const pair& a, const pair& b) { - return this->less_than(a.first, b.first); - }); - - update_progress(1); - destroy_progress(); - create_progress("reorder reads", pos_to_vo.size()); - - // Make an output emitter - stream::ProtobufEmitter emitter(gam_out); - - if (index_to != nullptr) { - emitter.on_group([&index_to](const vector& group, int64_t start_vo, int64_t past_end_vo) { - // Whenever it emits a group, index it. 
- // Make sure to only capture things that will outlive the emitter - index_to->add_group(group, start_vo, past_end_vo); - }); - } - - // Actually do the shuffle - for (auto& pos_and_vo : pos_to_vo) { - // For each item in sorted order - - // Load it - cursor.seek_item_and_stop(pos_and_vo.second); - - // Send it out - emitter.write(std::move(cursor.take())); - - increment_progress(); - } - - destroy_progress(); -} - - -bool GAMSorter::less_than(const Alignment &a, const Alignment &b) const { - return less_than(get_min_position(a), get_min_position(b)); -} - -Position GAMSorter::get_min_position(const Alignment& aln) const { - return get_min_position(aln.path()); -} - -Position GAMSorter::get_min_position(const Path& path) const { - if (path.mapping_size() == 0) { - // This path lives at a default Position - return Position(); - } - - Position min = path.mapping(0).position(); - for(size_t i = 1; i < path.mapping_size(); i++) { - const Position& other = path.mapping(i).position(); - if (less_than(other, min)) { - // We found a smaller position - min = other; - } - } - - return min; -} - -bool GAMSorter::equal_to(const Position& a, const Position& b) const { - return (a.node_id() == b.node_id() && - a.is_reverse() == b.is_reverse() && - a.offset() == b.offset()); -} - -bool GAMSorter::less_than(const Position& a, const Position& b) const { - if (a.node_id() < b.node_id()) { - return true; - } else if (a.node_id() > b.node_id()) { - return false; - } - - if (a.is_reverse() < b.is_reverse()) { - return true; - } else if (a.is_reverse() > b.is_reverse()) { - return false; - } - - if (a.offset() < b.offset()) { - return true; - } - - return false; -} - -bool GAMSorter::less_than(const pos_t& a, const pos_t& b) const { - if (id(a) < id(b)) { - return true; - } else if (id(a) > id(b)) { - return false; - } - - if (is_rev(a) < is_rev(b)) { - return true; - } else if (is_rev(a) > is_rev(b)) { - return false; - } - - if (offset(a) < offset(b)) { - return true; - } - - return false; -} - -bool GAMSorter::greater_than(const Position& a, const Position& b) const { - if (a.node_id() > b.node_id()) { - return true; - } else if (a.node_id() < b.node_id()) { - return false; - } - - if (a.is_reverse() > b.is_reverse()) { - return true; - } else if (a.is_reverse() < b.is_reverse()) { - return false; - } - - if (a.offset() > b.offset()) { - return true; - } - - return false; -} diff --git a/src/gamsorter.hpp b/src/gamsorter.hpp deleted file mode 100644 index 2d3b27dd1e5..00000000000 --- a/src/gamsorter.hpp +++ /dev/null @@ -1,126 +0,0 @@ -#ifndef VG_GAMSORTER_HPP_INCLUDED -#define VG_GAMSORTER_HPP_INCLUDED - -#include "vg.pb.h" -#include "stream.hpp" -#include "types.hpp" -#include "progressive.hpp" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/** - * \file gamsorter.hpp - * GAM sorting tools. - */ -using namespace std; -namespace vg { - - -// We need to know about the GAMIndex, but we don't actually need to hold one. -// So pre-declare it here. -class GAMIndex; - -/// Provides the ability to sort a GAM, either "dumbly" (in memory), or -/// "streaming" into temporary files. Paired alignments are not necessarily -/// going to end up next to each other, so if sorting by position make sure to -/// set the position cross-references first if you want to be able to find -/// them. 
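The cascaded node-ID, strand, offset comparisons above implement a plain lexicographic order, so an equivalent and arguably less error-prone formulation is a tuple comparison via `std::tie`. A sketch of the same ordering on a simplified position struct (not the protobuf `Position` type):

```
#include <cstdint>
#include <iostream>
#include <tuple>

struct SimplePos {
    std::int64_t node_id;
    bool is_reverse;
    std::int64_t offset;
};

// Order by node ID, then strand (forward first), then offset, exactly as a
// lexicographic tuple comparison.
bool less_than(const SimplePos& a, const SimplePos& b) {
    return std::tie(a.node_id, a.is_reverse, a.offset) <
           std::tie(b.node_id, b.is_reverse, b.offset);
}

int main() {
    SimplePos forward{10, false, 5};
    SimplePos reverse{10, true, 0};
    std::cout << std::boolalpha << less_than(forward, reverse) << "\n";  // true
}
```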
-class GAMSorter : public Progressive { -public: - - ////////////////// - // Main entry points - ////////////////// - - /// Create a GAM sorter, showing sort progress on standard error if show_progress is true. - GAMSorter(bool show_progress = false); - - /// Sort a stream of GAM-format data, using temporary files, limiting the - /// number of simultaneously open input files and the size of in-memory - /// data. Optionally index the sorted GAM file into the given GAMIndex. - void stream_sort(istream& gam_in, ostream& gam_out, GAMIndex* index_to = nullptr); - - /// Sort a stream of GAM-format data, loading it all into memory and doing - /// a single giant sort operation. - /// Optionally index the sorted GAM file into the given GAMIndex. - void dumb_sort(istream& gam_in, ostream& gam_out, GAMIndex* index_to = nullptr); - - /// Sort a seekable input stream by doing one pass to load all the - /// positions, sorting all the positions in memory, and doing another pass - /// of jumping around to re-order all the reads. - /// Optionally index the sorted GAM file into the given GAMIndex. - void benedict_sort(istream& gam_in, ostream& gam_out, GAMIndex* index_to = nullptr); - - ////////////////// - // Supporting API - ////////////////// - - /// Sort a vector of alignments, in place. - void sort(vector& alns) const; - - /// Return true if out of Alignments a and b, alignment a must come before alignment b, and false otherwise. - bool less_than(const Alignment& a, const Alignment& b) const; - - /// Determine the minumum Position visited by an Alignment. The minumum - /// Position is the lowest node ID visited by the alignment, with the - /// lowest offset visited on that node ID as the offset, and the - /// orientation set to false if the forward strand is visited, and true if - /// only the reverse strand is visited. - Position get_min_position(const Alignment& aln) const; - - /// Determine the minimum position visited by a Path, as for an Alignment. - Position get_min_position(const Path& path) const; - - /// Return True if the given Position values are equal, and false otherwise. - bool equal_to(const Position& a, const Position& b) const; - - /// Return True if position A is less than position B in our sort, and false otherwise. - /// Position order is defined first by node ID, then by strand (forward first), and then by offset within the strand. - /// We can't sort by actual base on the forward strand, because we need to be able to sort without knowing the graph's node lengths. - bool less_than(const Position& a, const Position& b) const; - - /// Return true if out of pos_t items a and b, a must come before b, and false otherwise. - bool less_than(const pos_t& a, const pos_t& b) const; - - /// Return True if position A is greater than position B in our sort, and false otherwise. - bool greater_than(const Position& a, const Position& b) const; - - private: - /// What's the maximum size of reads in serialized, uncompressed bytes to - /// load into memory for a single temp file chunk, during the streaming - /// sort? - /// For reference, a whole-genome GAM file is about 500 GB of uncompressed data - size_t max_buf_size = (512 * 1024 * 1024); - /// What's the max fan-in when combining temp files, during the streaming sort? - /// This will be computed based on the max file descriptor limit from the OS. 
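A rough usage sketch of the public entry points declared above: sort a GAM with bounded memory while building an index, then save the index for later queries. File names and the `.gai` extension are placeholders, and this is illustrative rather than a verified build.

```
#include <fstream>

#include "gamsorter.hpp"
#include "gam_index.hpp"

using namespace vg;

int main() {
    std::ifstream gam_in("reads.gam");
    std::ofstream gam_out("reads.sorted.gam");

    GAMSorter sorter(true);  // show progress on standard error
    GAMIndex index;

    // Streaming sort: sorted temp-file chunks, then limited-fan-in merges.
    sorter.stream_sort(gam_in, gam_out, &index);

    // Save the index so later runs can query the sorted GAM by node ID.
    std::ofstream index_out("reads.sorted.gam.gai");
    index.save(index_out);
}
```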
- size_t max_fan_in; - - using cursor_t = stream::ProtobufIterator; - using emitter_t = stream::ProtobufEmitter; - - /// Open all the given input files, keeping the streams and cursors in the given lists. - /// We use lists because none of these should be allowed to move after creation. - void open_all(const vector& filenames, list& streams, list& cursors); - - /// Merge all the reads from the given list of cursors into the given emitter. - /// The total expected number of reads can be passed for progress bar purposes. - void streaming_merge(list& cursors, emitter_t& emitter, size_t expected_reads = 0); - - /// Merge all the given temp input files into one or more temp output - /// files, opening no more than max_fan_in input files at a time. The input - /// files, which must be from temp_file::create(), will be deleted. - /// - /// If reads_per_file is specified, it will be used to show progress bars, - /// and will be updated for newly-created files. - vector streaming_merge(const vector& temp_names_in, unordered_map* reads_per_file = nullptr); -}; -} -#endif diff --git a/src/gbwt_extender.cpp b/src/gbwt_extender.cpp new file mode 100644 index 00000000000..bec78296a8d --- /dev/null +++ b/src/gbwt_extender.cpp @@ -0,0 +1,2305 @@ +#include "gbwt_extender.hpp" + +#include +#include +#include +#include +#include + +#include + +namespace vg { + +//------------------------------------------------------------------------------ + +// Numerical class constants. + +constexpr size_t GaplessExtender::MAX_MISMATCHES; +constexpr double GaplessExtender::OVERLAP_THRESHOLD; + +//------------------------------------------------------------------------------ + +bool GaplessExtension::contains(const HandleGraph& graph, seed_type seed) const { + handle_t expected_handle = GaplessExtender::get_handle(seed); + size_t expected_node_offset = GaplessExtender::get_node_offset(seed); + size_t expected_read_offset = GaplessExtender::get_read_offset(seed); + + size_t read_offset = this->read_interval.first; + size_t node_offset = this->offset; + for (handle_t handle : this->path) { + size_t len = graph.get_length(handle) - node_offset; + read_offset += len; + node_offset += len; + if (handle == expected_handle && read_offset - expected_read_offset == node_offset - expected_node_offset) { + return true; + } + node_offset = 0; + } + + return false; +} + +Position GaplessExtension::starting_position(const HandleGraph& graph) const { + Position position; + if (this->empty()) { + return position; + } + + position.set_node_id(graph.get_id(this->path.front())); + position.set_is_reverse(graph.get_is_reverse(this->path.front())); + position.set_offset(this->offset); + + return position; +} + +Position GaplessExtension::tail_position(const HandleGraph& graph) const { + Position position; + if (this->empty()) { + return position; + } + + position.set_node_id(graph.get_id(this->path.back())); + position.set_is_reverse(graph.get_is_reverse(this->path.back())); + position.set_offset(this->tail_offset(graph)); + + return position; +} + +size_t GaplessExtension::tail_offset(const HandleGraph& graph) const { + size_t result = this->offset + this->length(); + for (size_t i = 0; i + 1 < this->path.size(); i++) { + result -= graph.get_length(this->path[i]); + } + return result; +} + +size_t GaplessExtension::overlap(const HandleGraph& graph, const GaplessExtension& another) const { + size_t result = 0; + size_t this_pos = this->read_interval.first, another_pos = another.read_interval.first; + auto this_iter = this->path.begin(), 
another_iter = another.path.begin(); + size_t this_offset = this->offset, another_offset = another.offset; + while (this_pos < this->read_interval.second && another_pos < another.read_interval.second) { + if (this_pos == another_pos && *this_iter == *another_iter && this_offset == another_offset) { + size_t len = std::min({ graph.get_length(*this_iter) - this_offset, + this->read_interval.second - this_pos, + another.read_interval.second - another_pos }); + result += len; + this_pos += len; + another_pos += len; + ++this_iter; + ++another_iter; + this_offset = 0; + another_offset = 0; + } else if (this_pos <= another_pos) { + this_pos += graph.get_length(*this_iter) - this_offset; + ++this_iter; + this_offset = 0; + } else { + another_pos += graph.get_length(*another_iter) - another_offset; + ++another_iter; + another_offset = 0; + } + } + return result; +} + +Path GaplessExtension::to_path(const HandleGraph& graph, const std::string& sequence) const { + + Path result; + + auto mismatch = this->mismatch_positions.begin(); // The next mismatch. + size_t read_offset = this->read_interval.first; // Current offset in the read. + size_t node_offset = this->offset; // Current offset in the current node. + for (size_t i = 0; i < this->path.size(); i++) { + size_t limit = std::min(read_offset + graph.get_length(this->path[i]) - node_offset, this->read_interval.second); + Mapping& mapping = *(result.add_mapping()); + mapping.mutable_position()->set_node_id(graph.get_id(this->path[i])); + mapping.mutable_position()->set_offset(node_offset); + mapping.mutable_position()->set_is_reverse(graph.get_is_reverse(this->path[i])); + while (mismatch != this->mismatch_positions.end() && *mismatch < limit) { + if (read_offset < *mismatch) { + Edit& exact_match = *(mapping.add_edit()); + exact_match.set_from_length(*mismatch - read_offset); + exact_match.set_to_length(*mismatch - read_offset); + } + Edit& edit = *(mapping.add_edit()); + edit.set_from_length(1); + edit.set_to_length(1); + edit.set_sequence(std::string(1, sequence[*mismatch])); + read_offset = *mismatch + 1; + ++mismatch; + } + if (read_offset < limit) { + Edit& exact_match = *(mapping.add_edit()); + exact_match.set_from_length(limit - read_offset); + exact_match.set_to_length(limit - read_offset); + read_offset = limit; + } + mapping.set_rank(i + 1); + node_offset = 0; + } + + return result; +} + +//------------------------------------------------------------------------------ + +ReadMasker::ReadMasker(const std::string& valid_chars) : mask(256, 'X') { + for (char c : valid_chars) { + this->mask[static_cast(c)] = c; + } +} + +void ReadMasker::operator()(std::string& sequence) const { + for (char& c : sequence) { + c = this->mask[static_cast(c)]; + } +} + +//------------------------------------------------------------------------------ + +GaplessExtender::GaplessExtender() : + graph(nullptr), aligner(nullptr), mask("ACGT") +{ +} + +GaplessExtender::GaplessExtender(const gbwtgraph::GBWTGraph& graph, const Aligner& aligner) : + graph(&graph), aligner(&aligner), mask("ACGT") +{ +} + +//------------------------------------------------------------------------------ + +template +void in_place_subvector(std::vector& vec, size_t head, size_t tail) { + if (head >= tail || tail > vec.size()) { + vec.clear(); + return; + } + if (head > 0) { + for (size_t i = head; i < tail; i++) { + vec[i - head] = std::move(vec[i]); + } + } + vec.resize(tail - head); +} + +// Compute the score based on read_interval, internal_score, left_full, and right_full. 
+void set_score(GaplessExtension& extension, const Aligner* aligner) {
+    // Assume that everything matches.
+    extension.score = static_cast<int32_t>((extension.read_interval.second - extension.read_interval.first) * aligner->match);
+    // Handle the mismatches.
+    extension.score -= static_cast<int32_t>(extension.internal_score * (aligner->match + aligner->mismatch));
+    // Handle full-length bonuses.
+    extension.score += static_cast<int32_t>(extension.left_full * aligner->full_length_bonus);
+    extension.score += static_cast<int32_t>(extension.right_full * aligner->full_length_bonus);
+}
+
+// Match the initial node, assuming that read_offset or node_offset is 0.
+// Updates internal_score and old_score; use set_score() to compute score.
+void match_initial(GaplessExtension& match, const std::string& seq, gbwtgraph::view_type target) {
+    size_t node_offset = match.offset;
+    size_t left = std::min(seq.length() - match.read_interval.second, target.second - node_offset);
+    while (left > 0) {
+        size_t len = std::min(left, sizeof(std::uint64_t));
+        std::uint64_t a = 0, b = 0;
+        std::memcpy(&a, seq.data() + match.read_interval.second, len);
+        std::memcpy(&b, target.first + node_offset, len);
+        if (a == b) {
+            match.read_interval.second += len;
+            node_offset += len;
+        } else {
+            for (size_t i = 0; i < len; i++) {
+                if (seq[match.read_interval.second] != target.first[node_offset]) {
+                    match.internal_score++;
+                }
+                match.read_interval.second++;
+                node_offset++;
+            }
+        }
+        left -= len;
+    }
+    match.old_score = match.internal_score;
+}
+
+// Match forward but stop before the mismatch count reaches the limit.
+// Updates internal_score; use set_score() to recompute score.
+// Returns the tail offset (the number of characters matched).
+size_t match_forward(GaplessExtension& match, const std::string& seq, gbwtgraph::view_type target, uint32_t mismatch_limit) {
+    size_t node_offset = 0;
+    size_t left = std::min(seq.length() - match.read_interval.second, target.second - node_offset);
+    while (left > 0) {
+        size_t len = std::min(left, sizeof(std::uint64_t));
+        std::uint64_t a = 0, b = 0;
+        std::memcpy(&a, seq.data() + match.read_interval.second, len);
+        std::memcpy(&b, target.first + node_offset, len);
+        if (a == b) {
+            match.read_interval.second += len;
+            node_offset += len;
+        } else {
+            for (size_t i = 0; i < len; i++) {
+                if (seq[match.read_interval.second] != target.first[node_offset]) {
+                    if (match.internal_score + 1 >= mismatch_limit) {
+                        return node_offset;
+                    }
+                    match.internal_score++;
+                }
+                match.read_interval.second++;
+                node_offset++;
+            }
+        }
+        left -= len;
+    }
+    return node_offset;
+}
+
+// Match backward but stop before the mismatch count reaches the limit.
+// Starts from the offset in the match and updates it.
+// Updates internal_score; use set_score() to recompute score.
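+// Like match_initial() and match_forward(), this compares the read and the
+// node sequence eight bytes at a time by copying both into uint64_t words and
+// only falls back to a per-character scan when a word differs. The word
+// comparison never misses a mismatch; it only decides whether the slow path
+// is needed for the current chunk of up to 8 characters.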
+void match_backward(GaplessExtension& match, const std::string& seq, gbwtgraph::view_type target, uint32_t mismatch_limit) { + size_t left = std::min(match.read_interval.first, match.offset); + while (left > 0) { + size_t len = std::min(left, sizeof(std::uint64_t)); + std::uint64_t a = 0, b = 0; + std::memcpy(&a, seq.data() + match.read_interval.first - len, len); + std::memcpy(&b, target.first + match.offset - len, len); + if (a == b) { + match.read_interval.first -= len; + match.offset -= len; + } else { + for (size_t i = 0; i < len; i++) { + if (seq[match.read_interval.first - 1] != target.first[match.offset - 1]) { + if (match.internal_score + 1 >= mismatch_limit) { + return; + } + match.internal_score++; + } + match.read_interval.first--; + match.offset--; + } + } + left -= len; + } +} + +// Sort full-length extensions by internal_score, remove ones that are not +// full-length alignments, remove duplicates, and return the best extensions +// that have sufficiently low overlap. +void handle_full_length(const HandleGraph& graph, std::vector& result, double overlap_threshold) { + std::sort(result.begin(), result.end(), [](const GaplessExtension& a, const GaplessExtension& b) -> bool { + if (a.full() && b.full()) { + return (a.internal_score < b.internal_score); + } + return a.full(); + }); + size_t tail = 0; + for (size_t i = 0; i < result.size(); i++) { + if (!(result[i].full())) { + break; // No remaining full-length extensions. + } + bool overlap = false; + for (size_t prev = 0; prev < tail; prev++) { + if (result[i].overlap(graph, result[prev]) > overlap_threshold * result[prev].length()) { + overlap = true; + break; + } + } + if (overlap) { + continue; + } + if (i > tail) { + result[tail] = std::move(result[i]); + } + tail++; + } + result.resize(tail); +} + +// Sort the extensions from left to right. Remove duplicates and empty extensions. +void remove_duplicates(std::vector& result) { + auto sort_order = [](const GaplessExtension& a, const GaplessExtension& b) -> bool { + if (a.read_interval != b.read_interval) { + return (a.read_interval < b.read_interval); + } + if (a.state.backward.node != b.state.backward.node) { + return (a.state.backward.node < b.state.backward.node); + } + if (a.state.forward.node != b.state.forward.node) { + return (a.state.forward.node < b.state.forward.node); + } + if (a.state.backward.range != b.state.backward.range) { + return (a.state.backward.range < b.state.backward.range); + } + if (a.state.forward.range != b.state.forward.range) { + return (a.state.forward.range < b.state.forward.range); + } + return (a.offset < b.offset); + }; + std::sort(result.begin(), result.end(), sort_order); + size_t tail = 0; + for (size_t i = 0; i < result.size(); i++) { + if (result[i].empty()) { + continue; + } + if (tail == 0 || result[i] != result[tail - 1]) { + if (i > tail) { + result[tail] = std::move(result[i]); + } + tail++; + } + } + result.resize(tail); +} + +// Realign the extensions to find the mismatching positions. 
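+// During extension only the number of mismatches (internal_score) is tracked,
+// not their positions, so each reported extension is walked once more against
+// the node sequences to record the read offsets of its mismatches. Extensions
+// with internal_score == 0 are skipped because they cannot contain any.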
+void find_mismatches(const std::string& seq, const gbwtgraph::CachedGBWTGraph& graph, std::vector& result) { + for (GaplessExtension& extension : result) { + if (extension.internal_score == 0) { + continue; + } + extension.mismatch_positions.reserve(extension.internal_score); + size_t node_offset = extension.offset, read_offset = extension.read_interval.first; + for (const handle_t& handle : extension.path) { + gbwtgraph::view_type target = graph.get_sequence_view(handle); + while (node_offset < target.second && read_offset < extension.read_interval.second) { + if (target.first[node_offset] != seq[read_offset]) { + extension.mismatch_positions.push_back(read_offset); + } + node_offset++; + read_offset++; + } + node_offset = 0; + } + } +} + +size_t interval_length(std::pair interval) { + return interval.second - interval.first; +} + +std::vector get_path(const std::vector& first, handle_t second) { + std::vector result; + result.reserve(first.size() + 1); + result.insert(result.end(), first.begin(), first.end()); + result.push_back(second); + return result; +} + +std::vector get_path(handle_t first, const std::vector& second) { + std::vector result; + result.reserve(second.size() + 1); + result.push_back(first); + result.insert(result.end(), second.begin(), second.end()); + return result; +} + +std::vector get_path(const std::vector& first, gbwt::node_type second) { + return get_path(first, gbwtgraph::GBWTGraph::node_to_handle(second)); +} + +std::vector get_path(gbwt::node_type reverse_first, const std::vector& second) { + return get_path(gbwtgraph::GBWTGraph::node_to_handle(gbwt::Node::reverse(reverse_first)), second); +} + +//------------------------------------------------------------------------------ + +// Trim mismatches from the extension to maximize the score. Returns true if the +// extension was trimmed. +bool trim_mismatches(GaplessExtension& extension, const gbwtgraph::CachedGBWTGraph& graph, const Aligner& aligner) { + + if (extension.exact()) { + return false; + } + + // Start with the initial run of matches. + auto mismatch = extension.mismatch_positions.begin(); + std::pair current_interval(extension.read_interval.first, *mismatch); + int32_t current_score = interval_length(current_interval) * aligner.match; + if (extension.left_full) { + current_score += aligner.full_length_bonus; + } + + // Process the alignment and keep track of the best interval we have seen so far. + std::pair best_interval = current_interval; + int32_t best_score = current_score; + while (mismatch != extension.mismatch_positions.end()) { + // See if we should start a new interval after the mismatch. + if (current_score >= aligner.mismatch) { + current_interval.second++; + current_score -= aligner.mismatch; + } else { + current_interval.first = current_interval.second = *mismatch + 1; + current_score = 0; + } + ++mismatch; + + // Process the following run of matches. + if (mismatch == extension.mismatch_positions.end()) { + size_t length = extension.read_interval.second - current_interval.second; + current_interval.second = extension.read_interval.second; + current_score += length * aligner.match; + if (extension.right_full) { + current_score += aligner.full_length_bonus; + } + } else { + size_t length = *mismatch - current_interval.second; + current_interval.second = *mismatch; + current_score += length * aligner.match; + } + + // Update the best interval. 
+ if (current_score > best_score || (current_score > 0 && current_score == best_score && interval_length(current_interval) > interval_length(best_interval))) { + best_interval = current_interval; + best_score = current_score; + } + } + + // Special cases: no trimming or complete trimming. + if (best_interval == extension.read_interval) { + return false; + } + if (interval_length(best_interval) == 0) { + extension.path.clear(); + extension.read_interval = best_interval; + extension.mismatch_positions.clear(); + extension.score = 0; + extension.left_full = extension.right_full = false; + return true; + } + + // Update alignment statistics. + bool path_changed = false; + if (best_interval.first > extension.read_interval.first) { + extension.left_full = false; + } + if (best_interval.second < extension.read_interval.second) { + extension.right_full = false; + } + size_t node_offset = extension.offset, read_offset = extension.read_interval.first; + extension.read_interval = best_interval; + extension.score = best_score; + + // Trim the path. + size_t head = 0; + while (head < extension.path.size()) { + size_t node_length = graph.get_length(extension.path[head]); + read_offset += node_length - node_offset; + node_offset = 0; + if (read_offset > extension.read_interval.first) { + extension.offset = node_length - (read_offset - extension.read_interval.first); + break; + } + head++; + } + size_t tail = head + 1; + while (read_offset < extension.read_interval.second) { + read_offset += graph.get_length(extension.path[tail]); + tail++; + } + if (head > 0 || tail < extension.path.size()) { + in_place_subvector(extension.path, head, tail); + extension.state = graph.bd_find(extension.path); + } + + // Trim the mismatches. + head = 0; + while (head < extension.mismatch_positions.size() && extension.mismatch_positions[head] < extension.read_interval.first) { + head++; + } + tail = head; + while (tail < extension.mismatch_positions.size() && extension.mismatch_positions[tail] < extension.read_interval.second) { + tail++; + } + in_place_subvector(extension.mismatch_positions, head, tail); + + return true; +} + +//------------------------------------------------------------------------------ + +std::vector GaplessExtender::extend(cluster_type& cluster, std::string sequence, const gbwtgraph::CachedGBWTGraph* cache, size_t max_mismatches, double overlap_threshold) const { + + std::vector result; + if (this->graph == nullptr || this->aligner == nullptr || cluster.empty() || sequence.empty()) { + return result; + } + result.reserve(cluster.size()); + this->mask(sequence); + + // Allocate a cache if we were not provided with one. + bool free_cache = (cache == nullptr); + if (free_cache) { + cache = new gbwtgraph::CachedGBWTGraph(*(this->graph)); + } + + // Find the best extension starting from each seed. + size_t best_alignment = std::numeric_limits::max(); + for (seed_type seed : cluster) { + + // Check if the seed is contained in an exact full-length alignment. + if (best_alignment < result.size() && result[best_alignment].internal_score == 0) { + if (result[best_alignment].contains(*cache, seed)) { + continue; + } + } + + GaplessExtension best_match { + { }, static_cast(0), gbwt::BidirectionalState(), + { static_cast(0), static_cast(0) }, { }, + std::numeric_limits::min(), false, false, + false, false, std::numeric_limits::max(), std::numeric_limits::max() + }; + + // Match the initial node and add it to the queue. 
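+        // The seed identifies a starting handle plus the read and node offsets
+        // of the hit (unpacked below with get_read_offset() / get_node_offset()).
+        // The extension starts as an empty read interval at the seed position
+        // and grows outwards from there.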
+ std::priority_queue extensions; + { + size_t read_offset = get_read_offset(seed); + size_t node_offset = get_node_offset(seed); + GaplessExtension match { + { seed.first }, node_offset, cache->get_bd_state(seed.first), + { read_offset, read_offset }, { }, + static_cast(0), false, false, + false, false, static_cast(0), static_cast(0) + }; + match_initial(match, sequence, cache->get_sequence_view(seed.first)); + if (match.read_interval.first == 0) { + match.left_full = true; + match.left_maximal = true; + } + if (match.read_interval.second >= sequence.length()) { + match.right_full = true; + match.right_maximal = true; + } + set_score(match, this->aligner); + extensions.push(std::move(match)); + } + + // Extend the most promising extensions first, using alignment scores for priority. + // First make the extension right-maximal and then left-maximal. + while (!extensions.empty()) { + GaplessExtension curr = std::move(extensions.top()); + extensions.pop(); + + // Case 1: Extend to the right. + if (!curr.right_maximal) { + size_t num_extensions = 0; + // Always allow at least max_mismatches / 2 mismatches in the current flank. + uint32_t mismatch_limit = std::max( + static_cast(max_mismatches + 1), + static_cast(max_mismatches / 2 + curr.old_score + 1)); + cache->follow_paths(curr.state, false, [&](const gbwt::BidirectionalState& next_state) -> bool { + handle_t handle = gbwtgraph::GBWTGraph::node_to_handle(next_state.forward.node); + GaplessExtension next { + { }, curr.offset, next_state, + curr.read_interval, { }, + curr.score, curr.left_full, curr.right_full, + curr.left_maximal, curr.right_maximal, curr.internal_score, curr.old_score + }; + size_t node_offset = match_forward(next, sequence, cache->get_sequence_view(handle), mismatch_limit); + if (node_offset == 0) { // Did not match anything. + return true; + } + next.path = get_path(curr.path, handle); + // Did the extension become right-maximal? + if (next.read_interval.second >= sequence.length()) { + next.right_full = true; + next.right_maximal = true; + next.old_score = next.internal_score; + } else if (node_offset < cache->get_length(handle)) { + next.right_maximal = true; + next.old_score = next.internal_score; + } + set_score(next, this->aligner); + num_extensions += next.state.size(); + extensions.push(std::move(next)); + return true; + }); + // We could not extend all threads in 'curr' to the right. The unextended ones + // may have different left extensions, so we must consider 'curr' right-maximal. + if (num_extensions < curr.state.size()) { + curr.right_maximal = true; + curr.old_score = curr.internal_score; + extensions.push(std::move(curr)); + } + continue; + } + + // Case 2: Extend to the left. + if (!curr.left_maximal) { + bool found_extension = false; + // Always allow at least max_mismatches / 2 mismatches in the current flank. 
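+                // old_score is the mismatch count from before this flank, so
+                // with e.g. max_mismatches = 4 and 3 earlier mismatches the
+                // limit becomes max(5, 2 + 3 + 1) = 6, i.e. up to two more
+                // mismatches may be added on this side before the extension
+                // is cut off.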
+ uint32_t mismatch_limit = std::max( + static_cast(max_mismatches + 1), + static_cast(max_mismatches / 2 + curr.old_score + 1)); + cache->follow_paths(curr.state, true, [&](const gbwt::BidirectionalState& next_state) -> bool { + handle_t handle = gbwtgraph::GBWTGraph::node_to_handle(gbwt::Node::reverse(next_state.backward.node)); + size_t node_length = cache->get_length(handle); + GaplessExtension next { + { }, node_length, next_state, + curr.read_interval, { }, + curr.score, curr.left_full, curr.right_full, + curr.left_maximal, curr.right_maximal, curr.internal_score, curr.old_score + }; + match_backward(next, sequence, cache->get_sequence_view(handle), mismatch_limit); + if (next.offset >= node_length) { // Did not match anything. + return true; + } + next.path = get_path(handle, curr.path); + // Did the extension become left-maximal? + if (next.read_interval.first == 0) { + next.left_full = true; + next.left_maximal = true; + // No need to set old_score. + } else if (next.offset > 0) { + next.left_maximal = true; + // No need to set old_score. + } + set_score(next, this->aligner); + extensions.push(std::move(next)); + found_extension = true; + return true; + }); + if (!found_extension) { + curr.left_maximal = true; + // No need to set old_score. + } else { + continue; + } + } + + // Case 3: Maximal extension with a better score than the best extension so far. + if (best_match < curr) { + best_match = std::move(curr); + } + } + + // Add the best match to the result and update the best_alignment offset. + if (!best_match.empty()) { + if (best_match.full() && (best_alignment >= result.size() || best_match.internal_score < result[best_alignment].internal_score)) { + best_alignment = result.size(); + } + result.emplace_back(std::move(best_match)); + } + } + + // If we have a good enough full-length alignment, return the best sufficiently + // distinct full-length alignments. + if (best_alignment < result.size() && result[best_alignment].internal_score <= max_mismatches) { + handle_full_length(*cache, result, overlap_threshold); + find_mismatches(sequence, *cache, result); + } + + // Otherwise remove duplicates, find mismatches, and trim the extensions to maximize + // score. + else { + remove_duplicates(result); + find_mismatches(sequence, *cache, result); + bool trimmed = false; + for (GaplessExtension& extension : result) { + trimmed |= trim_mismatches(extension, *cache, *(this->aligner)); + } + if (trimmed) { + remove_duplicates(result); + } + } + + // Free the cache if we allocated it. 
+ if (free_cache) { + delete cache; + cache = nullptr; + } + + return result; +} + +//------------------------------------------------------------------------------ + +bool GaplessExtender::full_length_extensions(const std::vector& result, size_t max_mismatches) { + return (result.size() > 0 && result.front().full() && result.front().mismatches() <= max_mismatches); +} + +//------------------------------------------------------------------------------ + +struct state_hash { + size_t operator()(const gbwt::BidirectionalState& state) const { + size_t result = wang_hash_64(state.forward.node); + result ^= wang_hash_64(state.forward.range.first) + 0x9e3779b9 + (result << 6) + (result >> 2); + result ^= wang_hash_64(state.forward.range.second) + 0x9e3779b9 + (result << 6) + (result >> 2); + result ^= wang_hash_64(state.backward.node) + 0x9e3779b9 + (result << 6) + (result >> 2); + result ^= wang_hash_64(state.backward.range.first) + 0x9e3779b9 + (result << 6) + (result >> 2); + result ^= wang_hash_64(state.backward.range.second) + 0x9e3779b9 + (result << 6) + (result >> 2); + return result; + } +}; + +//------------------------------------------------------------------------------ + +WFAAlignment WFAAlignment::from_extension(const GaplessExtension& extension) { + + // Start by aggregate-initializing. + WFAAlignment to_return { + extension.path, + {}, + (uint32_t)extension.offset, + (uint32_t)extension.read_interval.first, + (uint32_t)extension.length(), + extension.score, + true + }; + + // We need to make edits for all the mismatches. + // This tracks the base after the last edit in edits, in the sequence space. + size_t edits_made_up_to = to_return.seq_offset; + for (auto& mismatch_at : extension.mismatch_positions) { + // For each mismatch position + if (!to_return.edits.empty() && edits_made_up_to == mismatch_at && to_return.edits.back().first == mismatch) { + // If we can glom it onto an existing mismatch, do that. 
+            ++to_return.edits.back().second;
+        } else {
+            // Otherwise, we need some new edits
+            if (edits_made_up_to < mismatch_at) {
+                // Add a match for the intervening non-mismatch sequence
+                to_return.edits.emplace_back(match, mismatch_at - edits_made_up_to);
+            }
+            // Add a new 1 base mismatch
+            to_return.edits.emplace_back(mismatch, 1);
+        }
+        // Advance the cursor through this mismatch
+        edits_made_up_to = mismatch_at + 1;
+    }
+    if (edits_made_up_to < to_return.seq_offset + to_return.length) {
+        // Add any trailing match
+        to_return.edits.emplace_back(match, (to_return.seq_offset + to_return.length) - edits_made_up_to);
+    }
+
+    return to_return;
+}
+
+WFAAlignment WFAAlignment::make_unlocalized_insertion(size_t sequence_offset, size_t length, int score) {
+    // We can do it all by aggregate-initializing
+    return {{}, {{insertion, length}}, 0, (uint32_t)sequence_offset, (uint32_t)length, score, true};
+}
+
+WFAAlignment WFAAlignment::make_empty() {
+    // We can do it all by aggregate-initializing
+    return {{}, {}, 0, 0, 0, 0, true};
+}
+
+bool WFAAlignment::unlocalized_insertion() const {
+    return (
+        this->ok &&
+        this->path.empty() &&
+        this->edits.size() == 1 &&
+        this->edits.front().first == insertion
+    );
+}
+
+int64_t WFAAlignment::final_offset(const gbwtgraph::GBWTGraph& graph) const {
+    int64_t final_offset = this->node_offset;
+    for (auto edit : this->edits) {
+        if (edit.first != WFAAlignment::insertion) {
+            final_offset += edit.second;
+        }
+    }
+    for (size_t i = 0; i + 1 < this->path.size(); i++) {
+        final_offset -= graph.get_length(this->path[i]);
+    }
+    return final_offset;
+}
+
+void WFAAlignment::flip(const gbwtgraph::GBWTGraph& graph, const std::string& sequence) {
+    this->seq_offset = sequence.length() - this->seq_offset - this->length;
+
+    if (this->path.empty()) {
+        return;
+    }
+    this->node_offset = graph.get_length(this->path.back()) - this->final_offset(graph);
+
+    // Reverse the path and the edits.
+    std::reverse(this->path.begin(), this->path.end());
+    for (size_t i = 0; i < this->path.size(); i++) {
+        this->path[i] = graph.flip(this->path[i]);
+    }
+    std::reverse(this->edits.begin(), this->edits.end());
+}
+
+void WFAAlignment::append(Edit edit, uint32_t length) {
+    if (length == 0) {
+        return;
+    }
+    if (this->edits.empty() || this->edits.back().first != edit) {
+        this->edits.push_back(std::make_pair(edit, length));
+    } else {
+        this->edits.back().second += length;
+    }
+}
+
+//#define debug_join
+
+void WFAAlignment::join(const WFAAlignment& second) {
+#ifdef debug_join
+    std::cerr << "Joining alignment of sequence " << seq_offset << " - " << (seq_offset + length)
+        << " with alignment of " << second.seq_offset << " - " << (second.seq_offset + second.length) << std::endl;
+    std::cerr << "Left alignment: ";
+    print(std::cerr);
+    std::cerr << std::endl;
+    std::cerr << "Right alignment: ";
+    second.print(std::cerr);
+    std::cerr << std::endl;
+#endif
+
+    if (!ok) {
+        throw std::runtime_error("Cannot join onto an alignment that is not OK");
+    }
+
+    if (!second.ok) {
+        throw std::runtime_error("Cannot join an alignment that is not OK onto another alignment");
+    }
+
+    if (second.empty()) {
+        // We are joining an empty alignment onto us. Do nothing.
+        return;
+    }
+
+    if (empty()) {
+        // We are ourselves empty. Just be replaced.
+        *this = second;
+        return;
+    }
+
+    // Otherwise there is actual splicing to do.
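+    // The two pieces must abut exactly in read space, and if both have paths
+    // they must agree on the boundary: either the second piece starts at
+    // offset 0 of a new handle, or it starts partway through the handle the
+    // first piece ends on. Anything else is rejected below.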
+ + // Do error checking + if (seq_offset + length != second.seq_offset) { + throw std::runtime_error("Cannot join alignments because past-end position " + + std::to_string(seq_offset + length) + + " is not at start position " + + std::to_string(second.seq_offset)); + } + if (path.empty() && ! unlocalized_insertion()) { + throw std::runtime_error("Cannot join alignments because first alignment has no path"); + } + if (second.path.empty() && ! second.unlocalized_insertion()) { + throw std::runtime_error("Cannot join alignments because second alignment has no path"); + } + if (edits.empty()) { + throw std::runtime_error("Cannot join alignments because first alignment has no edits"); + } + if (second.edits.empty()) { + throw std::runtime_error("Cannot join alignments because second alignment has no edits"); + } + + if (!second.unlocalized_insertion()) { + // The second alignment has a path + if (unlocalized_insertion()) { + // We don't, so include the first handle and copy the offset on it. + node_offset = second.node_offset; + path.push_back(second.path.front()); + } else if (second.node_offset == 0) { + // Include the first handle from the second alignment because it can't be shared + path.push_back(second.path.front()); + } else { + // It must be shared with this alignment + if (second.path.front() != path.back()) { + throw std::runtime_error("Cannot join alignments because second alignment starts in the middle of a handle that first alignment doesn't end on"); + } + } + + // Copy all the other path handles. + std::copy(second.path.begin() + 1, second.path.end(), std::back_inserter(path)); + } + + for (auto& edit : second.edits) { + // Copy over all the edits + append(edit.first, edit.second); + } + + // Offsets don't need to change. + + // Add the length + length += second.length; + + // Add the score + score += second.score; + + // And that's all the fields we have! +} + +//#define debug_path + +Path WFAAlignment::to_path(const HandleGraph& graph, const std::string& sequence) const { + + if (!*this) { + throw std::runtime_error("WFAAlignment is not OK and cannot become a path"); + } + + if (this->seq_offset + this->length > sequence.size()) { + throw std::runtime_error("WFAAlignment extends past end of sequence"); + } + + Path result; + + if (this->unlocalized_insertion()) { + // Special handling for pure insertions with no position: make a Path + // that has no position. + vg::Mapping* m = result.add_mapping(); + vg::Edit* e = m->add_edit(); + e->set_to_length(this->edits.front().second); + e->set_sequence(sequence.substr(this->seq_offset, this->edits.front().second)); + return result; + } + + if (this->path.empty()) { + // Not a pure insert, but has an empty path. We assume it's an empty WFAAlignment. + // Nothing to do! + return result; + } + + // Walk through the sequence + size_t sequence_cursor = this->seq_offset; + + // And each node along the path + auto path_it = this->path.begin(); + + // And each base along the current node + size_t node_cursor = this->node_offset; + size_t first_node_length = graph.get_length(*path_it); + if (this->node_offset >= first_node_length) { + throw std::runtime_error("WFAAlignment has offset to or past end of first node"); + } + // When the base along the node hits this, we leave the node. + // We track this ourselves to avoid repeated length queries. 
+ size_t node_end = first_node_length; + + // Walk through the edits + auto edit_it = this->edits.begin(); + if (edit_it == this->edits.end()) { + throw std::runtime_error("WFAAlignment has no edits"); + } + // And track how much of the current edit has been resolved aleady. + size_t current_edit_used = 0; + + // As we walk along, we build a mapping. Set up the first mapping + Mapping* mapping_in_progress = result.add_mapping(); + // And set its position, with the offset + mapping_in_progress->mutable_position()->set_node_id(graph.get_id(*path_it)); + mapping_in_progress->mutable_position()->set_is_reverse(graph.get_is_reverse(*path_it)); + mapping_in_progress->mutable_position()->set_offset(node_cursor); + + while (edit_it != this->edits.end()) { + if (current_edit_used == edit_it->second) { + // There's no edit left, but there ought to be as a loop invariant, + // if no empty edits exist. + throw std::runtime_error("WFAAlignment has empty edit"); + } + + // What kind of edit is it? + auto& edit_type = edit_it->first; + + // And how much is left? + size_t length_to_resolve = edit_it->second - current_edit_used; + + if (edit_type == match || edit_type == mismatch || edit_type == deletion) { + // These edits consume some graph. + // Make sure there is a graph node. + if (path_it == this->path.end()) { + throw std::runtime_error("WFAAlignment tried to go past end of path"); + } + if (node_cursor == node_end) { + // Make sure we aren't starting right at the end of the node. + // We make sure this doesn't happen when we advance nodes, as + // long as all nodes are nonempty. + // TODO: Can we hit this from an anchored tail alignment somehow? + throw std::runtime_error("WFAAlignment tried to go past end of node (" + std::to_string(node_end) + " bp)"); + } + // Limit to length of graph node. + length_to_resolve = std::min(length_to_resolve, node_end - node_cursor); + } + + assert(length_to_resolve > 0); + +#ifdef debug_path + std::cerr << "Use " << length_to_resolve << " bp of " << edit_it->second << edit_it->first << " against node " << (path_it != this->path.end() ? graph.get_id(*path_it) : (nid_t)0) << " to go from edit " << (edit_it - edits.begin()) << " offset " << current_edit_used << " = path step " << (path_it - path.begin()) << " offset " << node_cursor; +#endif + + // Create a vg Edit to translate to + vg::Edit* created = mapping_in_progress->add_edit(); + + if (edit_type == match || edit_type == mismatch || edit_type == deletion) { + // These edits consume some graph + created->set_from_length(length_to_resolve); + node_cursor += length_to_resolve; + } + if (edit_type == mismatch || edit_type == insertion) { + // These edits carry sequence + if (sequence_cursor + length_to_resolve > this->seq_offset + this->length) { + throw std::runtime_error("WFAAlignment uses more sequence than provided"); + } + created->set_sequence(sequence.substr(sequence_cursor, length_to_resolve)); + } + if (edit_type == match || edit_type == mismatch || edit_type == insertion) { + // These edits consume some sequence + created->set_to_length(length_to_resolve); + sequence_cursor += length_to_resolve; + } + + // Now we've resolved at least part of this edit. + current_edit_used += length_to_resolve; + + if (current_edit_used == edit_it->second) { + // Finished the edit. + // Reset to the start of the next edit. + ++edit_it; + current_edit_used = 0; + } + if (edit_type == match || edit_type == mismatch || edit_type == deletion) { + // These edits consume some graph. So we may need to advance in the graph now. 
+ if (node_cursor == node_end) { + // Finished the node. + +#ifdef debug_path + std::cerr << " (leave node at path step " << (path_it - path.begin()) << " offset " << node_cursor << ")"; +#endif + + // We already checked above, and the path cursor isn't at the end. + assert(path_it != this->path.end()); + + // Reset to the start of the next node. + node_cursor = 0; + // And advance along the path if possible. + ++path_it; + if (path_it != this->path.end()) { + // We've reached a new node, so work out where the end is + node_end = graph.get_length(*path_it); + + if (node_cursor == node_end) { + throw std::runtime_error("WFAAlignment has empty node " + std::to_string(graph.get_id(*path_it))); + } + + // Also start a new Mapping + mapping_in_progress = result.add_mapping(); + // And set its position + mapping_in_progress->mutable_position()->set_node_id(graph.get_id(*path_it)); + mapping_in_progress->mutable_position()->set_is_reverse(graph.get_is_reverse(*path_it)); + // The offset will always be 0 since we entered from somewhere. + } else { + // No next node, so we should be at the end of what we use in the graph. + // If we try to use more graph, we will throw an error. + node_end = 0; + } + } + } + +#ifdef debug_path + std::cerr << " to edit " << (edit_it - edits.begin()) << " offset " << current_edit_used << " = path step " << (path_it - path.begin()) << " offset " << node_cursor << std::endl; +#endif + } + + return result; +} + +std::ostream& WFAAlignment::print(const HandleGraph& graph, std::ostream& out) const { + out << "{"; + if (!ok) { + out << " NOT OK!"; + } + out << " path = ["; + for (handle_t handle : this->path) { + out << " (" << graph.get_id(handle) << ", " << graph.get_is_reverse(handle) << ")"; + } + out << " ], edits = [ "; + for (auto edit : this->edits) { + out << edit.second << edit.first; + } + out << " ], node offset = " << this->node_offset; + out << ", sequence range = [" << this->seq_offset << ", " << (this->seq_offset + this->length) << ")"; + out << ", score = " << this->score << " }"; + + return out; +} + +std::ostream& WFAAlignment::print(std::ostream& out) const { + out << "{"; + if (!ok) { + out << " NOT OK!"; + } + out << " path = ["; + for (handle_t handle : this->path) { + out << " (" << as_integer(handle) << ")"; + } + out << " ], edits = [ "; + for (auto edit : this->edits) { + out << edit.second << edit.first; + } + out << " ], node offset = " << this->node_offset; + out << ", sequence range = [" << this->seq_offset << ", " << (this->seq_offset + this->length) << ")"; + out << ", score = " << this->score << " }"; + + return out; +} + +void WFAAlignment::check_lengths(const HandleGraph& graph) const { + // Compute read and graph lengths from the edits + size_t edit_graph_length = 0; + size_t edit_read_length = 0; + for (auto& e : edits) { + if (e.first == match || e.first == mismatch || e.first == insertion) { + // These edits use read sequence + edit_read_length += e.second; + } + if (e.first == match || e.first == mismatch || e.first == deletion) { + // These edits use graph sequence + edit_graph_length += e.second; + } + } + + // Compute graph length from the path + size_t path_graph_length = 0; + for (auto& h : path) { + path_graph_length += graph.get_length(h); + } + path_graph_length -= node_offset; + + if (edit_graph_length > path_graph_length) { + // We want to use more graph than we got. 
+ print(graph, std::cerr); + std::cerr << std::endl; + throw std::runtime_error("WFAAlignment has path graph length " + std::to_string(path_graph_length) + " but edit graph length " + std::to_string(edit_graph_length)); + } + if (edit_read_length != length) { + // We want to use a different amount of read than we should. + print(graph, std::cerr); + std::cerr << std::endl; + throw std::runtime_error("WFAAlignment has length " + std::to_string(length) + " but edit read length " + std::to_string(edit_read_length)); + } +} + +std::ostream& operator<<(std::ostream& out, const WFAAlignment::Edit& edit) { + return out << std::to_string(edit); +} + +} + +namespace std { + +std::string to_string(const vg::WFAAlignment::Edit& edit) { + switch (edit) { + case vg::WFAAlignment::match: + return "M"; + break; + case vg::WFAAlignment::mismatch: + return "X"; + break; + case vg::WFAAlignment::insertion: + return "I"; + break; + case vg::WFAAlignment::deletion: + return "D"; + break; + default: + throw std::runtime_error("Unknown edit operation"); + } +} + +} + +namespace vg { + +//------------------------------------------------------------------------------ + +const WFAExtender::ErrorModel WFAExtender::default_error_model { + // Mismatches (per base, plus min, cap at max) + {0.03, 1, 6}, + // Gaps + {0.05, 1, 10}, + // Gap length + {0.1, 1, 20} +}; + +WFAExtender::WFAExtender() : + graph(nullptr), mask("ACGT"), aligner(nullptr) +{ +} + +WFAExtender::WFAExtender(const gbwtgraph::GBWTGraph& graph, const Aligner& aligner, const ErrorModel& error_model) : + graph(&graph), mask("ACGT"), aligner(&aligner), error_model(&error_model) +{ + // Check that the scoring parameters are reasonable. + assert(this->aligner->match >= 0); + assert(this->aligner->mismatch > 0); + assert(this->aligner->gap_open >= this->aligner->gap_extension); + assert(this->aligner->gap_extension > 0); + + // Check that the error model makes sense. + for (auto& event : { + this->error_model->mismatches, + this->error_model->gaps, + this->error_model->gap_length + }) { + assert(event.per_base >= 0); + assert(event.min >= 0); + assert(event.max >= event.min); + } +} + +//------------------------------------------------------------------------------ + +// A position in an alignment between a sequence and a graph. +struct MatchPos { + uint32_t seq_offset; + uint32_t node_offset; + + /// We need a stack-like type that can be copied or referred to quickly, + /// since we consider MatchPos objects for each WFANode along a path when + /// doing find_pos(), and they contain the path. + /// This path type assumes that we only ever push and *then* pop. 
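+    /// The first NUM_INLINE entries are stored inline in the object; only
+    /// deeper paths spill over into a vector behind a shared_ptr, so copying
+    /// a MatchPos with a short path needs no heap allocation.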
+ struct PathList { + const static size_t NUM_INLINE = 4; + size_t item_count = 0; + uint32_t inline_items[NUM_INLINE]; + std::shared_ptr> additional_items; + + void push(uint32_t value) { + ++this->item_count; + if (this->item_count == NUM_INLINE + 1) { + this->additional_items.reset(new std::vector()); + } + if (this->item_count > NUM_INLINE) { + this->additional_items->resize(this->item_count - NUM_INLINE); + } + this->top() = value; + } + + bool empty() const { + return this->item_count == 0; + } + + size_t size() const { + return this->item_count; + } + + const uint32_t& top() const { + if (this->item_count > NUM_INLINE) { + return this->additional_items->at(this->item_count - NUM_INLINE - 1); + } else { + return this->inline_items[this->item_count - 1]; + } + } + + uint32_t& top() { + if (this->item_count > NUM_INLINE) { + return this->additional_items->at(this->item_count - NUM_INLINE - 1); + } else { + return this->inline_items[this->item_count - 1]; + } + } + + void pop() { + --this->item_count; + if (this->item_count == NUM_INLINE) { + this->additional_items.reset(); + } + } + }; + + PathList path; // Sequence of tree offsets from a leaf to the relevant node. + + // Creates an empty position. + MatchPos() : seq_offset(0), node_offset(0) {} + + // Creates a position with the given offsets and path. + MatchPos(uint32_t seq_offset, uint32_t node_offset, const PathList& path) : seq_offset(seq_offset), node_offset(node_offset), path(path) {} + + bool empty() const { return this->path.empty(); } + bool at_last_node() const { return this->path.size() == 1; } + uint32_t node() const { return this->path.top(); } + void pop() { this->path.pop(); } + + // Positions are ordered by sequence offsets. Empty positions are smaller than + // non-empty ones. + bool operator<(const MatchPos& another) { + if (this->empty()) { + return true; + } + if (another.empty()) { + return false; + } + return (this->seq_offset < another.seq_offset); + } +}; + +// A point in an WFA score matrix (for a specific node). Must be aggregate-initializable. +struct WFAPoint { + int32_t score; + int32_t diagonal; // seq_offset - target offset + uint32_t seq_offset; + uint32_t node_offset; + + // Returns the offset in the target sequence. + int32_t target_offset() const { + return static_cast(this->seq_offset) - this->diagonal; + } + + // Returns the four-parameter alignment score. + int32_t alignment_score(const Aligner& aligner) const { + return (static_cast(aligner.match) * (static_cast(this->seq_offset) + this->target_offset()) - this->score) / 2; + } + + // Returns the four-parameter alignment score with an implicit final insertion. + int32_t alignment_score(const Aligner& aligner, uint32_t final_insertion) const { + return (static_cast(aligner.match) * (static_cast(this->seq_offset + final_insertion) + this->target_offset()) - this->score) / 2; + } + + // Converts the point to an alignment position with the given path. + MatchPos pos(const MatchPos::PathList& path) const { + return MatchPos(this->seq_offset, this->node_offset, path); + } + + // For ordering the points in WFANode. + bool operator<(const WFAPoint& another) const { + return (this->score < another.score || (this->score == another.score && this->diagonal < another.diagonal)); + } + + // We have a way to split up into a key and a value, and to be out back + // together again. 
+ using key_type = std::pair; + using value_type = std::pair; + using map_entry_type = std::pair; + + key_type key() const { + return key_type(this->score, this->diagonal); + } + + value_type value() const { + return value_type(this->seq_offset, this->node_offset); + } + + static WFAPoint from_key_value(const key_type& key, const value_type& value) { + return {key.first, key.second, value.first, value.second}; + } + + static WFAPoint from_map_entry(const map_entry_type& entry) { + return from_key_value(entry.first, entry.second); + } +}; + +//------------------------------------------------------------------------------ + +/// Represents a node in the tree of haplotypes we are traversing and doing WFA +/// against. +/// +/// Will have WFANode::find_pos() called against it as part of a loop for each +/// diagonal, and if it doesn't answer its parent will be queried, recursively +/// back to the root. If we allow the number of WFANode objects along a +/// non-branching path to be linear in the sequence length, then we will make +/// O(sequence length) calls for each diagonal, and we end up getting O(n^2) +/// (or worse?) lookups. +/// +/// So, it is essential that we allow one WFANode to stand for a whole +/// non-branching run of haplotypes, up to about the total sequence length we +/// will be working on. This limits the number of recursive queries of parents +/// so it grows only with the number of haplotypes we are aligning against, +/// which is bounded, and not directly with the sequence length. +struct WFANode { + /// This tracks the GBWT search states for all graph nodes we visit that + /// have been coalesced into this WFANode + std::vector states; + /// And this tracks the GBWT packed nodes (ID and orientation) that are + /// visited, and maps to start offset, for O(1) query. It cannot have + /// duplicates. + std::unordered_map starts_by_node; + /// And this tracks the start offsets of each in our sequence space. + /// TODO: Replace with something O(1) + std::map states_by_start; + /// Total length + size_t stored_length; + + // Offsets in the vector of nodes. + uint32_t parent; + std::vector children; + + // All haplotypes end here. + bool dead_end; + + constexpr static size_t MATCHES = 0; + constexpr static size_t INSERTIONS = 1; // characters in the sequence but not in the graph + constexpr static size_t DELETIONS = 2; // characters in the graph but not in the sequence + + // Points on the wavefronts are indexed by score, diagonal. + std::array, 3> wavefronts; + + WFANode(const vector& states, uint32_t parent, const gbwtgraph::GBWTGraph& graph) : + states(states), + starts_by_node(), + states_by_start(), + stored_length(0), + parent(parent), children(), + dead_end(false), + wavefronts() { + if (states.empty()) { + throw std::runtime_error("Cannot make a WFANode for nothing"); + } + + // Fill in the visited nodes set and the index from start position to node. 
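+        // For example, a run of two graph nodes of lengths 8 and 5 is indexed
+        // as starts_by_node = {first -> 0, second -> 8} and
+        // states_by_start = {0 -> 0, 8 -> 1}, with stored_length ending up 13.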
+ this->starts_by_node.reserve(states.size()); + for (size_t i = 0; i < this->states.size(); i++) { + // Remember that this node starts here + this->starts_by_node.emplace(this->states[i].node, stored_length); + // Remember that here starts this node + states_by_start[stored_length] = i; + +#ifdef debug_wfa + std::cerr << "State #" << i << " is GBWT encoded node " << this->states[i].node << " and starts at offset " << stored_length << std::endl; +#endif + + // And up the start position + stored_length += graph.get_length(gbwtgraph::GBWTGraph::node_to_handle(this->states[i].node)); + } + } + + bool is_leaf() const { return (this->children.empty() || this->dead_end); } + bool expanded() const { return (!this->children.empty() || this->dead_end); } + + bool same_node(pos_t pos) const { + // See if we have seen anything on this node + gbwt::node_type lookup = gbwt::Node::encode(id(pos), is_rev(pos)); + bool is_here = starts_by_node.count(lookup); + + return is_here; + } + + /// Map from graph position to offset along the WFANode. + /// TODO: Having this requires that the WFANode never visits the same + /// oriented geaph node twice. Can we get away with not having this + /// somehow? + size_t node_offset_of(pos_t pos) const { + gbwt::node_type lookup = gbwt::Node::encode(id(pos), is_rev(pos)); + // Find where the referenced graph node starts in us + size_t start = starts_by_node.at(lookup); + // And then apply the offset + size_t result = start + offset(pos); + + return result; + } + + size_t length() const { + return stored_length; + } + + // WFANode::find_pos + // Returns the position for the given score and diagonal with the given path, or an empty position if it does not exist. + MatchPos find_pos(size_t type, int32_t score, int32_t diagonal, const MatchPos::PathList& path) const { + WFAPoint::key_type key { score, diagonal }; + auto& points = this->wavefronts[type]; + // Find the item with the same score and diagonal + auto iter = points.find(key); + if (iter == points.end()) { + // Nothing is stored for this score and diagonal + return MatchPos(); + } + // Otherwise, something is stored for this score and diagonal. Reconstitute it. + WFAPoint point = WFAPoint::from_key_value(key, iter->second); + return point.pos(path); + } + + // Update the WFA matrix with the given alignment position. + void update(size_t type, int32_t score, int32_t diagonal, const MatchPos& pos) { + this->update(type, score, diagonal, pos.seq_offset, pos.node_offset); + } + + // Update the WFA matrix with the given offsets. + void update(size_t type, int32_t score, int32_t diagonal, uint32_t seq_offset, uint32_t node_offset) { + // TODO: Make the WFAPoint and then split it up with its methods instead? + WFAPoint::key_type key { score, diagonal }; + WFAPoint::value_type value { seq_offset, node_offset }; + auto& points = this->wavefronts[type]; + auto iter = points.find(key); + if (iter == points.end()) { + // This is a new score and diagonal + points.emplace_hint(iter, std::move(key), std::move(value)); + } else { + // This score and diagonal already exists, so overwrite the value + iter->second = std::move(value); + } + } + + // Returns a position at the first non-match after the given position. + void match_forward(const std::string& sequence, const gbwtgraph::GBWTGraph& graph, MatchPos& pos) const { + + // Get first graph node starting after our offset. 
+ std::map::const_iterator here = this->states_by_start.upper_bound(pos.node_offset); + if (here == this->states_by_start.begin()) { + // We are somehow starting before the first item (which should start at 0). This should never happen. + throw std::runtime_error("Offset on WFANode starts before its first graph node, which ought to be at 0"); + } + // Get last graph node starting at or before our offset. + --here; + + // We have the index of the state starting at or after the match pos. So it's the one the position is on. + while (here != this->states_by_start.end()) { + // Until we hit the end of the WFANode + + // Grab the handle for the graph node we are at + handle_t handle = gbwtgraph::GBWTGraph::node_to_handle(this->states[here->second].node); + + // And get a view of its sequence + gbwtgraph::view_type node_seq = graph.get_sequence_view(handle); + size_t graph_node_offset = pos.node_offset - here->first; + + while (pos.seq_offset < sequence.length() && graph_node_offset < node_seq.second && sequence[pos.seq_offset] == node_seq.first[graph_node_offset]) { + // Until we hit the end of the sequence, or the graph node, or a mismatch, advance + pos.seq_offset++; + pos.node_offset++; + graph_node_offset = pos.node_offset - here->first; + } + if (graph_node_offset >= node_seq.second) { + // We hit the end of a graph node. + // Advance to the next graph node. + ++here; + } else { + // We hit the end of the sequence, or a mismatch. + break; + } + } + + } + + // Returns a position at the start of the run of matches before the given position. + void match_backward(const std::string& sequence, const gbwtgraph::GBWTGraph& graph, MatchPos& pos) const { + + // Get first graph node starting after our offset. + std::map::const_iterator here = this->states_by_start.upper_bound(pos.node_offset); + if (here == this->states_by_start.begin()) { + // We are somehow starting before the first item (which should start at 0). This should never happen. + throw std::runtime_error("Offset on WFANode starts before its first graph node, which ought to be at 0"); + } + // Get last graph node starting at or before our offset. + --here; + + // We have the index of the state starting at or after the match pos. So it's the one the position is on. + while (pos.seq_offset > 0 && pos.node_offset > 0) { + // Until we hit the start of the WFANode + + // Grab the handle for the graph node we are at + handle_t handle = gbwtgraph::GBWTGraph::node_to_handle(this->states[here->second].node); + // And get a view of its sequence + gbwtgraph::view_type node_seq = graph.get_sequence_view(handle); + size_t graph_node_offset = pos.node_offset - here->first; + + while (pos.seq_offset > 0 && graph_node_offset > 0 && sequence[pos.seq_offset - 1] == node_seq.first[graph_node_offset - 1]) { + // Until we hit the start of the sequence, or the graph node, or a mismatch, go left + pos.seq_offset--; + pos.node_offset--; + graph_node_offset = pos.node_offset - here->first; + } + if (graph_node_offset == 0 && here->first != 0) { + // We hit the start of a graph node, but we could go left still. + // Go left to the next graph node. + --here; + } else { + // We hit the end of the sequence, or the end of the node, or a mismatch. + break; + } + } + } + +}; + +//------------------------------------------------------------------------------ + +class WFATree { +public: + const gbwtgraph::GBWTGraph& graph; + const std::string& sequence; + + /// Each WFANode represents a run of graph nodes, as traversed by a set of haplotypes. 
+ std::vector nodes; + + // Best alignment found so far. If we reached the destination in the graph, + // the score includes the implicit insertion at the end but the point itself + // does not. + WFAPoint candidate_point; + uint32_t candidate_node; + + // WFA score (penalty) parameters derived from the actual scoring parameters. + int32_t mismatch, gap_open, gap_extend; + + // Stop if no alignment has been found with this score or less. + int32_t score_bound; + + struct ScoreProperties { + int32_t min_diagonal; + int32_t max_diagonal; + bool reachable_with_gap; + }; + + // A set of possible scores and diagonals reached with them. + std::map possible_scores; + + // The overall closed range of diagonals reached. + std::pair max_diagonals; + + // TODO: Remove when unnecessary. + bool debug; + + WFATree(const gbwtgraph::GBWTGraph& graph, const std::string& sequence, const gbwt::SearchState& root, uint32_t node_offset, const Aligner& aligner, const WFAExtender::ErrorModel& error_model) : + graph(graph), sequence(sequence), + nodes(), + candidate_point({ std::numeric_limits::max(), 0, 0, 0 }), candidate_node(0), + mismatch(2 * (aligner.match + aligner.mismatch)), + gap_open(2 * (aligner.gap_open - aligner.gap_extension)), + gap_extend(2 * aligner.gap_extension + aligner.match), + score_bound(0), + possible_scores(), max_diagonals(0, 0), + debug(false) + { + this->nodes.emplace_back(this->coalesce(root), 0, this->graph); + // No need to convert the node offset because it is from the root state's node start + this->nodes.front().update(WFANode::MATCHES, 0, 0, 0, node_offset); + + // Determine a reasonable upper bound for the number of edits. + int32_t max_mismatches = error_model.mismatches.evaluate(sequence.length()); + int32_t max_gaps = error_model.gaps.evaluate(sequence.length()); + int32_t max_gap_length = error_model.gap_length.evaluate(sequence.length()); + this->score_bound = max_mismatches * this->mismatch + max_gaps * this->gap_open + max_gap_length * this->gap_extend; + + possible_scores[0] = { 0, 0, false }; + } + + /// Get all the GBWT search states for a run of the same set of haplotypes + /// through nodes in the graph, without any haplotypes in the set branching + /// off, and without any visits to the same oriented graph node twice. + /// TODO: We can only visit each graph node once, or we can't map graph + /// pos_t values back to offsets along the WFANode. Do we need to be able + /// to do that, or can we try not doing that? + /// TODO: Save a scan by unifying with WFANode constructor? + vector coalesce(const gbwt::SearchState& start, size_t base_limit = 1024) { + vector coalesced {start}; + + std::unordered_set visited {start.node}; + gbwt::SearchState here = start; + gbwt::CachedGBWT cache = graph.get_cache(); // TODO: Take in cache? Is this even useful here? + // How many bases have we grabbed? + size_t coalesced_bases = 0; + // How many places did we have to pick from? + size_t options = 1; + while(options == 1) { + // Until we find multiple next places we could go + + // See how far we have come + handle_t node_handle = gbwtgraph::GBWTGraph::node_to_handle(here.node); + size_t node_length = graph.get_length(node_handle); + coalesced_bases += node_length; + if (coalesced_bases >= base_limit) { + // We don't want to look any more bases out; we might be + // wasting our time lloking further than the remaining read. 
+ break; + } + + // If we want to keep going, see where we could go + options = 0; + gbwt::SearchState next; + graph.follow_paths(cache, here, [&](const gbwt::SearchState& reachable) { + options++; + if (options > 1) { + // We found bore than one place to go, so stop coalescing. + return false; + } + next = reachable; + return true; + }); + if (options == 1) { + // We found exactly one place to go. + + if (visited.count(next.node)) { + // We can't go there, we would cycle within a WFANode and + // break mapping from graph position to WFANode offset + break; + } + visited.insert(next.node); + + // Some haplotypes may have dropped out, but it is OK to keep + // coalescing because others did not. + // Go there. + here = next; + coalesced.push_back(here); + } + } + + return coalesced; + } + + uint32_t size() const { return this->nodes.size(); } + static bool is_root(uint32_t node) { return (node == 0); } + uint32_t parent(uint32_t node) const { return this->nodes[node].parent; } + + // Assumes length > 0. + int32_t gap_extend_penalty(uint32_t length) const { + return static_cast(length) * this->gap_extend; + } + + // Assumes length > 0. + int32_t gap_penalty(uint32_t length) const { + return this->gap_open + this->gap_extend_penalty(length); + } + + // wf_extend() in the paper. + // If we reach the end of a node, we continue to the start of the next node even + // if we do not use any characters in it. + void extend(int32_t score, pos_t to) { + for (int32_t diagonal = this->max_diagonals.first; diagonal <= this->max_diagonals.second; diagonal++) { + + std::vector leaves = this->get_leaves(); + this->extend_over(score, diagonal, to, leaves); + } + } + + // Returns the next possible score after the given score. Also updates the set + // of possible scores with those reachable from the given score but does not + // set the diagonal ranges for them. + int32_t next_score(int32_t match_score) { + + int32_t mismatch_score = match_score + this->mismatch; + if (this->possible_scores.find(mismatch_score) == this->possible_scores.end()) { + + this->possible_scores[mismatch_score] = { 0, 0, false }; + } + + // We assume that match_score is a valid score. + auto match_iter = this->possible_scores.find(match_score); + if (match_iter->second.reachable_with_gap) { + int32_t extend_score = match_score + this->gap_extend; + auto extend_iter = this->possible_scores.find(extend_score); + if (extend_iter != this->possible_scores.end()) { + extend_iter->second.reachable_with_gap = true; + } else { + this->possible_scores[extend_score] = { 0, 0, true }; + } + } + + int32_t open_score = match_score + this->gap_open + this->gap_extend; + auto open_iter = this->possible_scores.find(open_score); + if (open_iter != this->possible_scores.end()) { + + open_iter->second.reachable_with_gap = true; + } else { + + this->possible_scores[open_score] = { 0, 0, true }; + } + + // We know that there are further values beyond match_score. + ++match_iter; + return match_iter->first; + } + + // wf_next() in the paper. + // If we reach the end of a node, we continue to the start of the next node even + // if we do not use any characters in it. + void next(int32_t score, pos_t to) { + std::pair diagonal_range = this->get_diagonals(score); + for (int32_t diagonal = diagonal_range.first; diagonal <= diagonal_range.second; diagonal++) { + std::vector leaves = this->get_leaves(); + // Note that we may do the same update from multiple leaves. 
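+        // For each leaf we look up the furthest-reaching predecessor for each
+        // of the three wavefronts: insertions come from (score - gap_open -
+        // gap_extend) or (score - gap_extend) on diagonal - 1, deletions from
+        // the same scores on diagonal + 1, and mismatches from
+        // (score - mismatch) on the same diagonal. The furthest of the three
+        // then seeds the match wavefront for this score and diagonal.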
+ for (uint32_t leaf : leaves) { + MatchPos ins = this->ins_predecessor(leaf, score, diagonal).first; + if (!ins.empty()) { + ins.seq_offset++; + this->nodes[ins.node()].update(WFANode::INSERTIONS, score, diagonal, ins); + } + + MatchPos del = this->del_predecessor(leaf, score, diagonal).first; + if (!del.empty()) { + this->successor_offset(del); + this->nodes[del.node()].update(WFANode::DELETIONS, score, diagonal, del); + this->expand_if_necessary(del); + } + + MatchPos subst = this->find_pos(WFANode::MATCHES, leaf, score - this->mismatch, diagonal, true, true); + if (!subst.empty()) { + subst.seq_offset++; + this->successor_offset(subst); + this->expand_if_necessary(subst); + } + + // Determine the edit that reaches furthest on the diagonal. + bool is_insertion = false; + if (subst < ins) { + subst = std::move(ins); + is_insertion = true; + } + if (subst < del) { + subst = std::move(del); + is_insertion = false; + } + + if (!subst.empty()) { + // If we reached the end position with the edit, we get a candidate + // alignment by assuming that the rest of the sequence is an insertion. + // If the edit is an insertion, we charge the gap open cost again, but + // we already got the same insertion without the extra cost from the + // match preceding the insertion. + if (this->nodes[subst.node()].same_node(to) && subst.node_offset == this->nodes[subst.node()].node_offset_of(to)) { + uint32_t gap_length = this->sequence.length() - subst.seq_offset; + int32_t gap_score = 0; + if (gap_length > 0) { + gap_score = this->gap_penalty(gap_length); + } + if (score + gap_score < this->candidate_point.score) { + this->candidate_point = { score + gap_score, diagonal, subst.seq_offset, subst.node_offset }; + this->candidate_node = subst.node(); + } + } + this->nodes[subst.node()].update(WFANode::MATCHES, score, diagonal, subst); + } + } + } + } + + // Returns the predecessor position for the furthest reaching insertion for + // (score, diagonal) at the specified node or its ancestors, or an empty position + // if it does not exist. Also returns the type of the predecessor. + std::pair ins_predecessor(uint32_t node, int32_t score, int32_t diagonal) const { + MatchPos open = this->find_pos(WFANode::MATCHES, node, score - this->gap_open - this->gap_extend, diagonal - 1, true, false); + MatchPos extend = this->find_pos(WFANode::INSERTIONS, node, score - this->gap_extend, diagonal - 1, true, false); + return (open < extend ? std::make_pair(extend, WFAAlignment::insertion) : std::make_pair(open, WFAAlignment::match)); + } + + // Returns the predecessor position for the furthest reaching deletion for + // (score, diagonal) at the specified node or its ancestors, or an empty position + // if it does not exist. Also returns the type of the predecessor. + std::pair del_predecessor(uint32_t node, int32_t score, int32_t diagonal) const { + MatchPos open = this->find_pos(WFANode::MATCHES, node, score - this->gap_open - this->gap_extend, diagonal + 1, false, true); + MatchPos extend = this->find_pos(WFANode::DELETIONS, node, score - this->gap_extend, diagonal + 1, false, true); + return (open < extend ? std::make_pair(extend, WFAAlignment::deletion) : std::make_pair(open, WFAAlignment::match)); + } + + // Returns the predecessor position for the furthest reaching run of matches + // for (score, diagonal) at the specified node or its ancestors, or an empty + // position if it does not exist. Also returns the type of the predecessor. 
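+    // Here "diagonal" is seq_offset minus the offset in the target, as in the
+    // WFA formulation: a mismatch stays on the same diagonal, an insertion in
+    // the read moves to diagonal + 1, and a deletion moves to diagonal - 1.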
+ std::pair match_predecessor(uint32_t node, int32_t score, int32_t diagonal) const { + MatchPos ins = this->find_pos(WFANode::INSERTIONS, node, score, diagonal, false, false); + MatchPos del = this->find_pos(WFANode::DELETIONS, node, score, diagonal, false, false); + MatchPos subst = this->find_pos(WFANode::MATCHES, node, score - this->mismatch, diagonal, false, false); + if (!subst.empty()) { + subst.seq_offset++; + subst.node_offset++; + } + + if (ins < del) { + return (del < subst ? std::make_pair(subst, WFAAlignment::mismatch) : std::make_pair(del, WFAAlignment::deletion)); + } else { + return (ins < subst ? std::make_pair(subst, WFAAlignment::mismatch) : std::make_pair(ins, WFAAlignment::insertion)); + } + } + + // Move forward on the path corresponding to the position. If the position points + // to the end of a node, we assume that it has not reached the end of the path. + void successor_offset(MatchPos& pos) const { + if (pos.node_offset >= this->nodes[pos.node()].length()) { + pos.pop(); pos.node_offset = 0; + } + pos.node_offset++; + } + + // Updates the node and an offset in it to the predecessor offset. + void predecessor_offset(uint32_t& node, uint32_t& offset) const { + if (offset > 0) { + offset--; + } else { + node = this->parent(node); + offset = this->nodes[node].length() - 1; + } + } + + // Returns true if the position is empty. + static bool no_pos(pos_t pos) { return (id(pos) == 0); } + + // Replaces the candidate with the partial alignment with the highest alignment + // score according to the aligner. + void trim(const Aligner& aligner) { + this->candidate_point = { 0, 0, 0, 0}; + this->candidate_node = 0; + int32_t best_score = 0; + for (uint32_t node = 0; node < this->size(); node++) { + for (auto& point_entry : this->nodes[node].wavefronts[WFANode::MATCHES]) { + // Scan all stored points on the node. + // TODO: Does iteration order matter? + // Convert map entries to points. + WFAPoint point = WFAPoint::from_map_entry(point_entry); + int32_t alignment_score = point.alignment_score(aligner); + if (alignment_score > best_score) { + // This is a new winner + this->candidate_point = point; + this->candidate_node = node; + best_score = alignment_score; + } + } + } + } + +private: + + // wf_extend() on a specific diagonal for the set of (local) haplotypes corresponding to + // the given list of leaves in the tree of GBWT search states. + void extend_over(int32_t score, int32_t diagonal, pos_t to, const std::vector& leaves) { + for (uint32_t leaf : leaves) { + + MatchPos pos = this->find_pos(WFANode::MATCHES, leaf, score, diagonal, false, false); + if (pos.empty()) { + + continue; // An impossible score / diagonal combination. + } + while (true) { + // We want to determine if we could reach our fixed destination point, if it exists + bool may_reach_to; + // And if so, where it would be along this WFANode. + uint32_t to_offset; + if (this->nodes[pos.node()].same_node(to)) { + // Work out where we would have to go + to_offset = this->nodes[pos.node()].node_offset_of(to); + // And if we can get there + may_reach_to = this->nodes[pos.node()].same_node(to) && (pos.node_offset <= to_offset); + } else { + // We can't get there, it's not on this WFANode. + may_reach_to = false; + } + + this->nodes[pos.node()].match_forward(this->sequence, this->graph, pos); + + // We got a match that reached the end or went past it. + // Alternatively there is no end position and we have aligned the entire sequence. + // This gives us a candidate where the rest of the sequence is an insertion. 
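+                // For example (with illustrative penalties, not vg's defaults): if the
+                // destination is reached at score 10 with 5 sequence characters still
+                // unaligned and no overshoot, and gap_open = 6, gap_extend = 1, the
+                // candidate below would get score 10 + gap_penalty(5) = 10 + (6 + 5 * 1) = 21.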
+ if ((may_reach_to && pos.node_offset >= to_offset) || (no_pos(to) && pos.seq_offset >= this->sequence.length())) { + uint32_t overshoot = (no_pos(to) ? 0 : pos.node_offset - to_offset); + uint32_t gap_length = (this->sequence.length() - pos.seq_offset) + overshoot; + int32_t gap_score = 0; + if (gap_length > 0) { + gap_score = this->gap_penalty(gap_length); + } + + if (score + gap_score < this->candidate_point.score) { + this->candidate_point = { score + gap_score, diagonal, pos.seq_offset - overshoot, to_offset }; + this->candidate_node = pos.node(); + } + } + this->nodes[pos.node()].update(WFANode::MATCHES, score, diagonal, pos); + if (pos.node_offset < this->nodes[pos.node()].length()) { + break; + } + this->expand_if_necessary(pos); + if (pos.at_last_node()) { + // We have exhausted the path leading to the current leaf. Make a copy of the children + // of the leaf (the actual list may be invalidated by further expansions) and continue + // aligning over them. + std::vector new_leaves = this->nodes[leaf].children; + this->extend_over(score, diagonal, to, new_leaves); + break; + } + pos.pop(); + pos.node_offset = 0; + } + } + } + + std::vector get_leaves() const { + std::vector leaves; + for (uint32_t node = 0; node < this->size(); node++) { + if (this->nodes[node].is_leaf()) { + leaves.push_back(node); + } + } + return leaves; + } + + std::pair update_range(std::pair range, int32_t score) const { + if (score >= 0) { + auto iter = this->possible_scores.find(score); + if (iter != this->possible_scores.end()) { + range.first = std::min(range.first, iter->second.min_diagonal); + range.second = std::max(range.second, iter->second.max_diagonal); + } + } + return range; + } + + // Determines the diagonal range for the given score and store it in possible_scores. + // Assumes that the score is valid. Updates max_diagonals. + // Returns an empty range if the score is impossible. + std::pair get_diagonals(int32_t score) { + // Determine the diagonal range for the given score. + std::pair range(1, -1); + range = this->update_range(range, score - this->mismatch); // Mismatch. + range = this->update_range(range, score - this->gap_open - this->gap_extend); // New gap. + range = this->update_range(range, score - this->gap_extend); // Extend an existing gap. + if (range.first > range.second) { + return range; + } + + range.first--; range.second++; + this->max_diagonals.first = std::min(this->max_diagonals.first, range.first); + this->max_diagonals.second = std::max(this->max_diagonals.second, range.second); + auto iter = this->possible_scores.find(score); + iter->second.min_diagonal = range.first; + iter->second.max_diagonal = range.second; + + return range; + } + + // If we have reached the end of the current node, expand its children if necessary. + // Call this whenever the alignment advances in the node. 
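+    // For example, once match_forward() has consumed the last base of a WFANode, any
+    // further progress must continue into the successor nodes of the haplotypes, so the
+    // children are created lazily here and a node with no successors is marked as a dead end.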
+ void expand_if_necessary(const MatchPos& pos) { + if (this->nodes[pos.node()].expanded() || pos.node_offset < this->nodes[pos.node()].length()) { + return; + } + bool found = false; + this->graph.follow_paths(this->nodes[pos.node()].states.back(), [&](const gbwt::SearchState& child) -> bool { + this->nodes[pos.node()].children.push_back(this->size()); + this->nodes.emplace_back(this->coalesce(child), pos.node(), this->graph); + found = true; + return true; + }); + if (!found) { + this->nodes[pos.node()].dead_end = true; + } + } + + // WFATree::find_pos + // Returns the furthest position in given WFA matrix for (score, diagonal) at the + // specified node or its ancestors, or an empty position if it does not exist. + // Returns an empty position if an extendable position is requested but the position + // cannot be extended. + MatchPos find_pos(size_t type, uint32_t node, int32_t score, int32_t diagonal, bool extendable_seq, bool extendable_graph) const { + if (score < 0) { + return MatchPos(); + } + MatchPos::PathList path; + while(true) { + // Make the path a bit longer. + path.push(node); + // Find a position at this node. + // The MatchPos will need to know the whole path, so we can return it. + // TODO: Actually manage the moves ourselves to make this faster! + MatchPos pos = this->nodes[node].find_pos(type, score, diagonal, path); + if (!pos.empty()) { + if (extendable_seq && pos.seq_offset >= this->sequence.length()) { + return MatchPos(); + } + if (extendable_graph && this->at_dead_end(pos)) { + return MatchPos(); + } + return pos; + } + if (is_root(node)) { + return MatchPos(); + } + node = this->parent(node); + } + } + + // Assumes that the position is non-empty. + bool at_dead_end(const MatchPos& pos) const { + return (this->nodes[pos.node()].dead_end && pos.node_offset >= this->nodes[pos.node()].length()); + } +}; + +//------------------------------------------------------------------------------ + +WFAAlignment WFAExtender::connect(std::string sequence, pos_t from, pos_t to) const { + if (this->graph == nullptr || this->aligner == nullptr) { +#ifdef debug_connect + std::cerr << "No graph or no aligner! Returning empty alignment!" << std::endl; +#endif + return WFAAlignment(); + } + gbwt::SearchState root_state = this->graph->get_state(this->graph->get_handle(id(from), is_rev(from))); + if (root_state.empty()) { +#ifdef debug_connect + std::cerr << "No root state! Returning empty alignment!" << std::endl; +#endif + return WFAAlignment(); + } + this->mask(sequence); + + WFATree tree(*(this->graph), sequence, root_state, offset(from) + 1, *(this->aligner), *(this->error_model)); + tree.debug = this->debug; + + int32_t score = 0; + while (true) { +#ifdef debug_connect + std::cerr << "Extend for score " << score << std::endl; +#endif + tree.extend(score, to); + + if (tree.candidate_point.score <= score) { + break; + } + + score = tree.next_score(score); +#ifdef debug_connect + std::cerr << "Next score will be " << score << std::endl; +#endif + + if (score > tree.score_bound) { + break; + } + +#ifdef debug_connect + std::cerr << "Next for score " << score << std::endl; +#endif + tree.next(score, to); + } + + // If we do not have a full-length alignment within the score bound, + // we find the best partial alignment if there was no destination or + // return an empty alignment otherwise. 
+ bool full_length = true; + uint32_t unaligned_tail = sequence.length() - tree.candidate_point.seq_offset; + if (tree.candidate_point.score > tree.score_bound) { +#ifdef debug_connect + std::cerr << "No alignment could be found under score bound of " << tree.score_bound << "; best found was " << tree.candidate_point.score << std::endl; +#endif + unaligned_tail = 0; + if (WFATree::no_pos(to)) { + tree.trim(*(this->aligner)); + full_length = false; + } else { + return WFAAlignment(); + } + } + + // Start building an alignment. Store the path first. + // No need to convert the node offset because it is from the root state's node start. + WFAAlignment result { + {}, {}, static_cast(offset(from) + 1), 0, + tree.candidate_point.seq_offset + unaligned_tail, + tree.candidate_point.alignment_score(*(this->aligner), unaligned_tail), + true + }; + uint32_t node = tree.candidate_node; + while (true) { + // Go back up the tree and compose the path in reverse order + for (auto it = tree.nodes[node].states.rbegin(); it != tree.nodes[node].states.rend(); ++it) { + // Visit all the states in each WFANode and put their graph nodes on the path in reverse order. + result.path.push_back(gbwtgraph::GBWTGraph::node_to_handle(it->node)); + } + if (tree.is_root(node)) { + // Stop when we reach the root + break; + } + // Otherwise go to the parent + node = tree.parent(node); + } + std::reverse(result.path.begin(), result.path.end()); + + // We have a full-length alignment within the score bound with an implicit insertion at the end. + WFAPoint point = tree.candidate_point; + node = tree.candidate_node; + if (unaligned_tail > 0) { + uint32_t final_insertion = sequence.length() - tree.candidate_point.seq_offset; + result.append(WFAAlignment::insertion, final_insertion); + point.score -= tree.gap_penalty(unaligned_tail); + } + + // Backtrace the edits. 
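+    // Informally: starting from the candidate point in the match wavefront, we repeatedly
+    // ask which predecessor (match run, mismatch, insertion, or deletion) produced the
+    // current point, append the corresponding edit, and move the score, diagonal, and
+    // offsets back accordingly. For example, undoing a mismatch lowers the score by the
+    // mismatch penalty and steps both the sequence offset and the node offset back by one,
+    // leaving the diagonal unchanged.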
+ WFAAlignment::Edit edit = WFAAlignment::match; + while (point.seq_offset > 0 || point.diagonal != 0) { + std::pair predecessor; + switch (edit) + { + case WFAAlignment::match: + predecessor = tree.match_predecessor(node, point.score, point.diagonal); + result.append(WFAAlignment::match, point.seq_offset - predecessor.first.seq_offset); + point.seq_offset = predecessor.first.seq_offset; + point.node_offset = predecessor.first.node_offset; + if (!predecessor.first.empty()) { + node = predecessor.first.node(); + } + edit = predecessor.second; + break; + case WFAAlignment::mismatch: + result.append(WFAAlignment::mismatch, 1); + point.seq_offset--; + tree.predecessor_offset(node, point.node_offset); + point.score -= tree.mismatch; + edit = WFAAlignment::match; + break; + case WFAAlignment::insertion: + predecessor = tree.ins_predecessor(node, point.score, point.diagonal); + result.append(WFAAlignment::insertion, 1); + point.seq_offset--; + if (predecessor.second == WFAAlignment::insertion) { + point.score -= tree.gap_extend; + } else { + point.score -= tree.gap_open + tree.gap_extend; + } + point.diagonal--; + edit = predecessor.second; + break; + case WFAAlignment::deletion: + predecessor = tree.del_predecessor(node, point.score, point.diagonal); + result.append(WFAAlignment::deletion, 1); + tree.predecessor_offset(node, point.node_offset); + if (predecessor.second == WFAAlignment::deletion) { + point.score -= tree.gap_extend; + } else { + point.score -= tree.gap_open + tree.gap_extend; + } + point.diagonal++; + edit = predecessor.second; + break; + } + } + std::reverse(result.edits.begin(), result.edits.end()); + + // We used "from + 1" as the starting position for the alignment. That could have + // been a past-the-end position in the initial node. Once we have an actual path + // instead of a tree of potential paths, we can remove the unused node. + if (!result.path.empty() && result.node_offset >= this->graph->get_length(result.path.front())) { + result.path.erase(result.path.begin()); + result.node_offset = 0; + } + + // Due to the way we expand the tree of GBWT search states and store wavefront + // information in the leaves, and how we coalesce graph nodes into WFANode + // objects, we sometimes do not use any bases in trailing graph nodes. + // We deal with this now to avoid facing the issue later. + int64_t final_offset = result.final_offset(*(this->graph)); + + while ((result.path.size() == 1 && final_offset == result.node_offset) || (result.path.size() > 1 && final_offset <= 0)) { + // No bases actually used on the final node, so drop it and adjust the offset. + result.path.pop_back(); + if (!result.path.empty()) { + // Offset should now be relative to the start of this node, and not end of this node/start of node after + final_offset += this->graph->get_length(result.path.back()); + + } else { + + } + } + +#ifdef debug_connect + std::cerr << "Found an alignment." << std::endl; +#endif + return result; +} + +WFAAlignment WFAExtender::suffix(const std::string& sequence, pos_t from) const { + return this->connect(sequence, from, pos_t(0, false, 0)); +} + +WFAAlignment WFAExtender::prefix(const std::string& sequence, pos_t to) const { + if (this->graph == nullptr) { + return WFAAlignment(); + } + + // Flip the position, extend forward, and reverse the return value. 
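+    // For example (illustrative): reverse_base_pos() flips the strand and mirrors the
+    // offset, so offset 3 on a node of length 10 becomes offset 10 - 3 - 1 = 6 on the
+    // opposite strand.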
+ to = reverse_base_pos(to, this->graph->get_length(this->graph->get_handle(id(to), is_rev(to)))); + WFAAlignment result = this->connect(reverse_complement(sequence), to, pos_t(0, false, 0)); + result.flip(*(this->graph), sequence); + + return result; +} + +//------------------------------------------------------------------------------ + +} // namespace vg diff --git a/src/gbwt_extender.hpp b/src/gbwt_extender.hpp new file mode 100644 index 00000000000..e7286a284dc --- /dev/null +++ b/src/gbwt_extender.hpp @@ -0,0 +1,446 @@ +#ifndef VG_GBWT_EXTENDER_HPP_INCLUDED +#define VG_GBWT_EXTENDER_HPP_INCLUDED + +/** \file + * Haplotype-consistent seed extension in GBWTGraph. + */ + +#include +#include + +#include "aligner.hpp" + +#include + +namespace vg { + +//------------------------------------------------------------------------------ + +/** + * A result of the gapless extension of a seed. + * - The extension is a path starting from offset 'offset' of node path.front(). + * - Search state 'state' corresponds to the path. + * - The extension covers semiopen interval [read_interval.first, read_interval.second) + * of the read. + * - Vector 'mismatch_positions' contains the mismatching read positions in sorted order. + * - 'score' is an alignment score (bigger is better). + * - Flags 'left_full' and 'right_full' indicate whether the extension covers the + * start/end of the read. + */ +struct GaplessExtension +{ + typedef std::pair seed_type; // (handle, read_offset - node_offset). + + // In the graph. + std::vector path; + size_t offset; + gbwt::BidirectionalState state; + + // In the read. + std::pair read_interval; + std::vector mismatch_positions; + + // Alignment properties. + int32_t score; + bool left_full, right_full; + + // For internal use. + bool left_maximal, right_maximal; + uint32_t internal_score; // Total number of mismatches. + uint32_t old_score; // Mismatches before the current flank. + + /// Length of the extension. + size_t length() const { return this->read_interval.second - this->read_interval.first; } + + /// Is the extension empty? + bool empty() const { return (this->length() == 0); } + + /// Is the extension a full-length alignment? + bool full() const { return (this->left_full & this->right_full); } + + /// Is the extension an exact match? + bool exact() const { return this->mismatch_positions.empty(); } + + /// Number of mismatches in the extension. + size_t mismatches() const { return this->mismatch_positions.size(); } + + /// Does the extension contain the seed? + bool contains(const HandleGraph& graph, seed_type seed) const; + + /// Return the starting position of the extension. + Position starting_position(const HandleGraph& graph) const; + + /// Return the position after the extension. + Position tail_position(const HandleGraph& graph) const; + + /// Return the node offset after the extension. + size_t tail_offset(const HandleGraph& graph) const; + + /// Number of shared (read position, graph position) pairs in the extensions. + size_t overlap(const HandleGraph& graph, const GaplessExtension& another) const; + + /// Take a prefix of the extension as a new GaplessExtension object. + GaplessExtension prefix(size_t prefix_length) const; + + /// Convert the extension into a Path. The sequence should be the full read-space sequence. + Path to_path(const HandleGraph& graph, const std::string& sequence) const; + + /// For priority queues. 
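+    /// For example (hypothetical usage), a std::priority_queue<GaplessExtension> keeps
+    /// the highest-scoring extension on top, because the comparison is by score.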
+    bool operator<(const GaplessExtension& another) const {
+        return (this->score < another.score);
+    }
+
+    /// Two extensions are equal if the same read interval matches the same search state
+    /// with the same node offset.
+    bool operator==(const GaplessExtension& another) const {
+        return (this->read_interval == another.read_interval && this->state == another.state && this->offset == another.offset);
+    }
+
+    /// Two extensions are not equal if the state, the read interval, or the node offset is different.
+    bool operator!=(const GaplessExtension& another) const {
+        return !(this->operator==(another));
+    }
+};
+
+//------------------------------------------------------------------------------
+
+/**
+ * A utility class that masks all characters except the specified ones with X,
+ * which is assumed to not exist in the alignment target.
+ */
+class ReadMasker {
+public:
+    /// Creates a new ReadMasker that masks all characters except the specified
+    /// ones.
+    explicit ReadMasker(const std::string& valid_chars);
+
+    /// Applies the mask to the given sequence.
+    void operator()(std::string& sequence) const;
+
+private:
+    std::vector<char> mask;
+};
+
+//------------------------------------------------------------------------------
+
+/**
+ * A class that supports haplotype-consistent seed extension using GBWTGraph. Each seed
+ * is a pair of matching read/graph positions and each extension is a gapless alignment
+ * of an interval of the read to a haplotype.
+ * A cluster is an unordered set of distinct seeds. Seeds in the same node with the same
+ * (read_offset - node_offset) difference are considered equivalent.
+ * GaplessExtender also needs an Aligner object for scoring the extension candidates.
+ */
+class GaplessExtender {
+public:
+    typedef GaplessExtension::seed_type seed_type;
+    typedef pair_hash_set<seed_type> cluster_type;
+
+    /// The default value for the maximum number of mismatches.
+    constexpr static size_t MAX_MISMATCHES = 4;
+
+    /// Two full-length alignments are distinct if the fraction of overlapping
+    /// position pairs is at most this.
+    constexpr static double OVERLAP_THRESHOLD = 0.8;
+
+    /// Create an empty GaplessExtender.
+    GaplessExtender();
+
+    /// Create a GaplessExtender using the given GBWTGraph and Aligner objects.
+    GaplessExtender(const gbwtgraph::GBWTGraph& graph, const Aligner& aligner);
+
+    /// Convert (graph position, read offset) to a seed.
+    static seed_type to_seed(pos_t pos, size_t read_offset) {
+        return seed_type(gbwtgraph::GBWTGraph::node_to_handle(gbwt::Node::encode(id(pos), is_rev(pos))),
+                         static_cast<int64_t>(read_offset) - static_cast<int64_t>(offset(pos)));
+    }
+
+    /// Get the graph position from a seed.
+    static pos_t get_pos(seed_type seed) {
+        gbwt::node_type node = gbwtgraph::GBWTGraph::handle_to_node(seed.first);
+        return make_pos_t(gbwt::Node::id(node), gbwt::Node::is_reverse(node), get_node_offset(seed));
+    }
+
+    /// Get the handle from a seed.
+    static handle_t get_handle(seed_type seed) {
+        return seed.first;
+    }
+
+    /// Get the node offset from a seed.
+    static size_t get_node_offset(seed_type seed) {
+        return (seed.second < 0 ? -(seed.second) : 0);
+    }
+
+    /// Get the read offset from a seed.
+    static size_t get_read_offset(seed_type seed) {
+        return (seed.second < 0 ? 0 : seed.second);
+    }
+
+    /**
+     * Find the highest-scoring extension for each seed in the cluster.
+     * If there is a full-length extension with at most max_mismatches
+     * mismatches, sort them in descending order by score and return the
+     * best non-overlapping full-length extensions. 
Two extensions overlap + * if the fraction of identical base mappings is greater than + * overlap_threshold. + * If there are no good enough full-length extensions, trim the + * extensions to maximize the score and remove duplicates. In this + * case, the extensions are sorted by read interval. + * Use full_length_extensions() to determine the type of the returned + * extension set. + * The sequence that will be aligned is passed by value. All non-ACGT + * characters are masked with character X, which should not match any + * character in the graph. + * Allow any number of mismatches in the initial node, at least + * max_mismatches mismatches in the entire extension, and at least + * max_mismatches / 2 mismatches on each flank. + * Use the provided CachedGBWTGraph or allocate a new one. + */ + std::vector extend(cluster_type& cluster, std::string sequence, const gbwtgraph::CachedGBWTGraph* cache = nullptr, size_t max_mismatches = MAX_MISMATCHES, double overlap_threshold = OVERLAP_THRESHOLD) const; + + /** + * Determine whether the extension set contains non-overlapping + * full-length extensions sorted in descending order by score. Use + * the same value of max_mismatches as in extend(). + */ + static bool full_length_extensions(const std::vector& result, size_t max_mismatches = MAX_MISMATCHES); + + const gbwtgraph::GBWTGraph* graph; + const Aligner* aligner; + ReadMasker mask; +}; + +//------------------------------------------------------------------------------ + +/** + * An alignment found by WFAAligner, consisting of a sequence of nodes, a sequence + * of edits, and a starting offset in the initial node. The alignment score is + * computed using the match / mismatch / gap open / gap extend parameters in the + * Aligner object given to the WFAAligner. Full-length bonuses are not used. + * + * Note: The aligner merges consecutive edit operations of the same type. Hence an + * edit may span multiple nodes. + * + * This struct is an aggregate and can be aggregate-initialized with a + * brace-enclosed initializer list. + */ +struct WFAAlignment { + enum Edit { match, mismatch, insertion, deletion }; + + /// Generate a WFAAlignment from a GaplessExtension. This can't be a + /// constructor because then WFAAlignment wouldn't be able to be + /// aggregate-initialized. + static WFAAlignment from_extension(const GaplessExtension& extension); + + /// Generate a WFAAlignment that is an unlocalized insertion of the given + /// length. Can also be used to represent a softclip. + /// Length may not be 0. + static WFAAlignment make_unlocalized_insertion(size_t sequence_offset, size_t length, int score); + + /// Generate an empty WFAAlignment. + static WFAAlignment make_empty(); + + /// Sequence of oriented nodes. + std::vector path; + + /// Sequence of edit operations and their lengths. + std::vector> edits; + + /// Starting offset in the initial node. + /// If there is no initial node, the value stored is undefined. + uint32_t node_offset = 0; + + /// Starting offset in the sequence. + uint32_t seq_offset = 0; + + /// Length of the alignment in the sequence. + uint32_t length = 0; + + /// Alignment score. + int32_t score = 0; + + /// Is this an actual alignment or a failure? + bool ok = false; + + /// Is this an actual alignment or a failure? + operator bool() const { return this->ok; } + + /// Returns true if the alignment does not cover anything in the sequence + /// and the graph. 
+ bool empty() const { return (this->path.empty() && this->edits.empty()); } + + /// Returns true if the alignment is an insertion without a location. + bool unlocalized_insertion() const; + + /// Computes the node offset after the alignment in the final node. + /// Will be negative if the alignment's final node(s) are not actually used by edits. + int64_t final_offset(const gbwtgraph::GBWTGraph& graph) const; + + /// Transforms the alignment to the other strand. + void flip(const gbwtgraph::GBWTGraph& graph, const std::string& sequence); + + /// Appends an edit operation, merging it with the latest edit if possible. + /// Ignores empty edits. + void append(Edit edit, uint32_t length); + + /// Concatenate another WFAAlignment onto this one, assuming they abut in + /// the read and the graph. Either may be empty, or an unlocalized insertion. + void join(const WFAAlignment& second); + + /// Convert the WFAAlignment into a Path. + Path to_path(const HandleGraph& graph, const std::string& sequence) const; + + /// Prints some debug information about the alignment. + std::ostream& print(const HandleGraph& graph, std::ostream& out) const; + /// Prints some debug information about the alignment. + std::ostream& print(std::ostream& out) const; + + /// Make sure the stored lengths, paths, and edits are accurate and + /// consistent with each other. + void check_lengths(const HandleGraph& graph) const; +}; + +/// Allow printing an Edit +std::ostream& operator<<(std::ostream& out, const WFAAlignment::Edit& edit); + +} + +namespace std { + /// Convert a WFAAlignment Edit operation to a string + std::string to_string(const vg::WFAAlignment::Edit& edit); +} + +namespace vg { + +//------------------------------------------------------------------------------ + +/** + * A class that supports haplotype-consistent seed extension in a GBWTGraph using the + * WFA algorithm: + * + * Marco-Sola, Moure, Moreto, Espinosa: Fast gap-affine pairwise alignment using the + * wavefront algorithm. Bioinformatics, 2021. + * + * The algorithm either tries to connect two seeds or extends a seed to the start/end + * of the read. + * + * WFAExtender also needs an Aligner object for scoring the extension candidates. + * While VG wants to maximize a four-parameter alignment score, WFA minimizes a + * three-parameter score. We use the conversion between the parameters from: + * + * Eizenga, Paten: Improving the time and space complexity of the WFA algorithm + * and generalizing its scoring. bioRxiv, 2022. + * + * VG scores a gap of length `n` as `gap_open + (n - 1) * gap_extend`, while WFA + * papers use `gap_open + n * gap_extend`. Hence we use `gap_open - gap_extend` as + * the effective four-parameter gap open score inside the aligner. + * + * NOTE: Most internal arithmetic operations use 32-bit integers. + */ +class WFAExtender { +public: + + /** + * Defines how many of what kind of errors the WFA alignment algorithm + * should tolerate, as a function of sequence length. + * + * The WFA alignment can't actually limit edits by type, but it can limit + * the score such that you can't exceed all the limits on different event + * types at once. + */ + struct ErrorModel { + + /// We have one of these each for matches, mismatches, and gaps to + /// define how many of them to allow. + struct Event { + /// How many of the event should we allow per base? + double per_base; + /// How many should we allow at least or at most, regardless + /// of sequence length? Min is a bonus over the per-base calculation, + /// but max is a cap. 
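+            /// For example (illustrative values, not vg's defaults): with per_base = 0.125,
+            /// min = 1, and max = 10, an 80 bp sequence would get
+            /// std::min(10, int32_t(0.125 * 80) + 1) = 10, i.e. the cap applies.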
+ int32_t min, max; + + /// Evaluate the model and get a limit number for this kind of + /// event for a given sequence length. + inline int32_t evaluate(size_t length) const { + return std::min(max, (int32_t)(per_base * length) + min); + } + }; + + /// Limits for mismatches + Event mismatches; + /// Limits for total gaps (*not* gap opens; a gap open uses 1 gap and 1 gap length) + Event gaps; + /// Limits for total gap length (gap extends plus gap opens) + Event gap_length; + }; + + /// If not specified, we use this default error model. + static const ErrorModel default_error_model; + + /// Create an empty WFAExtender. + WFAExtender(); + + /// Create a WFAExtender using the given GBWTGraph and Aligner objects. + /// If an error model is passed, use that instead of the default error model. + /// All arguments must outlive the WFAExtender. + WFAExtender(const gbwtgraph::GBWTGraph& graph, const Aligner& aligner, const ErrorModel& error_model = default_error_model); + + /** + * Align the sequence to a haplotype between the two graph positions. + * + * The endpoints are assumed to be valid graph positions. In order for + * there to be an alignment, there must be a haplotype that includes the + * endpoints and connects them. However, the endpoints are not covered + * by the returned alignment. + * + * The sequence that will be aligned is passed by value. All non-ACGT + * characters are masked with character X, which should not match any + * character in the graph. + * + * Returns a failed alignment if there is no alignment with an acceptable + * score. + * + * NOTE: The alignment is to a path after `from` and before `to`. If the + * points are identical, such a path can only exist if there is a cycle. + */ + WFAAlignment connect(std::string sequence, pos_t from, pos_t to) const; + + /** + * A special case of connect() for aligning the sequence to a haplotype + * starting at the given position. If there is no alignment for the + * entire sequence with an acceptable score, returns the highest-scoring + * partial alignment, which may be empty. + * + * NOTE: This creates a suffix of the full alignment by aligning a + * prefix of the sequence. + * TODO: Should we use full-length bonuses? + */ + WFAAlignment suffix(const std::string& sequence, pos_t from) const; + + /** + * A special case of connect() for aligning the sequence to a haplotype + * ending at the given position. If there is no alignment for the entire + * sequence with an acceptable score, returns the highest-scoring partial + * alignment, which may be empty. + * + * NOTE: This creates a prefix of the full alignment by aligning a suffix + * of the sequence. + * TODO: Should we use full-length bonuses? + */ + WFAAlignment prefix(const std::string& sequence, pos_t to) const; + + const gbwtgraph::GBWTGraph* graph; + ReadMasker mask; + const Aligner* aligner; + const ErrorModel* error_model; + + /// TODO: Remove when unnecessary. + bool debug = false; +}; + +//------------------------------------------------------------------------------ + +} // namespace vg + +#endif // VG_GBWT_EXTENDER_HPP_INCLUDED diff --git a/src/gbwt_helper.cpp b/src/gbwt_helper.cpp new file mode 100644 index 00000000000..cdf59e4e928 --- /dev/null +++ b/src/gbwt_helper.cpp @@ -0,0 +1,620 @@ +#include "gbwt_helper.hpp" +#include "utility.hpp" + +#include +#include +#include + +#include +#include + +namespace vg { + +std::vector parseGenotypes(const std::string& vcf_line, size_t num_samples) { + std::vector result; + + // The 9th tab-separated field should start with "GT". 
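+    // For example, a (truncated, illustrative) VCF data line looks like
+    //
+    //   20  14370  rs6054257  G  A  29  PASS  NS=3  GT:GQ  0|0:48  1|0:48  1/1:43
+    //
+    // with the columns separated by tabs: fields 1-8 are skipped below, field 9 must
+    // start with "GT", and the genotype strings extracted from the sample columns
+    // would be "0|0", "1|0", and "1/1".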
+ size_t offset = 0; + for (int i = 0; i < 8; i++) { + size_t pos = vcf_line.find('\t', offset); + if (pos == std::string::npos) { + std::cerr << "error: [vg index] VCF line does not contain genotype information" << std::endl; + std::exit(EXIT_FAILURE); + } + offset = pos + 1; + } + if (vcf_line.substr(offset, 2) != "GT") { + std::cerr << "error: [vg index] VCF line does not contain genotype information" << std::endl; + std::exit(EXIT_FAILURE); + } + + // Genotype strings are the first colon-separated fields in the 10th+ tab-separated fields. + offset = vcf_line.find('\t', offset); + while (offset != std::string::npos && offset + 1 < vcf_line.length()) { + offset++; + size_t pos = vcf_line.find_first_of("\t:", offset); + if (pos == std::string::npos) { + pos = vcf_line.length(); + } + result.emplace_back(vcf_line.substr(offset, pos - offset)); + offset = vcf_line.find('\t', offset); + } + + if (result.size() != num_samples) { + std::cerr << "error: [vg index] expected " << num_samples << " samples, got " << result.size() << std::endl; + std::exit(EXIT_FAILURE); + } + + return result; +} + +//------------------------------------------------------------------------------ + +gbwt::vector_type extract_as_gbwt_path(const PathHandleGraph& graph, const std::string& path_name) { + gbwt::vector_type result; + if (!graph.has_path(path_name)) { + return result; + } + path_handle_t path_handle = graph.get_path_handle(path_name); + result.reserve(graph.get_step_count(path_handle)); + for (handle_t handle : graph.scan_path(path_handle)) { + result.push_back(gbwt::Node::encode(graph.get_id(handle), graph.get_is_reverse(handle))); + } + return result; +} + +gbwt::vector_type path_predecessors(const PathHandleGraph& graph, const std::string& path_name) { + gbwt::vector_type result; + if (!graph.has_path(path_name)) { + return result; + } + path_handle_t path_handle = graph.get_path_handle(path_name); + if (graph.get_step_count(path_handle) == 0) { + return result; + } + step_handle_t step = graph.path_begin(path_handle); + handle_t handle = graph.get_handle_of_step(step); + graph.follow_edges(handle, true, [&] (handle_t prev) { + if (prev != handle) { + result.push_back(gbwt::Node::encode(graph.get_id(prev), graph.get_is_reverse(prev))); + } + }); + return result; +} + +//------------------------------------------------------------------------------ + +gbwt::size_type gbwt_node_width(const HandleGraph& graph) { + return gbwt::bit_length(gbwt::Node::encode(graph.max_node_id(), true)); +} + +void finish_gbwt_constuction(gbwt::GBWTBuilder& builder, + const std::vector& sample_names, + const std::vector& contig_names, + size_t haplotype_count, bool print_metadata, + const std::string& header) { + + builder.finish(); + builder.index.metadata.setSamples(sample_names); + builder.index.metadata.setHaplotypes(haplotype_count); + builder.index.metadata.setContigs(contig_names); + if (print_metadata) { + #pragma omp critical + { + std::cerr << header << ": "; + gbwt::operator<<(std::cerr, builder.index.metadata); + std::cerr << std::endl; + } + } +} + +//------------------------------------------------------------------------------ + +void load_gbwt(gbwt::GBWT& index, const std::string& filename, bool show_progress) { + if (show_progress) { + std::cerr << "Loading compressed GBWT from " << filename << std::endl; + } + std::unique_ptr loaded = vg::io::VPKG::load_one(filename); + if (loaded.get() == nullptr) { + std::cerr << "error: [load_gbwt()] cannot load compressed GBWT " << filename << std::endl; + 
std::exit(EXIT_FAILURE); + } + index = std::move(*loaded); +} + +void load_gbwt(gbwt::DynamicGBWT& index, const std::string& filename, bool show_progress) { + if (show_progress) { + std::cerr << "Loading dynamic GBWT from " << filename << std::endl; + } + std::unique_ptr loaded = vg::io::VPKG::load_one(filename); + if (loaded.get() == nullptr) { + std::cerr << "error: [load_gbwt()] cannot load dynamic GBWT " << filename << std::endl; + std::exit(EXIT_FAILURE); + } + index = std::move(*loaded); +} + +void load_r_index(gbwt::FastLocate& index, const std::string& filename, bool show_progress) { + if (show_progress) { + std::cerr << "Loading r-index from " << filename << std::endl; + } + std::unique_ptr loaded = vg::io::VPKG::load_one(filename); + if (loaded.get() == nullptr) { + std::cerr << "error: [load_r_index()] cannot load r-index " << filename << std::endl; + std::exit(EXIT_FAILURE); + } + index = std::move(*loaded); +} + +void save_gbwt(const gbwt::GBWT& index, const std::string& filename, bool show_progress) { + if (show_progress) { + std::cerr << "Saving compressed GBWT to " << filename << std::endl; + } + sdsl::simple_sds::serialize_to(index, filename); +} + +void save_gbwt(const gbwt::DynamicGBWT& index, const std::string& filename, bool show_progress) { + if (show_progress) { + std::cerr << "Saving dynamic GBWT to " << filename << std::endl; + } + sdsl::simple_sds::serialize_to(index, filename); +} + +void save_r_index(const gbwt::FastLocate& index, const std::string& filename, bool show_progress) { + if (show_progress) { + std::cerr << "Saving r-index to " << filename << std::endl; + } + if (!sdsl::store_to_file(index, filename)) { + std::cerr << "error: [save_r_index()] cannot write r-index to " << filename << std::endl; + std::exit(EXIT_FAILURE); + } +} + +//------------------------------------------------------------------------------ + +void GBWTHandler::use_compressed() { + if (this->in_use == index_compressed) { + return; + } else if (this->in_use == index_dynamic) { + if (this->show_progress) { + std::cerr << "Converting dynamic GBWT into compressed GBWT" << std::endl; + } + this->compressed = gbwt::GBWT(this->dynamic); + this->dynamic = gbwt::DynamicGBWT(); + this->in_use = index_compressed; + } else { + load_gbwt(this->compressed, this->filename, this->show_progress); + this->in_use = index_compressed; + } +} + +void GBWTHandler::use_dynamic() { + if (this->in_use == index_dynamic) { + return; + } else if (this->in_use == index_compressed) { + if (this->show_progress) { + std::cerr << "Converting compressed GBWT into dynamic GBWT" << std::endl; + } + this->dynamic = gbwt::DynamicGBWT(this->compressed); + this->compressed = gbwt::GBWT(); + this->in_use = index_dynamic; + } else { + load_gbwt(this->dynamic, this->filename, this->show_progress); + this->in_use = index_dynamic; + } +} + +void GBWTHandler::use(gbwt::GBWT& new_index) { + this->clear(); + this->unbacked(); + this->compressed.swap(new_index); + this->in_use = index_compressed; +} + +void GBWTHandler::use(gbwt::DynamicGBWT& new_index) { + this->clear(); + this->unbacked(); + this->dynamic.swap(new_index); + this->in_use = index_dynamic; +} + +void GBWTHandler::unbacked() { + this->filename = std::string(); +} + +void GBWTHandler::serialize(const std::string& new_filename) { + if (this->in_use == index_none) { + std::cerr << "warning: [GBWTHandler] no GBWT to serialize" << std::endl; + return; + } else if (this->in_use == index_compressed) { + save_gbwt(this->compressed, new_filename, this->show_progress); + } 
else {
+        save_gbwt(this->dynamic, new_filename, this->show_progress);
+    }
+    this->filename = new_filename;
+}
+
+void GBWTHandler::clear() {
+    this->compressed = gbwt::GBWT();
+    this->dynamic = gbwt::DynamicGBWT();
+    this->in_use = index_none;
+}
+
+//------------------------------------------------------------------------------
+
+// Partition the GBWT sequences between jobs by the first node.
+std::vector<std::vector<gbwt::size_type>> partition_gbwt_sequences(const gbwt::GBWT& gbwt_index, const std::unordered_map<nid_t, size_t>& node_to_job, size_t num_jobs) {
+    std::vector<std::vector<gbwt::size_type>> result(num_jobs);
+    for (gbwt::size_type sequence = 0; sequence < gbwt_index.sequences(); sequence += 2) {
+        gbwt::edge_type start = gbwt_index.start(sequence);
+        if (start != gbwt::invalid_edge()) {
+            nid_t node = gbwt::Node::id(start.first);
+            auto iter = node_to_job.find(node);
+            if (iter != node_to_job.end()) {
+                result[iter->second].push_back(sequence);
+            } else if (start.first == gbwt::ENDMARKER) {
+                result[0].push_back(sequence);
+            }
+        }
+    }
+    return result;
+}
+
+// Build a GBWT by inserting the specified sequences and applying the specified mappings.
+gbwt::GBWT rebuild_gbwt_job(const gbwt::GBWT& gbwt_index, const RebuildJob& job, const std::vector<gbwt::size_type>& sequences, const RebuildParameters& parameters) {
+
+    // Partition the mappings by the first node and determine node width.
+    gbwt::size_type node_width = sdsl::bits::length(gbwt_index.sigma() - 1);
+    std::unordered_map<gbwt::node_type, std::vector<RebuildJob::mapping_type>> mappings_by_first_node;
+    for (auto& mapping : job.mappings) {
+        if (mapping.first.empty() || mapping.first == mapping.second) {
+            continue;
+        }
+        mappings_by_first_node[mapping.first.front()].push_back(mapping);
+        std::pair<gbwt::vector_type, gbwt::vector_type> reverse;
+        gbwt::reversePath(mapping.first, reverse.first);
+        gbwt::reversePath(mapping.second, reverse.second);
+        mappings_by_first_node[reverse.first.front()].push_back(reverse);
+        for (auto node : mapping.second) {
+            node_width = std::max(node_width, static_cast<gbwt::size_type>(sdsl::bits::length(node)));
+        }
+    }
+
+    // Insert the sequences from the original GBWT and apply the mappings.
+    gbwt::GBWTBuilder builder(node_width, parameters.batch_size, parameters.sample_interval);
+    for (gbwt::size_type sequence : sequences) {
+        gbwt::vector_type path = gbwt_index.extract(sequence);
+        gbwt::vector_type mapped;
+        size_t i = 0;
+        while (i < path.size()) {
+            auto iter = mappings_by_first_node.find(path[i]);
+            bool found = false;
+            if (iter != mappings_by_first_node.end()) {
+                for (auto& mapping : iter->second) {
+                    size_t j = 1;
+                    while (j < mapping.first.size() && i + j < path.size() && mapping.first[j] == path[i + j]) {
+                        j++;
+                    }
+                    if (j >= mapping.first.size()) {
+                        // Leave the last node unprocessed if it does not change.
+                        if (mapping.first.size() > 1 && mapping.second.size() > 0 && mapping.first.back() == mapping.second.back()) {
+                            mapped.insert(mapped.end(), mapping.second.begin(), mapping.second.end() - 1);
+                            i += mapping.first.size() - 1;
+                        } else {
+                            mapped.insert(mapped.end(), mapping.second.begin(), mapping.second.end());
+                            i += mapping.first.size();
+                        }
+                        found = true;
+                        break;
+                    }
+                }
+            }
+            if (!found) {
+                mapped.push_back(path[i]);
+                i++;
+            }
+        }
+        builder.insert(mapped, true);
+    }
+    builder.finish();
+
+    return gbwt::GBWT(builder.index);
+}
+
+// Copy metadata from source to target, but reorder path names according to the merging order.
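+// For example, if the jobs are merged in size order 2, 0, 1, the path names are re-added
+// in that same order, so path ids from the original GBWT are not preserved (see the NOTE
+// on rebuild_gbwt() about old thread ids becoming invalid).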
+void copy_metadata(const gbwt::GBWT& source, gbwt::GBWT& target, const std::vector>& jobs, const std::vector& job_order) { + if (!source.hasMetadata()) { + return; + } + target.addMetadata(); + target.metadata = source.metadata; + if (source.metadata.hasPathNames()) { + target.metadata.clearPathNames(); + for (size_t job : job_order) { + for (gbwt::size_type sequence : jobs[job]) { + target.metadata.addPath(source.metadata.path(gbwt::Path::id(sequence))); + } + } + } +} + +gbwt::GBWT rebuild_gbwt(const gbwt::GBWT& gbwt_index, + const std::vector& jobs, + const std::unordered_map& node_to_job, + const RebuildParameters& parameters) { + + if (gbwt_index.empty() || jobs.empty()) { + return gbwt_index; + } + gbwt::Verbosity::set(gbwt::Verbosity::SILENT); + + // Sort the jobs in descending order by size. + std::vector jobs_by_size(jobs.size()); + for (size_t i = 0; i < jobs_by_size.size(); i++) { + jobs_by_size[i] = i; + } + std::sort(jobs_by_size.begin(), jobs_by_size.end(), [&](size_t a, size_t b) -> bool { + return (jobs[a].total_size > jobs[b].total_size); + }); + + // Build indexes in parallel. + if (parameters.show_progress) { + std::cerr << "rebuild_gbwt(): Building " << jobs.size() << " partial GBWTs using up to " << parameters.num_jobs << " parallel jobs" << std::endl; + } + std::vector indexes(jobs.size()); + std::vector> sequences_by_job = partition_gbwt_sequences(gbwt_index, node_to_job, jobs.size()); + int old_max_threads = omp_get_max_threads(); + omp_set_num_threads(parameters.num_jobs); + #pragma omp parallel for schedule(dynamic, 1) + for (size_t i = 0; i < jobs.size(); i++) { + size_t job = jobs_by_size[i]; + if (parameters.show_progress) { + #pragma omp critical + { + std::cerr << "rebuild_gbwt(): Starting job " << job << std::endl; + } + } + indexes[job] = rebuild_gbwt_job(gbwt_index, jobs[job], sequences_by_job[job], parameters); + if (parameters.show_progress) { + #pragma omp critical + { + std::cerr << "rebuild_gbwt(): Inserted " << sequences_by_job[job].size() << " threads in job " << job << std::endl; + } + } + } + omp_set_num_threads(old_max_threads); + + // We can avoid merging if we had only one job. + if (indexes.size() == 1) { + gbwt::GBWT result(std::move(indexes.front())); + if (gbwt_index.hasMetadata()) { + result.addMetadata(); + result.metadata = gbwt_index.metadata; + } + return result; + } + + // Merge the partial GBWTs and copy the metadata. 
+ if (parameters.show_progress) { + std::cerr << "rebuild_gbwt(): Merging the partial GBWTs" << std::endl; + } + gbwt::GBWT merged(indexes); + indexes.clear(); + copy_metadata(gbwt_index, merged, sequences_by_job, jobs_by_size); + + return merged; +} + +gbwt::GBWT rebuild_gbwt(const gbwt::GBWT& gbwt_index, const std::vector& mappings) { + std::vector jobs { + { mappings, 0 } + }; + std::unordered_map node_to_job; + for (gbwt::size_type i = 0; i < gbwt_index.sequences(); i += 2) { + gbwt::edge_type start = gbwt_index.start(i); + if (start != gbwt::invalid_edge()) { + node_to_job[gbwt::Node::id(start.first)] = 0; + } + } + RebuildParameters parameters; + return rebuild_gbwt(gbwt_index, jobs, node_to_job, parameters); +} + +//------------------------------------------------------------------------------ + +std::vector threads_for_sample(const gbwt::GBWT& gbwt_index, const std::string& sample_name) { + if (gbwt_index.hasMetadata() && gbwt_index.metadata.hasSampleNames() && gbwt_index.metadata.hasPathNames()) { + gbwt::size_type sample_id = gbwt_index.metadata.sample(sample_name); + if (sample_id < gbwt_index.metadata.samples()) { + return gbwt_index.metadata.pathsForSample(sample_id); + } + } + return std::vector(); +} + +std::vector threads_for_contig(const gbwt::GBWT& gbwt_index, const std::string& contig_name) { + if (gbwt_index.hasMetadata() && gbwt_index.metadata.hasContigNames() && gbwt_index.metadata.hasPathNames()) { + gbwt::size_type contig_id = gbwt_index.metadata.contig(contig_name); + if (contig_id < gbwt_index.metadata.contigs()) { + return gbwt_index.metadata.pathsForContig(contig_id); + } + } + return std::vector(); +} + +std::string insert_gbwt_path(MutablePathHandleGraph& graph, const gbwt::GBWT& gbwt_index, gbwt::size_type id, std::string path_name) { + + gbwt::size_type sequence_id = gbwt_index.bidirectional() ? gbwt::Path::encode(id, false) : id; + + if (sequence_id >= gbwt_index.sequences()) { + std::cerr << "error: [insert_gbwt_path()] invalid path id: " << id << std::endl; + return ""; + } + + // If the path name was not specified, try first using the default name generated from GBWT metadata. + // If that fails, simply use the id. + if (path_name.empty()) { + // TODO: Pass this down for efficiency + auto parse = gbwtgraph::parse_reference_samples_tag(gbwt_index); + auto sense = gbwtgraph::get_path_sense(gbwt_index, id, parse); + path_name = gbwtgraph::compose_path_name(gbwt_index, id, sense); + } + if (path_name.empty()) { path_name = std::to_string(id); } + if (graph.has_path(path_name)) { + std::cerr << "error: [insert_gbwt_path()] path name already exists: " << path_name << std::endl; + return ""; + } + + path_handle_t handle = graph.create_path_handle(path_name); + gbwt::edge_type pos = gbwt_index.start(sequence_id); + while (pos.first != gbwt::ENDMARKER) { + graph.append_step(handle, gbwt_to_handle(graph, pos.first)); + pos = gbwt_index.LF(pos); + } + + return path_name; +} + +Path extract_gbwt_path(const HandleGraph& graph, const gbwt::GBWT& gbwt_index, gbwt::size_type id) { + + Path result; + gbwt::size_type sequence_id = gbwt_index.bidirectional() ? 
gbwt::Path::encode(id, false) : id; + if (sequence_id >= gbwt_index.sequences()) { + std::cerr << "error: [insert_gbwt_path()] invalid path id: " << id << std::endl; + return result; + } + + // TODO: Pass this down for efficiency + auto parse = gbwtgraph::parse_reference_samples_tag(gbwt_index); + auto sense = gbwtgraph::get_path_sense(gbwt_index, id, parse); + std::string path_name = gbwtgraph::compose_path_name(gbwt_index, id, sense); + if (path_name.empty()) { + path_name = std::to_string(id); + } + result.set_name(path_name); + + gbwt::edge_type pos = gbwt_index.start(sequence_id); + size_t rank = 1; + while (pos.first != gbwt::ENDMARKER) { + Mapping* m = result.add_mapping(); + Position* p = m->mutable_position(); + p->set_node_id(gbwt::Node::id(pos.first)); + p->set_is_reverse(gbwt::Node::is_reverse(pos.first)); + Edit* e = m->add_edit(); + size_t len = graph.get_length(gbwt_to_handle(graph, pos.first)); + e->set_to_length(len); + e->set_from_length(len); + m->set_rank(rank); + pos = gbwt_index.LF(pos); + rank++; + } + + return result; +} + +std::string compose_short_path_name(const gbwt::GBWT& gbwt_index, gbwt::size_type id) { + if (!gbwt_index.hasMetadata() || !gbwt_index.metadata.hasPathNames() || id >= gbwt_index.metadata.paths()) { + return ""; + } + + auto& metadata = gbwt_index.metadata; + const gbwt::PathName& path = metadata.path(id); + + // We want a name with just sample and contig and haplotype. + // Spit out a name in reference sense format, which should suffice. + return PathMetadata::create_path_name(PathSense::REFERENCE, + gbwtgraph::get_path_sample_name(metadata, path, PathSense::REFERENCE), + gbwtgraph::get_path_locus_name(metadata, path, PathSense::REFERENCE), + gbwtgraph::get_path_haplotype(metadata, path, PathSense::REFERENCE), + PathMetadata::NO_PHASE_BLOCK, + PathMetadata::NO_SUBRANGE); +} + +//------------------------------------------------------------------------------ + +gbwt::GBWT get_gbwt(const std::vector& paths) { + gbwt::size_type node_width = 1, total_length = 0; + for (auto& path : paths) { + for (auto node : path) { + node_width = std::max(node_width, gbwt::bit_length(gbwt::Node::encode(node, true))); + } + total_length += 2 * (path.size() + 1); + } + + gbwt::Verbosity::set(gbwt::Verbosity::SILENT); + gbwt::GBWTBuilder builder(node_width, total_length); + for (auto& path : paths) { + builder.insert(path, true); + } + builder.finish(); + + return gbwt::GBWT(builder.index); +} + +//------------------------------------------------------------------------------ + +unordered_map> load_translation_map(ifstream& input_stream) { + string buffer; + size_t line = 1; + unordered_map> translation_map; + try { + while (getline(input_stream, buffer)) { + vector toks = split_delims(buffer, "\t"); + if (toks.size() == 3 && toks[0] == "T") { + vector toks2 = split_delims(toks[2], ","); + const string& segment_id = toks[1]; + vector node_ids; + node_ids.reserve(toks2.size()); + for (string& node_id_str : toks2) { + node_ids.push_back(parse(node_id_str)); + } + vector& val = translation_map[segment_id]; + if (!val.empty()) { + throw runtime_error("Segment " + toks[0] + " already in map"); + } + translation_map[segment_id] = node_ids; + } else { + throw runtime_error("Invalid columns"); + } + ++line; + } + } catch (const std::exception& e) { + throw runtime_error("[load_translation_map] error: unable to parse line " + to_string(line) + + " of translation map: " + e.what()); + } + return translation_map; +} + +unordered_map> load_translation_back_map(HandleGraph& graph, 
ifstream& input_stream) { + string buffer; + size_t line = 1; + unordered_map> translation_back_map; + try { + while (getline(input_stream, buffer)) { + vector toks = split_delims(buffer, "\t"); + if (toks.size() == 3 && toks[0] == "T") { + vector toks2 = split_delims(toks[2], ","); + const string& segment_id = toks[1]; + size_t offset = 0; + for (string& node_id_str : toks2) { + nid_t node_id = stol(node_id_str); + if (translation_back_map.count(node_id)) { + throw runtime_error("Node ID " + node_id_str + " already in map"); + } + translation_back_map[node_id] = make_pair(segment_id, offset); + offset += graph.get_length(graph.get_handle(node_id)); + } + } else { + throw runtime_error("Invalid columns"); + } + ++line; + } + } catch (const std::exception& e) { + throw runtime_error("[load_translation_back_map] error: unable to parse line " + to_string(line) + + " of translation map: " + e.what()); + } + return translation_back_map; +} + + +} // namespace vg diff --git a/src/gbwt_helper.hpp b/src/gbwt_helper.hpp new file mode 100644 index 00000000000..1d8bf0e5db8 --- /dev/null +++ b/src/gbwt_helper.hpp @@ -0,0 +1,263 @@ +#ifndef VG_GBWT_HELPER_HPP_INCLUDED +#define VG_GBWT_HELPER_HPP_INCLUDED + +/** \file + * Utility classes and functions for working with GBWT. + */ + +#include + +#include "position.hpp" + +#include +#include +#include + +namespace vg { + +std::vector parseGenotypes(const std::string& vcf_line, size_t num_samples); + +//------------------------------------------------------------------------------ + +/// Convert gbwt::node_type to handle_t. +inline handle_t gbwt_to_handle(const HandleGraph& graph, gbwt::node_type node) { + return graph.get_handle(gbwt::Node::id(node), gbwt::Node::is_reverse(node)); +} + +/// Convert gbwt::node_type and an offset as size_t to pos_t. +inline pos_t gbwt_to_pos(gbwt::node_type node, size_t offset) { + return make_pos_t(gbwt::Node::id(node), gbwt::Node::is_reverse(node), offset); +} + +/// Convert handle_t to gbwt::node_type. +inline gbwt::node_type handle_to_gbwt(const HandleGraph& graph, handle_t handle) { + return gbwt::Node::encode(graph.get_id(handle), graph.get_is_reverse(handle)); +} + +/// Extract gbwt::node_type from pos_t. +inline gbwt::node_type pos_to_gbwt(pos_t pos) { + return gbwt::Node::encode(id(pos), is_rev(pos)); +} + +/// Convert Mapping to gbwt::node_type. +inline gbwt::node_type mapping_to_gbwt(const Mapping& mapping) { + return gbwt::Node::encode(mapping.position().node_id(), mapping.position().is_reverse()); +} + +/// Convert Path to a GBWT path. +inline gbwt::vector_type path_to_gbwt(const Path& path) { + gbwt::vector_type result(path.mapping_size()); + for (size_t i = 0; i < result.size(); i++) { + result[i] = mapping_to_gbwt(path.mapping(i)); + } + return result; +} + +/// Extract a path as a GBWT path. If the path does not exist, it is treated as empty. +gbwt::vector_type extract_as_gbwt_path(const PathHandleGraph& graph, const std::string& path_name); + +/// Find all predecessor nodes of the path, ignoring self-loops. If the path +/// does not exist, it is treated as empty. +gbwt::vector_type path_predecessors(const PathHandleGraph& graph, const std::string& path_name); + +//------------------------------------------------------------------------------ + +// GBWT construction helpers. + +/// Determine the node width in bits for the GBWT nodes based on the given graph. +gbwt::size_type gbwt_node_width(const HandleGraph& graph); + +/// Finish GBWT construction and optionally print the metadata. 
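+/// A hypothetical usage sketch (names such as `graph`, "sample1", and "chr1" are
+/// illustrative, not taken from vg):
+///
+///   gbwt::GBWTBuilder builder(gbwt_node_width(graph));
+///   builder.insert(extract_as_gbwt_path(graph, "chr1"), true);
+///   finish_gbwt_constuction(builder, {"sample1"}, {"chr1"}, 1, false);
+///   gbwt::GBWT index(builder.index);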
+void finish_gbwt_constuction(gbwt::GBWTBuilder& builder, + const std::vector& sample_names, + const std::vector& contig_names, + size_t haplotype_count, bool print_metadata, + const std::string& header = "GBWT"); + +//------------------------------------------------------------------------------ + +/* + These are the proper ways of saving and loading GBWT structures. + Loading with `vg::io::VPKG::load_one` is also supported. +*/ + +/// Load a compressed GBWT from the file. +void load_gbwt(gbwt::GBWT& index, const std::string& filename, bool show_progress = false); + +/// Load a dynamic GBWT from the file. +void load_gbwt(gbwt::DynamicGBWT& index, const std::string& filename, bool show_progress = false); + +/// Load an r-index from the file. +void load_r_index(gbwt::FastLocate& index, const std::string& filename, bool show_progress = false); + +/// Save a compressed GBWT to the file. +void save_gbwt(const gbwt::GBWT& index, const std::string& filename, bool show_progress = false); + +/// Save a dynamic GBWT to the file. +void save_gbwt(const gbwt::DynamicGBWT& index, const std::string& filename, bool show_progress = false); + +/// Save an r-index to the file. +void save_r_index(const gbwt::FastLocate& index, const std::string& filename, bool show_progress = false); + +//------------------------------------------------------------------------------ + +/** + * Helper class that stores either a GBWT or a DynamicGBWT and loads them from a file + * or converts between them when necessary. + */ +struct GBWTHandler { + enum index_type { index_none, index_compressed, index_dynamic }; + + /// Compressed GBWT. + gbwt::GBWT compressed; + + /// Dynamic GBWT. + gbwt::DynamicGBWT dynamic; + + /// Which index is in use. + index_type in_use = index_none; + + /// The in-memory indexes are backed by this file. + std::string filename; + + /// Print progress information to stderr when loading/converting indexes. + bool show_progress = false; + + /// Switch to a compressed GBWT, converting it from the dynamic GBWT or reading it + /// from a file if necessary. + void use_compressed(); + + /// Switch to a dynamic GBWT, converting it from the compressed GBWT or reading it + /// from a file if necessary. + void use_dynamic(); + + /// Start using this compressed GBWT. Clears the index used as the argument. + /// The GBWT is not backed by a file. + void use(gbwt::GBWT& new_index); + + /// Start using this dynamic GBWT. Clears the index used as the argument. + /// The GBWT is not backed by a file. + void use(gbwt::DynamicGBWT& new_index); + + /// The GBWT is no longer backed by a file. + void unbacked(); + + /// Serialize the in-memory index to this file and start using it as the backing file. + void serialize(const std::string& new_filename); + + /// Clear the in-memory index. + void clear(); +}; + +//------------------------------------------------------------------------------ + +/// A GBWT construction job in `rebuild_gbwt`. Each job corresponds to one or more +/// weakly connected components in the graph. All mappings must replace subpaths in +/// these components. When there are multiple jobs, no component may appear in more +/// than one job and the node identifiers used in the mappings must not overlap +/// between jobs. +struct RebuildJob { + /// Replace the first subpath with the second subpath. + typedef std::pair mapping_type; + + /// Mappings that should be applied in this job. + std::vector mappings; + + /// Total size of the graph components in nodes. 
+ size_t total_size = 0; +}; + +/// Parameters for `rebuild_gbwt`. +struct RebuildParameters { + /// Maximum number of parallel construction jobs. + size_t num_jobs = 1; + + /// Print progress information to stderr. + bool show_progress = false; + + /// Size of the GBWT construction buffer in nodes. + gbwt::size_type batch_size = gbwt::DynamicGBWT::INSERT_BATCH_SIZE; + + /// Sample interval for locate queries. + gbwt::size_type sample_interval = gbwt::DynamicGBWT::SAMPLE_INTERVAL; +}; + +/// Rebuild the GBWT by applying all provided mappings. Each mapping is a pair +/// (original subpath, new subpath). If the original subpath is empty, the +/// mapping is ignored. If there are multiple applicable mappings, the first one +/// will be used. +/// +/// The mappings will be applied in both orientations. The reverse mapping replaces +/// the reverse of the original subpath with the reverse of the new subpath. +/// +/// The first and the last node can be used as context. For example (aXb, aYb) +/// can be interpreted as "replace X with Y in context a b". If both subpaths +/// end with the same node, the cursor will point at that node after the mapping. +/// Otherwise the cursor will be set past the original subpath. +/// +/// NOTE: To avoid infinite loops, the cursor will proceed after a mapping of the +/// type (a, Xa). +/// +/// The process can be partitioned into multiple non-overlapping jobs, each of them +/// corresponding to one or more weakly connected components in the graph. Multiple +/// jobs can be run in parallel using 2 threads each, and the jobs will be started +/// from the largest to the smallest. +/// +/// `node_to_job` maps each node identifier to the corresponding job identifier. +/// Empty paths go to the first job, but this can be overridden by including +/// `gbwt::ENDMARKER` in `node_to_job`. +/// +/// NOTE: Threads may be reordered if there are multiple jobs. Old thread ids are +/// no longer valid after rebuilding the GBWT. +gbwt::GBWT rebuild_gbwt(const gbwt::GBWT& gbwt_index, + const std::vector& jobs, + const std::unordered_map& node_to_job, + const RebuildParameters& parameters); + +/// As the general `rebuild_gbwt`, but always using a single job with default parameters. +gbwt::GBWT rebuild_gbwt(const gbwt::GBWT& gbwt_index, const std::vector& mappings); + +//------------------------------------------------------------------------------ + +/// Return the list of thread ids / gbwt path ids for the given sample. +std::vector threads_for_sample(const gbwt::GBWT& gbwt_index, const std::string& sample_name); + +/// Return the list of thread ids / gbwt path ids for the given contig. +std::vector threads_for_contig(const gbwt::GBWT& gbwt_index, const std::string& contig_name); + +/// Insert a GBWT thread into the graph and return its name. Returns an empty string on failure. +/// If a path name is specified and not empty, that name will be used for the inserted path. +/// NOTE: id is a gbwt path id, not a gbwt sequence id. +std::string insert_gbwt_path(MutablePathHandleGraph& graph, const gbwt::GBWT& gbwt_index, gbwt::size_type id, std::string path_name = ""); + +/// Extract a GBWT thread as a path in the given graph. +/// NOTE: id is a gbwt path id, not a gbwt sequence id. +Path extract_gbwt_path(const HandleGraph& graph, const gbwt::GBWT& gbwt_index, gbwt::size_type id); + +/// Get a short version of a string representation of a thread name stored in +/// GBWT metadata, made of just the sample and contig and haplotype. 
+/// NOTE: id is a gbwt path id, not a gbwt sequence id. +std::string compose_short_path_name(const gbwt::GBWT& gbwt_index, gbwt::size_type id); + +//------------------------------------------------------------------------------ + +/// Transform the paths into a GBWT index. Primarily for testing. +gbwt::GBWT get_gbwt(const std::vector& paths); + +//------------------------------------------------------------------------------ + +/// Load a translation file (created with vg gbwt --translation) and return a mapping +/// original segment ids to a list of chopped node ids +unordered_map> load_translation_map(ifstream& input_stream); + +/// Load a translation file (created with vg gbwt --translation) and return a backwards mapping +/// of chopped node to original segment position (id,offset pair) +/// NOTE: hopefully this is just a short-term hack, and we get a general interface baked into +/// the handlegraphs themselves +unordered_map> load_translation_back_map(HandleGraph& graph, ifstream& input_stream); + +//------------------------------------------------------------------------------ + +} // namespace vg + +#endif // VG_GBWT_HELPER_HPP_INCLUDED diff --git a/src/gbwtgraph_helper.cpp b/src/gbwtgraph_helper.cpp new file mode 100644 index 00000000000..495c5aaf2d1 --- /dev/null +++ b/src/gbwtgraph_helper.cpp @@ -0,0 +1,197 @@ +#include "gbwtgraph_helper.hpp" +#include "gbwt_helper.hpp" + +#include + +namespace vg { + +//------------------------------------------------------------------------------ + +gbwtgraph::GFAParsingParameters get_best_gbwtgraph_gfa_parsing_parameters() { + gbwtgraph::GFAParsingParameters parameters; + // Configure GBWTGraph GFA parsing to be as close to the vg GFA parser as we can get. + // TODO: Make it closer. + parameters.path_name_formats.clear(); + // Parse panSN with a fragment after it. + parameters.path_name_formats.emplace_back( + gbwtgraph::GFAParsingParameters::PAN_SN_REGEX + "#([0-9][0-9]*)", + gbwtgraph::GFAParsingParameters::PAN_SN_FIELDS + "F", + gbwtgraph::GFAParsingParameters::PAN_SN_SENSE + ); + // Parse panSN with a range after it as a normal but with a fragment based + // on start position. + parameters.path_name_formats.emplace_back( + gbwtgraph::GFAParsingParameters::PAN_SN_REGEX + "\\[([0-9][0-9]*)(-[0-9]*)?\\]", + gbwtgraph::GFAParsingParameters::PAN_SN_FIELDS + "F", + gbwtgraph::GFAParsingParameters::PAN_SN_SENSE + ); + // Parse standard panSN as what we think that is + parameters.path_name_formats.emplace_back( + gbwtgraph::GFAParsingParameters::PAN_SN_REGEX, + gbwtgraph::GFAParsingParameters::PAN_SN_FIELDS, + gbwtgraph::GFAParsingParameters::PAN_SN_SENSE + ); + // Parse paths with just a name and a range as generic paths with a contig + // and a fragment. Sample for generic paths gets provided automatically. 
+ parameters.path_name_formats.emplace_back( + "(.*)\\[([0-9][0-9]*)(-[0-9]*)?\\]", + "XCF", + PathSense::GENERIC + ); + // Parse paths with nothing to distinguish them the default way (as generic named paths) + parameters.path_name_formats.emplace_back( + gbwtgraph::GFAParsingParameters::DEFAULT_REGEX, + gbwtgraph::GFAParsingParameters::DEFAULT_FIELDS, + gbwtgraph::GFAParsingParameters::DEFAULT_SENSE + ); + return parameters; +} + +void load_gbwtgraph(gbwtgraph::GBWTGraph& graph, const std::string& filename, bool show_progress) { + if (show_progress) { + std::cerr << "Loading GBWTGraph from " << filename << std::endl; + } + std::unique_ptr loaded = vg::io::VPKG::load_one(filename); + if (loaded.get() == nullptr) { + std::cerr << "error: [load_gbwtgraph()] cannot load GBWTGraph " << filename << std::endl; + std::exit(EXIT_FAILURE); + } + graph = std::move(*loaded); +} + +void load_gbz(gbwtgraph::GBZ& gbz, const std::string& filename, bool show_progress) { + if (show_progress) { + std::cerr << "Loading GBZ from " << filename << std::endl; + } + std::unique_ptr loaded = vg::io::VPKG::load_one(filename); + if (loaded.get() == nullptr) { + std::cerr << "error: [load_gbz()] cannot load GBZ " << filename << std::endl; + std::exit(EXIT_FAILURE); + } + gbz = std::move(*loaded); +} + +void load_gbz(gbwt::GBWT& index, gbwtgraph::GBWTGraph& graph, const std::string& filename, bool show_progress) { + if (show_progress) { + std::cerr << "Loading GBWT and GBWTGraph from " << filename << std::endl; + } + std::unique_ptr loaded = vg::io::VPKG::load_one(filename); + if (loaded.get() == nullptr) { + std::cerr << "error: [load_gbz()] cannot load GBZ " << filename << std::endl; + std::exit(EXIT_FAILURE); + } + index = std::move(loaded->index); + graph = std::move(loaded->graph); + graph.set_gbwt(index); // We moved the GBWT out from the GBZ, so we have to update the pointer. +} + +void load_gbz(gbwtgraph::GBZ& gbz, const std::string& gbwt_name, const std::string& graph_name, bool show_progress) { + gbz = gbwtgraph::GBZ(); + load_gbwt(gbz.index, gbwt_name, show_progress); + load_gbwtgraph(gbz.graph, graph_name, show_progress); + gbz.graph.set_gbwt(gbz.index); // We know the GBWT index corresponding to the graph. 
+} + +void load_minimizer(gbwtgraph::DefaultMinimizerIndex& index, const std::string& filename, bool show_progress) { + if (show_progress) { + std::cerr << "Loading MinimizerIndex from " << filename << std::endl; + } + std::unique_ptr loaded = vg::io::VPKG::load_one(filename); + if (loaded.get() == nullptr) { + std::cerr << "error: [load_minimizer()] cannot load MinimizerIndex " << filename << std::endl; + std::exit(EXIT_FAILURE); + } + index = std::move(*loaded); +} + +void save_gbwtgraph(const gbwtgraph::GBWTGraph& graph, const std::string& filename, bool show_progress) { + if (show_progress) { + std::cerr << "Saving GBWTGraph to " << filename << std::endl; + } + graph.serialize(filename); +} + +void save_gbz(const gbwtgraph::GBZ& gbz, const std::string& filename, bool show_progress) { + if (show_progress) { + std::cerr << "Saving GBZ to " << filename << std::endl; + } + sdsl::simple_sds::serialize_to(gbz, filename); +} + +void save_gbz(const gbwt::GBWT& index, gbwtgraph::GBWTGraph& graph, const std::string& filename, bool show_progress) { + if (show_progress) { + std::cerr << "Saving GBWT and GBWTGraph to " << filename << std::endl; + } + std::ofstream out(filename, std::ios_base::binary); + if (!out) { + std::cerr << "error: [save_gbz()] cannot open file " << filename << " for writing" << std::endl; + std::exit(EXIT_FAILURE); + } + gbwtgraph::GBZ::simple_sds_serialize(index, graph, out); + out.close(); +} + +void save_gbz(const gbwtgraph::GBZ& gbz, const std::string& gbwt_name, const std::string& graph_name, bool show_progress) { + save_gbwt(gbz.index, gbwt_name, show_progress); + save_gbwtgraph(gbz.graph, graph_name, show_progress); +} + +void save_minimizer(const gbwtgraph::DefaultMinimizerIndex& index, const std::string& filename, bool show_progress) { + if (show_progress) { + std::cerr << "Saving MinimizerIndex to " << filename << std::endl; + } + std::ofstream out(filename, std::ios_base::binary); + if (!out) { + std::cerr << "error: [save_minimizer()] cannot open file " << filename << " for writing" << std::endl; + std::exit(EXIT_FAILURE); + } + index.serialize(out); + out.close(); +} + +//------------------------------------------------------------------------------ + +/// Return a mapping of the original segment ids to a list of chopped node ids +/// (mimicking logic and interface from function of same name in gbwt_helper.cpp) +unordered_map> load_translation_map(const gbwtgraph::GBWTGraph& graph) { + unordered_map> translation_map; + graph.for_each_segment([&](const std::string& segment_id, std::pair nodes) -> bool { + vector& val = translation_map[segment_id]; + val.reserve(nodes.second - nodes.first); + for (nid_t node_id = nodes.first; node_id < nodes.second; ++node_id) { + val.push_back(node_id); + } + return true; + }); + return translation_map; +} + +/// Return a backwards mapping of chopped node to original segment position (id,offset pair) +/// (mimicking logic and interface from function of same name in gbwt_helper.cpp) +unordered_map> load_translation_back_map(const gbwtgraph::GBWTGraph& graph) { + unordered_map> translation_back_map; + graph.for_each_segment([&](const std::string& segment_id, std::pair nodes) -> bool { + size_t offset = 0; + for (nid_t node_id = nodes.first; node_id < nodes.second; ++node_id) { + translation_back_map[node_id] = make_pair(segment_id, offset); + offset += graph.get_length(graph.get_handle(node_id)); + } + return true; + }); + + return translation_back_map; +} + 
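
For reference, a minimal usage sketch (not part of this change) of the translation helpers added above: converting a position on a chopped node back into coordinates on the original GFA segment via the backwards map. The helper name `to_segment_coordinate` is hypothetical, and in real code the map should be built once and cached rather than rebuilt per query.

```
// Illustrative only: map (node id, offset in node) -> (original segment name, offset in segment).
#include "gbwtgraph_helper.hpp"

#include <string>
#include <utility>

std::pair<std::string, size_t> to_segment_coordinate(const gbwtgraph::GBWTGraph& graph,
                                                     vg::nid_t node_id,
                                                     size_t offset_in_node) {
    // Building the map walks every segment; cache it if you have many queries.
    auto back_map = vg::load_translation_back_map(graph);
    // Each chopped node maps to (segment name, offset of the node within that segment).
    const auto& segment_and_offset = back_map.at(node_id);
    return std::make_pair(segment_and_offset.first,
                          segment_and_offset.second + offset_in_node);
}
```
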
+//------------------------------------------------------------------------------ + +std::string to_string_gbwtgraph(handle_t handle) { + return to_string_gbwtgraph(gbwtgraph::GBWTGraph::handle_to_node(handle)); +} + +std::string to_string_gbwtgraph(gbwt::node_type node) { + return std::string("(") + std::to_string(gbwt::Node::id(node)) + std::string(", ") + std::to_string(gbwt::Node::is_reverse(node)) + std::string(")"); +} + +//------------------------------------------------------------------------------ + +} // namespace vg diff --git a/src/gbwtgraph_helper.hpp b/src/gbwtgraph_helper.hpp new file mode 100644 index 00000000000..76f8911d3a6 --- /dev/null +++ b/src/gbwtgraph_helper.hpp @@ -0,0 +1,86 @@ +#ifndef VG_GBWTGRAPH_HELPER_HPP_INCLUDED +#define VG_GBWTGRAPH_HELPER_HPP_INCLUDED + +/** \file + * Utility classes and functions for working with GBWTGraph. + */ + +#include +#include +#include +#include "position.hpp" +#include +#include + +namespace vg { + +//------------------------------------------------------------------------------ + +/** + * Get the best configuration to use for the GBWTGraph library GFA parser, to + * best matcch the behavior of vg's GFA parser. + */ +gbwtgraph::GFAParsingParameters get_best_gbwtgraph_gfa_parsing_parameters(); + +/* + These are the proper ways of saving and loading GBWTGraph structures. + Loading with `vg::io::VPKG::load_one` is also supported. +*/ + +/// Load GBWTGraph from the file. +/// NOTE: Call `graph.set_gbwt()` afterwards with the appropriate GBWT index. +void load_gbwtgraph(gbwtgraph::GBWTGraph& graph, const std::string& filename, bool show_progress = false); + +/// Load GBZ from the file. +void load_gbz(gbwtgraph::GBZ& gbz, const std::string& filename, bool show_progress = false); + +/// Load GBZ from separate GBWT / GBWTGraph files. +void load_gbz(gbwtgraph::GBZ& gbz, const std::string& gbwt_name, const std::string& graph_name, bool show_progress = false); + +/// Load GBWT and GBWTGraph from the GBZ file. +void load_gbz(gbwt::GBWT& index, gbwtgraph::GBWTGraph& graph, const std::string& filename, bool show_progress = false); + +/// Load a minimizer index from the file. +void load_minimizer(gbwtgraph::DefaultMinimizerIndex& index, const std::string& filename, bool show_progress = false); + +/// Save GBWTGraph to the file. +void save_gbwtgraph(const gbwtgraph::GBWTGraph& graph, const std::string& filename, bool show_progress = false); + +/// Save GBZ to the file. +void save_gbz(const gbwtgraph::GBZ& gbz, const std::string& filename, bool show_progress = false); + +/// Save GBWT and GBWTGraph to the GBZ file. +void save_gbz(const gbwt::GBWT& index, gbwtgraph::GBWTGraph& graph, const std::string& filename, bool show_progress = false); + +/// Save GBZ to separate GBWT / GBWTGraph files. +void save_gbz(const gbwtgraph::GBZ& gbz, const std::string& gbwt_name, const std::string& graph_name, bool show_progress = false); + +/// Save a minimizer index to the file. 
+void save_minimizer(const gbwtgraph::DefaultMinimizerIndex& index, const std::string& filename, bool show_progress = false); + +//------------------------------------------------------------------------------ + +/// Return a mapping of the original segment ids to a list of chopped node ids +std::unordered_map> load_translation_map(const gbwtgraph::GBWTGraph& graph); + +/// Return a backwards mapping of chopped node to original segment position (id,offset pair) +std::unordered_map> load_translation_back_map(const gbwtgraph::GBWTGraph& graph); + +//------------------------------------------------------------------------------ + +/// Returns an empty GBWTGraph handle corresponding to the GBWT endmarker. +inline handle_t empty_gbwtgraph_handle() { + return gbwtgraph::GBWTGraph::node_to_handle(0); +} + +/// Returns a string representation of a GBWTGraph handle. +std::string to_string_gbwtgraph(handle_t handle); + +/// Returns a string representation of a GBWTGraph node. +std::string to_string_gbwtgraph(gbwt::node_type node); + +//------------------------------------------------------------------------------ + +} // namespace vg + +#endif // VG_GBWTGRAPH_HELPER_HPP_INCLUDED diff --git a/src/gbzgraph.hpp b/src/gbzgraph.hpp new file mode 100644 index 00000000000..78d6b13e896 --- /dev/null +++ b/src/gbzgraph.hpp @@ -0,0 +1,49 @@ +#ifndef VG_GBZGRAPH_HPP_INCLUDED +#define VG_GBZGRAPH_HPP_INCLUDED + +/** + * \file gbzgraph.hpp + * Defines a GBZGraph that owns a GBZ object and implements PathHandleGraph. + */ + +#include "handle.hpp" + +#include + +#include + +namespace vg { + +/** + * A PathHandleGraph that owns and is backed by a GBZ. + * Necessary because GBWTGraph implements PathHandleGraph but GBZ doesn't. + * + * Should be removed if/when GBZ implements PathHandleGraph. + */ +class GBZGraph : public bdsg::PathHandleGraphProxy { +public: + /// This is the GBZ object we own that actually holds the graph and GBWT + /// data. + gbwtgraph::GBZ gbz; + +protected: + /** + * Get the object that actually provides the graph methods. + */ + inline gbwtgraph::GBWTGraph* get() { + return &gbz.graph; + } + + /** + * Get the object that actually provides the graph methods. 
+ */ + inline const gbwtgraph::GBWTGraph* get() const { + return &gbz.graph; + } + +}; + +} + +#endif + diff --git a/src/gcsa_helper.cpp b/src/gcsa_helper.cpp new file mode 100644 index 00000000000..5f1fff102e6 --- /dev/null +++ b/src/gcsa_helper.cpp @@ -0,0 +1,55 @@ +#include "gcsa_helper.hpp" + +#include + +namespace vg { + +//------------------------------------------------------------------------------ + +void load_gcsa(gcsa::GCSA& index, const std::string& filename, bool show_progress) { + if (show_progress) { + std::cerr << "Loading GCSA from " << filename << std::endl; + } + std::unique_ptr loaded = vg::io::VPKG::load_one(filename); + if (loaded.get() == nullptr) { + std::cerr << "error: [load_gcsa()] cannot load GCSA " << filename << std::endl; + std::exit(EXIT_FAILURE); + } + index = std::move(*loaded); +} + +void load_lcp(gcsa::LCPArray& lcp, const std::string& filename, bool show_progress) { + if (show_progress) { + std::cerr << "Loading LCP from " << filename << std::endl; + } + std::unique_ptr loaded = vg::io::VPKG::load_one(filename); + if (loaded.get() == nullptr) { + std::cerr << "error: [load_lcp()] cannot load LCP " << filename << std::endl; + std::exit(EXIT_FAILURE); + } + lcp = std::move(*loaded); +} + +void save_gcsa(const gcsa::GCSA& index, const std::string& filename, bool show_progress) { + if (show_progress) { + std::cerr << "Saving GCSA to " << filename << std::endl; + } + if (!sdsl::store_to_file(index, filename)) { + std::cerr << "error: [save_gcsa()] cannot write GCSA to " << filename << std::endl; + std::exit(EXIT_FAILURE); + } +} + +void save_lcp(const gcsa::LCPArray& lcp, const std::string& filename, bool show_progress) { + if (show_progress) { + std::cerr << "Saving LCP to " << filename << std::endl; + } + if (!sdsl::store_to_file(lcp, filename)) { + std::cerr << "error: [save_gcsa()] cannot write LCP to " << filename << std::endl; + std::exit(EXIT_FAILURE); + } +} + +//------------------------------------------------------------------------------ + +} // namespace vg diff --git a/src/gcsa_helper.hpp b/src/gcsa_helper.hpp new file mode 100644 index 00000000000..cab68097d44 --- /dev/null +++ b/src/gcsa_helper.hpp @@ -0,0 +1,36 @@ +#ifndef VG_GCSA_HELPER_HPP_INCLUDED +#define VG_GCSA_HELPER_HPP_INCLUDED + +/** \file + * Utility classes and functions for working with GCSA. + */ + +#include +#include + +namespace vg { + +//------------------------------------------------------------------------------ + +/* + These are the proper ways of saving and loading GCSA structures. + Loading with `vg::io::VPKG::load_one` is also supported. +*/ + +/// Load GCSA from the file. +void load_gcsa(gcsa::GCSA& index, const std::string& filename, bool show_progress = false); + +/// Load LCP array from the file. +void load_lcp(gcsa::LCPArray& lcp, const std::string& filename, bool show_progress = false); + +/// Save GCSA to the file. +void save_gcsa(const gcsa::GCSA& index, const std::string& filename, bool show_progress = false); + +/// Save LCP array to the file. 
+void save_lcp(const gcsa::LCPArray& lcp, const std::string& filename, bool show_progress = false); + +//------------------------------------------------------------------------------ + +} // namespace vg + +#endif // VG_GCSA_HELPER_HPP_INCLUDED diff --git a/src/genome_state.cpp b/src/genome_state.cpp index 325a759c497..36485b65fcb 100644 --- a/src/genome_state.cpp +++ b/src/genome_state.cpp @@ -574,7 +574,7 @@ ReplaceLocalHaplotypeCommand GenomeState::replace_snarl_haplotype(const ReplaceS handle_t into = net_graphs.at(snarl).get_inward_backing_handle(handle_and_lane.first); // Get the child we are actually reading into from the SnarlManager - const Snarl* child = manager.into_which_snarl(backing_graph->to_visit(into)); + const Snarl* child = manager.into_which_snarl(to_visit(*backing_graph, into)); // Get the chain for the child const Chain* child_chain = manager.chain_of(child); @@ -680,7 +680,7 @@ void GenomeState::trace_haplotype(const pair& telome #endif // Get the snarl we really are entering, because get_inward_backing_handle works. - const Snarl* entered = manager.into_which_snarl(backing_graph->to_visit(into)); + const Snarl* entered = manager.into_which_snarl(to_visit(*backing_graph, into)); // Decide if we are entering it through its end bool entered_snarl_via_end = entered->start().node_id() != backing_graph->get_id(into); @@ -811,7 +811,7 @@ void GenomeState::insert_handles(const vector& to_add, for (auto& next_handle : to_add) { // For each handle, look at it as a visit in the base graph - Visit next_visit = backing_graph->to_visit(next_handle); + Visit next_visit = to_visit(*backing_graph, next_handle); #ifdef debug cerr << "Stack: "; @@ -881,7 +881,7 @@ void GenomeState::insert_handles(const vector& to_add, auto& net_graph = net_graphs.at(parent); // Get the backing graph handle reading out of the chain - handle_t backing_outward = backing_graph->get_handle(next_visit); + handle_t backing_outward = backing_graph->get_handle(next_visit.node_id(), next_visit.backward()); // Make it inward handle_t backing_inward = backing_graph->flip(backing_outward); @@ -891,7 +891,7 @@ void GenomeState::insert_handles(const vector& to_add, handle_t chain_handle = net_graph.flip(net_graph.get_handle_from_inward_backing_handle(backing_inward)); #ifdef debug - cerr << "Represent chain traversal with " << net_graph.to_visit(chain_handle) << endl; + cerr << "Represent chain traversal with " << to_visit(net_graph, chain_handle) << endl; #endif // Tack it on to the parent diff --git a/src/genotypekit.cpp b/src/genotypekit.cpp index 94c29f8c3d0..4b665c7c4b0 100644 --- a/src/genotypekit.cpp +++ b/src/genotypekit.cpp @@ -18,7 +18,7 @@ SnarlTraversal get_traversal_of_snarl(VG& graph, const Snarl* snarl, const Snarl for(size_t i = 0; i < path.mapping_size(); i++) { const Mapping& mapping = path.mapping(i); - if(contents.first.count(graph.get_node(mapping.position().node_id()))) { + if(contents.first.count(mapping.position().node_id())) { // We're inside the bubble. This is super simple when we have the contents! 
*to_return.add_visit() = to_visit(mapping, true); } @@ -137,15 +137,17 @@ vector AugmentedGraph::get_alignments(pair } } -Support AugmentedGraph::get_support(Node* node) { +Support AugmentedGraph::get_support(id_t node) { Support support; - support.set_forward(get_alignments(node->id()).size()); + support.set_forward(get_alignments(node).size()); return support; } -Support AugmentedGraph::get_support(Edge* edge) { +Support AugmentedGraph::get_support(edge_t edge) { Support support; - support.set_forward(get_alignments(NodeSide::pair_from_edge(edge)).size()); + NodeSide from(graph.get_id(edge.first), !graph.get_is_reverse(edge.first)); + NodeSide to(graph.get_id(edge.second), graph.get_is_reverse(edge.second)); + support.set_forward(get_alignments(make_pair(from, to)).size()); return support; } @@ -228,7 +230,8 @@ void AugmentedGraph::augment_from_alignment_edits(vector& alignments, // Run them through vg::edit() to modify the graph, but don't embed them // as paths. Update the paths in place, and save the translations. - vector augmentation_translations = graph.edit(paths, false, true, false); + vector augmentation_translations; + graph.edit(paths, &augmentation_translations, false, true, false); for (size_t i = 0; i < paths.size(); i++) { // Copy all the modified paths back. @@ -285,12 +288,12 @@ void AugmentedGraph::load_translations(istream& in_file) { function lambda = [&](Translation& translation) { translator.translations.push_back(translation); }; - stream::for_each(in_file, lambda); + vg::io::for_each(in_file, lambda); translator.build_position_table(); } void AugmentedGraph::write_translations(ostream& out_file) { - stream::write_buffered(out_file, translator.translations, 0); + vg::io::write_buffered(out_file, translator.translations, 0); } void SupportAugmentedGraph::clear() { @@ -302,27 +305,65 @@ bool SupportAugmentedGraph::has_supports() const { return !node_supports.empty() || !edge_supports.empty(); } -Support SupportAugmentedGraph::get_support(Node* node) { +Support SupportAugmentedGraph::get_support(id_t node) { return node_supports.count(node) ? node_supports.at(node) : Support(); } -Support SupportAugmentedGraph::get_support(Edge* edge) { +Support SupportAugmentedGraph::get_support(edge_t edge) { return edge_supports.count(edge) ? edge_supports.at(edge) : Support(); } void SupportAugmentedGraph::load_supports(istream& in_file) { + // This loads LocationSupport objects. We use them instead of pileups. 
+ // TODO: We need a way to view them with vg view node_supports.clear(); edge_supports.clear(); function lambda = [&](LocationSupport& location_support) { +#ifdef debug + cerr << pb2json(location_support) << endl; +#endif if (location_support.oneof_location_case() == LocationSupport::kNodeId) { - node_supports[graph.get_node(location_support.node_id())] = location_support.support(); + node_supports[location_support.node_id()] = location_support.support(); } else { const Edge& edge = location_support.edge(); - edge_supports[graph.get_edge(NodeSide(edge.from(), !edge.from_start()), - NodeSide(edge.to(), edge.to_end()))] = location_support.support(); + edge_t edge_handle = graph.edge_handle(graph.get_handle(edge.from(), edge.from_start()), + graph.get_handle(edge.to(), edge.to_end())); + edge_supports[edge_handle] = location_support.support(); } }; - stream::for_each(in_file, lambda); + vg::io::for_each(in_file, lambda); +} + +void SupportAugmentedGraph::load_pack_as_supports(const string& pack_file_name, const HandleGraph* vectorizable_graph) { + Packer packer(vectorizable_graph); + packer.load_from_file(pack_file_name); + vectorizable_graph->for_each_handle([&](const handle_t& handle) { + Position pos; + pos.set_node_id(vectorizable_graph->get_id(handle)); + size_t sequence_offset = packer.position_in_basis(pos); + size_t total_coverage = 0; + size_t node_length = vectorizable_graph->get_length(handle); + for (size_t i = 0; i < node_length; ++i) { + total_coverage += packer.coverage_at_position(sequence_offset + i); + } + double avg_coverage = node_length > 0 ? (double)total_coverage / node_length : 0.; + Support support; + // we just get one value and put it in "forward". can't fill out the rest of the Support object. + support.set_forward(avg_coverage); + node_supports[vectorizable_graph->get_id(handle)] = support; + }); + vectorizable_graph->for_each_edge([&](const edge_t& handle_edge) { + Edge edge; + edge.set_from(vectorizable_graph->get_id(handle_edge.first)); + edge.set_from_start(vectorizable_graph->get_is_reverse(handle_edge.first)); + edge.set_to(vectorizable_graph->get_id(handle_edge.second)); + edge.set_to_end(vectorizable_graph->get_is_reverse(handle_edge.second)); + Support support; + support.set_forward(packer.edge_coverage(edge)); + edge_supports[graph.edge_handle(graph.get_handle(edge.from(), edge.from_start()), + graph.get_handle(edge.to(), edge.to_end()))] = support; + return true; + }); } void SupportAugmentedGraph::write_supports(ostream& out_file) { @@ -330,18 +371,23 @@ void SupportAugmentedGraph::write_supports(ostream& out_file) { for (auto& node_support : node_supports) { LocationSupport location_support; *location_support.mutable_support() = node_support.second; - location_support.set_node_id(node_support.first->id()); + location_support.set_node_id(node_support.first); buffer.push_back(location_support); - stream::write_buffered(out_file, buffer, 500); + vg::io::write_buffered(out_file, buffer, 500); } for (auto& edge_support : edge_supports) { LocationSupport location_support; - *location_support.mutable_support() = edge_support.second; - *location_support.mutable_edge() = *edge_support.first; + *location_support.mutable_support() = edge_support.second; + Edge edge; + edge.set_from(graph.get_id(edge_support.first.first)); + edge.set_from_start(graph.get_is_reverse(edge_support.first.first)); + edge.set_to(graph.get_id(edge_support.first.second)); + edge.set_to_end(graph.get_is_reverse(edge_support.first.second)); + *location_support.mutable_edge() = edge; 
buffer.push_back(location_support); - stream::write_buffered(out_file, buffer, 500); + vg::io::write_buffered(out_file, buffer, 500); } - stream::write_buffered(out_file, buffer, 0); + vg::io::write_buffered(out_file, buffer, 0); } @@ -570,6 +616,13 @@ Support support_max(const Support& a, const Support& b) { return to_return; } +Support flip(const Support& to_flip) { + Support flipped = to_flip; + flipped.set_forward(to_flip.reverse()); + flipped.set_reverse(to_flip.forward()); + return flipped; +} + Support operator+(const Support& one, const Support& other) { Support sum; sum.set_forward(one.forward() + other.forward()); diff --git a/src/genotypekit.hpp b/src/genotypekit.hpp index 2199c8b6212..25dfcd573cf 100644 --- a/src/genotypekit.hpp +++ b/src/genotypekit.hpp @@ -15,15 +15,15 @@ #include #include #include -#include "vg.pb.h" +#include #include "vg.hpp" #include "translator.hpp" #include "hash_map.hpp" -#include "utility.hpp" #include "types.hpp" -#include "distributions.hpp" +#include "statistics.hpp" #include "snarls.hpp" #include "path_index.hpp" +#include "packer.hpp" namespace vg { @@ -182,6 +182,11 @@ struct AugmentedGraph { auto be_ret = base_edge(augmented_edge); return be_ret.first == NULL && be_ret.second == false; } + + // Do we have a base graph in order to run the above methods? + bool has_base_graph() const { + return base_graph != nullptr; + } /// Get the alignments, if any, embedded in the graph that touch the given /// node ID. @@ -196,13 +201,13 @@ struct AugmentedGraph { * Get the Support for a given Node, or 0 if it has no recorded support. * (only forward strand) */ - virtual Support get_support(Node* node); + virtual Support get_support(id_t node); /** * Get the Support for a given Edge, or 0 if it has no recorded support. * (only forward strand) */ - virtual Support get_support(Edge* edge); + virtual Support get_support(edge_t edge); virtual bool has_supports() const; @@ -263,9 +268,9 @@ struct SupportAugmentedGraph : public AugmentedGraph { // This holds support info for nodes. Note that we discard the "os" other // support field from StrandSupport. // Supports for nodes are minimum distinct reads that use the node. - map node_supports; + map node_supports; // And for edges - map edge_supports; + unordered_map edge_supports; /** * Return true if we have support information, and false otherwise. @@ -275,12 +280,12 @@ struct SupportAugmentedGraph : public AugmentedGraph { /** * Get the Support for a given Node, or 0 if it has no recorded support. */ - virtual Support get_support(Node* node); + virtual Support get_support(id_t node); /** * Get the Support for a given Edge, or 0 if it has no recorded support. */ - virtual Support get_support(Edge* edge); + virtual Support get_support(edge_t edge); /** * Clear the contents. @@ -292,6 +297,13 @@ struct SupportAugmentedGraph : public AugmentedGraph { */ void load_supports(istream& in_file); + /** + * Read the suppors from output of vg pack + * Everything put in forward support, average used for nodes + * Graph must implement VectorizableHandleGraph + */ + void load_pack_as_supports(const string& pack_file_name, const HandleGraph* vectorizable_graph); + /** * Write the supports to protobuf */ @@ -377,6 +389,11 @@ Support support_min(const Support& a, const Support& b); */ Support support_max(const Support& a, const Support& b); +/** + * Flip the orientations of a Support. + */ +Support flip(const Support& to_flip); + /** * Add two Support values together, accounting for strand. 
*/ diff --git a/src/genotyper.cpp b/src/genotyper.cpp index 612632c8b4c..c2fd85bf132 100644 --- a/src/genotyper.cpp +++ b/src/genotyper.cpp @@ -1,7 +1,10 @@ #include #include "genotyper.hpp" -#include "algorithms/topological_sort.hpp" #include "traversal_finder.hpp" +#include "cactus_snarl_finder.hpp" +#include "path_index.hpp" +#include "utility.hpp" +#include "translator.hpp" //#define debug @@ -139,24 +142,61 @@ void Genotyper::run(AugmentedGraph& augmented_graph, if (snarl->type() != ULTRABUBBLE) { // We only work on ultrabubbles right now + cerr << "Skip snarl " << snarl->start() << " - " << snarl->end() << " due to not being an ultrabubble" << endl; return; } // Get the contents - pair, unordered_set > snarl_contents = + pair, unordered_set > snarl_contents = manager.deep_contents(snarl, graph, true); // Test if the snarl can be longer than the reads - bool read_bounded = is_snarl_smaller_than_reads(snarl, snarl_contents, reads_by_name); + bool read_bounded = is_snarl_smaller_than_reads(augmented_graph, snarl, snarl_contents, reads_by_name); TraversalAlg use_traversal_alg = traversal_alg; if (traversal_alg == TraversalAlg::Adaptive) { use_traversal_alg = read_bounded ? TraversalAlg::Reads : TraversalAlg::Representative; } - if ((use_traversal_alg != TraversalAlg::Reads && !manager.is_leaf(snarl)) || - (use_traversal_alg == TraversalAlg::Reads && !manager.is_root(snarl))) { + if (use_traversal_alg == TraversalAlg::Exhaustive && + !manager.is_leaf(snarl)) { + // The SupportRestrictedTraversalFinder we use in Exhaustive mode + // can only handle leaf snarls. + + // Todo : support nesting hierarchy! + if (show_progress) { + cerr << "Skip snarl " << snarl->start() << " - " << snarl->end() + << " because it isn't a leaf and traversal algorithm " + << alg2name[use_traversal_alg] << " only works on leaves" << endl; + } + return; + } + + if (use_traversal_alg == TraversalAlg::Representative && !manager.all_children_trivial(snarl, graph)) { + // The RepresentativeTraversalFinder works for root and leaf + // snarls, but unless we're in a leaf snarl, or a snarl with only + // trivial children, it outputs traversals with child snarls in + // them that the rest of genotype can't yet handle. + // Todo : support nesting hierarchy! + if (show_progress) { + cerr << "Skip snarl " << snarl->start() << " - " << snarl->end() + << " because it has nontrivial children and traversal algorithm " + << alg2name[use_traversal_alg] << " will produce nested child snarl traversals" << endl; + } + return; + } + + + if (use_traversal_alg == TraversalAlg::Reads && !manager.is_root(snarl)) { + // The ReadRestrictedTraversalFinder only works for root snarls. + // TODO: How do we know this? + // Todo : support nesting hierarchy! 
+ if (show_progress) { + cerr << "Skip snarl " << snarl->start() << " - " << snarl->end() + << " because it isn't a root and traversal algorithm " + << alg2name[use_traversal_alg] << " only works on roots" << endl; + } return; } @@ -262,7 +302,7 @@ void Genotyper::run(AugmentedGraph& augmented_graph, } else { // Write out in Protobuf buffer[tid].push_back(genotyped); - stream::write_buffered(cout, buffer[tid], 100); + vg::io::write_buffered(cout, buffer[tid], 100); } } }); @@ -270,7 +310,7 @@ void Genotyper::run(AugmentedGraph& augmented_graph, if(!output_json && !output_vcf) { // Flush the protobuf output buffers for(int i = 0; i < buffer.size(); i++) { - stream::write_buffered(cout, buffer[i], 0); + vg::io::write_buffered(cout, buffer[i], 0); } } @@ -306,7 +346,8 @@ pair, bool> Genotyper::get_snarl_reference_bounds(const S // position along the reference at which it occurs. Our bubble // goes forward in the reference, so we must come out of the // opposnarl end of the node from the one we have stored. - auto referenceIntervalStart = index.by_id.at(first_id).first + graph->get_length(graph->get_handle(snarl->start())); + auto referenceIntervalStart = index.by_id.at(first_id).first + graph->get_length(graph->get_handle(snarl->start().node_id(), + snarl->start().backward())); // The position we have stored for the end node is the first // position it occurs in the reference, and we know we go into @@ -324,7 +365,8 @@ pair, bool> Genotyper::get_snarl_reference_bounds(const S // Recalculate reference positions Use the end node, which we've now // made first_id, to get the length offset to the start of the actual // internal variable bit. - referenceIntervalStart = index.by_id.at(first_id).first + graph->get_length(graph->get_handle(snarl->end())); + referenceIntervalStart = index.by_id.at(first_id).first + graph->get_length(graph->get_handle(snarl->end().node_id(), + snarl->end().backward())); referenceIntervalPastEnd = index.by_id.at(last_id).first; } @@ -390,15 +432,17 @@ int Genotyper::alignment_qual_score(VG& graph, const Snarl* snarl, const Alignme return round(total); } -bool Genotyper::is_snarl_smaller_than_reads(const Snarl* snarl, - const pair, unordered_set >& contents, +bool Genotyper::is_snarl_smaller_than_reads(AugmentedGraph& augmented_graph, + const Snarl* snarl, + const pair, unordered_set >& contents, map& reads_by_name) { size_t read_length = reads_by_name.empty() ? 
50 : reads_by_name.begin()->second->sequence().length(); size_t snarl_total_length = 0; - for (auto snarl_node : contents.first) { - if (snarl_node->id() != snarl->start().node_id() && - snarl_node->id() != snarl->end().node_id()) { - snarl_total_length += snarl_node->sequence().length(); + for (auto snarl_id : contents.first) { + + if (snarl_id != snarl->start().node_id() && + snarl_id != snarl->end().node_id()) { + snarl_total_length += augmented_graph.graph.get_length(augmented_graph.graph.get_handle(snarl_id)); } if (snarl_total_length >= read_length) { return false; @@ -410,7 +454,7 @@ bool Genotyper::is_snarl_smaller_than_reads(const Snarl* snarl, vector Genotyper::get_snarl_traversals(AugmentedGraph& augmented_graph, SnarlManager& manager, map& reads_by_name, const Snarl* snarl, - const pair, unordered_set >& contents, + const pair, unordered_set >& contents, PathIndex* ref_path_index, TraversalAlg use_traversal_alg) { vector paths; @@ -430,11 +474,22 @@ vector Genotyper::get_snarl_traversals(AugmentedGraph& augmented } else if (use_traversal_alg == TraversalAlg::Representative) { // representative search from vg call. only current implementation that works for big sites // Now start looking for traversals of the sites. - read_trav_finder = unique_ptr(new RepresentativeTraversalFinder( - augmented_graph, manager, 1000, 1000, - 100, [&] (const Snarl& site) -> PathIndex* { + auto* finder = new RepresentativeTraversalFinder( + augmented_graph.graph, manager, 1000, 1000, + 100, 1, 1, [&] (const Snarl& site) -> PathIndex* { return ref_path_index; - })); + }, + [&] (id_t node) -> Support { + return augmented_graph.get_support(node); + }, + [&] (edge_t edge) -> Support { + return augmented_graph.get_support(edge); + }); + + // Since we can't sensibly handle any children, glom trivial children in. + finder->eat_trivial_children = true; + + read_trav_finder = unique_ptr(finder); } else { assert(false); } @@ -645,7 +700,7 @@ map> Genotyper::get_affinities(AugmentedGraph& aug, const map& reads_by_name, const Snarl* snarl, - const pair, unordered_set >& contents, + const pair, unordered_set >& contents, const SnarlManager& manager, const vector& snarl_paths) { @@ -670,9 +725,9 @@ map> cerr << "Snarl contains " << contents.first.size() << " nodes" << endl; #endif - for(Node* node : contents.first) { + for(id_t node_id : contents.first) { // For every node in the ultrabubble, what reads visit it? - for (const Alignment* aln : aug.get_alignments(node->id())) { + for (const Alignment* aln : aug.get_alignments(node_id)) { // Each read that visits this node is relevant. relevant_read_names.insert(aln->name()); @@ -684,9 +739,9 @@ map> // name. Maybe we can just use alignment pointers? } - for(Node* node: contents.first) { + for(id_t node_id : contents.first) { // Throw out all the IDs that are also used in the ultrabubble itself - relevant_ids.erase(node->id()); + relevant_ids.erase(node_id); } // This is a temporary hack to break out of this function if there's too much to do. 
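
Much of this change swaps Protobuf `Node*`/`Edge*` containers for node ids and `edge_t` handles. As a point of reference, here is a minimal sketch of the replacement pattern applied throughout these hunks; the function name is illustrative and `graph` stands for any HandleGraph implementation.

```
#include <handlegraph/handle_graph.hpp>

// Before this change, snarl contents held Node*, so a node's length came from
// the Protobuf object:  node->sequence().size()
// After it, contents hold node ids, so the length goes through the handle API:
size_t node_length(const handlegraph::HandleGraph& graph, handlegraph::nid_t node_id) {
    return graph.get_length(graph.get_handle(node_id));
}
```
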
@@ -779,7 +834,7 @@ map> for(size_t i = 0; i < read->path().mapping_size(); i++) { // Look at every node the read touches id_t touched = read->path().mapping(i).position().node_id(); - if(contents.first.count(aug.graph.get_node(touched))) { + if(contents.first.count(touched)) { // If it's in the ultrabubble, keep it touched_set.insert(touched); } @@ -967,7 +1022,7 @@ map > Genotyper::get_affinities_fast(AugmentedGraph& aug, const map& reads_by_name, const Snarl* snarl, - const pair, unordered_set >& contents, + const pair, unordered_set >& contents, const SnarlManager& manager, const vector& snarl_paths, bool allow_internal_alignments) { @@ -990,9 +1045,9 @@ Genotyper::get_affinities_fast(AugmentedGraph& aug, allele_strings.push_back(traversal_to_string(aug.graph, path)); } - for(Node* node : contents.first) { + for(id_t node_id : contents.first) { // For every node in the ultrabubble, what reads visit it? - for (const Alignment* aln : aug.get_alignments(node->id())) { + for (const Alignment* aln : aug.get_alignments(node_id)) { // Each read that visits this node is relevant. relevant_read_names.insert(aln->name()); } @@ -1803,7 +1858,7 @@ vcflib::VariantCallFile* Genotyper::start_vcf(std::ostream& stream, const PathIn vector Genotyper::locus_to_variant(VG& graph, const Snarl* snarl, - const pair, unordered_set >& contents, + const pair, unordered_set >& contents, const SnarlManager& manager, const PathIndex& index, vcflib::VariantCallFile& vcf, @@ -2034,8 +2089,8 @@ Genotyper::locus_to_variant(VG& graph, // Also the snarl statistics // Ultrabubble bases size_t ultrabubble_bases = 0; - for(Node* node : contents.first) { - ultrabubble_bases += node->sequence().size(); + for(id_t node_id : contents.first) { + ultrabubble_bases += graph.get_length(graph.get_handle(node_id)); } variant.info["XSBB"].push_back(to_string(ultrabubble_bases)); // Ultrabubble nodes diff --git a/src/genotyper.hpp b/src/genotyper.hpp index 87137117d61..f9e19ab13c8 100644 --- a/src/genotyper.hpp +++ b/src/genotyper.hpp @@ -14,19 +14,12 @@ #include #include #include -#include "vg.pb.h" +#include #include "vg.hpp" -#include "translator.hpp" -#include "deconstructor.hpp" -#include "srpe.hpp" #include "hash_map.hpp" -#include "utility.hpp" #include "types.hpp" #include "genotypekit.hpp" -#include "srpe.hpp" -#include "path_index.hpp" -#include "index.hpp" -#include "distributions.hpp" +#include "statistics.hpp" namespace vg { @@ -145,6 +138,12 @@ class Genotyper { // Toggle traversal finder for testing enum TraversalAlg { Reads, Exhaustive, Representative, Adaptive }; + map alg2name = { + {Reads, "Reads"}, + {Exhaustive, "Exhaustive"}, + {Representative, "Representative"}, + {Adaptive, "Adaptive"} + }; TraversalAlg traversal_alg = TraversalAlg::Reads; // Show progress @@ -189,8 +188,9 @@ class Genotyper { /** * Check if a snarl is small enough to be covered by reads (very conservative) */ - static bool is_snarl_smaller_than_reads(const Snarl* snarl, - const pair, unordered_set >& contents, + static bool is_snarl_smaller_than_reads(AugmentedGraph& augmented_graph, + const Snarl* snarl, + const pair, unordered_set >& contents, map& reads_by_name); /** @@ -199,7 +199,7 @@ class Genotyper { vector get_snarl_traversals(AugmentedGraph& augmented_graph, SnarlManager& manager, map& reads_by_name, const Snarl* snarl, - const pair, unordered_set >& contents, + const pair, unordered_set >& contents, PathIndex* reference_index, TraversalAlg use_traversal_alg); @@ -228,7 +228,7 @@ class Genotyper { map> get_affinities(AugmentedGraph& 
aug, const map& reads_by_name, const Snarl* snarl, - const pair, unordered_set >& contents, + const pair, unordered_set >& contents, const SnarlManager& manager, const vector& superbubble_paths); @@ -239,7 +239,7 @@ class Genotyper { map> get_affinities_fast(AugmentedGraph& aug, const map& reads_by_name, const Snarl* snarl, - const pair, unordered_set >& contents, + const pair, unordered_set >& contents, const SnarlManager& manager, const vector& superbubble_paths, bool allow_internal_alignments = false); @@ -285,7 +285,7 @@ class Genotyper { * reference path, we'll emit 0 variants. */ vector locus_to_variant(VG& graph, const Snarl* snarl, - const pair, unordered_set >& contents, + const pair, unordered_set >& contents, const SnarlManager& manager, const PathIndex& index, vcflib::VariantCallFile& vcf, const Locus& locus, diff --git a/src/gfa.cpp b/src/gfa.cpp index 7dbe70d94ed..435fd29dcd3 100644 --- a/src/gfa.cpp +++ b/src/gfa.cpp @@ -1,1134 +1,250 @@ #include "gfa.hpp" -#include +#include "utility.hpp" +#include "path.hpp" +#include -// Use sonLib pinch graphs -#include - -#include +#include namespace vg { using namespace std; -using namespace gfak; - -// We augment the pinch graph API with functions that can work as if segments are in blocks even when they aren't. - -/// Get the segment's orientation in its block (false=backward, true=forward), or true (forward) if there is no block -bool stPinchSegment_getBlockOrientationSafe(stPinchSegment* segment) { - stPinchBlock* block = stPinchSegment_getBlock(segment); - return (block != nullptr) ? stPinchSegment_getBlockOrientation(segment) : true; -} - -/// Represents a translation from GFA node name string to pinch thread name number. -/// Tries to translate numerical node names as themselves, to the extent possible. -class GFAToPinchTranslator { -private: - /// Map from string name to numerical name number - unordered_map forward; - /// Map from numerical name number back to string name - unordered_map backward; - /// What is the next unused name we can assign? - int64_t next_unused = 1; -public: - /// Translate from GFA name to pinch thread name - int64_t translate(const string& name); - /// Translate back from pinch thread name to GFA name - const string& untranslate(const int64_t& name); -}; - -int64_t GFAToPinchTranslator::translate(const string& name) { - // Look up the name - auto found = forward.find(name); - - if (found != forward.end()) { - // We have a translation already. Use it. - return found->second; - } - - // Otherwise we need to make a translation. - // Try to find an unused number to use. - // To start with, we have no clue what to use (0). - int64_t assigned = 0; - if (is_number(name)) { - // There's a preferred number for this string. Try to use it. - assigned = std::stol(name); - } - - if (assigned <= 0 || backward.count(assigned)) { - // We need to find an unused number. - // next_unused is always guaranteed to be unused. - assigned = next_unused; - next_unused++; - } - - if (assigned >= next_unused) { - // If we read in a node ID from the GFA, this can happen. - // Budge out the assignment cursor past any numbers yet mentioned in the GFA. - // This is guaranteed to be past the largest assigned name, and therefore unused. 
- next_unused = assigned + 1; - } - - // Save the assigned numeric name - forward[name] = assigned; - // Mark it used and record the back translation - backward[assigned] = name; - // Return it - return assigned; -} - -const string& GFAToPinchTranslator::untranslate(const int64_t& name) { - // If it was translated, it must have a reverse entry - return backward.at(name); -} - -/// Represents a translation from pinch thread segments' blocks to vg node IDs. -/// Tries to use pinch thread name numbers as IDs for single-thread blocks, and otherwise assigns IDs. -class PinchToVGTranslator { -private: - /// Map from block or segment pointer to node ID - unordered_map block_to_id; - /// Track assigned numeric names - unordered_set used; - /// What is the next unused name we can assign? - id_t next_unused = 1; -public: - /// Translate from pinch thread segment's block to node ID - id_t translate(stPinchSegment* segment); -}; -id_t PinchToVGTranslator::translate(stPinchSegment* segment) { - // Work out what pointer will represent the segment's block - void* block = (void*) stPinchSegment_getBlock(segment); - if (block == nullptr) { - // No block was found. The segment represents itself - block = (void*) segment; - } - - // Look up the block - auto found = block_to_id.find(block); - - if (found != block_to_id.end()) { - // We have a translation already. Use it. - return found->second; - } - - // Otherwise we need to make a translation. Try to find an unused number to - // use. To start with, we will use the segment's thread number. If it is - // the only segment in the block, and the block the only block for it, then - // it is the right hint. Otherwise, it doesn't matter what we hint, because - // we get to assign IDs arbitrarily. - id_t assigned = (id_t) stPinchSegment_getName(segment); - - if (assigned <= 0 || used.count(assigned)) { - // We need to find an unused number. - // next_unused is always guaranteed to be unused. - assigned = next_unused; - next_unused++; - } - - if (assigned >= next_unused) { - // Budge out the assignment cursor past any numbers yet mentioned in the pinch names. - // This is guaranteed to be past the largest assigned ID, and therefore unused. - next_unused = assigned + 1; - } - - // Save the assigned ID - block_to_id[block] = assigned; - // Mark it used - used.insert(assigned); - // Return it - return assigned; -} - -bool gfa_to_graph(istream& in, VG* graph, bool only_perfect_match) { - - // Things That Are Things - // The GFA has "sequences" and "links" and "paths" - // Each link's CIGAR is an alignment of the start of the second sequence to the end of the first - // The pinch graph has "threads", "adjacencies", and "blocks" - // The vg graph has "nodes" and "edges" and "paths" (again) - - // We have two layers of name/ID translation - // One is from string GFA node name to pinch thread "name" number. - // One is from pinch block pointer to vg graph node ID - // For a GFA with no overlap and numeric names we want them to be invisible. - // Otherwise in some places we will have to rework things. 
- - - // New algorithm - - // If we are doing only perfect match: - // Make a pinch thread set - // Every sequence goes in - // For each link - // Discard the link if it is not all-M in any overlap it has - // Pinch according to the link - // Convert the pinch blocks to graph nodes - // Sequence is always supplied by the first block member, whatever that is - // Links with no overlap bypass the pinch graph entirely and become edges directly by fiddly lookup - // Then create the paths by finding the each visited GFA sequence's thread in the pinch graph and translating via the blocks to vg space - // When overlaps happen we know the overlap length in the second sequence (which arbitrarily loses) - // And so we know what offset in the second sequence to start at - // And if we didn't unify anything in the pinch graph we know a vg node will start/end there - - // If we are allowing non-perfect matches - // Make the sequences into pinch graph paths the same way - // Read the links' CIGAR strings and merge based on them - // Except don't merge mismatched bases - // Then resolve the sequences of pinch blocks the same way (since all of a block will be identical) - // Then for the paths, when the path goes over an overlap edge, work out what offset in the next GFA sequence it ought to start at - // And add a mapping to that vg node - - // So overall the algorithm is: - // Make a pinch thread set - // Every sequence goes in - // For each link - // If doing perfect match mode, discard if not all M - // Compute CIGAR length in each sequence - // Pinch according to the CIGAR, only merging mismatches if in perfect match mode - // Convert the pinch blocks to graph nodes - // Sequence is always supplied by the first block member, whatever that is - // Links with no overlap bypass the pinch graph entirely and become edges directly by fiddly lookup - // Then create the paths by finding the each visited GFA sequence's thread in the pinch graph and translating via the blocks to vg space - // When overlaps happen we know the overlap length in the second sequence from the link CIGAR - // And so we know what offset in the second sequence to start at - // If any link was discarded, also discard the path (and warn). - - // So let's start - - // Parse the GFA - GFAKluge gg; - gg.parse_gfa_file(in); - // This maps from GFA sequence name to GFA sequence record - map gfa_sequences = gg.get_name_to_seq(); - // This maps from GFA sequence name to the GFA links for which it is the source - map > gfa_links = gg.get_seq_to_edges(); - // This maps from path name to GFA path record - map gfa_paths = gg.get_name_to_path(); - - // Make a pinch thread set - unique_ptr pinch(stPinchThreadSet_construct(), &stPinchThreadSet_destruct); - - // Make a translator to convert from GFA string names to numeric pinch thread names - GFAToPinchTranslator gfa_to_pinch; - - for(auto& name_and_record : gfa_sequences) { - // For each GFA sequence record by string name - auto& name = name_and_record.first; - auto& record = name_and_record.second; - - // Assign it a numeric pinch thread name - auto pinch_name = gfa_to_pinch.translate(name); - - // Add the thread to the pinch thread set - stPinchThreadSet_addThread(pinch.get(), pinch_name, 0, record.sequence.size()); - } - - // As we go through the links, we need to remmeber links which are straight abutments of sequences. - // These won't be processed by the pinch graph; we need to store them and process them later. 
- // We store them in pinch thread name terms, as source, sink, source reverse, sink reverse. - vector> abut_links; - - // We also need to remember how much of the destination segment you should - // skip due to overlap when reading across each edge in each direction. We - // need this for interpreting GFA path records. We measure out to the end - // of the alignment from the GFA edge, and don't "fix" things like terminal - // insert/leading delete operations where the alignment really could just - // be shorter. We keep entries for all non-discarded edges, even those with - // no overlap. - unordered_map, size_t> link_skips; - - // Finally, we need to keep track of the dangling ends of alignments, which - // need to be "tucked in". If an overlapping block starts or ends with - // something other than a match, there will be a dangling node. It is in - // correspondence with a position in the other sequence, but it doesn't get - // pinched into it. We want to link it to the right next/previous node, - // without actually pinching it in. - - // Because tucks can result in transative collapses and the creation of a - // bunch of edges that would be hard to predict, we're going to use a - // union-find over tucked sides and he sides they tuck into (i.e. have the - // sme edges as). We can't make that union-find until we know everything - // that needs to be involved, so we're going to store a list of pairs of - // (thread name, base, is_left) tripples that get merged. - using tuck_side_t = tuple; - vector> tucks; - - for (auto& name_and_links : gfa_links) { - // For each set of links, by source node name - auto& name = name_and_links.first; - auto& links = name_and_links.second; - - // Find the source pinch thread - auto source_pinch_name = gfa_to_pinch.translate(name); - auto source_thread = stPinchThreadSet_getThread(pinch.get(), source_pinch_name); - - // And the source sequence string - auto& source_string = gfa_sequences[name].sequence; - // And sequence length - auto source_sequence_length = stPinchThread_getLength(stPinchThreadSet_getThread(pinch.get(), source_pinch_name)); - - for (auto& link : links) { - // For each link on this source node - - // Get the CIGAR alignment for the link - auto cigar = vcflib::splitCigar(link.alignment); - - if (only_perfect_match) { - // We only care about all-match/mismatch CIGARs. Does this one qualify? - bool is_all_match_mismatch = true; - for (auto& elem : cigar) { - if (elem.second != "M" && elem.second != "=" && elem.second != "X") { - // This operation isn't any of the match/mismatch operations - is_all_match_mismatch = false; - break; - } - } - - if (!is_all_match_mismatch) { - // This CIGAR has other operations in it and we want to discard the link because of it - continue; - } +/// Determine if a path should be written as a GFA W line or a GFA P line. +static bool should_write_as_w_line(const PathHandleGraph* graph, path_handle_t path_handle); +/// Write out a W line for a path. Uses a map to keep track of fake offset +/// ranges used to distinguish multiple phase blocks on a haplotype, since GFA +/// doesn't support them. +static void write_w_line(const PathHandleGraph* graph, ostream& out, path_handle_t path_handle, unordered_map, size_t>& last_phase_block_end); + +void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& rgfa_paths, + bool rgfa_pline, bool use_w_lines) { + + // TODO: Support sorting nodes, paths, and/or edges for canonical output + // TODO: Use a NamedNodeBackTranslation (or forward translation?) 
to properly round-trip GFA that has had to be chopped. + + // Compute reference-sense sample header tags + unordered_set reference_samples; + graph->for_each_path_matching({PathSense::REFERENCE}, {}, {}, [&](const path_handle_t& h) { + if (!rgfa_paths.count(graph->get_path_name(h)) || rgfa_pline) { + // If it is going to be something other than an rGFA path, + // we'll have to convey its reference-ness another way. + reference_samples.insert(graph->get_sample_name(h)); } - - // Now we know we need to do the link and process the CIGAR. - - // Get the CIGAR's length in the source and the sink. - // TODO: Is it really true that source is reference and sink is query? - size_t source_alignment_length = 0; - size_t sink_alignment_length = 0; - for (auto& elem : cigar) { - // Parse each CIGAR element - auto& length = elem.first; - assert(!elem.second.empty()); - switch(elem.second[0]) { - case 'M': - case '=': - case 'X': - // Equal-length in both - source_alignment_length += length; - sink_alignment_length += length; - break; - case 'I': - // Insert = more sink - sink_alignment_length += length; - break; - case 'D': - // Deletion = more source - source_alignment_length += length; - break; - case 'S': - // Soft clip = extra sink? - // Probably shouldn't be allowed. - throw runtime_error("GFA CIGAR contains a soft-clip operation; semantics unclear"); - break; - case 'H': - // Hard clip = extra sink also, but even weirder than a soft clip. - throw runtime_error("GFA CIGAR contains a hard-clip operation; semantics unclear"); - break; - default: - // This is an invalid operation; the GFA is invalid. - cerr << "error:[gfa_to_graph]: GFA CIGAR invalid: " << elem.second << " operation in " << link.to_string_2() << endl; - return false; + }); + + // Start with the header for a GFA1.1 file + out << "H\tVN:Z:1.1"; + if (!reference_samples.empty()) { + // Include a reference sample name tag if we have reference paths. + out << "\t" << gbwtgraph::REFERENCE_SAMPLE_LIST_GFA_TAG << ":Z:" << gbwtgraph::compose_reference_samples_tag(reference_samples); + } + out << "\n"; + + //Compute the rGFA tags of given paths (todo: support non-zero ranks) + unordered_map> node_offsets; + for (const string& path_name : rgfa_paths) { + path_handle_t path_handle = graph->get_path_handle(path_name); + size_t offset = 0; + graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { + handle_t handle = graph->get_handle_of_step(step_handle); + nid_t node_id = graph->get_id(handle); + if (graph->get_is_reverse(handle)) { + stringstream ss; + ss << "error [gfa]: unable to write rGFA tags for path " << path_name << " because node " + << node_id << " is traversed on its reverse strand. rGFA only supports the forward strand." << endl; + throw runtime_error(ss.str()); } - } - - // Work out what thread the link is to - auto sink_pinch_name = gfa_to_pinch.translate(link.sink_name); - auto sink_thread = stPinchThreadSet_getThread(pinch.get(), sink_pinch_name); - - // Get the orientations - bool source_backward = !link.source_orientation_forward; - bool sink_backward = !link.sink_orientation_forward; - - // Record the link skip distances from the alignment, for interpreting paths. - // When traversion source to sink, skip the alignment's length in the sink - link_skips[make_tuple(source_pinch_name, sink_pinch_name, source_backward, sink_backward)] = sink_alignment_length; - // When traversing sink to source, skip the alignment's length in the source. 
- link_skips[make_tuple(sink_pinch_name, source_pinch_name, !sink_backward, !source_backward)] = source_alignment_length; - -#ifdef debug - cerr << "Found edge " << link.source_name << " = " << source_pinch_name << (source_backward ? 'L' : 'R') - << " -> " << link.sink_name << " = " << sink_pinch_name << (sink_backward ? 'R' : 'L') << endl; - cerr << "Skips: " << sink_alignment_length << " forward, " << source_alignment_length << " reverse" << endl; -#endif - - if (source_alignment_length == 0 || sink_alignment_length == 0) { - // This link is just an end-to-end abutment with no overlap. - // It can't be sent through the pinch graph; we store it separately. - // We also have no need to do any tucks. - abut_links.push_back(make_tuple(source_pinch_name, sink_pinch_name, source_backward, sink_backward)); - - // TODO: What exactly are the semantics on e.g. an all-insert - // alignment? Is it really just that the ends abut? Or do we - // connect in to the end of the insert? - - // Skip the link CIGAR execution - continue; - } - - // Find the sink sequence data - auto& sink_string = gfa_sequences[link.sink_name].sequence; - // And sequence length - auto sink_sequence_length = stPinchThread_getLength(stPinchThreadSet_getThread(pinch.get(), sink_pinch_name)); - - if (source_alignment_length > source_sequence_length) { - // The GFA file is invalid and specifies using more bases than are present. - if (only_perfect_match) { - // Be tolerant and reject just this edge. - continue; + if (node_offsets.count(node_id)) { + cerr << "warning [gfa]: multiple selected rgfa paths found on node " << node_id << ": keeping tags for " + << graph->get_path_name(node_offsets[node_id].first) << " and ignoring those for " << path_name << endl; + } else { + node_offsets[node_id] = make_pair(path_handle, offset); } - - // Otherwise complain to the user - cerr << "error:[gfa_to_graph]: GFA file contains a link " << link.source_name << " " << (source_backward ? 'L' : 'R') - << " to " << link.sink_name << " " << (sink_backward ? 'R' : 'L') - << " that tries to consume more source sequence (" << source_alignment_length - << ") than is present (" << source_sequence_length << ")" << endl; - - return false; + offset += graph->get_length(handle); + }); + } + + //Go through each node in the graph + graph->for_each_handle([&](const handle_t& h) { + out << "S\t"; + nid_t node_id = graph->get_id(h); + out << node_id << "\t"; + out << graph->get_sequence(h); + auto it = node_offsets.find(node_id); + if (it != node_offsets.end()) { + // add rGFA tags + out << "\t" << "SN:Z:" << graph->get_path_name(it->second.first) + << "\t" << "SO:i:" << it->second.second + << "\t" << "SR:i:0"; // todo: support non-zero ranks? + } + out << "\n"; // Writing `std::endl` would flush the buffer. 
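The rGFA tagging above is two passes in spirit: walk each selected path once, remembering per node which path first claimed it and at what cumulative offset, then emit every S line and attach SN/SO/SR tags to the claimed nodes. A toy, self-contained sketch of that bookkeeping, with plain STL containers standing in for vg's handle graph types (purely illustrative, not the real API):

```
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

struct ToyGraph {
    // node id -> sequence
    std::unordered_map<long, std::string> nodes;
    // path name -> ordered node ids (all assumed forward strand, as rGFA requires)
    std::unordered_map<std::string, std::vector<long>> paths;
};

int main() {
    ToyGraph g;
    g.nodes = {{1, "ACGT"}, {2, "TT"}, {3, "GGG"}};
    g.paths["chr1"] = {1, 2, 3};

    // First pass: record (path, offset) for every node on a selected rGFA path.
    std::unordered_map<long, std::pair<std::string, size_t>> node_offsets;
    for (auto& name_and_nodes : g.paths) {
        size_t offset = 0;
        for (long node_id : name_and_nodes.second) {
            // Keep the first claim; a real exporter would warn about collisions.
            node_offsets.emplace(node_id, std::make_pair(name_and_nodes.first, offset));
            offset += g.nodes[node_id].size();
        }
    }

    // Second pass: emit S lines, adding SN/SO/SR tags to tagged nodes.
    for (auto& id_and_seq : g.nodes) {
        std::cout << "S\t" << id_and_seq.first << "\t" << id_and_seq.second;
        auto it = node_offsets.find(id_and_seq.first);
        if (it != node_offsets.end()) {
            std::cout << "\tSN:Z:" << it->second.first
                      << "\tSO:i:" << it->second.second
                      << "\tSR:i:0"; // rank 0 only, matching the exporter's current support
        }
        std::cout << "\n";
    }
}
```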
+ return true; + }); + + // Sort the paths by name, making sure to treat subpath coordinates numerically + vector path_handles; + graph->for_each_path_matching(nullptr, nullptr, nullptr, [&](const path_handle_t& h) { + path_handles.push_back(h); + }); + std::sort(path_handles.begin(), path_handles.end(), [&](const path_handle_t& p1, const path_handle_t& p2) { + string n1 = graph->get_path_name(p1); + string n2 = graph->get_path_name(p2); + subrange_t subrange1; + subrange_t subrange2; + n1 = Paths::strip_subrange(n1, &subrange1); + n2 = Paths::strip_subrange(n2, &subrange2); + if (n1 < n2) { + return true; + } else if (n1 == n2) { + return subrange1 < subrange2; } - - if (sink_alignment_length > sink_sequence_length) { - // The GFA file is invalid and specifies using more bases than are present. - if (only_perfect_match) { - // Be tolerant and reject just this edge. - continue; - } - - // Otherwise complain to the user - cerr << "error:[gfa_to_graph]: GFA file contains a link " << link.source_name << " " << (source_backward ? 'L' : 'R') - << " to " << link.sink_name << " " << (sink_backward ? 'R' : 'L') - << " that tries to consume more sink sequence (" << sink_alignment_length - << ") than is present (" << sink_sequence_length << ")" << endl; - - return false; + return false; + }); + + vector w_line_paths; + + // Paths as P-lines + for (const path_handle_t& h : path_handles) { + auto path_name = graph->get_path_name(h); + if (rgfa_pline || !rgfa_paths.count(path_name)) { + if (graph->get_sense(h) != PathSense::REFERENCE && reference_samples.count(graph->get_sample_name(h))) { + // We have a mix of reference and non-reference paths on the same sample which GFA can't handle. + cerr << "warning [gfa]: path " << path_name << " will be interpreted as reference sense " + << "because reference paths exist on its sample" << endl; } - - // Set up some cursors in each node's sequence that go the right - // direction, based on orientations. Cursors start at the first - // base in the CIGAR, which may be past the end/before the - // beginning on the source if the CIGAR is 0 length. - int64_t source_cursor = source_backward ? (-1 + source_alignment_length) : (source_sequence_length - source_alignment_length); - int64_t source_motion = source_backward ? -1 : 1; - int64_t sink_cursor = sink_backward ? (sink_sequence_length - 1) : 0; - int64_t sink_motion = sink_backward ? -1 : 1; - - // Decide if we are pinching in agreeing orientations - bool pinch_same_strand = (source_backward == sink_backward); - - // Interpret the CIGAR string and perform pinches. - - for (size_t cigar_index = 0; cigar_index < cigar.size(); cigar_index++) { - // For each cigar operation - auto& elem = cigar[cigar_index]; - - if (elem.first == 0) { - // Skip 0-length operations - continue; - } - - if (cigar_index != 0 && (elem.second == "D" && cigar[cigar_index - 1].second == "I" || - elem.second == "I" && cigar[cigar_index - 1].second == "D")) { - // We found adjacent inserts and deletions, which throws - // off our dangling tip tucking algorithm and which - // shouldn't happen anyway in a well-behaved alignment. - // TODO: accomodate this somehow. - throw runtime_error("GFA importer cannot (yet) handle adjacent insertions and deletions."); - } - - // Decompose each operation into a series of suboperations. 
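The path sort above strips subrange coordinates before comparing, so subpaths order numerically rather than lexicographically. Below is a minimal standalone comparator in the same spirit; the bracketed "name[start-end]" spelling is an assumption standing in for what Paths::strip_subrange does, and only the numeric-versus-lexicographic point matters:

```
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Split "chr1[9-20]" into ("chr1", 9). Paths without a subrange sort first.
static std::pair<std::string, int64_t> strip_subrange(const std::string& name) {
    auto open = name.rfind('[');
    if (open == std::string::npos || name.back() != ']') {
        return {name, -1};
    }
    auto dash = name.find('-', open);
    auto end = (dash == std::string::npos) ? name.size() - 1 : dash;
    int64_t start = std::stoll(name.substr(open + 1, end - open - 1));
    return {name.substr(0, open), start};
}

int main() {
    std::vector<std::string> names = {"chr1[100-200]", "chr1[9-20]", "chr1", "chr2"};
    std::sort(names.begin(), names.end(), [](const std::string& a, const std::string& b) {
        auto pa = strip_subrange(a), pb = strip_subrange(b);
        if (pa.first != pb.first) return pa.first < pb.first;
        return pa.second < pb.second; // numeric, so [9-20] precedes [100-200]
    });
    for (auto& n : names) std::cout << n << "\n";
    // Expected order: chr1, chr1[9-20], chr1[100-200], chr2
}
```

A plain string sort would put chr1[100-200] ahead of chr1[9-20], because '1' compares less than '9'.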
- // This gives us an opportunity to replace M operations with =/X - vector> suboperations; + + if (use_w_lines && should_write_as_w_line(graph, h)) { + w_line_paths.push_back(h); + } else { + out << "P\t"; + out << path_name << "\t"; - if (elem.second == "M" && !only_perfect_match) { - // This is an M operation that needs to be decomposed into = and X - for (size_t i = 0; i < elem.first; i++) { - // For each position along the M operation - - // Find the next character in the source - auto source_char = source_string.at(source_cursor + source_motion * i); - if (source_backward) { - source_char = reverse_complement(source_char); - } - // And the sink - auto sink_char = sink_string.at(sink_cursor + sink_motion * i); - if (sink_backward) { - sink_char = reverse_complement(sink_char); - } - // Work out what kind of operation we need for this pairing of bases. - // TODO: Handle Ns specially? - string opcode = (source_char == sink_char) ? "=" : "X"; - - if (!suboperations.empty() && suboperations.back().second == opcode) { - // We can accumulate onto the existing suboperation of this type - suboperations.back().first++; - } else { - // We need a new suboperation of this type - suboperations.push_back(make_pair(1, opcode)); - } - } - } else { - // This operation can be passed through as-is - suboperations.push_back(elem); - } - - for (auto subelem_index = 0; subelem_index < suboperations.size(); subelem_index++) { - // For each suboperation - auto& subelem = suboperations[subelem_index]; - - // Get its length - auto& length = subelem.first; + bool first = true; + graph->for_each_step_in_path(h, [&](const step_handle_t& ph) { + handle_t step_handle = graph->get_handle_of_step(ph); - // Compute source and sink lengths - size_t source_length = 0; - size_t sink_length = 0; - assert(!subelem.second.empty()); - switch(subelem.second[0]) { - case 'M': - case '=': - case 'X': - source_length = length; - // Fall through - case 'I': - sink_length = length; - break; - case 'D': - source_length = length; - break; - default: - // We should have already checked for weird operations. - throw runtime_error("Invalid operation " + subelem.second + " in pre-screened CIGAR"); + if (!first) { + out << ','; } - - // Work out the sequence-local start of the region in each sequence that it may apply to, which depends on orientation. - int64_t source_region_start = source_backward ? (source_cursor - source_length + 1) : source_cursor; - int64_t sink_region_start = sink_backward ? (sink_cursor - sink_length + 1) : sink_cursor; - - // And also the end positions (last in region, on the other side of the start if the region is empty) - int64_t source_region_last = source_backward ? source_cursor : (source_cursor + source_length - 1); - int64_t sink_region_last = sink_backward ? sink_cursor : (sink_cursor + sink_length - 1); - - // Note thatb these are just lowest/highest coordinate in - // thread space, not corresponding to each other to to the - // start/end of the operation. - -#ifdef debug - cerr << "Suboperation " << subelem.first << subelem.second << " runs " - << source_region_start << " through " << source_region_last << " in " << source_pinch_name - << " and " << sink_region_start << " through " << sink_region_last << " in " << sink_pinch_name << endl; -#endif - - // We need to know when to wire in dangling source/sink bits. 
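When overlaps are allowed to mismatch, the removed importer rewrote each M operation into runs of = and X by comparing the two sequences base by base, so that only genuinely matching bases got pinched. A self-contained sketch of that run-length decomposition over two already-extracted, equal-length overlap strings (orientation handling deliberately elided):

```
#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Rewrite an M region as alternating runs of '=' (equal) and 'X' (mismatch).
static std::vector<std::pair<size_t, char>> decompose_m(const std::string& source,
                                                        const std::string& sink) {
    std::vector<std::pair<size_t, char>> ops;
    for (size_t i = 0; i < source.size() && i < sink.size(); ++i) {
        char op = (source[i] == sink[i]) ? '=' : 'X';
        if (!ops.empty() && ops.back().second == op) {
            ++ops.back().first;       // extend the current run
        } else {
            ops.emplace_back(1, op);  // start a new run
        }
    }
    return ops;
}

int main() {
    // 4M over these sequences becomes 2=1X1=; only the '=' runs would be pinched.
    for (auto& op : decompose_m("ACGT", "ACTT")) {
        std::cout << op.first << op.second;
    }
    std::cout << std::endl;
}
```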
- bool is_first_subelement = (cigar_index == 0 && subelem_index == 0); - bool is_last_subelement = (cigar_index == cigar.size() - 1 && subelem_index == suboperations.size() - 1); - - assert(!subelem.second.empty()); - switch(subelem.second[0]) { - case 'M': - if (only_perfect_match) { - // The whole match can be merged - stPinchThread_pinch(source_thread, sink_thread, source_region_start, sink_region_start, length, pinch_same_strand); - } else { - // If we aren't in always_perfect_match mode this should have become =/X - throw runtime_error("Encountered unparsed M operation"); - } - break; - case '=': - // Always pinch. - // TODO: should we check sequence equality? - stPinchThread_pinch(source_thread, sink_thread, source_region_start, sink_region_start, length, pinch_same_strand); - break; - case 'X': - // Only pinch if we are forcing matches (in which case this was X originally) - if (only_perfect_match) { - stPinchThread_pinch(source_thread, sink_thread, source_region_start, sink_region_start, length, pinch_same_strand); - } else { - if (is_first_subelement) { - // We dangled the sink and potentially created - // a tip. We need to remember the pinch we - // would have made, so we can wire up the - // dangling end to what we would have wired it - // to if we had pinched it. - - // Describe the end that is dangling as (thread name, base, is_left) - tuck_side_t dangling_end = make_tuple(sink_pinch_name, - sink_backward ? stPinchThread_getLength(sink_thread) - 1 : 0, !sink_backward); - // Describe what it would have merged with. - tuck_side_t tuck_into = make_tuple(source_pinch_name, source_cursor, !source_backward); - // Queue up the edge exchange - tucks.emplace_back(dangling_end, tuck_into); - -#ifdef debug - cerr << "\tSuboperation requires tucking " - << get<0>(dangling_end) << ":" << get<1>(dangling_end) << (get<2>(dangling_end) ? 'L' : 'R') - << " in with " << get<0>(tuck_into) << ":" << get<1>(tuck_into) << (get<2>(tuck_into) ? 'L' : 'R') << endl; -#endif - - // Splits to allow tucks will be done later. - } - - if (is_last_subelement) { - // We dangled the source as well, on the right of the overlap - - // Describe the end that is dangling as (thread name, base, is_left) - tuck_side_t dangling_end = make_tuple(source_pinch_name, - source_backward ? 0 : stPinchThread_getLength(source_thread) - 1, source_backward); - // Describe what it would have merged with. - tuck_side_t tuck_into = make_tuple(sink_pinch_name, sink_cursor + sink_motion * (sink_length - 1), sink_backward); - // Queue up the edge exchange - tucks.emplace_back(dangling_end, tuck_into); - -#ifdef debug - cerr << "\tSuboperation requires tucking " - << get<0>(dangling_end) << ":" << get<1>(dangling_end) << (get<2>(dangling_end) ? 'L' : 'R') - << " in with " << get<0>(tuck_into) << ":" << get<1>(tuck_into) << (get<2>(tuck_into) ? 'L' : 'R') << endl; -#endif - - // Splits to allow tucks will be done later. - } - } - break; - case 'I': - // We don't need to do any pinching - if (is_first_subelement) { - // We dangled the sink on the left of the overlap - - // Describe the end that is dangling as (thread name, base, is_left) - tuck_side_t dangling_end = make_tuple(sink_pinch_name, - sink_backward ? stPinchThread_getLength(sink_thread) - 1 : 0, !sink_backward); - // Describe what it would have merged with. 
- tuck_side_t tuck_into = make_tuple(source_pinch_name, source_cursor, !source_backward); - // Queue up the edge exchange - tucks.emplace_back(dangling_end, tuck_into); - -#ifdef debug - cerr << "\tSuboperation requires tucking " - << get<0>(dangling_end) << ":" << get<1>(dangling_end) << (get<2>(dangling_end) ? 'L' : 'R') - << " in with " << get<0>(tuck_into) << ":" << get<1>(tuck_into) << (get<2>(tuck_into) ? 'L' : 'R') << endl; -#endif - - // Splits to allow tucks will be done later. - } - break; - case 'D': - // No pinching - if (is_last_subelement) { - // We dangled the source on the right of the overlap - - // Describe the end that is dangling as (thread name, base, is_left) - tuck_side_t dangling_end = make_tuple(source_pinch_name, - source_backward ? 0 : stPinchThread_getLength(source_thread) - 1, source_backward); - // Describe what it would have merged with. - // We have to back out sink motion since the cursor is right now at the after-the-deletion position. - tuck_side_t tuck_into = make_tuple(sink_pinch_name, sink_cursor - sink_motion, sink_backward); - // Queue up the edge exchange - tucks.emplace_back(dangling_end, tuck_into); - -#ifdef debug - cerr << "\tSuboperation requires tucking " - << get<0>(dangling_end) << ":" << get<1>(dangling_end) << (get<2>(dangling_end) ? 'L' : 'R') - << " in with " << get<0>(tuck_into) << ":" << get<1>(tuck_into) << (get<2>(tuck_into) ? 'L' : 'R') << endl; -#endif - - } - break; - default: - // We should have already checked for weird operations twice now. - throw runtime_error("Invalid operation " + subelem.second + " in pre-screened CIGAR"); - } - - // Advance the cursors - sink_cursor += sink_motion * sink_length; - source_cursor += source_motion * source_length; - } + out << graph->get_id(step_handle); + out << (graph->get_is_reverse(step_handle) ? '-' : '+'); + first = false; + return true; + }); + + out << "\t*" << "\n"; } } } - - // Now all the pinches have been made - - // Also we know all the tucks that need to be union-finded. - - // Make a set of them - unordered_set tuck_side_set; - for (auto& to_union : tucks) { - tuck_side_set.insert(to_union.first); - tuck_side_set.insert(to_union.second); - } - - for (auto& side : tuck_side_set) { - // Don't forget to split the threads to expose the sides we need to copy edges between - auto& thread_name = get<0>(side); - auto& position = get<1>(side); - auto& is_left = get<2>(side); - -#ifdef debug - cerr << "Need to expose thread " << thread_name << ":" << position << " side " << (is_left ? 'L' : 'R') << endl; -#endif - - // Find the thread - stPinchThread* thread = stPinchThreadSet_getThread(pinch.get(), thread_name); - - if (is_left && position == 0) { - // Don't try and ask to split at -1; this side is already exposed - continue; - } - - // Split the thread to expose this side - stPinchThread_split(thread, is_left ? 
position - 1 : position); - } - - // Everything else should wait until after graph nodes are made -#ifdef debug + // Paths as W-lines { - size_t segment_count = 0; - auto segment_iter = stPinchThreadSet_getSegmentIt(pinch.get()); - stPinchSegment* segment = stPinchThreadSetSegmentIt_getNext(&segment_iter); - for (; segment != nullptr; segment = stPinchThreadSetSegmentIt_getNext(&segment_iter)) { - segment_count++; + unordered_map, size_t> last_phase_block_end; + for (const path_handle_t& h : w_line_paths) { + write_w_line(graph, out, h, last_phase_block_end); } - - cerr << "Total pinch segments: " << segment_count << endl; } -#endif - // Convert the pinch blocks into vg nodes - - // We use this translator to translate block pointers to node IDs - PinchToVGTranslator pinch_to_vg; + graph->for_each_edge([&](const edge_t& h) { + + nid_t from_id = graph->get_id(h.first); + bool from_is_reverse = graph->get_is_reverse(h.first); + nid_t to_id = graph->get_id(h.second); + bool to_is_reverse = graph->get_is_reverse(h.second); - { - auto segment_iter = stPinchThreadSet_getSegmentIt(pinch.get()); - stPinchSegment* segment = stPinchThreadSetSegmentIt_getNext(&segment_iter); - for (; segment != nullptr; segment = stPinchThreadSetSegmentIt_getNext(&segment_iter)) { - - // For each segment in the pinch set (including those not in blocks) - - // Generate or assign the node ID for its block or itself - id_t node_id = pinch_to_vg.translate(segment); - - if (graph->has_node(node_id)) { - // We already did this graph node, from another segment in the block. - continue; - } - - // Get the segment's thread name, + strand offset, length, and orientation - auto segment_thread_name = stPinchSegment_getName(segment); - auto segment_start = stPinchSegment_getStart(segment); - auto segment_length = stPinchSegment_getLength(segment); - auto segment_backward = !stPinchSegment_getBlockOrientationSafe(segment); - - // Go find the source DNA for the GFA sequence that created the thread - const string& thread_sequence = gfa_sequences[gfa_to_pinch.untranslate(segment_thread_name)].sequence; - - // Compute the sequence for the vg node we will make - string node_sequence = thread_sequence.substr(segment_start, segment_length); - if (segment_backward) { - node_sequence = reverse_complement(node_sequence); - } + if (from_is_reverse && (to_is_reverse || to_id < from_id)) { + // Canonicalize edges to be + orientation first if possible, and + // then low-ID to high-ID if possible, for testability. This edge + // needs to flip. 
- // Make the node in the graph - graph->create_node(node_sequence, node_id); + // Swap the nodes + std::swap(from_id, to_id); + // Swap the orientations + std::swap(from_is_reverse, to_is_reverse); + // Reverse the orientations + from_is_reverse = !from_is_reverse; + to_is_reverse = !to_is_reverse; } - } - - { - // Add edges from pinch graph adjacencies - auto thread_iter = stPinchThreadSet_getIt(pinch.get()); - stPinchThread* thread = stPinchThreadSetIt_getNext(&thread_iter); - for (; thread != nullptr; thread = stPinchThreadSetIt_getNext(&thread_iter)) { - // For each thread in the pinch thread set - // Start at the beginning - stPinchSegment* here = stPinchThread_getFirst(thread); - assert(here != nullptr); - - // Look one segment ahead - stPinchSegment* next = stPinchSegment_get3Prime(here); - - while (next != nullptr) { - // For each pinch graph connection from here to next - - // Get the nodes we are connecting - id_t here_id = pinch_to_vg.translate(here); - id_t next_id = pinch_to_vg.translate(next); - - // Get their orientations - bool here_backward = !stPinchSegment_getBlockOrientationSafe(here); - bool next_backward = !stPinchSegment_getBlockOrientationSafe(next); - - // Make the edge if not present already - Edge* e = graph->create_edge(here_id, next_id, here_backward, next_backward); - -#ifdef debug - cerr << "Created pinch graph edge " << pb2json(*e) << endl; -#endif - - // Advance right - here = next; - next = stPinchSegment_get3Prime(here); - } - } - } - - // Add edges from abut_links - for (auto& abutment : abut_links) { - // Unpack each abutment record - auto& source_name = get<0>(abutment); - auto& sink_name = get<1>(abutment); - auto& source_backward = get<2>(abutment); - auto& sink_backward = get<3>(abutment); - - // Get the threads by name - stPinchThread* source_thread = stPinchThreadSet_getThread(pinch.get(), source_name); - stPinchThread* sink_thread = stPinchThreadSet_getThread(pinch.get(), sink_name); - - // Find the segment of the source that is relevant. - // If the source sequence is forward, it is the last one, but if it is backward, it is the first one. - stPinchSegment* source_segment = source_backward ? stPinchThread_getFirst(source_thread) : stPinchThread_getLast(source_thread); - // And conversely for the sink - stPinchSegment* sink_segment = sink_backward ? stPinchThread_getLast(sink_thread) : stPinchThread_getFirst(sink_thread); - - // Get the node IDs to connect - id_t from_id = pinch_to_vg.translate(source_segment); - id_t to_id = pinch_to_vg.translate(sink_segment); - - // Figure out the orientation of each node. We take whether the segemnt - // is backward in its node, and flip it if the segment itself is - // visited backward. - bool from_start = (!stPinchSegment_getBlockOrientationSafe(source_segment) != source_backward); - bool to_end = (!stPinchSegment_getBlockOrientationSafe(sink_segment) != sink_backward); - - // Make the edge - Edge* e = graph->create_edge(from_id, to_id, from_start, to_end); - -#ifdef debug - cerr << "Created abutment edge " << pb2json(*e) << endl; -#endif - } - - // Copy edges to equivalent nodes caused by tucking - -#ifdef debug - cerr << "Before tucks:" << endl; - cerr << pb2json(graph->graph) << endl; -#endif + out << "L\t" << from_id << "\t" << (from_is_reverse ? '-' : '+') + << "\t" << to_id << "\t" << (to_is_reverse ? '-' : '+') << "\t0M\n"; // Writing `std::endl` would flush the buffer. 
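The edge loop above canonicalizes each edge before printing it: if the from side is reversed and flipping the edge would give a forward from side, or at least put the lower ID first, the endpoints are swapped and both orientations inverted, so equivalent edges always print the same L line. A standalone sketch of that rule, with plain integers for node IDs and the blunt-ended `0M` overlap used in the output:

```
#include <iostream>
#include <utility>

struct EdgeEnd { long id; bool is_reverse; };

// Mirror the exporter's rule: prefer a forward "from" side, then low-to-high IDs.
static void canonicalize(EdgeEnd& from, EdgeEnd& to) {
    if (from.is_reverse && (to.is_reverse || to.id < from.id)) {
        std::swap(from, to);
        from.is_reverse = !from.is_reverse;
        to.is_reverse = !to.is_reverse;
    }
}

static void print_l_line(EdgeEnd from, EdgeEnd to) {
    canonicalize(from, to);
    std::cout << "L\t" << from.id << "\t" << (from.is_reverse ? '-' : '+')
              << "\t" << to.id << "\t" << (to.is_reverse ? '-' : '+') << "\t0M\n";
}

int main() {
    // 2- -> 1- is the same edge as 1+ -> 2+, so it prints in the canonical form.
    print_l_line({2, true}, {1, true});   // L 1 + 2 + 0M
    // 5- -> 7+ cannot be made forward-first without changing the edge; left alone.
    print_l_line({5, true}, {7, false});  // L 5 - 7 + 0M
}
```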
+ return true; + }, false); +} +bool should_write_as_w_line(const PathHandleGraph* graph, path_handle_t path_handle) { + // Until we can change the tests, default to sending reference and + // haplotype paths as W lines, and generic paths as P lines. + return graph->get_sense(path_handle) != PathSense::GENERIC; +} - // OK now we need to do the tucking - - // Make a vector of tuck sides to give them all indexes - vector tuck_side_vector(tuck_side_set.begin(), tuck_side_set.end()); - tuck_side_set.clear(); - - // Get a map to invert the vector - unordered_map tuck_side_to_index; - for (size_t i = 0; i < tuck_side_vector.size(); i++) { - tuck_side_to_index[tuck_side_vector[i]] = i; +void write_w_line(const PathHandleGraph* graph, ostream& out, path_handle_t path_handle, unordered_map, size_t>& last_phase_block_end) { + // Extract the path metadata + string sample = graph->get_sample_name(path_handle); + string contig = graph->get_locus_name(path_handle); + int64_t hap_index = graph->get_haplotype(path_handle); + int64_t phase_block = graph->get_phase_block(path_handle); + auto subrange = graph->get_subrange(path_handle); + size_t start_offset = 0; + size_t end_offset = 0; + if (subrange != PathMetadata::NO_SUBRANGE) { + start_offset = subrange.first; + if (subrange.second != PathMetadata::NO_END_POSITION) { + end_offset = subrange.second; + } } - // Create the union-find - structures::UnionFind tuck_merger(tuck_side_vector.size()); - - for (auto& to_union : tucks) { - // Do all the unioning - auto side1 = tuck_side_to_index[to_union.first]; - auto side2 = tuck_side_to_index[to_union.second]; - tuck_merger.union_groups(side1, side2); + if (sample == PathMetadata::NO_SAMPLE_NAME) { + // Represent an elided sample name with "*"; + sample = "*"; } - // We don't need these anymore now that we have the union-find filled in. - tuck_side_to_index.clear(); - tucks.clear(); - // Convert all the tuck sides to outward-pointing handles - vector tuck_handles(tuck_side_vector.size()); - // Also invert the handle vector so we can get handle indexes - unordered_map handle_to_index; - for (size_t i = 0; i < tuck_side_vector.size(); i++) { - // Unpack each side - auto& side = tuck_side_vector[i]; - auto& thread_name = get<0>(side); - auto& position = get<1>(side); - auto& is_left = get<2>(side); - - // Find the thread - stPinchThread* thread = stPinchThreadSet_getThread(pinch.get(), thread_name); - - // Find the segment - stPinchSegment* segment = stPinchThread_getSegment(thread, position); - - // Find the node and segment-relative orientation - id_t node = pinch_to_vg.translate(segment); - bool segment_reverse_in_node = !stPinchSegment_getBlockOrientationSafe(segment); - - // Make the handle - tuck_handles[i] = graph->get_handle(node, is_left != segment_reverse_in_node); - // And store its index under it - handle_to_index[tuck_handles[i]] = i; + if (hap_index == PathMetadata::NO_HAPLOTYPE) { + // No haplotype is actually assigned here. + // We probably won't have paths with it assigned and not assigned but + // the same sample and contig, so assign it 0 and make the sample + // haploid. + // TODO: check for collisions somehow? + hap_index = 0; } - tuck_side_vector.clear(); - - // Keep a set, for each union-find group, of all the destination handles - // that everything in the group has to have an edge to. 
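Only the path's sense decides W versus P above, and W lines have mandatory sample, haplotype, and contig columns, so elided metadata gets placeholders: "*" for a missing sample and haplotype 0 for a missing haplotype. A small self-contained sketch of filling the fixed W-line columns; the struct and sentinel values are illustrative stand-ins for vg's PathMetadata, not its actual interface:

```
#include <iostream>
#include <string>

// Illustrative stand-ins for PathMetadata's "not set" sentinels.
static const std::string NO_SAMPLE = "";
static const long NO_HAPLOTYPE = -1;

struct PathMeta {
    std::string sample = NO_SAMPLE;
    long haplotype = NO_HAPLOTYPE;
    std::string contig;
    size_t start = 0;    // subrange start, 0 if none
    size_t length = 0;   // total bases on the path
};

// Print the fixed columns of a W line: sample, haplotype, contig, start, end.
static void write_w_prefix(const PathMeta& m, std::ostream& out) {
    std::string sample = m.sample.empty() ? "*" : m.sample;      // elided sample -> "*"
    long hap = (m.haplotype == NO_HAPLOTYPE) ? 0 : m.haplotype;  // elided haplotype -> 0
    out << "W\t" << sample << "\t" << hap << "\t" << m.contig
        << "\t" << m.start << "\t" << (m.start + m.length) << "\t";
}

int main() {
    PathMeta ref_path;            // e.g. a reference path with no sample metadata
    ref_path.contig = "chr1";
    ref_path.length = 1000;
    write_w_prefix(ref_path, std::cout);
    std::cout << ">1>2>3" << std::endl;   // the walk itself would follow
}
```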
- unordered_map> destinations; - - for (size_t i = 0; i < tuck_handles.size(); i++) { - // For each tuck side handle - auto& handle = tuck_handles[i]; - - // Find the destination set for the union-find group this tuck side handle belongs to - auto& my_destinations = destinations[tuck_merger.find_group(i)]; - - graph->follow_edges(handle, false, [&](const handle_t& next) { - // Loop over every handle it goes to (facing inward) - - // Add each handle to the destinations set for the union-find group that owns this tuck side - my_destinations.insert(next); - - // Check and see if this place we go itself belongs to a union-find - // group (making sure to make it face us first) - auto found = handle_to_index.find(graph->flip(next)); - - if (found != handle_to_index.end()) { - // It does, so we have to pretend it is also all the other things in its group - for(size_t other_index : tuck_merger.group(found->second)) { - // Go get all the handles that tuck in with it - handle_t& also_tucked = tuck_handles.at(other_index); - - // And add them all as destinations too - my_destinations.insert(graph->flip(also_tucked)); - } - } + + // Get the path length. + // TODO: sniff if the graph has this cached somehow? + size_t path_length = 0 ; + graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { + path_length += graph->get_length(graph->get_handle_of_step(step_handle)); }); + + if (end_offset != 0 && start_offset + path_length != end_offset) { + cerr << "[gfa] warning: incorrect end offset (" << end_offset << ") extracted from from path name " << graph->get_path_name(path_handle) + << ", using " << (start_offset + path_length) << " instead" << endl; } - for (size_t i = 0; i < tuck_handles.size(); i++) { - // For each tuck side handle again - auto& handle = tuck_handles[i]; - - // Look up all the destinations for its union-find group - auto& my_destinations = destinations[tuck_merger.find_group(i)]; - - for (auto& destination : my_destinations) { - // Connect it to each of them - graph->create_edge(handle, destination); - } - - } - - // Now all the nodes and edges exist. - - // Process the GFA paths - - for (auto& name_and_path : gfa_paths) { - // For each path record by name - auto& name = name_and_path.first; - auto& path = name_and_path.second; - -#ifdef debug - cerr << "Import path " << name << endl; - cerr << path.to_string() << endl; -#endif - - // Create each path - graph->paths.create_path(name); - - if (path.segment_names.size() == 0) { - // Empty paths need nothing else. - continue; - } - - // Start the path with visits to the entirety of the first thread it traces - { - // Find the thread to visit - int64_t thread_name = gfa_to_pinch.translate(path.segment_names[0]); - // Determine if it is visited backward - bool thread_backward = !path.orientations[0]; - - // Get the actual thread - stPinchThread* thread = stPinchThreadSet_getThread(pinch.get(), thread_name); - - // Get the starting end appropriate to the orientation - stPinchSegment* segment = thread_backward ? stPinchThread_getLast(thread) : stPinchThread_getFirst(thread); - -#ifdef debug - cerr << "\tBegin at " << path.segment_names[0] - << " = " << thread_name << (thread_backward ? 
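The removed importer's tuck handling grouped each dangling side with the side it should share edges with in a union-find, pooled every destination seen by any member of a group, and then wired every member to the whole pool. A toy sketch of that propagation, with small integers standing in for pinch-graph sides and graph handles and a minimal hand-rolled union-find in place of structures::UnionFind (the extra case where a destination is itself a tucked side is left out):

```
#include <iostream>
#include <numeric>
#include <set>
#include <unordered_map>
#include <vector>

// Minimal union-find, standing in for structures::UnionFind.
struct DSU {
    std::vector<int> parent;
    explicit DSU(int n) : parent(n) { std::iota(parent.begin(), parent.end(), 0); }
    int find(int x) { return parent[x] == x ? x : parent[x] = find(parent[x]); }
    void unite(int a, int b) { parent[find(a)] = find(b); }
};

int main() {
    // Sides 0..3; tucks say side 0 must share edges with side 1, and 2 with 3.
    DSU groups(4);
    groups.unite(0, 1);
    groups.unite(2, 3);

    // Existing outgoing edges of each side (destinations are arbitrary handle ids).
    std::unordered_map<int, std::set<int>> edges = {
        {0, {10}}, {1, {11}}, {2, {12}}, {3, {}}
    };

    // Pool every group's destinations...
    std::unordered_map<int, std::set<int>> group_destinations;
    for (auto& side_and_dests : edges) {
        auto& pooled = group_destinations[groups.find(side_and_dests.first)];
        pooled.insert(side_and_dests.second.begin(), side_and_dests.second.end());
    }
    // ...then give every member of a group the whole pooled set.
    for (auto& side_and_dests : edges) {
        side_and_dests.second = group_destinations[groups.find(side_and_dests.first)];
    }

    for (int side = 0; side < 4; ++side) {
        std::cout << "side " << side << " ->";
        for (int dest : edges[side]) std::cout << " " << dest;
        std::cout << "\n";
    }
    // Sides 0 and 1 each end with {10, 11}; sides 2 and 3 each end with {12}.
}
```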
'L' : 'R') << endl; -#endif - - while (segment != nullptr) { - // Look up the node - id_t node = pinch_to_vg.translate(segment); - // Compute its visit orientation - bool node_backward = (!stPinchSegment_getBlockOrientationSafe(segment) != thread_backward); - -#ifdef debug - cerr << "\t\tPath starts with " << stPinchSegment_getLength(segment) - << " bases on node " << node << " orientation " << (node_backward ? "rev" : "fwd") << endl; -#endif - - // Visit it - graph->paths.append_mapping(name, node, node_backward, stPinchSegment_getLength(segment), 0); - // Advance to the next segment - segment = thread_backward ? stPinchSegment_get5Prime(segment) : stPinchSegment_get3Prime(segment); - } - } - - // If we find a nonexistent/skipped link we need to abort the entire path. - bool abort_path = false; - for (size_t i = 1; i < path.segment_names.size() && !abort_path; i++) { - // For each subsequent GFA path visit (which becomes a thread) - - // Find the thread to visit - int64_t thread_name = gfa_to_pinch.translate(path.segment_names[i]); - // Determine if it is visited backward - bool thread_backward = !path.orientations[i]; - - // And the previous thread - int64_t prev_thread_name = gfa_to_pinch.translate(path.segment_names[i - 1]); - bool prev_thread_backward = !path.orientations[i - 1]; - -#ifdef debug - cerr << "\tCross edge " << path.segment_names[i - 1] - << " = " << prev_thread_name << (prev_thread_backward ? 'L' : 'R') - << " to " << path.segment_names[i] << " = " << thread_name << (thread_backward ? 'R' : 'L') << endl; -#endif - - // Work out how much of this thread the previous thread ate, by looking at the overlaps on the links - auto overlap_to = link_skips.find(tie(prev_thread_name, thread_name, prev_thread_backward, thread_backward)); - if (overlap_to == link_skips.end()) { - // This thread crosses a link that isn't there. - // We want to get rid of the path entirely since we can't represent it. - - if (only_perfect_match) { - // Edge may have been removed for having a bad alignment. - cerr << "warning [gfa_to_graph]: path " << name << ": edge " << path.segment_names[i - 1] - << " = " << prev_thread_name << (prev_thread_backward ? 'L' : 'R') - << " to " << path.segment_names[i] << " = " << thread_name << (thread_backward ? 'R' : 'L') - << " is not present. It may have been removed due to having a bad alignment. Discarding path!" << endl; - } else { - // All the edges should be there. This is an error in the GFA. - stringstream msg; - msg << "error [gfa_to_graph]: path " << name << ": edge " << path.segment_names[i - 1] - << " = " << prev_thread_name << (prev_thread_backward ? 'L' : 'R') - << " to " << path.segment_names[i] << " = " << thread_name << (thread_backward ? 'R' : 'L') - << " is not present. The GFA file is malformed!"; - throw runtime_error(msg.str()); - } - - // Remove the path and skip out on adding the rest of it - graph->paths.remove_path(name); - abort_path = true; - break; - } - - // TODO: We don't check for *vg graph edges* that aren't present, only *links*. - - // Start at the near end of the next thread - stPinchThread* thread = stPinchThreadSet_getThread(pinch.get(), thread_name); - stPinchSegment* segment = thread_backward ? stPinchThread_getLast(thread) : stPinchThread_getFirst(thread); - - // Skip segments until we have accounted for the overlap. 
- size_t overlap_skipped = 0; - while (overlap_skipped < overlap_to->second && segment != nullptr) { - overlap_skipped += stPinchSegment_getLength(segment); -#ifdef debug - cerr << "\t\tSkip overlap of " << stPinchSegment_getLength(segment) - << " from segment for node " << pinch_to_vg.translate(segment) << endl; -#endif - segment = thread_backward ? stPinchSegment_get5Prime(segment) : stPinchSegment_get3Prime(segment); - } - - // We should always reach the overlap at a segment boundary - assert(overlap_skipped == overlap_to->second); - - // Continue adding segments to the path as nodes from there until the end of the thread. - while (segment != nullptr) { - // Look up the node - id_t node = pinch_to_vg.translate(segment); - // Compute its visit orientation - bool node_backward = (!stPinchSegment_getBlockOrientationSafe(segment) != thread_backward); - -#ifdef debug - cerr << "\t\tPath follows " << stPinchSegment_getLength(segment) - << " bases on node " << node << " orientation " << (node_backward ? "rev" : "fwd") << endl; -#endif - - // Visit it - graph->paths.append_mapping(name, node, node_backward, stPinchSegment_getLength(segment), 0); - // Advance to the next segment - segment = thread_backward ? stPinchSegment_get5Prime(segment) : stPinchSegment_get3Prime(segment); - } + // See if we need to bump along the start offset to avoid collisions of phase blocks + auto key = std::tuple(sample, hap_index, contig); + auto& phase_block_end_cursor = last_phase_block_end[key]; + if (phase_block_end_cursor != 0) { + if (start_offset != 0) { + // TODO: Work out a way to support phase blocks and subranges at the same time. + cerr << "[gfa] error: cannot write multiple phase blocks on a sample, haplotyope, and contig in GFA format" + << " when paths already have subranges. Fix path " << graph->get_path_name(path_handle) << endl; + exit(1); } + // Budge us to after the last thing and budge the cursor to after us. + // TODO: GBWTGraph algorithm just uses phase block number as start + // position so it can roudn trip. Settle on a way to round trip the + // small phase block numbers somehow? + start_offset += phase_block_end_cursor; + phase_block_end_cursor += path_length; } - - // Save the paths to the graph - graph->paths.rebuild_mapping_aux(); - graph->paths.to_graph(graph->graph); - - // Now the graph is done! - // TODO: validate graph and paths and assign path mapping ranks - - return true; - -} - -void graph_to_gfa(const VG* graph, ostream& out) { - GFAKluge gg; - gg.set_version(1.0); - for (auto h : gg.get_header()){ - out << h.second.to_string(); - } - // TODO moving to GFAKluge - // problem: protobuf longs don't easily go to strings.... - - graph->for_each_node([&](const Node* n) { - sequence_elem s_elem; - // Fill seq element for a node - s_elem.name = to_string(n->id()); - s_elem.sequence = n->sequence(); - out << s_elem.to_string_1() << endl; - //gg.add_sequence(s_elem); - }); - - auto& pathmap = graph->paths._paths; - for (auto p : pathmap){ - path_elem p_elem; - p_elem.name = p.first; - for (auto m : p.second){ - p_elem.segment_names.push_back( std::to_string(m.node_id()) ); - p_elem.orientations.push_back( !m.is_reverse() ); - const Node* n = graph->get_node( m.node_id() ); - stringstream cigaro; - //cigaro << n->sequence().size() << (p.mapping(m_ind.position().is_reverse()) ? "M" : "M"); - cigaro << n->sequence().size() << (m.is_reverse() ? 
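Because W lines have no phase block column, the writer fakes non-overlapping coordinates per (sample, haplotype, contig): a cursor remembers where the previous block ended, the next block is budged past it, and the cursor then advances by that block's length. A self-contained sketch of the intended bookkeeping, with plain strings and integers in place of vg's metadata types (this sketch simply advances the cursor for every block it writes):

```
#include <iostream>
#include <map>
#include <string>
#include <tuple>
#include <vector>

int main() {
    using Key = std::tuple<std::string, int, std::string>; // (sample, haplotype, contig)

    // Three phase blocks for one haplotype of one contig, plus one other sample.
    struct Block { Key key; size_t length; };
    std::vector<Block> blocks = {
        {{"NA12878", 1, "chr1"}, 500},
        {{"NA12878", 1, "chr1"}, 300},
        {{"NA12878", 1, "chr1"}, 200},
        {{"NA24385", 1, "chr1"}, 400},
    };

    // Cursor: end of the last phase block written for each key.
    std::map<Key, size_t> last_phase_block_end;

    for (auto& b : blocks) {
        size_t start_offset = 0;
        size_t& cursor = last_phase_block_end[b.key];
        if (cursor != 0) {
            start_offset += cursor;   // budge past the previous block
        }
        cursor += b.length;           // and move the cursor past this one
        std::cout << std::get<0>(b.key) << " hap " << std::get<1>(b.key)
                  << " " << std::get<2>(b.key)
                  << " [" << start_offset << ", " << (start_offset + b.length) << ")\n";
    }
    // NA12878 blocks land at [0,500), [500,800), [800,1000); NA24385 starts back at [0,400).
}
```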
"M" : "M"); - p_elem.overlaps.push_back( cigaro.str() ); - } - out << p_elem.to_string_1() << endl; - //gg.add_path(p_elem.name, p_elem); - } + out << "W\t" << sample << "\t" << hap_index << "\t" << contig << "\t" << start_offset << "\t" << (start_offset + path_length) << "\t"; - graph->for_each_edge([&](const Edge* e) { - edge_elem ee; - ee.type = 1; - ee.source_name = to_string(e->from()); - ee.sink_name = to_string(e->to()); - ee.source_orientation_forward = ! e->from_start(); - ee.sink_orientation_forward = ! e->to_end(); - ee.alignment = std::to_string(e->overlap()) + "M"; - out << ee.to_string_1() << endl;; - //gg.add_edge(ee.source_name, ee); - //link_elem l; - //l.source_name = to_string(e->from()); - //l.sink_name = to_string(e->to()); - //l.source_orientation_forward = ! e->from_start(); - //l.sink_orientation_forward = ! e->to_end(); - //l.cigar = std::to_string(e->overlap()) + "M"; - //gg.add_link(l.source_name, l); - }); - //gg.output_to_stream(cout); + graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { + handle_t handle = graph->get_handle_of_step(step_handle); + out << (graph->get_is_reverse(handle) ? "<" : ">") << graph->get_id(handle); + }); + out << "\n"; } } diff --git a/src/gfa.hpp b/src/gfa.hpp index d460554f8a5..3e914c21499 100644 --- a/src/gfa.hpp +++ b/src/gfa.hpp @@ -4,38 +4,27 @@ /** * \file gfa.hpp * - * Defines GFA I/O algorithms for VG graphs. + * Defines GFA I/O algorithms for PathHandleGraphs graphs. * * Includes an algorithm for converting from GFA, including non-perfect-match * edge overlaps and edges that specify containment of one node in another, to * a blunt-ended VG. */ -#include "vg.hpp" +#include "handle.hpp" namespace vg { using namespace std; -/** - * Import the given GFA file into the given (empty) VG. If only_perfect_match - * is set, only completely-M-operation CIGARs of length <= node length for - * edge overlaps will be used, and sequence differences will be resolved - * arbitrarily. Otherwise, CIGAR strings will be respected, files containing - * alingments using more bases than the sequences will be rejected, and - * mismatches in CIGAR-M sequences will form bubbles. - * - * Returns true if the import was successful. Returns false if the import - * failed because the GFA file is invalid. Throws an error if the import failed - * because of an apparent bug in the import code, or if the GFA tries to do - * something that might be technically valid but which we don't know how to - * interpret. - */ -bool gfa_to_graph(istream& in, VG* graph, bool only_perfect_match = false); - /// Export the given VG graph to the given GFA file. -void graph_to_gfa(const VG* graph, ostream& out); - +/// Express paths mentioned in rgfa_paths as rGFA. +/// If rgfa_pline is set, also express them as dedicated lines. +/// If use_w_lines is set, reference and haplotype paths will use W lines instead of P lines. 
+void graph_to_gfa(const PathHandleGraph* graph, ostream& out, + const set& rgfa_paths = {}, + bool rgfa_pline = false, + bool use_w_lines = true); } diff --git a/src/gff_reader.cpp b/src/gff_reader.cpp new file mode 100644 index 00000000000..fd03c03134b --- /dev/null +++ b/src/gff_reader.cpp @@ -0,0 +1,118 @@ +#include "gff_reader.hpp" + +namespace vg { + + map GFFRecord::parse_attributes() { + + map parsed_attributes; + stringstream attr_stream(attributes); + + string buffer; + while (attr_stream.good()) { + getline(attr_stream, buffer, ';'); + + stringstream split_stream(buffer); + + string attr_type; + string attr_value; + + getline(split_stream, attr_type, '='); + getline(split_stream, attr_value, '\0'); + + parsed_attributes[attr_type] = attr_value; + + buffer.clear(); + } + + return parsed_attributes; + } + + GFFReader::GFFReader(istream& in) : in(in) { + + } + + void GFFReader::for_each_gff_record(function& lambda) { + + while (in.good()) { + // skip header lines + if (in.peek() == '#') { + in.ignore(numeric_limits::max(), '\n'); + continue; + } + + GFFRecord record; + + string buffer; + char* ignored; + + // parse sequence ID + getline(in, buffer, '\t'); + if (buffer.empty()) { + continue; + } + else if (buffer != ".") { + record.sequence_id = move(buffer); + } + buffer.clear(); + + // parse data source + getline(in, buffer, '\t'); + if (buffer != ".") { + record.source = move(buffer); + } + buffer.clear(); + + // parse type of annotation + getline(in, buffer, '\t'); + if (buffer != ".") { + record.type = move(buffer); + } + buffer.clear(); + + // parse start coordinate + getline(in, buffer, '\t'); + if (buffer != ".") { + record.start = strtol(buffer.c_str(), &ignored, 10) - 1; + } + buffer.clear(); + + // parse end coordinate + getline(in, buffer, '\t'); + if (buffer != ".") { + record.end = strtol(buffer.c_str(), &ignored, 10) - 1; + } + buffer.clear(); + + // parse score + getline(in, buffer, '\t'); + if (buffer != ".") { + record.score = strtod(buffer.c_str(), &ignored); + } + buffer.clear(); + + // parse strand + getline(in, buffer, '\t'); + if (buffer != ".") { + record.strand_is_rev = (buffer == "-"); + } + buffer.clear(); + + // parse phase + getline(in, buffer, '\t'); + if (buffer != ".") { + record.phase = stoi(buffer); + } + buffer.clear(); + + // parse annotations (but leave as an unparsed string) + getline(in, buffer, '\n'); + if (buffer != ".") { + record.attributes = move(buffer); + } + + // execute the iteratee + lambda(record); + } + } + +} diff --git a/src/gff_reader.hpp b/src/gff_reader.hpp new file mode 100644 index 00000000000..722084ec4fa --- /dev/null +++ b/src/gff_reader.hpp @@ -0,0 +1,56 @@ +#ifndef VG_GFF_READER_HPP_INCLUDED +#define VG_GFF_READER_HPP_INCLUDED + +#include +#include +#include +#include +#include +#include + +namespace vg { + + using namespace std; + + /** + * A package of the information contained in a GFF3 record. The null "." entries in a + * a GFF are parsed into empty strings or the default values of the numerical fields + * as given below. + */ + struct GFFRecord { + public: + GFFRecord() = default; + ~GFFRecord() = default; + + string sequence_id; + string source; + string type; + // 0-based indexing, unlike the actual GFF standard + int64_t start = -1; + // 0-based, inclusive + int64_t end = -1; + double score = numeric_limits::quiet_NaN(); + bool strand_is_rev = false; + int32_t phase = -1; + string attributes; + + map parse_attributes(); + }; + + /** + * A class that can parse and iterate over a GFF3 file. 
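The new GFF reader walks each line column by column, treats "." as "not given", shifts GFF's 1-based coordinates to the 0-based convention used elsewhere in vg, and leaves the attribute column as a raw string until parse_attributes splits it on ';' and '='. A pared-down, self-contained sketch of those conventions for a single line (a simplified record, not the full GFFRecord):

```
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>

// A pared-down record: just the fields needed to show the conventions.
struct SimpleGFFRecord {
    std::string sequence_id, type, attributes;
    long start = -1;  // 0-based (GFF itself is 1-based)
    long end = -1;    // 0-based, inclusive
    bool strand_is_rev = false;
};

static SimpleGFFRecord parse_gff_line(const std::string& line) {
    std::vector<std::string> cols;
    std::stringstream ss(line);
    std::string col;
    while (std::getline(ss, col, '\t')) {
        cols.push_back(col);
    }
    SimpleGFFRecord rec;
    if (cols.size() < 9) return rec;                        // malformed line: leave defaults
    if (cols[0] != ".") rec.sequence_id = cols[0];
    if (cols[2] != ".") rec.type = cols[2];
    if (cols[3] != ".") rec.start = std::stol(cols[3]) - 1; // 1-based -> 0-based
    if (cols[4] != ".") rec.end = std::stol(cols[4]) - 1;   // stays inclusive
    if (cols[6] != ".") rec.strand_is_rev = (cols[6] == "-");
    if (cols[8] != ".") rec.attributes = cols[8];
    return rec;
}

// Split "ID=gene1;Name=BRCA2" into {"ID": "gene1", "Name": "BRCA2"}.
static std::map<std::string, std::string> parse_attributes(const std::string& attrs) {
    std::map<std::string, std::string> out;
    std::stringstream ss(attrs);
    std::string field;
    while (std::getline(ss, field, ';')) {
        auto eq = field.find('=');
        if (eq != std::string::npos) {
            out[field.substr(0, eq)] = field.substr(eq + 1);
        }
    }
    return out;
}

int main() {
    auto rec = parse_gff_line("chr1\thavana\tgene\t1000\t2000\t.\t-\t.\tID=gene1;Name=BRCA2");
    std::cout << rec.sequence_id << " " << rec.type << " [" << rec.start << ", " << rec.end
              << "] rev=" << rec.strand_is_rev << "\n";       // chr1 gene [999, 1999] rev=1
    for (auto& kv : parse_attributes(rec.attributes)) {
        std::cout << kv.first << " -> " << kv.second << "\n"; // ID -> gene1, Name -> BRCA2
    }
}
```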
+ */ + class GFFReader { + public: + GFFReader(istream& in); + ~GFFReader() = default; + + void for_each_gff_record(function& lambda); + + private: + istream& in; + }; + +} + +#endif diff --git a/src/graph.cpp b/src/graph.cpp index 9e344a44fc8..beca52b5e12 100644 --- a/src/graph.cpp +++ b/src/graph.cpp @@ -88,5 +88,40 @@ void flip_doubly_reversed_edges(Graph& graph) { } } } + +void from_handle_graph(const HandleGraph& from, Graph& to) { + from.for_each_handle([&](const handle_t& h) { + Node* node = to.add_node(); + node->set_id(from.get_id(h)); + node->set_sequence(from.get_sequence(h)); + }); + from.for_each_edge([&](const edge_t& e) { + Edge* edge = to.add_edge(); + edge->set_from(from.get_id(e.first)); + edge->set_from_start(from.get_is_reverse(e.first)); + edge->set_to(from.get_id(e.second)); + edge->set_to_end(from.get_is_reverse(e.second)); + }); +} + +void from_path_handle_graph(const PathHandleGraph& from, Graph& to) { + + from_handle_graph(from, to); + + from.for_each_path_handle([&](const path_handle_t& p) { + Path* path = to.add_path(); + path->set_name(from.get_path_name(p)); + path->set_is_circular(from.get_is_circular(p)); + int64_t rank = 1; + for (handle_t step : from.scan_path(p)) { + Mapping* mapping = path->add_mapping(); + Position* position = mapping->mutable_position(); + position->set_node_id(from.get_id(step)); + position->set_is_reverse(from.get_is_reverse(step)); + mapping->set_rank(rank); + ++rank; + } + }); +} } diff --git a/src/graph.hpp b/src/graph.hpp index 69426a7f8c0..964e46ccebb 100644 --- a/src/graph.hpp +++ b/src/graph.hpp @@ -1,8 +1,9 @@ #ifndef VG_GRAPH_HPP_INCLUDED #define VG_GRAPH_HPP_INCLUDED -#include "vg.pb.h" +#include #include "types.hpp" +#include "handle.hpp" #include #include @@ -43,6 +44,12 @@ bool has_inversion(const Graph& graph); /// clean up doubly-reversed edges void flip_doubly_reversed_edges(Graph& graph); +// transfer data from a HandleGraph into an empty Graph +void from_handle_graph(const HandleGraph& from, Graph& to); + +// transfer data from a PathHandleGraph into an empty Graph +void from_path_handle_graph(const PathHandleGraph& from, Graph& to); + } #endif diff --git a/src/graph_caller.cpp b/src/graph_caller.cpp new file mode 100644 index 00000000000..c9a518718e0 --- /dev/null +++ b/src/graph_caller.cpp @@ -0,0 +1,2463 @@ +#include "graph_caller.hpp" +#include "algorithms/expand_context.hpp" +#include "annotation.hpp" + +//#define debug + +namespace vg { + +GraphCaller::GraphCaller(SnarlCaller& snarl_caller, + SnarlManager& snarl_manager) : + snarl_caller(snarl_caller), snarl_manager(snarl_manager) { +} + +GraphCaller::~GraphCaller() { +} + +void GraphCaller::call_top_level_snarls(const HandleGraph& graph, RecurseType recurse_type) { + + // Used to recurse on children of parents that can't be called + size_t thread_count = get_thread_count(); + vector> snarl_queue(thread_count); + + // Run the snarl caller on a snarl, and queue up the children if it fails + auto process_snarl = [&](const Snarl* snarl) { + + if (!snarl_manager.is_trivial(snarl, graph)) { + +#ifdef debug + cerr << "GraphCaller running call_snarl on " << pb2json(*snarl) << endl; +#endif + + bool was_called = call_snarl(*snarl); + if (recurse_type == RecurseAlways || (!was_called && recurse_type == RecurseOnFail)) { + const vector& children = snarl_manager.children_of(snarl); + vector& thread_queue = snarl_queue[omp_get_thread_num()]; + thread_queue.insert(thread_queue.end(), children.begin(), children.end()); + } + } + }; + + // Start with the top level snarls + 
snarl_manager.for_each_top_level_snarl_parallel(process_snarl); + + // Then recurse on any children the snarl caller failed to handle + while (!std::all_of(snarl_queue.begin(), snarl_queue.end(), + [](const vector& snarl_vec) {return snarl_vec.empty();})) { + vector cur_queue; + for (vector& thread_queue : snarl_queue) { + cur_queue.reserve(cur_queue.size() + thread_queue.size()); + std::move(thread_queue.begin(), thread_queue.end(), std::back_inserter(cur_queue)); + thread_queue.clear(); + } + +#pragma omp parallel for schedule(dynamic, 1) + for (int i = 0; i < cur_queue.size(); ++i) { + process_snarl(cur_queue[i]); + } + } + +} + +static void flip_snarl(Snarl& snarl) { + Visit v = snarl.start(); + *snarl.mutable_start() = reverse(snarl.end()); + *snarl.mutable_end() = reverse(v); +} + +void GraphCaller::call_top_level_chains(const HandleGraph& graph, size_t max_edges, size_t max_trivial, RecurseType recurse_type) { + // Used to recurse on children of parents that can't be called + size_t thread_count = get_thread_count(); + vector> chain_queue(thread_count); + + // Run the snarl caller on a chain. queue up the children if it fails + auto process_chain = [&](const Chain* chain) { + +#ifdef debug + cerr << "calling top level chain "; + for (const auto& i : *chain) { + cerr << pb2json(*i.first) << "," << i.second << ","; + } + cerr << endl; +#endif + // Break up the chain + vector chain_pieces = break_chain(graph, *chain, max_edges, max_trivial); + + for (Chain& chain_piece : chain_pieces) { + // Make a fake snarl spanning the chain + // It is important to remember that along with not actually being a snarl, + // it's not managed by the snarl manager so functions looking into its nesting + // structure will not work + Snarl fake_snarl; + *fake_snarl.mutable_start() = chain_piece.front().second == true ? reverse(chain_piece.front().first->end()) : + chain_piece.front().first->start(); + *fake_snarl.mutable_end() = chain_piece.back().second == true ? 
reverse(chain_piece.back().first->start()) : + chain_piece.back().first->end(); + +#ifdef debug + cerr << "calling fake snarl " << pb2json(fake_snarl) << endl; +#endif + + bool was_called = call_snarl(fake_snarl); + if (recurse_type == RecurseAlways || (!was_called && recurse_type == RecurseOnFail)) { + vector& thread_queue = chain_queue[omp_get_thread_num()]; + for (pair chain_link : chain_piece) { + const deque& child_chains = snarl_manager.chains_of(chain_link.first); + thread_queue.insert(thread_queue.end(), child_chains.begin(), child_chains.end()); + } + } + } + }; + + // Start with the top level snarls + snarl_manager.for_each_top_level_chain_parallel(process_chain); + + // Then recurse on any children the snarl caller failed to handle + while (!std::all_of(chain_queue.begin(), chain_queue.end(), + [](const vector& chain_vec) {return chain_vec.empty();})) { + vector cur_queue; + for (vector& thread_queue : chain_queue) { + cur_queue.reserve(cur_queue.size() + thread_queue.size()); + std::move(thread_queue.begin(), thread_queue.end(), std::back_inserter(cur_queue)); + thread_queue.clear(); + } + +#pragma omp parallel for schedule(dynamic, 1) + for (int i = 0; i < cur_queue.size(); ++i) { + process_chain(&cur_queue[i]); + } + + } +} + +vector GraphCaller::break_chain(const HandleGraph& graph, const Chain& chain, size_t max_edges, size_t max_trivial) { + + vector chain_frags; + + // keep track of the current fragment and add it to chain_frags as soon as it gets too big + Chain frag; + size_t frag_edge_count = 0; + size_t frag_triv_count = 0; + + for (const pair& link : chain) { + // todo: we're getting the contents here as well as within the caller. + auto contents = snarl_manager.deep_contents(link.first, graph, false); + + // todo: use annotation from snarl itself? 
+ bool trivial = contents.second.empty(); + + if ((trivial && frag_triv_count > max_trivial) || + (contents.second.size() + frag_edge_count > max_edges)) { + // adding anything more to the chain would make it too long, so we + // add it to the output and clear the current fragment + if (!frag.empty() && frag_triv_count < frag.size()) { + chain_frags.push_back(frag); + } + frag.clear(); + frag_edge_count = 0; + frag_triv_count = 0; + } + + if (!trivial || (frag_triv_count < max_trivial)) { + // we start a new fragment or add to an existing fragment + frag.push_back(link); + frag_edge_count += contents.second.size(); + if (trivial) { + ++frag_triv_count; + } + } + } + + // and the last one + if (!frag.empty()) { + chain_frags.push_back(frag); + } + + return chain_frags; +} + +VCFOutputCaller::VCFOutputCaller(const string& sample_name) : sample_name(sample_name), translation(nullptr), include_nested(false) +{ + output_variants.resize(get_thread_count()); +} + +VCFOutputCaller::~VCFOutputCaller() { +} + +string VCFOutputCaller::vcf_header(const PathHandleGraph& graph, const vector& contigs, + const vector& contig_length_overrides) const { + stringstream ss; + ss << "##fileformat=VCFv4.2" << endl; + for (int i = 0; i < contigs.size(); ++i) { + const string& contig = contigs[i]; + size_t length; + if (i < contig_length_overrides.size()) { + // length override provided + length = contig_length_overrides[i]; + } else { + length = 0; + for (handle_t handle : graph.scan_path(graph.get_path_handle(contig))) { + length += graph.get_length(handle); + } + } + ss << "##contig=" << endl; + } + if (include_nested) { + ss << "##INFO=" << endl; + ss << "##INFO=" << endl; + } + ss << "##INFO=" << endl; + return ss.str(); +} + +void VCFOutputCaller::add_variant(vcflib::Variant& var) const { + var.setVariantCallFile(output_vcf); + stringstream ss; + ss << var; + string dest; + zstdutil::CompressString(ss.str(), dest); + // the Variant object is too big to keep in memory when there are many genotypes, so we + // store it in a zstd-compressed string + output_variants[omp_get_thread_num()].push_back(make_pair(make_pair(var.sequenceName, var.position), dest)); +} + +void VCFOutputCaller::write_variants(ostream& out_stream, const SnarlManager* snarl_manager) { + assert(include_nested == false || snarl_manager != nullptr); + if (include_nested) { + update_nesting_info_tags(snarl_manager); + } + vector, string>> all_variants; + for (const auto& buf : output_variants) { + all_variants.reserve(all_variants.size() + buf.size()); + std::move(buf.begin(), buf.end(), std::back_inserter(all_variants)); + } + std::sort(all_variants.begin(), all_variants.end(), [](const pair, string>& v1, + const pair, string>& v2) { + return v1.first.first < v2.first.first || (v1.first.first == v2.first.first && v1.first.second < v2.first.second); + }); + for (auto v : all_variants) { + string dest; + zstdutil::DecompressString(v.second, dest); + out_stream << dest << endl; + } +} + +static int countAlts(vcflib::Variant& var, int alleleIndex) { + int alts = 0; + for (map > >::iterator s = var.samples.begin(); s != var.samples.end(); ++s) { + map >& sample = s->second; + map >::iterator gt = sample.find("GT"); + if (gt != sample.end()) { + map genotype = vcflib::decomposeGenotype(gt->second.front()); + for (map::iterator g = genotype.begin(); g != genotype.end(); ++g) { + if (g->first == alleleIndex) { + alts += g->second; + } + } + } + } + return alts; +} + +static int countAlleles(vcflib::Variant& var) { + int alleles = 0; + for (map > 
>::iterator s = var.samples.begin(); s != var.samples.end(); ++s) { + map >& sample = s->second; + map >::iterator gt = sample.find("GT"); + if (gt != sample.end()) { + map genotype = vcflib::decomposeGenotype(gt->second.front()); + for (map::iterator g = genotype.begin(); g != genotype.end(); ++g) { + if (g->first != vcflib::NULL_ALLELE) { + alleles += g->second; + } + } + } + } + return alleles; +} + +// this isn't from vcflib, but seems to make more sense than just returning the number of samples in the file again and again +static int countSamplesWithData(vcflib::Variant& var) { + int samples_with_data = 0; + for (map > >::iterator s = var.samples.begin(); s != var.samples.end(); ++s) { + map >& sample = s->second; + map >::iterator gt = sample.find("GT"); + bool has_data = false; + if (gt != sample.end()) { + map genotype = vcflib::decomposeGenotype(gt->second.front()); + for (map::iterator g = genotype.begin(); g != genotype.end(); ++g) { + if (g->first != vcflib::NULL_ALLELE) { + has_data = true; + break; + } + } + } + if (has_data) { + ++samples_with_data; + } + } + return samples_with_data; +} + +void VCFOutputCaller::vcf_fixup(vcflib::Variant& var) const { + // copied from https://github.com/vgteam/vcflib/blob/master/src/vcffixup.cpp + + stringstream ns; + ns << countSamplesWithData(var); + var.info["NS"].clear(); + var.info["NS"].push_back(ns.str()); + + var.info["AC"].clear(); + var.info["AF"].clear(); + var.info["AN"].clear(); + + int allelecount = countAlleles(var); + stringstream an; + an << allelecount; + var.info["AN"].push_back(an.str()); + + for (vector::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { + string& allele = *a; + int altcount = countAlts(var, var.getAltAlleleIndex(allele) + 1); + stringstream ac; + ac << altcount; + var.info["AC"].push_back(ac.str()); + stringstream af; + double faf = (double) altcount / (double) allelecount; + if(faf != faf) faf = 0; + af << faf; + var.info["AF"].push_back(af.str()); + } +} + +void VCFOutputCaller::set_translation(const unordered_map>* translation) { + this->translation = translation; +} + +void VCFOutputCaller::set_nested(bool nested) { + include_nested = nested; +} + +void VCFOutputCaller::add_allele_path_to_info(vcflib::Variant& v, int allele, const SnarlTraversal& trav, + bool reversed, bool one_based) const { + auto& trav_info = v.info["AT"]; + assert(allele < trav_info.size()); + + vector nodes; + nodes.reserve(trav.visit_size()); + const Visit* prev_visit = nullptr; + unordered_map>::const_iterator prev_trans; + + for (size_t i = 0; i < trav.visit_size(); ++i) { + size_t j = !reversed ? i : trav.visit_size() - 1 - i; + const Visit& visit = trav.visit(j); + nid_t node_id = visit.node_id(); + string node_name = std::to_string(node_id); + bool skip = false; + // todo: check one_based? (we kind of ignore that when writing the snarl name, so maybe not pertienent) + if (translation) { + auto i = translation->find(node_id); + if (i == translation->end()) { + throw runtime_error("Error [vg deconstruct]: Unable to find node " + node_name + " in translation file"); + } + if (prev_visit) { + nid_t prev_node_id = prev_visit->node_id(); + if (prev_trans->second.first == i->second.first && node_id != prev_node_id) { + // here is a case where we have two consecutive nodes that map back to + // the same source node. + // todo: sanity check! 
(could verify if translation node properly covered) + skip = true; + } + } + node_name = i->second.first; + prev_trans = i; + } + + if (!skip) { + bool vrev = visit.backward() != reversed; + trav_info[allele] += (vrev ? "<" : ">"); + trav_info[allele] += node_name; + } + prev_visit = &visit; + } +} + +string VCFOutputCaller::trav_string(const HandleGraph& graph, const SnarlTraversal& trav) const { + string seq; + for (int i = 0; i < trav.visit_size(); ++i) { + const Visit& visit = trav.visit(i); + if (visit.node_id() > 0) { + seq += graph.get_sequence(graph.get_handle(visit.node_id(), visit.backward())); + } else { + seq += print_snarl(visit.snarl(), true); + } + } + return seq; +} + +void VCFOutputCaller::emit_variant(const PathPositionHandleGraph& graph, SnarlCaller& snarl_caller, + const Snarl& snarl, const vector& called_traversals, + const vector& genotype, int ref_trav_idx, const unique_ptr& call_info, + const string& ref_path_name, int ref_offset, bool genotype_snarls, int ploidy, + function&, const vector&, int, int, int)> trav_to_string) { + +#ifdef debug + cerr << "emitting variant for " << pb2json(snarl) << endl; + for (int i = 0; i < called_traversals.size(); ++i) { + if (i == ref_trav_idx) { + cerr << "*"; + } + cerr << "ct[" << i << "]=" << pb2json(called_traversals[i]) << endl; + } + for (int i = 0; i < genotype.size(); ++i) { + cerr << "gt[" << i << "]=" << genotype[i] << endl; + } +#endif + + if (trav_to_string == nullptr) { + trav_to_string = [&](const vector& travs, const vector& travs_genotype, int trav_allele, int genotype_allele, int ref_trav_idx) { + return trav_string(graph, travs[trav_allele]); + }; + } + + vcflib::Variant out_variant; + + vector site_traversals = {called_traversals[ref_trav_idx]}; + vector site_genotype; + auto ref_gt_it = std::find(genotype.begin(), genotype.end(), ref_trav_idx); + out_variant.ref = trav_to_string(called_traversals, genotype, ref_trav_idx, + ref_gt_it != genotype.end() ? ref_gt_it - genotype.begin() : 0, + ref_trav_idx); + + // deduplicate alleles and compute the site traversals and genotype + map allele_to_gt; + allele_to_gt[out_variant.ref] = 0; + for (int i = 0; i < genotype.size(); ++i) { + if (genotype[i] == ref_trav_idx) { + site_genotype.push_back(0); + } else { + string allele_string = trav_to_string(called_traversals, genotype, genotype[i], i, ref_trav_idx); + if (allele_to_gt.count(allele_string)) { + site_genotype.push_back(allele_to_gt[allele_string]); + } else { + site_traversals.push_back(called_traversals[genotype[i]]); + site_genotype.push_back(allele_to_gt.size()); + allele_to_gt[allele_string] = site_genotype.back(); + } + } + } + + // add on fixed number of uncalled traversals if we're making a ref-call + // with genotype_snarls set to true + if (genotype_snarls && site_traversals.size() <= 1) { + // note: we're adding all the strings here and sorting to make this deterministic + // at the cost of speed + map allele_map; + for (int i = 0; i < called_traversals.size(); ++i) { + // todo: verify index below. 
it's for uncalled traversals so not important tho + string allele_string = trav_to_string(called_traversals, genotype, i, max(0, (int)genotype.size() - 1), ref_trav_idx); + if (!allele_map.count(allele_string)) { + allele_map[allele_string] = &called_traversals[i]; + } + } + // pick out the first "max_uncalled_alleles" traversals to add + int i = 0; + for (auto ai = allele_map.begin(); i < max_uncalled_alleles && ai != allele_map.end(); ++i, ++ai) { + if (!allele_to_gt.count(ai->first)) { + allele_to_gt[ai->first] = allele_to_gt.size(); + site_traversals.push_back(*ai->second); + } + } + } + + out_variant.alt.resize(allele_to_gt.size() - 1); + out_variant.alleles.resize(allele_to_gt.size()); + + // init the traversal info + out_variant.info["AT"].resize(allele_to_gt.size()); + + for (auto& allele_gt : allele_to_gt) { +#ifdef debug + cerr << "allele " << allele_gt.first << " -> gt " << allele_gt.second << endl; +#endif + if (allele_gt.second > 0) { + out_variant.alt[allele_gt.second - 1] = allele_gt.first; + } + out_variant.alleles[allele_gt.second] = allele_gt.first; + + // update the traversal info + add_allele_path_to_info(out_variant, allele_gt.second, site_traversals.at(allele_gt.second), false, false); + } + + // resolve subpath naming + subrange_t subrange; + string basepath_name = Paths::strip_subrange(ref_path_name, &subrange); + size_t basepath_offset = subrange == PathMetadata::NO_SUBRANGE ? 0 : subrange.first; + // in VCF we usually just want a contig + string contig_name = PathMetadata::parse_locus_name(basepath_name); + if (contig_name != PathMetadata::NO_LOCUS_NAME) { + basepath_name = contig_name; + } + // fill out the rest of the variant + out_variant.sequenceName = basepath_name; + // +1 to convert to 1-based VCF + out_variant.position = get<0>(get_ref_interval(graph, snarl, ref_path_name)) + ref_offset + 1 + basepath_offset; + out_variant.id = print_snarl(snarl, false); + out_variant.filter = "PASS"; + out_variant.updateAlleleIndexes(); + + // add the genotype + out_variant.format.push_back("GT"); + auto& genotype_vector = out_variant.samples[sample_name]["GT"]; + + stringstream vcf_gt; + if (!genotype.empty()) { + for (int i = 0; i < site_genotype.size(); ++i) { + vcf_gt << site_genotype[i]; + if (i != site_genotype.size() - 1) { + vcf_gt << "/"; + } + } + } else { + for (int i = 0; i < ploidy; ++i) { + vcf_gt << "."; + if (i != ploidy - 1) { + vcf_gt << "/"; + } + } + } + + genotype_vector.push_back(vcf_gt.str()); + + // add some support info + snarl_caller.update_vcf_info(snarl, site_traversals, site_genotype, call_info, sample_name, out_variant); + + // if genotype_snarls, then we only flatten up to the snarl endpoints + // (this is when we are in genotyping mode and want consistent calls regardless of the sample) + int64_t flatten_len_s = 0; + int64_t flatten_len_e = 0; + if (genotype_snarls) { + flatten_len_s = graph.get_length(graph.get_handle(snarl.start().node_id())); + assert(flatten_len_s >= 0); + flatten_len_e = graph.get_length(graph.get_handle(snarl.end().node_id())); + } + // clean up the alleles to not have so man common prefixes + flatten_common_allele_ends(out_variant, true, flatten_len_e); + flatten_common_allele_ends(out_variant, false, flatten_len_s); +#ifdef debug + for (int i = 0; i < site_traversals.size(); ++i) { + cerr << " site trav[" << i << "]=" << pb2json(site_traversals[i]) << endl; + } + for (int i = 0; i < site_genotype.size(); ++i) { + cerr << " site geno[" << i << "]=" << site_genotype[i] << endl; + } +#endif + + if (genotype_snarls || 
!out_variant.alt.empty()) {
+ add_variant(out_variant);
+ }
+}
+
+tuple VCFOutputCaller::get_ref_interval(
+ const PathPositionHandleGraph& graph, const Snarl& snarl, const string& ref_path_name) const {
+ path_handle_t path_handle = graph.get_path_handle(ref_path_name);
+
+ handle_t start_handle = graph.get_handle(snarl.start().node_id(), snarl.start().backward());
+ map start_steps;
+ graph.for_each_step_on_handle(start_handle, [&](step_handle_t step) {
+ if (graph.get_path_handle_of_step(step) == path_handle) {
+ start_steps[graph.get_position_of_step(step)] = step;
+ }
+ });
+
+ handle_t end_handle = graph.get_handle(snarl.end().node_id(), snarl.end().backward());
+ map end_steps;
+ graph.for_each_step_on_handle(end_handle, [&](step_handle_t step) {
+ if (graph.get_path_handle_of_step(step) == path_handle) {
+ end_steps[graph.get_position_of_step(step)] = step;
+ }
+ });
+
+ assert(start_steps.size() > 0 && end_steps.size() > 0);
+ step_handle_t start_step = start_steps.begin()->second;
+ step_handle_t end_step = end_steps.begin()->second;
+ // just because we found a pair of steps on our path that correspond to the snarl ends, doesn't
+ // mean the path threads the snarl. verify that we can actually walk, either forwards or backwards,
+ // along the path from the start node and hit the end node in the right orientation.
+ bool start_rev = graph.get_is_reverse(graph.get_handle_of_step(start_step)) != snarl.start().backward();
+ bool end_rev = graph.get_is_reverse(graph.get_handle_of_step(end_step)) != snarl.end().backward();
+ bool found_end = start_rev == end_rev && start_rev == start_steps.begin()->first > end_steps.begin()->first;
+
+ // if we're on a cycle, we keep our start step and find the end step by scanning the path
+ if (start_steps.size() > 1 || end_steps.size() > 1) {
+ found_end = false;
+ // try each start step
+ for (auto i = start_steps.begin(); i != start_steps.end() && !found_end; ++i) {
+ start_step = i->second;
+ bool scan_backward = graph.get_is_reverse(graph.get_handle_of_step(start_step)) != snarl.start().backward();
+ if (scan_backward) {
+ // if we're going backward, we expect to reach the end backward
+ end_handle = graph.get_handle(snarl.end().node_id(), !snarl.end().backward());
+ }
+ if (scan_backward) {
+ for (step_handle_t cur_step = start_step; graph.has_previous_step(cur_step) && !found_end;
+ cur_step = graph.get_previous_step(cur_step)) {
+ if (graph.get_handle_of_step(cur_step) == end_handle) {
+ end_step = cur_step;
+ found_end = true;
+ }
+ }
+ } else {
+ for (step_handle_t cur_step = start_step; graph.has_next_step(cur_step) && !found_end;
+ cur_step = graph.get_next_step(cur_step)) {
+ if (graph.get_handle_of_step(cur_step) == end_handle) {
+ end_step = cur_step;
+ found_end = true;
+ }
+ }
+ }
+ }
+ }
+ int64_t start_position = start_steps.begin()->first;
+ step_handle_t out_start_step = start_step;
+ int64_t end_position = end_step == end_steps.begin()->second ? end_steps.begin()->first : graph.get_position_of_step(end_step);
+ step_handle_t out_end_step = end_step == end_steps.begin()->second ? end_steps.begin()->second : end_step;
+ bool backward = end_position < start_position;
+
+
+ if (!found_end) {
+ // oops, one of the above checks failed. we tell the caller we couldn't find it by hacking in a -1 coordinate.
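+ // (callers such as FlowCaller::call_snarl and NestedFlowCaller::call_snarl_recursive detect this
+ // sentinel by checking get<0>(ref_interval) == -1 before trying to walk the reference traversal)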
+ start_position = -1; + end_position = -1; + } + + if (backward) { + return make_tuple(end_position, start_position, backward, out_end_step, out_start_step); + } else { + return make_tuple(start_position, end_position, backward, out_start_step, out_end_step); + } +} + +pair VCFOutputCaller::get_ref_position(const PathPositionHandleGraph& graph, const Snarl& snarl, const string& ref_path_name, + int64_t ref_path_offset) const { + + subrange_t subrange; + string basepath_name = Paths::strip_subrange(ref_path_name, &subrange); + size_t basepath_offset = subrange == PathMetadata::NO_SUBRANGE ? 0 : subrange.first; + // +1 to convert to 1-based VCF + int64_t position = get<0>(get_ref_interval(graph, snarl, ref_path_name)) + ref_path_offset + 1 + basepath_offset; + return make_pair(basepath_name, position); +} + +void VCFOutputCaller::flatten_common_allele_ends(vcflib::Variant& variant, bool backward, size_t len_override) const { + if (variant.alt.size() == 0) { + return; + } + + // find the minimum allele length to make sure we don't delete an entire allele + size_t min_allele_len = variant.alleles[0].length(); + for (int i = 1; i < variant.alleles.size(); ++i) { + min_allele_len = std::min(min_allele_len, variant.alleles[i].length()); + } + + // the maximum number of bases we want ot zip up, applying override if provided + size_t max_flatten_len = len_override > 0 ? len_override : min_allele_len; + + // want to leave at least one in the reference position + if (max_flatten_len == min_allele_len) { + --max_flatten_len; + } + + bool match = true; + int shared_prefix_len = 0; + for (int i = 0; i < max_flatten_len && match; ++i) { + char c1 = std::toupper(variant.alleles[0][!backward ? i : variant.alleles[0].length() - 1 - i]); + for (int j = 1; j < variant.alleles.size() && match; ++j) { + char c2 = std::toupper(variant.alleles[j][!backward ? i : variant.alleles[j].length() - 1 - i]); + match = c1 == c2; + } + if (match) { + ++shared_prefix_len; + } + } + + if (!backward) { + variant.position += shared_prefix_len; + } + for (int i = 0; i < variant.alleles.size(); ++i) { + if (!backward) { + variant.alleles[i] = variant.alleles[i].substr(shared_prefix_len); + } else { + variant.alleles[i] = variant.alleles[i].substr(0, variant.alleles[i].length() - shared_prefix_len); + } + if (i == 0) { + variant.ref = variant.alleles[i]; + } else { + variant.alt[i - 1] = variant.alleles[i]; + } + } +} + +string VCFOutputCaller::print_snarl(const Snarl& snarl, bool in_brackets) const { + // todo, should we canonicalize here by putting lexicographic lowest node first? + nid_t start_node_id = snarl.start().node_id(); + nid_t end_node_id = snarl.end().node_id(); + string start_node = std::to_string(start_node_id); + string end_node = std::to_string(end_node_id); + if (translation) { + auto i = translation->find(start_node_id); + if (i == translation->end()) { + throw runtime_error("Error [VCFOutputCaller]: Unable to find node " + start_node + " in translation file"); + } + start_node = i->second.first; + i = translation->find(end_node_id); + if (i == translation->end()) { + throw runtime_error("Error [VCFOutputCaller]: Unable to find node " + end_node + " in translation file"); + } + end_node = i->second.first; + } + stringstream ss; + if (in_brackets) { + ss << "("; + } + ss << (snarl.start().backward() ? "<" : ">") << start_node << (snarl.end().backward() ? 
"<" : ">") << end_node; + if (in_brackets) { + ss << ")"; + } + return ss.str(); +} + +void VCFOutputCaller::scan_snarl(const string& allele_string, function callback) const { + int left = -1; + int last = 0; + Snarl snarl; + string frag; + for (int i = 0; i < allele_string.length(); ++i) { + if (allele_string[i] == '(') { + assert(left == -1); + if (last < i) { + frag = allele_string.substr(last, i-last); + callback(frag, snarl); + } + left = i; + } else if (allele_string[i] == ')') { + assert(left >= 0 && i > left + 3); + frag = allele_string.substr(left + 1, i - left - 1); + auto toks = split_delims(frag, "><"); + assert(toks.size() == 2); + assert(frag[0] == '<' || frag[0] == '>'); + int64_t start = std::stoi(toks[0]); + snarl.mutable_start()->set_node_id(start); + snarl.mutable_start()->set_backward(frag[0] == '<'); + assert(frag[toks[0].size() + 1] == '<' || frag[toks[0].size() + 1] == '<'); + int64_t end = std::stoi(toks[1]); + snarl.mutable_end()->set_node_id(abs(end)); + snarl.mutable_end()->set_backward(frag[toks[0].size() + 1] == '<'); + callback("", snarl); + left = -1; + last = i + 1; + } + } + if (last == 0) { + callback(allele_string, snarl); + } else { + frag = allele_string.substr(last); + callback(frag, snarl); + } +} + +GAFOutputCaller::GAFOutputCaller(AlignmentEmitter* emitter, const string& sample_name, const vector& ref_paths, + size_t trav_padding) : + emitter(emitter), + gaf_sample_name(sample_name), + ref_paths(ref_paths.begin(), ref_paths.end()), + trav_padding(trav_padding) { + +} + +GAFOutputCaller::~GAFOutputCaller() { +} + +void GAFOutputCaller::emit_gaf_traversals(const PathHandleGraph& graph, const string& snarl_name, + const vector& travs, + int64_t ref_trav_idx, + const string& ref_path_name, int64_t ref_path_position, + const TraversalSupportFinder* support_finder) { + assert(emitter != nullptr); + vector aln_batch; + aln_batch.reserve(travs.size()); + + stringstream ss; + if (!ref_path_name.empty()) { + ss << ref_path_name << "#" << ref_path_position << "#"; + } + ss << snarl_name << "#" << gaf_sample_name; + string variant_id = ss.str(); + + // create allele ordering where reference is 0 + vector alleles; + if (ref_trav_idx >= 0) { + alleles.push_back(ref_trav_idx); + } + for (int i = 0; i < travs.size(); ++i) { + if (i != ref_trav_idx) { + alleles.push_back(i); + } + } + // make an alignment for each traversal + for (int i = 0; i < alleles.size(); ++i) { + const SnarlTraversal& trav = travs[alleles[i]]; + Alignment trav_aln; + if (trav_padding > 0) { + trav_aln = to_alignment(pad_traversal(graph, trav), graph); + } else { + trav_aln = to_alignment(trav, graph); + } + trav_aln.set_name(variant_id + "#" + std::to_string(i)); + if (support_finder) { + int64_t support = support_finder->support_val(support_finder->get_traversal_support(trav)); + set_annotation(trav_aln, "support", std::to_string(support)); + } + aln_batch.push_back(trav_aln); + } + emitter->emit_singles(std::move(aln_batch)); +} + +void GAFOutputCaller::emit_gaf_variant(const PathHandleGraph& graph, const string& snarl_name, + const vector& travs, + const vector& genotype, + int64_t ref_trav_idx, + const string& ref_path_name, int64_t ref_path_position, + const TraversalSupportFinder* support_finder) { + assert(emitter != nullptr); + + // pretty bare bones for now, just output the genotype as a pair of traversals + // todo: we could embed some basic information (likelihood, ploidy, sample etc) in the gaf + vector gt_travs; + for (int allele : genotype) { + gt_travs.push_back(travs[allele]); 
+ } + emit_gaf_traversals(graph, snarl_name, gt_travs, ref_trav_idx, ref_path_name, ref_path_position, support_finder); +} + +SnarlTraversal GAFOutputCaller::pad_traversal(const PathHandleGraph& graph, const SnarlTraversal& trav) const { + + assert(trav.visit_size() >= 2); + + SnarlTraversal out_trav; + + // traversal endpoints + handle_t start_handle = graph.get_handle(trav.visit(0).node_id(), trav.visit(0).backward()); + handle_t end_handle = graph.get_handle(trav.visit(trav.visit_size() - 1).node_id(), trav.visit(trav.visit_size() - 1).backward()); + + // find a reference path that touches the start node + // todo: we could be more clever by finding the longest one or something + path_handle_t reference_path; + step_handle_t reference_step; + bool found = false; + size_t padding = 0; + graph.for_each_step_on_handle(start_handle, [&](step_handle_t step_handle) { + reference_path = graph.get_path_handle_of_step(step_handle); + string name = graph.get_path_name(reference_path); + if (!Paths::is_alt(name) && (ref_paths.empty() || ref_paths.count(name))) { + reference_step = step_handle; + found = true; + } + return !found; + }); + + // add left padding + if (found) { + deque left_padding; + + if (graph.get_is_reverse(start_handle) == graph.get_is_reverse(graph.get_handle_of_step(reference_step))) { + // path and handle oriented the same, we can just backtrack along the path to get previous stuff + for (step_handle_t step = graph.get_previous_step(reference_step); + step != graph.path_front_end(reference_path) && padding < trav_padding; + step = graph.get_previous_step(step)) { + left_padding.push_front(to_visit(graph, graph.get_handle_of_step(step))); + padding += graph.get_length(graph.get_handle_of_step(step)); + } + } else { + // path and handle oriented differently, we go forward in the path, flipping each step + for (step_handle_t step = graph.get_next_step(reference_step); + step != graph.path_end(reference_path) && padding < trav_padding; + step = graph.get_next_step(step)) { + left_padding.push_front(to_visit(graph, graph.get_handle_of_step(step))); + padding += graph.get_length(graph.get_handle_of_step(step)); + } + } + + for (const Visit& visit : left_padding) { + *out_trav.add_visit() = visit; + } + } + + // copy over center + for (int i = 0; i < trav.visit_size(); ++i) { + *out_trav.add_visit() = trav.visit(i); + } + + // go through the whole thing again with the end + found = false; + padding = 0; + graph.for_each_step_on_handle(end_handle, [&](step_handle_t step_handle) { + reference_path = graph.get_path_handle_of_step(step_handle); + string name = graph.get_path_name(reference_path); + if (!Paths::is_alt(name) && (ref_paths.empty() || ref_paths.count(name))) { + reference_step = step_handle; + found = true; + } + return !found; + }); + + // add right padding + if (found) { + if (graph.get_is_reverse(end_handle) == graph.get_is_reverse(graph.get_handle_of_step(reference_step))) { + // path and handle oriented the same, we can just continue along the path to get next stuff + for (step_handle_t step = graph.get_next_step(reference_step); + step != graph.path_end(reference_path) && padding < trav_padding; + step = graph.get_next_step(step)) { + Visit* visit = out_trav.add_visit(); + *visit = to_visit(graph, graph.get_handle_of_step(step)); + padding += graph.get_length(graph.get_handle_of_step(step)); + } + } else { + // path and handle oriented differently, we go backward in the path, flipping each step + for (step_handle_t step = graph.get_previous_step(reference_step); + step 
!= graph.path_front_end(reference_path) && padding < trav_padding; + step = graph.get_previous_step(step)) { + Visit* visit = out_trav.add_visit(); + *visit = to_visit(graph, graph.flip(graph.get_handle_of_step(step))); + padding += graph.get_length(graph.get_handle_of_step(step)); + } + } + } + + return out_trav; +} + +void VCFOutputCaller::update_nesting_info_tags(const SnarlManager* snarl_manager) { + + // index the snarl tree by name + unordered_map name_to_snarl; + Snarl flipped_snarl; + snarl_manager->for_each_snarl_preorder([&](const Snarl* snarl) { + name_to_snarl[print_snarl(*snarl)] = snarl; + // also add a map from the flipped snarl (as call sometimes messes with orientation) + flipped_snarl.mutable_start()->set_node_id(snarl->end().node_id()); + flipped_snarl.mutable_start()->set_backward(!snarl->end().backward()); + flipped_snarl.mutable_end()->set_node_id(snarl->start().node_id()); + flipped_snarl.mutable_end()->set_backward(!snarl->start().backward()); + name_to_snarl[print_snarl(flipped_snarl)] = snarl; + }); + + // pass 1) index sites in vcf + // (todo: this could be done more quickly upstream) + unordered_set names_in_vcf; + for (auto& thread_buf : output_variants) { + for (auto& output_variant_record : thread_buf) { + string output_variant_string; + zstdutil::DecompressString(output_variant_record.second, output_variant_string); + vector toks = split_delims(output_variant_string, "\t", 4); + names_in_vcf.insert(toks[2]); + } + } + + // determine the tags from the index + function(const string&)> get_lv_ps_tags = [&](const string& name) { + string parent_name; + size_t ancestor_count = 0; + const Snarl* snarl = name_to_snarl.at(name); + assert(snarl != nullptr); + // walk up the snarl tree + while (snarl = snarl_manager->parent_of(snarl)) { + string cur_name = print_snarl(*snarl); + if (names_in_vcf.count(cur_name)) { + // only count snarls that are in the vcf + ++ancestor_count; + if (parent_name.empty()) { + // remembert the first parent + parent_name = cur_name; + } + } + } + return make_pair(ancestor_count, parent_name); + }; + + // pass 2) add the LV and PS tags +#pragma omp parallel for + for (uint64_t i = 0; i < output_variants.size(); ++i) { + auto& thread_buf = output_variants[i]; + for (auto& output_variant_record : thread_buf) { + string output_variant_string; + zstdutil::DecompressString(output_variant_record.second, output_variant_string); + //string& output_variant_string = output_variant_record.second; + vector toks = split_delims(output_variant_string, "\t", 9); + const string& name = toks[2]; + + pair lv_ps = get_lv_ps_tags(name); + string nesting_tags = ";LV=" + std::to_string(lv_ps.first); + if (lv_ps.first != 0) { + assert(!lv_ps.second.empty()); + nesting_tags += ";PS=" + lv_ps.second; + } + + // rewrite the output string using the updated info toks + output_variant_string.clear(); + for (size_t i = 0; i < toks.size(); ++i) { + output_variant_string += toks[i]; + if (i == 7) { + output_variant_string += nesting_tags; + } + if (i != toks.size() - 1) { + output_variant_string += "\t"; + } + } + output_variant_record.second.clear(); + zstdutil::CompressString(output_variant_string, output_variant_record.second); + } + } +} + +VCFGenotyper::VCFGenotyper(const PathHandleGraph& graph, + SnarlCaller& snarl_caller, + SnarlManager& snarl_manager, + vcflib::VariantCallFile& variant_file, + const string& sample_name, + const vector& ref_paths, + const vector& ref_path_ploidies, + FastaReference* ref_fasta, + FastaReference* ins_fasta, + AlignmentEmitter* 
aln_emitter, + bool traversals_only, + bool gaf_output, + size_t trav_padding) : + GraphCaller(snarl_caller, snarl_manager), + VCFOutputCaller(sample_name), + GAFOutputCaller(aln_emitter, sample_name, ref_paths, trav_padding), + graph(graph), + input_vcf(variant_file), + traversal_finder(graph, snarl_manager, variant_file, ref_paths, ref_fasta, ins_fasta, snarl_caller.get_skip_allele_fn()), + traversals_only(traversals_only), + gaf_output(gaf_output) { + + scan_contig_lengths(); + + assert(ref_paths.size() == ref_path_ploidies.size()); + for (int i = 0; i < ref_paths.size(); ++i) { + path_to_ploidy[ref_paths[i]] = ref_path_ploidies[i]; + } +} + +VCFGenotyper::~VCFGenotyper() { + +} + +bool VCFGenotyper::call_snarl(const Snarl& snarl) { + + // could be that our graph is a subgraph of the graph the snarls were computed from + // so bypass snarls we can't process + if (!graph.has_node(snarl.start().node_id()) || !graph.has_node(snarl.end().node_id())) { + return false; + } + + // get our traversals out of the finder + vector>> alleles; + vector variants; + std::tie(alleles, variants) = traversal_finder.find_allele_traversals(snarl); + + if (!alleles.empty()) { + + // hmm, maybe find a way not to copy? + vector travs; + travs.reserve(alleles.size()); + for (const auto& ta : alleles) { + travs.push_back(ta.first); + } + + // find the reference traversal + // todo: is it the reference always first? + int ref_trav_idx = -1; + for (int i = 0; i < alleles.size() && ref_trav_idx < 0; ++i) { + if (std::all_of(alleles[i].second.begin(), alleles[i].second.end(), [](int x) {return x == 0;})) { + ref_trav_idx = i; + } + } + + // find a path range corresponding to our snarl by way of the VCF variants. + tuple ref_positions = get_ref_positions(variants); + + // just print the traversals if requested + if (traversals_only) { + assert(gaf_output); + // todo: can't get ref position here without pathposition graph + emit_gaf_traversals(graph, print_snarl(snarl), travs, ref_trav_idx, "", -1); + return true; + } + + // use our support caller to choose our genotype (int traversal coordinates) + vector trav_genotype; + unique_ptr trav_call_info; + std::tie(trav_genotype, trav_call_info) = snarl_caller.genotype(snarl, travs, ref_trav_idx, path_to_ploidy[get<0>(ref_positions)], + get<0>(ref_positions), make_pair(get<1>(ref_positions), get<2>(ref_positions))); + + assert(trav_genotype.size() <= 2); + + if (gaf_output) { + // todo: can't get ref position here without pathposition graph + emit_gaf_variant(graph, print_snarl(snarl), travs, trav_genotype, ref_trav_idx, "", -1); + return true; + } + + // map our genotype back to the vcf + for (int i = 0; i < variants.size(); ++i) { + vector vcf_alleles; + set used_vcf_alleles; + string vcf_genotype; + vector vcf_traversals(variants[i]->alleles.size()); + if (trav_genotype.empty()) { + vcf_genotype = "./."; + } else { + // map our traversal genotype to a vcf variant genotype + // using the information out of the traversal finder + for (int j = 0; j < trav_genotype.size(); ++j) { + int trav_allele = trav_genotype[j]; + int vcf_allele = alleles[trav_allele].second[i]; + vcf_genotype += std::to_string(vcf_allele); + if (j < trav_genotype.size() - 1) { + vcf_genotype += "/"; + } + vcf_alleles.push_back(vcf_allele); + used_vcf_alleles.insert(vcf_allele); + vcf_traversals[vcf_allele] = travs[trav_allele]; + } + // add traversals that correspond to vcf genotypes that are not + // present in the traversal_genotypes + for (int j = 0; j < travs.size(); ++j) { + int vcf_allele = 
alleles[j].second[i]; + if (!used_vcf_alleles.count(vcf_allele)) { + vcf_traversals[vcf_allele] = travs[j]; + used_vcf_alleles.insert(vcf_allele); + } + } + } + // create an output variant from the input one + vcflib::Variant out_variant; + out_variant.sequenceName = variants[i]->sequenceName; + out_variant.position = variants[i]->position; + out_variant.id = variants[i]->id; + out_variant.ref = variants[i]->ref; + out_variant.alt = variants[i]->alt; + out_variant.alleles = variants[i]->alleles; + out_variant.filter = "PASS"; + out_variant.updateAlleleIndexes(); + + // add the genotype + out_variant.format.push_back("GT"); + auto& genotype_vector = out_variant.samples[sample_name]["GT"]; + genotype_vector.push_back(vcf_genotype); + + // add some info + snarl_caller.update_vcf_info(snarl, vcf_traversals, vcf_alleles, trav_call_info, sample_name, out_variant); + + // print the variant + add_variant(out_variant); + } + return true; + } + + return false; + +} + +string VCFGenotyper::vcf_header(const PathHandleGraph& graph, const vector& ref_paths, + const vector& contig_length_overrides) const { + assert(contig_length_overrides.empty()); // using this override makes no sense + + // get the contig length overrides from the VCF + vector vcf_contig_lengths; + auto length_map = scan_contig_lengths(); + for (int i = 0; i < ref_paths.size(); ++i) { + vcf_contig_lengths.push_back(length_map[ref_paths[i]]); + } + + string header = VCFOutputCaller::vcf_header(graph, ref_paths, vcf_contig_lengths); + header += "##FORMAT=\n"; + snarl_caller.update_vcf_header(header); + header += "##FILTER=\n"; + header += "##SAMPLE=\n"; + header += "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + sample_name; + assert(output_vcf.openForOutput(header)); + header += "\n"; + return header; +} + +tuple VCFGenotyper::get_ref_positions(const vector& variants) const { + // if there is more than one path in our snarl (unlikely for most graphs we'll vcf-genoetype) + // then we return the one with the biggest interval + map> path_offsets; + for (const vcflib::Variant* var : variants) { + if (path_offsets.count(var->sequenceName)) { + pair& record = path_offsets[var->sequenceName]; + record.first = std::min((size_t)var->position, record.first); + record.second = std::max((size_t)var->position + var->ref.length(), record.second); + } else { + path_offsets[var->sequenceName] = make_pair(var->position, var->position + var->ref.length()); + } + } + + string ref_path; + size_t ref_range_size = 0; + pair ref_range; + for (auto& path_offset : path_offsets) { + size_t len = path_offset.second.second - path_offset.second.first; + if (len > ref_range_size) { + ref_range_size = len; + ref_path = path_offset.first; + ref_range = path_offset.second; + } + } + + return make_tuple(ref_path, ref_range.first, ref_range.second); +} + +unordered_map VCFGenotyper::scan_contig_lengths() const { + + unordered_map ref_lengths; + + // copied from dumpContigsFromHeader.cpp in vcflib + vector headerLines = split(input_vcf.header, "\n"); + for(vector::iterator it = headerLines.begin(); it != headerLines.end(); it++) { + if((*it).substr(0,8) == "##contig"){ + string contigInfo = (*it).substr(10, (*it).length() -11); + vector info = split(contigInfo, ","); + string id; + int64_t length = -1; + for(vector::iterator sub = info.begin(); sub != info.end(); sub++) { + vector subfield = split((*sub), "="); + if(subfield[0] == "ID"){ + id = subfield[1]; + } + if(subfield[0] == "length"){ + length = parse(subfield[1]); + } + } + if (!id.empty() && length >= 
0) { + ref_lengths[id] = length; + } + } + } + + return ref_lengths; +} + + +LegacyCaller::LegacyCaller(const PathPositionHandleGraph& graph, + SupportBasedSnarlCaller& snarl_caller, + SnarlManager& snarl_manager, + const string& sample_name, + const vector& ref_paths, + const vector& ref_path_offsets, + const vector& ref_path_ploidies) : + GraphCaller(snarl_caller, snarl_manager), + VCFOutputCaller(sample_name), + graph(graph), + ref_paths(ref_paths) { + + for (int i = 0; i < ref_paths.size(); ++i) { + ref_offsets[ref_paths[i]] = i < ref_path_offsets.size() ? ref_path_offsets[i] : 0; + ref_ploidies[ref_paths[i]] = i < ref_path_ploidies.size() ? ref_path_ploidies[i] : 2; + } + + is_vg = dynamic_cast(&graph) != nullptr; + if (is_vg) { + // our graph is in vg format. we index the paths and make a traversal finder just + // like in the old call code + for (auto ref_path : ref_paths) { + path_indexes.push_back(new PathIndex(graph, ref_path)); + } + // map snarl to the first reference path that spans it + function get_path_index = [&](const Snarl& site) -> PathIndex* { + return find_index(site, path_indexes).second; + }; + // initialize our traversal finder + traversal_finder = new RepresentativeTraversalFinder(graph, snarl_manager, + max_search_depth, + max_search_width, + max_bubble_paths, + 0, + 0, + get_path_index, + [&](id_t id) { return snarl_caller.get_support_finder().get_min_node_support(id);}, + [&](edge_t edge) { return snarl_caller.get_support_finder().get_edge_support(edge);}); + + } else { + // our graph is not in vg format. we will make graphs for each site as needed and work with those + traversal_finder = nullptr; + } +} + +LegacyCaller::~LegacyCaller() { + delete traversal_finder; + for (PathIndex* path_index : path_indexes) { + delete path_index; + } +} + +bool LegacyCaller::call_snarl(const Snarl& snarl) { + + // if we can't handle the snarl, then the GraphCaller framework will recurse on its children + if (!is_traversable(snarl)) { + return false; + } + + RepresentativeTraversalFinder* rep_trav_finder; + vector site_path_indexes; + function get_path_index; + VG vg_graph; + SupportBasedSnarlCaller& support_caller = dynamic_cast(snarl_caller); + bool was_called = false; + + if (is_vg) { + // our graph is in VG format, so we've sorted this out in the constructor + rep_trav_finder = traversal_finder; + get_path_index = [&](const Snarl& site) { + return find_index(site, path_indexes).second; + }; + + } else { + // our graph isn't in VG format. we are using a (hopefully temporary) workaround + // of converting the subgraph into VG. 
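+ // the workaround below: (1) copy the snarl's nodes and edges into a local vg::VG subgraph,
+ // (2) pull the embedded paths in with algorithms::expand_context_with_paths, (3) build a PathIndex
+ // for each reference path that made it into the subgraph, and (4) point a fresh
+ // RepresentativeTraversalFinder at it, with a support cutoff chosen from the snarl length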
+ pair, unordered_set > contents = snarl_manager.deep_contents(&snarl, graph, true); + size_t total_snarl_length = 0; + for (auto node_id : contents.first) { + handle_t new_handle = vg_graph.create_handle(graph.get_sequence(graph.get_handle(node_id)), node_id); + if (node_id != snarl.start().node_id() && node_id != snarl.end().node_id()) { + total_snarl_length += vg_graph.get_length(new_handle); + } + } + for (auto edge : contents.second) { + vg_graph.create_edge(vg_graph.get_handle(graph.get_id(edge.first), vg_graph.get_is_reverse(edge.first)), + vg_graph.get_handle(graph.get_id(edge.second), vg_graph.get_is_reverse(edge.second))); + total_snarl_length += 1; + } + // add the paths to the subgraph + algorithms::expand_context_with_paths(&graph, &vg_graph, 1); + // and index them + for (auto& ref_path : ref_paths) { + if (vg_graph.has_path(ref_path)) { + site_path_indexes.push_back(new PathIndex(vg_graph, ref_path)); + } else { + site_path_indexes.push_back(nullptr); + } + } + get_path_index = [&](const Snarl& site) -> PathIndex* { + return find_index(site, site_path_indexes).second; + }; + // determine the support threshold for the traversal finder. if we're using average + // support, then we don't use any (set to 0), other wise, use the minimum support for a call + SupportBasedSnarlCaller& support_caller = dynamic_cast(snarl_caller); + size_t threshold = support_caller.get_support_finder().get_average_traversal_support_switch_threshold(); + double support_cutoff = total_snarl_length <= threshold ? support_caller.get_min_total_support_for_call() : 0; + rep_trav_finder = new RepresentativeTraversalFinder(vg_graph, snarl_manager, + max_search_depth, + max_search_width, + max_bubble_paths, + support_cutoff, + support_cutoff, + get_path_index, + [&](id_t id) { return support_caller.get_support_finder().get_min_node_support(id);}, + // note: because our traversal finder and support caller have + // different graphs, they can't share edge handles + [&](edge_t edge) { return support_caller.get_support_finder().get_edge_support( + vg_graph.get_id(edge.first), vg_graph.get_is_reverse(edge.first), + vg_graph.get_id(edge.second), vg_graph.get_is_reverse(edge.second));}); + + } + + PathIndex* path_index = get_path_index(snarl); + if (path_index != nullptr) { + string path_name = find_index(snarl, is_vg ? path_indexes : site_path_indexes).first; + + // orient the snarl along the reference path + tuple ref_interval = get_ref_interval(graph, snarl, path_name); + if (get<2>(ref_interval) == true) { + snarl_manager.flip(&snarl); + } + + // recursively genotype the site beginning here at the top level snarl + vector called_traversals; + // these integers map the called traversals to their positions in the list of all traversals + // of the top level snarl. 
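+ // (e.g. for a diploid site, genotype == {0, 3} means called_traversals[0] was built from allele 0
+ // and called_traversals[1] from allele 3 of that list)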
+ vector genotype; + int ploidy = ref_ploidies[path_name]; + std::tie(called_traversals, genotype) = top_down_genotype(snarl, *rep_trav_finder, ploidy, + path_name, make_pair(get<0>(ref_interval), get<1>(ref_interval))); + + if (!called_traversals.empty()) { + // regenotype our top-level traversals now that we know they aren't nested, and we have a + // good idea of all the sizes + unique_ptr call_info; + std::tie(called_traversals, genotype, call_info) = re_genotype(snarl, *rep_trav_finder, called_traversals, genotype, ploidy, + path_name, make_pair(get<0>(ref_interval), get<1>(ref_interval))); + + // emit our vcf variant + emit_variant(graph, snarl_caller, snarl, called_traversals, genotype, 0, call_info, path_name, ref_offsets.find(path_name)->second, false, + ploidy); + + was_called = true; + } + } + if (!is_vg) { + // delete the temporary vg subgraph and traversal finder we created for this snarl + delete rep_trav_finder; + for (PathIndex* path_index : site_path_indexes) { + delete path_index; + } + } + + return was_called; +} + +string LegacyCaller::vcf_header(const PathHandleGraph& graph, const vector& ref_paths, + const vector& contig_length_overrides) const { + string header = VCFOutputCaller::vcf_header(graph, ref_paths, contig_length_overrides); + header += "##FORMAT=\n"; + snarl_caller.update_vcf_header(header); + header += "##FILTER=\n"; + header += "##SAMPLE=\n"; + header += "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + sample_name; + assert(output_vcf.openForOutput(header)); + header += "\n"; + return header; +} + +pair, vector> LegacyCaller::top_down_genotype(const Snarl& snarl, TraversalFinder& trav_finder, int ploidy, + const string& ref_path_name, pair ref_interval) const { + + // get the traversals through the site + vector traversals = trav_finder.find_traversals(snarl); + + // use our support caller to choose our genotype + vector trav_genotype; + unique_ptr trav_call_info; + std::tie(trav_genotype, trav_call_info) = snarl_caller.genotype(snarl, traversals, 0, ploidy, ref_path_name, ref_interval); + if (trav_genotype.empty()) { + return make_pair(vector(), vector()); + } + + assert(trav_genotype.size() == ploidy); + + vector called_travs(ploidy); + + // do we have two paths going through a given traversal? This is handled + // as a special case below + bool hom = trav_genotype.size() == 2 && trav_genotype[0] == trav_genotype[1]; + + for (int i = 0; i < trav_genotype.size() && (!hom || i < 1); ++i) { + int allele = trav_genotype[i]; + const SnarlTraversal& traversal = traversals[allele]; + Visit prev_end; + for (int j = 0; j < traversal.visit_size(); ++j) { + if (traversal.visit(j).node_id() > 0) { + *called_travs[i].add_visit() = traversal.visit(j); + if (hom && i == 0) { + *called_travs[1].add_visit() = traversal.visit(j); + } + } else { + // recursively determine the traversal + const Snarl* into_snarl = snarl_manager.into_which_snarl(traversal.visit(j)); + bool flipped = traversal.visit(j).backward(); + if (flipped) { + // we're always processing our snarl from start to end, so make sure + // it lines up with the parent (note that we've oriented the root along the ref path) + snarl_manager.flip(into_snarl); + } + vector child_genotype = top_down_genotype(*into_snarl, + trav_finder, hom ? 2: 1, ref_path_name, ref_interval).first; + if (child_genotype.empty()) { + return make_pair(vector(), vector()); + } + bool back_to_back = j > 0 && traversal.visit(j - 1).node_id() == 0 && prev_end == into_snarl->start(); + + for (int k = back_to_back ? 
1 : 0; k < child_genotype[0].visit_size(); ++k) { + *called_travs[i].add_visit() = child_genotype[0].visit(k); + } + if (hom) { + assert(child_genotype.size() == 2 && i == 0); + for (int k = back_to_back ? 1 : 0; k < child_genotype[1].visit_size(); ++k) { + *called_travs[1].add_visit() = child_genotype[1].visit(k); + } + } + prev_end = into_snarl->end(); + if (flipped) { + // leave our snarl like we found it + snarl_manager.flip(into_snarl); + } + } + } + } + + return make_pair(called_travs, trav_genotype); +} + +SnarlTraversal LegacyCaller::get_reference_traversal(const Snarl& snarl, TraversalFinder& trav_finder) const { + + // get the ref traversal through the site + // todo: don't avoid so many traversal recomputations + SnarlTraversal traversal = trav_finder.find_traversals(snarl)[0]; + SnarlTraversal out_traversal; + + Visit prev_end; + for (int i = 0; i < traversal.visit_size(); ++i) { + const Visit& visit = traversal.visit(i); + if (visit.node_id() != 0) { + *out_traversal.add_visit() = visit; + } else { + const Snarl* into_snarl = snarl_manager.into_which_snarl(visit); + if (visit.backward()) { + snarl_manager.flip(into_snarl); + } + bool back_to_back = i > 0 && traversal.visit(i - 1).node_id() == 0 && prev_end == into_snarl->start(); + + SnarlTraversal child_ref = get_reference_traversal(*into_snarl, trav_finder); + for (int j = back_to_back ? 1 : 0; j < child_ref.visit_size(); ++j) { + *out_traversal.add_visit() = child_ref.visit(j); + } + prev_end = into_snarl->end(); + if (visit.backward()) { + // leave our snarl like we found it + snarl_manager.flip(into_snarl); + } + } + } + return out_traversal; +} + +tuple, vector, unique_ptr> +LegacyCaller::re_genotype(const Snarl& snarl, TraversalFinder& trav_finder, + const vector& in_traversals, + const vector& in_genotype, + int ploidy, + const string& ref_path_name, + pair ref_interval) const { + + assert(in_traversals.size() == in_genotype.size()); + + // create a set of unique traversal candidates that must include the reference first + vector rg_traversals; + // add our reference traversal to the front + for (int i = 0; i < in_traversals.size() && !rg_traversals.empty(); ++i) { + if (in_genotype[i] == 0) { + rg_traversals.push_back(in_traversals[i]); + } + } + if (rg_traversals.empty()) { + rg_traversals.push_back(get_reference_traversal(snarl, trav_finder)); + } + set gt_set = {0}; + for (int i = 0; i < in_traversals.size(); ++i) { + if (!gt_set.count(in_genotype[i])) { + rg_traversals.push_back(in_traversals[i]); + gt_set.insert(in_genotype[i]); + } + } + + // re-genotype the candidates + vector rg_genotype; + unique_ptr rg_call_info; + std::tie(rg_genotype, rg_call_info) = snarl_caller.genotype(snarl, rg_traversals, 0, ploidy, ref_path_name, ref_interval); + + return make_tuple(rg_traversals, rg_genotype, std::move(rg_call_info)); +} + +bool LegacyCaller::is_traversable(const Snarl& snarl) { + // we need this to be true all the way down to use the RepresentativeTraversalFinder on our snarl. 
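+ // i.e. this snarl and every snarl nested inside it must be start-end reachable with a directed
+ // acyclic net graph, and both boundary nodes must be present in the graph; when this returns false,
+ // call_snarl() gives up and the GraphCaller framework recurses on the children instead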
+ bool ret = snarl.start_end_reachable() && snarl.directed_acyclic_net_graph() && + graph.has_node(snarl.start().node_id()) && graph.has_node(snarl.end().node_id()); + if (ret == true) { + const vector& children = snarl_manager.children_of(&snarl); + for (int i = 0; i < children.size() && ret; ++i) { + ret = is_traversable(*children[i]); + } + } + return ret; +} + +pair LegacyCaller::find_index(const Snarl& snarl, const vector path_indexes) const { + assert(path_indexes.size() == ref_paths.size()); + for (int i = 0; i < path_indexes.size(); ++i) { + PathIndex* path_index = path_indexes[i]; + if (path_index != nullptr && + path_index->by_id.count(snarl.start().node_id()) && + path_index->by_id.count(snarl.end().node_id())) { + // This path threads through this site + return make_pair(ref_paths[i], path_index); + } + } + return make_pair("", nullptr); +} + +FlowCaller::FlowCaller(const PathPositionHandleGraph& graph, + SupportBasedSnarlCaller& snarl_caller, + SnarlManager& snarl_manager, + const string& sample_name, + TraversalFinder& traversal_finder, + const vector& ref_paths, + const vector& ref_path_offsets, + const vector& ref_path_ploidies, + AlignmentEmitter* aln_emitter, + bool traversals_only, + bool gaf_output, + size_t trav_padding, + bool genotype_snarls, + const pair& ref_allele_length_range) : + GraphCaller(snarl_caller, snarl_manager), + VCFOutputCaller(sample_name), + GAFOutputCaller(aln_emitter, sample_name, ref_paths, trav_padding), + graph(graph), + traversal_finder(traversal_finder), + ref_paths(ref_paths), + traversals_only(traversals_only), + gaf_output(gaf_output), + genotype_snarls(genotype_snarls), + ref_allele_length_range(ref_allele_length_range) +{ + for (int i = 0; i < ref_paths.size(); ++i) { + ref_offsets[ref_paths[i]] = i < ref_path_offsets.size() ? ref_path_offsets[i] : 0; + ref_path_set.insert(ref_paths[i]); + ref_ploidies[ref_paths[i]] = i < ref_path_ploidies.size() ? ref_path_ploidies[i] : 2; + } + +} + +FlowCaller::~FlowCaller() { + +} + +bool FlowCaller::call_snarl(const Snarl& managed_snarl) { + + // todo: In order to experiment with merging consecutive snarls to make longer traversals, + // I am experimenting with sending "fake" snarls through this code. So make a local + // copy to work on to do things like flip -- calling any snarl_manager code that + // wants a pointer will crash. + Snarl snarl = managed_snarl; + + if (snarl.start().node_id() == snarl.end().node_id() || + !graph.has_node(snarl.start().node_id()) || !graph.has_node(snarl.end().node_id())) { + // can't call one-node or out-of graph snarls. + return false; + } + // toggle average flow / flow width based on snarl length. this is a bit inconsistent with + // downstream which uses the longest traversal length, but it's a bit chicken and egg + // todo: maybe use snarl length for everything? 
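+ // for example, with a switch threshold of 50bp, a snarl whose nodes sum to a few hundred bases is
+ // called with greedy average flow, while a SNP-sized snarl keeps the default flow-width behavior
+ // (50bp is only an illustrative figure; the real threshold comes from the support finder below)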
+ const auto& support_finder = dynamic_cast(snarl_caller).get_support_finder(); + bool greedy_avg_flow = false; + { + auto snarl_contents = snarl_manager.deep_contents(&snarl, graph, false); + if (snarl_contents.second.size() > max_snarl_edges) { + // size cap needed as FlowCaller doesn't have nesting support yet + return false; + } + size_t len_threshold = support_finder.get_average_traversal_support_switch_threshold(); + size_t length = 0; + for (auto i = snarl_contents.first.begin(); i != snarl_contents.first.end() && length < len_threshold; ++i) { + length += graph.get_length(graph.get_handle(*i)); + } + greedy_avg_flow = length > len_threshold; + } + + handle_t start_handle = graph.get_handle(snarl.start().node_id(), snarl.start().backward()); + handle_t end_handle = graph.get_handle(snarl.end().node_id(), snarl.end().backward()); + + // as we're writing to VCF, we need a reference path through the snarl. we + // look it up directly from the graph, and abort if we can't find one + set start_path_names; + graph.for_each_step_on_handle(start_handle, [&](step_handle_t step_handle) { + string name = graph.get_path_name(graph.get_path_handle_of_step(step_handle)); + if (!Paths::is_alt(name) && (ref_path_set.empty() || ref_path_set.count(name))) { + start_path_names.insert(name); + } + return true; + }); + + set end_path_names; + if (!start_path_names.empty()) { + graph.for_each_step_on_handle(end_handle, [&](step_handle_t step_handle) { + string name = graph.get_path_name(graph.get_path_handle_of_step(step_handle)); + if (!Paths::is_alt(name) && (ref_path_set.empty() || ref_path_set.count(name))) { + end_path_names.insert(name); + } + return true; + }); + } + + // we do the full intersection (instead of more quickly finding the first common path) + // so that we always take the lexicographically lowest path, rather than depending + // on the order of iteration which could change between implementations / runs. + vector common_names; + std::set_intersection(start_path_names.begin(), start_path_names.end(), + end_path_names.begin(), end_path_names.end(), + std::back_inserter(common_names)); + + if (common_names.empty()) { + return false; + } + + string& ref_path_name = common_names.front(); + + // find the reference traversal and coordinates using the path position graph interface + tuple ref_interval = get_ref_interval(graph, snarl, ref_path_name); + if (get<0>(ref_interval) == -1) { + // could not find reference path interval consisten with snarl due to orientation conflict + return false; + } + if (get<2>(ref_interval) == true) { + // calling code assumes snarl forward on reference + flip_snarl(snarl); + ref_interval = get_ref_interval(graph, snarl, ref_path_name); + } + + step_handle_t cur_step = get<3>(ref_interval); + step_handle_t last_step = get<4>(ref_interval); + if (get<2>(ref_interval)) { + std::swap(cur_step, last_step); + } + bool start_backwards = snarl.start().backward() != graph.get_is_reverse(graph.get_handle_of_step(cur_step)); + + SnarlTraversal ref_trav; + while (true) { + handle_t cur_handle = graph.get_handle_of_step(cur_step); + Visit* visit = ref_trav.add_visit(); + visit->set_node_id(graph.get_id(cur_handle)); + visit->set_backward(start_backwards ? 
!graph.get_is_reverse(cur_handle) : graph.get_is_reverse(cur_handle));
+ if (graph.get_id(cur_handle) == snarl.end().node_id()) {
+ break;
+ } else if (get<2>(ref_interval) == true) {
+ if (!graph.has_previous_step(cur_step)) {
+ cerr << "Warning [vg call]: Unable, due to bug or corrupt path information, to trace reference path through snarl " << pb2json(managed_snarl) << endl;
+ return false;
+ }
+ cur_step = graph.get_previous_step(cur_step);
+ } else {
+ if (!graph.has_next_step(cur_step)) {
+ cerr << "Warning [vg call]: Unable, due to bug or corrupt path information, to trace reference path through snarl " << pb2json(managed_snarl) << endl;
+ return false;
+ }
+ cur_step = graph.get_next_step(cur_step);
+ }
+ // todo: we can compute flow at the same time
+ }
+ assert(ref_trav.visit(0) == snarl.start() && ref_trav.visit(ref_trav.visit_size() - 1) == snarl.end());
+
+ // optional reference length clamp can, e.g., avoid trying to resolve a giant snarl
+ if (ref_trav.visit_size() > 1 && ref_allele_length_range.first > 0 || ref_allele_length_range.second < numeric_limits::max()) {
+ size_t ref_trav_len = 0;
+ for (size_t j = 1; j < ref_trav.visit_size() - 1; ++j) {
+ ref_trav_len += graph.get_length(graph.get_handle(ref_trav.visit(j).node_id()));
+ }
+ if (ref_trav_len < ref_allele_length_range.first || ref_trav_len > ref_allele_length_range.second) {
+ return false;
+ }
+ }
+
+ vector travs;
+ FlowTraversalFinder* flow_trav_finder = dynamic_cast(&traversal_finder);
+ if (flow_trav_finder != nullptr) {
+ // find the max flow traversals using specialized interface that accepts avg heuristic toggle
+ pair, vector> weighted_travs = flow_trav_finder->find_weighted_traversals(snarl, greedy_avg_flow);
+ travs = std::move(weighted_travs.first);
+ } else {
+ // find the traversals using the generic interface
+ travs = traversal_finder.find_traversals(snarl);
+ }
+
+ if (travs.empty()) {
+ cerr << "Warning [vg call]: Unable, due to bug or corrupt graph, to search for any traversals through snarl " << pb2json(managed_snarl) << endl;
+ return false;
+ }
+
+ // find the reference traversal in the list of results from the traversal finder
+ int ref_trav_idx = -1;
+ for (int i = 0; i < travs.size() && ref_trav_idx < 0; ++i) {
+ // todo: is there a way to speed this up?
+ if (travs[i] == ref_trav) { + ref_trav_idx = i; + } + } + + if (ref_trav_idx == -1) { + ref_trav_idx = travs.size(); + // we didn't get the reference traversal from the finder, so we add it here + travs.push_back(ref_trav); + } + + bool ret_val = true; + + if (traversals_only) { + assert(gaf_output); + pair pos_info = get_ref_position(graph, snarl, ref_path_name, ref_offsets[ref_path_name]); + emit_gaf_traversals(graph, print_snarl(snarl), travs, ref_trav_idx, pos_info.first, pos_info.second, &support_finder); + } else { + // use our support caller to choose our genotype + vector trav_genotype; + unique_ptr trav_call_info; + int ploidy = ref_ploidies[ref_path_name]; + std::tie(trav_genotype, trav_call_info) = snarl_caller.genotype(snarl, travs, ref_trav_idx, ploidy, ref_path_name, + make_pair(get<0>(ref_interval), get<1>(ref_interval))); + + assert(trav_genotype.empty() || trav_genotype.size() == ploidy); + + if (!gaf_output) { + emit_variant(graph, snarl_caller, snarl, travs, trav_genotype, ref_trav_idx, trav_call_info, ref_path_name, + ref_offsets[ref_path_name], genotype_snarls, ploidy); + } else { + pair pos_info = get_ref_position(graph, snarl, ref_path_name, ref_offsets[ref_path_name]); + emit_gaf_variant(graph, print_snarl(snarl), travs, trav_genotype, ref_trav_idx, pos_info.first, pos_info.second, &support_finder); + } + + ret_val = trav_genotype.size() == ploidy; + } + + return ret_val; +} + +string FlowCaller::vcf_header(const PathHandleGraph& graph, const vector& contigs, + const vector& contig_length_overrides) const { + string header = VCFOutputCaller::vcf_header(graph, contigs, contig_length_overrides); + header += "##FORMAT=\n"; + snarl_caller.update_vcf_header(header); + header += "##FILTER=\n"; + header += "##SAMPLE=\n"; + header += "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + sample_name; + assert(output_vcf.openForOutput(header)); + header += "\n"; + return header; +} + + +NestedFlowCaller::NestedFlowCaller(const PathPositionHandleGraph& graph, + SupportBasedSnarlCaller& snarl_caller, + SnarlManager& snarl_manager, + const string& sample_name, + TraversalFinder& traversal_finder, + const vector& ref_paths, + const vector& ref_path_offsets, + const vector& ref_path_ploidies, + AlignmentEmitter* aln_emitter, + bool traversals_only, + bool gaf_output, + size_t trav_padding, + bool genotype_snarls) : + GraphCaller(snarl_caller, snarl_manager), + VCFOutputCaller(sample_name), + GAFOutputCaller(aln_emitter, sample_name, ref_paths, trav_padding), + graph(graph), + traversal_finder(traversal_finder), + ref_paths(ref_paths), + traversals_only(traversals_only), + gaf_output(gaf_output), + genotype_snarls(genotype_snarls), + nested_support_finder(dynamic_cast(snarl_caller.get_support_finder())){ + + for (int i = 0; i < ref_paths.size(); ++i) { + ref_offsets[ref_paths[i]] = i < ref_path_offsets.size() ? ref_path_offsets[i] : 0; + ref_path_set.insert(ref_paths[i]); + ref_ploidies[ref_paths[i]] = i < ref_path_ploidies.size() ? 
ref_path_ploidies[i] : 2; + } + +} + +NestedFlowCaller::~NestedFlowCaller() { + +} + +bool NestedFlowCaller::call_snarl(const Snarl& managed_snarl) { + + // remember the calls for each child snarl in this table + CallTable call_table; + + bool called = call_snarl_recursive(managed_snarl, -1, "", make_pair(0, 0), call_table); + + if (called) { + emit_snarl_recursive(managed_snarl, -1, call_table); + } + + return called; +} + +bool NestedFlowCaller::call_snarl_recursive(const Snarl& managed_snarl, int max_ploidy, + const string& parent_ref_path_name, pair parent_ref_interval, + CallTable& call_table) { + + // todo: In order to experiment with merging consecutive snarls to make longer traversals, + // I am experimenting with sending "fake" snarls through this code. So make a local + // copy to work on to do things like flip -- calling any snarl_manager code that + // wants a pointer will crash. + Snarl snarl = managed_snarl; + + // hook into our table entry + CallRecord& record = call_table[managed_snarl]; + + // get some reference information if possible + // todo: make a function + + handle_t start_handle = graph.get_handle(snarl.start().node_id(), snarl.start().backward()); + handle_t end_handle = graph.get_handle(snarl.end().node_id(), snarl.end().backward()); + + // as we're writing to VCF, we need a reference path through the snarl. we + // look it up directly from the graph, and abort if we can't find one + set start_path_names; + graph.for_each_step_on_handle(start_handle, [&](step_handle_t step_handle) { + string name = graph.get_path_name(graph.get_path_handle_of_step(step_handle)); + if (!Paths::is_alt(name) && (ref_path_set.empty() || ref_path_set.count(name))) { + start_path_names.insert(name); + } + return true; + }); + + set end_path_names; + if (!start_path_names.empty()) { + graph.for_each_step_on_handle(end_handle, [&](step_handle_t step_handle) { + string name = graph.get_path_name(graph.get_path_handle_of_step(step_handle)); + if (!Paths::is_alt(name) && (ref_path_set.empty() || ref_path_set.count(name))) { + end_path_names.insert(name); + } + return true; + }); + } + + // we do the full intersection (instead of more quickly finding the first common path) + // so that we always take the lexicographically lowest path, rather than depending + // on the order of iteration which could change between implementations / runs. 
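+ // e.g. if the snarl's start node lies on paths {"chr1", "chr2"} and its end node on {"chr2", "chr3"},
+ // the intersection is just {"chr2"}; when several paths span the snarl, taking front() of the
+ // sorted intersection always yields the lexicographically smallest name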
+ vector common_names; + std::set_intersection(start_path_names.begin(), start_path_names.end(), + end_path_names.begin(), end_path_names.end(), + std::back_inserter(common_names)); + + string ref_path_name; + SnarlTraversal ref_trav; + int ref_trav_idx = -1; + tuple ref_interval; + string gt_ref_path_name; + pair gt_ref_interval; + + if (!common_names.empty()) { + ref_path_name = common_names.front(); + + // find the reference traversal and coordinates using the path position graph interface + ref_interval = get_ref_interval(graph, snarl, ref_path_name); + if (get<0>(ref_interval) == -1) { + // no reference path found due to orientation conflict + return false; + } + if (get<2>(ref_interval) == true) { + // calling code assumes snarl forward on reference + flip_snarl(snarl); + ref_interval = get_ref_interval(graph, snarl, ref_path_name); + } + + step_handle_t cur_step = get<3>(ref_interval); + step_handle_t last_step = get<4>(ref_interval); + if (get<2>(ref_interval)) { + std::swap(cur_step, last_step); + } + bool start_backwards = snarl.start().backward() != graph.get_is_reverse(graph.get_handle_of_step(cur_step)); + + while (true) { + handle_t cur_handle = graph.get_handle_of_step(cur_step); + Visit* visit = ref_trav.add_visit(); + visit->set_node_id(graph.get_id(cur_handle)); + visit->set_backward(start_backwards ? !graph.get_is_reverse(cur_handle) : graph.get_is_reverse(cur_handle)); + if (graph.get_id(cur_handle) == snarl.end().node_id()) { + break; + } else if (get<2>(ref_interval) == true) { + if (!graph.has_previous_step(cur_step)) { + cerr << "Warning [vg call]: Unable, due to bug or corrupt path information, to trace reference path through snarl " << pb2json(snarl) << endl; + return false; + } + cur_step = graph.get_previous_step(cur_step); + } else { + if (!graph.has_next_step(cur_step)) { + cerr << "Warning [vg call]: Unable, due to bug or corrupt path information, to trace reference path through snarl " << pb2json(snarl) << endl; + return false; + } + cur_step = graph.get_next_step(cur_step); + } + // todo: we can compute flow at the same time + } + assert(ref_trav.visit(0) == snarl.start() && ref_trav.visit(ref_trav.visit_size() - 1) == snarl.end()); + + gt_ref_path_name = ref_path_name; + gt_ref_interval = make_pair(get<0>(ref_interval), get<1>(ref_interval)); + if (max_ploidy == -1) { + max_ploidy = ref_ploidies[ref_path_name]; + } + } else { + // if we have no reference infromation, try to get it from the parent snarl + gt_ref_path_name = parent_ref_path_name; + gt_ref_interval = parent_ref_interval; + if (gt_ref_path_name.empty()) { + // there's just no reference path through this snarl + return false; + } + assert(max_ploidy >= 0); + } + + // recurse on the children + // todo: do we need to make this iterative for deep snarl trees? + const vector& children = snarl_manager.children_of(&managed_snarl); + + for (const Snarl* child : children) { + if (!snarl_manager.is_trivial(child, graph)) { + bool called = call_snarl_recursive(*child, max_ploidy, gt_ref_path_name, gt_ref_interval, call_table); + if (!called) { + return false; + } + } + } + +#ifdef debug + cerr << "recursively calling " << pb2json(managed_snarl) << " with " << children.size() << " children" + << " and ref_path " << gt_ref_path_name << " and parent ref_path " << parent_ref_path_name << endl << endl; +#endif + + // abstract away the child snarls in the graph. 
traversals will bypass them via + // "virtual" edges + SnarlGraph snarl_graph(&graph, snarl_manager, children); + + if (snarl.start().node_id() == snarl.end().node_id() || + !graph.has_node(snarl.start().node_id()) || !graph.has_node(snarl.end().node_id())) { + // can't call one-node or out-of graph snarls. + return false; + } + // toggle average flow / flow width based on snarl length. this is a bit inconsistent with + // downstream which uses the longest traversal length, but it's a bit chicken and egg + // todo: maybe use snarl length for everything? + const auto& support_finder = dynamic_cast(snarl_caller).get_support_finder(); + + bool greedy_avg_flow = false; + { + auto snarl_contents = snarl_manager.shallow_contents(&snarl, graph, false); + if (max(snarl_contents.first.size(), snarl_contents.second.size()) > max_snarl_shallow_size) { + return false; + } + size_t len_threshold = support_finder.get_average_traversal_support_switch_threshold(); + size_t length = 0; + for (auto i = snarl_contents.first.begin(); i != snarl_contents.first.end() && length < len_threshold; ++i) { + length += graph.get_length(graph.get_handle(*i)); + } + greedy_avg_flow = length > len_threshold; + } + + vector travs; + FlowTraversalFinder* flow_trav_finder = dynamic_cast(&traversal_finder); + if (flow_trav_finder != nullptr) { + // find the max flow traversals using specialized interface that accepts avg heurstic toggle and overlay + pair, vector> weighted_travs = flow_trav_finder->find_weighted_traversals(snarl, greedy_avg_flow, &snarl_graph); + travs = std::move(weighted_travs.first); + + } else { + // find the traversals using the generic interface + assert(false); + travs = traversal_finder.find_traversals(snarl); + } + + // todo: we need to make reference traversal nesting aware +#ifdef debug + for (int i = 0; i < travs.size(); ++i) { + cerr << "[" << i << "]: " << pb2json(travs[i]) << endl; + } +#endif + + // find the reference traversal in the list of results from the traversal finder + if (!ref_path_name.empty()) { + for (int i = 0; i < travs.size() && ref_trav_idx < 0; ++i) { + // todo: is there a way to speed this up? + if (travs[i] == ref_trav) { + ref_trav_idx = i; + } + } + + if (ref_trav_idx == -1) { + ref_trav_idx = travs.size(); + // we didn't get the reference traversal from the finder, so we add it here + travs.push_back(ref_trav); +#ifdef debug + cerr << "[ref]: " << pb2json(ref_trav) << endl; +#endif + } + } + // store the reference traversal information, which could be empty + record.ref_path_name = ref_path_name; + record.ref_trav_idx = ref_trav_idx; + + // in the snarl graph, snarls a represented by a snarl end point and that's it. here we fix up the traversals + // to actually embed the snarls + // todo: should be able to avoid copy here! 
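+    // Illustration with made-up node ids: a visit to node 12, the start of a child
+    // snarl >12<17, is rewritten by SnarlGraph::embed_snarl so that visit.node_id()
+    // becomes 0 and visit.snarl() spans 12..17; downstream code treats
+    // node_id() == 0 as "a nested snarl sits here".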
+ vector embedded_travs = travs; + for (int i = 0; i < embedded_travs.size(); ++i) { + SnarlTraversal& traversal = embedded_travs[i]; + if (i != ref_trav_idx) { + snarl_graph.embed_snarls(traversal); + } else { + snarl_graph.embed_ref_path_snarls(traversal); + } + } + + bool ret_val = true; + + if (traversals_only) { + assert(gaf_output); + for (SnarlTraversal& traversal : travs) { + snarl_graph.embed_snarls(traversal); + } + pair pos_info = get_ref_position(graph, snarl, ref_path_name, 0); + emit_gaf_traversals(graph, print_snarl(snarl), travs, ref_trav_idx, pos_info.first, pos_info.second, &support_finder); + } else { + // use our support caller to choose our genotype + for (int ploidy = 1; ploidy <= max_ploidy; ++ploidy) { + vector trav_genotype; + unique_ptr trav_call_info; + std::tie(trav_genotype, trav_call_info) = snarl_caller.genotype(snarl, travs, ref_trav_idx, ploidy, gt_ref_path_name, gt_ref_interval); + assert(trav_genotype.empty() || trav_genotype.size() == ploidy); + + // update the traversal finder with summary support statistics from this call + // todo: be smarted about ploidy here + NestedCachedPackedTraversalSupportFinder::SupportMap& child_support_map = nested_support_finder.child_support_map; + // todo: re-use information that was produced in genotype!! + int max_trav_size = 0; + vector genotype_supports = nested_support_finder.get_traversal_genotype_support(embedded_travs, trav_genotype, {}, ref_trav_idx, &max_trav_size); + Support total_site_support = std::accumulate(genotype_supports.begin(), genotype_supports.end(), Support()); + // todo: do we want to use max_trav_size, or something derived from the genotype? + child_support_map[snarl] = make_tuple(total_site_support, total_site_support, max_trav_size); + + // and now we need to update our own table with the genotype + if (record.genotype_by_ploidy.size() < ploidy) { + record.genotype_by_ploidy.resize(ploidy); + } + record.genotype_by_ploidy[ploidy-1].first = trav_genotype; + record.genotype_by_ploidy[ploidy-1].second.reset(trav_call_info.release()); + record.travs = embedded_travs; + + ret_val = trav_genotype.size() == ploidy; + } + } + + return ret_val; +} + +bool NestedFlowCaller::emit_snarl_recursive(const Snarl& managed_snarl, int ploidy, CallTable& call_table) { + // fetch the current snarl from the table + CallRecord& record = call_table[managed_snarl]; + + // only emit snarl with reference backbone: + // todo: emit when no call (at least optionally) + if (record.ref_trav_idx >= 0 && !record.genotype_by_ploidy.empty() && ploidy != 0) { + + if (ploidy < 0) { + ploidy = ref_ploidies[record.ref_path_name]; + } + + pair, unique_ptr>& genotype = record.genotype_by_ploidy[ploidy - 1]; + + // compute count how many times a nested snarl appears in the genotype. this will be the ploidy + // it gets emitted with + // todo: feed into flatten_alt_allele! + map nested_ploidy; + for (int allele : genotype.first) { + const SnarlTraversal& allele_trav = record.travs[allele]; + for (size_t i = 0; i < allele_trav.visit_size(); ++i) { + const Visit& visit = allele_trav.visit(i); + if (visit.node_id() == 0) { + ++nested_ploidy[visit.snarl()]; + } + } + } + + // recurse on the children + // todo: do we need to make this iterative for deep snarl trees? 
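+        // Illustration with a hypothetical diploid genotype {0, 1}: if the
+        // traversals for allele 0 and allele 1 both pass through child snarl X,
+        // nested_ploidy[X] == 2 and X is emitted as diploid below; if only one of
+        // them passes through X, it is emitted as haploid.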
+ const vector& children = snarl_manager.children_of(&managed_snarl); + + for (const Snarl* child : children) { + if (!snarl_manager.is_trivial(child, graph)) { + emit_snarl_recursive(*child, nested_ploidy[*child], call_table); + } + } + +#ifdef debug + cerr << "Recursively emitting " << pb2json(managed_snarl) << "with ploidy " << ploidy << endl; +#endif + function&, const vector&, int, int, int)> trav_to_flat_string = + [&](const vector& travs, const vector& travs_genotype, int trav_allele, int genotype_allele, int ref_trav_idx) { + + string allele_string = trav_string(graph, travs[trav_allele]); + if (trav_allele == ref_trav_idx) { + return flatten_reference_allele(allele_string, call_table); + } else { + int allele_ploidy = std::max((int)std::count(travs_genotype.begin(), travs_genotype.end(), trav_allele), 1); + return flatten_alt_allele(allele_string, std::min(allele_ploidy-1, genotype_allele), allele_ploidy, call_table); + } + }; + + if (!gaf_output) { + emit_variant(graph, snarl_caller, managed_snarl, record.travs, genotype.first, record.ref_trav_idx, genotype.second, record.ref_path_name, + ref_offsets[record.ref_path_name], genotype_snarls, ploidy, trav_to_flat_string); + } else { + // todo: + // emit_gaf_variant(graph, snarl, travs, trav_genotype); + } + } + + return true; +} + +string NestedFlowCaller::flatten_reference_allele(const string& nested_allele, const CallTable& call_table) const { + + string flat_allele; + + scan_snarl(nested_allele, [&](const string& fragment, Snarl& snarl) { + if (!fragment.empty()) { + flat_allele += fragment; + } else { + const CallRecord& record = call_table.at(snarl); + assert(record.ref_trav_idx >= 0); + if (record.travs.empty()) { + flat_allele += "<***>"; + assert(false); + } else{ + const SnarlTraversal& traversal = record.travs[record.ref_trav_idx]; + string nested_snarl_allele = trav_string(graph, traversal); + flat_allele += flatten_reference_allele(nested_snarl_allele, call_table); + } + } + }); + + return flat_allele; +} + +string NestedFlowCaller::flatten_alt_allele(const string& nested_allele, int allele, int ploidy, const CallTable& call_table) const { + + string flat_allele; +#ifdef debug + cerr << "Flattening " << nested_allele << " at allele " << allele << endl; +#endif + scan_snarl(nested_allele, [&](const string& fragment, Snarl& snarl) { + if (!fragment.empty()) { + flat_allele += fragment; + } else { + const CallRecord& record = call_table.at(snarl); +#ifdef debug + cerr << "got record with " << record.travs.size() << " travs and " << record.genotype_by_ploidy.size() << " gts" << endl; +#endif + int fallback_allele = -1; + if (record.genotype_by_ploidy[ploidy-1].first.empty()) { + // there's no call here. but we really want to emit something, so try picking the reference + // or first allele + if (record.ref_trav_idx >= 0) { + fallback_allele = record.ref_trav_idx; + } else if (!record.travs.empty()) { + fallback_allele = 0; + } + } + if (fallback_allele >= (int)record.travs.size()) { + flat_allele += "<...>"; + } else { + // todo: passing in a single ploidy simplisitic, would need to derive from the calls when + // reucrising + // in practice, the results will nearly the same but still needs fixing + // we try to get the the allele from the genotype if possible, but fallback on the fallback_allele + int trav_allele = fallback_allele >= 0 ? 
fallback_allele : record.genotype_by_ploidy[ploidy-1].first[allele]; + const SnarlTraversal& traversal = record.travs[trav_allele]; + string nested_snarl_allele = trav_string(graph, traversal); + flat_allele += flatten_alt_allele(nested_snarl_allele, allele, ploidy, call_table); + } + } + }); + + return flat_allele; +} + + + +string NestedFlowCaller::vcf_header(const PathHandleGraph& graph, const vector& contigs, + const vector& contig_length_overrides) const { + string header = VCFOutputCaller::vcf_header(graph, contigs, contig_length_overrides); + header += "##FORMAT=\n"; + snarl_caller.update_vcf_header(header); + header += "##FILTER=\n"; + header += "##SAMPLE=\n"; + header += "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + sample_name; + assert(output_vcf.openForOutput(header)); + header += "\n"; + return header; +} + + +SnarlGraph::SnarlGraph(const HandleGraph* backing_graph, SnarlManager& snarl_manager, vector snarls) : + backing_graph(backing_graph), + snarl_manager(snarl_manager) { + for (const Snarl* snarl : snarls) { + if (!snarl_manager.is_trivial(snarl, *backing_graph)) { + this->snarls[backing_graph->get_handle(snarl->start().node_id(), snarl->start().backward())] = + make_pair(backing_graph->get_handle(snarl->end().node_id(), snarl->end().backward()), true); + this->snarls[backing_graph->get_handle(snarl->end().node_id(), !snarl->end().backward())] = + make_pair(backing_graph->get_handle(snarl->start().node_id(), !snarl->start().backward()), false); + } + } +} + +pair SnarlGraph::node_to_snarl(handle_t handle) const { + auto i = snarls.find(handle); + if (i != snarls.end()) { + return make_pair(true, i->second.first); + } else { + return make_pair(false, handle); + } +} + +tuple SnarlGraph::edge_to_snarl_edge(edge_t edge) const { + auto i = snarls.find(edge.first); + edge_t out_edge; + handle_t out_node; + bool out_found = false; + if (i != snarls.end()) { + // edge is from snarl start to after snarl end + out_edge.first = i->second.first; + out_edge.second = edge.second; + out_node = edge.first; + out_found = true; + } else { + // reverse of above + i = snarls.find(backing_graph->flip(edge.second)); + if (i != snarls.end()) { + out_edge.first = edge.first; + out_edge.second = backing_graph->flip(i->second.first); + out_node = edge.second; + out_found = true; + } + } + // note that we only have those two cases since our internal map contains + // both orientations of the snarl. 
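+    // Illustration with hypothetical ids: for a child snarl >12<17 the constructor
+    // stored get_handle(12, false) -> {get_handle(17, false), true} for the forward
+    // orientation and get_handle(17, true) -> {get_handle(12, true), false} for the
+    // reverse one, so the lookup above succeeds from either end of the snarl.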
+ + return make_tuple(out_found, out_node, out_edge); +} + +void SnarlGraph::embed_snarl(Visit& visit) { + handle_t handle = backing_graph->get_handle(visit.node_id(), visit.backward()); + auto it = snarls.find(handle); + if (it != snarls.end()) { + // edit the Visit in place to replace id, with the full snarl + Snarl* snarl = visit.mutable_snarl(); + snarl->mutable_start()->set_node_id(visit.node_id()); + snarl->mutable_start()->set_backward(visit.backward()); + handle_t other = it->second.first; + snarl->mutable_end()->set_node_id(backing_graph->get_id(other)); + snarl->mutable_end()->set_backward(backing_graph->get_is_reverse(other)); + if (it->second.second == false) { + // put the snarl in an orientation consisten with other indexes + swap(*snarl->mutable_start(), *snarl->mutable_end()); + snarl->mutable_start()->set_backward(!snarl->start().backward()); + snarl->mutable_end()->set_backward(!snarl->end().backward()); + } + visit.set_node_id(0); + } +} + +void SnarlGraph::embed_snarls(SnarlTraversal& traversal) { + for (size_t i = 0; i < traversal.visit_size(); ++i) { + Visit& visit = *traversal.mutable_visit(i); + if (visit.node_id() > 0) { + embed_snarl(visit); + } + } +} + +void SnarlGraph::embed_ref_path_snarls(SnarlTraversal& traversal) { + vector out_trav; + size_t snarl_count = 0; + bool in_snarl = false; + handle_t snarl_end; + for (size_t i = 0; i < traversal.visit_size(); ++i) { + Visit& visit = *traversal.mutable_visit(i); + handle_t handle = backing_graph->get_handle(visit.node_id(), visit.backward()); + if (in_snarl) { + // nothing to do if we're in a snarl except check for the end and come out + if (handle == snarl_end) { + in_snarl = false; + } + } else { + // if we're not in a snarl, check for a new one + auto it = snarls.find(handle); + if (it != snarls.end()) { + embed_snarl(visit); + snarl_end = it->second.first; + in_snarl = true; + ++snarl_count; + } + out_trav.push_back(visit); + } + } + + // switch in the updated traversal + if (snarl_count > 0) { + traversal.clear_visit(); + for (Visit& visit : out_trav) { + *traversal.add_visit() = visit; + } + } +} + +bool SnarlGraph::follow_edges_impl(const handle_t& handle, bool go_left, const std::function& iteratee) const { + if (!go_left) { + auto i = snarls.find(handle); + if (i == snarls.end()) { + return backing_graph->follow_edges(handle, go_left, iteratee); + } else { + return backing_graph->follow_edges(i->second.first, go_left, iteratee); + } + } else { + return this->follow_edges_impl(backing_graph->flip(handle), !go_left, iteratee); + } +} + +// a lot of these don't strictly make sense. ex, we would want has_node to +// hide stuff inside snarls. but... we don't want to pay the cost of maintining +// structures for functions that aren't used.. 
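+// The one behavioural override is follow_edges_impl above. Example with made-up ids:
+// if node 12 starts a child snarl that ends at node 17, follow_edges(get_handle(12),
+// false, iteratee) on this overlay reports the edges leaving node 17 in the backing
+// graph, so traversal finders hop from 12 past the snarl without entering it.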
+bool SnarlGraph::has_node(nid_t node_id) const { + return backing_graph->has_node(node_id); +} +handle_t SnarlGraph::get_handle(const nid_t& node_id, bool is_reverse) const { + return backing_graph->get_handle(node_id, is_reverse); +} +nid_t SnarlGraph::get_id(const handle_t& handle) const { + return backing_graph->get_id(handle); +} +bool SnarlGraph::get_is_reverse(const handle_t& handle) const { + return backing_graph->get_is_reverse(handle); +} +handle_t SnarlGraph::flip(const handle_t& handle) const { + return backing_graph->flip(handle); +} +size_t SnarlGraph::get_length(const handle_t& handle) const { + return backing_graph->get_length(handle); +} +std::string SnarlGraph::get_sequence(const handle_t& handle) const { + return backing_graph->get_sequence(handle); +} +size_t SnarlGraph::get_node_count() const { + return backing_graph->get_node_count(); +} +nid_t SnarlGraph::min_node_id() const { + return backing_graph->min_node_id(); +} +nid_t SnarlGraph::max_node_id() const { + return backing_graph->max_node_id(); +} +bool SnarlGraph::for_each_handle_impl(const std::function& iteratee, bool parallel) const { + return backing_graph->for_each_handle(iteratee, parallel); +} + +} + diff --git a/src/graph_caller.hpp b/src/graph_caller.hpp new file mode 100644 index 00000000000..25d719aeb3d --- /dev/null +++ b/src/graph_caller.hpp @@ -0,0 +1,581 @@ +#ifndef VG_GRAPH_CALLER_HPP_INCLUDED +#define VG_GRAPH_CALLER_HPP_INCLUDED + +#include +#include +#include +#include +#include +#include +#include +#include "handle.hpp" +#include "snarls.hpp" +#include "traversal_finder.hpp" +#include "snarl_caller.hpp" +#include "region.hpp" +#include "zstdutil.hpp" +#include "vg/io/alignment_emitter.hpp" + +namespace vg { + +using namespace std; + +using vg::io::AlignmentEmitter; + +/** + * GraphCaller: Use the snarl decomposition to call snarls in a graph + */ +class GraphCaller { +public: + + enum RecurseType { RecurseOnFail, RecurseAlways, RecurseNever }; + + GraphCaller(SnarlCaller& snarl_caller, + SnarlManager& snarl_manager); + + virtual ~GraphCaller(); + + /// Run call_snarl() on every top-level snarl in the manager. + /// For any that return false, try the children, etc. (when recurse_on_fail true) + /// Snarls are processed in parallel + virtual void call_top_level_snarls(const HandleGraph& graph, RecurseType recurse_type = RecurseOnFail); + + /// For every chain, cut it up into pieces using max_edges and max_trivial to cap the size of each piece + /// then make a fake snarl for each chain piece and call it. 
If a fake snarl fails to call, + /// It's child chains will be recursed on (if selected)_ + virtual void call_top_level_chains(const HandleGraph& graph, + size_t max_edges, + size_t max_trivial, + RecurseType recurise_type = RecurseOnFail); + + /// Call a given snarl, and print the output to out_stream + virtual bool call_snarl(const Snarl& snarl) = 0; + +protected: + + /// Break up a chain into bits that we want to call using size heuristics + vector break_chain(const HandleGraph& graph, const Chain& chain, size_t max_edges, size_t max_trivial); + +protected: + + /// Our Genotyper + SnarlCaller& snarl_caller; + + /// Our snarls + SnarlManager& snarl_manager; +}; + +/** + * Helper class that vcf writers can inherit from to for some common code to output sorted VCF + */ +class VCFOutputCaller { +public: + VCFOutputCaller(const string& sample_name); + + virtual ~VCFOutputCaller(); + + /// Write the vcf header (version and contigs and basic info) + virtual string vcf_header(const PathHandleGraph& graph, const vector& contigs, + const vector& contig_length_overrides) const; + + /// Add a variant to our buffer + void add_variant(vcflib::Variant& var) const; + + /// Sort then write variants in the buffer + /// snarl_manager needed if include_nested is true + void write_variants(ostream& out_stream, const SnarlManager* snarl_manager = nullptr); + + /// Run vcffixup from vcflib + void vcf_fixup(vcflib::Variant& var) const; + + /// Add a translation map + void set_translation(const unordered_map>* translation); + + /// Assume writing nested snarls is enabled + void set_nested(bool nested); + +protected: + + /// add a traversal to the VCF info field in the format of a GFA W-line or GAF path + void add_allele_path_to_info(vcflib::Variant& v, int allele, const SnarlTraversal& trav, bool reversed, bool one_based) const; + + /// convert a traversal into an allele string + string trav_string(const HandleGraph& graph, const SnarlTraversal& trav) const; + + /// print a vcf variant + void emit_variant(const PathPositionHandleGraph& graph, SnarlCaller& snarl_caller, + const Snarl& snarl, const vector& called_traversals, + const vector& genotype, int ref_trav_idx, const unique_ptr& call_info, + const string& ref_path_name, int ref_offset, bool genotype_snarls, int ploidy, + function&, const vector&, int, int, int)> trav_to_string = nullptr); + + /// get the interval of a snarl from our reference path using the PathPositionHandleGraph interface + /// the bool is true if the snarl's backward on the path + /// first returned value -1 if no traversal found + tuple get_ref_interval(const PathPositionHandleGraph& graph, const Snarl& snarl, + const string& ref_path_name) const; + + /// used for making gaf traversal names + pair get_ref_position(const PathPositionHandleGraph& graph, const Snarl& snarl, const string& ref_path_name, + int64_t ref_path_offset) const; + + /// clean up the alleles to not share common prefixes / suffixes + /// if len_override given, just do that many bases without thinking + void flatten_common_allele_ends(vcflib::Variant& variant, bool backward, size_t len_override) const; + + /// print a snarl in a consistent form like >3435<12222 + /// if in_brackets set to true, do (>3435<12222) instead (this is only used for nested caller) + string print_snarl(const Snarl& snarl, bool in_brackets = false) const; + + /// do the opposite of above + /// So a string that looks like AACT(>12<17)TTT would invoke the callback three times with + /// ("AACT", Snarl), ("", Snarl(12,-17)), ("TTT", Snarl(12,-17)) 
+ /// The parameters are to be treated as unions: A sequence fragment if non-empty, otherwise a snarl + void scan_snarl(const string& allele_string, function callback) const; + + // update the PS and LV tags in the output buffer (called in write_variants if include_nested is true) + void update_nesting_info_tags(const SnarlManager* snarl_manager); + + /// output vcf + mutable vcflib::VariantCallFile output_vcf; + + /// Sample name + string sample_name; + + /// output buffers (1/thread) (for sorting) + /// variants stored as strings (and position key pairs) because vcflib::Variant in-memory struct so huge + mutable vector, string>>> output_variants; + + /// print up to this many uncalled alleles when doing ref-genotpes in -a mode + size_t max_uncalled_alleles = 5; + + // optional node translation to apply to snarl names in variant IDs + const unordered_map>* translation; + + // need to write LV/PS info tags + bool include_nested; +}; + +/** + * Helper class for outputing snarl traversals as GAF + */ +class GAFOutputCaller { +public: + /// The emitter object is created and owned by external forces + GAFOutputCaller(AlignmentEmitter* emitter, const string& sample_name, const vector& ref_paths, + size_t trav_padding); + virtual ~GAFOutputCaller(); + + /// print the GAF traversals + void emit_gaf_traversals(const PathHandleGraph& graph, const string& snarl_name, + const vector& travs, + int64_t ref_trav_idx, + const string& ref_path_name, int64_t ref_path_position, + const TraversalSupportFinder* support_finder = nullptr); + + /// print the GAF genotype + void emit_gaf_variant(const PathHandleGraph& graph, const string& snarl_name, + const vector& travs, + const vector& genotype, + int64_t ref_trav_idx, + const string& ref_path_name, int64_t ref_path_position, + const TraversalSupportFinder* support_finder = nullptr); + + /// pad a traversal with (first found) reference path, adding up to trav_padding to each side + SnarlTraversal pad_traversal(const PathHandleGraph& graph, const SnarlTraversal& trav) const; + +protected: + + AlignmentEmitter* emitter; + + /// Sample name + string gaf_sample_name; + + /// Add padding from reference paths to traversals to make them at least this long + /// (only in emit_gaf_traversals(), not emit_gaf_variant) + size_t trav_padding = 0; + + /// Reference paths are used to pad out traversals. 
If there are none, then first path found is used + unordered_set ref_paths; + +}; + +/** + * VCFGenotyper : Genotype variants in a given VCF file + */ +class VCFGenotyper : public GraphCaller, public VCFOutputCaller, public GAFOutputCaller { +public: + VCFGenotyper(const PathHandleGraph& graph, + SnarlCaller& snarl_caller, + SnarlManager& snarl_manager, + vcflib::VariantCallFile& variant_file, + const string& sample_name, + const vector& ref_paths, + const vector& ref_path_ploidies, + FastaReference* ref_fasta, + FastaReference* ins_fasta, + AlignmentEmitter* aln_emitter, + bool traversals_only, + bool gaf_output, + size_t trav_padding); + + virtual ~VCFGenotyper(); + + virtual bool call_snarl(const Snarl& snarl); + + virtual string vcf_header(const PathHandleGraph& graph, const vector& contigs, + const vector& contig_length_overrides = {}) const; + +protected: + + /// get path positions bounding a set of variants + tuple get_ref_positions(const vector& variants) const; + + /// munge out the contig lengths from the VCF header + virtual unordered_map scan_contig_lengths() const; + +protected: + + /// the graph + const PathHandleGraph& graph; + + /// input VCF to genotype, must have been loaded etc elsewhere + vcflib::VariantCallFile& input_vcf; + + /// traversal finder uses alt paths to map VCF alleles from input_vcf + /// back to traversals in the snarl + VCFTraversalFinder traversal_finder; + + /// toggle whether to genotype or just output the traversals + bool traversals_only; + + /// toggle whether to output vcf or gaf + bool gaf_output; + + /// the ploidies + unordered_map path_to_ploidy; +}; + + +/** + * LegacyCaller : Preserves (most of) the old vg call logic by using + * the RepresentativeTraversalFinder to recursively find traversals + * through arbitrary sites. + */ +class LegacyCaller : public GraphCaller, public VCFOutputCaller { +public: + LegacyCaller(const PathPositionHandleGraph& graph, + SupportBasedSnarlCaller& snarl_caller, + SnarlManager& snarl_manager, + const string& sample_name, + const vector& ref_paths = {}, + const vector& ref_path_offsets = {}, + const vector& ref_path_ploidies = {}); + + virtual ~LegacyCaller(); + + virtual bool call_snarl(const Snarl& snarl); + + virtual string vcf_header(const PathHandleGraph& graph, const vector& contigs, + const vector& contig_length_overrides = {}) const; + +protected: + + /// recursively genotype a snarl + /// todo: can this be pushed to a more generic class? + pair, vector> top_down_genotype(const Snarl& snarl, TraversalFinder& trav_finder, int ploidy, + const string& ref_path_name, pair ref_interval) const; + + /// we need the reference traversal for VCF, but if the ref is not called, the above method won't find it. + SnarlTraversal get_reference_traversal(const Snarl& snarl, TraversalFinder& trav_finder) const; + + /// re-genotype output of top_down_genotype. 
it may give slightly different results as + /// it's working with fully-defined traversals and can exactly determine lengths and supports + /// it will also make sure the reference traversal is in the beginning of the output + tuple, vector, unique_ptr> re_genotype(const Snarl& snarl, + TraversalFinder& trav_finder, + const vector& in_traversals, + const vector& in_genotype, + int ploidy, + const string& ref_path_name, + pair ref_interval) const; + + /// check if a site can be handled by the RepresentativeTraversalFinder + bool is_traversable(const Snarl& snarl); + + /// look up a path index for a site and return its name too + pair find_index(const Snarl& snarl, const vector path_indexes) const; + +protected: + + /// the graph + const PathPositionHandleGraph& graph; + /// non-vg inputs are converted into vg as-needed, at least until we get the + /// traversal finding ported + bool is_vg; + + /// The old vg call traversal finder. It is fairly efficient but daunting to maintain. + /// We keep it around until a better replacement is implemented. It is *not* compatible + /// with the Handle Graph API because it relise on PathIndex. We convert to VG as + /// needed in order to use it. + RepresentativeTraversalFinder* traversal_finder; + /// Needed by above (only used when working on vg inputs -- generated on the fly otherwise) + vector path_indexes; + + /// keep track of the reference paths + vector ref_paths; + + /// keep track of offsets in the reference paths + map ref_offsets; + + /// keep track of ploidies in the reference paths + map ref_ploidies; + + /// Tuning + + /// How many nodes should we be willing to look at on our path back to the + /// primary path? Keep in mind we need to look at all valid paths (and all + /// combinations thereof) until we find a valid pair. + int max_search_depth = 1000; + /// How many search states should we allow on the DFS stack when searching + /// for traversals? + int max_search_width = 1000; + /// What's the maximum number of bubble path combinations we can explore + /// while finding one with maximum support? + size_t max_bubble_paths = 100; + +}; + + +/** + * FlowCaller : Uses any traversals finder (ex, FlowTraversalFinder) to find + * traversals, and calls those based on how much support they have. + * Should work on any graph but will not + * report cyclic traversals. Does not (yet, anyway) support nested + * calling, so the entire site is processes in one shot. + * Designed to replace LegacyCaller, as it should miss fewer obviously + * good traversals, and is not dependent on old protobuf-based structures. 
+ */ +class FlowCaller : public GraphCaller, public VCFOutputCaller, public GAFOutputCaller { +public: + FlowCaller(const PathPositionHandleGraph& graph, + SupportBasedSnarlCaller& snarl_caller, + SnarlManager& snarl_manager, + const string& sample_name, + TraversalFinder& traversal_finder, + const vector& ref_paths, + const vector& ref_path_offsets, + const vector& ref_path_ploidies, + AlignmentEmitter* aln_emitter, + bool traversals_only, + bool gaf_output, + size_t trav_padding, + bool genotype_snarls, + const pair& ref_allele_length_range); + + virtual ~FlowCaller(); + + virtual bool call_snarl(const Snarl& snarl); + + virtual string vcf_header(const PathHandleGraph& graph, const vector& contigs, + const vector& contig_length_overrides = {}) const; + +protected: + + /// the graph + const PathPositionHandleGraph& graph; + + /// the traversal finder + TraversalFinder& traversal_finder; + + /// keep track of the reference paths + vector ref_paths; + unordered_set ref_path_set; + + /// keep track of offsets in the reference paths + map ref_offsets; + + /// keep traco of the ploidies (todo: just one map for all path stuff!!) + map ref_ploidies; + + /// until we support nested snarls, cap snarl size we attempt to process + size_t max_snarl_edges = 10000; + + /// alignment emitter. if not null, traversals will be output here and + /// no genotyping will be done + AlignmentEmitter* alignment_emitter; + + /// toggle whether to genotype or just output the traversals + bool traversals_only; + + /// toggle whether to output vcf or gaf + bool gaf_output; + + /// toggle whether to genotype every snarl + /// (by default, uncalled snarls are skipped, and coordinates are flattened + /// out to minimize variant size -- this turns all that off) + bool genotype_snarls; + + /// clamp calling to reference alleles of a given length range + pair ref_allele_length_range; +}; + +class SnarlGraph; + +/** + * FlowCaller : Uses any traversals finder (ex, FlowTraversalFinder) to find + * traversals, and calls those based on how much support they have. + * Should work on any graph but will not + * report cyclic traversals. + * + * todo: this is a generalization of FlowCaller and should be able to replace it entirely after testing + * to get rid of duplicated code. 
+ */ +class NestedFlowCaller : public GraphCaller, public VCFOutputCaller, public GAFOutputCaller { +public: + NestedFlowCaller(const PathPositionHandleGraph& graph, + SupportBasedSnarlCaller& snarl_caller, + SnarlManager& snarl_manager, + const string& sample_name, + TraversalFinder& traversal_finder, + const vector& ref_paths, + const vector& ref_path_offsets, + const vector& ref_path_ploidies, + AlignmentEmitter* aln_emitter, + bool traversals_only, + bool gaf_output, + size_t trav_padding, + bool genotype_snarls); + + virtual ~NestedFlowCaller(); + + virtual bool call_snarl(const Snarl& snarl); + + virtual string vcf_header(const PathHandleGraph& graph, const vector& contigs, + const vector& contig_length_overrides = {}) const; + +protected: + + /// stuff we remember for each snarl call, to be used when genotyping its parent + struct CallRecord { + vector travs; + vector, unique_ptr>> genotype_by_ploidy; + string ref_path_name; + pair ref_path_interval; + int ref_trav_idx; // index of ref paths in CallRecord::travs + }; + typedef map CallTable; + + /// update the table of calls for each child snarl (and the input snarl) + bool call_snarl_recursive(const Snarl& managed_snarl, int ploidy, + const string& parent_ref_path_name, pair parent_ref_path_interval, + CallTable& call_table); + + /// emit the vcf of all reference-spanning snarls + /// The call_table needs to be completely resolved + bool emit_snarl_recursive(const Snarl& managed_snarl, int ploidy, + CallTable& call_table); + + /// transform the nested allele string from something like AAC<6_10>TTT to + /// a proper string by recursively resolving the nested snarls into alleles + string flatten_reference_allele(const string& nested_allele, const CallTable& call_table) const; + string flatten_alt_allele(const string& nested_allele, int allele, int ploidy, const CallTable& call_table) const; + + /// the graph + const PathPositionHandleGraph& graph; + + /// the traversal finder + TraversalFinder& traversal_finder; + + /// keep track of the reference paths + vector ref_paths; + unordered_set ref_path_set; + + /// keep track of offsets in the reference paths + map ref_offsets; + + /// keep traco of the ploidies (todo: just one map for all path stuff!!) + map ref_ploidies; + + /// until we support nested snarls, cap snarl size we attempt to process + size_t max_snarl_shallow_size = 50000; + + /// alignment emitter. if not null, traversals will be output here and + /// no genotyping will be done + AlignmentEmitter* alignment_emitter; + + /// toggle whether to genotype or just output the traversals + bool traversals_only; + + /// toggle whether to output vcf or gaf + bool gaf_output; + + /// toggle whether to genotype every snarl + /// (by default, uncalled snarls are skipped, and coordinates are flattened + /// out to minimize variant size -- this turns all that off) + bool genotype_snarls; + + /// a hook into the snarl_caller's nested support finder + NestedCachedPackedTraversalSupportFinder& nested_support_finder; +}; + + +/** Simplification of a NetGraph that ignores chains. It is designed only for + traversal finding. 
Todo: generalize NestedFlowCaller to the point where we + can remove this and use NetGraph instead */ +class SnarlGraph : virtual public HandleGraph { +public: + // note: can only deal with one snarl "level" at a time + SnarlGraph(const HandleGraph* backing_graph, SnarlManager& snarl_manager, vector snarls); + + // go from node to snarl (first val false if not a snarl) + pair node_to_snarl(handle_t handle) const; + + // go from edge to snarl (first val false if not a virtual edge) + tuple edge_to_snarl_edge(edge_t edge) const; + + // replace a snarl node with an actual snarl in the traversal + void embed_snarl(Visit& visit); + void embed_snarls(SnarlTraversal& traversal); + + // replace a refpath through the snarl with the actual snarl in the traversal + // todo: this is a bed of a hack + void embed_ref_path_snarls(SnarlTraversal& traversal); + + //////////////////////////////////////////////////////////////////////////// + // Handle-based interface (which is all identical to backing graph) + //////////////////////////////////////////////////////////////////////////// + bool has_node(nid_t node_id) const; + handle_t get_handle(const nid_t& node_id, bool is_reverse = false) const; + nid_t get_id(const handle_t& handle) const; + bool get_is_reverse(const handle_t& handle) const; + handle_t flip(const handle_t& handle) const; + size_t get_length(const handle_t& handle) const; + std::string get_sequence(const handle_t& handle) const; + size_t get_node_count() const; + nid_t min_node_id() const; + nid_t max_node_id() const; + +protected: + + bool for_each_handle_impl(const std::function& iteratee, bool parallel = false) const; + + /// this is the only function that's changed to do anything different from the backing graph: + /// it is changed to "pass through" snarls by pretending there are edges from into snarl starts out of ends and + /// vice versa. + bool follow_edges_impl(const handle_t& handle, bool go_left, const std::function& iteratee) const; + + /// the backing graph + const HandleGraph* backing_graph; + + /// the snarl manager + SnarlManager& snarl_manager; + + /// the snarls (indexed both ways). flag is true for original orientation + unordered_map> snarls; +}; + + +} + +#endif diff --git a/src/graph_synchronizer.cpp b/src/graph_synchronizer.cpp index 7e1ae31b06c..c32b1888dce 100644 --- a/src/graph_synchronizer.cpp +++ b/src/graph_synchronizer.cpp @@ -8,7 +8,17 @@ namespace vg { using namespace std; GraphSynchronizer::GraphSynchronizer(VG& graph) : graph(graph) { - // Nothing to do! + // Because in general paths can overlap each other, and because we can't + // build a path index after a path has been modified (since we don't keep + // the ranks up to date internally), we need to build all the indexes up + // front, even if we're just working on a single path. + graph.for_each_path_handle([&](const path_handle_t& path) { + string name = graph.get_path_name(path); + if (!Paths::is_alt(name)) { + // We only care about reference paths. + get_path_index(name); + } + }); } void GraphSynchronizer::with_path_index(const string& path_name, const function& to_run) { @@ -29,7 +39,10 @@ const string& GraphSynchronizer::get_path_sequence(const string& path_name) { // We need a function to grab the index for a path PathIndex& GraphSynchronizer::get_path_index(const string& path_name) { - + + // We don't work on alt paths; there could be too many to pre-index. + assert(!Paths::is_alt(path_name)); + if (!indexes.count(path_name)) { // Not already made. Generate it. 
indexes.emplace(piecewise_construct, @@ -115,7 +128,7 @@ void GraphSynchronizer::Lock::lock() { cerr << endl; } #endif - + // Make them into pos_ts that point left to right, the way Jordan thinks. pos_t left_pos = make_pos_t(start_left.node, start_left.is_end, 0); pos_t right_pos = make_pos_t(end_right.node, !end_right.is_end, @@ -130,9 +143,7 @@ void GraphSynchronizer::Lock::lock() { (past_end - start) * 2, left_pos, right_pos, - false, // Disallow terminal node cycles, so we don't duplicate nodes - true, // We don't want extraneous material that doesn't connect the positions - false); // But we don't care about being strictly less than the specified length + false); // We don't care about being strictly less than the specified length #ifdef debug cerr << "Extracted " << context.graph.node_size() << " nodes and " << context.graph.edge_size() << " edges between " << path_name << ":" << start << "-" << past_end << endl; @@ -301,12 +312,12 @@ set GraphSynchronizer::Lock::get_peripheral_attachments(NodeSide graph } } -vector GraphSynchronizer::Lock::apply_edit(const Path& path) { +vector GraphSynchronizer::Lock::apply_edit(const Path& path, size_t max_node_size) { set dangling; - return apply_edit(path, dangling); + return apply_edit(path, dangling, max_node_size); } -vector GraphSynchronizer::Lock::apply_edit(const Path& path, set& dangling) { +vector GraphSynchronizer::Lock::apply_edit(const Path& path, set& dangling, size_t max_node_size) { // Make sure we have exclusive ownership of the graph itself since we're // going to be modifying its data structures. std::lock_guard guard(synchronizer.whole_graph_lock); @@ -320,7 +331,7 @@ vector GraphSynchronizer::Lock::apply_edit(const Path& path, set translations = synchronizer.graph.edit_fast(path, dangling); + vector translations = synchronizer.graph.edit_fast(path, dangling, max_node_size); // Lock all the nodes that result from the translations. They're guaranteed // to either be nodes we already have or novel nodes with fresh IDs. @@ -347,7 +358,7 @@ vector GraphSynchronizer::Lock::apply_edit(const Path& path, set GraphSynchronizer::Lock::apply_full_length_edit(const Path& path) { +vector GraphSynchronizer::Lock::apply_full_length_edit(const Path& path, size_t max_node_size) { // Find the left and right outer nodesides of the subgraph auto ends = get_endpoints(); @@ -357,7 +368,7 @@ vector GraphSynchronizer::Lock::apply_full_length_edit(const Path& // Apply the edit, attaching its left end to the stuff attached to the left // end of the graph. Get back in the dangling set where the right end of the // edit's material is. - auto translations = apply_edit(path, dangling); + auto translations = apply_edit(path, dangling, max_node_size); // Get the places that the right end of the graph attaches to auto right_periphery = get_peripheral_attachments(ends.second); diff --git a/src/graph_synchronizer.hpp b/src/graph_synchronizer.hpp index f29e58e061d..de3fdfb35db 100644 --- a/src/graph_synchronizer.hpp +++ b/src/graph_synchronizer.hpp @@ -120,14 +120,14 @@ class GraphSynchronizer { * The set will be populated with the NodeSides for the ends of nodes * created/visited at the end of the alignment. */ - vector apply_edit(const Path& path, set& dangling); + vector apply_edit(const Path& path, set& dangling, size_t max_node_size = 1024); /** * May only be called when locked. Apply a path as an edit to the base * graph, leaving new nodes at the ends of the path unattached on their * outer sides. 
*/ - vector apply_edit(const Path& path); + vector apply_edit(const Path& path, size_t max_node_size = 1024); /** * May only be called when locked. Apply a path as an edit to the base @@ -139,7 +139,7 @@ class GraphSynchronizer { * The alignment must be in the local forward orientation of the graph * for this to make sense. */ - vector apply_full_length_edit(const Path& path); + vector apply_full_length_edit(const Path& path, size_t max_node_size = 1024); protected: diff --git a/src/gssw_aligner.cpp b/src/gssw_aligner.cpp deleted file mode 100644 index 217111a4b4f..00000000000 --- a/src/gssw_aligner.cpp +++ /dev/null @@ -1,1633 +0,0 @@ -#include "gssw_aligner.hpp" -#include "xdrop_aligner.hpp" -#include "json2pb.h" - -static const double quality_scale_factor = 10.0 / log(10.0); -static const double exp_overflow_limit = log(std::numeric_limits::max()); - -using namespace vg; -using namespace std; - -BaseAligner::~BaseAligner(void) { - free(nt_table); - free(score_matrix); -} - -gssw_graph* BaseAligner::create_gssw_graph(Graph& g) { - - // add a dummy sink node if we're pinning - gssw_graph* graph = gssw_graph_create(g.node_size()); - unordered_map nodes; - - for (int i = 0; i < g.node_size(); ++i) { - Node* n = g.mutable_node(i); - // switch any non-ATGCN characters from the node sequence to N - auto cleaned_seq = nonATGCNtoN(n->sequence()); - gssw_node* node = (gssw_node*)gssw_node_create(n, n->id(), - cleaned_seq.c_str(), - nt_table, - score_matrix); - nodes[n->id()] = node; - gssw_graph_add_node(graph, node); - } - - for (int i = 0; i < g.edge_size(); ++i) { - // Convert all the edges - Edge* e = g.mutable_edge(i); - if(!e->from_start() && !e->to_end()) { - // This is a normal end to start edge. - gssw_nodes_add_edge(nodes[e->from()], nodes[e->to()]); - } else if(e->from_start() && e->to_end()) { - // This is a start to end edge, but isn't reversing and can be converted to a normal end to start edge. - - // Flip the start and end - gssw_nodes_add_edge(nodes[e->to()], nodes[e->from()]); - } else { - // TODO: It's a reversing edge, which gssw doesn't support yet. What - // we should really do is do a topological sort to break cycles, and - // then flip everything at the lower-rank end of this edge around, - // so we don't have to deal with its reversing-ness. But for now we - // just die so we don't get nonsense into gssw. -#pragma omp critical - { - // We need the critical section so we don't throw uncaught - // exceptions in multiple threads at once, leading to C++ trying - // to run termiante in parallel. This doesn't make it safe, just - // slightly safer. - cerr << "Can't gssw over reversing edge " <from() << (e->from_start() ? " start" : " end") << " -> " - << e->to() << (e->to_end() ? " end" : " start") << endl; - // TODO: there's no safe way to kill the program without a way - // to signal the master to do it, via a shared variable in the - // clause that made us parallel. 
- } - exit(1); - } - } - - return graph; - -} - -void BaseAligner::load_scoring_matrix(istream& matrix_stream) { - if(score_matrix) free(score_matrix); - score_matrix = (int8_t*)calloc(25, sizeof(int8_t)); - for(size_t i=0; i<25; i++){ - if(!matrix_stream.good()){ - std::cerr << "error: vg BaseAligner::load_scoring_matrix requires a 5x5 whitespace separated integer matrix\n"; - throw ""; - } - int score; - matrix_stream >> score; - if(score > 127 || score < -127){ - std::cerr << "error: vg BaseAligner::load_scoring_matrix requires values in the range [-127,127]\n"; - throw ""; - } - score_matrix[i] = score; - } -} - -void BaseAligner::gssw_mapping_to_alignment(gssw_graph* graph, - gssw_graph_mapping* gm, - Alignment& alignment, - bool pinned, - bool pin_left, - bool print_score_matrices) { - alignment.clear_path(); - alignment.set_score(gm->score); - alignment.set_query_position(0); - Path* path = alignment.mutable_path(); - //alignment.set_cigar(graph_cigar(gm)); - - gssw_graph_cigar* gc = &gm->cigar; - gssw_node_cigar* ncs = gc->elements; - //cerr << "gm->position " << gm->position << endl; - string& to_seq = *alignment.mutable_sequence(); - //cerr << "-------------" << endl; - - if (print_score_matrices) { - gssw_graph_print_score_matrices(graph, to_seq.c_str(), to_seq.size(), stderr); - //cerr << alignment.DebugString() << endl; - } - - int to_pos = 0; - int from_pos = gm->position; - - for (int i = 0; i < gc->length; ++i) { - // check that the current alignment has a non-zero length - gssw_cigar* c = ncs[i].cigar; - int l = c->length; - if (l == 0) continue; - gssw_cigar_element* e = c->elements; - - Node* from_node = (Node*) ncs[i].node->data; - string& from_seq = *from_node->mutable_sequence(); - Mapping* mapping = path->add_mapping(); - - if (i > 0) { - // reset for each node after the first - from_pos = 0; - } - - mapping->mutable_position()->set_node_id(ncs[i].node->id); - mapping->mutable_position()->set_offset(from_pos); - mapping->set_rank(path->mapping_size()); - - //cerr << from_node->id() << ":" << endl; - - for (int j=0; j < l; ++j, ++e) { - int32_t length = e->length; - //cerr << e->length << e->type << endl; - - Edit* edit; - switch (e->type) { - case 'M': - case 'X': - case 'N': { - // do the sequences match? 
- // emit a stream of "SNPs" and matches - int h = from_pos; - int last_start = from_pos; - int k = to_pos; - for ( ; h < from_pos + length; ++h, ++k) { - //cerr << h << ":" << k << " " << from_seq[h] << " " << to_seq[k] << endl; - if (from_seq[h] != to_seq[k]) { - // emit the last "match" region - if (h - last_start > 0) { - edit = mapping->add_edit(); - edit->set_from_length(h-last_start); - edit->set_to_length(h-last_start); - } - // set up the SNP - edit = mapping->add_edit(); - edit->set_from_length(1); - edit->set_to_length(1); - edit->set_sequence(to_seq.substr(k,1)); - last_start = h+1; - } - } - // handles the match at the end or the case of no SNP - if (h - last_start > 0) { - edit = mapping->add_edit(); - edit->set_from_length(h-last_start); - edit->set_to_length(h-last_start); - } - to_pos += length; - from_pos += length; - } break; - case 'D': - edit = mapping->add_edit(); - edit->set_from_length(length); - edit->set_to_length(0); - from_pos += length; - break; - case 'I': - edit = mapping->add_edit(); - edit->set_from_length(0); - edit->set_to_length(length); - edit->set_sequence(to_seq.substr(to_pos, length)); - to_pos += length; - break; - case 'S': - // note that soft clips and insertions are semantically equivalent - // and can only be differentiated by their position in the read - // with soft clips coming at the start or end - edit = mapping->add_edit(); - edit->set_from_length(0); - edit->set_to_length(length); - edit->set_sequence(to_seq.substr(to_pos, length)); - to_pos += length; - break; - default: - cerr << "error:[Aligner::gssw_mapping_to_alignment] " - << "unsupported cigar op type " << e->type << endl; - exit(1); - break; - - } - } - } - - // compute and set identity - alignment.set_identity(identity(alignment.path())); -} - -void BaseAligner::reverse_graph(Graph& g, Graph& reversed_graph_out) { - if (reversed_graph_out.node_size()) { - cerr << "error:[Aligner::reverse_graph] output graph is not empty" << endl; - exit(EXIT_FAILURE); - } - - // add reversed nodes in reverse order (Graphs come in topologically sorted and gssw - // depends on this fact) - for (int64_t i = g.node_size() - 1; i >= 0; i--) { - const Node& original_node = g.node(i); - - Node* reversed_node = reversed_graph_out.add_node(); - - // reverse the sequence - string* reversed_node_seq = reversed_node->mutable_sequence(); - reversed_node_seq->resize(original_node.sequence().length()); - reverse_copy(original_node.sequence().begin(), original_node.sequence().end(), reversed_node_seq->begin()); - - // preserve ids for easier translation - reversed_node->set_id(original_node.id()); - } - - // add reversed edges - for (int64_t i = 0; i < g.edge_size(); i++) { - const Edge& original_edge = g.edge(i); - - Edge* reversed_edge = reversed_graph_out.add_edge(); - - // reverse edge orientation - reversed_edge->set_from(original_edge.to()); - reversed_edge->set_to(original_edge.from()); - - // we will swap the 5'/3' labels of the node ends after reversing the sequence so that - // an edge leaving an end now enters a beginning and an edge entering a beginning now - // leaves an end - reversed_edge->set_from_start(original_edge.to_end()); - reversed_edge->set_to_end(original_edge.from_start()); - } - -} - -void BaseAligner::unreverse_graph(Graph& graph) { - // this is only for getting correct reference-relative edits, so we can get away with only - // reversing the sequences and not paying attention to the edges - for (int64_t i = 0; i < graph.node_size(); i++) { - Node* node = graph.mutable_node(i); - 
string* node_seq = node->mutable_sequence(); - reverse(node_seq->begin(), node_seq->end()); - } -} - -void BaseAligner::unreverse_graph_mapping(gssw_graph_mapping* gm) { - - gssw_graph_cigar* graph_cigar = &(gm->cigar); - gssw_node_cigar* node_cigars = graph_cigar->elements; - - // reverse the order of the node cigars - int32_t num_switching_nodes = graph_cigar->length / 2; - int32_t last_idx = graph_cigar->length - 1; - for (int32_t i = 0; i < num_switching_nodes; i++) { - std::swap(node_cigars[i], node_cigars[last_idx - i]); - } - - // reverse the actual cigar string for each node cigar - for (int32_t i = 0; i < graph_cigar->length; i++) { - gssw_cigar* node_cigar = node_cigars[i].cigar; - gssw_cigar_element* elements = node_cigar->elements; - - int32_t num_switching_elements = node_cigar->length / 2; - last_idx = node_cigar->length - 1; - for (int32_t j = 0; j < num_switching_elements; j++) { - std::swap(elements[j], elements[last_idx - j]); - } - } - - // compute the position in the first node - if (graph_cigar->length > 0) { - gssw_cigar_element* first_node_elements = node_cigars[0].cigar->elements; - int32_t num_first_node_elements = node_cigars[0].cigar->length; - uint32_t num_ref_aligned = 0; // the number of characters on the node sequence that are aligned - for (int32_t i = 0; i < num_first_node_elements; i++) { - switch (first_node_elements[i].type) { - case 'M': - case 'X': - case 'N': - case 'D': - num_ref_aligned += first_node_elements[i].length; - break; - - } - } - gm->position = node_cigars[0].node->len - num_ref_aligned - (graph_cigar->length == 1 ? gm->position : 0); - } - else { - gm->position = 0; - } -} - -string BaseAligner::graph_cigar(gssw_graph_mapping* gm) { - stringstream s; - gssw_graph_cigar* gc = &gm->cigar; - gssw_node_cigar* nc = gc->elements; - int to_pos = 0; - int from_pos = gm->position; - //string& to_seq = *alignment.mutable_sequence(); - s << from_pos << '@'; - for (int i = 0; i < gc->length; ++i, ++nc) { - if (i > 0) from_pos = 0; // reset for each node after the first - Node* from_node = (Node*) nc->node->data; - s << from_node->id() << ':'; - gssw_cigar* c = nc->cigar; - int l = c->length; - gssw_cigar_element* e = c->elements; - for (int j=0; j < l; ++j, ++e) { - s << e->length << e->type; - } - if (i + 1 < gc->length) { - s << ","; - } - } - return s.str(); -} - -void BaseAligner::init_mapping_quality(double gc_content) { - log_base = gssw_dna_recover_log_base(match, mismatch, gc_content, 1e-12); -} - -int32_t BaseAligner::score_gap(size_t gap_length) { - return gap_length ? -gap_open - (gap_length - 1) * gap_extension : 0; -} - -double BaseAligner::maximum_mapping_quality_exact(vector& scaled_scores, size_t* max_idx_out) { - - // if necessary, assume a null alignment of 0.0 for comparison since this is local - bool padded = false; - if (scaled_scores.size() == 1) { - scaled_scores.push_back(0.0); - padded = true; - } - - // work in log transformed values to avoid risk of overflow - double log_sum_exp = numeric_limits::lowest(); - double max_score = numeric_limits::lowest(); - // go in reverse order because this has fewer numerical problems when the scores are sorted (as usual) - for (int64_t i = scaled_scores.size() - 1; i >= 0; i--) { - log_sum_exp = add_log(log_sum_exp, scaled_scores[i]); - if (scaled_scores[i] >= max_score) { - // Since we are going in reverse order, make sure to break ties in favor of the earlier item. 
- *max_idx_out = i; - max_score = scaled_scores[i]; - } - } - - if (padded && *max_idx_out == 1) { - // Force us not to try to return the injected 0 as the winner. - // TODO: doesn't this mean the score is negative? - cerr << "warning:[BaseAligner::maximum_mapping_quality_exact]: Max score of " << max_score - << " is the padding score; changing to " << scaled_scores[0] << endl; - max_score = scaled_scores[0]; - *max_idx_out = 0; - } - - double direct_mapq = -quality_scale_factor * subtract_log(0.0, max_score - log_sum_exp); - return std::isinf(direct_mapq) ? (double) numeric_limits::max() : direct_mapq; -} - -// TODO: this algorithm has numerical problems that would be difficult to solve without increasing the -// time complexity: adding the probability of the maximum likelihood tends to erase the contribution -// of the other terms so that when you subtract them off you get scores of 0 or infinity - -//vector Aligner::all_mapping_qualities_exact(vector& scaled_scores) { -// -// double max_score = *max_element(scaled_scores.begin(), scaled_scores.end()); -// size_t size = scaled_scores.size(); -// -// vector mapping_qualities(size); -// -// if (max_score * size < exp_overflow_limit) { -// // no risk of double overflow, sum exp directly (half as many transcendental function evals) -// vector exp_scaled_scores(size); -// for (size_t i = 0; i < size; i++) { -// exp_scaled_scores[i] = exp(scaled_scores[i]); -// } -// double denom = std::accumulate(exp_scaled_scores.begin(), exp_scaled_scores.end(), 0.0); -// for (size_t i = 0; i < size; i++) { -// mapping_qualities[i] = -10.0 * log10((denom - exp_scaled_scores[i]) / denom); -// } -// } -// else { -// // work in log transformed valued to avoid risk of overflow -// double log_sum_exp = scaled_scores[0]; -// for (size_t i = 1; i < size; i++) { -// log_sum_exp = add_log(log_sum_exp, scaled_scores[i]); -// } -// for (size_t i = 0; i < size; i++) { -// mapping_qualities[i] = -10.0 * log10(1.0 - exp(scaled_scores[i] - log_sum_exp)); -// } -// } -// return mapping_qualities; -//} - -double BaseAligner::maximum_mapping_quality_approx(vector& scaled_scores, size_t* max_idx_out) { - - // if necessary, assume a null alignment of 0.0 for comparison since this is local - bool padded = false; - if (scaled_scores.size() == 1) { - scaled_scores.push_back(0.0); - padded = true; - } - - double max_score = scaled_scores[0]; - size_t max_idx = 0; - - double next_score = std::numeric_limits::lowest(); - int32_t next_count = 0; - - for (int32_t i = 1; i < scaled_scores.size(); ++i) { - double score = scaled_scores[i]; - if (score > max_score) { - if (next_score == max_score) { - next_count++; - } - else { - next_score = max_score; - next_count = 1; - } - max_score = score; - max_idx = i; - } - else if (score > next_score) { - next_score = score; - next_count = 1; - } - else if (score == next_score) { - next_count++; - } - } - - if (padded && max_idx == 1) { - // Force us not to try to return the injected 0 as the winner. - // TODO: doesn't this mean the score is negative? - cerr << "warning:[BaseAligner::maximum_mapping_quality_approx]: Max score of " << max_score - << " is the padding score; changing to " << scaled_scores[0] << endl; - max_score = scaled_scores[0]; - max_idx = 0; - } - - *max_idx_out = max_idx; - - return max(0.0, quality_scale_factor * (max_score - next_score - (next_count > 1 ? 
log(next_count) : 0.0))); -} - -double BaseAligner::group_mapping_quality_exact(vector& scaled_scores, vector& group) { - - // if necessary, assume a null alignment of 0.0 for comparison since this is local - if (scaled_scores.size() == 1) { - scaled_scores.push_back(0.0); - } - - // work in log transformed values to avoid risk of overflow - double total_log_sum_exp = numeric_limits::lowest(); - double non_group_log_sum_exp = numeric_limits::lowest(); - - // go in reverse order because this has fewer numerical problems when the scores are sorted (as usual) - int64_t group_idx = group.size() - 1; - for (int64_t i = scaled_scores.size() - 1; i >= 0; i--) { - total_log_sum_exp = add_log(total_log_sum_exp, scaled_scores[i]); - if (group_idx >= 0 ? i == group[group_idx] : false) { - group_idx--; - } - else { - non_group_log_sum_exp = add_log(non_group_log_sum_exp, scaled_scores[i]); - } - } - double direct_mapq = quality_scale_factor * (total_log_sum_exp - non_group_log_sum_exp); - return (std::isinf(direct_mapq) || direct_mapq > numeric_limits::max()) ? - (double) numeric_limits::max() : direct_mapq; -} - -void BaseAligner::compute_mapping_quality(vector& alignments, - int max_mapping_quality, - bool fast_approximation, - double cluster_mq, - bool use_cluster_mq, - int overlap_count, - double mq_estimate, - double maybe_mq_threshold, - double identity_weight) { - - if (log_base <= 0.0) { - cerr << "error:[Aligner] must call init_mapping_quality before computing mapping qualities" << endl; - exit(EXIT_FAILURE); - } - - if (alignments.empty()) { - return; - } - - vector scaled_scores(alignments.size()); - for (size_t i = 0; i < alignments.size(); i++) { - scaled_scores[i] = log_base * alignments[i].score(); - } - - double mapping_quality; - size_t max_idx; - if (!fast_approximation) { - mapping_quality = maximum_mapping_quality_exact(scaled_scores, &max_idx); - } - else { - mapping_quality = maximum_mapping_quality_approx(scaled_scores, &max_idx); - } - - if (use_cluster_mq) { - mapping_quality = prob_to_phred(sqrt(phred_to_prob(cluster_mq + mapping_quality))); - } - - if (overlap_count) { - mapping_quality -= quality_scale_factor * log(overlap_count); - } - - auto& max_aln = alignments.at(max_idx); - int l = max(alignment_to_length(max_aln), alignment_from_length(max_aln)); - double identity = 1. - (double)(l * match - max_aln.score()) / (match + mismatch) / l; - - mapping_quality /= 2; - - mapping_quality *= pow(identity, identity_weight); - - if (mq_estimate < maybe_mq_threshold && mq_estimate < mapping_quality) { - mapping_quality = prob_to_phred(sqrt(phred_to_prob(mq_estimate + mapping_quality))); - } - - if (mapping_quality > max_mapping_quality) { - mapping_quality = max_mapping_quality; - } - - if (alignments[max_idx].score() == 0) { - mapping_quality = 0; - } - - alignments[max_idx].set_mapping_quality(max(0, (int32_t) round(mapping_quality))); - for (int i = 1; i < alignments.size(); ++i) { - alignments[0].add_secondary_score(alignments[i].score()); - } -} - -int32_t BaseAligner::compute_mapping_quality(vector& scores, bool fast_approximation) { - - vector scaled_scores(scores.size(), 0.0); - for (size_t i = 0; i < scores.size(); i++) { - scaled_scores[i] = log_base * scores[i]; - } - size_t idx; - return (int32_t) (fast_approximation ? 
maximum_mapping_quality_approx(scaled_scores, &idx) - : maximum_mapping_quality_exact(scaled_scores, &idx)); -} - -int32_t BaseAligner::compute_group_mapping_quality(vector& scores, vector& group) { - - // ensure that group is in sorted order as following function expects - if (!is_sorted(group.begin(), group.end())) { - sort(group.begin(), group.end()); - } - - vector scaled_scores(scores.size(), 0.0); - for (size_t i = 0; i < scores.size(); i++) { - scaled_scores[i] = log_base * scores[i]; - } - return group_mapping_quality_exact(scaled_scores, group); -} - -void BaseAligner::compute_paired_mapping_quality(pair, vector>& alignment_pairs, - const vector& frag_weights, - int max_mapping_quality1, - int max_mapping_quality2, - bool fast_approximation, - double cluster_mq, - bool use_cluster_mq, - int overlap_count1, - int overlap_count2, - double mq_estimate1, - double mq_estimate2, - double maybe_mq_threshold, - double identity_weight) { - - if (log_base <= 0.0) { - cerr << "error:[Aligner] must call init_mapping_quality before computing mapping qualities" << endl; - exit(EXIT_FAILURE); - } - - size_t size = min(alignment_pairs.first.size(), - alignment_pairs.second.size()); - - if (size == 0) { - return; - } - - vector scaled_scores(size); - - for (size_t i = 0; i < size; i++) { - auto& aln1 = alignment_pairs.first[i]; - auto& aln2 = alignment_pairs.second[i]; - scaled_scores[i] = log_base * (aln1.score() + aln2.score()); - // + frag_weights[i]); - // ^^^ we could also incorporate the fragment weights, but this does not seem to help performance in the current form - } - - size_t max_idx; - double mapping_quality; - if (!fast_approximation) { - mapping_quality = maximum_mapping_quality_exact(scaled_scores, &max_idx); - } - else { - mapping_quality = maximum_mapping_quality_approx(scaled_scores, &max_idx); - } - - if (use_cluster_mq) { - mapping_quality = prob_to_phred(sqrt(phred_to_prob(cluster_mq + mapping_quality))); - } - - double mapping_quality1 = mapping_quality; - double mapping_quality2 = mapping_quality; - - if (overlap_count1) { - mapping_quality1 -= quality_scale_factor * log(overlap_count1); - } - if (overlap_count2) { - mapping_quality2 -= quality_scale_factor * log(overlap_count2); - } - - auto& max_aln1 = alignment_pairs.first.at(max_idx); - int len1 = max(alignment_to_length(max_aln1), alignment_from_length(max_aln1)); - double identity1 = 1. - (double)(len1 * match - max_aln1.score()) / (match + mismatch) / len1; - auto& max_aln2 = alignment_pairs.second.at(max_idx); - int len2 = max(alignment_to_length(max_aln2), alignment_from_length(max_aln2)); - double identity2 = 1. 
- (double)(len2 * match - max_aln2.score()) / (match + mismatch) / len2; - - mapping_quality1 /= 2; - mapping_quality2 /= 2; - - mapping_quality1 *= pow(identity1, identity_weight); - mapping_quality2 *= pow(identity2, identity_weight); - - double mq_estimate = min(mq_estimate1, mq_estimate2); - if (mq_estimate < maybe_mq_threshold && mq_estimate < mapping_quality1) { - mapping_quality1 = prob_to_phred(sqrt(phred_to_prob(mq_estimate + mapping_quality1))); - } - if (mq_estimate < maybe_mq_threshold && mq_estimate < mapping_quality2) { - mapping_quality2 = prob_to_phred(sqrt(phred_to_prob(mq_estimate + mapping_quality2))); - } - - if (mapping_quality1 > max_mapping_quality1) { - mapping_quality1 = max_mapping_quality1; - } - if (mapping_quality2 > max_mapping_quality2) { - mapping_quality2 = max_mapping_quality2; - } - - if (alignment_pairs.first[max_idx].score() == 0) { - mapping_quality1 = 0; - } - if (alignment_pairs.second[max_idx].score() == 0) { - mapping_quality2 = 0; - } - - mapping_quality = max(0, (int32_t)round(min(mapping_quality1, mapping_quality2))); - - alignment_pairs.first[max_idx].set_mapping_quality(mapping_quality); - alignment_pairs.second[max_idx].set_mapping_quality(mapping_quality); - - for (int i = 1; i < alignment_pairs.first.size(); ++i) { - alignment_pairs.first[0].add_secondary_score(alignment_pairs.first[i].score()); - } - for (int i = 1; i < alignment_pairs.second.size(); ++i) { - alignment_pairs.second[0].add_secondary_score(alignment_pairs.second[i].score()); - } - -} - -double BaseAligner::mapping_quality_score_diff(double mapping_quality) const { - return mapping_quality / (quality_scale_factor * log_base); -} - -double BaseAligner::estimate_next_best_score(int length, double min_diffs) { - return ((length - min_diffs) * match - min_diffs * mismatch); -} - -double BaseAligner::max_possible_mapping_quality(int length) { - double max_score = log_base * length * match; - vector v = { max_score }; - size_t max_idx; - return maximum_mapping_quality_approx(v, &max_idx); -} - -double BaseAligner::estimate_max_possible_mapping_quality(int length, double min_diffs, double next_min_diffs) { - double max_score = log_base * ((length - min_diffs) * match - min_diffs * mismatch); - double next_max_score = log_base * ((length - next_min_diffs) * match - next_min_diffs * mismatch); - vector v = { max_score, next_max_score }; - size_t max_idx; - return maximum_mapping_quality_approx(v, &max_idx); -} - -double BaseAligner::score_to_unnormalized_likelihood_ln(double score) { - // Log base needs to be set, or this can't work. It's set by default in - // QualAdjAligner but needs to be set up manually in the normal Aligner. - assert(log_base != 0); - // Likelihood is proportional to e^(lambda * score), so ln is just the exponent. - return log_base * score; -} - -size_t BaseAligner::longest_detectable_gap(const Alignment& alignment, const string::const_iterator& read_pos) const { - // algebraic solution for when score is > 0 assuming perfect match other than gap - int64_t overhang_length = min(read_pos - alignment.sequence().begin(), alignment.sequence().end() - read_pos); - int64_t numer = match * overhang_length + full_length_bonus; - int64_t gap_length = (numer - gap_open) / gap_extension + 1; - return gap_length >= 0 && overhang_length > 0 ? 
gap_length : 0; -} - -size_t BaseAligner::longest_detectable_gap(const Alignment& alignment) const { - // longest detectable gap across entire read is in the middle - return longest_detectable_gap(alignment, alignment.sequence().begin() + (alignment.sequence().size() / 2)); - -} - -int32_t BaseAligner::score_gappy_alignment(const Alignment& aln, const function& estimate_distance, - bool strip_bonuses) const { - - int score = 0; - int read_offset = 0; - auto& path = aln.path(); - - // We keep track of whether the last edit was a deletion for coalescing - // adjacent deletions across node boundaries - bool last_was_deletion = false; - - for (int i = 0; i < path.mapping_size(); ++i) { - // For each mapping - auto& mapping = path.mapping(i); - for (int j = 0; j < mapping.edit_size(); ++j) { - // For each edit in the mapping - auto& edit = mapping.edit(j); - - // Score the edit according to its type - if (edit_is_match(edit)) { - score += score_exact_match(aln, read_offset, edit.to_length()); - last_was_deletion = false; - } else if (edit_is_sub(edit)) { - score -= mismatch * edit.sequence().size(); - last_was_deletion = false; - } else if (edit_is_deletion(edit)) { - if (last_was_deletion) { - // No need to charge a gap open - score -= edit.from_length() * gap_extension; - } else { - // We need a gap open - score -= edit.from_length() ? gap_open + (edit.from_length() - 1) * gap_extension : 0; - } - - if (edit.from_length()) { - // We already charged a gap open - last_was_deletion = true; - } - // If there's a 0-length deletion, leave the last_was_deletion flag unchanged. - } else if (edit_is_insertion(edit) && !((i == 0 && j == 0) || - (i == path.mapping_size()-1 && j == mapping.edit_size()-1))) { - // todo how do we score this qual adjusted? - score -= edit.to_length() ? gap_open + (edit.to_length() - 1) * gap_extension : 0; - last_was_deletion = false; - // No need to track if the last edit was an insertion because - // insertions will be all together in a single edit at a point. - } else { - // Edit has no score effect. Probably a softclip. - last_was_deletion = false; - } - read_offset += edit.to_length(); - } - // score any intervening gaps in mappings using approximate distances - if (i+1 < path.mapping_size()) { - // what is the distance between the last position of this mapping - // and the first of the next - Position last_pos = mapping.position(); - last_pos.set_offset(last_pos.offset() + mapping_from_length(mapping)); - Position next_pos = path.mapping(i+1).position(); - // Estimate the distance - int dist = estimate_distance(make_pos_t(last_pos), make_pos_t(next_pos), aln.sequence().size()); - if (dist > 0) { - // If it's nonzero, score it as a deletion gap - score -= gap_open + (dist - 1) * gap_extension; - } - } - } - - if (!strip_bonuses) { - // We should report any bonuses used in the DP in the final score - if (!softclip_start(aln)) { - score += full_length_bonus; - } - if (!softclip_end(aln)) { - score += full_length_bonus; - } - } - - return score; -} - -int32_t BaseAligner::score_ungapped_alignment(const Alignment& aln, bool strip_bonuses) const { - return score_gappy_alignment(aln, [](pos_t, pos_t, size_t){return (size_t) 0;}, strip_bonuses); -} - -int32_t BaseAligner::remove_bonuses(const Alignment& aln, bool pinned, bool pin_left) const { - int32_t score = aln.score(); - if (softclip_start(aln) == 0 && !(pinned && pin_left)) { - // No softclip at the start, and a left end bonus was applied. 
- score -= full_length_bonus; - } - if (softclip_end(aln) == 0 && !(pinned && !pin_left)) { - // No softclip at the end, and a right end bonus was applied. - score -= full_length_bonus; - } - return score; -} - - -Aligner::Aligner(int8_t _match, - int8_t _mismatch, - int8_t _gap_open, - int8_t _gap_extension, - int8_t _full_length_bonus, - double gc_content) - : Aligner(_match, _mismatch, _gap_open, _gap_extension, _full_length_bonus, gc_content, default_max_gap_length) -{ -} - -Aligner::Aligner(int8_t _match, - int8_t _mismatch, - int8_t _gap_open, - int8_t _gap_extension, - int8_t _full_length_bonus, - double gc_content, - uint32_t _max_gap_length) - : xdrop(_match, _mismatch, _gap_open, _gap_extension, _full_length_bonus, _max_gap_length) -{ - match = _match; - mismatch = _mismatch; - gap_open = _gap_open; - gap_extension = _gap_extension; - full_length_bonus = _full_length_bonus; - // these are used when setting up the nodes - nt_table = gssw_create_nt_table(); - score_matrix = gssw_create_score_matrix(match, mismatch); - BaseAligner::init_mapping_quality(gc_content); - // bench_init(bench); -} - -/* -Aligner::~Aligner() -{ - // fprintf(stderr, "gssw: time(%lu), count(%lu)\n", bench_get(bench) / 1000, bench_get_count(bench)); -} -*/ - -void Aligner::align_internal(Alignment& alignment, vector* multi_alignments, Graph& g, - bool pinned, bool pin_left, int32_t max_alt_alns, - bool traceback_aln, bool print_score_matrices) { - // bench_start(bench); - // check input integrity - if (pin_left && !pinned) { - cerr << "error:[Aligner] cannot choose pinned end in non-pinned alignment" << endl; - exit(EXIT_FAILURE); - } - if (multi_alignments && !pinned) { - cerr << "error:[Aligner] multiple traceback is not implemented in local alignment, only pinned and global" << endl; - exit(EXIT_FAILURE); - } - if (!multi_alignments && max_alt_alns != 1) { - cerr << "error:[Aligner] cannot specify maximum number of alignments in single alignment" << endl; - exit(EXIT_FAILURE); - } - if (max_alt_alns <= 0) { - cerr << "error:[Aligner] cannot do less than 1 alignment" << endl; - exit(EXIT_FAILURE); - } - - // alignment pinning algorithm is based on pinning in bottom right corner, if pinning in top - // left we need to reverse all the sequences first and translate the alignment back later - - // create reversed graph if necessary - Graph reversed_graph; - if (pin_left) { - reverse_graph(g, reversed_graph); - } - - // choose forward or reversed objects - // note: have to make a copy of the sequence because we will modify it to add a pinning point - Graph* align_graph = &g; - string align_sequence = alignment.sequence(); - if (pin_left) { - align_graph = &reversed_graph; - reverse(align_sequence.begin(), align_sequence.end()); - } - - // convert into gssw graph - gssw_graph* graph = create_gssw_graph(*align_graph); - - // perform dynamic programming - gssw_graph_fill_pinned(graph, align_sequence.c_str(), - nt_table, score_matrix, - gap_open, gap_extension, full_length_bonus, - pinned ? 
0 : full_length_bonus, 15, 2, traceback_aln); - - // traceback either from pinned position or optimal local alignment - if (traceback_aln) { - if (pinned) { - // trace back pinned alignment - gssw_graph_mapping** gms = gssw_graph_trace_back_pinned_multi (graph, - max_alt_alns, - true, - align_sequence.c_str(), - align_sequence.size(), - nt_table, - score_matrix, - gap_open, - gap_extension, - full_length_bonus, - 0); - - if (pin_left) { - // translate graph and mappings into original node space - unreverse_graph(reversed_graph); - for (int32_t i = 0; i < max_alt_alns; i++) { - unreverse_graph_mapping(gms[i]); - } - } - - // convert optimal alignment and store it in the input Alignment object (in the multi alignment, - // this will have been set to the first in the vector) - // We know the 0th alignment always exists because we enforce that max_alt_alns >= 1 - if (gms[0]->score > 0) { - // have a mapping, can just convert normally - gssw_mapping_to_alignment(graph, gms[0], alignment, pinned, pin_left, print_score_matrices); - } - else if (g.node_size() > 0) { - // gssw will not identify mappings with 0 score, infer location based on pinning - - Mapping* mapping = alignment.mutable_path()->add_mapping(); - mapping->set_rank(1); - - - // locate at a beginning of an arbitrary source node or end of an arbitrary sink node as appropriate - Position* position = mapping->mutable_position(); - if (pin_left) { - position->set_node_id(g.node(0).id()); - position->set_offset(0); - } - else { - position->set_node_id(g.node(g.node_size() - 1).id()); - position->set_offset(g.node(g.node_size() - 1).sequence().length()); - } - - // soft clip - Edit* edit = mapping->add_edit(); - edit->set_to_length(alignment.sequence().length()); - edit->set_sequence(alignment.sequence()); - } - - if (multi_alignments) { - // determine how many non-null alignments were returned - int32_t num_non_null = max_alt_alns; - for (int32_t i = 1; i < max_alt_alns; i++) { - if (gms[i]->score <= 0) { - num_non_null = i; - break; - } - } - - // reserve to avoid illegal access errors that occur when the vector reallocates - multi_alignments->reserve(num_non_null); - - // copy the primary alignment - multi_alignments->emplace_back(alignment); - - // convert the alternate alignments and store them at the back of the vector (this will not - // execute if we are doing single alignment) - for (int32_t i = 1; i < num_non_null; i++) { - gssw_graph_mapping* gm = gms[i]; - - // make new alignment object - multi_alignments->emplace_back(); - Alignment& next_alignment = multi_alignments->back(); - - // copy over sequence information from the primary alignment - next_alignment.set_sequence(alignment.sequence()); - next_alignment.set_quality(alignment.quality()); - - // get path of the alternate alignment - gssw_mapping_to_alignment(graph, gm, next_alignment, pinned, pin_left, print_score_matrices); - - } - - } - - for (int32_t i = 0; i < max_alt_alns; i++) { - gssw_graph_mapping_destroy(gms[i]); - } - free(gms); - } - else { - // trace back local alignment - gssw_graph_mapping* gm = gssw_graph_trace_back (graph, - align_sequence.c_str(), - align_sequence.size(), - nt_table, - score_matrix, - gap_open, - gap_extension, - full_length_bonus, - full_length_bonus); - - gssw_mapping_to_alignment(graph, gm, alignment, pinned, pin_left, print_score_matrices); - gssw_graph_mapping_destroy(gm); - } - } else { - // get the alignment position and score - alignment.set_score(graph->max_node->alignment->score1); - Mapping* m = 
alignment.mutable_path()->add_mapping(); - Position* p = m->mutable_position(); - p->set_node_id(graph->max_node->id); - p->set_offset(graph->max_node->alignment->ref_end1); // mark end position; for de-duplication - } - - //gssw_graph_print_score_matrices(graph, sequence.c_str(), sequence.size(), stderr); - - gssw_graph_destroy(graph); - // bench_end(bench); -} - -void Aligner::align(Alignment& alignment, Graph& g, bool traceback_aln, bool print_score_matrices) { - - align_internal(alignment, nullptr, g, false, false, 1, traceback_aln, print_score_matrices); -} - -void Aligner::align_pinned(Alignment& alignment, Graph& g, bool pin_left) { - - align_internal(alignment, nullptr, g, true, pin_left, 1, true, false); -} - -void Aligner::align_pinned_multi(Alignment& alignment, vector& alt_alignments, Graph& g, - bool pin_left, int32_t max_alt_alns) { - - if (alt_alignments.size() != 0) { - cerr << "error:[Aligner::align_pinned_multi] output vector must be empty for pinned multi-aligning" << endl; - exit(EXIT_FAILURE); - } - - align_internal(alignment, &alt_alignments, g, true, pin_left, max_alt_alns, true, false); -} - -void Aligner::align_global_banded(Alignment& alignment, Graph& g, - int32_t band_padding, bool permissive_banding) { - - // We need to figure out what size ints we need to use. - // Get upper and lower bounds on the scores. TODO: if these overflow int64 we're out of luck - int64_t best_score = alignment.sequence().size() * match; - size_t total_bases = 0; - for(size_t i = 0; i < g.node_size(); i++) { - total_bases += g.node(i).sequence().size(); - } - int64_t worst_score = max(alignment.sequence().size(), total_bases) * -max(max(mismatch, gap_open), gap_extension); - - // TODO: put this all into another template somehow? - - if (best_score <= numeric_limits::max() && worst_score >= numeric_limits::min()) { - // We'll fit in int8 - BandedGlobalAligner band_graph(alignment, - g, - band_padding, - permissive_banding, - false); - - band_graph.align(score_matrix, nt_table, gap_open, gap_extension); - } else if (best_score <= numeric_limits::max() && worst_score >= numeric_limits::min()) { - // We'll fit in int16 - BandedGlobalAligner band_graph(alignment, - g, - band_padding, - permissive_banding, - false); - - band_graph.align(score_matrix, nt_table, gap_open, gap_extension); - } else if (best_score <= numeric_limits::max() && worst_score >= numeric_limits::min()) { - // We'll fit in int32 - BandedGlobalAligner band_graph(alignment, - g, - band_padding, - permissive_banding, - false); - - band_graph.align(score_matrix, nt_table, gap_open, gap_extension); - } else { - // Fall back to int64 - BandedGlobalAligner band_graph(alignment, - g, - band_padding, - permissive_banding, - false); - - band_graph.align(score_matrix, nt_table, gap_open, gap_extension); - } - -} - -void Aligner::align_global_banded_multi(Alignment& alignment, vector& alt_alignments, Graph& g, - int32_t max_alt_alns, int32_t band_padding, bool permissive_banding) { - - // We need to figure out what size ints we need to use. - // Get upper and lower bounds on the scores. 
TODO: if these overflow int64 we're out of luck - int64_t best_score = alignment.sequence().size() * match; - size_t total_bases = 0; - for(size_t i = 0; i < g.node_size(); i++) { - total_bases += g.node(i).sequence().size(); - } - int64_t worst_score = max(alignment.sequence().size(), total_bases) * -max(max(mismatch, gap_open), gap_extension); - - if (best_score <= numeric_limits::max() && worst_score >= numeric_limits::min()) { - // We'll fit in int8 - BandedGlobalAligner band_graph(alignment, - g, - alt_alignments, - max_alt_alns, - band_padding, - permissive_banding, - false); - - band_graph.align(score_matrix, nt_table, gap_open, gap_extension); - } else if (best_score <= numeric_limits::max() && worst_score >= numeric_limits::min()) { - // We'll fit in int16 - BandedGlobalAligner band_graph(alignment, - g, - alt_alignments, - max_alt_alns, - band_padding, - permissive_banding, - false); - - band_graph.align(score_matrix, nt_table, gap_open, gap_extension); - } else if (best_score <= numeric_limits::max() && worst_score >= numeric_limits::min()) { - // We'll fit in int32 - BandedGlobalAligner band_graph(alignment, - g, - alt_alignments, - max_alt_alns, - band_padding, - permissive_banding, - false); - - band_graph.align(score_matrix, nt_table, gap_open, gap_extension); - } else { - // Fall back to int64 - BandedGlobalAligner band_graph(alignment, - g, - alt_alignments, - max_alt_alns, - band_padding, - permissive_banding, - false); - - band_graph.align(score_matrix, nt_table, gap_open, gap_extension); - } -} - -// X-drop aligner -void Aligner::align_xdrop(Alignment& alignment, Graph& g, const vector& mems, bool reverse_complemented, bool multithreaded) -{ - // cerr << "X-drop aligner" << endl; - if (multithreaded) { - auto xdrop_copy = xdrop; // make thread safe - xdrop_copy.align(alignment, g, mems, reverse_complemented); - } else { - xdrop.align(alignment, g, mems, reverse_complemented); - } -} - -void Aligner::align_xdrop_multi(Alignment& alignment, Graph& g, const vector& mems, bool reverse_complemented, int32_t max_alt_alns) -{ -} - - -// Scoring an exact match is very simple in an ordinary Aligner - -int32_t Aligner::score_exact_match(const Alignment& aln, size_t read_offset, size_t length) const { - return match * length; -} - -int32_t Aligner::score_exact_match(const string& sequence) const { - return match * sequence.length(); -} - -int32_t Aligner::score_exact_match(string::const_iterator seq_begin, string::const_iterator seq_end) const { - return match * (seq_end - seq_begin); -} - -int32_t Aligner::score_exact_match(const string& sequence, const string& base_quality) const { - return score_exact_match(sequence); -} - -int32_t Aligner::score_exact_match(string::const_iterator seq_begin, string::const_iterator seq_end, - string::const_iterator base_qual_begin) const { - return score_exact_match(seq_begin, seq_end); -} - -int32_t Aligner::score_partial_alignment(const Alignment& alignment, VG& graph, const Path& path, - string::const_iterator seq_begin) const{ - - int32_t score = 0; - string::const_iterator read_pos = seq_begin; - for (size_t i = 0; i < path.mapping_size(); i++) { - const Mapping& mapping = path.mapping(i); - - for (size_t j = 0; j < mapping.edit_size(); j++) { - const Edit& edit = mapping.edit(j); - - if (edit.from_length() > 0) { - if (edit.to_length() > 0) { - if (edit.sequence().empty()) { - // match - score += match * edit.from_length(); - } - else { - // mismatch - score -= mismatch * edit.from_length(); - } - - // apply full length bonus - if 
(read_pos == alignment.sequence().begin()) { - score += full_length_bonus; - } - if (read_pos + edit.from_length() == alignment.sequence().end()) { - score += full_length_bonus; - } - } - else { - // deletion - score -= gap_open + (edit.from_length() - 1) * gap_extension; - } - } - else if (edit.to_length() > 0) { - // don't score soft clips - if (read_pos != alignment.sequence().begin() && - read_pos + edit.to_length() != alignment.sequence().end()) { - // insert - score -= gap_open + (edit.to_length() - 1) * gap_extension; - } - } - - read_pos += edit.to_length(); - } - } - return score; -} - -QualAdjAligner::QualAdjAligner(int8_t _match, - int8_t _mismatch, - int8_t _gap_open, - int8_t _gap_extension, - int8_t _full_length_bonus, - int8_t _max_scaled_score, - uint8_t _max_qual_score, - double gc_content) -{ - - max_qual_score = _max_qual_score; - match = _match; - mismatch = _mismatch; - gap_open = _gap_open; - gap_extension = _gap_extension; - full_length_bonus = _full_length_bonus; - - int8_t original_gap_open = gap_open; - - nt_table = gssw_create_nt_table(); - score_matrix = gssw_dna_scaled_adjusted_qual_matrix(_max_scaled_score, max_qual_score, &gap_open, - &gap_extension, match, mismatch, - gc_content, 1e-12); - scale_factor = gap_open / original_gap_open; - match *= scale_factor; - mismatch *= scale_factor; - full_length_bonus *= scale_factor; - - BaseAligner::init_mapping_quality(gc_content); -} - -void QualAdjAligner::align_internal(Alignment& alignment, vector* multi_alignments, Graph& g, - bool pinned, bool pin_left, int32_t max_alt_alns, bool traceback_aln, bool print_score_matrices) { - - // check input integrity - if (pin_left && !pinned) { - cerr << "error:[Aligner] cannot choose pinned end in non-pinned alignment" << endl; - exit(EXIT_FAILURE); - } - if (multi_alignments && !pinned) { - cerr << "error:[Aligner] multiple traceback is not implemented in local alignment, only pinned and global" << endl; - exit(EXIT_FAILURE); - } - if (!multi_alignments && max_alt_alns != 1) { - cerr << "error:[Aligner] cannot specify maximum number of alignments in single alignment" << endl; - exit(EXIT_FAILURE); - } - if (max_alt_alns <= 0) { - cerr << "error:[Aligner] cannot do less than 1 alignment" << endl; - exit(EXIT_FAILURE); - } - - // alignment pinning algorithm is based on pinning in bottom right corner, if pinning in top - // left we need to reverse all the sequences first and translate the alignment back later - - // create reversed graph if necessary - Graph reversed_graph; - if (pin_left) { - reverse_graph(g, reversed_graph); - } - - // choose forward or reversed objects - // note: have to make copies of the strings because we will modify them to add a pinning point - Graph* align_graph = &g; - string align_sequence = alignment.sequence(); - string align_quality = alignment.quality(); - if (pin_left) { - align_graph = &reversed_graph; - reverse(align_sequence.begin(), align_sequence.end()); - reverse(align_quality.begin(), align_quality.end()); - } - - if (align_quality.length() != align_sequence.length()) { - cerr << "error:[QualAdjAligner] Read " << alignment.name() << " has sequence and quality strings with different lengths. Cannot perform base quality adjusted alignment. Consider toggling off base quality adjusted alignment at the command line." 
<< endl; - exit(EXIT_FAILURE); - } - - // convert into gssw graph and get dummy pinned node (if pinning) - gssw_graph* graph = create_gssw_graph(*align_graph); - - // perform dynamic programming - // offer a full length bonus on each end, or only on the left if the right end is pinned. - gssw_graph_fill_pinned_qual_adj(graph, align_sequence.c_str(), align_quality.c_str(), - nt_table, score_matrix, - gap_open, gap_extension, - full_length_bonus, pinned ? 0 : full_length_bonus, 15, 2, traceback_aln); - - // traceback either from pinned position or optimal local alignment - if (traceback_aln) { - if (pinned) { - // trace back pinned alignment - gssw_graph_mapping** gms = gssw_graph_trace_back_pinned_qual_adj_multi (graph, - max_alt_alns, - true, - align_sequence.c_str(), - align_quality.c_str(), - align_sequence.size(), - nt_table, - score_matrix, - gap_open, - gap_extension, - full_length_bonus, - 0); - - if (pin_left) { - // translate graph and mappings into original node space - unreverse_graph(reversed_graph); - for (int32_t i = 0; i < max_alt_alns; i++) { - unreverse_graph_mapping(gms[i]); - } - } - - // convert optimal alignment and store it in the input Alignment object (in the multi alignment, - // this will have been set to the first in the vector) - // We know that the 0th alignment will always exist because we enforce that max_alt_alns >= 1 - if (gms[0]->score > 0) { - // have a mapping, can just convert normally - gssw_mapping_to_alignment(graph, gms[0], alignment, pinned, pin_left, print_score_matrices); - } - else if (g.node_size() > 0) { - // gssw will not identify mappings with 0 score, infer location based on pinning - - Mapping* mapping = alignment.mutable_path()->add_mapping(); - mapping->set_rank(1); - - // locate at a beginning of a source node or end of a sink node as appropriate - Position* position = mapping->mutable_position(); - if (pin_left) { - position->set_node_id(g.node(0).id()); - position->set_offset(0); - } - else { - position->set_node_id(g.node(g.node_size() - 1).id()); - position->set_offset(g.node(g.node_size() - 1).sequence().length()); - } - - // soft clip - Edit* edit = mapping->add_edit(); - edit->set_to_length(alignment.sequence().length()); - edit->set_sequence(alignment.sequence()); - } - - - if (multi_alignments) { - // determine how many non-null alignments were returned - int32_t num_non_null = max_alt_alns; - for (int32_t i = 1; i < max_alt_alns; i++) { - if (gms[i]->score <= 0) { - num_non_null = i; - break; - } - } - - // reserve to avoid illegal access errors that occur when the vector reallocates - multi_alignments->reserve(num_non_null); - - // copy the primary alignment - multi_alignments->emplace_back(alignment); - - // convert the alternate alignments and store them at the back of the vector (this will not - // execute if we are doing single alignment) - for (int32_t i = 1; i < num_non_null; i++) { - gssw_graph_mapping* gm = gms[i]; - - // make new alignment object - multi_alignments->emplace_back(); - Alignment& next_alignment = multi_alignments->back(); - - // copy over sequence information from the primary alignment - next_alignment.set_sequence(alignment.sequence()); - next_alignment.set_quality(alignment.quality()); - - // get path of the alternate alignment - gssw_mapping_to_alignment(graph, gm, next_alignment, pinned, pin_left, print_score_matrices); - - } - } - - for (int32_t i = 0; i < max_alt_alns; i++) { - gssw_graph_mapping_destroy(gms[i]); - } - free(gms); - } - else { - // trace back local alignment - gssw_graph_mapping* 
gm = gssw_graph_trace_back_qual_adj (graph, - align_sequence.c_str(), - align_quality.c_str(), - align_sequence.size(), - nt_table, - score_matrix, - gap_open, - gap_extension, - full_length_bonus, - full_length_bonus); - - gssw_mapping_to_alignment(graph, gm, alignment, pinned, pin_left, print_score_matrices); - gssw_graph_mapping_destroy(gm); - } - } else { - // get the alignment position and score - alignment.set_score(graph->max_node->alignment->score1); - Mapping* m = alignment.mutable_path()->add_mapping(); - Position* p = m->mutable_position(); - p->set_node_id(graph->max_node->id); - p->set_offset(graph->max_node->alignment->ref_end1); // mark end position; for de-duplication - } - - //gssw_graph_print_score_matrices(graph, sequence.c_str(), sequence.size(), stderr); - - gssw_graph_destroy(graph); - -} - -void QualAdjAligner::align(Alignment& alignment, Graph& g, bool traceback_aln, bool print_score_matrices) { - - align_internal(alignment, nullptr, g, false, false, 1, traceback_aln, print_score_matrices); -} - -void QualAdjAligner::align_pinned(Alignment& alignment, Graph& g, bool pin_left) { - - align_internal(alignment, nullptr, g, true, pin_left, 1, true, false); - -} - -void QualAdjAligner::align_pinned_multi(Alignment& alignment, vector& alt_alignments, Graph& g, - bool pin_left, int32_t max_alt_alns) { - align_internal(alignment, &alt_alignments, g, true, pin_left, max_alt_alns, true, false); -} - -void QualAdjAligner::align_global_banded(Alignment& alignment, Graph& g, - int32_t band_padding, bool permissive_banding) { - - BandedGlobalAligner band_graph = BandedGlobalAligner(alignment, - g, - band_padding, - permissive_banding, - true); - - band_graph.align(score_matrix, nt_table, gap_open, gap_extension); -} - -void QualAdjAligner::align_global_banded_multi(Alignment& alignment, vector& alt_alignments, Graph& g, - int32_t max_alt_alns, int32_t band_padding, bool permissive_banding) { - - BandedGlobalAligner band_graph = BandedGlobalAligner(alignment, - g, - alt_alignments, - max_alt_alns, - band_padding, - permissive_banding, - true); - - band_graph.align(score_matrix, nt_table, gap_open, gap_extension); -} - -// X-drop aligner -void QualAdjAligner::align_xdrop(Alignment& alignment, Graph& g, const vector& mems, bool reverse_complemented, bool multithreaded) -{ -} - -void QualAdjAligner::align_xdrop_multi(Alignment& alignment, Graph& g, const vector& mems, bool reverse_complemented, int32_t max_alt_alns) -{ -} - -int32_t QualAdjAligner::score_exact_match(const Alignment& aln, size_t read_offset, size_t length) const { - auto& sequence = aln.sequence(); - auto& base_quality = aln.quality(); - int32_t score = 0; - for (int32_t i = 0; i < length; i++) { - // index 5 x 5 score matrices (ACGTN) - // always have match so that row and column index are same and can combine algebraically - score += score_matrix[25 * base_quality[read_offset + i] + 6 * nt_table[sequence[read_offset + i]]]; - } - return score; -} - -int32_t QualAdjAligner::score_exact_match(const string& sequence, const string& base_quality) const { - int32_t score = 0; - for (int32_t i = 0; i < sequence.length(); i++) { - // index 5 x 5 score matrices (ACGTN) - // always have match so that row and column index are same and can combine algebraically - score += score_matrix[25 * base_quality[i] + 6 * nt_table[sequence[i]]]; - } - return score; -} - - -int32_t QualAdjAligner::score_exact_match(string::const_iterator seq_begin, string::const_iterator seq_end, - string::const_iterator base_qual_begin) const { - int32_t 
score = 0; - for (auto seq_iter = seq_begin, qual_iter = base_qual_begin; seq_iter != seq_end; seq_iter++) { - // index 5 x 5 score matrices (ACGTN) - // always have match so that row and column index are same and can combine algebraically - score += score_matrix[25 * (*qual_iter) + 6 * nt_table[*seq_iter]]; - qual_iter++; - } - return score; -} - -int32_t QualAdjAligner::score_partial_alignment(const Alignment& alignment, VG& graph, const Path& path, - string::const_iterator seq_begin) const{ - - int32_t score = 0; - string::const_iterator read_pos = seq_begin; - string::const_iterator qual_pos = alignment.quality().begin() + (seq_begin - alignment.sequence().begin()); - - for (size_t i = 0; i < path.mapping_size(); i++) { - const Mapping& mapping = path.mapping(i); - - // get the sequence of this node on the proper strand - string rc_seq; - const string* node_seq = &(graph.get_node(mapping.position().node_id())->sequence()); - if (mapping.position().is_reverse()) { - rc_seq = reverse_complement(*node_seq); - node_seq = &rc_seq; - } - - auto ref_pos = node_seq->begin() + mapping.position().offset(); - - for (size_t j = 0; j < mapping.edit_size(); j++) { - const Edit& edit = mapping.edit(j); - - if (edit.from_length() > 0) { - if (edit.to_length() > 0) { - - for (auto siter = read_pos, riter = ref_pos, qiter = qual_pos; - siter != read_pos + edit.from_length(); siter++, qiter++, riter++) { - score += score_matrix[25 * (*qiter) + 5 * nt_table[*riter] + nt_table[*siter]]; - } - - // apply full length bonus - if (read_pos == alignment.sequence().begin()) { - score += full_length_bonus; - } - if (read_pos + edit.from_length() == alignment.sequence().end()) { - score += full_length_bonus; - } - } - else { - // deletion - score -= gap_open + (edit.from_length() - 1) * gap_extension; - } - } - else if (edit.to_length() > 0) { - // don't score soft clips - if (read_pos != alignment.sequence().begin() && - read_pos + edit.to_length() != alignment.sequence().end()) { - // insert - score -= gap_open + (edit.to_length() - 1) * gap_extension; - } - } - - read_pos += edit.to_length(); - qual_pos += edit.to_length(); - ref_pos += edit.from_length(); - } - } - return score; -} diff --git a/src/gssw_aligner.hpp b/src/gssw_aligner.hpp deleted file mode 100644 index aab5e6a1662..00000000000 --- a/src/gssw_aligner.hpp +++ /dev/null @@ -1,384 +0,0 @@ -#ifndef VG_GSSW_ALIGNER_HPP_INCLUDED -#define VG_GSSW_ALIGNER_HPP_INCLUDED - -#include -#include -#include -#include -#include -#include -#include "gssw.h" -#include "vg.pb.h" -#include "vg.hpp" -#include "Variant.h" -#include "Fasta.h" -#include "path.hpp" -#include "utility.hpp" -#include "banded_global_aligner.hpp" -#include "xdrop_aligner.hpp" - -// #define BENCH -// #include "bench.h" - -namespace vg { - - static const int8_t default_match = 1; - static const int8_t default_mismatch = 4; - static const int8_t default_gap_open = 6; - static const int8_t default_gap_extension = 1; - static const int8_t default_full_length_bonus = 5; - static const int8_t default_max_scaled_score = 32; - static const uint8_t default_max_qual_score = 255; - static const double default_gc_content = 0.5; - static const uint32_t default_max_gap_length = 40; - - - class VG; // forward declaration - - /** - * The interface that any Aligner should implement, with some default implementations. 
- */ - class BaseAligner { - protected: - BaseAligner() = default; - ~BaseAligner(); - - // for construction - // needed when constructing an alignable graph from the nodes - gssw_graph* create_gssw_graph(Graph& g); - void visit_node(gssw_node* node, - list& sorted_nodes, - set& unmarked_nodes, - set& temporary_marks); - - // create a reversed graph for left-pinned alignment - void reverse_graph(Graph& g, Graph& reversed_graph_out); - // reverse all node sequences (other aspects of graph object not unreversed) - void unreverse_graph(Graph& graph); - // convert graph mapping back into unreversed node positions - void unreverse_graph_mapping(gssw_graph_mapping* gm); - - // alignment functions - void gssw_mapping_to_alignment(gssw_graph* graph, - gssw_graph_mapping* gm, - Alignment& alignment, - bool pinned, - bool pin_left, - bool print_score_matrices = false); - string graph_cigar(gssw_graph_mapping* gm); - - public: - /// Given a nonempty vector of nonnegative scaled alignment scores, - /// compute the mapping quality of the maximal score in the vector. - /// Sets max_idx_out to the index of that score in the vector. May - /// modify the input vector. - static double maximum_mapping_quality_exact(vector& scaled_scores, size_t* max_idx_out); - /// Given a nonempty vector of nonnegative scaled alignment scores, - /// approximate the mapping quality of the maximal score in the vector. - /// Sets max_idx_out to the index of that score in the vector. May - /// modify the input vector. - static double maximum_mapping_quality_approx(vector& scaled_scores, size_t* max_idx_out); - protected: - double group_mapping_quality_exact(vector& scaled_scores, vector& group); - double estimate_next_best_score(int length, double min_diffs); - - // must be called before querying mapping_quality - void init_mapping_quality(double gc_content); - - // TODO: this algorithm has numerical problems, just removing it for now - //vector all_mapping_qualities_exact(vector scaled_scores); - - public: - - double max_possible_mapping_quality(int length); - double estimate_max_possible_mapping_quality(int length, double min_diffs, double next_min_diffs); - - /// Store optimal local alignment against a graph in the Alignment object. - /// Gives the full length bonus separately on each end of the alignment. - /// Assumes that graph is topologically sorted by node index. - virtual void align(Alignment& alignment, Graph& g, bool traceback_aln, bool print_score_matrices) = 0; - - // store optimal alignment against a graph in the Alignment object with one end of the sequence - // guaranteed to align to a source/sink node - // - // pinning left means that that the alignment starts with the first base of the read sequence and - // the first base of a source node sequence, pinning right means that the alignment starts with - // the final base of the read sequence and the final base of a sink node sequence - // - // Gives the full length bonus only on the non-pinned end of the alignment. - // - // assumes that graph is topologically sorted by node index - virtual void align_pinned(Alignment& alignment, Graph& g, bool pin_left) = 0; - - // store the top scoring pinned alignments in the vector in descending score order up to a maximum - // number of alignments (including the optimal one). if there are fewer than the maximum number in - // the return value, then it includes all alignments with a positive score. 
the optimal alignment - // will be stored in both the vector and in the main alignment object - // - // assumes that graph is topologically sorted by node index - virtual void align_pinned_multi(Alignment& alignment, vector& alt_alignments, Graph& g, - bool pin_left, int32_t max_alt_alns) = 0; - - // store optimal global alignment against a graph within a specified band in the Alignment object - // permissive banding auto detects the width of band needed so that paths can travel - // through every node in the graph - virtual void align_global_banded(Alignment& alignment, Graph& g, - int32_t band_padding = 0, bool permissive_banding = true) = 0; - - // store top scoring global alignments in the vector in descending score order up to a maximum number - // of alternate alignments (including the optimal alignment). if there are fewer than the maximum - // number of alignments in the return value, then the vector contains all possible alignments. the - // optimal alignment will be stored in both the vector and the original alignment object - virtual void align_global_banded_multi(Alignment& alignment, vector& alt_alignments, - Graph& g, int32_t max_alt_alns, int32_t band_padding = 0, - bool permissive_banding = true) = 0; - // xdrop aligner - virtual void align_xdrop(Alignment& alignment, Graph& g, const vector& mems, bool reverse_complemented, bool multithreaded) = 0; - virtual void align_xdrop_multi(Alignment& alignment, Graph& g, const vector& mems, bool reverse_complemented, int32_t max_alt_alns) = 0; - - /// Compute the score of an exact match in the given alignment, from the - /// given offset, of the given length. - virtual int32_t score_exact_match(const Alignment& aln, size_t read_offset, size_t length) const = 0; - /// Compute the score of an exact match of the given sequence with the given qualities. - /// Qualities may be ignored by some implementations. - virtual int32_t score_exact_match(const string& sequence, const string& base_quality) const = 0; - /// Compute the score of an exact match of the given range of sequence with the given qualities. - /// Qualities may be ignored by some implementations. - virtual int32_t score_exact_match(string::const_iterator seq_begin, string::const_iterator seq_end, - string::const_iterator base_qual_begin) const = 0; - /// Compute the score of a path against the given range of subsequence with the given qualities. - virtual int32_t score_partial_alignment(const Alignment& alignment, VG& graph, const Path& path, - string::const_iterator seq_begin) const = 0; - - /// Returns the score of an insert or deletion of the given length - int32_t score_gap(size_t gap_length); - - /// stores -10 * log_10(P_err) in alignment mapping_quality field where P_err is the - /// probability that the alignment is not the correct one (assuming that one of the alignments - /// in the vector is correct). 
alignments must have been created with this Aligner for quality - /// score to be valid - void compute_mapping_quality(vector& alignments, - int max_mapping_quality, - bool fast_approximation, - double cluster_mq, - bool use_cluster_mq, - int overlap_count, - double mq_estimate, - double maybe_mq_threshold, - double identity_weight); - /// same function for paired reads, mapping qualities are stored in both alignments in the pair - void compute_paired_mapping_quality(pair, vector>& alignment_pairs, - const vector& frag_weights, - int max_mapping_quality1, - int max_mapping_quality2, - bool fast_approximation, - double cluster_mq, - bool use_cluster_mq, - int overlap_count1, - int overlap_count2, - double mq_estimate1, - double mq_estimate2, - double maybe_mq_threshold, - double identity_weight); - - /// Computes mapping quality for the optimal score in a vector of scores - int32_t compute_mapping_quality(vector& scores, bool fast_approximation); - - /// Computes mapping quality for a group of scores in a vector of scores (group given by indexes) - int32_t compute_group_mapping_quality(vector& scores, vector& group); - - /// Returns the difference between an optimal and second-best alignment scores that would - /// result in this mapping quality using the fast mapping quality approximation - double mapping_quality_score_diff(double mapping_quality) const; - - /// Convert a score to an unnormalized log likelihood for the sequence. - /// Requires log_base to have been set. - double score_to_unnormalized_likelihood_ln(double score); - - /// The longest gap detectable from a read position without soft-clipping - size_t longest_detectable_gap(const Alignment& alignment, const string::const_iterator& read_pos) const; - - /// The longest gap detectable from any read position without soft-clipping - size_t longest_detectable_gap(const Alignment& alignment) const; - - /// Use the score values in the aligner to score the given alignment, - /// scoring gaps caused by jumping between between nodes using a custom - /// gap length estimation function (which takes the from position, the - /// to position, and a search limit in bp that happens to be the read - /// length). - /// - /// May include full length bonus or not. TODO: bool flags are bad. - virtual int32_t score_gappy_alignment(const Alignment& aln, - const function& estimate_distance, - bool strip_bonuses = false) const; - - /// Use the score values in the aligner to score the given alignment assuming - /// that there are no gaps between Mappings in the Path - virtual int32_t score_ungapped_alignment(const Alignment& aln, - bool strip_bonuses = false) const; - - /// Reads a 5x5 substitution scoring matrix from an input stream (can be an ifstream) - /// expecting 5 whitespace-separated 8-bit integers per line - virtual void load_scoring_matrix(std::istream& matrix_stream); - - /// Without necessarily rescoring the entire alignment, return the score - /// of the given alignment with bonuses removed. Assumes that bonuses - /// are actually included in the score. - /// Needs to know if the alignment was pinned-end or not, and, if so, which end was pinned. 
- virtual int32_t remove_bonuses(const Alignment& aln, bool pinned = false, bool pin_left = false) const; - - // members - int8_t* nt_table = nullptr; - int8_t* score_matrix = nullptr; - int8_t match; - int8_t mismatch; - int8_t gap_open; - int8_t gap_extension; - int8_t full_length_bonus; - - // log of the base of the logarithm underlying the log-odds interpretation of the scores - double log_base = 0.0; - - }; - - /** - * An ordinary aligner. - */ - class Aligner : public BaseAligner { - private: - - // internal function interacting with gssw for pinned and local alignment - void align_internal(Alignment& alignment, vector* multi_alignments, Graph& g, - bool pinned, bool pin_left, int32_t max_alt_alns, - bool traceback_aln, - bool print_score_matrices); - - // members - XdropAligner xdrop; - // bench_t bench; - public: - Aligner(int8_t _match = default_match, - int8_t _mismatch = default_mismatch, - int8_t _gap_open = default_gap_open, - int8_t _gap_extension = default_gap_extension, - int8_t _full_length_bonus = default_full_length_bonus, - double _gc_content = default_gc_content); - // xdrop_aligner wrapper, omit default values to call the one above for Aligner(); - Aligner(int8_t _match, - int8_t _mismatch, - int8_t _gap_open, - int8_t _gap_extension, - int8_t _full_length_bonus, - double _gc_content, - uint32_t _max_gap_length); - ~Aligner(void) = default; - - /// Store optimal local alignment against a graph in the Alignment object. - /// Gives the full length bonus separately on each end of the alignment. - /// Assumes that graph is topologically sorted by node index. - void align(Alignment& alignment, Graph& g, bool traceback_aln, bool print_score_matrices); - - // store optimal alignment against a graph in the Alignment object with one end of the sequence - // guaranteed to align to a source/sink node - // - // pinning left means that that the alignment starts with the first base of the read sequence and - // the first base of a source node sequence, pinning right means that the alignment starts with - // the final base of the read sequence and the final base of a sink node sequence - // - // Gives the full length bonus only on the non-pinned end of the alignment. - // - // assumes that graph is topologically sorted by node index - void align_pinned(Alignment& alignment, Graph& g, bool pin_left); - - // store the top scoring pinned alignments in the vector in descending score order up to a maximum - // number of alignments (including the optimal one). if there are fewer than the maximum number in - // the return value, then it includes all alignments with a positive score. the optimal alignment - // will be stored in both the vector and in the main alignment object - // - // assumes that graph is topologically sorted by node index - void align_pinned_multi(Alignment& alignment, vector& alt_alignments, Graph& g, - bool pin_left, int32_t max_alt_alns); - - // store optimal global alignment against a graph within a specified band in the Alignment object - // permissive banding auto detects the width of band needed so that paths can travel - // through every node in the graph - void align_global_banded(Alignment& alignment, Graph& g, - int32_t band_padding = 0, bool permissive_banding = true); - - // store top scoring global alignments in the vector in descending score order up to a maximum number - // of alternate alignments (including the optimal alignment). 
if there are fewer than the maximum - // number of alignments in the return value, then the vector contains all possible alignments. the - // optimal alignment will be stored in both the vector and the original alignment object - void align_global_banded_multi(Alignment& alignment, vector& alt_alignments, Graph& g, - int32_t max_alt_alns, int32_t band_padding = 0, bool permissive_banding = true); - - // xdrop aligner - void align_xdrop(Alignment& alignment, Graph& g, const vector& mems, bool reverse_complemented, bool multithreaded); - void align_xdrop_multi(Alignment& alignment, Graph& g, const vector& mems, bool reverse_complemented, int32_t max_alt_alns); - - int32_t score_exact_match(const Alignment& aln, size_t read_offset, size_t length) const; - int32_t score_exact_match(const string& sequence, const string& base_quality) const; - int32_t score_exact_match(string::const_iterator seq_begin, string::const_iterator seq_end, - string::const_iterator base_qual_begin) const; - int32_t score_exact_match(const string& sequence) const; - int32_t score_exact_match(string::const_iterator seq_begin, string::const_iterator seq_end) const; - - int32_t score_partial_alignment(const Alignment& alignment, VG& graph, const Path& path, - string::const_iterator seq_begin) const; - }; - - /** - * An aligner that uses read base qualities to adjust its scores and alignments. - */ - class QualAdjAligner : public BaseAligner { - public: - - QualAdjAligner(int8_t _match = default_match, - int8_t _mismatch = default_mismatch, - int8_t _gap_open = default_gap_open, - int8_t _gap_extension = default_gap_extension, - int8_t _full_length_bonus = default_full_length_bonus, - int8_t _max_scaled_score = default_max_scaled_score, - uint8_t _max_qual_score = default_max_qual_score, - double gc_content = default_gc_content); - - ~QualAdjAligner(void) = default; - - // base quality adjusted counterparts to functions of same name from Aligner - void align(Alignment& alignment, Graph& g, bool traceback_aln, bool print_score_matrices); - void align_global_banded(Alignment& alignment, Graph& g, - int32_t band_padding = 0, bool permissive_banding = true); - void align_pinned(Alignment& alignment, Graph& g, bool pin_left); - void align_global_banded_multi(Alignment& alignment, vector& alt_alignments, Graph& g, - int32_t max_alt_alns, int32_t band_padding = 0, bool permissive_banding = true); - void align_pinned_multi(Alignment& alignment, vector& alt_alignments, Graph& g, - bool pin_left, int32_t max_alt_alns); - // xdrop aligner - void align_xdrop(Alignment& alignment, Graph& g, const vector& mems, bool reverse_complemented, bool multithreaded); - void align_xdrop_multi(Alignment& alignment, Graph& g, const vector& mems, bool reverse_complemented, int32_t max_alt_alns); - - void init_mapping_quality(double gc_content); - - int32_t score_exact_match(const Alignment& aln, size_t read_offset, size_t length) const; - int32_t score_exact_match(const string& sequence, const string& base_quality) const; - int32_t score_exact_match(string::const_iterator seq_begin, string::const_iterator seq_end, - string::const_iterator base_qual_begin) const; - - int32_t score_partial_alignment(const Alignment& alignment, VG& graph, const Path& path, - string::const_iterator seq_begin) const; - - uint8_t max_qual_score; - int8_t scale_factor; - - private: - - void align_internal(Alignment& alignment, vector* multi_alignments, Graph& g, - bool pinned, bool pin_left, int32_t max_alt_alns, - bool traceback_aln, - bool print_score_matrices); - - - }; 
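For orientation, here is a minimal, illustrative sketch (not part of the removed header) of how the mapping-quality API declared above could be driven. It assumes the defaulted `Aligner` constructor (which sets up `log_base` via `init_mapping_quality`) and the `compute_mapping_quality(vector<double>&, bool)` overload; the include path, the `main` wrapper, and the score values are made up for the example. The exact method does a log-sum-exp over all candidate scores, while the fast approximation scales the gap between the best and second-best score.

```
// Illustrative sketch only; assumes the declarations in the header above.
#include "gssw_aligner.hpp"  // the header shown above (removed by this change)

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    vg::Aligner aligner;  // default scoring parameters; constructor initializes log_base

    // Hypothetical raw alignment scores for one read, best candidate first.
    std::vector<double> scores = {57.0, 42.0, 40.0};

    // Fast approximation: driven by the difference between the two best scores.
    int32_t approx_mq = aligner.compute_mapping_quality(scores, true);
    // Exact version: log-sum-exp over all candidate scores.
    int32_t exact_mq = aligner.compute_mapping_quality(scores, false);

    std::cout << "approx MAPQ: " << approx_mq << ", exact MAPQ: " << exact_mq << std::endl;
    return 0;
}
```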
-} // end namespace vg - -#endif diff --git a/src/handle.cpp b/src/handle.cpp deleted file mode 100644 index 9f179fbda4d..00000000000 --- a/src/handle.cpp +++ /dev/null @@ -1,66 +0,0 @@ -#include "handle.hpp" -#include "snarls.hpp" - -/** \file handle.cpp - * Implement handle graph utility methods. - */ - -namespace vg { - -using namespace std; - -handle_t HandleGraph::get_handle(const Visit& visit) const { - return get_handle(visit.node_id(), visit.backward()); -} - -Visit HandleGraph::to_visit(const handle_t& handle) const { - return vg::to_visit(this->get_id(handle), this->get_is_reverse(handle)); -} - -handle_t HandleGraph::forward(const handle_t& handle) const { - return this->get_is_reverse(handle) ? this->flip(handle) : handle; -} - -pair<handle_t, handle_t> HandleGraph::edge_handle(const handle_t& left, const handle_t& right) const { - // The degeneracy is between any pair and a pair of the same nodes but reversed in order and orientation. - // We compare those two pairs and construct the smaller one. - auto flipped_right = this->flip(right); - - if (as_integer(left) > as_integer(flipped_right)) { - // The other orientation would be smaller. - return make_pair(flipped_right, this->flip(left)); - } else if(as_integer(left) == as_integer(flipped_right)) { - // Our left and the flipped pair's left would be equal. - auto flipped_left = this->flip(left); - if (as_integer(right) > as_integer(flipped_left)) { - // And our right is too big, so flip. - return make_pair(flipped_right, flipped_left); - } else { - // No difference or we're smaller. - return make_pair(left, right); - } - } else { - // We're smaller - return make_pair(left, right); - } -} - -handle_t HandleGraph::traverse_edge_handle(const edge_t& edge, const handle_t& left) const { - if (left == edge.first) { - // The canonical orientation is the one we want - return edge.second; - } else if (left == this->flip(edge.second)) { - // We really want the other orientation - return this->flip(edge.first); - } else { - // This isn't either handle that the edge actually connects. Something has gone wrong. - throw runtime_error("Cannot view edge " + - to_string(this->get_id(edge.first)) + " " + to_string(this->get_is_reverse(edge.first)) + " -> " + - to_string(this->get_id(edge.second)) + " " + to_string(this->get_is_reverse(edge.second)) + - " from non-participant " + to_string(this->get_id(left)) + " " + to_string(this->get_is_reverse(left))); - } -} - -} - - diff --git a/src/handle.hpp b/src/handle.hpp index c46e432d2fd..c804ffa90c8 100644 --- a/src/handle.hpp +++ b/src/handle.hpp @@ -2,418 +2,97 @@ #define VG_HANDLE_HPP_INCLUDED /** \file - * Defines a handle type that can refer to oriented nodes of, and be used to - * traverse, any backing graph implementation. Not just an ID or a pos_t because - * XG (and maybe other implementations) provide more efficient local traversal - * mechanisms if you can skip ID lookups. + * One stop shop for libhandlegraph types and things we need to work with them. 
*/ -#include "types.hpp" -#include "vg.pb.h" -#include "hash_map.hpp" +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include -#include -#include -#include + +#include "hash_map.hpp" +#include +#include "types.hpp" namespace vg { using namespace std; -/// A handle is 8 (assuming id_t is still int64_t) opaque bytes. -/// A handle refers to an oriented node. -/// Two handles are equal iff they refer to the same orientation of the same node. -/// Only handles in the same graph may be compared. -/// Handles have no ordering, but can be hashed. -struct handle_t { - char data[sizeof(id_t)]; -}; - -typedef pair edge_t; - -// XG is going to store node index in its g-vector and node orientation in there -// VG is going to store ID and orientation -// Other implementations can store other things (or maybe int indexes into tables) - -/// View a handle as an integer -inline int64_t& as_integer(handle_t& handle) { - return reinterpret_cast(handle); -} - -/// View a const handle as a const integer -inline const int64_t& as_integer(const handle_t& handle) { - return reinterpret_cast(handle); -} - -/// View an integer as a handle -inline handle_t& as_handle(int64_t& value) { - return reinterpret_cast(value); -} - -/// View a const integer as a const handle -inline const handle_t& as_handle(const int64_t& value) { - return reinterpret_cast(value); -} - -/// Define equality on handles -inline bool operator==(const handle_t& a, const handle_t& b) { - return as_integer(a) == as_integer(b); -} - -/// Define inequality on handles -inline bool operator!=(const handle_t& a, const handle_t& b) { - return as_integer(a) != as_integer(b); -} +namespace handlealgs = handlegraph::algorithms; + +// Import all the handle stuff into the vg namespace for transition purposes. +using handle_t = handlegraph::handle_t; +using nid_t = handlegraph::nid_t; +using offset_t = handlegraph::offset_t; +using subrange_t = handlegraph::subrange_t; +using path_handle_t = handlegraph::path_handle_t; +using PathSense = handlegraph::PathSense; +using step_handle_t = handlegraph::step_handle_t; +using edge_t = handlegraph::edge_t; +using oriented_node_range_t = handlegraph::oriented_node_range_t; + +using HandleGraph = handlegraph::HandleGraph; +using RankedHandleGraph = handlegraph::RankedHandleGraph; +using MutableHandleGraph = handlegraph::MutableHandleGraph; +using PathMetadata = handlegraph::PathMetadata; +using PathHandleGraph = handlegraph::PathHandleGraph; +using PathPositionHandleGraph = handlegraph::PathPositionHandleGraph; +using MutablePathHandleGraph = handlegraph::MutablePathHandleGraph; +using MutablePathMutableHandleGraph = handlegraph::MutablePathMutableHandleGraph; +using DeletableHandleGraph = handlegraph::DeletableHandleGraph; +using MutablePathDeletableHandleGraph = handlegraph::MutablePathDeletableHandleGraph; +using SerializableHandleGraph = handlegraph::SerializableHandleGraph; +using VectorizableHandleGraph = handlegraph::VectorizableHandleGraph; +using NamedNodeBackTranslation = handlegraph::NamedNodeBackTranslation; /** - * Define hashes for handles. + * Define wang hashes for handles. 
*/ template<> struct wang_hash { - size_t operator()(const vg::handle_t& handle) const { - return wang_hash()(as_integer(handle)); - } -}; - -struct path_handle_t { - char data[sizeof(int64_t)]; -}; - -/// View a path handle as an integer -inline int64_t& as_integer(path_handle_t& path_handle) { - return reinterpret_cast(path_handle); -} - -/// View a const path handle as a const integer -inline const int64_t& as_integer(const path_handle_t& path_handle) { - return reinterpret_cast(path_handle); -} - -/// View an integer as a path handle -inline path_handle_t& as_path_handle(int64_t& value) { - return reinterpret_cast(value); -} - -/// View a const integer as a const path handle -inline const path_handle_t& as_path_handle(const int64_t& value) { - return reinterpret_cast(value); -} - -/// Define equality on path handles -inline bool operator==(const path_handle_t& a, const path_handle_t& b) { - return as_integer(a) == as_integer(b); -} - -/// Define inequality on path handles -inline bool operator!=(const path_handle_t& a, const path_handle_t& b) { - return as_integer(a) != as_integer(b); -} - -struct occurrence_handle_t { - char data[2 * sizeof(int64_t)]; -}; - -/// View an occurrence handle as an integer -inline int64_t* as_integers(occurrence_handle_t& occurrence_handle) { - return reinterpret_cast(&occurrence_handle); -} - -/// View a const occurrence handle as a const integer -inline const int64_t* as_integers(const occurrence_handle_t& occurrence_handle) { - return reinterpret_cast(&occurrence_handle); -} - -/// Define equality on occurrence handles -inline bool operator==(const occurrence_handle_t& a, const occurrence_handle_t& b) { - return as_integers(a)[0] == as_integers(b)[0] && as_integers(a)[1] == as_integers(b)[1]; -} - -/// Define inequality on occurrence handles -inline bool operator!=(const occurrence_handle_t& a, const occurrence_handle_t& b) { - return !(a == b); -} - -} // namespace vg - -// This needs to be outside the vg namespace - -namespace std { - -/** - * Define hashes for handles. - */ -template<> struct hash { -public: - inline size_t operator()(const vg::handle_t& handle) const { - return std::hash()(vg::as_integer(handle)); + size_t operator()(const handlegraph::handle_t& handle) const { + return wang_hash()(handlegraph::as_integer(handle)); } }; -/** - * Define hashes for path handles. - */ -template<> struct hash { -public: - inline size_t operator()(const vg::path_handle_t& path_handle) const { - return std::hash()(vg::as_integer(path_handle)); - } -}; - -} - -namespace vg { - -using namespace std; - -/** - * This is the interface that a graph that uses handles needs to support. - * It is also the interface that users should code against. 
- */ -class HandleGraph { -public: - - //////////////////////////////////////////////////////////////////////////// - // Interface that needs to be implemented - //////////////////////////////////////////////////////////////////////////// - - /// Look up the handle for the node with the given ID in the given orientation - virtual handle_t get_handle(const id_t& node_id, bool is_reverse = false) const = 0; - - /// Get the ID from a handle - virtual id_t get_id(const handle_t& handle) const = 0; - - /// Get the orientation of a handle - virtual bool get_is_reverse(const handle_t& handle) const = 0; - - /// Invert the orientation of a handle (potentially without getting its ID) - virtual handle_t flip(const handle_t& handle) const = 0; - - /// Get the length of a node - virtual size_t get_length(const handle_t& handle) const = 0; - - /// Get the sequence of a node, presented in the handle's local forward - /// orientation. - virtual string get_sequence(const handle_t& handle) const = 0; - - /// Loop over all the handles to next/previous (right/left) nodes. Passes - /// them to a callback which returns false to stop iterating and true to - /// continue. Returns true if we finished and false if we stopped early. - virtual bool follow_edges(const handle_t& handle, bool go_left, const function& iteratee) const = 0; - - /// Loop over all the nodes in the graph in their local forward - /// orientations, in their internal stored order. Stop if the iteratee returns false. - virtual void for_each_handle(const function& iteratee, bool parallel = false) const = 0; - - /// Return the number of nodes in the graph - /// TODO: can't be node_count because XG has a field named node_count. - virtual size_t node_size() const = 0; - - //////////////////////////////////////////////////////////////////////////// - // Interface that needs to be using'd - //////////////////////////////////////////////////////////////////////////// - - /// Loop over all the handles to next/previous (right/left) nodes. Works - /// with a callback that just takes all the handles and returns void. - /// MUST be pulled into implementing classes with `using` in order to work! - template - auto follow_edges(const handle_t& handle, bool go_left, T&& iteratee) const - -> typename std::enable_if::value>::type { - // Implementation only for void-returning iteratees - // We ought to just overload on the std::function but that's not allowed until C++14. - // See - - // We also can't use result_of::type to sniff the return - // type out because that ::type would not exist (since that's what you - // get for a void apparently?) and we couldn't check if it's bool or - // void. - - // So we do this nonsense thing with a trailing return type (to get the - // actual arg into scope) and a decltype (which is allowed to resolve to - // void) and is_void (which is allowed to take void) and a fake - // get_handle call (which is the shortest handle_t-typed expression I - // could think of). - - // Make a wrapper that puts a bool return type on. - function lambda = [&](const handle_t& found) { - iteratee(found); - return true; - }; - - // Use that - follow_edges(handle, go_left, lambda); - - // During development I managed to get earlier versions of this template to build infinitely recursive functions. - static_assert(!std::is_void::value, "can't take our own lambda"); - } - - /// Loop over all the nodes in the graph in their local forward - /// orientations, in their internal stored order. Works with void-returning iteratees. 
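As the comments above describe, edge and node iteration accept iteratees in two flavors: a bool-returning callback whose `false` return stops the traversal early, and a void-returning callback that the template wrapper converts so every neighbor is visited. A hedged sketch of both call styles (helper function names hypothetical; assumes handle.hpp is included):

```cpp
#include <vector>
#include "handle.hpp"

// Bool-returning iteratee: stop as soon as one right-side neighbor is found.
bool has_right_neighbor(const vg::HandleGraph& graph, const vg::handle_t& h) {
    bool found = false;
    graph.follow_edges(h, false, [&](const vg::handle_t& next) {
        found = true;
        return false;  // returning false ends the iteration early
    });
    return found;
}

// Void-returning iteratee: collect every right-side neighbor.
std::vector<vg::handle_t> right_neighbors(const vg::HandleGraph& graph,
                                          const vg::handle_t& h) {
    std::vector<vg::handle_t> out;
    graph.follow_edges(h, false, [&](const vg::handle_t& next) {
        out.push_back(next);
    });
    return out;
}
```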
- /// MUST be pulled into implementing classes with `using` in order to work! - template - auto for_each_handle(T&& iteratee, bool parallel = false) const - -> typename std::enable_if::value>::type { - // Make a wrapper that puts a bool return type on. - function lambda = [&](const handle_t& found) { - iteratee(found); - return true; - }; - - // Use that - for_each_handle(lambda, parallel); +template<> +struct wang_hash { + size_t operator()(const handlegraph::path_handle_t& handle) const { + return wang_hash()(handlegraph::as_integer(handle)); } - - //////////////////////////////////////////////////////////////////////////// - // Concrete utility methods - //////////////////////////////////////////////////////////////////////////// - - /// Get a handle from a Visit Protobuf object. - /// Must be using'd to avoid shadowing. - handle_t get_handle(const Visit& visit) const; - - /// Get a Protobuf Visit from a handle. - Visit to_visit(const handle_t& handle) const; - - /// Get the locally forward version of a handle - handle_t forward(const handle_t& handle) const; - - /// A pair of handles can be used as an edge. When so used, the handles have a - /// canonical order and orientation. - edge_t edge_handle(const handle_t& left, const handle_t& right) const; - - /// Such a pair can be viewed from either inward end handle and produce the - /// outward handle you would arrive at. - handle_t traverse_edge_handle(const edge_t& edge, const handle_t& left) const; }; -/** - * This is the interface for a handle graph that stores embedded paths. - */ -class PathHandleGraph : virtual public HandleGraph { -public: - - //////////////////////////////////////////////////////////////////////////// - // Path handle interface that needs to be implemented - //////////////////////////////////////////////////////////////////////////// - - /// Look up the path handle for the given path name - virtual path_handle_t get_path_handle(const string& path_name) const = 0; - - /// Look up the name of a path from a handle to it - virtual string get_path_name(const path_handle_t& path_handle) const = 0; - - /// Returns the number of node occurrences in the path - virtual size_t get_occurrence_count(const path_handle_t& path_handle) const = 0; - - /// Returns the number of paths stored in the graph - virtual size_t get_path_count() const = 0; - - /// Execute a function on each path in the graph - virtual void for_each_path_handle(const function& iteratee) const = 0; - - /// Get a node handle (node ID and orientation) from a handle to an occurrence on a path - virtual handle_t get_occurrence(const occurrence_handle_t& occurrence_handle) const = 0; - - /// Get a handle to the first occurrence in a path - virtual occurrence_handle_t get_first_occurrence(const path_handle_t& path_handle) const = 0; - - /// Get a handle to the last occurrence in a path - virtual occurrence_handle_t get_last_occurrence(const path_handle_t& path_handle) const = 0; - - /// Returns true if the occurrence is not the last occurence on the path, else false - virtual bool has_next_occurrence(const occurrence_handle_t& occurrence_handle) const = 0; - - /// Returns true if the occurrence is not the first occurence on the path, else false - virtual bool has_previous_occurrence(const occurrence_handle_t& occurrence_handle) const = 0; - - /// Returns a handle to the next occurrence on the path - virtual occurrence_handle_t get_next_occurrence(const occurrence_handle_t& occurrence_handle) const = 0; - - /// Returns a handle to the previous occurrence on the 
path - virtual occurrence_handle_t get_previous_occurrence(const occurrence_handle_t& occurrence_handle) const = 0; - - /// Returns a handle to the path that an occurrence is on - virtual path_handle_t get_path_handle_of_occurrence(const occurrence_handle_t& occurrence_handle) const = 0; - - /// Returns the 0-based ordinal rank of a occurrence on a path - virtual size_t get_ordinal_rank_of_occurrence(const occurrence_handle_t& occurrence_handle) const = 0; -}; - -/** - * This is the interface for a handle graph that supports modification. - */ -class MutableHandleGraph : virtual public HandleGraph { -public: - /* - * Note: All operations may invalidate path handles and occurrence handles. - */ - - /// Create a new node with the given sequence and return the handle. - virtual handle_t create_handle(const string& sequence) = 0; - - /// Create a new node with the given id and sequence, then return the handle. - virtual handle_t create_handle(const string& sequence, const id_t& id) = 0; - - /// Remove the node belonging to the given handle and all of its edges. - /// Does not update any stored paths. - virtual void destroy_handle(const handle_t& handle) = 0; - - /// Create an edge connecting the given handles in the given order and orientations. - /// Ignores existing edges. - virtual void create_edge(const handle_t& left, const handle_t& right) = 0; - - /// Convenient wrapper for create_edge. - inline void create_edge(const edge_t& edge) { - create_edge(edge.first, edge.second); - } - - /// Remove the edge connecting the given handles in the given order and orientations. - /// Ignores nonexistent edges. - /// Does not update any stored paths. - virtual void destroy_edge(const handle_t& left, const handle_t& right) = 0; - - /// Convenient wrapper for destroy_edge. - inline void destroy_edge(const edge_t& edge) { - destroy_edge(edge.first, edge.second); - } - - /// Remove all nodes and edges. Does not update any stored paths. - virtual void clear() = 0; - - /// Swap the nodes corresponding to the given handles, in the ordering used - /// by for_each_handle when looping over the graph. Other handles to the - /// nodes being swapped must not be invalidated. If a swap is made while - /// for_each_handle is running, it affects the order of the handles - /// traversed during the current traversal (so swapping an already seen - /// handle to a later handle's position will make the seen handle be visited - /// again and the later handle not be visited at all). - virtual void swap_handles(const handle_t& a, const handle_t& b) = 0; - - /// Alter the node that the given handle corresponds to so the orientation - /// indicated by the handle becomes the node's local forward orientation. - /// Rewrites all edges pointing to the node and the node's sequence to - /// reflect this. Invalidates all handles to the node (including the one - /// passed). Returns a new, valid handle to the node in its new forward - /// orientation. Note that it is possible for the node's ID to change. - /// Does not update any stored paths. May change the ordering of the underlying - /// graph. - virtual handle_t apply_orientation(const handle_t& handle) = 0; - - /// Split a handle's underlying node at the given offsets in the handle's - /// orientation. Returns all of the handles to the parts. Other handles to - /// the node being split may be invalidated. 
The split pieces stay in the - /// same local forward orientation as the original node, but the returned - /// handles come in the order and orientation appropriate for the handle - /// passed in. - /// Updates stored paths. - virtual vector divide_handle(const handle_t& handle, const vector& offsets) = 0; - - /// Specialization of divide_handle for a single division point - inline pair divide_handle(const handle_t& handle, size_t offset) { - auto parts = divide_handle(handle, vector{offset}); - return make_pair(parts.front(), parts.back()); - } -}; - +using handlegraph::ExpandingOverlayGraph; } #endif diff --git a/src/handle_to_vg.cpp b/src/handle_to_vg.cpp deleted file mode 100644 index 1ed04f960c9..00000000000 --- a/src/handle_to_vg.cpp +++ /dev/null @@ -1,50 +0,0 @@ -#include "handle_to_vg.hpp" - -namespace vg { - using namespace std; - - VG handle_to_vg(const HandleGraph* xg) { - // If xg is a null pointer, throw a runtime error - if (xg == nullptr) { - throw runtime_error("There is no xg to convert"); - } - // Initialize the VG graph - VG vg; - // Iterate through each handle in xg and create the same handle in vg - xg->for_each_handle([&](const handle_t& here) { - // Get the id of the xg handle - id_t xg_id = xg->get_id(here); - // Get the sequence of the xg handle - string xg_seq = xg->get_sequence(here); - // Create a handle in vg using the xg id and sequence - vg.create_handle(xg_seq,xg_id); - }); - // Iterate through each handle in xg - xg->for_each_handle([&](const handle_t& handle) { - id_t id = xg->get_id(handle); - bool rev = xg->get_is_reverse(handle); - // Return a vg handle using the xg handle's id and orientation - handle_t current = vg.get_handle(id,rev); - // Follow the right edges of the xg handle - xg->follow_edges(handle, false, [&](const handle_t& r) { - id_t id_r = xg->get_id(r); - bool rev_r = xg->get_is_reverse(r); - // Return a vg handle using the xg handle's id and orientation - handle_t next = vg.get_handle(id_r, rev_r); - // Create an edge in vg using the handles - vg.create_edge(current,next); - }); - // Follow the left edges of the xg handle - xg->follow_edges(handle, true, [&](const handle_t& l) { - id_t id_l = xg->get_id(l); - bool rev_l = xg->get_is_reverse(l); - // Return a vg handle using the xg handle's id and orientation - handle_t prev = vg.get_handle(id_l, rev_l); - // Use the handles created from following the xg edges to create a vg edge - vg.create_edge(prev,current); //error here - }); - }); - return vg; - } - -} diff --git a/src/handle_to_vg.hpp b/src/handle_to_vg.hpp deleted file mode 100644 index 1c349c094f8..00000000000 --- a/src/handle_to_vg.hpp +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef VG_HANDLE_TO_VG_HPP_INCLUDED -#define VG_HANDLE_TO_VG_HPP_INCLUDED - -#include "handle.hpp" -#include "vg.hpp" - -namespace vg { - using namespace std; - /// Takes in a pointer to a HandleGraph and converts it to a VG graph. 
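The divide_handle contract spelled out above is easiest to see on a concrete node. A minimal sketch, assuming `graph` is any MutableHandleGraph implementation (the surrounding setup is hypothetical):

```cpp
// Create a 7 bp node and split it at offset 3 (counting in the handle's
// forward orientation).
vg::handle_t h = graph.create_handle("GATTACA");
std::pair<vg::handle_t, vg::handle_t> parts = graph.divide_handle(h, 3);
// Expected: graph.get_sequence(parts.first)  == "GAT"
//           graph.get_sequence(parts.second) == "TACA"
// Handles to the original node, including h, may be invalidated by the split.
```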
- VG handle_to_vg(const HandleGraph* g); -} - -#endif diff --git a/src/haplotype_extracter.cpp b/src/haplotype_extracter.cpp index 7b67cda5d05..44cc385db15 100644 --- a/src/haplotype_extracter.cpp +++ b/src/haplotype_extracter.cpp @@ -1,32 +1,52 @@ #include #include "vg.hpp" #include "haplotype_extracter.hpp" -#include "json2pb.h" -#include "xg.hpp" +#include "vg/io/json2pb.h" +#include "graph.hpp" namespace vg { using namespace std; -void trace_haplotypes_and_paths(xg::XG& index, const gbwt::GBWT* haplotype_database, +static id_t side_id(int64_t side) { + return abs(side); +} + +static bool side_is_end(int64_t side) { + return side < 0; +} + +static int64_t make_side(id_t id, bool is_end) { + return !is_end ? id : -1 * id; +} + + +void trace_haplotypes_and_paths(const PathHandleGraph& source, const gbwt::GBWT& haplotype_database, vg::id_t start_node, int extend_distance, Graph& out_graph, map& out_thread_frequencies, bool expand_graph) { // get our haplotypes - xg::XG::ThreadMapping n = {start_node, false}; - vector > haplotypes = haplotype_database ? - list_haplotypes(index, *haplotype_database, n, extend_distance) : - list_haplotypes(index, n, extend_distance); + handle_t n = source.get_handle(start_node, false); + vector > haplotypes = list_haplotypes(source, haplotype_database, n, + [&extend_distance](const vector& new_thread) { + return new_thread.size() >= extend_distance; + }); #ifdef debug - cerr << "Haplotype database " << haplotype_database << " produced " << haplotypes.size() << " haplotypes" << endl; + cerr << "Haplotype database " << &haplotype_database << " produced " << haplotypes.size() << " haplotypes" << endl; #endif if (expand_graph) { - // get our subgraph and "regular" paths by expanding forward - *out_graph.add_node() = index.node(start_node); - index.expand_context(out_graph, extend_distance, true, true, true, false); + // get our subgraph and "regular" paths by expanding forward + handle_t handle = source.get_handle(start_node); + bdsg::HashGraph extractor; + extractor.create_handle(source.get_sequence(handle), source.get_id(handle)); + // TODO: is expanding only forward really the right behavior here? 
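The list_haplotypes call above shows the new interface: instead of a fixed extend_distance parameter, the caller supplies a stop_fn predicate that is evaluated on each partially extended thread and returns true when that thread should be emitted rather than extended further. A hedged sketch of calling it directly with a length-based predicate (all variable names hypothetical):

```cpp
// Enumerate GBWT threads outward from one node, capping each thread at
// max_nodes nodes, and report how many haplotypes follow each thread.
vg::handle_t start = graph.get_handle(start_id, false);
size_t max_nodes = 10;
auto threads = vg::list_haplotypes(graph, gbwt_index, start,
    [&](const std::vector<gbwt::node_type>& thread) {
        return thread.size() >= max_nodes;
    });
for (auto& thread_and_state : threads) {
    std::cerr << thread_and_state.first.size() << " nodes, "
              << thread_and_state.second.size() << " haplotypes" << std::endl;
}
```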
+ algorithms::expand_context_with_paths(&source, &extractor, extend_distance, true, true, false); + + // Convert to Protobuf I guess + from_path_handle_graph(extractor, out_graph); } // add a frequency of 1 for each normal path @@ -36,22 +56,22 @@ void trace_haplotypes_and_paths(xg::XG& index, const gbwt::GBWT* haplotype_datab // add our haplotypes to the subgraph, naming ith haplotype "thread_i" for (int i = 0; i < haplotypes.size(); ++i) { - Path p = path_from_thread_t(haplotypes[i].first, index); + Path p = path_from_thread_t(haplotypes[i].first, source); p.set_name("thread_" + to_string(i)); - out_thread_frequencies[p.name()] = haplotypes[i].second; + out_thread_frequencies[p.name()] = haplotypes[i].second.size(); *(out_graph.add_path()) = move(p); } } void output_haplotype_counts(ostream& annotation_ostream, - vector>& haplotype_list, xg::XG& index) { + vector>& haplotype_list) { for(int i = 0; i < haplotype_list.size(); i++) { annotation_ostream << i << "\t" << haplotype_list[i].second << endl; } } -Graph output_graph_with_embedded_paths(vector>& haplotype_list, xg::XG& index) { +Graph output_graph_with_embedded_paths(vector>& haplotype_list, const HandleGraph& source) { Graph g; set nodes; set > edges; @@ -59,9 +79,9 @@ Graph output_graph_with_embedded_paths(vector>& haplotype_lis add_thread_nodes_to_set(haplotype_list[i].first, nodes); add_thread_edges_to_set(haplotype_list[i].first, edges); } - construct_graph_from_nodes_and_edges(g, index, nodes, edges); + construct_graph_from_nodes_and_edges(g, source, nodes, edges); for(int i = 0; i < haplotype_list.size(); i++) { - Path p = path_from_thread_t(haplotype_list[i].first, index); + Path p = path_from_thread_t(haplotype_list[i].first, source); p.set_name(to_string(i)); *(g.add_path()) = move(p); } @@ -69,8 +89,8 @@ Graph output_graph_with_embedded_paths(vector>& haplotype_lis } void output_graph_with_embedded_paths(ostream& subgraph_ostream, - vector>& haplotype_list, xg::XG& index, bool json) { - Graph g = output_graph_with_embedded_paths(haplotype_list, index); + vector>& haplotype_list, const HandleGraph& source, bool json) { + Graph g = output_graph_with_embedded_paths(haplotype_list, source); if (json) { subgraph_ostream << pb2json(g); @@ -81,69 +101,75 @@ void output_graph_with_embedded_paths(ostream& subgraph_ostream, } } -void thread_to_graph_spanned(thread_t& t, Graph& g, xg::XG& index) { +void thread_to_graph_spanned(thread_t& t, Graph& g, const HandleGraph& source) { set nodes; set > edges; - nodes.insert(t[0].node_id); + nodes.insert(gbwt::Node::id(t[0])); for(int i = 1; i < t.size(); i++) { - nodes.insert(t[i].node_id); - edges.insert(make_pair(xg::make_side(t[i-1].node_id,t[i-1].is_reverse), - xg::make_side(t[i].node_id,t[i].is_reverse))); - } - for (auto& n : nodes) { - *g.add_node() = index.node(n); - } + nodes.insert(gbwt::Node::id(t[i])); + edges.insert(make_pair(make_side(gbwt::Node::id(t[i-1]),gbwt::Node::is_reverse(t[i-1])), + make_side(gbwt::Node::id(t[i]),gbwt::Node::is_reverse(t[i])))); + } + for (auto& n : nodes) { + handle_t handle = source.get_handle(n); + Node* node = g.add_node(); + node->set_sequence(source.get_sequence(handle)); + node->set_id(n); + } for (auto& e : edges) { Edge edge; - edge.set_from(xg::side_id(e.first)); - edge.set_from_start(xg::side_is_end(e.first)); - edge.set_to(xg::side_id(e.second)); - edge.set_to_end(xg::side_is_end(e.second)); + edge.set_from(side_id(e.first)); + edge.set_from_start(side_is_end(e.first)); + edge.set_to(side_id(e.second)); + 
edge.set_to_end(side_is_end(e.second)); *g.add_edge() = edge; } } void add_thread_nodes_to_set(thread_t& t, set& nodes) { for(int i = 0; i < t.size(); i++) { - nodes.insert(t[i].node_id); + nodes.insert(gbwt::Node::id(t[i])); } } void add_thread_edges_to_set(thread_t& t, set >& edges) { for(int i = 1; i < t.size(); i++) { - edges.insert(make_pair(xg::make_side(t[i-1].node_id,t[i-1].is_reverse), - xg::make_side(t[i].node_id,t[i].is_reverse))); + edges.insert(make_pair(make_side(gbwt::Node::id(t[i-1]),gbwt::Node::is_reverse(t[i-1])), + make_side(gbwt::Node::id(t[i]),gbwt::Node::is_reverse(t[i])))); } } -void construct_graph_from_nodes_and_edges(Graph& g, xg::XG& index, +void construct_graph_from_nodes_and_edges(Graph& g, const HandleGraph& source, set& nodes, set >& edges) { for (auto& n : nodes) { - *g.add_node() = index.node(n); - } + handle_t handle = source.get_handle(n); + Node* node = g.add_node(); + node->set_sequence(source.get_sequence(handle)); + node->set_id(n); + } for (auto& e : edges) { Edge edge; - edge.set_from(xg::side_id(e.first)); - edge.set_from_start(xg::side_is_end(e.first)); - edge.set_to(xg::side_id(e.second)); - edge.set_to_end(xg::side_is_end(e.second)); + edge.set_from(side_id(e.first)); + edge.set_from_start(side_is_end(e.first)); + edge.set_to(side_id(e.second)); + edge.set_to_end(side_is_end(e.second)); *g.add_edge() = edge; } } -Path path_from_thread_t(thread_t& t, xg::XG& index) { +Path path_from_thread_t(thread_t& t, const HandleGraph& source) { Path toReturn; int rank = 1; for(int i = 0; i < t.size(); i++) { Mapping* mapping = toReturn.add_mapping(); // Set up the position - mapping->mutable_position()->set_node_id(t[i].node_id); - mapping->mutable_position()->set_is_reverse(t[i].is_reverse); + mapping->mutable_position()->set_node_id(gbwt::Node::id(t[i])); + mapping->mutable_position()->set_is_reverse(gbwt::Node::is_reverse(t[i])); // Set up the edits Edit* e = mapping->add_edit(); - size_t l = index.node_length(t[i].node_id); + size_t l = source.get_length(source.get_handle(gbwt::Node::id(t[i]))); e->set_from_length(l); e->set_to_length(l); @@ -154,149 +180,82 @@ Path path_from_thread_t(thread_t& t, xg::XG& index) { return toReturn; } -vector > list_haplotypes(xg::XG& index, - xg::XG::ThreadMapping start_node, int extend_distance) { - vector > search_intermediates; - vector > search_results; - thread_t first_thread = {start_node}; - xg::XG::ThreadSearchState first_state; - index.extend_search(first_state,first_thread); - vector edges = start_node.is_reverse ? - index.edges_on_start(start_node.node_id) : - index.edges_on_end(start_node.node_id); - for(int i = 0; i < edges.size(); i++) { - xg::XG::ThreadMapping next_node; - next_node.node_id = edges[i].to(); - next_node.is_reverse = edges[i].to_end(); - xg::XG::ThreadSearchState new_state = first_state; - thread_t t = {next_node}; - index.extend_search(new_state, t); - thread_t new_thread = first_thread; - new_thread.push_back(next_node); - if(!new_state.is_empty()) { - search_intermediates.push_back(make_pair(new_thread,new_state)); - } - } - while(search_intermediates.size() > 0) { - pair last = search_intermediates.back(); - search_intermediates.pop_back(); - int check_size = search_intermediates.size(); - vector edges = last.first.back().is_reverse ? 
- index.edges_on_start(last.first.back().node_id) : - index.edges_on_end(last.first.back().node_id); - if(edges.size() == 0) { - search_results.push_back(make_pair(last.first,last.second.count())); - } else { - for(int i = 0; i < edges.size(); i++) { - xg::XG::ThreadMapping next_node; - next_node.node_id = edges[i].to(); - next_node.is_reverse = edges[i].to_end(); - xg::XG::ThreadSearchState new_state = last.second; - thread_t next_thread = {next_node}; - index.extend_search(new_state,next_thread); - thread_t new_thread = last.first; - new_thread.push_back(next_node); - if(!new_state.is_empty()) { - if(new_thread.size() >= extend_distance) { - search_results.push_back(make_pair(new_thread,new_state.count())); - } else { - search_intermediates.push_back(make_pair(new_thread,new_state)); - } - } - } - if(check_size == search_intermediates.size() && - last.first.size() < extend_distance - 1) { - search_results.push_back(make_pair(last.first,last.second.count())); - } - } - } - return search_results; -} - -vector > list_haplotypes(xg::XG& index, const gbwt::GBWT& haplotype_database, - xg::XG::ThreadMapping start_node, int extend_distance) { +vector, gbwt::SearchState> > list_haplotypes(const HandleGraph& graph, + const gbwt::GBWT& gbwt, + handle_t start, + function&)> stop_fn) { + + // Keep track of all the different paths we're extending + vector, gbwt::SearchState> > search_intermediates; + vector, gbwt::SearchState> > search_results; + // Look up the start node in GBWT and start a thread + gbwt::node_type start_node = handle_to_gbwt(graph, start); + vector first_thread = {start_node}; + gbwt::SearchState first_state = gbwt.find(start_node); + #ifdef debug - cerr << "Extracting haplotypes from GBWT" << endl; + cerr << "Start with state " << first_state << " for node " << gbwt::Node::id(start_node) << ":" + << gbwt::Node::is_reverse(start_node) << endl; #endif - vector > search_intermediates; - vector > search_results; - // We still keep our data as thread_ts full of xg ThreadMappings and convert on the fly. - thread_t first_thread = {start_node}; - auto first_node = gbwt::Node::encode(start_node.node_id, start_node.is_reverse); - gbwt::SearchState first_state = haplotype_database.find(first_node); -#ifdef debug - cerr << "Start with state " << first_state << " for node " << gbwt::Node::id(first_node) << endl; -#endif - vector edges = start_node.is_reverse ? - index.edges_on_start(start_node.node_id) : - index.edges_on_end(start_node.node_id); - - // TODO: this is just most of the loop body repeated! 
- for(int i = 0; i < edges.size(); i++) { - xg::XG::ThreadMapping next_node; - next_node.node_id = edges[i].to(); - next_node.is_reverse = edges[i].to_end(); - auto extend_node = gbwt::Node::encode(next_node.node_id, next_node.is_reverse); - auto new_state = haplotype_database.extend(first_state, extend_node); -#ifdef debug - cerr << "Extend state " << first_state << " to " << new_state << " with " << gbwt::Node::id(extend_node) << endl; -#endif - thread_t new_thread = first_thread; - new_thread.push_back(next_node); - if(!new_state.empty()) { -#ifdef debug - cerr << "\tGot " << new_state.size() << " results; extending more" << endl; -#endif - search_intermediates.push_back(make_pair(new_thread,new_state)); + if (!first_state.empty()) { + search_intermediates.push_back(make_pair(first_thread, first_state)); } - } - while(search_intermediates.size() > 0) { - pair last = search_intermediates.back(); - search_intermediates.pop_back(); - int check_size = search_intermediates.size(); - vector edges = last.first.back().is_reverse ? - index.edges_on_start(last.first.back().node_id) : - index.edges_on_end(last.first.back().node_id); - if(edges.size() == 0) { -#ifdef debug - cerr << "Hit end of graph on state " << last.second << endl; -#endif - search_results.push_back(make_pair(last.first,last.second.size())); - } else { - for(int i = 0; i < edges.size(); i++) { - xg::XG::ThreadMapping next_node; - next_node.node_id = edges[i].to(); - next_node.is_reverse = edges[i].to_end(); - auto extend_node = gbwt::Node::encode(next_node.node_id, next_node.is_reverse); - auto new_state = haplotype_database.extend(last.second, extend_node); + + while(!search_intermediates.empty()) { + + // pick up a thread to continue from the queue + auto last = std::move(search_intermediates.back()); + search_intermediates.pop_back(); + + vector> next_handle_states; + graph.follow_edges(gbwt_to_handle(graph, last.first.back()), false, [&](const handle_t& next) { + // extend the last node of the thread using gbwt + auto extend_node = handle_to_gbwt(graph, next); + auto new_state = gbwt.extend(last.second, extend_node); #ifdef debug - cerr << "Extend state " << last.second << " to " << new_state << " with " << gbwt::Node::id(extend_node) << endl; + cerr << "Extend state " << last.second << " to " << new_state << " with " << gbwt::Node::id(extend_node) << endl; #endif - thread_t new_thread = last.first; - new_thread.push_back(next_node); - if(!new_state.empty()) { - if(new_thread.size() >= extend_distance) { + if (!new_state.empty()) { + next_handle_states.push_back(make_tuple(next, extend_node, new_state)); + } + }); + + for (auto& nhs : next_handle_states) { + + const handle_t& next = get<0>(nhs); + gbwt::node_type& extend_node = get<1>(nhs); + gbwt::SearchState& new_state = get<2>(nhs); + + vector new_thread; + if (&nhs == &next_handle_states.back()) { + // avoid a copy by re-using the vector for the last thread. 
this way simple cases + // like scanning along one path don't blow up to n^2 + new_thread = std::move(last.first); + } else { + new_thread = last.first; + } + new_thread.push_back(extend_node); + + if (stop_fn(new_thread)) { #ifdef debug - cerr << "\tGot " << new_state.size() << " results at limit; emitting" << endl; + cerr << "\tGot " << new_state.size() << " results at limit; emitting" << endl; #endif - search_results.push_back(make_pair(new_thread,new_state.size())); - } else { + search_results.push_back(make_pair(std::move(new_thread), new_state)); + } + else { #ifdef debug - cerr << "\tGot " << new_state.size() << " results; extending more" << endl; + cerr << "\tGot " << new_state.size() << " results; extending more" << endl; #endif - search_intermediates.push_back(make_pair(new_thread,new_state)); - } + search_intermediates.push_back(make_pair(std::move(new_thread), new_state)); + } } - } - if(check_size == search_intermediates.size() && - last.first.size() < extend_distance - 1) { - search_results.push_back(make_pair(last.first,last.second.size())); - } } - } - return search_results; + + return search_results; } + + } diff --git a/src/haplotype_extracter.hpp b/src/haplotype_extracter.hpp index c25034659c5..028b101c742 100644 --- a/src/haplotype_extracter.hpp +++ b/src/haplotype_extracter.hpp @@ -7,66 +7,64 @@ #include #include +#include +#include "bdsg/hash_graph.hpp" -#include "vg.pb.h" -#include "xg.hpp" +#include "algorithms/expand_context.hpp" +#include "gbwt_helper.hpp" namespace vg { using namespace std; -using thread_t = vector; - +using thread_t = vector; + // Walk forward from a node, collecting all haplotypes. Also do a regular // subgraph search for all the paths too. Haplotype thread i will be embedded // as Paths a path with name thread_i. Each path name (including threads) is // mapped to a frequency in out_thread_frequencies. Haplotypes will be pulled -// from the xg index's gPBWT if haplotype_database is null, and from the given -// GBWT index otherwise. -void trace_haplotypes_and_paths(xg::XG& index, const gbwt::GBWT* haplotype_database, +// from the GBWT index. +void trace_haplotypes_and_paths(const PathHandleGraph& source, + const gbwt::GBWT& haplotype_database, vg::id_t start_node, int extend_distance, Graph& out_graph, map& out_thread_frequencies, bool expand_graph = true); -// Turns an (xg-based) thread_t into a (vg-based) Path -Path path_from_thread_t(thread_t& t, xg::XG& index); - -// Lists all the sub-haplotypes of length extend_distance nodes starting at node -// start_node from the set of haplotypes embedded as thread_t's in xg index. -// Records, for each thread_t t the number of haplotypes of which t is a -// subhaplotype -vector > list_haplotypes(xg::XG& index, - xg::XG::ThreadMapping start_node, int extend_distance); +// Turns a (GBWT-based) thread_t into a (vg-based) Path +Path path_from_thread_t(thread_t& t, const HandleGraph& source); -// Lists all the sub-haplotypes of length extend_distance nodes starting at +// Lists all the sub-haplotypes of nodes starting at // node start_node from the set of haplotypes embedded in the geven GBWT -// haplotype database. Records, for each thread_t t the number of haplotypes -// of which t is a subhaplotype -vector > list_haplotypes(xg::XG& index, const gbwt::GBWT& haplotype_database, - xg::XG::ThreadMapping start_node, int extend_distance); +// haplotype database. 
At each step stop_fn() is called on the thread being created, and if it returns true +// then the search stops and the thread is added two the list to be returned. +vector, gbwt::SearchState> > list_haplotypes(const HandleGraph& graph, + const gbwt::GBWT& gbwt, + handle_t start, + function&)> stop_fn); + // writes to subgraph_ostream the subgraph covered by // the haplotypes in haplotype_list, as well as these haplotypes embedded as // Paths. Will output in JSON format if json set to true and Protobuf otherwise. void output_graph_with_embedded_paths(ostream& subgraph_ostream, - vector>& haplotype_list, xg::XG& index, bool json = true); + vector>& haplotype_list, const HandleGraph& source, bool json = true); // get the graph directly -Graph output_graph_with_embedded_paths(vector>& haplotype_list, xg::XG& index); +Graph output_graph_with_embedded_paths(vector>& haplotype_list, const HandleGraph& source); // writes to annotation_ostream the list of counts of identical subhaplotypes // using the same ordering as the Paths from output_graph_with_embedded_paths void output_haplotype_counts(ostream& annotation_ostream, - vector>& haplotype_list, xg::XG& index); + vector>& haplotype_list); // Adds to a Graph the nodes and edges touched by a thread_t -void thread_to_graph_spanned(thread_t& t, Graph& graph, xg::XG& index); +void thread_to_graph_spanned(thread_t& t, Graph& graph, const HandleGraph& source); // Adds to a set of nodes all those touched by thread_t t void add_thread_nodes_to_set(thread_t& t, set& nodes); // Adds to a set of edges all those touched by thread_t t void add_thread_edges_to_set(thread_t& t, set >& edges); // Turns a set of nodes and a set of edges into a Graph -void construct_graph_from_nodes_and_edges(Graph& g, xg::XG& index, +void construct_graph_from_nodes_and_edges(Graph& g, const HandleGraph& source, set& nodes, set >& edges); } diff --git a/src/haplotype_indexer.cpp b/src/haplotype_indexer.cpp new file mode 100644 index 00000000000..528160485cd --- /dev/null +++ b/src/haplotype_indexer.cpp @@ -0,0 +1,474 @@ +/** + * \file haplotype_indexer.cpp: implementations of haplotype indexing with the GBWT + */ + +#include +#include +#include +#include + +#include +#include + +#include "gbwt_helper.hpp" + +#include "haplotype_indexer.hpp" + +#include "path.hpp" +#include "alignment.hpp" + +using namespace std; + +namespace vg { + +HaplotypeIndexer::HaplotypeIndexer() { + // Build the GBWTs silently. + gbwt::Verbosity::set(gbwt::Verbosity::SILENT); +} + +std::vector HaplotypeIndexer::parse_vcf(const std::string& filename, const PathHandleGraph& graph, const std::string& job_name) const { + + // Parse all non-alt paths. + std::vector path_handles; + graph.for_each_path_handle([&](path_handle_t path_handle) { + std::string path_name = graph.get_path_name(path_handle); + if (!Paths::is_alt(path_name)) { + path_handles.push_back(path_handle); + } + }); + + return this->parse_vcf(filename, graph, path_handles, job_name); +} + +std::vector HaplotypeIndexer::parse_vcf(const std::string& filename, const PathHandleGraph& graph, const std::vector& paths, const std::string& job_name) const { + + // Open the VCF file. + vcflib::VariantCallFile variant_file; + variant_file.parseSamples = false; // vcflib parsing is very slow if there are many samples. 
+ std::string temp_filename = filename; + variant_file.open(temp_filename); + if (!variant_file.is_open()) { + std::cerr << "error: [HaplotypeIndexer::parse_vcf] could not open " << filename << std::endl; + std::exit(EXIT_FAILURE); + } + + // How many samples are there? + size_t num_samples = variant_file.sampleNames.size(); + if (num_samples == 0) { + std::cerr << "error: [HaplotypeIndexer::parse_vcf] variant file '" << filename << "' does not contain phasings" << std::endl; + std::exit(EXIT_FAILURE); + } + + // Determine the samples we want to index. + std::pair sample_range = this->sample_range; + sample_range.second = std::min(sample_range.second, num_samples); + std::vector sample_names(variant_file.sampleNames.begin() + sample_range.first, variant_file.sampleNames.begin() + sample_range.second); + if (this->show_progress) { + #pragma omp critical + { + std::cerr << job_name << ": Parsing VCF file " << filename << " with options"; + if (!this->phase_homozygous) { + std::cerr << " --actual-phasing"; + } + if (this->force_phasing) { + std::cerr << " --force-phasing"; + } + if (this->discard_overlaps) { + std::cerr << " --discard-overlaps"; + } + if (!this->rename_variants) { + std::cerr << " --vcf-variants"; + } + std::cerr << std::endl; + std::cerr << job_name << ": Samples " << sample_range.first << " to " << (sample_range.second - 1) << ", batch size " << samples_in_batch << std::endl; + } + } + + // Parse the contigs we are interested in. + std::vector result; + size_t total_variants_processed = 0; + std::mt19937 rng(0xDEADBEEF); + std::uniform_int_distribution random_bit(0, 1); + size_t found_missing_variants = 0; + for (size_t path_id = 0; path_id < paths.size(); path_id++) { + std::string path_name = graph.get_path_name(paths[path_id]); + std::string vcf_contig_name = (this->path_to_vcf.count(path_name) > 0 ? this->path_to_vcf.at(path_name) : path_name); + + // Set the VCF region or process the entire contig. + if (this->regions.count(vcf_contig_name)) { + std::pair region = this->regions.at(vcf_contig_name); + variant_file.setRegion(vcf_contig_name, region.first, region.second); + } else { + variant_file.setRegion(vcf_contig_name); + } + + // Check that the VCF file contains this contig. + vcflib::Variant var(variant_file); + if (!(variant_file.is_open() && variant_file.getNextVariant(var) && var.sequenceName == vcf_contig_name)) { + std::cerr << "warning: [HaplotypeIndexer::parse_vcf] contig " << vcf_contig_name << " not present in file " << filename << std::endl; + continue; + } + if (this->show_progress) { + #pragma omp critical + { + std::cerr << job_name << ": Path " << path_name << " matches VCF contig " << vcf_contig_name; + if (this->regions.count(vcf_contig_name)) { + std::pair region = this->regions.at(vcf_contig_name); + std::cerr << ", region " << region.first << " to " << region.second; + } + std::cerr << std::endl; + } + } + + // Structures to parse the VCF file into. + std::string parse_file = (this->batch_file_prefix.empty() ? gbwt::TempFile::getName("parse") : this->batch_file_prefix + '_' + vcf_contig_name); + gbwt::VariantPaths variants(graph.get_step_count(paths[path_id])); + variants.setSampleNames(sample_names); + variants.setContigName(path_name); + std::vector phasings; + + // Add the reference to VariantPaths. 
+ for (handle_t handle : graph.scan_path(paths[path_id])) { + variants.appendToReference(gbwt::Node::encode(graph.get_id(handle), graph.get_is_reverse(handle))); + } + variants.indexReference(); + + // Create a PhasingInformation file for each batch. + for (size_t batch_start = sample_range.first; batch_start < sample_range.second; batch_start += samples_in_batch) { + size_t batch_size = std::min(samples_in_batch, sample_range.second - batch_start); + if (!this->batch_file_prefix.empty()) { + // Use a permanent file. + phasings.emplace_back(parse_file, batch_start, batch_size); + } else { + // Use a temporary file that persists until the program exits. + phasings.emplace_back(batch_start, batch_size); + phasings.back().makePersistent(); + } + variants.addFile(phasings.back().name(), phasings.back().offset(), phasings.back().size()); + } + + // Parse the variants and the phasings. + size_t variants_processed = 0; + std::vector was_diploid(sample_range.second, true); // Was the sample diploid at the previous site? + do { + // Skip variants with non-DNA sequence, as they are not included in the graph. + bool isDNA = allATGC(var.ref); + for (std::vector::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { + if (!allATGC(*a)) isDNA = false; + } + if (!isDNA) { + continue; + } + + if (this->rename_variants) { + // We need to move the variant over to the contig name + // used in the graph, in order to get the right id for + // it in the graph. + var.sequenceName = path_name; + } + + // Determine the reference nodes for the current variant and create a variant site. + // If the variant is not an insertion, there should be a path for the ref allele. + std::string var_name = make_variant_id(var); + std::string ref_path_name = "_alt_" + var_name + "_0"; + gbwt::vector_type ref_path = extract_as_gbwt_path(graph, ref_path_name); + + size_t ref_pos = variants.invalid_position(); + if (!ref_path.empty()) { + ref_pos = variants.firstOccurrence(ref_path.front()); + if (ref_pos == variants.invalid_position()) { + #pragma omp critical + { + std::cerr << "warning: [HaplotypeIndexer::parse_vcf] invalid ref path for " << var_name << " at " + << var.sequenceName << ":" << var.position << std::endl; + } + continue; + } + } else { + // Try using the alternate alleles instead. + bool found = false; + for (size_t alt_index = 1; alt_index < var.alleles.size(); alt_index++) { + std::string alt_path_name = "_alt_" + var_name + "_" + std::to_string(alt_index); + size_t candidate_pos = 0; + bool candidate_found = false; + gbwt::vector_type pred_nodes = path_predecessors(graph, alt_path_name); + if (!pred_nodes.empty()) { + for (auto node : pred_nodes) { + size_t pred_pos = variants.firstOccurrence(node); + if (pred_pos != variants.invalid_position()) { + candidate_pos = std::max(candidate_pos, pred_pos + 1); + candidate_found = true; + found = true; + } + } + // For each alternate allele, find the rightmost reference node among + // its predecessors. If multiple alleles have candidates for the + // reference position, choose the leftmost one. + if (candidate_found) { + ref_pos = std::min(ref_pos, candidate_pos); + } + } + } + if (!found) { + // This variant from the VCF is just not in the graph, so skip it. + found_missing_variants++; + if (this->warn_on_missing_variants && found_missing_variants <= this->max_missing_variant_warnings) { + #pragma omp critical + { + // The user might not know it. Warn them in case they mixed up their VCFs. 
+ std::cerr << "warning: [HaplotypeIndexer::parse_vcf] alt and ref paths for " << var_name + << " at " << var.sequenceName << ":" << var.position + << " missing/empty! Was the variant skipped during construction?" << std::endl; + if (found_missing_variants == this->max_missing_variant_warnings) { + std::cerr << "warning: [HaplotypeIndexer::parse_vcf] suppressing further missing variant warnings" << std::endl; + } + } + } + continue; + } + } + variants.addSite(ref_pos, ref_pos + ref_path.size()); + + // Add alternate alleles to the site. + for (size_t alt_index = 1; alt_index < var.alleles.size(); alt_index++) { + std::string alt_path_name = "_alt_" + var_name + "_" + std::to_string(alt_index); + variants.addAllele(extract_as_gbwt_path(graph, alt_path_name)); + } + + // Store the phasings in PhasingInformation structures. + std::vector genotypes = parseGenotypes(var.originalLine, num_samples); + for (size_t batch = 0; batch < phasings.size(); batch++) { + std::vector current_phasings; + for (size_t sample = phasings[batch].offset(); sample < phasings[batch].limit(); sample++) { + current_phasings.emplace_back(genotypes[sample], was_diploid[sample], this->phase_homozygous); + was_diploid[sample] = current_phasings.back().diploid; + if(this->force_phasing) { + current_phasings.back().forcePhased([&]() { + return random_bit(rng); + }); + } + } + phasings[batch].append(current_phasings); + } + variants_processed++; + } + while (variant_file.is_open() && variant_file.getNextVariant(var) && var.sequenceName == vcf_contig_name); // End of variants. + if (this->show_progress) { + size_t phasing_bytes = 0; + for (size_t batch = 0; batch < phasings.size(); batch++) { + phasing_bytes += phasings[batch].bytes(); + } + #pragma omp critical + { + std::cerr << job_name << ": Processed " << variants_processed << " variants on path " << path_name << ", " << gbwt::inMegabytes(phasing_bytes) << " MiB phasing information" << std::endl; + std::cerr << job_name << ": Saving the VCF parse for path " << path_name << " to " << parse_file << std::endl; + } + } + + // Save the VCF parse. + if (!sdsl::store_to_file(variants, parse_file)) { + std::cerr << "error: [HaplotypeIndexer::parse_vcf] cannot write parse file " << parse_file << std::endl; + std::exit(EXIT_FAILURE); + } + result.push_back(parse_file); + + // End of haplotype generation for the current contig. + total_variants_processed += variants_processed; + } // End of contigs. + + if (this->warn_on_missing_variants && found_missing_variants > 0) { + #pragma omp critical + { + std::cerr << "warning: [HaplotypeIndexer::parse_vcf] Found " << found_missing_variants << "/" << total_variants_processed + << " variants in phasing VCF but not in graph! Do your graph and VCF match?" << std::endl; + } + } + + return result; +} + +std::unique_ptr HaplotypeIndexer::build_gbwt(const std::vector& vcf_parse_files, + const std::string& job_name, + const PathHandleGraph* graph, + const std::unordered_set* paths, + bool skip_unvisited_paths) const { + + // GBWT index. + std::unique_ptr index(new gbwt::DynamicGBWT()); + index->addMetadata(); + if (vcf_parse_files.empty() && !graph) { + return index; + } + + { + // New stuff we're adding to the GBWT metadata + std::set haplotypes; + std::vector sample_names, contig_names; + + // Haplotype construction for each contig from VCF. 
+ for (const std::string& filename : vcf_parse_files) { + gbwt::VariantPaths variants; + if (!sdsl::load_from_file(variants, filename)) { + std::cerr << "error: [HaplotypeIndexer::build_gbwt] cannot load VCF parse from " << filename << std::endl; + std::exit(EXIT_FAILURE); + } + if (!variants.hasContigName() || !variants.hasSampleNames()) { + std::cerr << "error: [HaplotypeIndexer::build_gbwt] VCF parse file " << filename << " does not contain sample/contig names" << std::endl; + std::exit(EXIT_FAILURE); + } + if (sample_names.empty()) { + sample_names = variants.getSampleNames(); + } else if (sample_names != variants.getSampleNames()) { + std::cerr << "error: [HaplotypeIndexer::build_gbwt] invalid sample names in VCF parse file " << filename << std::endl; + std::exit(EXIT_FAILURE); + } + contig_names.emplace_back(variants.getContigName()); + if (this->show_progress) { + #pragma omp critical + { + std::cerr << job_name << ": Generating haplotypes for path " << variants.getContigName() << " from file " << filename << std::endl; + } + } + gbwt::GBWTBuilder builder(variants.nodeWidth(true), this->gbwt_buffer_size * gbwt::MILLION, this->id_interval); + builder.swapIndex(*index); + gbwt::generateHaplotypes(variants, std::set(), + [&](gbwt::size_type sample_id) -> bool { + return (this->excluded_samples.find(sample_names[sample_id]) == this->excluded_samples.end()); + }, [&](const gbwt::Haplotype& haplotype) { + builder.insert(haplotype.path, true); // Insert in both orientations. + builder.index.metadata.addPath(haplotype.sample, contig_names.size() - 1, haplotype.phase, haplotype.count); + haplotypes.insert(gbwt::range_type(haplotype.sample, haplotype.phase)); + }, [&](gbwt::size_type, gbwt::size_type) -> bool { + // For each overlap, discard it if our global flag is set. + return this->discard_overlaps; + }); + builder.finish(); + builder.swapIndex(*index); + } + + // Now count the haplotypes we added + index->metadata.setHaplotypes(index->metadata.haplotypes() + haplotypes.size()); + // And remember the samples and contigs we created + index->metadata.setSamples(sample_names); + index->metadata.setContigs(contig_names); + } + + if (graph) { + // Also include graph named paths from the graph + + // Actual work. + if (show_progress) { + #pragma omp critical + { + std::cerr << "Indexing embedded paths" << std::endl; + } + } + + // GBWT construction, into existing cumulative index. + gbwt::GBWTBuilder builder(gbwt_node_width(*graph), this->gbwt_buffer_size * gbwt::MILLION, this->id_interval); + builder.swapIndex(*index); + + std::unordered_set visited_contig_names; + if (skip_unvisited_paths) { + // Make a set of the contigs visited so we can filter down to them. + for (size_t i = 0; i < builder.index.metadata.contig_names.size(); i++) { + visited_contig_names.insert(builder.index.metadata.contig(i)); + } + } + + // Set up a filter to drop alt paths and other paths we don't want. + std::function path_filter = [&](const path_handle_t& path_handle) { + std::string path_name = graph->get_path_name(path_handle); + if (Paths::is_alt(path_name)) { + return false; + } + if (skip_unvisited_paths && !visited_contig_names.count(graph->get_locus_name(path_handle))) { + // This path is on an unvisited contig + return false; + } + if (paths && !paths->count(path_handle)) { + return false; + } + return true; + }; + // And add every path that passes the filter (including haplotype paths) from the source graph. 
+ gbwtgraph::store_paths(builder, *graph, {PathSense::GENERIC, PathSense::REFERENCE, PathSense::HAPLOTYPE}, &path_filter); + + // Finish the construction for this set of threads and put the index back. + builder.finish(); + builder.swapIndex(*index); + } + + // Finish the construction overall. + if (this->show_progress) { + std::cerr << job_name << ": "; + gbwt::operator<<(std::cerr, index->metadata); + std::cerr << std::endl; + } + return index; +} + +std::unique_ptr HaplotypeIndexer::build_gbwt(const PathHandleGraph& graph) const { + // Fall back to the general vcf-and-graph implementation + return build_gbwt({}, "GBWT", &graph); +} + +std::unique_ptr HaplotypeIndexer::build_gbwt(const PathHandleGraph& graph, + const std::vector& aln_filenames, const std::string& aln_format) const { + + // GBWT metadata. + std::vector sample_names, contig_names; + std::map> sample_info; // name -> (id, count) + contig_names.push_back("0"); // An artificial contig. + size_t haplotype_count = 0; + + // GBWT construction. + gbwt::GBWTBuilder builder(gbwt_node_width(graph), this->gbwt_buffer_size * gbwt::MILLION, this->id_interval); + builder.index.addMetadata(); + + // Actual work. + if (this->show_progress) { + #pragma omp critical + { + std::cerr << "Converting " << aln_format << " to threads" << std::endl; + } + } + std::function lambda = [&](Alignment& aln) { + gbwt::vector_type buffer; + for (auto& m : aln.path().mapping()) { + buffer.push_back(mapping_to_gbwt(m)); + } + builder.insert(buffer, true); // Insert in both orientations. + size_t sample_id = 0, sample_count = 0; + auto iter = sample_info.find(aln.name()); + if (iter == sample_info.end()) { + sample_id = sample_names.size(); + sample_names.push_back(aln.name()); + sample_info[aln.name()] = std::pair(sample_id, sample_count); + haplotype_count++; + } else { + sample_id = iter->second.first; + sample_count = iter->second.second; + iter->second.second++; + } + builder.index.metadata.addPath(sample_id, 0, 0, sample_count); + }; + for (auto& file_name : aln_filenames) { + if (aln_format == "GAM") { + get_input_file(file_name, [&](istream& in) { + vg::io::for_each(in, lambda); + }); + } else { + assert(aln_format == "GAF"); + vg::io::gaf_unpaired_for_each(graph, file_name, lambda); + } + } + + // Finish the construction and extract the index. + finish_gbwt_constuction(builder, sample_names, contig_names, haplotype_count, this->show_progress); + std::unique_ptr built(new gbwt::DynamicGBWT()); + builder.swapIndex(*built); + return built; +} + +} diff --git a/src/haplotype_indexer.hpp b/src/haplotype_indexer.hpp new file mode 100644 index 00000000000..f555aec947b --- /dev/null +++ b/src/haplotype_indexer.hpp @@ -0,0 +1,146 @@ +#ifndef VG_HAPLOTYPE_INDEXER_HPP_INCLUDED +#define VG_HAPLOTYPE_INDEXER_HPP_INCLUDED + +/** + * \file haplotype_indexer.hpp: defines how to construct GBWT indexes and VCF parses. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + + +#include "handle.hpp" +#include "progressive.hpp" + +namespace vg { + +using namespace std; + +/** + * Allows indexing haplotypes, either to pre-parsed haplotype files or to a GBWT. 
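Taken together, parse_vcf and build_gbwt implement the usual two-step flow: first parse the phased VCF into per-contig parse files, then build the index from those parses, optionally also embedding the graph's own non-alt paths. A hedged usage sketch (the wrapper function and file names are illustrative only):

```cpp
#include <memory>
#include <string>
#include <vector>
#include "haplotype_indexer.hpp"

// Build a DynamicGBWT for a graph plus a phased VCF.
std::unique_ptr<gbwt::DynamicGBWT> make_gbwt(const vg::PathHandleGraph& graph,
                                             const std::string& vcf_filename) {
    vg::HaplotypeIndexer indexer;
    indexer.force_phasing = true;  // arbitrarily phase any unphased genotypes

    // Step 1: parse the VCF into per-contig GBWT parse files (temporary files
    // unless batch_file_prefix is set).
    std::vector<std::string> parse_files = indexer.parse_vcf(vcf_filename, graph);

    // Step 2: build the GBWT from the parses, also indexing the graph's
    // embedded non-alt paths.
    return indexer.build_gbwt(parse_files, "GBWT", &graph);
}
```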
+ */ +class HaplotypeIndexer : public Progressive { +public: + /// Print a warning if variants in the VCF can't be found in the graph + bool warn_on_missing_variants = true; + + /// Only report up to this many of them + size_t max_missing_variant_warnings = 10; + + /// Path names in the graph are mapped to VCF contig names via path_to_vcf, + /// or used as-is if no entry there is found. + std::map path_to_vcf; + + /// Use graph path names instead of VCF path names when composing variant + /// alt paths. + bool rename_variants = true; + + /// If batch_file_prefix is nonempty, a file for each contig is saved to + /// PREFIX_VCFCONTIG, and files for each batch of haplotypes are saved to + /// files named like PREFIX_VCFCONTIG_STARTSAMPLE_ENDSAMPLE. Otherwise, the + /// batch files are still saved, but to temporary files. + std::string batch_file_prefix = ""; + + /// Phase homozygous unphased variants + bool phase_homozygous = true; + + /// Arbitrarily phase all unphased variants + bool force_phasing = false; + + /// Join together overlapping haplotypes + bool discard_overlaps = false; + + /// Number of samples to process together in a haplotype batch. + size_t samples_in_batch = 200; + + /// Size of the GBWT buffer in millions of nodes + size_t gbwt_buffer_size = gbwt::DynamicGBWT::INSERT_BATCH_SIZE / gbwt::MILLION; + + /// Interval at which to sample for GBWT locate + size_t id_interval = gbwt::DynamicGBWT::SAMPLE_INTERVAL; + + /// Range of VCF samples to process (first to past-last). + std::pair sample_range = std::pair(0, std::numeric_limits::max()); + + /// Region restrictions for contigs, in VCF name space, as 0-based + /// exclusive-end ranges. + std::map> regions; + + /// Excluded VCF sample names, for which threads will not be generated. + /// Ignored during VCF parsing. + std::unordered_set excluded_samples; + + /// Perform initialization of backing libraries + HaplotypeIndexer(); + + /** + * Parse the VCF file into the types needed for GBWT indexing. + * + * Returns the file names for the VCF parses of non-alt paths. If + * batch_file_prefix is set, these are permanent files. Otherwise they + * are temporary files that persist until the program exits. + */ + std::vector parse_vcf(const std::string& filename, const PathHandleGraph& graph, const std::string& job_name = "GBWT") const; + + /** + * Parse the VCF file into the types needed for GBWT indexing. + * + * Returns the file names for the VCF parses of the specified paths. If + * batch_file_prefix is set, these are permanent files. Otherwise they + * are temporary files that persist until the program exits. + */ + std::vector parse_vcf(const std::string& filename, const PathHandleGraph& graph, const std::vector& paths, const std::string& job_name = "GBWT") const; + + /** + * Build a GBWT from the haplotypes in the given VCF parse files. + * + * Respects excluded_samples and does not produce threads for them. + * + * We expect that all parse files contain sample/contig names and + * that the sample names are the same in all files. + * + * There may be no parse files. + * + * If graph is provided and is not null, also includes embedded non-alt + * paths from the graph. + * + * If paths is provided and is not null, include only those specified paths + * from the graph. If skip_unvisited_paths is set, paths whose contigs are + * not visited by VCF parse files will be skipped. 
+ */ + std::unique_ptr build_gbwt(const std::vector& vcf_parse_files, + const std::string& job_name = "GBWT", + const PathHandleGraph* graph = nullptr, + const std::unordered_set* paths = nullptr, + bool skip_unvisited_paths = false) const; + + /** + * Build a GBWT from the embedded non-alt paths in the graph. + */ + std::unique_ptr build_gbwt(const PathHandleGraph& graph) const; + + /** + * Build a GBWT from the alignments. Each distinct alignment name becomes + * a sample in the GBWT metadata. If there are multiple alignments with + * the same name, the corresponding GBWT path names will have the same + * sample identifier but different values in the count field. + * + * aln_format can be "GAM" or "GAF" + */ + std::unique_ptr build_gbwt(const PathHandleGraph& graph, + const std::vector& aln_filenames, const std::string& aln_format) const; +}; + +} + +#endif diff --git a/src/haplotypes.cpp b/src/haplotypes.cpp index 13363f7fabb..b2e24782285 100644 --- a/src/haplotypes.cpp +++ b/src/haplotypes.cpp @@ -7,107 +7,6 @@ namespace haplo { // By default, should we warn when haplotype scoring fails? bool warn_on_score_fail = false; -/******************************************************************************* -haplo_DP_edge_memo -*******************************************************************************/ - -haplo_DP_edge_memo::haplo_DP_edge_memo() : - in(vector(0)), out(vector(0)) { - -} - -haplo_DP_edge_memo::haplo_DP_edge_memo(xg::XG& graph, - xg::XG::ThreadMapping last_node, - xg::XG::ThreadMapping node) { - if(has_edge(graph, last_node, node)) { - out = last_node.is_reverse ? - graph.edges_on_start(last_node.node_id) : - graph.edges_on_end(last_node.node_id); - in = node.is_reverse ? - graph.edges_on_end(node.node_id) : - graph.edges_on_start(node.node_id); - } else { - out = vector(0); - in = vector(0); - } -} - -const vector& haplo_DP_edge_memo::edges_in() const { - return in; -} - -const vector& haplo_DP_edge_memo::edges_out() const { - return out; -} - -bool haplo_DP_edge_memo::is_null() const { - return out.size() == 0; -} - -bool haplo_DP_edge_memo::has_edge(xg::XG& graph, xg::XG::ThreadMapping old_node, xg::XG::ThreadMapping new_node) { - vg::Edge edge_taken = xg::make_edge(old_node.node_id, old_node.is_reverse, new_node.node_id, new_node.is_reverse); - - bool edge_found = false; - - const vector& edges = old_node.is_reverse ? graph.edges_on_start(old_node.node_id) : - graph.edges_on_end(old_node.node_id); - - for(auto& edge : edges) { - // Look at every edge in order. - if(xg::edges_equivalent(edge, edge_taken)) { - // If we found the edge we're taking, break. 
- edge_found = true; - break; - } - } - return edge_found; -} - -/******************************************************************************* -hDP_graph_accessor -*******************************************************************************/ - -hDP_graph_accessor::hDP_graph_accessor(xg::XG& graph, - xg::XG::ThreadMapping new_node, - haploMath::RRMemo& memo) : - graph(graph), edges(haplo_DP_edge_memo()), - old_node(xg::XG::ThreadMapping()), new_node(new_node), memo(memo) { - -} - -hDP_graph_accessor::hDP_graph_accessor(xg::XG& graph, - xg::XG::ThreadMapping old_node, - xg::XG::ThreadMapping new_node, - haploMath::RRMemo& memo) : - graph(graph), old_node(old_node), new_node(new_node), memo(memo), - edges(haplo_DP_edge_memo(graph, old_node, new_node)) { - -} - -bool hDP_graph_accessor::has_edge() const { - return !edges.is_null(); -} - -int64_t hDP_graph_accessor::new_side() const { - return graph.id_to_rank(new_node.node_id) * 2 + new_node.is_reverse; -} - -int64_t hDP_graph_accessor::new_height() const { - return graph.node_height(new_node); -} - -int64_t hDP_graph_accessor::old_height() const { - return graph.node_height(old_node); -} - -int64_t hDP_graph_accessor::new_length() const { - return graph.node_length(new_node.node_id); -} - -void hDP_graph_accessor::print(ostream& output_stream) const { - output_stream << "From node: ID " << old_node.node_id << " is_reverse " << old_node.is_reverse << " ; To node: ID " << old_node.node_id << " is_reverse " << new_node.is_reverse << " ; Reference haplotypes visiting To Node: " << new_height() << endl; -} - /******************************************************************************* hDP_gbwt_graph_accessor *******************************************************************************/ @@ -171,32 +70,6 @@ haplo_DP_rectangle::haplo_DP_rectangle(bool inclusive_interval) : int_is_inc(inc } -void haplo_DP_rectangle::extend(hDP_graph_accessor& ga) { - int64_t new_side = ga.new_side(); - if(previous_index == -1) { - // We're extending an empty state - state.first = 0; - state.second = ga.new_height(); - } else if(!ga.edges.is_null()) { - state.first = ga.graph.where_to(flat_node, - state.first, - new_side, - ga.edges.edges_in(), - ga.edges.edges_out()); - state.second = ga.graph.where_to(flat_node, - state.second, - new_side, - ga.edges.edges_in(), - ga.edges.edges_out()); - } else { - // gPBWT can't extend across an edge it doesn't know about; don't try - state.first = 0; - state.second = 0; - } - flat_node = new_side; - inner_value = -1; -} - void haplo_DP_rectangle::calculate_I(int64_t succ_o_val) { inner_value = interval_size() - succ_o_val; } @@ -261,6 +134,8 @@ haplo_DP_column::~haplo_DP_column() { void haplo_DP_column::update_inner_values() { for(size_t i = 0; i + 1 < entries.size(); i++) { + assert(entries[i].get() != nullptr); + assert(entries[i+1].get() != nullptr); entries[i]->calculate_I(entries[i+1]->interval_size()); } if(!entries.empty()) { @@ -285,7 +160,9 @@ void haplo_DP_column::update_inner_values() { // } void haplo_DP_column::update_score_vector(haploMath::RRMemo& memo) { + assert(!entries.empty()); auto r_0 = entries.at(0); + assert(r_0.get() != nullptr); if(entries.size() == 1 && entries.at(0)->prev_idx() == -1) { r_0->R = -memo.log_population_size(); sum = r_0->R + log(r_0->interval_size()); @@ -299,6 +176,7 @@ void haplo_DP_column::update_score_vector(haploMath::RRMemo& memo) { vector continuing_Rs(entries.size() - offset); vector continuing_counts(entries.size() - offset); for(size_t i = offset; i < entries.size(); 
i++) { + assert(entries[i].get() != nullptr); continuing_Rs.at(i - offset) = previous_values[entries[i]->prev_idx()]; continuing_counts.at(i - offset) = entries[i]->I(); } @@ -314,14 +192,16 @@ void haplo_DP_column::update_score_vector(haploMath::RRMemo& memo) { i = 1; } if(length == 1) { - for(i; i < entries.size(); i++) { + for(; i < entries.size(); i++) { + assert(entries[i].get() != nullptr); double logLHS = memo.logT_base + previous_R(i) + memo.logT(length); entries[i]->R = haploMath::logsum(logLHS, logpS1S2RRS); } } else { - for(i; i < entries.size(); i++) { + for(; i < entries.size(); i++) { + assert(entries[i].get() != nullptr); double logLHS = memo.logT_base + haploMath::logsum(logS1RRD, previous_R(i) + memo.logT(length)); entries[i]->R = haploMath::logsum(logLHS, logpS1S2RRS); @@ -334,12 +214,14 @@ void haplo_DP_column::update_score_vector(haploMath::RRMemo& memo) { } double haplo_DP_column::previous_R(size_t i) const { + assert(entries.at(i).get() != nullptr); return previous_values[(entries.at(i))->prev_idx()]; } vector haplo_DP_column::get_scores() const { vector to_return; for(size_t i = 0; i < entries.size(); i++) { + assert(entries[i].get() != nullptr); to_return.push_back(entries[i]->R); } return to_return; @@ -348,6 +230,7 @@ vector haplo_DP_column::get_scores() const { vector haplo_DP_column::get_sizes() const { vector to_return; for(size_t i = 0; i < entries.size(); i++) { + assert(entries[i].get() != nullptr); to_return.push_back(entries[i]->I()); } return to_return; @@ -363,6 +246,7 @@ void haplo_DP_column::print(ostream& out) const { for(size_t j = 0; j < get_sizes().size() - i - 1; j++) { out << " "; } + assert(entries.at(i).get() != nullptr); out << entries.at(i)->I() << "] : " << entries.at(i)->interval_size() << endl; } } @@ -371,46 +255,6 @@ bool haplo_DP_column::is_empty() const { return entries.size() == 0; } -/******************************************************************************* -haplo_DP -*******************************************************************************/ -haplo_score_type haplo_DP::score(const vg::Path& path, xg::XG& graph, haploMath::RRMemo& memo) { - return score(path_to_thread_t(path), graph, memo); -} - -haplo_score_type haplo_DP::score(const thread_t& thread, xg::XG& graph, haploMath::RRMemo& memo) { - hDP_graph_accessor ga_i(graph, thread[0], memo); - haplo_DP hdp(ga_i); -#ifdef debug - cerr << "After entry 0 (" << thread[0].node_id << ") height: " << ga_i.new_height() << " score: " << hdp.DP_column.current_sum() << endl; -#endif - if(ga_i.new_height() == 0) { - if (warn_on_score_fail) { - cerr << "[WARNING] Initial node in path is visited by 0 reference haplotypes" << endl; - cerr << "Cannot compute a meaningful haplotype likelihood score" << endl; - ga_i.print(cerr); - } - return pair(nan(""), false); - } - for(size_t i = 1; i < thread.size(); i++) { - hDP_graph_accessor ga(graph, thread[i-1], thread[i], memo); - if(ga.new_height() == 0) { - if (warn_on_score_fail) { - cerr << "[WARNING] Node " << i + 1 << " in path is visited by 0 reference haplotypes" << endl; - cerr << "Cannot compute a meaningful haplotype likelihood score" << endl; - ga.print(cerr); - } - return pair(nan(""), false); - } else { - hdp.DP_column.extend(ga); - } -#ifdef debug - cerr << "After entry " << i << " (" << thread[i].node_id << ") height: " << ga.new_height() << " score: " << hdp.DP_column.current_sum() << endl; -#endif - } - return pair(hdp.DP_column.current_sum(), true); -} - haplo_DP_column* haplo_DP::get_current_column() { return &DP_column; } 
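The scoring recursion above combines log-probabilities with `haploMath::logsum`, i.e. it never leaves log space. As a point of reference only (this is a generic sketch of the standard log-sum-exp identity, not the vg helper itself, which may differ in signature and edge-case handling):

```
#include <algorithm>
#include <cmath>

// Return log(exp(a) + exp(b)) computed stably in log space, so that adding
// two very small probabilities (large negative logs) does not underflow.
double log_sum_exp(double a, double b) {
    double hi = std::max(a, b);
    double lo = std::min(a, b);
    return hi + std::log1p(std::exp(lo - hi));
}
```

Keeping the running sums in log space is what lets `update_score_vector` accumulate per-rectangle scores over long haplotype threads without numerical underflow.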
@@ -442,15 +286,36 @@ size_t linear_haplo_structure::path_mapping_offset(const vg::Path& path, size_t } int64_t linear_haplo_structure::get_SNP_ref_position(size_t node_id) const { - vector lnbr_edges = xg_index.edges_on_start(node_id); - int64_t lnbr = lnbr_edges[0].from(); - vector SNP_allele_edges = xg_index.edges_on_end(lnbr); - for(size_t i = 0; i < SNP_allele_edges.size(); i++) { - if(xg_index.path_contains_node(xg_index.path_name(xg_ref_rank), SNP_allele_edges[i].to())) { - return position_assuming_acyclic(SNP_allele_edges[i].to()); + // walk to the left and the neighbor there + vg::handle_t lnbr; + bool found_lnbr = !graph.follow_edges(graph.get_handle(node_id), true, + [&](const vg::handle_t& prev) { + lnbr = prev; + return false; + }); + if (!found_lnbr) { + throw runtime_error("SNP at node ID " + to_string(node_id) + " does not have neighbors that can be used to find reference path " + graph.get_path_name(ref_path_handle)); } - } - throw runtime_error("no ref allele at SNP"); + + // walk back to the right and get the position of the allele that's on the + // reference path + int64_t ref_pos; + bool found_ref_pos = !graph.follow_edges(lnbr, false, + [&](const vg::handle_t& next) { + return graph.for_each_step_on_handle(next, [&](const vg::step_handle_t& step) { + if (graph.get_path_handle_of_step(step) == ref_path_handle) { + ref_pos = graph.get_position_of_step(step); + return false; + } + return true; + }); + }); + + if (!found_ref_pos) { + throw runtime_error("SNP at node ID " + to_string(node_id) + " is not adjacent to the reference path " + graph.get_path_name(ref_path_handle)); + } + + return ref_pos; } void linear_haplo_structure::SNVvector::push_back(alleleValue allele, size_t ref_pos, bool deletion) { @@ -463,7 +328,7 @@ void linear_haplo_structure::SNVvector::push_back(alleleValue allele, size_t ref } alleleValue linear_haplo_structure::get_SNV_allele(int64_t node_id) const { - char allele_char = xg_index.node_sequence(node_id).at(0); + char allele_char = graph.get_sequence(graph.get_handle(node_id)).at(0); return allele::from_char(allele_char); } @@ -480,7 +345,7 @@ size_t linear_haplo_structure::SNVvector::size() const { } bool linear_haplo_structure::sn_deletion_between_ref(int64_t left, int64_t right) const { - int64_t gap = position_assuming_acyclic(right) - position_assuming_acyclic(left) - xg_index.node_length(left); + int64_t gap = position_assuming_acyclic(right) - position_assuming_acyclic(left) - graph.get_length(graph.get_handle(left)); if(gap == 0) { return false; } else if(gap == 1) { @@ -491,26 +356,32 @@ bool linear_haplo_structure::sn_deletion_between_ref(int64_t left, int64_t right } int64_t linear_haplo_structure::get_ref_following(int64_t node_id) const { - vector r_edges = xg_index.edges_on_end(node_id); - vector refs; - for(size_t i = 0; i < r_edges.size(); i++) { - if(xg_index.path_contains_node(xg_index.path_name(xg_ref_rank), r_edges[i].to())) { - refs.push_back(r_edges[i].to()); + // walk to the right and get all nodes on the reference path + vector refs; + graph.follow_edges(graph.get_handle(node_id), false, [&](const vg::handle_t& next) { + graph.for_each_step_on_handle(next, [&](const vg::step_handle_t& step) { + if (graph.get_path_handle_of_step(step) == ref_path_handle) { + refs.push_back(graph.get_handle_of_step(step)); + return false; + } + return true; + }); + }); + + if (refs.empty()) { + throw runtime_error("SNP at node ID " + to_string(node_id) + " does not have a following node on the reference path " + 
graph.get_path_name(ref_path_handle)); } - } - if(refs.size() == 0) { - throw runtime_error("no ref node following"); - } - size_t smallest = SIZE_MAX; - int64_t node = refs[0]; - for(size_t i = 0; i < refs.size(); i++) { - auto pos = position_assuming_acyclic(refs[i]); - if(pos < smallest) { - smallest = pos; - node = refs[i]; + + size_t smallest = numeric_limits::max(); + vg::handle_t node_at_smallest; + for(size_t i = 0; i < refs.size(); i++) { + auto pos = position_assuming_acyclic(graph.get_id(refs[i])); + if(pos < smallest) { + smallest = pos; + node_at_smallest = refs[i]; + } } - } - return node; + return graph.get_id(node_at_smallest); } linear_haplo_structure::SNVvector linear_haplo_structure::SNVs(const vg::Path& path) const { @@ -543,14 +414,14 @@ linear_haplo_structure::SNVvector linear_haplo_structure::SNVs(const vg::Path& p throw linearUnrepresentable("not an SNV"); } else if(this_type == snv) { this_pos = get_SNP_ref_position(this_node); - if(this_pos != last_pos + xg_index.node_length(last_node)) { + if(this_pos != last_pos + graph.get_length(graph.get_handle(last_node))) { throw linearUnrepresentable("indel immediately before SNV"); } to_return.push_back(get_SNV_allele(this_node), this_pos, false); } else { this_pos = position_assuming_acyclic(this_node); if(last_type == snv) { - if(this_pos != last_pos + xg_index.node_length(last_node)) { + if(this_pos != last_pos + graph.get_length(graph.get_handle(last_node))) { throw linearUnrepresentable("indel immediately after SNV"); } } else { @@ -569,123 +440,119 @@ linear_haplo_structure::SNVvector linear_haplo_structure::SNVs(const vg::Path& p } size_t linear_haplo_structure::position_assuming_acyclic(int64_t node_id) const { - if(!xg_index.path_contains_node(xg_index.path_name(xg_ref_rank), node_id)) { - throw runtime_error("requested position-in-path of node " + to_string(node_id) + " not in path " + xg_index.path_name(xg_ref_rank)); + + // check occurrences o this node on paths + size_t pos; + bool found_pos = !graph.for_each_step_on_handle(graph.get_handle(node_id), [&](const vg::step_handle_t& step) { + + // get the pos if the path matches + if (graph.get_path_handle_of_step(step) == ref_path_handle) { + pos = graph.get_position_of_step(step); + return false; + } + return true; + }); + + if (!found_pos) { + throw runtime_error("requested position-in-path of node " + to_string(node_id) + " not in path " + graph.get_path_name(ref_path_handle)); } - - // First vet the orientation. - // TODO: This is an extra query. 
- auto oriented_rank = xg_index.oriented_occurrences_on_path(node_id, xg_ref_rank).at(0); - // The whole system we use for the linear index assumes the graph nodes are all forward - assert(!oriented_rank.second); - - // Get the actual position - return xg_index.position_in_path(node_id, xg_ref_rank).at(0); + + return pos; } bool linear_haplo_structure::is_solitary_ref(int64_t node_id) const { - if(!xg_index.path_contains_node(xg_index.path_name(xg_ref_rank), node_id)) { - return false; - } - - vector l_edges = xg_index.edges_on_start(node_id); - vector r_edges = xg_index.edges_on_end(node_id); - for(size_t i = 0; i < l_edges.size(); i++) { - if(xg_index.edges_on_end(l_edges[i].from()).size() != 1) { - bool is_deletion_neighbour = true; - vector neighbour_r_edges = xg_index.edges_on_end(l_edges[i].from()); - for(size_t i = 0; i < neighbour_r_edges.size(); i++) { - if(neighbour_r_edges[i].to() != node_id) { - vector neighbour_rr_edges = xg_index.edges_on_end(neighbour_r_edges[i].to()); - if(!(neighbour_rr_edges.size() == 1 && neighbour_rr_edges[0].to() == node_id)) { - is_deletion_neighbour = false; - } - } - } - if(!is_deletion_neighbour) { + vg::handle_t handle = graph.get_handle(node_id); + bool on_ref = !graph.for_each_step_on_handle(handle, [&](const vg::step_handle_t& step) { + return graph.get_path_handle_of_step(step) != ref_path_handle; + }); + + if (!on_ref) { return false; - } } - } - for(size_t i = 0; i < r_edges.size(); i++) { - if(xg_index.edges_on_start(r_edges[i].to()).size() != 1) { - bool is_deletion_neighbour = true; - vector neighbour_l_edges = xg_index.edges_on_start(r_edges[i].to()); - for(size_t i = 0; i < neighbour_l_edges.size(); i++) { - if(neighbour_l_edges[i].from() != node_id) { - vector neighbour_ll_edges = xg_index.edges_on_start(neighbour_l_edges[i].from()); - if(!(neighbour_ll_edges.size() == 1 && neighbour_ll_edges[0].from() == node_id)) { - is_deletion_neighbour = false; - } + + bool is_deletion_neighbour = true; + graph.follow_edges(handle, true, [&](const vg::handle_t& prev) { + if (graph.get_degree(prev, false) != 1) { + graph.follow_edges(prev, false, [&](const vg::handle_t& next) { + if (next != handle) { + size_t rr_count = 0; + graph.follow_edges(next, false, [&](const vg::handle_t& next_next) { + rr_count++; + if (next_next != handle || rr_count > 1) { + is_deletion_neighbour = false; + } + }); + } + }); } - } - if(!is_deletion_neighbour) { + }); + + graph.follow_edges(handle, false, [&](const vg::handle_t& next) { + if (graph.get_degree(next, true) != 1) { + graph.follow_edges(next, true, [&](const vg::handle_t& prev) { + if (prev != handle) { + size_t ll_count = 0; + graph.follow_edges(prev, true, [&](const vg::handle_t& prev_prev) { + ll_count++; + if (prev_prev != handle || ll_count > 1) { + is_deletion_neighbour = false; + } + }); + } + }); + } + }); + + if (!is_deletion_neighbour) { return false; - } } - } - return true; + return true; } -bool linear_haplo_structure::is_snv(int64_t node_id) const { - // has only one left and one right neighbour - int64_t lnbr; - int64_t rnbr; - - vector l_edges = xg_index.edges_on_start(node_id); - vector r_edges = xg_index.edges_on_end(node_id); - - if(l_edges.size() == 1 && r_edges.size() == 1) { - lnbr = l_edges[0].from(); - rnbr = r_edges[0].to(); - } else { - // has too many or too few neighbours to be an SNV - return false; - } - - vector lnbr_edges = xg_index.edges_on_end(lnbr); - vector rnbr_edges = xg_index.edges_on_start(rnbr); - if(lnbr_edges.size() != rnbr_edges.size()) { - // neigbours must have a 
node not in common - return false; - } - - vector r_of_lnbr(lnbr_edges.size()); - vector l_of_rnbr(rnbr_edges.size()); - for(size_t i = 0; i < lnbr_edges.size(); i++) { - r_of_lnbr[i] = lnbr_edges[i].to(); - } - for(size_t i = 0; i < rnbr_edges.size(); i++) { - l_of_rnbr[i] = rnbr_edges[i].from(); - } - for(size_t i = 0; i < r_of_lnbr.size(); i++) { - if(xg_index.node_length(r_of_lnbr[i]) != 1) { - if(r_of_lnbr[i] != rnbr) { +bool linear_haplo_structure::is_snv(int64_t node_id) const { + // has only one left and one right neighbour + vg::handle_t lnbr, rnbr; + size_t lnbr_count = 0, rnbr_count = 0; + + vg::handle_t handle = graph.get_handle(node_id); + graph.follow_edges(handle, true, [&](const vg::handle_t& prev) { + lnbr = prev; + ++lnbr_count; + }); + graph.follow_edges(handle, false, [&](const vg::handle_t& next) { + rnbr = next; + ++rnbr_count; + }); + + if (lnbr_count != 1 || rnbr_count != 1) { + // has too many or too few neighbours to be an SNV return false; - } } - } - // given guarantee that edge_sets contain no duplicates, check that there is an injective map - // from r_of_lnbr to l_of_rnbr. If so, must be bijection as are finite sets of same size - for(size_t i = 0; i < r_of_lnbr.size(); i++) { - if(r_of_lnbr[i] == rnbr) { - r_of_lnbr[i] = -1; - } else { - for(size_t j = 0; j < l_of_rnbr.size(); j++) { - if(r_of_lnbr[i] == l_of_rnbr[j]) { - r_of_lnbr[i] = -1; - } - } + unordered_set from_lnbr; + bool all_snv = graph.follow_edges(lnbr, false, [&](const vg::handle_t& next) { + from_lnbr.insert(next); + return graph.get_length(next) == 1; + }); + + if (!all_snv) { + // some alleles are not SNVs + return false; } - } - for(size_t i = 0; i < r_of_lnbr.size(); i++) { - if(r_of_lnbr[i] != -1) { - return false; + + size_t from_rnbr_count = 0; + bool all_match = graph.follow_edges(rnbr, true, [&](const vg::handle_t& prev) { + ++from_rnbr_count; + return (bool) from_lnbr.count(prev); + }); + + if (!all_match || from_rnbr_count != from_lnbr.size()) { + // we didn't find all of the neighbors in common + return false; } - } - return true; + + return true; } inputHaplotype* linear_haplo_structure::path_to_input_haplotype(const vg::Path& path) const { @@ -772,7 +639,10 @@ inputHaplotype* linear_haplo_structure::path_to_input_haplotype(const vg::Path& return to_return; } -linear_haplo_structure::linear_haplo_structure(istream& slls_index, double log_mut_penalty, double log_recomb_penalty, xg::XG& xg_index, size_t xg_ref_rank) : xg_index(xg_index), xg_ref_rank(xg_ref_rank) { +linear_haplo_structure::linear_haplo_structure(istream& slls_index, double log_mut_penalty, + double log_recomb_penalty, + const vg::PathPositionHandleGraph& graph, + vg::path_handle_t ref_path_handle) : graph(graph), ref_path_handle(ref_path_handle) { if (log_mut_penalty > 0) { throw runtime_error("log mutation penalty must be negative"); @@ -782,15 +652,13 @@ linear_haplo_structure::linear_haplo_structure(istream& slls_index, double log_m throw runtime_error("log recombination penalty must be negative"); } - if(xg_ref_rank > xg_index.max_path_rank()) { - throw runtime_error("reference path rank out of bounds"); - } index = new siteIndex(slls_index); cohort = new haplotypeCohort(slls_index, index); penalties = new penaltySet(log_recomb_penalty, log_mut_penalty, cohort->get_n_haplotypes()); } linear_haplo_structure::~linear_haplo_structure() { + delete index; delete cohort; delete penalties; } @@ -810,15 +678,25 @@ haplo_score_type linear_haplo_structure::score(const vg::Path& path) const { } 
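The rewritten helpers above (`get_ref_following`, `is_solitary_ref`, `is_snv`) all lean on the libhandlegraph convention that `follow_edges` and `for_each_step_on_handle` stop as soon as the iteratee returns `false`, and themselves return `false` in that case. A minimal, self-contained sketch of that early-exit idiom (a hypothetical helper written against the public `HandleGraph` API, not code from this patch):

```
#include <handlegraph/handle_graph.hpp>

// Return true if the node has exactly one left-side neighbour.
// The traversal is abandoned as soon as a second neighbour is seen.
bool has_single_left_neighbour(const handlegraph::HandleGraph& graph,
                               handlegraph::nid_t node_id) {
    size_t count = 0;
    bool completed = graph.follow_edges(graph.get_handle(node_id), true,
                                        [&](const handlegraph::handle_t&) {
        ++count;
        return count <= 1;  // returning false stops the traversal early
    });
    return completed && count == 1;
}
```

This is the same pattern used above to check, for example, that a candidate SNV node has exactly one neighbour on each side.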
/******************************************************************************* -XGScoreProvider +ScoreProvider *******************************************************************************/ -XGScoreProvider::XGScoreProvider(xg::XG& index) : index(index) { - // Nothing to do! +int64_t ScoreProvider::get_haplotype_count() const { + // By default, say that we don't know the haplotype count. + return -1; } -pair XGScoreProvider::score(const vg::Path& path, haploMath::RRMemo& memo) { - return haplo_DP::score(path, index, memo); +bool ScoreProvider::has_incremental_search() const { + // By default, say that we lack incremental search support. + return false; +} + +IncrementalSearchState ScoreProvider::incremental_find(const vg::Position& pos) const { + throw runtime_error("Incremental search not implemented"); +} + +IncrementalSearchState ScoreProvider::incremental_extend(const IncrementalSearchState& state, const vg::Position& pos) const { + throw runtime_error("Incremental search not implemented"); } /******************************************************************************* @@ -841,21 +719,6 @@ pair LinearScoreProvider::score(const vg::Path& path, haploMath::R return scored; } -/******************************************************************************* -path conversion -*******************************************************************************/ - -thread_t path_to_thread_t(const vg::Path& path) { - thread_t t; - for(size_t i = 0; i < path.mapping_size(); i++) { - vg::Mapping mapping = path.mapping(i); - auto pos = mapping.position(); - xg::XG::ThreadMapping m = {pos.node_id(), pos.is_reverse()}; - t.push_back(m); - } - return t; -} - /******************************************************************************* math functions *******************************************************************************/ @@ -873,7 +736,9 @@ RRMemo::RRMemo(double recombination_penalty, size_t population_size) : // log versions logT_base = log1p(-exp_rho); - for(int i = 0; i < population_size; i++) { + // Populate the tabel out to twice the haplotype count. + // In regions between unphased variants, we can have twice as many hits as real haplotypes in the index. + for(int i = 0; i < population_size * 2; i++) { logS_bases.push_back(log1p(i*exp_rho)); } } @@ -954,7 +819,14 @@ double RRMemo::logT(int width) { } double RRMemo::logS(int height, int width) { - return (width-1)*logS_bases[height-1]; //logS_base = log(1 + i*exp_rho) + if (height <= logS_bases.size()) { + // Fulfil from lookup table + return (width-1)*logS_bases[height-1]; //logS_base = log(1 + i*exp_rho) + } else { + // We must have a cycle or something; we have *way* more hits than haplotypes. + // Uncommon; just recompute the logS base as we do in the constructor. + return (width-1)*log1p((height-1)*exp_rho); + } } double RRMemo::logRRDiff(int height, int width) { diff --git a/src/haplotypes.hpp b/src/haplotypes.hpp index 17e6559f662..827723a450e 100644 --- a/src/haplotypes.hpp +++ b/src/haplotypes.hpp @@ -4,9 +4,10 @@ #include #include #include +#include -#include "vg.pb.h" -#include "xg.hpp" +#include +#include "handle.hpp" #include #include @@ -23,12 +24,14 @@ using namespace std; //////////////////////////////////////////////////////////////////////////////// // // A. Construct the following shared objects -// 1. an index, either a -// i. xg::XG index -// ii. gbwt::GBWT -// iii. gbwt::DynamicGBWT +// 1. An index, either a +// i. gbwt::GBWT +// ii. gbwt::DynamicGBWT // 2. 
An appropriate ScoreProvider implementation that will use the index. -// 3. a memo for shared values used in calculations; a +// It is also responsible for determining the population size from its index, if able. +// It can also implement incremental haplotype search, because we need that +// functionality in places where the haplotype index is abstracted as a ScoreProvider. +// 3. A memo for shared values used in calculations; a // haplo::haploMath::RRMemo, which takes the parameters // i. double -log(recombination probability) // ii. size_t population size @@ -49,11 +52,9 @@ using namespace std; // which takes in inputs // i. const vg::Path& Path // ii. indexType& index where indexType is one of -// a. xg::XG -// b. gbwt::GBWT -// c. gbwt::DynamicGBWT +// a. gbwt::GBWT +// b. gbwt::DynamicGBWT // iii. haploMath::RRMemo - // //////////////////////////////////////////////////////////////////////////////// @@ -61,8 +62,6 @@ namespace haplo { // If this global is set, warn the user when scoring fails extern bool warn_on_score_fail; - -using thread_t = vector; namespace haploMath{ double logsum(double a, double b); @@ -120,55 +119,6 @@ struct int_itvl_t{ // ----------------------------------------------------------------------------- -struct haplo_DP_edge_memo { -private: - vector in; - vector out; -public: - haplo_DP_edge_memo(); // for constructing null edge_memos - haplo_DP_edge_memo(xg::XG& graph, - xg::XG::ThreadMapping last_node, - xg::XG::ThreadMapping node); - const vector& edges_in() const; - const vector& edges_out() const; - bool is_null() const; - static bool has_edge(xg::XG& graph, xg::XG::ThreadMapping old_node, xg::XG::ThreadMapping new_node); -}; - -// ----------------------------------------------------------------------------- - -class hDP_graph_accessor { -public: - const xg::XG::ThreadMapping old_node; - const xg::XG::ThreadMapping new_node; - const haplo_DP_edge_memo edges; - const xg::XG& graph; - haploMath::RRMemo& memo; - - // accessor for noninitial nodes in a haplotype - hDP_graph_accessor(xg::XG& graph, - xg::XG::ThreadMapping old_node, - xg::XG::ThreadMapping new_node, - haploMath::RRMemo& memo); - // accessor for initial node in a haplotype - // old_node and edge-vectors are null; do not use to extend nonempty states - hDP_graph_accessor(xg::XG& graph, - xg::XG::ThreadMapping new_node, - haploMath::RRMemo& memo); - - int64_t new_side() const; - int64_t new_height() const; - int64_t old_height() const; - int64_t new_length() const; - - bool has_edge() const; - bool inclusive_interval() const { return false; } - - void print(ostream& output_stream) const; -}; - -// ----------------------------------------------------------------------------- - struct gbwt_thread_t { private: gbwt::vector_type nodes; @@ -235,7 +185,6 @@ struct haplo_DP_rectangle{ haplo_DP_rectangle(bool inclusive_interval); double R; void set_offset(int offset); - void extend(hDP_graph_accessor& ga); template void false_extend(accessorType& ga, int_itvl_t delta); template @@ -281,8 +230,6 @@ struct haplo_DP_column { bool is_empty() const; }; -thread_t path_to_thread_t(const vg::Path& path); - //------------------------------------------------------------------------------ // Outward-facing //------------------------------------------------------------------------------ @@ -299,7 +246,6 @@ struct haplo_DP { public: //------------------------------------------------------------------------------ // API functions - static haplo_score_type score(const vg::Path& path, xg::XG& graph, haploMath::RRMemo& 
memo); template static haplo_score_type score(const vg::Path& path, GBWTType& graph, haploMath::RRMemo& memo); //------------------------------------------------------------------------------ @@ -308,7 +254,6 @@ struct haplo_DP { template haplo_DP(accessorType& ga); haplo_DP_column* get_current_column(); - static haplo_score_type score(const thread_t& thread, xg::XG& graph, haploMath::RRMemo& memo); template static haplo_score_type score(const gbwt_thread_t& thread, GBWTType& graph, haploMath::RRMemo& memo); }; @@ -321,8 +266,8 @@ struct linear_haplo_structure{ siteIndex* index = nullptr; haplotypeCohort* cohort = nullptr; penaltySet* penalties = nullptr; - xg::XG& xg_index; - size_t xg_ref_rank; + const vg::PathPositionHandleGraph& graph; + vg::path_handle_t ref_path_handle; public: typedef enum nodeType{ ref_span, @@ -347,9 +292,10 @@ struct linear_haplo_structure{ size_t size() const; }; - /// Make a new linear_haplo_structure with the given indexes, mutation and recombination scoring parameters, and reference path in the XG. + /// Make a new linear_haplo_structure with the given indexes, mutation and recombination scoring parameters, and reference path in the graph. /// Penalties *must* be negative, and ought to be something like -9*2.3 mutation and -6*2.3 recombination. - linear_haplo_structure(istream& slls_index, double log_mut_penalty, double log_recomb_penalty, xg::XG& xg_index, size_t xg_ref_rank); + linear_haplo_structure(istream& slls_index, double log_mut_penalty, double log_recomb_penalty, + const vg::PathPositionHandleGraph& graph, vg::path_handle_t ref_path_handle); ~linear_haplo_structure(); haplo_score_type score(const vg::Path& path) const; @@ -374,30 +320,60 @@ struct linear_haplo_structure{ }; +/// Incremental haplotype search range type used for ScoreProvider's +/// incremental search API. Default constructed, represents an empty or +/// un-started search. Supports an empty() and a length(). Copyable. +/// TODO: There's some overlap with the graph accessors here, but I don't +/// understand them enough to work out exactly what it is and eliminate it. +/// TODO: This should become a real type (base class and implementations +/// wrapping implementation-specific data) when we get any other +/// implementations. +using IncrementalSearchState = gbwt::SearchState; + /// Interface abstracting over the various ways of generating haplotype scores. -/// You probably want the implementations: XGScoreProvider, GBWTScoreProvider, LinearScoreProvider +/// You probably want the implementations: GBWTScoreProvider, LinearScoreProvider /// TODO: Const-ify the indexes used class ScoreProvider { public: + /// Score the given path usign the given memo virtual pair score(const vg::Path&, haploMath::RRMemo& memo) = 0; + /// Return the haplotype count (number of expected haplotypes that agree with + /// a path that is fixed in the population) that should be used to construct + /// the memo to pass to score, or -1 if the indexes backing the ScoreProvider + /// do not make this information available. + virtual int64_t get_haplotype_count() const; + + // We have optional support for incremental search. TODO: We need a search + // state abstraction that encompasses GBWT search state and other search + // state implementations. + + /// Return true if this ScoreProvider supports incremental search for + /// counting haplotypes. 
+ virtual bool has_incremental_search() const; + + /// Start a new search state with the node visit described by the given + /// Position, if incremental search is supported. + virtual IncrementalSearchState incremental_find(const vg::Position& pos) const; + + /// Extend the given search state with the node visit described by the given + /// Position, if incremental search is supported. + virtual IncrementalSearchState incremental_extend(const IncrementalSearchState& state, const vg::Position& pos) const; + virtual ~ScoreProvider() = default; }; -/// Score haplotypes using the gPBWT haplotype data stored in an XG index -class XGScoreProvider : public ScoreProvider { -public: - XGScoreProvider(xg::XG& index); - pair score(const vg::Path&, haploMath::RRMemo& memo); -private: - xg::XG& index; -}; - /// Score haplotypes using a GBWT haplotype database (normal or dynamic) template class GBWTScoreProvider : public ScoreProvider { public: GBWTScoreProvider(GBWTType& index); pair score(const vg::Path&, haploMath::RRMemo& memo); + + int64_t get_haplotype_count() const; + + bool has_incremental_search() const; + IncrementalSearchState incremental_find(const vg::Position& pos) const; + IncrementalSearchState incremental_extend(const IncrementalSearchState& state, const vg::Position& pos) const; private: GBWTType& index; }; @@ -498,6 +474,7 @@ void haplo_DP_rectangle::false_extend(accessorType& ga, template haplo_DP_column::haplo_DP_column(accessorType& ga) { haplo_DP_rectangle* first_rectangle = new haplo_DP_rectangle(ga.inclusive_interval()); + assert(first_rectangle != nullptr); entries.push_back(shared_ptr(first_rectangle)); first_rectangle->extend(ga); update_inner_values(); @@ -509,6 +486,7 @@ void haplo_DP_column::standard_extend(accessorType& ga) { previous_values = get_scores(); previous_sizes = get_sizes(); haplo_DP_rectangle* new_rectangle = new haplo_DP_rectangle(ga.inclusive_interval()); + assert(new_rectangle != nullptr); new_rectangle->extend(ga); decltype(entries) new_entries; new_entries.push_back(shared_ptr(new_rectangle)); @@ -548,7 +526,7 @@ void haplo_DP_column::extend(accessorType& ga) { //------------------------------------------------------------------------------ template -haplo_DP::haplo_DP(accessorType& ga) : DP_column(haplo_DP_column(ga)) { +haplo_DP::haplo_DP(accessorType& ga) : DP_column(ga) { } @@ -559,6 +537,13 @@ haplo_score_type haplo_DP::score(const vg::Path& path, GBWTType& graph, haploMat template haplo_score_type haplo_DP::score(const gbwt_thread_t& thread, GBWTType& graph, haploMath::RRMemo& memo) { + if (thread.size() == 0) { + if (warn_on_score_fail) { + cerr << "[WARNING] Path is empty and cannot be scored" << endl; + cerr << "Cannot compute a meaningful haplotype likelihood score" << endl; + } + return pair(nan(""), false); + } if (!graph.contains(thread[0])) { // We start on a node that has no haplotype index entry if (warn_on_score_fail) { @@ -606,7 +591,7 @@ haplo_score_type haplo_DP::score(const gbwt_thread_t& thread, GBWTType& graph, h hdp.DP_column.extend(ga); } #ifdef debug - cerr << "After entry " << i << " (" << gbwt::Node::id(thread[i]) << ") height: " << ga.new_height() << " intervals: "; + cerr << "After entry " << i << "/" << thread.size() << " (" << gbwt::Node::id(thread[i]) << ") height: " << ga.new_height() << " intervals: "; for (auto& interval : hdp.DP_column.get_sizes()) { cerr << interval << " "; } @@ -616,7 +601,6 @@ haplo_score_type haplo_DP::score(const gbwt_thread_t& thread, GBWTType& graph, h return pair(hdp.DP_column.current_sum(), 
true); } - //------------------------------------------------------------------------------ template @@ -629,6 +613,33 @@ pair GBWTScoreProvider::score(const vg::Path& path, hapl return haplo_DP::score(path, index, memo); } +template +int64_t GBWTScoreProvider::get_haplotype_count() const { + if (!index.hasMetadata()) { + // No metadata available + return -1; + } + + // TODO: Does this haplotype count have the same expected-count-for-fixed-path semantics that we want? + // Or does it count fragments of haplotypes? + return index.metadata.haplotypes(); +} + +template +bool GBWTScoreProvider::has_incremental_search() const { + // We are going to implement incremental search. + return true; +} + +template +IncrementalSearchState GBWTScoreProvider::incremental_find(const vg::Position& pos) const { + return index.find(gbwt::Node::encode(pos.node_id(), pos.is_reverse())); +} + +template +IncrementalSearchState GBWTScoreProvider::incremental_extend(const IncrementalSearchState& state, const vg::Position& pos) const { + return index.extend(state, gbwt::Node::encode(pos.node_id(), pos.is_reverse())); +} } // namespace haplo diff --git a/src/hash_map.hpp b/src/hash_map.hpp index ca83c4ca0f5..f4299c9983e 100644 --- a/src/hash_map.hpp +++ b/src/hash_map.hpp @@ -5,6 +5,8 @@ #include #include +#include "wang_hash.hpp" + // Comment out to use sparse_hash_map and sparse_hash_set instead of // dense_hash_map and dense_hash_set. //#define USE_DENSE_HASH @@ -87,21 +89,6 @@ struct hash> namespace vg { - -// Thomas Wang's integer hash function. In many implementations, std::hash -// is identity function for integers, which leads to performance issues. - -inline size_t wang_hash_64(size_t key) { - key = (~key) + (key << 21); // key = (key << 21) - key - 1; - key = key ^ (key >> 24); - key = (key + (key << 3)) + (key << 8); // key * 265 - key = key ^ (key >> 14); - key = (key + (key << 2)) + (key << 4); // key * 21 - key = key ^ (key >> 28); - key = key + (key << 31); - return key; -} - // We need this second type for enable_if-based specialization template struct wang_hash; diff --git a/src/hfile_cppstream.cpp b/src/hfile_cppstream.cpp deleted file mode 100644 index 2191158a75e..00000000000 --- a/src/hfile_cppstream.cpp +++ /dev/null @@ -1,292 +0,0 @@ -#include "hfile_cppstream.hpp" - -// We need the hFILE* internals available. -#include - -#include - -#include - -namespace vg { - -namespace stream { - -using namespace std; - -/// Define a c-style-inheritance derived struct that holds the hFILE and the -/// stream pointers. Either stream pointer may be null (if we are in the other -/// mode), or both can be non-null and point to the same iostream object. -typedef struct { - hFILE base; - istream* input; - ostream* output; -} hFILE_cppstream; - - -// Define read, write, seek (which also can tell), flush, and close functions - -/// Read data. Return bytes read, or a negative value on error. Set errno on error. 
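For orientation, the incremental search methods that `GBWTScoreProvider` gains above are thin wrappers around the GBWT's own `find`/`extend`. A rough usage sketch (assumptions: a `gbwt::GBWT` called `index` and an oriented node walk already encoded with `gbwt::Node::encode`; illustrative only, not code from this patch):

```
#include <gbwt/gbwt.h>
#include <vector>

// Count haplotypes in the index that contain the given oriented node walk.
// The search state only shrinks, so we can stop once it is empty.
size_t count_consistent_haplotypes(const gbwt::GBWT& index,
                                   const std::vector<gbwt::node_type>& walk) {
    if (walk.empty()) {
        return 0;
    }
    gbwt::SearchState state = index.find(walk.front());
    for (size_t i = 1; i < walk.size() && !state.empty(); i++) {
        state = index.extend(state, walk[i]);
    }
    return state.size();
}
```

This is the kind of query that callers holding only a `ScoreProvider*` can now perform through `incremental_find` and `incremental_extend` when `has_incremental_search()` reports support.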
-static ssize_t cppstream_read(hFILE *fpv, void *buffer, size_t nbytes) { -#ifdef debug - cerr << "cppstream_read(" << fpv << ", " << buffer << ", " << nbytes << ")" << endl; -#endif - - // Cast the hFILE to the derived class - hFILE_cppstream* fp = (hFILE_cppstream*) fpv; - - if (fp->input == nullptr) { - // No input stream - errno = EBADF; - return -1; - } - - // Read the data and record how much we got - fp->input->clear(); - fp->input->read((char*) buffer, nbytes); - ssize_t found = fp->input->gcount(); - - if (!fp->input->good() && !fp->input->eof()) { - // An error happened - errno = EIO; - return -1; - } - -#ifdef debug - cerr << "\tFound " << found << "/" << nbytes << " bytes up to " << fp->input->tellg() << endl; -#endif - - // Otherwise the read worked - return found; -} - -/// Write data. Return the number of bytes actually written. Return a negative -/// value and set errno on error. -static ssize_t cppstream_write(hFILE *fpv, const void *buffer, size_t nbytes) { - // Cast the hFILE to the derived class - hFILE_cppstream* fp = (hFILE_cppstream*) fpv; - - if (fp->output == nullptr) { - // No output stream - errno = EBADF; - return -1; - } - - // Write the data and record how much we put - fp->output->clear(); - fp->output->write((char*) buffer, nbytes); - - if (!fp->output->good()) { - // An error happened - errno = EIO; - return -1; - } - - // Otherwise the write worked, and we wrote all the bytes - return nbytes; -} - -/// Seek relative to SEEK_SET (beginning), SEEK_CUR, or SEEK_END. Return the -/// resulting offset from the beginning of the file. -/// Returns a negative value on error. -static off_t cppstream_seek(hFILE *fpv, off_t offset, int whence) { - -#ifdef debug - cerr << "cppstream_seek(" << fpv << ", " << offset << ", " << whence << ")" << endl; -#endif - - // Cast the hFILE to the derived class - hFILE_cppstream* fp = (hFILE_cppstream*) fpv; - - // How are we seeking? - ios_base::seekdir way; - switch (whence) { - case SEEK_SET: - way = ios_base::beg; - break; - case SEEK_CUR: - way = ios_base::cur; - break; - case SEEK_END: - way = ios_base::end; - break; - default: - errno = EINVAL; - return -1; - } - - off_t arrived_at = 0; - - if (fp->input != nullptr) { - // Seek the input stream - fp->input->clear(); - fp->input->seekg(offset, way); - if (!fp->input->good()) { - // Seek failed. - // Assume it is because this is a pipe. - errno = ESPIPE; - return -1; - } - - auto reached = fp->input->tellg(); - if (reached == -1) { - // Definitely a pipe - errno = ESPIPE; - return -1; - } - - arrived_at = reached; - } - - if (fp->output != nullptr) { - // Seek the output stream - fp->output->clear(); - fp->output->seekp(offset, way); - if (!fp->output->good()) { - // Seek failed. - // Assume it is because this is a pipe. - errno = ESPIPE; - return -1; - } - - auto reached = fp->output->tellp(); - if (reached == -1) { - // Definitely a pipe - errno = ESPIPE; - return -1; - } - - if (fp->input != nullptr && reached != arrived_at) { - // We have two streams and they are out of sync! - errno = EIO; - return -1; - } - - arrived_at = reached; - } - -#ifdef debug - cerr << "\t" << arrived_at << endl; -#endif - - // We worked! - return arrived_at; -} - -/// Flush the output stream, if we are doing output. Return 0 for success, or a -/// negative number and set errno on error. 
-static int cppstream_flush(hFILE *fpv) { - // Cast the hFILE to the derived class - hFILE_cppstream* fp = (hFILE_cppstream*) fpv; - - if (fp->output != nullptr) { - // We have an output stream to flush - fp->output->clear(); - fp->output->flush(); - - if (!fp->output->good()) { - // Flushing did not work - errno = EIO; - return -1; - } - } - - return 0; -} - -/// Close the file. Return 0 on success, or a negative number and set errno on -/// failure. -static int cppstream_close(hFILE *fpv) { - // This is tricky because we don't own the stream. We also can't close generic istreams and ostreams. - - // Cast the hFILE to the derived class - hFILE_cppstream* fp = (hFILE_cppstream*) fpv; - - // Just null out the stream fields. They will be closed when destroyed, and we don't own them. - fp->input = nullptr; - fp->output = nullptr; - - return 0; - -} - -/// Define an hFILE backend for cpp streams -static const struct hFILE_backend cppstream_backend = { - cppstream_read, - cppstream_write, - cppstream_seek, - cppstream_flush, - cppstream_close -}; - -hFILE* hfile_wrap(std::istream& input) { - /// Make the base struct, making sure it knows how big we are - hFILE_cppstream* fp = (hFILE_cppstream*) hfile_init(sizeof(hFILE_cppstream), "r", 0); - - if (fp == nullptr) { - // Couldn't allocate the file for some reason? - return nullptr; - } - - // Do our initialization - fp->input = &input; - fp->output = nullptr; - - // Set the backend - fp->base.backend = &cppstream_backend; - - // Tell the file that it is starting at the offset that the stream is at - input.clear(); - auto start_pos = input.tellg(); - if (start_pos < 0 || !input.good()) { - // The offset can't be determined, because this isn't a seekable stream. - // Use a 0 offset. - start_pos = 0; - - // TODO: There's no real way to prevent the hfile from seeking - // internally in its buffer. - } - fp->base.offset = start_pos; - - // Return the base hFILE* - return &fp->base; -} - -hFILE* hfile_wrap(std::ostream& output) { - /// Make the base struct, making sure it knows how big we are - hFILE_cppstream* fp = (hFILE_cppstream*) hfile_init(sizeof(hFILE_cppstream), "w", 0); - - if (fp == nullptr) { - // Couldn't allocate the file for some reason? - return nullptr; - } - - // Do our initialization - fp->input = nullptr; - fp->output = &output; - - // Set the backend - fp->base.backend = &cppstream_backend; - - // Tell the file that it is starting at the offset that the stream is at - output.clear(); - auto start_pos = output.tellp(); - if (start_pos < 0 || !output.good()) { - // The offset can't be determined, because this isn't a seekable stream. - // Use a 0 offset. - start_pos = 0; - - // TODO: There's no real way to prevent the hfile from seeking - // internally in its buffer. - - } - fp->base.offset = start_pos; - - - - // Return the base hFILE* - return &fp->base; -} - -} - -} diff --git a/src/hfile_cppstream.hpp b/src/hfile_cppstream.hpp deleted file mode 100644 index 4d6e33aae4a..00000000000 --- a/src/hfile_cppstream.hpp +++ /dev/null @@ -1,31 +0,0 @@ -/// \file hfile_cppstream.hpp -/// hFILE* C++ streams wrapper -/// Modeled on https://github.com/samtools/htslib-plugins/blob/master/hfile_mmap.c - -// We need to provide a C++ stream plugin for htslib hFILE* files so we can -// connect Protobuf Zero Copy Streams to C++ streams while filtering through -// BGZF file handles. 
- -#ifndef VG_HFILE_CPPSTREAM_HPP_INCLUDED -#define VG_HFILE_CPPSTREAM_HPP_INCLUDED - -#include - -#include -#include - -namespace vg { - -namespace stream { - -/// Wrap a C++ output stream as an hFILE* that can be written by BGZF -hFILE* hfile_wrap(std::ostream& output); - -/// Wrap a C++ input stream as an hFILE* that can be read by BGZF -hFILE* hfile_wrap(std::istream& input); - -} - -} - -#endif // VG_HFILE_CPPSTREAM_HPP_INCLUDED diff --git a/src/homogenizer.cpp b/src/homogenizer.cpp deleted file mode 100644 index b78e7ca011d..00000000000 --- a/src/homogenizer.cpp +++ /dev/null @@ -1,196 +0,0 @@ -#include "homogenizer.hpp" - -using namespace std; -using namespace vg; - -void Homogenizer::homogenize(vg::VG* o_graph, xg::XG* xindex, gcsa::GCSA* gcsa_index, gcsa::LCPArray* lcp_index, vg::Index reads_index){ - /** - * Pattern for SV homogenization - * 1. Locate SV-indicating reads with Sift. Save them in a gam file - * 2. index that gam with index -d dbname -N - * 3. Send those reads here - for each possible position - * Find reads supporting that position - * Generate candidate edges and nodes - * Remap reads locally (w/in some subgraph containing the SV) - * Score it somehow - * Check the reads again for SV signatures. - */ - - } - - -void Homogenizer::homogenize(vg::VG* o_graph, xg::XG* xindex, gcsa::GCSA* gcsa_index, gcsa::LCPArray* lcp_index, Paths cached_paths, int kmer_size){ - - bool in_mem_path_only = true; - - vector tips = find_non_ref_tips(o_graph); - - /* TODO filter by whether a read is on the ref path - // 2. Cache the reference path(s) - // (Not sure how to grab these, so for now just grab the first path in the graph) - map > cached_paths; - set kept_paths; - */ - string ref_path = o_graph->paths.all_path_names()[0]; - - //o_graph->unchop(); - /* - Paths p = o_graph->paths; - // - // 3. Remove all paths in the graph, except the reference - (o_graph->paths).keep_paths(kept_paths); - */ - - /* Generate edges/nodes to add to graph */ - //vector find_smems(const string& seq); - Mapper* mapper; - mapper= new Mapper(xindex, gcsa_index, lcp_index); - - map > matches; - map ref_node_to_clip; - vector seqs; - for (int i = 0; i < tips.size(); i++){ - Node* n = o_graph->get_node(tips[i]); - if (n->sequence().length() < 4 || (o_graph->paths).has_node_mapping(n->id())){ - continue; - } - vector m = mapper->find_mems_simple(n->sequence().begin(), - n->sequence().end(), - 200, - mapper->min_mem_length); - // Why >1? Because we need to match the node AND somewhere else in the graph. - if (m.size() > 1){ - cerr << "POTENTIAL NEW EDGE" << endl; - matches[tips[i]] = m; - // map>> node_mapping; - // Get paths of tip - set paths_of_tip = cached_paths.of_node(tips[i]); - // Find the closest reference node to the tip - vg::id_t ref_node = -1; - - for (auto m : paths_of_tip){ - cerr << m << endl; - Path path_of_tip = cached_paths.path(m); - if (m == ref_path){ - continue; - } - else{ - bool on_ref = false; - for (int j = 0; j < path_of_tip.mapping_size(); j++){ - Mapping nearby_mapping = path_of_tip.mapping(j); - Position pos = nearby_mapping.position(); - vg::id_t n_id = pos.node_id(); - if (o_graph->paths.has_node_mapping(n_id)){ - ref_node = n_id; - on_ref = true; - } - else if (on_ref == false && n_id != tips[i]){ - ref_node = n_id; - } - else{ - continue; - } - - } - } - } - if (ref_node != -1){ - ref_node_to_clip[ref_node] = n->sequence(); - } - } - } - - cerr << ref_node_to_clip.size() << " candidate edges generated. Modifying graph" << endl; - - //need to remove the tips sequences first. 
- //cut_tips(tips, o_graph); - - vector new_p_vec; - for (auto x : ref_node_to_clip){ - Alignment clip_aln; - cerr << "Length of softclip: " << x.second.size() << endl; - clip_aln = mapper->align(x.second); - if (clip_aln.score() < 30){ - continue; - } - cerr << clip_aln.DebugString(); - Path new_aln_p = clip_aln.path(); - //new_p_vec.clear(); - new_p_vec.push_back(new_aln_p); - //for (int i = 0; i < new_aln_p.mapping_size(); i++){ - //vector tras = o_graph->edit(new_p_vec); - //translator.load(tras); - //o_graph->paths.rebuild_mapping_aux(); - // Edge * e = o_graph->create_edge(x.first, new_aln_p.mapping(i).position().node_id(), false, false); - // o_graph->add_edge(*e); - // cerr << "Edge made from " << x.first << " to " << new_aln_p.mapping(i).position().node_id() << endl; - //} - - /** Reindex graph and reset mapper **/ - //delete xindex; - //xindex = new xg::XG(o_graph->graph); - //delete gcsa_index; - //delete lcp_index; - //o_graph->build_gcsa_lcp(gcsa_index, lcp_index, kmer_size, in_mem_path_only, false, 2); - //delete mapper; - //mapper = new Mapper(xindex, gcsa_index, lcp_index); - - - } - - //vector after_tips = find_tips(o_graph); - //cut_tips(after_tips, o_graph); - - - - //o_graph->unchop(); - - // Remap the paths (reads) that pass through our current set of tips, and - // see if the overall score of the graph improves. - // - //map>> node_mapping; - // - -} - -void Homogenizer::cut_tips(vector tip_ids, vg::VG* graph){ - for (auto i : tip_ids){ - graph->destroy_node(i); - } - graph->remove_orphan_edges(); -} - -void Homogenizer::cut_tips(vg::VG* graph){ - vector tips = find_tips(graph); - cut_tips(tips, graph); -} - -void Homogenizer::cut_nonref_tips(vg::VG* graph){ - -} - -vector Homogenizer::find_non_ref_tips(vg::VG* graph){ - vector ret; - std::function is_tip = [graph, &ret](Node* n){ - if ((graph->start_degree(n) == 0 | graph->end_degree(n) == 0) && - !(graph->paths.has_node_mapping(n))){ - #pragma omp critical - ret.push_back(n->id()); - } - }; - graph->for_each_node_parallel(is_tip); - return ret; - -} - -vector Homogenizer::find_tips(vg::VG* graph){ - vector ret; - std::function is_tip = [graph, &ret](Node* n){ - if ((graph->start_degree(n) == 0 | graph->end_degree(n) == 0)){ - #pragma omp critical - ret.push_back(n->id()); - } - }; - graph->for_each_node_parallel(is_tip); - return ret; -} diff --git a/src/homogenizer.hpp b/src/homogenizer.hpp deleted file mode 100644 index 586e9784588..00000000000 --- a/src/homogenizer.hpp +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef VG_HOMOGENIZER_HPP_INCLUDED -#define VG_HOMOGENIZER_HPP_INCLUDED -#include -#include -#include "vg.hpp" -#include "translator.hpp" -#include "filter.hpp" -#include "mapper.hpp" -#include "vg.pb.h" -#include "types.hpp" - - -using namespace vg; -using namespace std; - -namespace vg{ - class Homogenizer{ - public: - /** Locates tips in the graph - * and tries to generate a single - * edge / node to represent them. - * This edge is then added, the offending sequences - * are remapped, and the process is repeated until the - * graph becomes stable. 
- */ - void homogenize(vg::VG* graph, xg::XG* xindex, gcsa::GCSA* gcsa_index, gcsa::LCPArray* lcp_index, Paths p, int kmer_size); - void homogenize(vg::VG* graph, xg::XG* xindex, gcsa::GCSA* gcsa_index, gcsa::LCPArray* lcp_index, vg::Index reads_index); - private: - - Translator translator; - /** Find tips (nodes with an indegree/outdegree of 0 in the graph */ - vector find_tips(vg::VG* graph); - - /** Find non-ref tips */ - vector find_non_ref_tips(vg::VG* graph); - - /** remap a set of Alignments to the graph */ - int remap(vector reads, vg::VG graph); - /** Remove all tips from the graph. - * WARNING: may cut head/tail nodes.*/ - void cut_tips(vg::VG* graph); - /** Remove specific nodes and their edges from the graph */ - void cut_tips(vector tip_ids, vg::VG* graph); - /** Remove non-reference tips from the graph. */ - void cut_nonref_tips(vg::VG* graph); - - - }; -} -#endif diff --git a/src/hts_alignment_emitter.cpp b/src/hts_alignment_emitter.cpp new file mode 100644 index 00000000000..2df5a1fac51 --- /dev/null +++ b/src/hts_alignment_emitter.cpp @@ -0,0 +1,1064 @@ +/** + * \file alignment_emitter.cpp + * + * Implements a system for emitting alignments and groups of alignments in multiple formats. + */ + +#include "hts_alignment_emitter.hpp" +#include "surjecting_alignment_emitter.hpp" +#include "back_translating_alignment_emitter.hpp" +#include "alignment.hpp" +#include "vg/io/json2pb.h" +#include "algorithms/find_translation.hpp" +#include +#include + +#include + +//#define debug + +namespace vg { +using namespace std; + +unique_ptr get_alignment_emitter(const string& filename, const string& format, + const vector>& paths, size_t max_threads, + const HandleGraph* graph, int flags) { + + + unique_ptr emitter; + + if (format == "SAM" || format == "BAM" || format == "CRAM") { + // We are doing linear HTSLib output + + // Make sure we actually have a PathPositionalHandleGraph + const PathPositionHandleGraph* path_graph = dynamic_cast(graph); + if (path_graph == nullptr) { + cerr << "error[vg::get_alignment_emitter]: No graph available supporting path length queries needed for " << format << " output." << endl; + exit(1); + } + + // Build a path name and length list from the handles (this is for the sequence dictionary and reflects *base* paths + // as opposed to subpaths in the graph -- if there are no subpaths there is no distinction) + vector> path_names_and_lengths; + // Remember the actual path lengths (this is for coordinate transformations) + unordered_map subpath_to_length; + std::tie(path_names_and_lengths, subpath_to_length) = extract_path_metadata(paths, *path_graph, true); + + if (flags & ALIGNMENT_EMITTER_FLAG_HTS_SPLICED) { + // Use a splicing emitter as the final emitter + emitter = make_unique(filename, format, path_names_and_lengths, subpath_to_length, *path_graph, max_threads); + } else { + // Use a normal emitter + emitter = make_unique(filename, format, path_names_and_lengths, subpath_to_length, max_threads); + } + + if (!(flags & ALIGNMENT_EMITTER_FLAG_HTS_RAW)) { + // Need to surject + + // Make a set of the path handles to surject into + unordered_set target_paths; + for (const auto& path_info : paths) { + target_paths.insert(get<0>(path_info)); + } + // Interpose a surjecting AlignmentEmitter + emitter = make_unique(path_graph, target_paths, std::move(emitter), + flags & ALIGNMENT_EMITTER_FLAG_HTS_PRUNE_SUSPICIOUS_ANCHORS); + } + + } else { + // The non-HTSlib formats don't actually use the path name and length info. 
+ // See https://github.com/vgteam/libvgio/issues/34 + + const NamedNodeBackTranslation* translation = nullptr; + if (flags & ALIGNMENT_EMITTER_FLAG_VG_USE_SEGMENT_NAMES) { + // Need to translate from node IDs to segment names + + translation = vg::algorithms::find_translation(graph); + if (translation == nullptr) { + cerr << "error[vg::get_alignment_emitter]: No graph available supporting translation to named-segment space" << endl; + exit(1); + } + } + + // TODO: Push some logic here into libvgio? Or move this top function out of hts_alignment_emitter.cpp? + // TODO: Only GAF actually handles the translation in the emitter right now. + // TODO: Move BackTranslatingAlignmentEmitter to libvgio so they all can and we don't have to sniff format here. + emitter = get_non_hts_alignment_emitter(filename, format, {}, max_threads, graph, translation); + if (translation && format != "GAF") { + // Need to translate from node IDs to segment names beforehand. + // Interpose a translating AlignmentEmitter + emitter = make_unique(translation, std::move(emitter)); + } + } + + return emitter; +} + +/// Run the given iteratee for each path that is either the path with the given +/// name (if present), or a subrange of a path with the given name as the base +/// name (otherwise). +/// +/// If a path and subpaths both exist, only look at the full path. +/// +/// If the name describes a subpath, look only at that subpath. +/// +/// Iteratee returns false to stop. +/// +/// Returns true if we reached the end, and false if asked to stop. +static bool for_each_subpath_of(const PathPositionHandleGraph& graph, const string& path_name, const std::function& iteratee) { + if (graph.has_path(path_name)) { + // Just look at the full path. + return iteratee(graph.get_path_handle(path_name)); + } + + // Parse out the metadata of the thing we want subpaths of + PathSense sense; + string sample; + string locus; + size_t haplotype; + size_t phase_block; + subrange_t subrange; + PathMetadata::parse_path_name(path_name, + sense, + sample, + locus, + haplotype, + phase_block, + subrange); + + if (subrange != PathMetadata::NO_SUBRANGE) { + // The path name described a subpath, and we didn't find it. + // Don't call the iteratee. + return true; + } + + // Look at every subpath on it + return graph.for_each_path_matching({sense}, {sample}, {locus}, [&](const path_handle_t& match) { + // TODO: There's no way to search by haplotype and phase block, we have to scan + if (graph.get_haplotype(match) != haplotype) { + // Skip this haplotype + return true; + } + if (graph.get_phase_block(match) != phase_block) { + // Skip this phase block + return true; + } + // Don't need to check subrange, we know we don't have one and this candidate does. + return iteratee(match); + }); +} + +/// Returns the base path name for this path (i.e. the path's name without any subrange). +static string get_path_base_name(const PathPositionHandleGraph& graph, const path_handle_t& path) { + if (graph.get_subrange(path) == PathMetadata::NO_SUBRANGE) { + // This is a full path + return graph.get_path_name(path); + } else { + // This is a subpath, so remember what it's a subpath of, and use that. 
+ return PathMetadata::create_path_name(graph.get_sense(path), + graph.get_sample_name(path), + graph.get_locus_name(path), + graph.get_haplotype(path), + graph.get_phase_block(path), + PathMetadata::NO_SUBRANGE); + } +} + +pair>, unordered_map> extract_path_metadata( + const vector>& paths, const PathPositionHandleGraph& graph, + bool subpath_support) { + + // Build a path name and length list from the handles (this is for the sequence dictionary and reflects *base* paths + // as opposed to subpaths in the graph -- if there are no subpaths there is no distinction) + vector> path_names_and_lengths; + unordered_set base_path_set; + // Remember the actual path lengths (this is for coordinate transformations) + unordered_map subpath_to_length; + for (const auto& path_info : paths) { + auto& path = get<0>(path_info); + auto& own_length = get<1>(path_info); + auto& base_length = get<2>(path_info); + string base_path_name = subpath_support ? get_path_base_name(graph, path) : graph.get_path_name(path); + if (!base_path_set.count(base_path_name)) { + path_names_and_lengths.push_back(make_pair(base_path_name, base_length)); + base_path_set.insert(base_path_name); + } + subpath_to_length[graph.get_path_name(path)] = own_length; + } + + return make_pair(path_names_and_lengths, subpath_to_length); +} + +vector> get_sequence_dictionary(const string& filename, const vector& path_names, const PathPositionHandleGraph& graph) { + + // TODO: We assume we're using the one true default path metadata <-> name mapping + + // Subpath support: map to paths from their base name without a subrange. + // Includes full-length paths if they exost, and subrange-bearing paths otherwise. + unordered_map> base_path_to_subpaths; + + // Parse the input into this list. If length was unspecified (ie in regular text file with one column) then it will be -1 + // and filled in later + vector> input_names_lengths; + + // Should we print path subrange warnings? + bool print_subrange_warnings = true; + + // When we get a sequence and possibly its length (or -1 if no length is + // known), put it in the dictionary. + // Can optionally provide a file name for error reporting. + auto handle_sequence = [&](const std::string& sequence_name, int64_t length, const std::string* filename) { + if (graph.has_path(sequence_name)) { + // If the graph does have a path by this exact name, do a length check. + path_handle_t path = graph.get_path_handle(sequence_name); + size_t graph_path_length = graph.get_path_length(path); + if (length == -1) { + // We need to infer the length + length = graph_path_length; + } else if (graph_path_length != length) { + // Length was given but doesn't match + cerr << "error:[vg::get_sequence_dictionary] Graph contains a path " << sequence_name << " of length " << graph_path_length + << " but should have a length of " << length; + if (filename) { + // Report the source file. + cerr << " from sequence dictionary in " << *filename; + } + cerr << endl; + exit(1); + } + + if (print_subrange_warnings) { + subrange_t subrange; + std::string base_path_name = Paths::strip_subrange(sequence_name, &subrange); + if (subrange != PathMetadata::NO_SUBRANGE) { + // The user is asking explicitly to surject to a path that is a + // subrange of some other logical path, like + // GRCh38#0#chr1[1000-2000]. That's weird. Warn. + cerr << "warning:[vg::get_sequence_dictionary] Path " << sequence_name; + if (filename) { + // Report the source file. 
+ cerr << " from sequence dictionary in " << *filename; + } + cerr << " looks like part of a path. Output coordinates will be in " << base_path_name << " instead. Suppressing further warnings." << endl; + print_subrange_warnings = false; + } + } + + // Remember the path + base_path_to_subpaths[sequence_name].push_back(path); + } else { + // The graph doesn't have this exact path; does it have any subregions of a full path with this name? + // If so, remember and use those. + for_each_subpath_of(graph, sequence_name, [&](const path_handle_t& match) { + // We know this can't be an exact match, since we already checked for one. It must be a subrange. + // We found a subpath we're looking for. + base_path_to_subpaths[sequence_name].push_back(match); + // Keep looking for more. + return true; + }); + if (!base_path_to_subpaths.count(sequence_name)) { + // We didn't find any subpaths for this path as a base path either. + cerr << "error:[vg::get_sequence_dictionary] Graph does not have the entirety or any pieces of a path named " << sequence_name; + if (filename) { + // Report the source file. + cerr << " which was indicated in " << *filename; + } + cerr << endl; + exit(1); + } + // The length may still be missing. + } + + input_names_lengths.push_back(make_pair(sequence_name, length)); + }; + + + if (!filename.empty()) { + // TODO: As of right now HTSLib doesn't really let you iterate the sequence dictionary when you use its parser. So we use our own parser. + get_input_file(filename, [&](istream& in) { + for (string line; getline(in, line);) { + // Each line will produce a sequence name and a handle + string sequence_name = ""; + int64_t length = -1; + bool missing_length = false; + + // Trim leading and trailing whitespace + line.erase(line.begin(), find_if(line.begin(), line.end(), [](char ch) {return !isspace(ch);})); + line.erase(find_if(line.rbegin(), line.rend(), [](char ch) {return !isspace(ch);}).base(), line.end()); + + if (line.empty()) { + // Unless it is empty + continue; + } + + // See if each line starts with @SQ and we have to parse it, or @HD and we have to drop it, or if we have to handle it as a name. + if (starts_with(line, "@SQ")) { + // If it is SAM, split on tabs + auto parts = split_delims(line, "\t"); + + for (size_t i = 1; i < parts.size(); i++) { + if (starts_with(parts[i], "SN:")) { + // The rest of this field is the name + sequence_name = parts[i].substr(3); + } else if (starts_with(parts[i], "LN:")) { + // The rest of this field is a length number + length = stoll(parts[i].substr(3)); + } + } + + } else if (starts_with(line, "@HD")) { + // SAM header line also found in dict files. Drop it. + // TODO: Hope nobody named a sequence "@HD"-something + continue; + } else { + // Get the name from the line and the sequence from the graph + vector toks = split_delims(line, "\t"); + sequence_name = toks[0]; + if (toks.size() > 1) { + length = std::stol(toks[1]); + } else { + missing_length = true; + } + } + + if (sequence_name == "") { + cerr << "error:[vg::get_sequence_dictionary] No sequence name for line " << line << endl; + exit(1); + } + + if (!missing_length && length < 0) { + cerr << "error:[vg::get_sequence_dictionary] Unacceptable sequence length " << length << " for sequence " << sequence_name << endl; + exit(1); + } + + // Now record that we want the sequence. 
+ handle_sequence(sequence_name, length, &filename); + } + }); + + if (input_names_lengths.empty()) { + // There were no entries in the file + cerr << "error:[vg::get_sequence_dictionary] No sequence dictionary available in file: " << filename << endl; + exit(1); + } + } + + for (auto& name : path_names) { + // Supplement with any names that were directly provided + handle_sequence(name, -1, nullptr); + } + + if (input_names_lengths.empty()) { + // We got no paths in so we need to guess them. + // We will deduplicate their base names, without subpath info. + unordered_set base_names; + + // When we find a path or subpath we want, we will keep it. + auto keep_path_or_subpath = [&](const path_handle_t& path) { + string base_name = get_path_base_name(graph, path); + int64_t base_length = -1; + if (graph.get_subrange(path) == PathMetadata::NO_SUBRANGE) { + // This is a full path so we can determine base length now. + base_length = graph.get_path_length(path); + } + if (!base_names.count(base_name)) { + // This is the first time we have seen something on this path. + // Remember we need a length for it. + // TODO: make this a map so we can just max in here instead of doing another pass. + input_names_lengths.push_back(make_pair(base_name, base_length)); + // And remember we are using it. + base_names.insert(base_name); + } + // Remember this path as belonging to the right base name. + base_path_to_subpaths[base_name].push_back(path); + }; + + // First look for reference sense paths and their subpaths + graph.for_each_path_of_sense(PathSense::REFERENCE, keep_path_or_subpath); + + if (input_names_lengths.empty()) { + // If none of those exist, try generic sense paths and their subpaths + cerr << "warning:[vg::get_sequence_dictionary] No reference-sense paths available in the graph; falling back to generic paths." << endl; + graph.for_each_path_of_sense(PathSense::GENERIC, [&](const path_handle_t& path) { + if (Paths::is_alt(graph.get_path_name(path))) { + // Skip this path because it is a stored allele. + return; + } + // Otherwise, keep it. + keep_path_or_subpath(path); + }); + } + + if (input_names_lengths.empty()) { + // No non-alt generic paths either + cerr << "error:[vg::get_sequence_dictionary] No reference or non-alt-allele generic paths available in the graph!" << endl; + exit(1); + } + } + + // We fill in the "dictionary" (which is what SAM calls it; it's not a mapping for us) + // we also store the path length (from the graph) along with the base path length (from the user if specified, from paths otherwise) + vector> dictionary; + + for (auto& base_name_and_length : input_names_lengths) { + // For every base path name we have stuff on + + if (base_name_and_length.second == -1) { + // We need the overall length of this base path still. + for_each_subpath_of(graph, base_name_and_length.first, [&](const path_handle_t& path) { + // So scan it and all its subpaths + + subrange_t subrange = graph.get_subrange(path); + if (subrange == PathMetadata::NO_SUBRANGE) { + // Full path, use its length. + // TODO: probably we should have seen the full path's length by now if it existed. + base_name_and_length.second = graph.get_path_length(path); + } else { + // Subpath, work out where it ends and max it in + auto end_offset = subrange.second; + if (end_offset == PathMetadata::NO_END_POSITION) { + // If there was a stored end we would just trust it, but without it we have to compute it. 
+ end_offset = subrange.first + graph.get_path_length(path); + } + // max the end offset in against all the end offsets we have seen so far. + base_name_and_length.second = std::max(base_name_and_length.second, (int64_t) end_offset); + } + // Keep going + return true; + }); + } + + // Now we have the length, so we can fill in the dictionary. + for (auto& path : base_path_to_subpaths[base_name_and_length.first]) { + // For every subpath we found on the base path (possibly just + // itself), remember the subpath, the subpath's length, and the + // base path's length + dictionary.push_back(make_tuple(path, graph.get_path_length(path), (size_t)base_name_and_length.second)); + } + } + + return dictionary; +} + +// Give the footer length for rewriting BGZF EOF markers. +const size_t HTSWriter::BGZF_FOOTER_LENGTH = 28; + +HTSWriter::HTSWriter(const string& filename, const string& format, + const vector>& path_order_and_length, + const unordered_map& subpath_to_length, + size_t max_threads) : + out_file(filename == "-" ? nullptr : new ofstream(filename)), + multiplexer(out_file.get() != nullptr ? *out_file : cout, max_threads), + format(format), path_order_and_length(path_order_and_length), subpath_to_length(subpath_to_length), + backing_files(max_threads, nullptr), sam_files(max_threads, nullptr), + atomic_header(nullptr), sam_header(), header_mutex(), output_is_bgzf(format != "SAM"), + hts_mode() { + + // We can't work with no streams to multiplex, because we need to be able + // to write BGZF EOF blocks through the multiplexer at destruction. + assert(max_threads > 0); + + if (out_file.get() != nullptr && !*out_file) { + // Make sure we opened a file if we aren't writing to standard output + cerr << "[vg::HTSWriter] failed to open " << filename << " for writing" << endl; + exit(1); + } + + // Make sure we have an HTS format + assert(format == "SAM" || format == "BAM" || format == "CRAM"); + + // Compute the file mode to send to HTSlib depending on output format + char out_mode[5]; + string out_format = ""; + strcpy(out_mode, "w"); + if (format == "BAM") { + out_format = "b"; + } else if (format == "CRAM") { + out_format = "c"; + } else { + // Must be SAM + out_format = ""; + } + strcat(out_mode, out_format.c_str()); + int compress_level = 9; // TODO: support other compression levels + if (compress_level >= 0) { + char tmp[2]; + tmp[0] = compress_level + '0'; tmp[1] = '\0'; + strcat(out_mode, tmp); + } + // Save to a C++ string that we will use later. + hts_mode = out_mode; + + if (this->subpath_to_length.empty()) { + // no subpath support: just use lengths from path_order_and_length + for (const auto& pl : path_order_and_length) { + this->subpath_to_length[pl.first] = pl.second; + } + } + + // Each thread will lazily open its samFile*, once it has a header ready +} + +HTSWriter::~HTSWriter() { + // Note that the destructor runs in only one thread, and only when + // destruction is safe. No need to lock the header.
+ if (atomic_header.load() != nullptr) { + // Delete the header + bam_hdr_destroy(atomic_header.load()); + } + + for (size_t thread_number = 0; thread_number < sam_files.size(); thread_number++) { + // For each thread, find its samFile* + auto& sam_file = sam_files.at(thread_number); + + if (sam_file != nullptr) { + // Close out all the open samFile*s and flush their data before the + // multiplexer destructs + sam_close(sam_file); + + if (output_is_bgzf) { + // Discard all the BGZF EOF marker blocks + multiplexer.discard_bytes(thread_number, BGZF_FOOTER_LENGTH); + + // Put a barrier so subsequent writes come later. + multiplexer.register_barrier(thread_number); + } + } + } + + if (output_is_bgzf) { + // Now put one BGZF EOF marker in thread 0's stream. + // It will be the last thing, after all the barriers, and close the file. + vg::io::finish(multiplexer.get_thread_stream(0), true); + } + +} + +bam_hdr_t* HTSWriter::ensure_header(const string& read_group, + const string& sample_name, + size_t thread_number) { + bam_hdr_t* header = atomic_header.load(); + if (header == nullptr) { + // The header does not exist. + + // Lock the header mutex so we have exclusive control of the header + lock_guard header_lock(header_mutex); + + // Load into the enclosing scope header. Don't shadow, because the + // enclosing scope variable is what will get returned. + header = atomic_header.load(); + if (header == nullptr) { + // It is our turn to make the header. + + // Sniff out the read group and sample, and map from RG to sample + map rg_sample; + if (!read_group.empty() && !sample_name.empty()) { + // We have a sample and a read group + rg_sample[read_group] = sample_name; + } + + // Make the header + header = hts_string_header(sam_header, path_order_and_length, rg_sample); + + // Initialize the SAM file for this thread and actually keep the header + // we write, since we are the first thread. + initialize_sam_file(header, thread_number, true); + + // Save back to the atomic only after the header has been written and + // it is safe for other threads to use it. + atomic_header.store(header); + + // We made the header and the SAM file. + return header; + } + } + + // Otherwise, someone else beat us to creating the header. + // Header is ready. We just need to create the samFile* for this thread with it if it doesn't exist. + + if (sam_files[thread_number] == nullptr) { + // The header has been created and written, but hasn't been used to initialize our samFile* yet. + initialize_sam_file(header, thread_number); + } + + return header; +} + + +void HTSWriter::save_records(bam_hdr_t* header, vector& records, size_t thread_number) { + // We need a header and an extant samFile* + assert(header != nullptr); + assert(sam_files[thread_number] != nullptr); + + for (auto& b : records) { + // Emit each record + + if (sam_write1(sam_files[thread_number], header, b) == 0) { + cerr << "[vg::HTSWriter] error: writing to output file failed" << endl; + exit(1); + } + } + + for (auto& b : records) { + // Deallocate all the records + bam_destroy1(b); + } + + if (multiplexer.want_breakpoint(thread_number)) { + // We have written enough that we ought to give the multiplexer a chance to multiplex soon. + // There's no way to do this without closing and re-opening the HTS file. + // So just tear down and remake the samFile* for this thread.
+ initialize_sam_file(header, thread_number); + } +} + +void HTSWriter::initialize_sam_file(bam_hdr_t* header, size_t thread_number, bool keep_header) { + if (sam_files[thread_number] != nullptr) { + // A samFile* has been created already. Clear it out. + // Closing the samFile* flushes and destroys the BGZF (if any) and + // hFILE* backing it. + sam_close(sam_files[thread_number]); + + if (output_is_bgzf) { + // Now we know there's a closing empty BGZF block that htslib puts to + // mark EOF. We don't want that in the middle of our stream because it + // is weird and we aren't actually at EOF. + // We know how long it is, so we will trim it off. + multiplexer.discard_bytes(thread_number, BGZF_FOOTER_LENGTH); + } + + // Now place a breakpoint right where we were before that empty block. + multiplexer.register_breakpoint(thread_number); + } + + // Create a new samFile* for this thread + // hts_mode was filled in when the header was. + // hts_hopen demands a filename, but appears to just store it, and + // doesn't document how required it is. + backing_files[thread_number] = vg::io::hfile_wrap(multiplexer.get_thread_stream(thread_number)); + sam_files[thread_number] = hts_hopen(backing_files[thread_number], "-", hts_mode.c_str()); + + if (sam_files[thread_number] == nullptr) { + // We couldn't open the output samFile* + cerr << "[vg::HTSWriter] failed to open internal stream for writing " << format << " output" << endl; + exit(1); + } + + // Write the header again, which is the only way to re-initialize htslib's internals. + // Remember that sam_hdr_write flushes the BGZF to the hFILE*, but does not flush the hFILE*. + if (sam_hdr_write(sam_files[thread_number], header) != 0) { + cerr << "[vg::HTSWriter] error: failed to write the SAM header" << endl; + exit(1); + } + + // Now flush it out of the hFILE* buffer into the backing C++ stream + if (hflush(backing_files[thread_number]) != 0) { + cerr << "[vg::HTSWriter] error: failed to flush the SAM header" << endl; + exit(1); + } + + if (keep_header) { + // We are the first thread to write a header, so we actually want it. + // Place a barrier which is also a breakpoint, so all subsequent writes come later. + multiplexer.register_barrier(thread_number); + } else { + // Discard the header so it won't be in the resulting file again + multiplexer.discard_to_breakpoint(thread_number); + } +} + +HTSAlignmentEmitter::HTSAlignmentEmitter(const string& filename, const string& format, + const vector>& path_order_and_length, + const unordered_map& subpath_to_length, + size_t max_threads) + : HTSWriter(filename, format, path_order_and_length, subpath_to_length, max_threads) +{ + // nothing else to do +} + +void HTSAlignmentEmitter::convert_alignment(const Alignment& aln, vector>& cigar, bool& pos_rev, int64_t& pos, string& path_name) const { + + // We assume the position is available in refpos(0) + assert(aln.refpos_size() == 1); + path_name = aln.refpos(0).name(); + size_t path_len = 0; + if (path_name != "") { + path_len = subpath_to_length.at(path_name); + } + // Extract the position so that it could be adjusted by cigar_against_path if we decided to suppress softclips. Which we don't. + // TODO: Separate out softclip suppression.
+ pos = aln.refpos(0).offset(); + pos_rev = aln.refpos(0).is_reverse(); + cigar = cigar_against_path(aln, pos_rev, pos, path_len, 0); + + // Resolve subpath naming / offset + subrange_t subrange; + path_name = Paths::strip_subrange(path_name, &subrange); + if (subrange != PathMetadata::NO_SUBRANGE) { + pos += subrange.first; + } +} + +void HTSAlignmentEmitter::convert_unpaired(Alignment& aln, bam_hdr_t* header, vector& dest) { + // Look up the stuff we need from the Alignment to express it in BAM. + vector> cigar; + bool pos_rev; + int64_t pos; + string path_name; + convert_alignment(aln, cigar, pos_rev, pos, path_name); + + dest.emplace_back(alignment_to_bam(header, + aln, + path_name, + pos, + pos_rev, + cigar)); +} + +void HTSAlignmentEmitter::convert_paired(Alignment& aln1, Alignment& aln2, bam_hdr_t* header, int64_t tlen_limit, + vector& dest) { + // Look up the stuff we need from the Alignment to express it in BAM. + + + vector> cigar1, cigar2; + bool pos_rev1, pos_rev2; + int64_t pos1, pos2; + string path_name1, path_name2; + convert_alignment(aln1, cigar1, pos_rev1, pos1, path_name1); + convert_alignment(aln2, cigar2, pos_rev2, pos2, path_name2); + + // Determine the TLEN for each read. + auto tlens = compute_template_lengths(pos1, cigar1, pos2, cigar2); + + dest.emplace_back(alignment_to_bam(header, + aln1, + path_name1, + pos1, + pos_rev1, + cigar1, + path_name2, + pos2, + pos_rev2, + tlens.first, + tlen_limit)); + dest.emplace_back(alignment_to_bam(header, + aln2, + path_name2, + pos2, + pos_rev2, + cigar2, + path_name1, + pos1, + pos_rev1, + tlens.second, + tlen_limit)); + +} + +void HTSAlignmentEmitter::emit_singles(vector&& aln_batch) { + if (aln_batch.empty()) { + // Nothing to do + return; + } + + // Work out what thread we are + size_t thread_number = omp_get_thread_num(); + + // Make sure header exists + bam_hdr_t* header = ensure_header(aln_batch.front().read_group(), + aln_batch.front().sample_name(), thread_number); + assert(header != nullptr); + assert(sam_files[thread_number] != nullptr); + + vector records; + records.reserve(aln_batch.size()); + + for (auto& aln : aln_batch) { + // Convert each alignment to HTS format + convert_unpaired(aln, header, records); + } + + // Save to the stream for this thread. + save_records(header, records, thread_number); +} + +void HTSAlignmentEmitter::emit_mapped_singles(vector>&& alns_batch) { + // Count the total alignments to do + size_t count = 0; + // And find an alignment to base the header on + Alignment* sniff = nullptr; + for (auto& alns : alns_batch) { + count += alns.size(); + if (!alns.empty() && sniff == nullptr) { + sniff = &alns.front(); + } + } + + if (count == 0) { + // Nothing to do + return; + } + + // Work out what thread we are + size_t thread_number = omp_get_thread_num(); + + // Make sure header exists + assert(sniff != nullptr); + bam_hdr_t* header = ensure_header(sniff->read_group(), sniff->sample_name(), + thread_number); + assert(header != nullptr); + assert(sam_files[thread_number] != nullptr); + + vector records; + records.reserve(count); + + for (auto& alns : alns_batch) { + for (auto& aln : alns) { + // Convert each alignment to HTS format + convert_unpaired(aln, header, records); + } + } + + // Save to the stream for this thread. 
+ save_records(header, records, thread_number); + +} + + +void HTSAlignmentEmitter::emit_pairs(vector&& aln1_batch, + vector&& aln2_batch, + vector&& tlen_limit_batch) { + assert(aln1_batch.size() == aln2_batch.size()); + assert(aln1_batch.size() == tlen_limit_batch.size()); + + + if (aln1_batch.empty()) { + // Nothing to do + return; + } + + // Work out what thread we are + size_t thread_number = omp_get_thread_num(); + + // Make sure header exists + bam_hdr_t* header = ensure_header(aln1_batch.front().read_group(), + aln1_batch.front().sample_name(), thread_number); + assert(header != nullptr); + assert(sam_files[thread_number] != nullptr); + + vector records; + records.reserve(aln1_batch.size() * 2); + + for (size_t i = 0; i < aln1_batch.size(); i++) { + // Convert each alignment pair to HTS format + convert_paired(aln1_batch[i], aln2_batch[i], header, tlen_limit_batch[i], records); + } + + // Save to the stream for this thread. + save_records(header, records, thread_number); +} + + +void HTSAlignmentEmitter::emit_mapped_pairs(vector>&& alns1_batch, + vector>&& alns2_batch, + vector&& tlen_limit_batch) { + + assert(alns1_batch.size() == alns2_batch.size()); + assert(alns1_batch.size() == tlen_limit_batch.size()); + + // Count the total alignments to do + size_t count = 0; + // And find an alignment to base the header on + Alignment* sniff = nullptr; + for (size_t i = 0; i < alns1_batch.size(); i++) { + // Go through all pairs + + // Make sure each end of the pair has the same number of mappings + assert(alns1_batch[i].size() == alns2_batch[i].size()); + + count += alns1_batch[i].size() * 2; + if (!alns1_batch[i].empty() && sniff == nullptr) { + sniff = &alns1_batch[i].front(); + } + } + + if (count == 0) { + // Nothing to do + return; + } + + // Work out what thread we are + size_t thread_number = omp_get_thread_num(); + + // Make sure header exists + assert(sniff != nullptr); + bam_hdr_t* header = ensure_header(sniff->read_group(), sniff->sample_name(), + thread_number); + assert(header != nullptr); + assert(sam_files[thread_number] != nullptr); + + vector records; + records.reserve(count); + + for (size_t i = 0; i < alns1_batch.size(); i++) { + for (size_t j = 0; j < alns1_batch[i].size(); j++) { + // Convert each alignment pair to HTS format + convert_paired(alns1_batch[i][j], alns2_batch[i][j], header, tlen_limit_batch[i], records); + } + } + + // Save to the stream for this thread. 
+ save_records(header, records, thread_number); +} + +SplicedHTSAlignmentEmitter::SplicedHTSAlignmentEmitter(const string& filename, const string& format, + const vector>& path_order_and_length, + const unordered_map& subpath_to_length, + const PathPositionHandleGraph& graph, + size_t max_threads) : + HTSAlignmentEmitter(filename, format, path_order_and_length, subpath_to_length, max_threads), graph(graph) { + + // nothing else to do +} + +void SplicedHTSAlignmentEmitter::convert_alignment(const Alignment& aln, vector>& cigar, + bool& pos_rev, int64_t& pos, string& path_name) const { + + // We assume the position is available in refpos(0) + assert(aln.refpos_size() == 1); + path_name = aln.refpos(0).name(); + pos = aln.refpos(0).offset(); + pos_rev = aln.refpos(0).is_reverse(); + + // Convert to a cigar with spliced deletions + cigar = spliced_cigar_against_path(aln, path_name, pos, pos_rev); +} + +vector> SplicedHTSAlignmentEmitter::spliced_cigar_against_path(const Alignment& aln, + const string& path_name, + int64_t pos, bool rev) const { + // the return value + vector> cigar; + + if (aln.has_path() && aln.path().mapping_size() > 0) { + // the read is aligned to the path + + path_handle_t path_handle = graph.get_path_handle(path_name); + step_handle_t step = graph.get_step_at_position(path_handle, pos); + + // to indicate whether we've found the edit that corresponds to the BAM position + bool found_pos = false; + + const Path& path = aln.path(); + for (size_t i = 0; i < path.mapping_size(); ++i) { + + // we traverse backwards on a reverse strand mapping + const Mapping& mapping = path.mapping(rev ? path.mapping_size() - 1 - i : i); + + for (size_t j = 0; j < mapping.edit_size(); ++j) { + + // we traverse backwards on a reverse strand mapping + const Edit& edit = mapping.edit(rev ? mapping.edit_size() - 1 - j : j); + + if (!found_pos) { + // we may still be searching through an initial softclip to find + // the edit that corresponds to the BAM position + if (edit.to_length() > 0 && edit.from_length() == 0) { + append_cigar_operation(edit.to_length(), 'S', cigar); + // skip the main block where we assign cigar operations + continue; + } + else { + found_pos = true; + } + } + + // identify the cigar operation + char cigar_code; + int length; + if (edit.from_length() == edit.to_length()) { + cigar_code = 'M'; + length = edit.from_length(); + } + else if (edit.from_length() > 0 && edit.to_length() == 0) { + cigar_code = 'D'; + length = edit.from_length(); + } + else if (edit.to_length() > 0 && edit.from_length() == 0) { + cigar_code = 'I'; + length = edit.to_length(); + } + else { + throw std::runtime_error("Spliced CIGAR construction can only convert simple edits"); + } + + append_cigar_operation(length, cigar_code, cigar); + } // close loop over edits + + if (found_pos && i + 1 < path.mapping_size()) { + // we're anchored on the path by the annotated position, and we're transitioning between + // two mappings, so we should check for a deletion/splice edge + + step_handle_t next_step = graph.get_next_step(step); + + handle_t next_handle = graph.get_handle_of_step(next_step); + const Position& next_pos = path.mapping(rev ? 
path.mapping_size() - 2 - i : i + 1).position(); + if (graph.get_id(next_handle) != next_pos.node_id() + || (graph.get_is_reverse(next_handle) != next_pos.is_reverse()) != rev) { + + // the next mapping in the alignment is not the next mapping on the path, so we must have + // taken a deletion + + // find the closest step that is further along the path than the current one + // and matches the next position (usually there will only be one) + size_t curr_offset = graph.get_position_of_step(step); + size_t nearest_offset = numeric_limits::max(); + graph.for_each_step_on_handle(graph.get_handle(next_pos.node_id()), + [&](const step_handle_t& candidate) { + + if (graph.get_path_handle_of_step(candidate) == path_handle) { + size_t candidate_offset = graph.get_position_of_step(candidate); + if (candidate_offset < nearest_offset && candidate_offset > curr_offset) { + nearest_offset = candidate_offset; + next_step = candidate; + } + } + }); + + if (nearest_offset == numeric_limits::max()) { + throw std::runtime_error("Spliced BAM conversion could not find path steps that match alignment"); + } + + // the gap between the current step and the next one along the path + size_t deletion_length = (nearest_offset - curr_offset - + graph.get_length(graph.get_handle_of_step(step))); + + // add to the cigar + if (deletion_length >= min_splice_length) { + // long enough to be a splice + append_cigar_operation(deletion_length, 'N', cigar); + } + else if (deletion_length) { + // create or extend a deletion + append_cigar_operation(deletion_length, 'D', cigar); + } + } + + // iterate along the path + step = next_step; + } + } // close loop over mappings + + if (cigar.back().second == 'I') { + // the final insertion is actually a softclip + cigar.back().second = 'S'; + } + } + + simplify_cigar(cigar); + + return cigar; +} + + +} diff --git a/src/hts_alignment_emitter.hpp b/src/hts_alignment_emitter.hpp new file mode 100644 index 00000000000..da08901f623 --- /dev/null +++ b/src/hts_alignment_emitter.hpp @@ -0,0 +1,291 @@ +#ifndef VG_HTS_ALIGNMENT_EMITTER_HPP_INCLUDED +#define VG_HTS_ALIGNMENT_EMITTER_HPP_INCLUDED + +/** + * \file alignment_emitter.hpp + * + * Defines a system for emitting alignments and groups of alignments in multiple formats. + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include "handle.hpp" +#include "vg/io/alignment_emitter.hpp" + +namespace vg { +using namespace std; + +using namespace vg::io; + +/** + * Flag enum for controlling the behavior of alignment emitters behind get_alignment_emitter(). + */ +enum alignment_emitter_flags_t { + /// Value for no flags set. + ALIGNMENT_EMITTER_FLAG_NONE = 0, + /// Skip surjection, and expect pre-surjected alignments. + ALIGNMENT_EMITTER_FLAG_HTS_RAW = 1, + /// Use splicing-aware conversion to HTSlib formats: + /// alignments are spliced at known splice sites (i.e. edges in the graph), so + /// form spliced CIGAR strings + ALIGNMENT_EMITTER_FLAG_HTS_SPLICED = 2, + /// When surjecting, discard low-complexity anchors and realign more freely + /// against the target path. + ALIGNMENT_EMITTER_FLAG_HTS_PRUNE_SUSPICIOUS_ANCHORS = 4, + /// Emit graph alignments in named segment (i.e. GFA space) instead of + /// numerical node ID space. + ALIGNMENT_EMITTER_FLAG_VG_USE_SEGMENT_NAMES = 8 +}; + +/// Get an AlignmentEmitter that can emit to the given file (or "-") in the +/// given format. 
When writing HTSlib formats (SAM, BAM, CRAM), paths should +/// contain the paths in the linear reference in sequence dictionary order (see +/// get_sequence_dictionary), and a PathPositionHandleGraph must be provided. +/// When writing GAF, a HandleGraph must be provided for obtaining node lengths +/// and sequences. Other formats do not need a graph. +/// +/// flags is an ORed together set of flags from alignment_emitter_flags_t. +/// +/// Automatically applies per-thread buffering, but needs to know how many OMP +/// threads will be in use. +unique_ptr get_alignment_emitter(const string& filename, const string& format, + const vector>& paths, size_t max_threads, + const HandleGraph* graph = nullptr, int flags = ALIGNMENT_EMITTER_FLAG_NONE); + +/** + * Produce a list of path handles in a fixed order, suitable for use with + * get_alignment_emitter_with_surjection(), by parsing a file. The file may be + * an HTSlib-style "sequence dictionary" (consisting of SAM @SQ header lines), + * or a plain list of sequence names (which do not start with "@SQ"). If the + * file is not openable or contains no entries, reports an error and quits. + * + * If path_names has entries, they are treated as path names that supplement + * those in the file, if any. + * + * If the filename is itself an empty string, and no path names are passed, + * then all reference-sense paths from the graph will be collected in arbitrary + * order. If there are none, all non-alt-allele generic sense paths from the + * graph will be collected in arbitrary order. + * + * TODO: Be able to generate the autosomes human-sort, X, Y, MT order typical + * of references. + * + * The tuple is + * For a subpath (ie chr1[1000-10000]) the base path length would be that of chr1 + * This information needs to come from the user in order to be correct, but + * if it's not specified, it'll be guessed from the graph + */ +vector> get_sequence_dictionary(const string& filename, const vector& path_names, const PathPositionHandleGraph& graph); + +/** + * Given a list of path handles and size info (from get_sequence_dictionary), return two things: + * 1) names and lengths of all of base paths in order. + * 2) a mapping of path names to length (reflects paths in the graph including subpaths) + * + * If subpath_support is set to false, there won't be a distinction. + */ +pair>, unordered_map> extract_path_metadata( + const vector>& paths, const PathPositionHandleGraph& graph, + bool subpath_support = false); + +/* + * A class that can write SAM/BAM/CRAM files from parallel threads + */ +class HTSWriter { +public: + /// Create an HTSWriter writing to the given file (or "-") in the + /// given HTS format ("SAM", "BAM", "CRAM"). path_order_and_length must give each + /// contig name and length to include in the header. Sample names and read + /// groups for the header will be guessed from the first reads. HTSlib + /// positions will be read from the alignments' refpos, and the alignments + /// must be surjected. + HTSWriter(const string& filename, const string& format, const vector>& path_order_and_length, + const unordered_map& subpath_to_length, size_t max_threads); + + /// Tear down an HTSWriter and destroy HTSlib structures. 
+ ~HTSWriter(); + + // Not copyable or movable + HTSWriter(const HTSWriter& other) = delete; + HTSWriter& operator=(const HTSWriter& other) = delete; + HTSWriter(HTSWriter&& other) = delete; + HTSWriter& operator=(HTSWriter&& other) = delete; + +protected: + + /// We hack about with htslib's BGZF EOF footers, so we need to know how long they are. + static const size_t BGZF_FOOTER_LENGTH; + + /// If we are doing output to a file, this will hold the open file. Otherwise (for stdout) it will be empty. + unique_ptr out_file; + /// This holds a StreamMultiplexer on the output stream, for sharing it + /// between threads. + vg::io::StreamMultiplexer multiplexer; + + /// This holds our format name, for later error messages. + string format; + + /// Store the path names and lengths in the order to put them in the header. + vector> path_order_and_length; + /// With subpath support, the above list will store base path information for the header. + /// The actual path lengths go here: + unordered_map subpath_to_length; + + /// To back our samFile*s, we need the hFILE* objects wrapping our C++ + /// streams. We need to manually flush these after HTS headers are written, + /// since bgzf_flush, which samtools calls, closes a BGZF block and sends + /// the data to the hFILE* but does not actually flush the hFILE*. + /// These will be pointers to the hFILE* for each thread's samFile*. We may + /// only use them while the samFile* they belong to is still open; closing + /// the samFile* will free the hFILE* but not null it out of this vector. + vector backing_files; + + /// We make one samFile* per thread, on each thread's output stream from + /// the multiplexer. As soon as we create them, we show them the header, so + /// they are initialized properly. If they have not yet been filled in + /// (because the header is not ready yet), they are null. + vector sam_files; + + /// We need a header + atomic atomic_header; + /// We also need a header string. + /// Not atomic, because by the time we read it we know the header is ready + /// and nobody is writing to it. + string sam_header; + /// If the header isn't present when we want to write, we need a mutex to control creating it. + mutex header_mutex; + + /// Remember if we are outputting BGZF-compressed data or not. + /// If we are, we trim off spurious EOF markers and append our own. + bool output_is_bgzf; + + /// Remember the HTSlib mode string we need to open our files. + string hts_mode; + + /// Write and deallocate a bunch of BAM records. Takes care of locking the + /// file. Header must have been written already. + void save_records(bam_hdr_t* header, vector& records, size_t thread_number); + + /// Make sure that the HTS header has been written, and the samFile* in + /// sam_files has been created for the given thread. + /// + /// If the header has not been written, blocks until it has been written. + /// + /// If we end up being the thread to write it, sniff header information + /// from the given alignment. + /// + /// Returns the header pointer, so we don't have to do another atomic read + /// later. + bam_hdr_t* ensure_header(const string& read_group, const string& sample_name, size_t thread_number); + + /// Given a header and a thread number, make sure the samFile* for that + /// thread is initialized and ready to have alignments written to it. If + /// keep_header is true, actually writes the given header into the output file created by + /// the multiplexer. If the samFile* was already initialized, flushes it + /// out and makes a breakpoint.
+ void initialize_sam_file(bam_hdr_t* header, size_t thread_number, bool keep_header = false); +}; + +/** + * Emit Alignments to a stream in SAM/BAM/CRAM format. + * Thread safe. + */ +class HTSAlignmentEmitter : public AlignmentEmitter, public HTSWriter { +public: + /// Create an HTSAlignmentEmitter writing to the given file (or "-") in the + /// given HTS format ("SAM", "BAM", "CRAM"). path_order_and_length must give + /// contig names and lengths to include in the header, in order. Sample + /// names and read groups for the header will be guessed from the first + /// reads. HTSlib positions will be read from the alignments' refpos, and + /// the alignments must be surjected. + HTSAlignmentEmitter(const string& filename, const string& format, + const vector>& path_order_and_length, + const unordered_map& subpath_to_length, size_t max_threads); + + /// Tear down an HTSAlignmentEmitter and destroy HTSlib structures. + ~HTSAlignmentEmitter() = default; + + // Not copyable or movable + HTSAlignmentEmitter(const HTSAlignmentEmitter& other) = delete; + HTSAlignmentEmitter& operator=(const HTSAlignmentEmitter& other) = delete; + HTSAlignmentEmitter(HTSAlignmentEmitter&& other) = delete; + HTSAlignmentEmitter& operator=(HTSAlignmentEmitter&& other) = delete; + + + /// Emit a batch of Alignments. + void emit_singles(vector&& aln_batch); + /// Emit a batch of Alignments with secondaries. All secondaries must have + /// is_secondary set already. + void emit_mapped_singles(vector>&& alns_batch); + /// Emit a batch of pairs of Alignments. + void emit_pairs(vector&& aln1_batch, vector&& aln2_batch, + vector&& tlen_limit_batch); + /// Emit the mappings of a batch of pairs of Alignments. All secondaries + /// must have is_secondary set already. + void emit_mapped_pairs(vector>&& alns1_batch, + vector>&& alns2_batch, vector&& tlen_limit_batch); + +private: + + virtual void convert_alignment(const Alignment& aln, vector>& cigar, bool& pos_rev, int64_t& pos, string& path_name) const; + + /// Convert an unpaired alignment to HTS format. + /// Header must have been created already. + void convert_unpaired(Alignment& aln, bam_hdr_t* header, vector& dest); + /// Convert a paired alignment to HTS format. + /// Header must have been created already. + void convert_paired(Alignment& aln1, Alignment& aln2, bam_hdr_t* header, int64_t tlen_limit, + vector& dest); + +}; + +/* + * An HTSAlignmentEmitter that tries to detect splice edges in + * the input data so that they can be encoded as N CIGAR operations + */ +class SplicedHTSAlignmentEmitter : public HTSAlignmentEmitter { + +public: + + SplicedHTSAlignmentEmitter(const string& filename, const string& format, + const vector>& path_order_and_length, + const unordered_map& subpath_to_length, + const PathPositionHandleGraph& graph, + size_t max_threads); + + ~SplicedHTSAlignmentEmitter() = default; + + /// The minimum length of a deletion relative to the path that will be coded as a splice junction in the CIGAR + size_t min_splice_length = 20; + +private: + + /// Override of convert_alignment that converts splices implicitly + void convert_alignment(const Alignment& aln, vector>& cigar, bool& pos_rev, int64_t& pos, string& path_name) const; + + /// Convert a spliced alignment against a path to a cigar. The alignment must be + /// colinear along a path and contain only mappings on the path, but it can have + /// deletions relative to the path that follow edges in the graph.
+ vector> spliced_cigar_against_path(const Alignment& aln, const string& path_name, int64_t pos, + bool rev) const; + + /// Graph that alignments were aligned against + const PathPositionHandleGraph& graph; + +}; + + +} + + +#endif diff --git a/src/identity_overlay.cpp b/src/identity_overlay.cpp new file mode 100644 index 00000000000..38ab1b750e9 --- /dev/null +++ b/src/identity_overlay.cpp @@ -0,0 +1,71 @@ +/** + * \file identity_overlay.cpp: contains the implementation of IdentityOverlay + */ + + +#include "identity_overlay.hpp" + + +namespace vg { + +using namespace std; + + IdentityOverlay::IdentityOverlay(const HandleGraph* graph) : graph(graph) { + + } + + bool IdentityOverlay::has_node(id_t node_id) const { + return graph->has_node(node_id); + } + + handle_t IdentityOverlay::get_handle(const id_t& node_id, bool is_reverse) const { + return graph->get_handle(node_id, is_reverse); + } + + id_t IdentityOverlay::get_id(const handle_t& handle) const { + return graph->get_id(handle); + } + + bool IdentityOverlay::get_is_reverse(const handle_t& handle) const { + return graph->get_is_reverse(handle); + } + + handle_t IdentityOverlay::flip(const handle_t& handle) const { + return graph->flip(handle); + } + + size_t IdentityOverlay::get_length(const handle_t& handle) const { + return graph->get_length(handle); + } + + string IdentityOverlay::get_sequence(const handle_t& handle) const { + return graph->get_sequence(handle); + } + + bool IdentityOverlay::follow_edges_impl(const handle_t& handle, bool go_left, + const function& iteratee) const { + return graph->follow_edges(handle, go_left, iteratee); + } + + bool IdentityOverlay::for_each_handle_impl(const function& iteratee, + bool parallel) const { + return graph->for_each_handle(iteratee, parallel); + } + + size_t IdentityOverlay::get_node_count() const { + return graph->get_node_count(); + } + + id_t IdentityOverlay::min_node_id() const { + return graph->min_node_id(); + } + + id_t IdentityOverlay::max_node_id() const { + return graph->max_node_id(); + } + + handle_t IdentityOverlay::get_underlying_handle(const handle_t& handle) const { + return handle; + } +} + diff --git a/src/identity_overlay.hpp b/src/identity_overlay.hpp new file mode 100644 index 00000000000..bd4570a269a --- /dev/null +++ b/src/identity_overlay.hpp @@ -0,0 +1,96 @@ +#ifndef VG_IDENTITY_OVERLAY_HPP_INCLUDED +#define VG_IDENTITY_OVERLAY_HPP_INCLUDED + +/** \file + * identity_overlay.hpp: defines an overlay that does not modify the underlying graph + */ + +#include "handle.hpp" + +namespace vg { + +using namespace std; + + /** + * A HandleGraph overlay that does not modify the underlying graph but is still + * useful for providing a uniform interface when choosing between overlays. 
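+     *
+     * A minimal usage sketch (hedged; `backing` stands in for some existing
+     * HandleGraph pointer and is not a name defined in this file). Every query
+     * is forwarded unchanged to the backing graph, and get_underlying_handle()
+     * returns exactly the handle it was given:
+     *
+     *     IdentityOverlay overlay(backing);
+     *     overlay.for_each_handle([&](const handle_t& h) {
+     *         // Overlay handles and underlying handles are the same objects.
+     *         assert(overlay.get_underlying_handle(h) == h);
+     *     });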
+ */ + class IdentityOverlay : public ExpandingOverlayGraph { + public: + + /// Initialize over the given underlying graph + IdentityOverlay(const HandleGraph* graph); + + /// Default constructor -- not actually functional + IdentityOverlay() = default; + + /// Default destructor + ~IdentityOverlay() = default; + + ////////////////////////// + /// HandleGraph interface + ////////////////////////// + + // Method to check if a node exists by ID + bool has_node(id_t node_id) const; + + /// Look up the handle for the node with the given ID in the given orientation + handle_t get_handle(const id_t& node_id, bool is_reverse = false) const; + + /// Get the ID from a handle + id_t get_id(const handle_t& handle) const; + + /// Get the orientation of a handle + bool get_is_reverse(const handle_t& handle) const; + + /// Invert the orientation of a handle (potentially without getting its ID) + handle_t flip(const handle_t& handle) const; + + /// Get the length of a node + size_t get_length(const handle_t& handle) const; + + /// Get the sequence of a node, presented in the handle's local forward + /// orientation. + string get_sequence(const handle_t& handle) const; + + /// Loop over all the handles to next/previous (right/left) nodes. Passes + /// them to a callback which returns false to stop iterating and true to + /// continue. Returns true if we finished and false if we stopped early. + bool follow_edges_impl(const handle_t& handle, bool go_left, const function& iteratee) const; + + /// Loop over all the nodes in the graph in their local forward + /// orientations, in their internal stored order. Stop if the iteratee + /// returns false. Can be told to run in parallel, in which case stopping + /// after a false return value is on a best-effort basis and iteration + /// order is not defined. + bool for_each_handle_impl(const function& iteratee, bool parallel = false) const; + + /// Return the number of nodes in the graph + size_t get_node_count() const; + + /// Return the smallest ID in the graph, or some smaller number if the + /// smallest ID is unavailable. Return value is unspecified if the graph is empty. + id_t min_node_id() const; + + /// Return the largest ID in the graph, or some larger number if the + /// largest ID is unavailable. Return value is unspecified if the graph is empty.
+ id_t max_node_id() const; + + /////////////////////////////////// + /// ExpandingOverlayGraph interface + /////////////////////////////////// + + /** + * Returns the handle in the underlying graph that corresponds to a handle in the + * overlay + */ + handle_t get_underlying_handle(const handle_t& handle) const; + + private: + /// The underlying graph + const HandleGraph* graph = nullptr; + }; +} + +#endif diff --git a/src/incremental_subgraph.cpp b/src/incremental_subgraph.cpp new file mode 100644 index 00000000000..33066ea4d02 --- /dev/null +++ b/src/incremental_subgraph.cpp @@ -0,0 +1,333 @@ +/** + * \file incremental_subgraph.cpp: contains the implementation of IncrementalSubgraph + */ + + +#include "incremental_subgraph.hpp" + +//#define debug_incremental_subgraph + + +namespace vg { + +using namespace std; + +IncrementalSubgraph::IncrementalSubgraph(const HandleGraph& graph, + const pos_t& start_pos, + bool extract_left, + int64_t max_distance, + size_t frontier_copy_limit, + size_t max_num_nodes) : + graph(&graph), extract_left(extract_left), max_distance(max_distance), frontier_copy_limit(frontier_copy_limit), max_num_nodes(max_num_nodes) +{ +#ifdef debug_incremental_subgraph + cerr << "initializing incremental graph from " << start_pos << " in " << (extract_left ? "left" : "right") << " direction up to distance " << max_distance << ", max frontier count " << frontier_copy_limit << endl; +#endif + + handle_t start = graph.get_handle(id(start_pos), is_rev(start_pos)); + int64_t dist = extract_left ? offset(start_pos) - graph.get_length(start) : -offset(start_pos); + + extracted.emplace_back(start, vector(), vector(), dist, dist); + + // initialize the frontier + int64_t dist_thru = dist + graph.get_length(start); + if (dist_thru < max_distance) { + graph.follow_edges(start, extract_left, [&](const handle_t& next) { + + // add all the back edges except the one we're using + auto unseen_back_edges = new unordered_set(); + graph.follow_edges(next, !extract_left, [&](const handle_t& prev) { + if (prev != start) { + // mark all edges unseen except the one we're traversing + unseen_back_edges->emplace(prev); + } + }); + auto seen_back_edges = new vector(1, 0); + // add the frontier and its random access index + auto entry = frontier.emplace(dist_thru, next, unseen_back_edges, seen_back_edges); + ++frontier_count[next]; + auto& target_index = frontier_index[next]; + graph.follow_edges(next, !extract_left, [&](const handle_t& prev) { + if (prev != start) { + // mark all edges unseen except the one we're traversing + target_index[prev].emplace(entry.first); + } + }); + +#ifdef debug_incremental_subgraph + cerr << "initialized frontier with node " << graph.get_id(next) << " " << graph.get_is_reverse(next) << " at " << &(*entry.first) << ", which has unseen backward edges to:" << endl; + for (auto h : *unseen_back_edges) { + cerr << "\t" << graph.get_id(h) << " " << graph.get_is_reverse(h) << endl; + } + cerr << "allocated unseen edges at " << unseen_back_edges << endl; +#endif + }); + } + +} + +IncrementalSubgraph::~IncrementalSubgraph() { + for (auto& record : frontier) { + delete get<2>(record); + delete get<3>(record); + } +} + +bool IncrementalSubgraph::is_extendable() const { + return !frontier.empty() && get_node_count() < max_num_nodes; +} + +handle_t IncrementalSubgraph::extend() { + // get frontier group with fewest uncovered edges (breaking ties arbitrarily) + auto it = frontier.begin(); + auto nearest = *it; + +#ifdef debug_incremental_subgraph + cerr << "####" << endl; + cerr << 
"extracting a copy of " << graph->get_id(get<1>(nearest)) << " " << graph->get_is_reverse(get<1>(nearest)) << " at " << &(*it) << ", which has remaining unseen backward edges at " << get<2>(nearest) << " to:" << endl; + for (auto h : *get<2>(nearest)) { + cerr << "\t" << graph->get_id(h) << " " << graph->get_is_reverse(h) << endl; + } + cerr << "and seen backward edges to:" << endl; + for (auto i : *get<3>(nearest)) { + auto h = get<0>(extracted[i]); + cerr << "\t" << i << " (" << graph->get_id(h) << " " << graph->get_is_reverse(h) << ")" << endl; + } +#endif + + // remove the node from the frontier + frontier.erase(it); + --frontier_count[get<1>(nearest)]; + + // the index for copies of this node in the frontier + auto& target_index = frontier_index.at(get<1>(nearest)); + for (auto prev : *get<2>(nearest)) { + // the index for all unseen predecessors of this frontier node + auto& source_index = target_index.at(prev); + // the frontier node we're removing should always be the first one + source_index.erase(source_index.begin()); + } + + // make a node in the extracted subgraph + extracted.emplace_back(); + auto& extracted_record = extracted.back(); + get<0>(extracted_record) = get<1>(nearest); + + // add the edges + get<1>(extracted_record) = move(*get<3>(nearest)); + for (size_t prev : get<1>(extracted_record)) { + get<2>(extracted[prev]).push_back(extracted.size() - 1); + } + + // retrieve minimum distance + get<3>(extracted_record) = get<0>(nearest); + // figure out the maximum distance + get<4>(extracted_record) = 0; + for (size_t prev : get<1>(extracted_record)) { + auto& prev_record = extracted[prev]; + get<4>(extracted_record) = max(get<4>(extracted_record), + get<4>(prev_record) + graph->get_length(get<0>(prev_record))); + } + +#ifdef debug_incremental_subgraph + cerr << "min distance: " << get<3>(extracted_record) << ", max distance: " << get<4>(extracted_record) << endl; +#endif + + // the distance to the end of this node + int64_t dist_thru = get<0>(nearest) + graph->get_length(get<1>(nearest)); + + graph->follow_edges(get<1>(nearest), extract_left, [&](const handle_t& next) { + // see if we can mark this edge on one of the copies of the next node + // that are currently in the frontier + bool marked_edge = false; + auto index_iter = frontier_index.find(next); + if (index_iter != frontier_index.end()) { + // this node exists in the frontier + auto source_iter = index_iter->second.find(get<1>(nearest)); + if (source_iter != index_iter->second.end() && !source_iter->second.empty()) { + // there are copies of this node in the frontier that haven't had + // this edge marked yet, get the highest priority copy + auto frontier_iter = *source_iter->second.begin(); + auto frontier_entry = *frontier_iter; + + // remove this frontier entry from everywhere that the index is holding it + for (auto prev : *get<2>(frontier_entry)) { + + index_iter->second.at(prev).erase(frontier_iter); + } + frontier.erase(frontier_iter); + // mark the unseen edge and move it to the seen edges + get<2>(frontier_entry)->erase(get<1>(nearest)); + get<3>(frontier_entry)->push_back(extracted.size() - 1); + // possibly update the minimum distance + get<0>(frontier_entry) = min(get<0>(frontier_entry), dist_thru); + // put it back in the frontier + auto new_entry_iter = frontier.emplace(frontier_entry); + // reinsert the new iterator everwhere it needs to go in the + // random access index + for (auto prev : *get<2>(frontier_entry)) { + index_iter->second.at(prev).emplace(new_entry_iter.first); + } + +#ifdef 
debug_incremental_subgraph + cerr << "updated frontier node " << graph->get_id(next) << " " << graph->get_is_reverse(next) << ", which has unseen backward edges to:" << endl; + for (auto h : *get<2>(frontier_entry)) { + cerr << "\t" << graph->get_id(h) << " " << graph->get_is_reverse(h) << endl; + } +#endif + + marked_edge = true; + } + } + + // TODO: it can be suboptimal (in terms of distance) to always reject the incoming + // frontier node instead of the current lowest-priority one, but we would need + // another whole index to do that... + + if (!marked_edge && dist_thru < max_distance && frontier_count[next] < frontier_copy_limit) { + // we need to add a new copy of this node to the frontier + + auto unseen_back_edges = new unordered_set(); + graph->follow_edges(next, !extract_left, [&](const handle_t& prev) { + if (prev != get<1>(nearest)) { + // mark all edges unseen except the one we're traversing + unseen_back_edges->emplace(prev); + } + }); + auto seen_back_edges = new vector(1, extracted.size() - 1); + // add the frontier and its random access index + auto entry = frontier.emplace(dist_thru, next, unseen_back_edges, seen_back_edges); + ++frontier_count[next]; + auto& successor_index = frontier_index[next]; + graph->follow_edges(next, !extract_left, [&](const handle_t& prev) { + if (prev != get<1>(nearest)) { + // mark all edges unseen except the one we're traversing + successor_index[prev].emplace(entry.first); + } + }); +#ifdef debug_incremental_subgraph + cerr << "added new frontier copy of node " << graph->get_id(next) << " " << graph->get_is_reverse(next) << " at " << &(*entry.first) << ", which has unseen backward edges to:" << endl; + for (auto h : *unseen_back_edges) { + cerr << "\t" << graph->get_id(h) << " " << graph->get_is_reverse(h) << endl; + } + cerr << "allocated unseen edges at " << unseen_back_edges << endl; +#endif + } + }); + + // clean up the heap objects + delete get<2>(nearest); + delete get<3>(nearest); + + return get_handle(extracted.size(), false); +} + +size_t IncrementalSubgraph::order_of(const handle_t& handle) const { + return handlegraph::number_bool_packing::unpack_number(handle); +} + +handle_t IncrementalSubgraph::handle_at_order(size_t i) const { + return handlegraph::number_bool_packing::pack(i, false); +} + +int64_t IncrementalSubgraph::min_distance_from_start(const handle_t& handle) const { + return get<3>(extracted[order_of(handle)]); +} + +int64_t IncrementalSubgraph::max_distance_from_start(const handle_t& handle) const { + return get<4>(extracted[order_of(handle)]); +} + +bool IncrementalSubgraph::extracting_left() const { + return extract_left; +} + +bool IncrementalSubgraph::has_node(id_t node_id) const { + return node_id > 0 && node_id <= extracted.size(); +} + +handle_t IncrementalSubgraph::get_handle(const id_t& node_id, bool is_reverse) const { + return handlegraph::number_bool_packing::pack(node_id - 1, is_reverse); +} + +id_t IncrementalSubgraph::get_id(const handle_t& handle) const { + return order_of(handle) + 1; +} + +bool IncrementalSubgraph::get_is_reverse(const handle_t& handle) const { + return handlegraph::number_bool_packing::unpack_bit(handle); +} + +handle_t IncrementalSubgraph::flip(const handle_t& handle) const { + return handlegraph::number_bool_packing::toggle_bit(handle); +} + +size_t IncrementalSubgraph::get_length(const handle_t& handle) const { + return graph->get_length(get<0>(extracted[order_of(handle)])); +} + +string IncrementalSubgraph::get_sequence(const handle_t& handle) const { + return 
graph->get_sequence(get_underlying_handle(handle)); +} + +bool IncrementalSubgraph::follow_edges_impl(const handle_t& handle, bool go_left, + const function<bool(const handle_t&)>& iteratee) const { + bool left_edges = (go_left != get_is_reverse(handle)) != extract_left; + auto& edges = left_edges ? get<1>(extracted[order_of(handle)]) : get<2>(extracted[order_of(handle)]); + bool keep_going = true; + for (size_t i = 0; i < edges.size() && keep_going; ++i) { + keep_going = iteratee(handlegraph::number_bool_packing::pack(edges[i], + get_is_reverse(handle))); + } + return keep_going; +} + +bool IncrementalSubgraph::for_each_handle_impl(const function<bool(const handle_t&)>& iteratee, + bool parallel) const { + bool keep_going = true; + for (size_t i = 0; i < extracted.size() && keep_going; ++i) { + keep_going = iteratee(handlegraph::number_bool_packing::pack(i, false)); + } + // not doing parallel, never expect to use it + return keep_going; +} + +size_t IncrementalSubgraph::get_node_count() const { + return extracted.size(); +} + +id_t IncrementalSubgraph::min_node_id() const { + return 1; +} + +id_t IncrementalSubgraph::max_node_id() const { + return extracted.size(); +} + +size_t IncrementalSubgraph::get_degree(const handle_t& handle, bool go_left) const { + bool left_edges = (go_left != get_is_reverse(handle)) != extract_left; + return (left_edges ? get<1>(extracted[order_of(handle)]) : get<2>(extracted[order_of(handle)])).size(); +} + +size_t IncrementalSubgraph::get_edge_count() const { + size_t count = 0; + for (const auto& record : extracted) { + count += get<1>(record).size(); + } + return count; +} + +char IncrementalSubgraph::get_base(const handle_t& handle, size_t index) const { + return graph->get_base(get_underlying_handle(handle), index); +} + +string IncrementalSubgraph::get_subsequence(const handle_t& handle, size_t index, size_t size) const { + return graph->get_subsequence(get_underlying_handle(handle), index, size); +} + +handle_t IncrementalSubgraph::get_underlying_handle(const handle_t& handle) const { + auto underlying = get<0>(extracted[order_of(handle)]); + return get_is_reverse(handle) ? graph->flip(underlying) : underlying; +} +} + diff --git a/src/incremental_subgraph.hpp b/src/incremental_subgraph.hpp new file mode 100644 index 00000000000..b8f4cbd4de1 --- /dev/null +++ b/src/incremental_subgraph.hpp @@ -0,0 +1,205 @@ +/** \file + * incremental_subgraph.hpp: defines a subgraph that is extracted from the parent + * graph on an as-needed basis + */ +#ifndef VG_INCREMENTAL_SUBGRAPH_HPP_INCLUDED +#define VG_INCREMENTAL_SUBGRAPH_HPP_INCLUDED + +#include "handle.hpp" +#include + +namespace vg { + +using namespace std; + +/** + * A subgraph that is extracted, made single-stranded, and DAG-fied online on an as-needed + * basis from the parent graph. It is restricted to subgraphs that extend from a single position + * in the graph in one direction.
+ */ +class IncrementalSubgraph : public ExpandingOverlayGraph { +public: + + /// Initialize + /// the frontier copy limit controls how many times a cycle will be traversed before giving + /// up on it (until encountering it again) + IncrementalSubgraph(const HandleGraph& graph, + const pos_t& starting_position, + bool extract_left, + int64_t max_distance = numeric_limits<int64_t>::max(), + size_t frontier_copy_limit = numeric_limits<size_t>::max(), + size_t max_num_nodes = numeric_limits<size_t>::max()); + + /// Default constructor -- not actually functional + IncrementalSubgraph() = default; + + /// Default destructor + ~IncrementalSubgraph(); + + ////////////////////////// + /// Specialized interface + ////////////////////////// + + // TODO: prune method that removes outgoing edges into the frontier + // from an extracted node + + /// True if there are additional nodes + bool is_extendable() const; + + /// Extract an additional node + handle_t extend(); + + /// The order of a node in a topological order of the extracted graph + size_t order_of(const handle_t& handle) const; + + /// The node at a given position in the topological order + handle_t handle_at_order(size_t i) const; + + /// The minimum distance from the start position + int64_t min_distance_from_start(const handle_t& handle) const; + + /// The maximum distance from the start position + int64_t max_distance_from_start(const handle_t& handle) const; + + /// Are we doing extraction left or right? + bool extracting_left() const;
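To make the specialized interface above concrete, here is a small illustrative sketch of how a caller might drive it; it is not part of this patch, and the parent graph, the starting position, and the caps used below are hypothetical stand-ins.

    #include <iostream>
    #include "incremental_subgraph.hpp"

    using namespace vg;

    // Illustrative driver; 'parent' and 'start' are hypothetical inputs, and the
    // node cap of 1000 is arbitrary.
    void report_right_of(const HandleGraph& parent, const pos_t& start) {
        // extract rightward (extract_left = false), up to 10 kb away from 'start'
        IncrementalSubgraph subgraph(parent, start, false, 10000);
        while (subgraph.is_extendable() && subgraph.get_node_count() < 1000) {
            // each call materializes the next node, in topological order
            handle_t node = subgraph.extend();
            // subgraph node IDs are 1 + the order in which nodes were extracted
            std::cerr << "node " << subgraph.get_id(node)
                      << " copies parent node " << parent.get_id(subgraph.get_underlying_handle(node))
                      << ", distance from start in [" << subgraph.min_distance_from_start(node)
                      << ", " << subgraph.max_distance_from_start(node) << "]" << std::endl;
        }
    }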
+ + ////////////////////////// + /// HandleGraph interface + ////////////////////////// + + /// Method to check if a node exists by ID + bool has_node(id_t node_id) const; + + /// Look up the handle for the node with the given ID in the given orientation + handle_t get_handle(const id_t& node_id, bool is_reverse = false) const; + + /// Get the ID from a handle + id_t get_id(const handle_t& handle) const; + + /// Get the orientation of a handle + bool get_is_reverse(const handle_t& handle) const; + + /// Invert the orientation of a handle (potentially without getting its ID) + handle_t flip(const handle_t& handle) const; + + /// Get the length of a node + size_t get_length(const handle_t& handle) const; + + /// Get the sequence of a node, presented in the handle's local forward + /// orientation. + string get_sequence(const handle_t& handle) const; + + /// Loop over all the handles to next/previous (right/left) nodes. Passes + /// them to a callback which returns false to stop iterating and true to + /// continue. Returns true if we finished and false if we stopped early. + bool follow_edges_impl(const handle_t& handle, bool go_left, const function<bool(const handle_t&)>& iteratee) const; + + /// Loop over all the nodes in the graph in their local forward + /// orientations, in their internal stored order. Stop if the iteratee + /// returns false. Can be told to run in parallel, in which case stopping + /// after a false return value is on a best-effort basis and iteration + /// order is not defined. + bool for_each_handle_impl(const function<bool(const handle_t&)>& iteratee, bool parallel = false) const; + + /// Return the number of nodes in the graph + size_t get_node_count() const; + + /// Return the smallest ID in the graph, or some smaller number if the + /// smallest ID is unavailable. Return value is unspecified if the graph is empty. + id_t min_node_id() const; + + /// Return the largest ID in the graph, or some larger number if the + /// largest ID is unavailable. Return value is unspecified if the graph is empty. + id_t max_node_id() const; + + /////////////////////////////////// + /// Optional HandleGraph interface + /////////////////////////////////// + + /// Get the number of edges on the right (go_left = false) or left (go_left + /// = true) side of the given handle. The default implementation is O(n) in + /// the number of edges returned, but graph implementations that track this + /// information more efficiently can override this method. + size_t get_degree(const handle_t& handle, bool go_left) const; + + /// Return the total number of edges in the graph. If not overridden, + /// counts them all in linear time. + size_t get_edge_count() const; + + /// Returns one base of a handle's sequence, in the orientation of the + /// handle. + char get_base(const handle_t& handle, size_t index) const; + + /// Returns a substring of a handle's sequence, in the orientation of the + /// handle. If the indicated substring would extend beyond the end of the + /// handle's sequence, the return value is truncated to the sequence's end. + /// By default O(n) in the size of the handle's sequence, but can be overridden. + string get_subsequence(const handle_t& handle, size_t index, size_t size) const; + + /////////////////////////////////// + /// ExpandingOverlayGraph interface + /////////////////////////////////// + + /** + * Returns the handle in the underlying graph that corresponds to a handle in the + * overlay + */ + handle_t get_underlying_handle(const handle_t& handle) const; + +private: + + pair underlying_interval(size_t i) const; + + /// direction we're extracting from the start pos + bool extract_left; + + /// farthest distance we will travel from the start pos + int64_t max_distance; + + /// the maximum number of copies of a node we will allow in the frontier at a time + size_t frontier_copy_limit; + + size_t max_num_nodes; + + /// records of (underlying handle, left edges, right edges, min distance, max distance) + vector<tuple<handle_t, vector<size_t>, vector<size_t>, int64_t, int64_t>> extracted; + + /// comparator for the frontier, ordered first by number of unseen incoming edges, + /// then by distance, with ties broken arbitrarily based on handle values + struct FCmp { + inline bool operator()(const tuple<int64_t, handle_t, unordered_set<handle_t>*, vector<size_t>*>& a, + const tuple<int64_t, handle_t, unordered_set<handle_t>*, vector<size_t>*>& b) const { + return (get<2>(a)->size() < get<2>(b)->size() || + (get<2>(a)->size() == get<2>(b)->size() && a < b)); + } + }; + /// records of (distance, node, unseen edges going into, seen edges going into). + /// serves as an updateable priority queue for nodes that are adjacent to the extracted nodes. + /// the container classes are created on the heap so that we can do a remove-modify-replace + /// update to frontier entries without deep-copying the containers + set<tuple<int64_t, handle_t, unordered_set<handle_t>*, vector<size_t>*>, FCmp> frontier; + + /// wrapper for the comparator in the frontier index, which guarantees that the iterators + /// end up in the same relative order in the frontier index as in the frontier + struct IterFCmp { + inline bool operator()(const decltype(frontier)::iterator& a, + const decltype(frontier)::iterator& b) { + return FCmp()(*a, *b); + } + }; + /// provides random access into the frontier by handle, in the same order + /// that the handles occur in the frontier, indexed by target node and then + /// by predecessor node + unordered_map<handle_t, unordered_map<handle_t, set<decltype(frontier)::iterator, IterFCmp>>> frontier_index; + + /// the number of copies of a given node currently in the frontier + unordered_map<handle_t, size_t> frontier_count; + + /// the underlying graph + const HandleGraph* graph = nullptr; +}; + +} + +#endif // VG_INCREMENTAL_SUBGRAPH_HPP_INCLUDED
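The frontier members declared above act as an updatable priority queue: because std::set elements cannot be modified in place, extend() copies an entry out of the set, erases it, adjusts it, and reinserts it, and the edge containers are allocated on the heap so that the copied entry shares them instead of deep-copying them. A minimal, self-contained sketch of that remove-modify-reinsert pattern, with simplified types that are not the class's actual ones:

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <set>
    #include <tuple>
    #include <vector>

    // Simplified stand-in for the frontier pattern: a std::set ordered like a
    // priority queue whose entries hold pointers to containers, so an entry can
    // be removed, modified, and reinserted without deep-copying those containers.
    using Entry = std::tuple<int64_t, int, std::vector<int>*>; // (distance, node, seen edges)

    int main() {
        std::set<Entry> frontier;
        auto seen = new std::vector<int>{0};
        auto it = frontier.emplace(100, 7, seen).first;

        // std::set elements are const, so to update the ordering key we copy the
        // entry out, erase it, adjust the copy, and reinsert it; the pointer member
        // means the underlying vector is shared rather than copied.
        Entry entry = *it;
        frontier.erase(it);
        std::get<0>(entry) = std::min<int64_t>(std::get<0>(entry), 60); // found a shorter path
        std::get<2>(entry)->push_back(1);                               // record another seen edge
        frontier.insert(entry);

        // the best (lowest-distance) entry is always at frontier.begin()
        std::cout << "best distance: " << std::get<0>(*frontier.begin()) << std::endl;

        delete seen; // the heap-allocated container is freed explicitly, as extend() does
        return 0;
    }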
diff --git a/src/index.cpp b/src/index.cpp deleted file mode 100644 index 4de97b2e689..00000000000 --- a/src/index.cpp +++ /dev/null @@ -1,2192 +0,0 @@ -#include "index.hpp" - -namespace vg { - -using namespace std; - -// convenience macro for RocksDB error handling -#define S(x) { rocksdb::Status __s = (x); if (!__s.ok()) throw std::runtime_error("RocksDB operation failed: " + __s.ToString()); } - -Index::Index(void) { - - start_sep = '\x00'; - end_sep = '\xff'; - write_options = rocksdb::WriteOptions(); - // disable write-ahead logging when writing to RocksDB. This is for write durability - // in the event of power failure etc. which is not really relevant to our use case. - write_options.disableWAL = true; - db = nullptr; - - threads = 1; -#pragma omp parallel - { -#pragma omp master - threads = omp_get_num_threads(); - } - -} - -rocksdb::Options Index::GetOptions(bool read_only) { - // TODO: make the following configurable - const size_t block_cache_bytes = 1<<30; - const size_t memtable_bytes = 4 * size_t(1<<30); - - rocksdb::Options options; - - options.create_if_missing = !read_only; - // TODO: error_if_exists by default, with override for user who really wishes - // to add into an existing index. - // options.error_if_exists = true; - if (read_only) { - // dump RocksDB's debug log into /tmp instead of db dir, to avoid - // touching the latter in read-only mode - options.db_log_dir = "/tmp"; - } - options.max_open_files = -1; - options.allow_mmap_reads = true; - - // set up table format - rocksdb::BlockBasedTableOptions topt; - topt.format_version = 2; - topt.block_size = 4 << 20; - topt.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, true)); - topt.block_cache = rocksdb::NewLRUCache(block_cache_bytes); - options.table_factory.reset(NewBlockBasedTableFactory(topt)); - - // set up concurrency - options.IncreaseParallelism(threads); - - // set up universal compaction - options.OptimizeUniversalStyleCompaction(memtable_bytes); - options.max_write_buffer_number = 4; // overrides OptimizeUniversalStyleCompaction - options.write_buffer_size = (memtable_bytes/4); // overrides OptimizeUniversalStyleCompaction - options.min_write_buffer_number_to_merge = 1; // overrides OptimizeUniversalStyleCompaction - - // 3 Snappy-compressed levels [0-2]. Using background compactions, try to - // keep fewer than 5 files in L0 (each file being a sorted "sub-level"). - // L1 & L2 are each fully sorted, but split into disjoint 16GB chunks. - // See https://github.com/facebook/rocksdb/wiki/Universal-Compaction for - // explanation of multi-level universal compaction. - // TODO: switch to zstd compression - options.num_levels = 3; - options.compression_per_level.clear(); - options.compression = rocksdb::kSnappyCompression; - options.level0_file_num_compaction_trigger = 5; - options.target_file_size_base = 16 * size_t(1<<30); - options.access_hint_on_compaction_start = rocksdb::Options::AccessHint::SEQUENTIAL; - - if (bulk_load) { - // disable write throttling because we'll have other logic to let - // background compactions converge.
- options.level0_slowdown_writes_trigger = (1<<30); - options.level0_stop_writes_trigger = (1<<30); - options.compaction_options_universal.compression_size_percent = -1; - // size amplification is not a factor for our write-once use case - options.compaction_options_universal.max_size_amplification_percent = (1<<30); - options.compaction_options_universal.size_ratio = 10; - options.compaction_options_universal.min_merge_width = 2; - options.compaction_options_universal.max_merge_width = 10; - } else { - options.allow_concurrent_memtable_write = true; - options.enable_write_thread_adaptive_yield = true; - } - - return options; -} - -void Index::open(const std::string& dir, bool read_only) { - - name = dir; - db_options = GetOptions(read_only); - - rocksdb::Status s; - if (read_only) { - //s = rocksdb::DB::Open(db_options, name, &db); - s = rocksdb::DB::OpenForReadOnly(db_options, name, &db); - } else { - s = rocksdb::DB::Open(db_options, name, &db); - } - if (!s.ok()) { - if (db) { - delete db; - } - db = nullptr; - throw indexOpenException("can't open " + dir); - } - - // we store a metadata key DIRTY while the index is open for writing, - // so that we can later detect if the indexing crashed/failed and - // refuse to use it. - string dirty_key = key_for_metadata("DIRTY"), data; - if (db->Get(rocksdb::ReadOptions(), dirty_key, &data).ok()) { - throw indexOpenException("index was not built cleanly, and should be recreated from scratch"); - } - - if (!read_only) { - rocksdb::WriteOptions dirty_write_options; - dirty_write_options.sync = true; - dirty_write_options.disableWAL = false; - if (!db->Put(dirty_write_options, dirty_key, "").ok() || !db->Flush(rocksdb::FlushOptions()).ok()) { - throw indexOpenException("couldn't write to index"); - } - } - - next_nonce = 42; // arbitrary initial value - s = get_metadata("next_nonce", data); - if (s.ok()) { - auto p = strtoull(data.c_str(), nullptr, 10); - if (p < next_nonce || p == ULLONG_MAX) { - throw indexOpenException("corrupt next_nonce entry"); - } - next_nonce = p; - } else if (!s.IsNotFound()) { - throw indexOpenException("couldn't read metadata"); - } -} - -void Index::open_read_only(string& dir) { - bulk_load = false; - open(dir, true); -} - -void Index::open_for_write(string& dir) { - bulk_load = false; - open(dir, false); -} - -void Index::open_for_bulk_load(string& dir) { - bulk_load = true; - open(dir, false); -} - -Index::~Index(void) { - if (db) { - close(); - } -} - -void Index::close(void) { - flush(); - string dirty_key = key_for_metadata("DIRTY"), data; - if (db->Get(rocksdb::ReadOptions(), dirty_key, &data).ok()) { - // persist next_nonce and delete the dirty marker - rocksdb::WriteBatch batch; - batch.Put(key_for_metadata("next_nonce"), to_string(next_nonce)); - batch.Delete(dirty_key); - rocksdb::WriteOptions dirty_write_options; - dirty_write_options.sync = true; - dirty_write_options.disableWAL = false; - if (!db->Write(dirty_write_options, &batch).ok() || !db->Flush(rocksdb::FlushOptions()).ok()) { - throw std::runtime_error("couldn't mark index closed"); - } - } - delete db; - db = nullptr; -} - -void Index::flush(void) { - db->Flush(rocksdb::FlushOptions()); - - if (bulk_load) { - // Wait for compactions to converge. Specifically, wait until - // there's no more than one background compaction running. - // Argument: once that's the case, the number of L0 files isn't - // too much greater than level0_file_num_compaction_trigger, - // or else a second background compaction would start. 
- uint64_t num_running_compactions = 0; - while(true) { - num_running_compactions = 0; - db->GetIntProperty(rocksdb::DB::Properties::kNumRunningCompactions, - &num_running_compactions); - if (num_running_compactions<=1) { - break; - } - usleep(10000); - } - } -} - -void Index::compact(void) { - db->CompactRange(rocksdb::CompactRangeOptions(), NULL, NULL); -} - -// todo: replace with union / struct -const string Index::key_for_node(int64_t id) { - string key; - id = htobe64(id); - key.resize(5*sizeof(char) + sizeof(int64_t)); - char* k = (char*) key.c_str(); - k[0] = start_sep; - k[1] = 'g'; // graph elements - k[2] = start_sep; - memcpy(k + sizeof(char)*3, &id, sizeof(int64_t)); - k[3 + sizeof(int64_t)] = start_sep; - k[3 + sizeof(int64_t) + 1] = 'n'; - return key; -} - -const string Index::key_for_edge_on_start(int64_t node, int64_t other, bool backward) { - // reverse endianness for sorting - node = htobe64(node); - other = htobe64(other); - string key; - key.resize(8*sizeof(char) + 2*sizeof(int64_t)); - char* k = (char*) key.c_str(); - k[0] = start_sep; - k[1] = 'g'; // graph elements - k[2] = start_sep; - memcpy(k + sizeof(char)*3, &node, sizeof(int64_t)); - k[3 + sizeof(int64_t)] = start_sep; - k[4 + sizeof(int64_t)] = 's'; // edge on start - k[5 + sizeof(int64_t)] = start_sep; - memcpy(k + sizeof(char)*6 + sizeof(int64_t), &other, sizeof(int64_t)); - k[6 + 2*sizeof(int64_t)] = start_sep; - k[7 + 2*sizeof(int64_t)] = backward ? '1' : '0'; - return key; -} - -const string Index::key_for_edge_on_end(int64_t node, int64_t other, bool backward) { - // reverse endianness for sorting - node = htobe64(node); - other = htobe64(other); - string key; - key.resize(8*sizeof(char) + 2*sizeof(int64_t)); - char* k = (char*) key.c_str(); - k[0] = start_sep; - k[1] = 'g'; // graph elements - k[2] = start_sep; - memcpy(k + sizeof(char)*3, &node, sizeof(int64_t)); - k[3 + sizeof(int64_t)] = start_sep; - k[4 + sizeof(int64_t)] = 'e'; // edge on end - k[5 + sizeof(int64_t)] = start_sep; - memcpy(k + sizeof(char)*6 + sizeof(int64_t), &other, sizeof(int64_t)); - k[6 + 2*sizeof(int64_t)] = start_sep; - k[7 + 2*sizeof(int64_t)] = backward ? '1' : '0'; - return key; -} - -const string Index::key_for_kmer(const string& kmer, int64_t id) { - id = htobe64(id); - string key; - key.resize(4*sizeof(char) + kmer.size() + sizeof(int64_t)); - char* k = (char*) key.c_str(); - k[0] = start_sep; - k[1] = 'k'; // kmers - k[2] = start_sep; - memcpy(k + sizeof(char)*3, kmer.c_str(), kmer.size()); - k[sizeof(char)*3 + kmer.size()] = start_sep; - memcpy(k + sizeof(char)*4 + kmer.size(), &id, sizeof(int64_t)); - return key; -} - -const string Index::key_for_node_path_position(int64_t node_id, int64_t path_id, int64_t path_pos, bool backward) { - node_id = htobe64(node_id); - path_id = htobe64(path_id); - path_pos = htobe64(path_pos); - string key; - key.resize(9*sizeof(char) + 3*sizeof(int64_t)); - char* k = (char*) key.c_str(); - k[0] = start_sep; - k[1] = 'g'; // graph elements - k[2] = start_sep; - memcpy(k + sizeof(char)*3, &node_id, sizeof(int64_t)); - k[3 + sizeof(int64_t)] = start_sep; - k[4 + sizeof(int64_t)] = 'p'; - k[5 + sizeof(int64_t)] = start_sep; - memcpy(k + sizeof(char)*6 + sizeof(int64_t), &path_id, sizeof(int64_t)); - k[6 + 2*sizeof(int64_t)] = start_sep; - memcpy(k + sizeof(char)*7 + 2*sizeof(int64_t), &path_pos, sizeof(int64_t)); - k[7 + 3*sizeof(int64_t)] = start_sep; - k[8 + 3*sizeof(int64_t)] = backward ? 
'1' : '0'; - return key; -} - -const string Index::key_prefix_for_node_path(int64_t node_id, int64_t path_id) { - node_id = htobe64(node_id); - path_id = htobe64(path_id); - string key; - key.resize(6*sizeof(char) + 2*sizeof(int64_t)); - char* k = (char*) key.c_str(); - k[0] = start_sep; - k[1] = 'g'; // graph elements - k[2] = start_sep; - memcpy(k + sizeof(char)*3, &node_id, sizeof(int64_t)); - k[3 + sizeof(int64_t)] = start_sep; - k[4 + sizeof(int64_t)] = 'p'; - k[5 + sizeof(int64_t)] = start_sep; - memcpy(k + sizeof(char)*6 + sizeof(int64_t), &path_id, sizeof(int64_t)); - return key; -} - -const string Index::key_for_path_position(int64_t path_id, int64_t path_pos, bool backward, int64_t node_id) { - node_id = htobe64(node_id); - path_id = htobe64(path_id); - path_pos = htobe64(path_pos); - string key; - key.resize(7*sizeof(char) + 3*sizeof(int64_t)); - char* k = (char*) key.c_str(); - k[0] = start_sep; - k[1] = 'p'; // graph elements - k[2] = start_sep; - memcpy(k + sizeof(char)*3, &path_id, sizeof(int64_t)); - k[3 + sizeof(int64_t)] = start_sep; - memcpy(k + sizeof(char)*4 + sizeof(int64_t), &path_pos, sizeof(int64_t)); - k[4 + 2*sizeof(int64_t)] = start_sep; - k[5 + 2*sizeof(int64_t)] = backward ? '1' : '0'; - k[6 + 2*sizeof(int64_t)] = start_sep; - memcpy(k + sizeof(char)*7 + 2*sizeof(int64_t), &node_id, sizeof(int64_t)); - return key; -} - -const string Index::key_prefix_for_kmer(const string& kmer) { - string key; - key.resize(3*sizeof(char) + kmer.size()); - char* k = (char*) key.c_str(); - k[0] = start_sep; - k[1] = 'k'; // kmers - k[2] = start_sep; - memcpy(k + sizeof(char)*3, kmer.c_str(), kmer.size()); - return key; -} - -const string Index::key_for_metadata(const string& tag) { - string key; - key.resize(3*sizeof(char) + tag.size()); - char* k = (char*) key.c_str(); - k[0] = start_sep; - k[1] = 'm'; // metadata - k[2] = start_sep; - memcpy(k + sizeof(char)*3, tag.c_str(), tag.size()); - return key; -} - -const string Index::key_for_mapping_prefix(int64_t node_id) { - node_id = htobe64(node_id); - string key; - key.resize(3*sizeof(char) + sizeof(int64_t)); - char* k = (char*) key.c_str(); - k[0] = start_sep; - k[1] = 's'; // mappings (~sides) - k[2] = start_sep; - memcpy(k + sizeof(char)*3, &node_id, sizeof(int64_t)); - return key; -} - -const string Index::key_for_mapping(const Mapping& mapping) { - string key(key_for_mapping_prefix(mapping.position().node_id())); - // Append a unique nonce to the node-ID-based key prefix, since there can - // be many mappings to one node. It just needs to be unique, so we use - // a variable-length, little-endian encoding to save a few bytes. 
- key.reserve(key.size() + 9); - key += start_sep; - uint64_t nonce = next_nonce.fetch_add(1); - while (nonce) { - key += (char) (nonce & 0xFF); - nonce >>= 8; - } - return key; -} - -const string Index::key_for_alignment_prefix(int64_t node_id) { - node_id = htobe64(node_id); - string key; - key.resize(3*sizeof(char) + sizeof(int64_t)); - char* k = (char*) key.c_str(); - k[0] = start_sep; - k[1] = 'a'; // alignments - k[2] = start_sep; - memcpy(k + sizeof(char)*3, &node_id, sizeof(int64_t)); - return key; -} - -const string Index::key_for_alignment(const Alignment& alignment) { - string key; - if (alignment.has_path()) { - // key the alignment by the lowest node id it maps to - int64_t min_id = 0; - for (auto& mapping : alignment.path().mapping()) { - if (mapping.position().node_id() < min_id || min_id == 0) { - min_id = mapping.position().node_id(); - } - } - key = key_for_alignment_prefix(min_id); - } else { - // make our key using the alignment, to try to bin similar alignments together - key = key_for_alignment_prefix(0) + alignment.sequence().substr(0,32); - } - // Append a unique nonce to the node-ID-based key prefix, since there can - // be many alignments to one node. It just needs to be unique, so we use - // a variable-length encoding to save a few bytes. - key.reserve(key.size() + 9); - key += start_sep; - uint64_t nonce = next_nonce.fetch_add(1); - while (nonce) { - key += (char) (nonce & 0xFF); - nonce >>= 8; - } - return key; -} - -const string Index::key_for_base(int64_t aln_id) { - string key; - key.resize(3*sizeof(char) + sizeof(int64_t)); - char* k = (char*) key.c_str(); - k[0] = start_sep; - k[1] = 'b'; // base-alignments - k[2] = start_sep; - aln_id = htobe64(aln_id); - memcpy(k + sizeof(char)*3, &aln_id, sizeof(int64_t)); - return key; -} - -const string Index::key_prefix_for_traversal(int64_t node_id) { - string key; - key.resize(3*sizeof(char) + sizeof(int64_t)); - char* k = (char*) key.c_str(); - k[0] = start_sep; - k[1] = 't'; // traversals - k[2] = start_sep; - node_id = htobe64(node_id); - memcpy(k + sizeof(char)*3, &node_id, sizeof(int64_t)); - return key; -} - -const string Index::key_for_traversal(int64_t aln_id, const Mapping& mapping) { - int64_t node_id = mapping.position().node_id(); - node_id = htobe64(node_id); - string key; - key.resize(3*sizeof(char) + 2*sizeof(int64_t) + sizeof(int16_t)); - char* k = (char*) key.c_str(); - k[0] = start_sep; - k[1] = 't'; // traversals - k[2] = start_sep; - memcpy(k + sizeof(char)*3, &node_id, sizeof(int64_t)); - aln_id = htobe64(aln_id); - memcpy(k + sizeof(char)*3+sizeof(int64_t), &aln_id, sizeof(int64_t)); - int16_t rank = mapping.rank() * (mapping.position().is_reverse() ? 
-1 : 1); - memcpy(k + sizeof(char)*3+sizeof(int64_t)*2, &rank, sizeof(int16_t)); - return key; -} - -const string Index::key_prefix_for_edges_on_node_start(int64_t node) { - string key = key_for_edge_on_start(node, 0, false); - return key.substr(0, key.size()-sizeof(int64_t)-2*sizeof(char)); -} - -const string Index::key_prefix_for_edges_on_node_end(int64_t node) { - string key = key_for_edge_on_end(node, 0, false); - return key.substr(0, key.size()-sizeof(int64_t)-2*sizeof(char)); -} - -char Index::graph_key_type(const string& key) { - return key.c_str()[4*sizeof(char) + sizeof(int64_t)]; -} - -string Index::entry_to_string(const string& key, const string& value) { - char type = key[1]; - switch (type) { - case 'g': - return graph_entry_to_string(key, value); - break; - case 'k': - return kmer_entry_to_string(key, value); - break; - case 'p': - return path_position_to_string(key, value); - break; - case 'm': - return metadata_entry_to_string(key, value); - break; - case 's': - return mapping_entry_to_string(key, value); - break; - case 'a': - return alignment_entry_to_string(key, value); - break; - case 'b': - return base_entry_to_string(key, value); - break; - case 't': - return traversal_entry_to_string(key, value); - break; - default: - throw runtime_error("Unrecognized type " + key.substr(0, 1)); - break; - } -} - -void Index::parse_node(const string& key, const string& value, int64_t& id, Node& node) { - const char* k = key.c_str(); - memcpy(&id, (k + 3*sizeof(char)), sizeof(int64_t)); - id = be64toh(id); - node.ParseFromString(value); -} - -void Index::parse_edge(const string& key, char& type, int64_t& node_id, int64_t& other_id, bool& backward) { - // Parse the edge just out of the key - const char* k = key.c_str(); - - // Work out what type the key is ('s' or 'e' depending on if it's on the first node's start or end). - type = graph_key_type(key); - - // Get the node IDs involved. - memcpy(&node_id, (k + 3*sizeof(char)), sizeof(int64_t)); - memcpy(&other_id, (k + 6*sizeof(char)) + sizeof(int64_t), sizeof(int64_t)); - node_id = be64toh(node_id); - other_id = be64toh(other_id); - - // Is the relative orientation forward ('0') or backward ('1')? - char backward_char; - memcpy(&backward_char, (k + 7*sizeof(char)) + 2*sizeof(int64_t), sizeof(char)); - backward = backward_char == '1'; -} - -void Index::parse_edge(const string& key, const string& value, char& type, int64_t& id1, int64_t& id2, Edge& edge) { - // We can take either of the two edge keys: - // +g+node_id+s+other_id+backward - // +g+node_id+e+other_id+backward - - - if(value.size() > 0) { - // We can just deserialize the edge. - edge.ParseFromString(value); - - // But we still need to fill in our output parameters - type = graph_key_type(key); - id1 = edge.from(); - id2 = edge.to(); - - } else { - // We have to synthesize an edge. - - // Get what we can from the key. Arbitrarily say this node is the from. - bool backward; - parse_edge(key, type, id1, id2, backward); - - // Work out if the edge should be from the start - bool from_start = type == 's'; - // And if it should be to the end. We attach to the end of the other - // node when we attached to the start of this node and we want to be - // forward, or when we attached to the end of this node and we want to - // be backward. That works out to: XOR(on start, should be backward). - bool to_end = from_start != backward; - - if(from_start && to_end) { - // If we got that it should be both, we can replace it with the - // normal end to start edge going the other way. 
- std::swap(id1, id2); - from_start = to_end = false; - } - - // Build the edge - edge.set_from(id1); - edge.set_to(id2); - edge.set_from_start(from_start); - edge.set_to_end(to_end); - - // TODO: get the edge data somehow in these cases instead of making up edges. - } -} - -string Index::graph_entry_to_string(const string& key, const string& value) { - // do we have a node or edge? - stringstream s; - switch (graph_key_type(key)) { - case 'n': { - // it's a node - int64_t id; - Node node; - parse_node(key, value, id, node); - s << "{\"key\":\"+g+" << id << "+n\", \"value\":"<< pb2json(node) << "}"; - } break; - case 's': { - Edge edge; - int64_t id1, id2; - char type; - bool backward; - if(value.size() > 0) { - edge.ParseFromString(value); - } - parse_edge(key, type, id1, id2, backward); - s << "{\"key\":\"+g+" << id1 << "+s+" << id2 << "+" << (backward ? '1' : '0') - << "\", \"value\":"<< (value.size() > 0 ? pb2json(edge) : "") << "}"; - } break; - case 'e': { - Edge edge; - int64_t id1, id2; - char type; - bool backward; - if(value.size() > 0) { - edge.ParseFromString(value); - } - parse_edge(key, type, id1, id2, backward); - s << "{\"key\":\"+g+" << id1 << "+e+" << id2 << "+" << (backward ? '1' : '0') - << "\", \"value\":"<< (value.size() > 0 ? pb2json(edge) : "") << "}"; - } break; - case 'p': { - s << node_path_to_string(key, value); - } break; - } - return s.str(); -} - -void Index::parse_kmer(const string& key, const string& value, string& kmer, int64_t& id, int32_t& pos) { - const char* k = key.c_str(); - kmer = string(k+3*sizeof(char)); - memcpy(&id, k+4*sizeof(char)+kmer.size(), sizeof(int64_t)); - id = be64toh(id); - memcpy(&pos, (char*)value.c_str(), sizeof(int32_t)); -} - -string Index::kmer_entry_to_string(const string& key, const string& value) { - stringstream s; - int64_t id; - int32_t pos; - string kmer; - parse_kmer(key, value, kmer, id, pos); - s << "{\"key\":\"+k+" << kmer << "+" << id << "\", \"value\":"<< pos << "}"; - return s.str(); -} - -void Index::parse_node_path(const string& key, const string& value, - int64_t& node_id, int64_t& path_id, int64_t& path_pos, bool& backward, Mapping& mapping) { - const char* k = key.c_str(); - memcpy(&node_id, (k + 3*sizeof(char)), sizeof(int64_t)); - memcpy(&path_id, (k + 6*sizeof(char)+sizeof(int64_t)), sizeof(int64_t)); - memcpy(&path_pos, (k + 7*sizeof(char)+2*sizeof(int64_t)), sizeof(int64_t)); - backward = (k[8 + 3*sizeof(int64_t)] == '1'); - node_id = be64toh(node_id); - path_id = be64toh(path_id); - path_pos = be64toh(path_pos); - mapping.ParseFromString(value); -} - -void Index::parse_path_position(const string& key, const string& value, - int64_t& path_id, int64_t& path_pos, bool& backward, int64_t& node_id, Mapping& mapping) { - const char* k = key.c_str(); - memcpy(&path_id, (k + 3*sizeof(char)), sizeof(int64_t)); - memcpy(&path_pos, (k + 4*sizeof(char)+sizeof(int64_t)), sizeof(int64_t)); - backward = (k[5 + 2*sizeof(int64_t)] == '1'); - memcpy(&node_id, (k + 7*sizeof(char)+2*sizeof(int64_t)), sizeof(int64_t)); - node_id = be64toh(node_id); - path_id = be64toh(path_id); - path_pos = be64toh(path_pos); - mapping.ParseFromString(value); -} - -void Index::parse_mapping(const string& key, const string& value, int64_t& node_id, uint64_t& nonce, Mapping& mapping) { - const char* k = key.c_str(); - memcpy(&node_id, (k + 3*sizeof(char)), sizeof(int64_t)); - node_id = be64toh(node_id); - nonce = 0; - for (int i = 4+sizeof(char)+sizeof(int64_t); i < key.size(); i++) { - nonce = (nonce<<8)+uint8_t(k[i]); - } - 
mapping.ParseFromString(value); -} - -void Index::parse_alignment(const string& key, const string& value, int64_t& node_id, uint64_t& nonce, Alignment& alignment) { - const char* k = key.c_str(); - memcpy(&node_id, (k + 3*sizeof(char)), sizeof(int64_t)); - node_id = be64toh(node_id); - nonce = 0; - for (int i = 4+sizeof(char)+sizeof(int64_t); i < key.size(); i++) { - nonce = (nonce<<8)+uint8_t(k[i]); - } - alignment.ParseFromString(value); -} - -void Index::parse_base(const string& key, const string& value, int64_t& aln_id, Alignment& alignment) { - const char* k = key.c_str(); - memcpy(&aln_id, (k + 3*sizeof(char)), sizeof(int64_t)); - aln_id = be64toh(aln_id); - alignment.ParseFromString(value); -} - -void Index::parse_traversal(const string& key, const string& value, int64_t& node_id, int16_t& rank, bool& backward, int64_t& aln_id) { - const char* k = key.c_str(); - memcpy(&node_id, (k + 3*sizeof(char)), sizeof(int64_t)); - node_id = be64toh(node_id); - memcpy(&aln_id, (k + 3*sizeof(char)+sizeof(int64_t)), sizeof(int64_t)); - aln_id = be64toh(aln_id); - memcpy(&rank, (k + 3*sizeof(char) + 2*sizeof(int64_t)), sizeof(int16_t)); - if (rank < 0) { backward = true; } else { backward = false; } - rank = abs(rank); -} - -string Index::node_path_to_string(const string& key, const string& value) { - Mapping mapping; - int64_t node_id, path_id, path_pos; - bool backward; - parse_node_path(key, value, node_id, path_id, path_pos, backward, mapping); - stringstream s; - s << "{\"key\":\"+g+" << node_id << "+p+" << path_id << "+" << path_pos << "+" << (backward ? '1' : '0') - << "\", \"value\":"<< pb2json(mapping) << "}"; - return s.str(); -} - -string Index::path_position_to_string(const string& key, const string& value) { - Mapping mapping; - int64_t node_id, path_id, path_pos; - bool backward; - parse_path_position(key, value, path_id, path_pos, backward, node_id, mapping); - stringstream s; - s << "{\"key\":\"+p+" << path_id << "+" << path_pos << "+" << (backward ? 
'1' : '0') << "+" << node_id - << "\", \"value\":"<< pb2json(mapping) << "}"; - return s.str(); -} - -string Index::metadata_entry_to_string(const string& key, const string& value) { - stringstream s; - string prefix = key.substr(3); - string val = value; - if (prefix == "max_path_id" - || prefix.substr(0,9) == "path_name") { - stringstream v; - int64_t id; - memcpy(&id, (char*)value.c_str(), sizeof(int64_t)); - v << id; - val = v.str(); - } else if (prefix.substr(0,7) == "path_id") { - stringstream v; - int64_t id; - memcpy(&id, ((char*)prefix.c_str())+7, sizeof(int64_t)); - v << id; - prefix = prefix.substr(0,7) + "+" + v.str(); - } - s << "{\"key\":\"" << "+" << key[1] << "+" << prefix << "\", \"value\":\""<< val << "\"}"; - return s.str(); -} - -string Index::mapping_entry_to_string(const string& key, const string& value) { - Mapping mapping; - int64_t node_id; - uint64_t nonce; - parse_mapping(key, value, node_id, nonce, mapping); - stringstream s; - s << "{\"key\":\"+s+" << node_id << "+" << nonce << "\", \"value\":"<< pb2json(mapping) << "}"; - return s.str(); -} - -string Index::alignment_entry_to_string(const string& key, const string& value) { - Alignment alignment; - int64_t node_id; - uint64_t nonce; - parse_alignment(key, value, node_id, nonce, alignment); - stringstream s; - s << "{\"key\":\"+a+" << node_id << "+" << nonce << "\", \"value\":"<< pb2json(alignment) << "}"; - return s.str(); -} - -string Index::base_entry_to_string(const string& key, const string& value) { - Alignment alignment; - int64_t aln_id; - parse_base(key, value, aln_id, alignment); - stringstream s; - s << "{\"key\":\"+b+" << aln_id << "\", \"value\":"<< pb2json(alignment) << "}"; - return s.str(); -} - -string Index::traversal_entry_to_string(const string& key, const string& value) { - int64_t node_id; - bool backward; - int64_t aln_id; - int16_t rank; - parse_traversal(key, value, node_id, rank, backward, aln_id); - stringstream s; - s << "{\"key\":\"+t+" << node_id << (backward?"r":"f") << rank << "+" << aln_id << "\", \"value\":\"\"}"; - return s.str(); -} - -void Index::dump(ostream& out) { - rocksdb::Iterator* it = db->NewIterator(rocksdb::ReadOptions()); - for (it->SeekToFirst(); it->Valid(); it->Next()) { - out << entry_to_string(it->key().ToString(), it->value().ToString()) << endl; - } - assert(it->status().ok()); // Check for any errors found during the scan - delete it; -} - -void Index::put_node(const Node* node) { - string data; - node->SerializeToString(&data); - string key = key_for_node(node->id()); - S(db->Put(write_options, key, data)); -} - -void Index::batch_node(const Node* node, rocksdb::WriteBatch& batch) { - string data; - node->SerializeToString(&data); - string key = key_for_node(node->id()); - batch.Put(key, data); -} - -void Index::put_edge(const Edge* edge) { - // At least one edge key will hold the serialized edge data - string data; - edge->SerializeToString(&data); - - // One will probably hold an empty string, unless this is a self loop somehow. - string null_data; - - // only store serialized edge in the key linking the edge to the smaller - // node. If the two node IDs are equal, store in both keys (which might just actually be one key). - string& from_data = (edge->from() <= edge->to()) ? data : null_data; - string& to_data = (edge->to() <= edge->from()) ? data : null_data; - - // Is the edge reversing relative node orientation? 
- bool backward = (edge->from_start() != edge->to_end()); - - if(edge->from_start()) { - // On the from node, we're on the start - S(db->Put(write_options, key_for_edge_on_start(edge->from(), edge->to(), backward), from_data)); - } else { - // On the from node, we're on the end - S(db->Put(write_options, key_for_edge_on_end(edge->from(), edge->to(), backward), from_data)); - } - - if(edge->to_end()) { - // On the to node, we're on the end - S(db->Put(write_options, key_for_edge_on_end(edge->to(), edge->from(), backward), to_data)); - } else { - // On the to node, we're on the start - S(db->Put(write_options, key_for_edge_on_start(edge->to(), edge->from(), backward), to_data)); - } -} - -void Index::batch_edge(const Edge* edge, rocksdb::WriteBatch& batch) { - // At least one edge key will hold the serialized edge data - string data; - edge->SerializeToString(&data); - - // One will probably hold an empty string, unless this is a self loop somehow. - string null_data; - - // only store serialized edge in the key linking the edge to the smaller - // node. If the two node IDs are equal, store in both keys (which might just actually be one key). - string& from_data = (edge->from() <= edge->to()) ? data : null_data; - string& to_data = (edge->to() <= edge->from()) ? data : null_data; - - // Is the edge reversing relative node orientation? - bool backward = (edge->from_start() != edge->to_end()); - - if(edge->from_start()) { - // On the from node, we're on the start - batch.Put(key_for_edge_on_start(edge->from(), edge->to(), backward), from_data); - } else { - // On the from node, we're on the end - batch.Put(key_for_edge_on_end(edge->from(), edge->to(), backward), from_data); - } - - if(edge->to_end()) { - // On the to node, we're on the end - batch.Put(key_for_edge_on_end(edge->to(), edge->from(), backward), to_data); - } else { - // On the to node, we're on the start - batch.Put(key_for_edge_on_start(edge->to(), edge->from(), backward), to_data); - } -} - -void Index::put_metadata(const string& tag, const string& data) { - string key = key_for_metadata(tag); - S(db->Put(write_options, key, data)); -} - -void Index::put_node_path(int64_t node_id, int64_t path_id, int64_t path_pos, bool backward, const Mapping& mapping) { - string data; - mapping.SerializeToString(&data); - S(db->Put(write_options, key_for_node_path_position(node_id, path_id, path_pos, backward), data)); -} - -void Index::put_path_position(int64_t path_id, int64_t path_pos, bool backward, int64_t node_id, const Mapping& mapping) { - string data; - mapping.SerializeToString(&data); - S(db->Put(write_options, key_for_path_position(path_id, path_pos, backward, node_id), data)); -} - -void Index::put_mapping(const Mapping& mapping) { - string data; - mapping.SerializeToString(&data); - S(db->Put(write_options, key_for_mapping(mapping), data)); -} - -void Index::put_alignment(const Alignment& alignment) { - static std::atomic warned_unmapped(false); - string data; - alignment.SerializeToString(&data); - S(db->Put(write_options, key_for_alignment(alignment), data)); -} - -void Index::put_base(int64_t aln_id, const Alignment& alignment) { - string data; - alignment.SerializeToString(&data); - S(db->Put(write_options, key_for_base(aln_id), data)); -} - -void Index::put_traversal(int64_t aln_id, const Mapping& mapping) { - string data; // empty data - S(db->Put(write_options, key_for_traversal(aln_id, mapping), data)); -} - -void Index::cross_alignment(int64_t aln_id, const Alignment& alignment) { - put_base(aln_id, alignment); - if 
(alignment.has_path()) { - auto& path = alignment.path(); - for (int i = 0; i < path.mapping_size(); ++i) { - put_traversal(aln_id, path.mapping(i)); - } - } -} - -void Index::load_graph(VG& graph) { - // a bit of a hack--- the logging only works with for_each_*parallel - // also the high parallelism may be causing issues - int thread_count = 1; -#pragma omp parallel - { -#pragma omp master - thread_count = omp_get_num_threads(); - } - omp_set_num_threads(1); - graph.preload_progress("indexing nodes of " + graph.name); - rocksdb::WriteBatch batch; - graph.for_each_node_parallel([this, &batch](Node* n) { batch_node(n, batch); }); - graph.preload_progress("indexing edges of " + graph.name); - graph.for_each_edge_parallel([this, &batch](Edge* e) { batch_edge(e, batch); }); - rocksdb::Status s = db->Write(write_options, &batch); - omp_set_num_threads(thread_count); -} - -void Index::load_paths(VG& graph) { - graph.create_progress("indexing paths of " + graph.name, graph.paths._paths.size()); - store_paths(graph); - graph.destroy_progress(); -} - -int64_t Index::get_max_path_id(void) { - string data; - int64_t id; - rocksdb::Status s = get_metadata("max_path_id", data); - if (s.IsNotFound()) { - id = 0; - put_max_path_id(id); - } else { - S(s); - memcpy(&id, data.c_str(), sizeof(int64_t)); - } - return id; -} - -void Index::put_max_path_id(int64_t id) { - // TODO: ensure consistent endianness - string data; - data.resize(sizeof(int64_t)); - memcpy((char*)data.c_str(), &id, sizeof(int64_t)); - put_metadata("max_path_id", data); -} - -int64_t Index::new_path_id(const string& path_name) { - int64_t max_id = get_max_path_id(); - int64_t new_id = max_id + 1; - put_max_path_id(new_id); - put_path_id_to_name(new_id, path_name); - put_path_name_to_id(new_id, path_name); - return new_id; -} - -string Index::path_name_prefix(const string& name) { - return "path_name" + start_sep + name; -} - -string Index::path_id_prefix(int64_t id) { - // TODO: ensure consistent endianness - string prefix = "path_id" + start_sep; - size_t prefix_size = prefix.size(); - prefix.resize(prefix.size() + sizeof(int64_t)); - memcpy((char*)prefix.c_str() + prefix_size, &id, sizeof(int64_t)); - return prefix; -} - -void Index::put_path_id_to_name(int64_t id, const string& name) { - put_metadata(path_id_prefix(id), name); -} - -void Index::put_path_name_to_id(int64_t id, const string& name) { - // TODO: ensure consistent endianness - string data; - data.resize(sizeof(int64_t)); - memcpy((char*)data.c_str(), &id, sizeof(int64_t)); - put_metadata(path_name_prefix(name), data); -} - -string Index::get_path_name(int64_t id) { - string data; - rocksdb::Status s = get_metadata(path_id_prefix(id), data); - if (s.ok()) { - return data; - } else if (!s.IsNotFound()) { - S(s); - } - return string(); -} - -int64_t Index::get_path_id(const string& name) { - string data; - int64_t id = 0; - rocksdb::Status s = get_metadata(path_name_prefix(name), data); - if (s.ok()) { - memcpy(&id, (char*)data.c_str(), sizeof(int64_t)); - } else if (!s.IsNotFound()) { - S(s); - } - return id; -} - -void Index::store_paths(VG& graph) { - function lambda = [this, &graph](const Path& path) { - store_path(graph, path); - }; - graph.paths.for_each(lambda); -} - -void Index::store_path(VG& graph, const Path& path) { - // get a new path id - // if there is no name, cry - if (path.name().empty()) { - cerr << "[vg::Index] error, path has no name" << endl; - exit(1); - } - // check if the path name/id mapping already exists - int64_t path_id; - path_id = 
get_path_id(path.name()); - // if it doesn't, create it - if (!path_id) { - path_id = new_path_id(path.name()); - } - // keep track of position - int64_t path_pos = 0; - // for each node in the path - for (int64_t i = 0; i < path.mapping_size(); ++i) { - - const Mapping& mapping = path.mapping(i); - // put an entry in the path table - put_path_position(path_id, path_pos, mapping.position().is_reverse(), mapping.position().node_id(), mapping); - // put an entry in the graph table - put_node_path(mapping.position().node_id(), path_id, path_pos, mapping.position().is_reverse(), mapping); - - // get the node, to find the size of this step - Node node; - get_node(mapping.position().node_id(), node); - // TODO use the cigar... if there is one - path_pos += node.sequence().size(); - - graph.increment_progress(); - } -} - -rocksdb::Status Index::get_metadata(const string& key, string& data) { - rocksdb::Status s = db->Get(rocksdb::ReadOptions(), key_for_metadata(key), &data); - return s; -} - -rocksdb::Status Index::get_node(int64_t id, Node& node) { - string value; - rocksdb::Status s = db->Get(rocksdb::ReadOptions(), key_for_node(id), &value); - if (s.ok()) { - node.ParseFromString(value); - } - return s; -} - -rocksdb::Status Index::get_edge(int64_t from, bool from_start, int64_t to, bool to_end, Edge& edge) { - // Are we looking for a reversing edge? - bool backward = from_start != to_end; - - // What key do we need to look up to get the edge data? - string key; - - // TODO: restructure keys so we don't need to do so much figuring to work out what to look up. - if(from < to) { - // We will find the edge data on the record for its attachment to the from node. - if(from_start) { - key = key_for_edge_on_start(from, to, backward); - } else { - key = key_for_edge_on_end(from, to, backward); - } - } else { - // We will find the edge data on the record for its attachment to the to node. 
- if(to_end) { - key = key_for_edge_on_end(to, from, backward); - } else { - key = key_for_edge_on_start(to, from, backward); - } - } - - string value; - rocksdb::Status s = db->Get(rocksdb::ReadOptions(), key, &value); - if (s.ok()) { - edge.ParseFromString(value); - } - return s; -} - -void Index::get_mappings(int64_t node_id, vector& mappings) { - string start = key_for_mapping_prefix(node_id); - string end = start + end_sep; - for_range(start, end, [this, &mappings](string& key, string& value) { - mappings.emplace_back(); - Mapping& mapping = mappings.back(); - mapping.ParseFromString(value); - }); -} - -void Index::get_alignments(int64_t node_id, vector& alignments) { - string start = key_for_alignment_prefix(node_id); - string end = start + end_sep; - for_range(start, end, [this, &alignments](string& key, string& value) { - alignments.emplace_back(); - Alignment& alignment = alignments.back(); - alignment.ParseFromString(value); - }); -} - -void Index::get_alignments(int64_t id1, int64_t id2, vector& alignments) { - string start = key_for_alignment_prefix(id1); - string end = key_for_alignment_prefix(id2) + end_sep; - for_range(start, end, [this, &alignments](string& key, string& value) { - alignments.emplace_back(); - Alignment& alignment = alignments.back(); - alignment.ParseFromString(value); - }); -} - -void Index::for_alignment_in_range(int64_t id1, int64_t id2, std::function lambda) { - string start = key_for_alignment_prefix(id1); - string end = key_for_alignment_prefix(id2) + end_sep; - for_range(start, end, [this, &lambda](string& key, string& value) { - Alignment alignment; - alignment.ParseFromString(value); - lambda(alignment); - }); -} - -void Index::for_alignment_to_nodes(const vector& ids, std::function lambda) { - set aln_ids; - for (auto id : ids) { - string start = key_prefix_for_traversal(id); - string end = start + end_sep; - for_range(start, end, [this, &lambda, &aln_ids](string& key, string& value) { - // parse the alignment id out - int64_t node_id; - int16_t rank; - bool backward; - int64_t aln_id; - parse_traversal(key, value, node_id, rank, backward, aln_id); - aln_ids.insert(aln_id); - }); - } - for_base_alignments(aln_ids, lambda); -} - -void Index::for_base_alignments(const set& aln_ids, std::function lambda) { - for (auto id : aln_ids) { - string start = key_for_base(id); - string end = start + end_sep; - for_range(start, end, [this, &lambda](string& key, string& value) { - Alignment alignment; - int64_t aln_id; - parse_base(key, value, aln_id, alignment); - lambda(alignment); - }); - } -} - -int Index::get_node_path(int64_t node_id, int64_t path_id, int64_t& path_pos, bool& backward, Mapping& mapping) { - string value; - string key = key_prefix_for_node_path(node_id, path_id); - string start = key + start_sep; - string end = key + end_sep; - // NB: uses the first position in the range - // apply to the range matching the kmer in the db - int count = 0; - for_range(start, end, [this, &count, &node_id, &path_id, &path_pos, &backward, &mapping](string& key, string& value) { - if (count == 0) { - parse_node_path(key, value, - node_id, path_id, - path_pos, backward, mapping); - } - ++count; - }); - return count; -} - -pair>, pair> Index::get_nearest_node_prev_path_member( - int64_t node_id, bool backward, int64_t path_id, int64_t& path_pos, bool& relative_orientation, int max_steps) { - - list> nullpath; - list> bpath; - - // Keeps a list of oriented nodes we can reach, by path taken to reach them - map>, pair> nq; - - { // handle this node - // Put this 
node on the path - bpath.push_front(make_pair(node_id, backward)); - // Load the node - Node& node = nq[bpath].first; - get_node(node_id, node); - nq[bpath].second = backward; - - - Mapping mapping; - bool backward_on_path; - if (get_node_path(node_id, path_id, path_pos, backward_on_path, mapping) > 0) { - // This node is on the target path. - - // Report if we were looking at it backward relative to the path - relative_orientation = (backward != backward_on_path); - - // Return a search path of just this node, and its ID. We inclue it - // in the search path (due to being the end of the search) even - // though it wouldn't normally be included (due to being the start - // of the search). - return make_pair(bpath, bpath.front()); - } - - // Otherwise, say we're at this node after taking the empty path. - Node n = node; - nq.clear(); - nq[nullpath] = make_pair(n, backward); - } - - // BFS back - int steps_back = 0; - while (steps_back++ < max_steps) { - // We're going to extend all the paths and populate this. - map>, pair> cq; - for (auto& n : nq) { - // Unpack the entry - Node& node = n.second.first; - bool orientation = n.second.second; - const list>& path = n.first; - - // Look off the left side of this oriented node, and get the oriented nodes you find there. - vector> destinations; - get_nodes_prev(node.id(), orientation, destinations); - - for(auto& destination : destinations) { - int64_t id = destination.first; - - // Extend the path on the left with this destination - list> npath = path; - npath.push_front(destination); - - // Fill in the Node object and orientation you can reach via this path - Node& node = cq[npath].first; - get_node(id, node); - cq[npath].second = destination.second; - - Mapping mapping; - bool backward_on_path; - if (get_node_path(id, path_id, path_pos, backward_on_path, mapping) > 0) { - // This node we just reached is on the path - - // Report if we were looking at it backward relative to the path - relative_orientation = (destination.second != backward_on_path); - - if(!relative_orientation) { - // The right side of this oriented node, which we reached, comes later in the path. - path_pos += node.sequence().size(); - } - return make_pair(npath, npath.front()); - } - } - } - // Advance to the next search stage. - nq = cq; - } - - // If we get here, we failed to find a path - relative_orientation = false; - return make_pair(nullpath, make_pair(0, false)); -} - -pair>, pair> Index::get_nearest_node_next_path_member( - int64_t node_id, bool backward, int64_t path_id, int64_t& path_pos, bool& relative_orientation, int max_steps) { - - list> nullpath; - list> bpath; - - // Keeps a list of oriented nodes we can reach, by path taken to reach them - map>, pair> nq; - - { // handle this node - // Put this node on the path - bpath.push_back(make_pair(node_id, backward)); - // Load the node - Node& node = nq[bpath].first; - get_node(node_id, node); - nq[bpath].second = backward; - - - Mapping mapping; - bool backward_on_path; - if (get_node_path(node_id, path_id, path_pos, backward_on_path, mapping) > 0) { - // This node is on the target path. - - // Report if we were looking at it backward relative to the path - relative_orientation = (backward != backward_on_path); - - // Return a search path of just this node, and its ID. We inclue it - // in the search path (due to being the end of the search) even - // though it wouldn't normally be included (due to being the start - // of the search). 
- return make_pair(bpath, bpath.back()); - } - - // Otherwise, say we're at this node after taking the empty path. - Node n = node; - nq.clear(); - nq[nullpath] = make_pair(n, backward); - } - - // BFS forward - int steps_forward = 0; - while (steps_forward++ < max_steps) { - // We're going to extend all the paths and populate this. - map>, pair> cq; - for (auto& n : nq) { - // Unpack the entry - Node& node = n.second.first; - bool orientation = n.second.second; - const list>& path = n.first; - - // Look off the right side of this oriented node, and get the oriented nodes you find there. - vector> destinations; - get_nodes_next(node.id(), orientation, destinations); - - for(auto& destination : destinations) { - int64_t id = destination.first; - - // Extend the path on the left with this destination - list> npath = path; - npath.push_back(destination); - - // Fill in the Node object and orientation you can reach via this path - Node& node = cq[npath].first; - get_node(id, node); - cq[npath].second = destination.second; - - Mapping mapping; - bool backward_on_path; - if (get_node_path(id, path_id, path_pos, backward_on_path, mapping) > 0) { - // This node we just reached is on the path - - // Report if we were looking at it backward relative to the path - relative_orientation = (destination.second != backward_on_path); - - if(relative_orientation) { - // The *left* side of this oriented node, which we reached, comes later in the path. - path_pos += node.sequence().size(); - } - return make_pair(npath, npath.back()); - } - } - } - // Advance to the next search stage. - nq = cq; - } - - // If we get here, we failed to find a path - relative_orientation = false; - return make_pair(nullpath, make_pair(0, false)); -} - -bool Index::get_node_path_relative_position(int64_t node_id, bool backward, int64_t path_id, - list>& path_prev, int64_t& prev_pos, bool& prev_orientation, - list>& path_next, int64_t& next_pos, bool& next_orientation) { - // scan the range before the node - // start with our node, and walk back BFS until we find a node with a path - // are any parents part of the path? - - list> nullpath; - auto null_pair = make_pair(nullpath, make_pair((int64_t)0, false)); - - auto to_path_prev = get_nearest_node_prev_path_member(node_id, backward, path_id, prev_pos, prev_orientation); - if (to_path_prev == null_pair) { - cerr << "no to path" << endl; - return false; - } else { - path_prev = to_path_prev.first; - } - - auto to_path_next = get_nearest_node_next_path_member(node_id, backward, path_id, next_pos, next_orientation); - if (to_path_next == null_pair) { - cerr << "no from path" << endl; - return false; - } else { - path_next = to_path_next.first; - } - - if(next_orientation != prev_orientation) { - // TODO: this will only happen if cycles are possible, but it's not clear how to handle it. - cerr << "meets path in different orientations from different ends" << endl; - return false; - } - - return true; -} - -Mapping Index::path_relative_mapping(int64_t node_id, bool backward, int64_t path_id, - list>& path_prev, int64_t& prev_pos, bool& prev_orientation, - list>& path_next, int64_t& next_pos, bool& next_orientation) { - Mapping mapping; - // TODO: shouldn't this point to the node(s?) we're changing, not the one we changed to? - // TODO: yes it should, but it does not... - mapping.mutable_position()->set_node_id(node_id); - mapping.mutable_position()->set_is_reverse(backward); - // what about offset? 
- // TODO I assume this condition works for now, but I do so with a whole salt lick of salt. - if (get_node_path_relative_position(node_id, backward, path_id, - path_prev, prev_pos, prev_orientation, path_next, next_pos, next_orientation)) { - // We found a way to the path. - - Edit* edit = mapping.add_edit(); - Node node; get_node(node_id, node); - // See if we're actually just on that path. - bool in_path = path_prev.back().first == node_id && path_next.front().first == node_id; - int32_t to_length = node.sequence().size(); - // The length we replace is either our length if we're on the path, or - // the distance between where we meet the path on our right and our left - // otherwise. We need to account for being backwards relative to path coordinates though. - int32_t from_length = in_path ? to_length : max(next_pos, prev_pos) - min(next_pos, prev_pos); - //Case to_len == from_len: we're working with a SNP or an exact match. Kinda the base case. - if (from_length == to_length) { - edit->set_from_length(from_length); - edit->set_to_length(to_length); - // TODO set sequence - // TODO path_prev is a lost of pairs - // so accessing the front of it is like accessing the nearest node on the ref path - int64_t p_id = (path_prev.front()).first; - Node p; get_node(p_id, p); - Node n; get_node((path_next.front()).first, n); - string seq = ""; - // Now that we have the alternate node and its neighbors on the path, get - // nodes in the path that are across from the alt node - // (i.e. between the next and prev nodes on path). - vector> level_nodes_prev; - get_nodes_next(p.id(), (path_prev.front()).second, level_nodes_prev); - Mapping m; - int i; - for (i = 0; i < level_nodes_prev.size(); i++){ - pair n_id_and_backward = level_nodes_prev[i]; - //TODO I'm suspicious the position argument here is incorrect. We should be using the position - //of the level node. 
- if (get_node_path(n_id_and_backward.first, path_id, prev_pos, n_id_and_backward.second, m) <= 0){ - Node n_id_node; get_node(n_id_and_backward.first, n_id_node); - seq = n_id_node.sequence(); - } - } - edit->set_sequence(seq); - } else { - edit->set_from_length(from_length); - edit->set_to_length(to_length); - //TODO set sequence - edit->set_sequence(node.sequence()); - } - } - return mapping; -} - -// transform the path into a path relative to another path (defined by path_name) -// source -> surjection (in path_name coordinate space) -// the product is equivalent to a pairwise alignment between this path and the other - -map Index::paths_by_id(void) { - map byid; - string start = key_for_metadata(path_id_prefix(0)); - start = start.substr(0, start.size()-sizeof(int64_t)); - string end = start + end_sep; - for_range(start, end, [this, &byid](string& key, string& value) { - int64_t& id = byid[value]; - memcpy(&id, (void*)(key.c_str() + 10*sizeof(char)), sizeof(int64_t)); - }); - return byid; -} - -pair Index::path_first_node(int64_t path_id) { - string k = key_for_path_position(path_id, 0, false, 0); - k = k.substr(0, 4 + sizeof(int64_t)); - rocksdb::Iterator* it = db->NewIterator(rocksdb::ReadOptions()); - rocksdb::Slice start = rocksdb::Slice(k); - rocksdb::Slice end = rocksdb::Slice(k+end_sep); - int64_t node_id = 0; - bool backward; - it->Seek(start); - if (it->Valid()) { - string key = it->key().ToString(); - string value = it->value().ToString(); - int64_t path_id2, path_pos; Mapping mapping; - parse_path_position(key, value, path_id2, path_pos, backward, node_id, mapping); - } - delete it; - return make_pair(node_id, backward); -} - -pair Index::path_last_node(int64_t path_id, int64_t& path_length) { - // we aim to seek to the first item in the next path, then step back - string key_start = key_for_path_position(path_id, 0, false, 0); - string key_end = key_for_path_position(path_id+1, 0, false, 0); - rocksdb::Iterator* it = db->NewIterator(rocksdb::ReadOptions()); - //rocksdb::Slice start = rocksdb::Slice(key_start); - rocksdb::Slice end = rocksdb::Slice(key_end); - int64_t node_id = 0; - bool backward; - it->Seek(end); - if (it->Valid()) { - it->Prev(); - } - else { - it->SeekToLast(); - } - if (it->Valid()) { - string key = it->key().ToString(); - string value = it->value().ToString(); - int64_t path_id2, path_pos; Mapping mapping; - parse_path_position(key, value, path_id2, path_pos, backward, node_id, mapping); - Node node; get_node(node_id, node); - path_length = path_pos + node.sequence().size(); - } - delete it; - return make_pair(node_id, backward); -} - -void Index::path_layout(map, pair> >& layout, - map& lengths) { - map pbyid = paths_by_id(); - // for each path - for (auto& p : pbyid) { - // find the start and end nodes - int64_t path_length; - layout[p.first] = make_pair(path_first_node(p.second), - path_last_node(p.second, path_length)); - lengths[p.first] = path_length; - } -} - -void Index::for_each_alignment(function lambda) { - string key; - key.resize(2*sizeof(char)); - char* k = (char*) key.c_str(); - k[0] = start_sep; - k[1] = 'a'; // alignments - string start = key; - string end = start + end_sep; - for_range(start, end, [this, &lambda](string& key, string& value) { - Alignment alignment; - alignment.ParseFromString(value); - lambda(alignment); - }); -} - -void Index::for_each_mapping(function lambda) { - string start = start_sep + "s"; - string end = start + end_sep; - for_range(start, end, [this, &lambda](string& key, string& value) { - Mapping mapping; - 
mapping.ParseFromString(value); - lambda(mapping); - }); -} - -void Index::expand_context(VG& graph, int steps = 1) { - for (int step = 0; step < steps; ++step) { - set ids; - graph.for_each_edge([this, &graph, &ids](Edge* edge) { - if (!graph.has_node(edge->from())) { - ids.insert(edge->from()); - } - if (!graph.has_node(edge->to())) { - ids.insert(edge->to()); - } - }); - for (auto id : ids) { - get_context(id, graph); - } - // TODO: optimize this to only look at newly added edges on subsequent steps. - } -} - -void Index::get_connected_nodes(VG& graph) { - graph.for_each_edge([this, &graph](Edge* edge) { - if (!graph.has_node(edge->from())) { - Node node; - get_node(edge->from(), node); - graph.add_node(node); - } - if (!graph.has_node(edge->to())) { - Node node; - get_node(edge->to(), node); - graph.add_node(node); - } - }); -} - -void Index::get_context(int64_t id, VG& graph) { - rocksdb::Iterator* it = db->NewIterator(rocksdb::ReadOptions()); - string key_start = key_for_node(id).substr(0,3+sizeof(int64_t)); - rocksdb::Slice start = rocksdb::Slice(key_start); - string key_end = key_start+end_sep; - rocksdb::Slice end = rocksdb::Slice(key_end); - for (it->Seek(start); - it->Valid() && it->key().ToString() < key_end; - it->Next()) { - string s = it->key().ToString(); - char keyt = graph_key_type(s); - switch (keyt) { - case 'n': { - // Key describes the node - Node node; - node.ParseFromString(it->value().ToString()); - graph.add_node(node); - } break; - case 's': { - // Key describes an edge on the start of the node - Edge edge; - int64_t id1, id2; - char type; - parse_edge(it->key().ToString(), it->value().ToString(), type, id1, id2, edge); - graph.add_edge(edge); - } break; - case 'e': { - // Key describes an edge on the end of the node - Edge edge; - int64_t id1, id2; - char type; - parse_edge(it->key().ToString(), it->value().ToString(), type, id1, id2, edge); - // avoid a second lookup - // probably we should index these twice and pay the penalty on *write* rather than read - //get_edge(id2, id1, edge); - graph.add_edge(edge); - - } break; - case 'p': { - // Key describes a path membership - int64_t node_id, path_id, path_pos; - Mapping mapping; - bool backward; - parse_node_path(it->key().ToString(), it->value().ToString(), - node_id, path_id, path_pos, backward, mapping); - // We don't need to pass backward here since it's included in the Mapping object. 
- graph.paths.append_mapping(get_path_name(path_id), mapping); - } break; - default: - cerr << "vg::Index unrecognized key type " << keyt << endl; - exit(1); - break; - } - } - delete it; -} - -void Index::get_range(int64_t from_id, int64_t to_id, VG& graph) { - auto handle_entry = [this, &graph](string& key, string& value) { - char keyt = graph_key_type(key); - switch (keyt) { - case 'n': { - // Key describes a node - Node node; - node.ParseFromString(value); - graph.add_node(node); - } break; - case 's': { - // Key describes an edge on the start of a node - Edge edge; - int64_t id1, id2; - char type; - parse_edge(key, value, type, id1, id2, edge); - graph.add_edge(edge); - } break; - case 'e': { - // Key describes an edge on the end of a node - Edge edge; - int64_t id1, id2; - char type; - parse_edge(key, value, type, id1, id2, edge); - graph.add_edge(edge); - } break; - case 'p': { - // Key describes a path membership - int64_t node_id, path_id, path_pos; - Mapping mapping; - bool backward; - parse_node_path(key, value, - node_id, path_id, path_pos, backward, mapping); - // We don't need to pass backward here since it's included in the Mapping object. - graph.paths.append_mapping(get_path_name(path_id), mapping); - } break; - default: - cerr << "vg::Index unrecognized key type " << keyt << endl; - exit(1); - break; - } - }; - for_graph_range(from_id, to_id, handle_entry); -} - -void Index::get_kmer_subgraph(const string& kmer, VG& graph) { - // get the nodes in the kmer subgraph - for_kmer_range(kmer, [&graph, this](string& key, string& value) { - int64_t id; - string kmer; - int32_t pos; - parse_kmer(key, value, kmer, id, pos); - get_context(id, graph); - }); -} - -void Index::get_kmer_positions(const string& kmer, map >& positions) { - for_kmer_range(kmer, [&positions, this](string& key, string& value) { - int64_t id; - string kmer; - int32_t pos; - parse_kmer(key, value, kmer, id, pos); - positions[id].push_back(pos); - }); -} - -void Index::get_kmer_positions(const string& kmer, map > >& positions) { - for_kmer_range(kmer, [&positions, this](string& key, string& value) { - int64_t id; - string kmer; - int32_t pos; - parse_kmer(key, value, kmer, id, pos); - positions[kmer].push_back(make_pair(id, pos)); - }); -} - -void Index::for_kmer_range(const string& kmer, function lambda) { - string start = key_prefix_for_kmer(kmer); - string end = start + end_sep; - start = start + start_sep; - // apply to the range matching the kmer in the db - for_range(start, end, lambda); -} - -void Index::for_graph_range(int64_t from_id, int64_t to_id, function lambda) { - // We can't rely on edge keys coming after their node keys, so we need to - // trim off the trailing "+n" from the first key, so we get all the edges. - string start = key_for_node(from_id).substr(0,3+sizeof(int64_t)); - // Similarly, we need to stop before the edges attached to this other node, - // if there are any. 
- string end = key_for_node(to_id+1).substr(0,3+sizeof(int64_t)); - // apply to the range matching the kmer in the db - for_range(start, end, lambda); -} - -uint64_t Index::approx_size_of_kmer_matches(const string& kmer) { - uint64_t size; - string start = key_prefix_for_kmer(kmer); - string end = start + end_sep; - rocksdb::Range range = rocksdb::Range(start, end); - db->GetApproximateSizes(&range, 1, &size); - return size; -} - -void Index::approx_sizes_of_kmer_matches(const vector& kmers, vector& sizes) { - sizes.resize(kmers.size()); - vector ranges; - for (auto& kmer : kmers) { - string start = key_prefix_for_kmer(kmer); - string end = start + end_sep; - ranges.push_back(rocksdb::Range(start, end)); - } - db->GetApproximateSizes(&ranges[0], kmers.size(), &sizes[0]); -} - -void Index::get_edges_of(int64_t node, vector& edges) { - get_edges_on_start(node, edges); - get_edges_on_end(node, edges); -} - -void Index::get_edges_on_start(int64_t node_id, vector& edges) { - rocksdb::Iterator* it = db->NewIterator(rocksdb::ReadOptions()); - string key_start = key_prefix_for_edges_on_node_start(node_id); - rocksdb::Slice start = rocksdb::Slice(key_start); - string key_end = key_start+end_sep; - rocksdb::Slice end = rocksdb::Slice(key_end); - for (it->Seek(start); - it->Valid() && it->key().ToString() < key_end; - it->Next()) { - string s = it->key().ToString(); - char keyt = graph_key_type(s); - switch (keyt) { - case 's': { - // Parse out the edge - Edge edge; - int64_t id1, id2; - char type; - bool backward; - parse_edge(it->key().ToString(), type, id1, id2, backward); - - // TODO: If we can know we don't really need the edge metadata, we could stop here and save a lookup. - - // What's the other node involved in this edge? - assert(node_id == id1); - int64_t other_id = id2; - - if(other_id < node_id) { - // The edge metadata wasn't stored here. We need to look it up. - - // What's the key for the other end of this edge? If this edge - // is reversing, then it's on the other node's start, too. - // Otherwise it's on the other node's end. - string other_key = backward ? - key_for_edge_on_start(other_id, node_id, backward) : - key_for_edge_on_end(other_id, node_id, backward); - - // Load up that key - string value; - rocksdb::Status status = db->Get(rocksdb::ReadOptions(), other_key, &value); - if (status.ok()) { - edge.ParseFromString(value); - } else { - cerr << entry_to_string(s, "") << " looking for " << entry_to_string(other_key, "") << endl; - throw std::runtime_error("Could not find other end of edge on start"); - } - } - edges.push_back(edge); - } break; - default: - // there should only be edges on the start - cerr << keyt << endl; - assert(false); - break; - } - } -} - -void Index::get_edges_on_end(int64_t node_id, vector& edges) { - rocksdb::Iterator* it = db->NewIterator(rocksdb::ReadOptions()); - string key_start = key_prefix_for_edges_on_node_end(node_id); - rocksdb::Slice start = rocksdb::Slice(key_start); - string key_end = key_start+end_sep; - rocksdb::Slice end = rocksdb::Slice(key_end); - for (it->Seek(start); - it->Valid() && it->key().ToString() < key_end; - it->Next()) { - string s = it->key().ToString(); - char keyt = graph_key_type(s); - switch (keyt) { - case 'e': { - Edge edge; - int64_t id1, id2; - char type; - bool backward; - parse_edge(it->key().ToString(), type, id1, id2, backward); - // TODO: If we can know we don't really need the edge metadata, we could stop here and save a lookup. - - // What's the other node involved in this edge? 
- assert(node_id == id1); - int64_t other_id = id2; - - if(other_id < node_id) { - // The edge metadata wasn't stored here. We need to look it up. - - // What's the key for the other end of this edge? If this edge - // is reversing, then it's on the other node's end, too. - // Otherwise it's on the other node's start. - string other_key = backward ? - key_for_edge_on_end(other_id, node_id, backward) : - key_for_edge_on_start(other_id, node_id, backward); - - // Load up that key - string value; - rocksdb::Status status = db->Get(rocksdb::ReadOptions(), other_key, &value); - if (status.ok()) { - edge.ParseFromString(value); - } else { - cerr << entry_to_string(s, "") << " looking for " << entry_to_string(other_key, "") << endl; - throw std::runtime_error("Could not find other end of edge on end"); - } - } else { - // We have the whole Edge right here - edge.ParseFromString(it->value().ToString()); - } - edges.push_back(edge); - } break; - default: - // there should only be edges on the end - cerr << keyt << endl; - assert(false); - break; - } - } -} - -void Index::get_nodes_next(int64_t node, bool backward, vector>& destinations) { - - // Get all the edges off the appropriate side of the node. - vector edges_to_follow; - if(backward) { - // "next" = right = start - get_edges_on_start(node, edges_to_follow); - } else { - // "next" = right = end - get_edges_on_end(node, edges_to_follow); - } - - for(Edge& e : edges_to_follow) { - // Get the other node involved in the edge - int64_t other_node = (e.to() == node ? e.from() : e.to()); - - // Work out if this is a reversing edge - bool reversing_edge = e.from_start() != e.to_end(); - - // Put in the other node ID and the relative orientation, which is our - // orientation, only reversed if we crossed a reversing edge. - destinations.emplace_back(other_node, backward != reversing_edge); - } -} - -void Index::get_nodes_prev(int64_t node, bool backward, vector>& destinations) { - // TODO: combine with get_nodes_next, since they're basically the same code. - - // Get all the edges off the appropriate side of the node. - vector edges_to_follow; - if(backward) { - // "prev" = left = end - get_edges_on_end(node, edges_to_follow); - } else { - // "prev" = left = start - get_edges_on_start(node, edges_to_follow); - } - - for(Edge& e : edges_to_follow) { - // Get the other node involved in the edge - int64_t other_node = (e.to() == node ? e.from() : e.to()); - - // Work out if this is a reversing edge - bool reversing_edge = e.from_start() != e.to_end(); - - // Put in the other node ID and the relative orientation, which is our - // orientation, only reversed if we crossed a reversing edge. - destinations.emplace_back(other_node, backward != reversing_edge); - } - -} - -void Index::get_path(VG& graph, const string& name, int64_t start, int64_t end) { - // picks up the specified range in the given path - if (start < 0 && end < 0) { - start = 0; end = LONG_MAX; - } - int64_t path_id = get_path_id(name); - string key_start = key_for_path_position(path_id, start, false, 0); - // This is deliberately before any key we would get for the actual end, because the end is exclusive. 
- string key_end = key_for_path_position(path_id, end, false, 0); - - for_range(key_start, key_end, [this, &graph](string& key, string& data) { - Mapping mapping; - int64_t path_id, path_pos, node_id; - bool backward; - parse_path_position(key, data, - path_id, path_pos, backward, - node_id, mapping); - get_context(node_id, graph); - }); - // scan the path record in the db to find included nodes - // get these and drop them into the graph -} - - -void node_path_position(int64_t id, string& path_name, int64_t& position, bool backward, int64_t& offset) { - // if we are in the path, trivial - // if not, run a BFS back to the nearest node in the path - // if (get_node_path_relative_position()){ - // Node n; // find the path start. - // // iterate over nodes in path until you arrive at this one, - // // and sum all sequence lengths from said traversal. - // } - // else{ - // // Not on the path, so get previous member. - // get_nearest_node_prev_path_member(); - // // iterate over nodes in path until you arrive at this one, - // // and sum all sequence lengths from said traversal. - // // add this node's sequence as well IE add offset - // } - - throw runtime_error("node_path_position not yet implemented"); - -} - -void Index::put_kmer(const string& kmer, - const int64_t id, - const int32_t pos) { - string key = key_for_kmer(kmer, id); - string data(sizeof(int32_t), '\0'); - memcpy((char*)data.c_str(), &pos, sizeof(int32_t)); - rocksdb::Status s = db->Put(write_options, key, data); - if (!s.ok()) { cerr << "put of " << kmer << " " << id << "@" << pos << " failed" << endl; exit(1); } -} - -void Index::batch_kmer(const string& kmer, - const int64_t id, - const int32_t pos, - rocksdb::WriteBatch& batch) { - string key = key_for_kmer(kmer, id); - string data(sizeof(int32_t), '\0'); - memcpy((char*) data.c_str(), &pos, sizeof(int32_t)); - batch.Put(key, data); -} - -void Index::store_batch(map& items) { - rocksdb::WriteBatch batch; - for (auto& i : items) { - const string& k = i.first; - const string& v = i.second; - batch.Put(k, v); - } - rocksdb::Status s = db->Write(write_options, &batch); - if (!s.ok()) cerr << "an error occurred while inserting items" << endl; -} - -void Index::for_all(std::function lambda) { - string start(1, start_sep); - string end(1, end_sep); - for_range(start, end, lambda); -} - -void Index::for_range(string& key_start, string& key_end, - std::function lambda) { - rocksdb::Iterator* it = db->NewIterator(rocksdb::ReadOptions()); - rocksdb::Slice start = rocksdb::Slice(key_start); - rocksdb::Slice end = rocksdb::Slice(key_end); - for (it->Seek(start); - it->Valid() && it->key().ToString() < key_end; - it->Next()) { - S(it->status()); - string key = it->key().ToString(); - string value = it->value().ToString(); - lambda(key, value); - } - S(it->status()); - delete it; -} - -// todo, get range estimated size - -void Index::prune_kmers(int max_kb_on_disk) { - string start = key_prefix_for_kmer(""); - string end = start + end_sep; - for_range(start, end, [this, max_kb_on_disk](string& key, string& value) { - string kmer; - int64_t id; - int32_t pos; - parse_kmer(key, value, kmer, id, pos); - if (approx_size_of_kmer_matches(kmer) > max_kb_on_disk) { - //cerr << "pruning kmer " << kmer << endl; - db->Delete(write_options, key); - } - }); -} - -void Index::remember_kmer_size(int size) { - stringstream s; - s << "k=" << size; - put_metadata(s.str(), ""); -} - -set Index::stored_kmer_sizes(void) { - set sizes; - auto lambda = [&sizes](string& key, string& value) { - 
sizes.insert(atoi(key.substr(5).c_str())); - }; - string start = key_for_metadata("k="); - string end = start + end_sep; - start = start + start_sep; - for_range(start, end, lambda); - return sizes; -} - - -void index_positions(VG& graph, map& node_path, map& edge_path) { - // TODO: support orientation here. -} - -string Index::first_kmer_key(const string& kmer) { - string found_key; - function lambda = [&found_key](string& key, string& value) { - if (found_key.empty()) { - found_key = key; - } - }; - string first_key = key_for_kmer(kmer, 0); - string last_key = key_for_kmer(kmer, numeric_limits::max()); - for_range(first_key, last_key, lambda); - return found_key; -} - -pair Index::compare_kmers(Index& other) { - int64_t outFound = 0; - int64_t outNotFound = 0; - string prev_kmer; - - function lambda = [&](string& key, string& value) { - if (key[1] == 'k') { - int64_t id; - int32_t pos; - string kmer; - parse_kmer(key, value, kmer, id, pos); - - // only visit first kmer when multiple occurances with dif. ids in a row - if (kmer != prev_kmer) { - - string remk = reverse_complement(kmer); - string remk_key = first_kmer_key(remk); - - // only visit canonical strand (ie lexicographic less than reverse comp) - if (remk_key.empty() || key < remk_key) { - - // put together a key range that will find all matches to kmer (i think) - string first_key = other.key_for_kmer(kmer, 0); - string last_key = other.key_for_kmer(kmer, numeric_limits::max()); - - // search other index - bool found = false; - function lambda1 = [&found](string& key, string& value) { - found = true; - }; - other.for_range(first_key, last_key, lambda1); - - // wasn't found in other, try reverse complement - if (!found) { - first_key = other.key_for_kmer(remk, 0); - last_key = other.key_for_kmer(remk, numeric_limits::max()); - other.for_range(first_key, last_key, lambda1); - } - - // update stats - if (found) { - ++outFound; - } else { - ++outNotFound; - } - } - } - swap(kmer, prev_kmer); - } - }; - - // skip things that aren't kmers - string first_search_key(1, start_sep); - first_search_key += 'k'; - string last_search_key(1, start_sep); - last_search_key += 'l'; - - for_range(first_search_key, last_search_key, lambda); - - return pair(outFound, outNotFound); -} - -} diff --git a/src/index.hpp b/src/index.hpp deleted file mode 100644 index a79f6d6270e..00000000000 --- a/src/index.hpp +++ /dev/null @@ -1,362 +0,0 @@ -#ifndef VG_INDEX_HPP_INCLUDED -#define VG_INDEX_HPP_INCLUDED - -#include -#include -#include -#include - -#include "rocksdb/db.h" -#include "rocksdb/env.h" -#include "rocksdb/options.h" -#include "rocksdb/write_batch.h" -#include "rocksdb/memtablerep.h" -#include "rocksdb/statistics.h" -#include "rocksdb/cache.h" -#include "rocksdb/slice_transform.h" -#include "rocksdb/table.h" -#include "rocksdb/filter_policy.h" - -#include "json2pb.h" -#include "vg.hpp" -#include "hash_map.hpp" - -namespace vg { - -#ifdef __APPLE__ -#include -#include - -#define htobe16(x) OSSwapHostToBigInt16(x) -#define htole16(x) OSSwapHostToLittleInt16(x) -#define be16toh(x) OSSwapBigToHostInt16(x) -#define le16toh(x) OSSwapLittleToHostInt16(x) - -#define htobe32(x) OSSwapHostToBigInt32(x) -#define htole32(x) OSSwapHostToLittleInt32(x) -#define be32toh(x) OSSwapBigToHostInt32(x) -#define le32toh(x) OSSwapLittleToHostInt32(x) - -#define htobe64(x) OSSwapHostToBigInt64(x) -#define htole64(x) OSSwapHostToLittleInt64(x) -#define be64toh(x) OSSwapBigToHostInt64(x) -#define le64toh(x) OSSwapLittleToHostInt64(x) - -#endif - -/* - - Cache our variant 
graph in a database (rocksdb-backed) which enables us to quickly: - 1) obtain specific nodes and edges from a large graph - 2) search nodes and edges by kmers that they contain or overlap them - 3) index the kmers of the graph - 4) store paths and determine the relative locations of nodes and edges in them - - Each of these functions uses a different subset of the namespace. Our key format is: - - +=\x00 is our 'start' separator - -=\xff is our 'end' separator --- this makes it easy to do range queries - - ids are stored as raw int64_t - - bools are stored a '0' or '1', not as sizeof(bool) bytes, since sizeof(bool) can vary. - - Note that all the graph keys have a node ID and then a "type" character. - - Also note that "pos" in path-related keys is the base-pair coordinate along the path, not the rank of the node. - - Note that we store the edge data for self loops twice. - - // key // value - -------------------------------------------------------------- - +m+metadata_key value // various information about the table - +g+node_id+n node [vg::Node] - +g+node_id+s+other_id+backward edge [vg::Edge] if node_id <= other_id, else null. edge is on start - +g+node_id+e+other_id+backward edge [vg::Edge] if node_id <= other_id, else null. edge is on end - +g+node_id+p+path_id+pos+backward mapping [vg::Mapping] - +k+kmer+node_id position of kmer in node [int32_t] - +p+path_id+pos+backward+node_id mapping [vg::Mapping] - +s+node_id+offset mapping [vg::Mapping] // mapping-only "side" against one node - +a+node_id+offset align_id // for sorting - +b+align_id alignment [vg::Alignment] // stores base alignments - +t+node_id+strand+align_id alignment traversal // allows us to quickly go from node traversal to alignments - */ - -class Index { - -public: - - Index(void); - Index(string& name); - ~Index(void); - - rocksdb::Options GetOptions(bool read_only); - void open(const std::string& dir, bool read_only = false); - void open_read_only(string& dir); - void open_for_write(string& dir); - void open_for_bulk_load(string& dir); - - void reset_options(void); - void flush(void); - void compact(void); - void close(void); - - string name; - - char start_sep; - char end_sep; - int threads; - - rocksdb::DB* db; - rocksdb::Options db_options; - rocksdb::WriteOptions write_options; - rocksdb::ColumnFamilyOptions column_family_options; - bool bulk_load; - std::atomic next_nonce; - - void load_graph(VG& graph); - void dump(std::ostream& out); - void for_all(std::function lambda); - void for_range(string& key_start, string& key_end, - std::function lambda); - - void put_node(const Node* node); - void put_edge(const Edge* edge); - void batch_node(const Node* node, rocksdb::WriteBatch& batch); - void batch_edge(const Edge* edge, rocksdb::WriteBatch& batch); - // Put a kmer that starts at the given index in the given node in the index. - // The index only stores the kmers that are on the forward strand at their - // start positions. The aligner is responsible for searching both strands of - // any query string. 
- void put_kmer(const string& kmer, - const int64_t id, - const int32_t pos); - void batch_kmer(const string& kmer, - const int64_t id, - const int32_t pos, - rocksdb::WriteBatch& batch); - void put_metadata(const string& tag, const string& data); - void put_node_path(int64_t node_id, int64_t path_id, int64_t path_pos, bool backward, const Mapping& mapping); - void put_path_position(int64_t path_id, int64_t path_pos, bool backward, int64_t node_id, const Mapping& mapping); - void put_mapping(const Mapping& mapping); - void put_alignment(const Alignment& alignment); - void put_base(int64_t aln_id, const Alignment& alignment); - void put_traversal(int64_t aln_id, const Mapping& mapping); - - // cross-index alignment by aln_id and record its traversals - void cross_alignment(int64_t aln_id, const Alignment& alignment); - - rocksdb::Status get_node(int64_t id, Node& node); - // Takes the nodes and orientations and gets the Edge object with any associated edge data. - rocksdb::Status get_edge(int64_t from, bool from_start, int64_t to, bool to_end, Edge& edge); - rocksdb::Status get_metadata(const string& key, string& data); - // Gets information about the first time the given node appears in the given - // path, and returns the number of times it appears. - int get_node_path(int64_t node_id, int64_t path_id, int64_t& path_pos, bool& backward, Mapping& mapping); - void get_mappings(int64_t node_id, vector& mappings); - void get_alignments(int64_t node_id, vector& alignments); - void get_alignments(int64_t id1, int64_t id2, vector& alignments); - void for_alignment_in_range(int64_t id1, int64_t id2, std::function lambda); - void for_alignment_to_node(int64_t node_id, std::function lambda); - void for_alignment_to_nodes(const vector& ids, std::function lambda); - void for_base_alignments(const set& aln_ids, std::function lambda); - - // obtain the key corresponding to each entity - const string key_for_node(int64_t id); - const string key_for_edge_on_start(int64_t node_id, int64_t other, bool backward); - const string key_for_edge_on_end(int64_t node_id, int64_t other, bool backward); - const string key_prefix_for_edges_on_node_start(int64_t node); - const string key_prefix_for_edges_on_node_end(int64_t node); - const string key_for_kmer(const string& kmer, int64_t id); - const string key_prefix_for_kmer(const string& kmer); - const string key_for_metadata(const string& tag); - const string key_for_path_position(int64_t path_id, int64_t path_pos, bool backward, int64_t node_id); - const string key_for_node_path_position(int64_t node_id, int64_t path_id, int64_t path_pos, bool backward); - const string key_prefix_for_node_path(int64_t node_id, int64_t path_id); - const string key_for_mapping_prefix(int64_t node_id); - const string key_for_mapping(const Mapping& mapping); - const string key_for_alignment_prefix(int64_t node_id); - const string key_for_alignment(const Alignment& alignment); - const string key_for_base(int64_t aln_id); - const string key_prefix_for_traversal(int64_t node_id); - const string key_for_traversal(int64_t aln_id, const Mapping& mapping); - - // deserialize a key/value pair - void parse_node(const string& key, const string& value, int64_t& id, Node& node); - // Parse an edge from any of the three kinds of edge keys. For the key types - // that don't actually store the Edge object, this really constructs a new - // Edge which won't have the data payload and which might have from and to - // swapped, but which is equivalent to the actual edge. 
Populates id1 and id2 with the from and to nodes, and Edge with the actual edge. Populates type with 's' for on-start keys, 'e' for on-end keys, or 'n' for "normal" two-ID edge keys. - void parse_edge(const string& key, const string& value, char& type, int64_t& id1, int64_t& id2, Edge& edge); - // We have an overload that doesn't actually fill in an Edge and just looks at the key. - void parse_edge(const string& key, char& type, int64_t& node_id, int64_t& other_id, bool& backward); - void parse_kmer(const string& key, const string& value, string& kmer, int64_t& id, int32_t& pos); - void parse_node_path(const string& key, const string& value, - int64_t& node_id, int64_t& path_id, int64_t& path_pos, bool& backward, Mapping& mapping); - void parse_path_position(const string& key, const string& value, - int64_t& path_id, int64_t& path_pos, bool& backward, int64_t& node_id, Mapping& mapping); - void parse_mapping(const string& key, const string& value, int64_t& node_id, uint64_t& nonce, Mapping& mapping); - void parse_alignment(const string& key, const string& value, int64_t& node_id, uint64_t& nonce, Alignment& alignment); - void parse_base(const string& key, const string& value, int64_t& aln_id, Alignment& alignment); - void parse_traversal(const string& key, const string& value, int64_t& node_id, int16_t& rank, bool& backward, int64_t& aln_id); - - // for dumping graph state/ inspection - string entry_to_string(const string& key, const string& value); - string graph_entry_to_string(const string& key, const string& value); - string kmer_entry_to_string(const string& key, const string& value); - string position_entry_to_string(const string& key, const string& value); - string metadata_entry_to_string(const string& key, const string& value); - string node_path_to_string(const string& key, const string& value); - string path_position_to_string(const string& key, const string& value); - string mapping_entry_to_string(const string& key, const string& value); - string alignment_entry_to_string(const string& key, const string& value); - string base_entry_to_string(const string& key, const string& value); - string traversal_entry_to_string(const string& key, const string& value); - - // accessors, traversal, context - void get_context(int64_t id, VG& graph); - // Augment the given graph with the nodes referenced by orphan edges, and - // all the edges of those nodes, repeatedly for the given number of steps. - void expand_context(VG& graph, int steps); - // Add all the elements in the given range to the given graph, if they aren't in it already. - void get_range(int64_t from_id, int64_t to_id, VG& graph); - void for_graph_range(int64_t from_id, int64_t to_id, function lambda); - void get_connected_nodes(VG& graph); - // Get the edges of the given node - void get_edges_of(int64_t node, vector& edges); - // Get the edges on the end of the given node - void get_edges_on_end(int64_t node, vector& edges); - // Get the edges on the start of the given node - void get_edges_on_start(int64_t node, vector& edges); - // Get the IDs and orientations of the nodes to the right of the given oriented node - void get_nodes_next(int64_t node, bool backward, vector>& destinations); - // Get the IDs and orientations of the nodes to the left of the given oriented node - void get_nodes_prev(int64_t node, bool backward, vector>& destinations); - // Get the specified region in bases (start inclusive, end exclusive) along - // the named path, and poipulate the given graph with it. 
Also gets dangling - // edges not on the path. - void get_path(VG& graph, const string& name, int64_t start, int64_t end); - // TODO: unimplemented. Supposed to get the position of a node in a path, or - // relative to a path (using a BFS to find the nearest previous path node) - // if not actually in the path. - void node_path_position(int64_t id, string& path_name, int64_t& position, bool& backward, int64_t& offset); - - // Given a node ID and orientation, and the ID of a path, fill in path_pos - // with the position along the path of the nearest node left of the node - // specified that is on that path. Fill in relative_orientation with the - // orientation of the node on the path relative to the specified orientation - // of the starting node. Returns the path taken by the breadth-first search, - // and the ID and orientation of the node on the target path that was - // reached. - pair>, pair> - get_nearest_node_prev_path_member(int64_t node_id, bool backward, int64_t path_id, - int64_t& path_pos, bool& relative_orientation, - int max_steps = 4); - // Given a node ID and orientation, and the ID of a path, fill in path_pos - // with the position along the path of the nearest node *right* of the node - // specified that is on that path. Fill in relative_orientation with the - // orientation of the node on the path relative to the specified orientation - // of the starting node. Returns the path taken by the breadth-first search, - // and the ID and orientation of the node on the target path that was - // reached. - pair>, pair> - get_nearest_node_next_path_member(int64_t node_id, bool backward, int64_t path_id, - int64_t& path_pos, bool& relative_orientation, - int max_steps = 4); - // Get the relative position, in both directions, of the given orientation of the given node along the given path. - bool get_node_path_relative_position(int64_t node_id, bool backward, int64_t path_id, - list>& path_prev, int64_t& prev_pos, bool& prev_orientation, - list>& path_next, int64_t& next_pos, bool& next_orientation); - // Get a Mapping for this node relative to the given path. The mapping will point to the given node in the given orientation. - Mapping path_relative_mapping(int64_t node_id, bool backward, int64_t path_id, - list>& path_prev, int64_t& prev_pos, bool& prev_orientation, - list>& path_next, int64_t& next_pos, bool& next_orientation); - - // Populates layout with path start and end nodes (and orientations), - // indexed by path names, and lengths with path lengths indexed by path - // names. - void path_layout(map, pair> >& layout, - map& lengths); - pair path_first_node(int64_t path_id); - pair path_last_node(int64_t path_id, int64_t& path_length); - - // kmers - void get_kmer_subgraph(const string& kmer, VG& graph); - // This is in bytes, and is often 0 for things that occur only once. - uint64_t approx_size_of_kmer_matches(const string& kmer); - // This is in bytes, and is often 0 for things that occur only once. - void approx_sizes_of_kmer_matches(const vector& kmers, vector& sizes); - // Run the given function on all the keys and values in the database describing instances of the given kmer. - void for_kmer_range(const string& kmer, function lambda); - // In the given map by node ID, fill in the vector with the offsets in that node at which the given kmer starts. - void get_kmer_positions(const string& kmer, map >& positions); - // In the given map by kmer, fill in the vector with the node IDs and offsets at which the given kmer starts. 
- void get_kmer_positions(const string& kmer, map > >& positions); - void prune_kmers(int max_kb_on_disk); - - void remember_kmer_size(int size); - set stored_kmer_sizes(void); - void store_batch(map& items); - //void store_positions(VG& graph, std::map& node_path, std::map& edge_path); - - // once we have indexed the kmers, we can get the nodes and edges matching - void kmer_matches(std::string& kmer, std::set& node_ids, std::set& edge_ids); - - // find lowest key with given kmer string (empty string returned if not found) - // does not check reverse_complement - string first_kmer_key(const string& kmer); - // compare kmers with other index: count the number of unique kmers (taking into account strand) - // in this index that are found in other, and the number not found. return <#found, #not found> pair - pair compare_kmers(Index& other); - - // paths - int64_t get_max_path_id(void); - void put_max_path_id(int64_t id); - int64_t new_path_id(const string& name); - string path_name_prefix(const string& name); - string path_id_prefix(int64_t id); - void put_path_id_to_name(int64_t id, const string& name); - void put_path_name_to_id(int64_t id, const string& name); - string get_path_name(int64_t id); - int64_t get_path_id(const string& name); - void load_paths(VG& graph); - void store_paths(VG& graph); // of graph - void store_path(VG& graph, const Path& path); // path of graph - map paths_by_id(void); - - // alignments and mappings - void for_each_mapping(function lambda); - void for_each_alignment(function lambda); - - // what table is the key in - char graph_key_type(const string& key); - -}; - -class indexOpenException: public exception -{ - string message; - - virtual const char* what() const throw() - { - return message.c_str(); - } - -public: - indexOpenException(string message = ""): message("unable to open variant graph index: " + message) { - } - - -}; - -class keyNotFoundException: public exception -{ - virtual const char* what() const throw() - { - return "unable to find key in index"; - } -}; - -} - -#endif diff --git a/src/index_registry.cpp b/src/index_registry.cpp new file mode 100644 index 00000000000..f3cb3ad2335 --- /dev/null +++ b/src/index_registry.cpp @@ -0,0 +1,5498 @@ +// index_registry.cpp: index registry system implementation + +#include "index_registry.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vg.hpp" +#include "vg_set.hpp" +#include "handle.hpp" +#include "utility.hpp" +#include "constructor.hpp" +#include "hash_map.hpp" +#include "haplotype_indexer.hpp" +#include "phase_unfolder.hpp" +#include "gbwt_helper.hpp" +#include "gbwtgraph_helper.hpp" +#include "gcsa_helper.hpp" +#include "flat_file_back_translation.hpp" +#include "kmer.hpp" +#include "transcriptome.hpp" +#include "integrated_snarl_finder.hpp" +#include "snarl_distance_index.hpp" +#include "gfa.hpp" +#include "job_schedule.hpp" +#include "path.hpp" + +#include "io/save_handle_graph.hpp" + +#include "algorithms/gfa_to_handle.hpp" +#include "algorithms/prune.hpp" +#include "algorithms/component.hpp" +#include "algorithms/find_translation.hpp" + +//#define debug_index_registry +//#define debug_index_registry_setup +//#define debug_index_registry_recipes +//#define debug_index_registry_path_state + +namespace std { + +/// Convert 
IndexNames to strings, without defining it for all things sharing +/// the same underlying type. +static string to_string(const vg::IndexGroup& name) { + stringstream ss; + for (auto it = name.begin(); it != name.end(); ++it) { + if (it != name.begin()) { + ss << " + "; + } + ss << *it; + } + return ss.str(); +} + +} + +namespace vg { + + +IndexingParameters::MutableGraphImplementation IndexingParameters::mut_graph_impl = PackedGraph; +int IndexingParameters::max_node_size = 32; +int IndexingParameters::pruning_max_node_degree = 128; +int IndexingParameters::pruning_walk_length = 24; +int IndexingParameters::pruning_max_edge_count = 3; +int IndexingParameters::pruning_min_component_size = 33; +double IndexingParameters::pruning_walk_length_increase_factor = 1.5; +double IndexingParameters::pruning_max_node_degree_decrease_factor = 0.75; +int IndexingParameters::gcsa_initial_kmer_length = gcsa::Key::MAX_LENGTH; +int IndexingParameters::gcsa_doubling_steps = gcsa::ConstructionParameters::DOUBLING_STEPS; +int64_t IndexingParameters::gcsa_size_limit = 2ll * 1024ll * 1024ll * 1024ll * 1024ll; +int64_t IndexingParameters::gbwt_insert_batch_size = gbwt::DynamicGBWT::INSERT_BATCH_SIZE; +int IndexingParameters::gbwt_insert_batch_size_increase_factor = 10; +int IndexingParameters::gbwt_sampling_interval = gbwt::DynamicGBWT::SAMPLE_INTERVAL; +bool IndexingParameters::bidirectional_haplo_tx_gbwt = false; +string IndexingParameters::gff_feature_name = "exon"; +string IndexingParameters::gff_transcript_tag = "transcript_id"; +bool IndexingParameters::use_bounded_syncmers = false; +int IndexingParameters::minimizer_k = 29; +int IndexingParameters::minimizer_w = 11; +int IndexingParameters::minimizer_s = 18; +int IndexingParameters::path_cover_depth = gbwtgraph::PATH_COVER_DEFAULT_N; +int IndexingParameters::giraffe_gbwt_downsample = gbwtgraph::LOCAL_HAPLOTYPES_DEFAULT_N; +int IndexingParameters::downsample_threshold = 3; +int IndexingParameters::downsample_context_length = gbwtgraph::PATH_COVER_DEFAULT_K; +double IndexingParameters::max_memory_proportion = 0.75; +double IndexingParameters::thread_chunk_inflation_factor = 2.0; +IndexingParameters::Verbosity IndexingParameters::verbosity = IndexingParameters::Basic; + +void copy_file(const string& from_fp, const string& to_fp) { + ifstream from_file(from_fp, std::ios::binary); + ofstream to_file(to_fp, std::ios::binary); + if (!from_file) { + cerr << "error:[IndexRegistry] Couldn't open input file " << from_fp << endl; + exit(1); + } + if (!to_file) { + cerr << "error:[IndexRegistry] Couldn't open output file " << to_fp << endl; + exit(1); + } + to_file << from_file.rdbuf(); +} + +// return file size in bytes +int64_t get_file_size(const string& filename) { + // get the file size + ifstream infile(filename); + infile.seekg(0, ios::end); + return infile.tellg(); +} + +bool is_gzipped(const string& filename) { + if (filename.size() > 2 && filename.substr(filename.size() - 3, 3) == ".gz") { + return true; + } + return false; +} + +int64_t get_num_samples(const string& vcf_filename) { + htsFile* vcf_file = hts_open(vcf_filename.c_str(),"rb"); + if (!vcf_file) { + cerr << "error:[IndexRegistry]: Failed to open VCF file: " << vcf_filename << endl; + exit(1); + } + bcf_hdr_t* header = bcf_hdr_read(vcf_file); + int64_t num_samples = bcf_hdr_nsamples(header); + bcf_hdr_destroy(header); + hts_close(vcf_file); + return num_samples; +} + +// quickly guess the number of variants in a VCF file based on the filesize +double approx_num_vars(const string& vcf_filename) { + 
+ int64_t num_samples = get_num_samples(vcf_filename); + int64_t file_size = get_file_size(vcf_filename); + + // TODO: bcf coefficient + // a shitty regression that Jordan fit on the human autosomes, gives a very rough + // estimate of the number of variants contained in a VCF + if (is_gzipped(vcf_filename)) { + // square root got a pretty good fit, for whatever reason + return 0.255293 * file_size / sqrt(std::max(num_samples, (int64_t) 1)); + } + else { + return 0.192505 * file_size / std::max(num_samples, (int64_t) 1); + } +} + +// the ratio to the HashGraph memory usage, as estimated by a couple of graphs +// that Jordan had laying around when he wrote this +// in case it's useful later: XG is ~0.231 +double format_multiplier() { + switch (IndexingParameters::mut_graph_impl) { + case IndexingParameters::HashGraph: + return 1.0; + case IndexingParameters::PackedGraph: + return 0.187; + case IndexingParameters::VG: + return 2.91; + default: + cerr << "error:[IndexRegistry] unrecognized mutable graph implementation format" << endl; + exit(1); + return 0.0; + } +} + +// approximate the memory of a graph that would be constructed with FASTAs and VCFs +int64_t approx_graph_memory(const vector& fasta_filenames, const vector& vcf_filenames) { + + // compute the size of the reference and the approximate number of + // variants in the VCF + int64_t ref_size = 0; + double num_vars = 0.0; + for (const auto& fasta : fasta_filenames) { + ref_size += get_file_size(fasta); + } + for (const auto& vcf : vcf_filenames) { + num_vars += approx_num_vars(vcf); + } + + // estimates made by regressing the memory usage of a linear reference on the size + // of the FASTA and then regressing the difference in memory usage between the linear + // reference and 1000GP graph on the number of variants in the VCF, all using human + // chromosomes with outliers removed + double linear_memory = 30.4483 * ref_size; + double var_memory = 2242.90 * num_vars; + double hash_graph_memory_usage = linear_memory + var_memory; + return hash_graph_memory_usage * format_multiplier(); +} + +vector each_approx_graph_memory(const vector& fasta_filenames, + const vector& vcf_filenames) { + + auto n = max(fasta_filenames.size(), vcf_filenames.size()); + assert(fasta_filenames.size() == 1 || fasta_filenames.size() == n); + assert(vcf_filenames.empty() || vcf_filenames.size() == 1 || vcf_filenames.size() == n); + + double total_ref_size = 0; + vector ref_sizes(fasta_filenames.size()); + for (int64_t i = 0; i < ref_sizes.size(); ++i) { + double ref_size = get_file_size(fasta_filenames[i]); + ref_sizes[i] = ref_size; + total_ref_size += ref_size; + } + double total_var_count = 0; + vector var_counts(vcf_filenames.size()); + for (int64_t i = 0; i < vcf_filenames.size(); ++i) { + int64_t var_count = approx_num_vars(vcf_filenames[i]); + var_counts[i] = var_count; + total_var_count += var_count; + } + + vector approx_memories(n); + for (int64_t i = 0; i < n; ++i) { + + double ref_size, var_count; + if (vcf_filenames.empty()) { + var_count = 0; + } + else if (vcf_filenames.size() == 1) { + var_count = total_var_count * (ref_sizes[i] / total_ref_size); + } + else { + var_count = var_counts[i]; + } + if (fasta_filenames.size() == 1 && total_var_count != 0.0) { + ref_size = total_ref_size * (var_counts[i] / total_var_count); + } + else if (fasta_filenames.size() == 1) { + ref_size = total_ref_size; + } + else { + ref_size = ref_sizes[i]; + } + + // TODO: repetitive with previous function, magic constants + double linear_memory = 30.4483 * ref_size; 
+ double var_memory = 2242.90 * var_count; + double hash_graph_memory_usage = linear_memory + var_memory; + approx_memories[i] = hash_graph_memory_usage * format_multiplier(); + } + return approx_memories; +} + +int64_t approx_graph_memory(const string& fasta_filename, const string& vcf_filename) { + return approx_graph_memory(vector(1, fasta_filename), vector(1, vcf_filename)); +} + +// estimate the amount of memory of a GFA constructed graph +int64_t approx_graph_memory(const string& gfa_filename) { + int64_t hash_graph_memory_usage = 13.17 * get_file_size(gfa_filename); + return hash_graph_memory_usage * format_multiplier();} + +int64_t approx_gbwt_memory(const string& vcf_filename) { + return 21.9724 * log(std::max(get_num_samples(vcf_filename), (int64_t) 1)) * approx_num_vars(vcf_filename); +} + +int64_t approx_graph_load_memory(const string& graph_filename) { + // TODO: separate regressions for different graph types + // this one was done on hash graphs, which probably have a larger expansion + int64_t hash_graph_memory_usage = 12.52059 * get_file_size(graph_filename); + return hash_graph_memory_usage * format_multiplier(); +} + +// returns true if the GTF/GFF has any non-header lines +bool transcript_file_nonempty(const string& transcripts) { + ifstream strm(transcripts); + string line; + while (strm.good()) { + getline(strm, line); + if (!line.empty() && line[0] != '#') { + return true; + } + line.clear(); + } + return false; +} + +// return all of the contigs with variants in a VCF by iterating through +// the whole damn thing (SQ lines are not required, unfortunately) +vector vcf_contigs(const string& filename) { + + htsFile* vcf = hts_open(filename.c_str(),"rb"); + if (vcf == nullptr) { + cerr << "error:[IndexRegistry] Could not open VCF" << filename << endl; + } + + bcf_hdr_t* header = bcf_hdr_read(vcf); + unordered_set contigs; + bcf1_t* bcf_record = bcf_init(); + while (bcf_read(vcf, header, bcf_record) >= 0) { + + const char* chrom = bcf_hdr_id2name(header, bcf_record->rid); + + contigs.emplace(chrom); + } + bcf_destroy(bcf_record); + vector return_val(contigs.begin(), contigs.end()); + + + bcf_hdr_destroy(header); + hts_close(vcf); + + sort(return_val.begin(), return_val.end()); + return return_val; +} + +/********************* + * Indexing helper functions + ***********************/ + +// These can't be local lambdas in our indexer setup function because they +// would go away when the setup function returns. 
+
+static void init_in(ifstream& in, const string& name) {
+    in.open(name);
+    if (!in) {
+        cerr << "error:[IndexRegistry] could not open input file '" << name << "'" << endl;
+        exit(1);
+    }
+}
+
+static void init_out(ofstream& out, const string& name) {
+    out.open(name);
+    if (!out) {
+        cerr << "error:[IndexRegistry] could not write output to '" << name << "'" << endl;
+        exit(1);
+    }
+}
+
+static void init_in_out(fstream& strm, const string& name) {
+    strm.open(name);
+    if (!strm) {
+        cerr << "error:[IndexRegistry] could not open '" << name << "'" << endl;
+        exit(1);
+    }
+}
+
+static auto init_mutable_graph() -> unique_ptr {
+    unique_ptr graph;
+    switch (IndexingParameters::mut_graph_impl) {
+        case IndexingParameters::HashGraph:
+            graph = make_unique();
+            break;
+        case IndexingParameters::PackedGraph:
+            graph = make_unique();
+            break;
+        case IndexingParameters::VG:
+            graph = make_unique();
+            break;
+        default:
+            cerr << "error:[IndexRegistry] unrecognized mutable graph implementation format" << endl;
+            exit(1);
+            break;
+    }
+    return graph;
+}
+
+// execute a function in another process and return the child's exit code (0 on success)
+// REMEMBER TO SAVE ANY INDEXES CONSTRUCTED TO DISK WHILE STILL INSIDE THE LAMBDA!!
+int execute_in_fork(const function& exec) {
+
+    // we have to clear out the pool of waiting OMP threads (if any) so that they won't
+    // be copied with the fork and create deadlocks/races
+    omp_pause_resource_all(omp_pause_soft);
+
+    pid_t pid = fork();
+
+    if (pid == -1) {
+        cerr << "error:[IndexRegistry] failed to fork process" << endl;
+        exit(1);
+    }
+    else if (pid == 0) {
+        // this is the child process that will actually make the indexes
+
+        // we want the pre-existing temp files to live beyond when this process exits
+        temp_file::forget();
+
+        exec();
+
+        // end the child process successfully
+        exit(0);
+    } else {
+        // This is the parent
+        if (IndexingParameters::verbosity >= IndexingParameters::Debug) {
+            cerr << "[IndexRegistry]: Forked into child process with PID " << pid << "." << endl;
+        }
+    }
+
+    // allow the child to finish
+    int child_stat;
+    waitpid(pid, &child_stat, 0); // 0 waits until the process fully exits
+
+    // pass through signal-based exits
+    if (WIFSIGNALED(child_stat)) {
+        cerr << "error:[IndexRegistry] Child process " << pid << " signaled with status " << child_stat << " representing signal " << WTERMSIG(child_stat) << endl;
+        if (raise(WTERMSIG(child_stat)) == 0) {
+            // TODO: on Mac, raise isn't guaranteed to not return before the handler if it succeeds.
+            // Also the signal might not be one that necessarily kills us.
+            exit(1);
+        } else {
+            // We couldn't send ourselves the signal.
+ exit(1); + } + } + + assert(WIFEXITED(child_stat)); + + return WEXITSTATUS(child_stat); +} + +IndexRegistry VGIndexes::get_vg_index_registry() { + + IndexRegistry registry; + + /********************* + * Register all of the VG indexes and input files + ***********************/ + + // TODO: we need separate suffixes for co-created indexes + + /// Data files + registry.register_index("Reference FASTA", "fasta"); + registry.register_index("VCF", "vcf"); + registry.register_index("VCF w/ Phasing", "phased.vcf"); + registry.register_index("Insertion Sequence FASTA", "insertions.fasta"); + registry.register_index("Reference GFA", "gfa"); + registry.register_index("Reference GFA w/ Haplotypes", "haplo.gfa"); + registry.register_index("GTF/GFF", "gff"); + registry.register_index("Haplotype GTF/GFF", "haplo.gff"); + + /// Chunked inputs + registry.register_index("Chunked Reference FASTA", "chunked.fasta"); + registry.register_index("Chunked VCF", "chunked.vcf.gz"); + registry.register_index("Chunked VCF w/ Phasing", "phased.chunked.vcf.gz"); + registry.register_index("Chunked GTF/GFF", "chunked.gff"); + + /// True indexes + registry.register_index("VG", "vg"); + registry.register_index("VG w/ Variant Paths", "varpaths.vg"); + registry.register_index("Pruned VG", "pruned.vg"); + registry.register_index("Spliced VG", "spliced.vg"); + registry.register_index("Spliced VG w/ Variant Paths", "spliced.varpaths.vg"); + registry.register_index("Spliced VG w/ Transcript Paths", "spliced.txpaths.vg"); + registry.register_index("Pruned Spliced VG", "spliced.pruned.vg"); + + registry.register_index("XG", "xg"); + registry.register_index("Spliced XG", "spliced.xg"); + + registry.register_index("Unjoined Transcript Origin Table", "unjoined.txorigin.tsv"); + registry.register_index("Transcript Origin Table", "txorigin.tsv"); + + registry.register_index("MaxNodeID", "maxid.txt"); + registry.register_index("Spliced MaxNodeID", "spliced.maxid.txt"); + registry.register_index("Unfolded NodeMapping", "mapping"); + registry.register_index("NamedNodeBackTranslation", "segments.tsv"); + registry.register_index("Haplotype-Pruned VG", "haplopruned.vg"); + registry.register_index("Unfolded Spliced NodeMapping", "spliced.mapping"); + registry.register_index("Haplotype-Pruned Spliced VG", "spliced.haplopruned.vg"); + registry.register_index("GCSA", "gcsa"); + registry.register_index("LCP", "gcsa.lcp"); + registry.register_index("Spliced GCSA", "spliced.gcsa"); + registry.register_index("Spliced LCP", "spliced.gcsa.lcp"); + + registry.register_index("GBWT", "gbwt"); + registry.register_index("Spliced GBWT", "spliced.gbwt"); + registry.register_index("Haplotype-Transcript GBWT", "haplotx.gbwt"); + registry.register_index("Giraffe GBWT", "giraffe.gbwt"); + + registry.register_index("Spliced Snarls", "spliced.snarls"); + + registry.register_index("Giraffe Distance Index", "dist"); + registry.register_index("Spliced Distance Index", "spliced.dist"); + + registry.register_index("GBWTGraph", "gg"); + registry.register_index("GBZ", "gbz"); + registry.register_index("Giraffe GBZ", "giraffe.gbz"); + + registry.register_index("Minimizers", "min"); + + /********************* + * Register all recipes + ***********************/ + + // Note that recipes MAY NOT CAPTURE ANYTHING BY REFERENCE ([&]) from this scope! + // This scope is on the stack and will go away by the time the recipes actually run! 
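A minimal sketch (illustrative only, not part of this patch; the names here are hypothetical) of why the capture rule above matters: a recipe callback that captures registration-scope locals by reference will read freed stack memory when it finally runs, while a by-value capture keeps its own copy inside the closure.

#include <functional>
#include <iostream>
#include <string>
#include <vector>

// Hypothetical stand-in for a registry that stores recipe callbacks to run later.
std::vector<std::function<void()>> pending_recipes;

void register_recipes() {
    std::string suffix = "gbwt";  // lives only for the duration of this call
    // BAD: capturing `suffix` by reference would dangle once register_recipes() returns:
    // pending_recipes.push_back([&]() { std::cout << suffix << "\n"; });
    // OK: capture by value, so the copy lives as long as the closure does.
    pending_recipes.push_back([suffix]() { std::cout << suffix << "\n"; });
}

int main() {
    register_recipes();
    // The recipes run long after register_recipes()'s stack frame is gone,
    // which is why only by-value captures are safe here.
    for (auto& recipe : pending_recipes) {
        recipe();
    }
    return 0;
}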
+ + //////////////////////////////////// + // VCF Recipes + //////////////////////////////////// + +#ifdef debug_index_registry_setup + cerr << "registering VCF recipes" << endl; +#endif + + // alias a phased VCF as an unphased one + registry.register_recipe({"VCF"}, {"VCF w/ Phasing"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + alias_graph.register_alias(*constructing.begin(), inputs[0]); + return vector>(1, inputs.front()->get_filenames()); + }); + registry.register_recipe({"Chunked VCF"}, {"Chunked VCF w/ Phasing"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + alias_graph.register_alias(*constructing.begin(), inputs[0]); + return vector>(1, inputs.front()->get_filenames()); + }); + + //////////////////////////////////// + // GFA Recipes + //////////////////////////////////// + +#ifdef debug_index_registry_setup + cerr << "registering GFA recipes" << endl; +#endif + + // alias a phased GFA as an unphased one + registry.register_recipe({"Reference GFA"}, {"Reference GFA w/ Haplotypes"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + alias_graph.register_alias(*constructing.begin(), inputs[0]); + return vector>(1, inputs.front()->get_filenames()); + }); + + //////////////////////////////////// + // Chunking Recipes + //////////////////////////////////// + +#ifdef debug_index_registry_setup + cerr << "registering chunking recipes" << endl; +#endif + + // meta recipe for with/out phasing and with/out transcripts + auto chunk_contigs = [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing, + bool has_gff, + bool phased_vcf) { + + if (IndexingParameters::verbosity != IndexingParameters::None) { + cerr << "[IndexRegistry]: Chunking inputs for parallelism." << endl; + } + + // boilerplate + assert(inputs.size() == 1 || inputs.size() == 2 || inputs.size() == 3); + assert(constructing.size() == inputs.size()); + vector fasta_filenames, vcf_filenames, tx_filenames; + bool has_vcf = inputs.size() == 3 || (inputs.size() == 2 && !has_gff); + { + int i = 0; + if (has_gff) { + tx_filenames = inputs[i++]->get_filenames(); + } + fasta_filenames = inputs[i++]->get_filenames(); + if (has_vcf) { + vcf_filenames = inputs[i++]->get_filenames(); + } + } + vector> all_outputs(constructing.size()); + string output_fasta, output_vcf, output_tx; + { + auto it = constructing.begin(); + if (has_gff) { + output_tx = *it; + ++it; + } + output_fasta = *it; + ++it; + if (has_vcf) { + output_vcf = *it; + } + } + auto& output_fasta_names = all_outputs[has_gff ? 1 : 0]; + +#ifdef debug_index_registry_recipes + cerr << "chunking with vcf? " << has_vcf << ", with gff? 
" << has_gff << endl; +#endif + + // let's do this first, since it can detect data problems + + // i really hate to do two whole passes over the VCFs, but it's hard to see how to not + // do this to be able to distinguish when we need to wait for a contig to become available + // and when it simply doesn't have any variants in the VCFs (especially since it seems + // to be not uncommon for the header to have these sequences included, such as alt scaffolds) + vector> vcf_contigs_with_variants(vcf_filenames.size()); + vector> vcf_samples(vcf_filenames.size()); + { + // do the bigger jobs first to reduce makespan + +#pragma omp parallel for schedule(dynamic, 1) + for (int i = 0; i < vcf_filenames.size(); ++i) { + + tbx_t* tabix_index = nullptr; + for (string tabix_name : {vcf_filenames[i] + ".tbi", vcf_filenames[i] + ".csi"}) { + struct stat stat_tbi, stat_vcf; + if (stat(tabix_name.c_str(), &stat_tbi) != 0) { + // the tabix doesn't exist + continue; + } + stat(vcf_filenames[i].c_str(), &stat_vcf); + if (stat_vcf.st_mtime > stat_tbi.st_mtime) { + cerr << "warning:[IndexRegistry] Tabix index " + tabix_name + " is older than VCF " + vcf_filenames[i] + " and will not be used. Consider recreating this tabix index to speed up index creation.\n"; + continue; + } + + tabix_index = tbx_index_load(tabix_name.c_str()); + if (tabix_index == nullptr) { + cerr << "error:[IndexRegistry] failed to load tabix index " << tabix_index << endl; + exit(1); + } + } + + // get the sample set + htsFile* vcf = bcf_open(vcf_filenames[i].c_str(), "r"); + bcf_hdr_t* header = bcf_hdr_read(vcf); + for (int j = 0; j < bcf_hdr_nsamples(header); ++j) { + vcf_samples[i].insert(header->samples[j]); + } + + if (tabix_index != nullptr) { + // we have a tabix index, so we can make contigs query more efficiently + int num_seq_names; + const char** seq_names = tbx_seqnames(tabix_index, &num_seq_names); + for (int j = 0; j < num_seq_names; ++j) { + vcf_contigs_with_variants[i].push_back(seq_names[j]); + } + free(seq_names); + bcf_hdr_destroy(header); + int close_err_code = hts_close(vcf); + if (close_err_code != 0) { + cerr << "error:[IndexRegistry] encountered error closing VCF " << vcf_filenames[i] << endl; + exit(1); + } + continue; + } + + // no tabix index, so we have to do the full scan + + bcf1_t* vcf_rec = bcf_init(); + string curr_contig; + int err_code = bcf_read(vcf, header, vcf_rec); + while (err_code == 0) { + const char* chrom = bcf_hdr_id2name(header, vcf_rec->rid); + if (!curr_contig.empty()) { + if (curr_contig < chrom) { + curr_contig = chrom; + vcf_contigs_with_variants[i].push_back(chrom); + } + else if (curr_contig > chrom) { + cerr << "error:[IndexRegistry] Contigs in VCF must be in ASCII-lexicographic order. Encountered contig '" << chrom << "' after contig '" << curr_contig << "' in VCF file" << vcf_filenames[i] << "." 
<< endl; + exit(1); + } + } + else { + curr_contig = chrom; + vcf_contigs_with_variants[i].push_back(chrom); + } + + err_code = bcf_read(vcf, header, vcf_rec); + } + if (err_code != -1) { + cerr << "error:[IndexRegistry] failed to read from VCF " << vcf_filenames[i] << endl; + exit(1); + } + // we'll be moving on to a different file, so we won't demand that these + // be in order anymore + bcf_destroy(vcf_rec); + bcf_hdr_destroy(header); + err_code = hts_close(vcf); + if (err_code != 0) { + cerr << "error:[IndexRegistry] encountered error closing VCF " << vcf_filenames[i] << endl; + exit(1); + } + } + } + +#ifdef debug_index_registry_recipes + cerr << "contigs that have variants in the VCFs:" << endl; + for (auto& vcf_contigs : vcf_contigs_with_variants) { + for (auto& contig : vcf_contigs) { + cerr << "\t" << contig << endl; + } + } +#endif + + // consolidate this for easy look up later + unordered_set contigs_with_variants; + for (auto& vcf_contigs : vcf_contigs_with_variants) { + for (auto& contig : vcf_contigs) { + contigs_with_variants.insert(contig); + } + } + + unordered_map seq_files; + unordered_map seq_lengths; + // records of (length, name) + priority_queue> seq_queue; + + for (int64_t i = 0; i < fasta_filenames.size(); ++i) { + FastaReference ref; + ref.open(fasta_filenames[i]); + for (const auto& idx_entry : *ref.index) { + seq_files[idx_entry.first] = i; + seq_lengths[idx_entry.first] = idx_entry.second.length; + seq_queue.emplace(idx_entry.second.length, idx_entry.first); + } + } + + // we'll partition sequences that have the same samples (chunking the the VCFs + // ultimately requires that we do this) + map, vector> sample_set_contigs; + for (int i = 0; i < vcf_samples.size(); ++i) { + auto& contigs = sample_set_contigs[vcf_samples[i]]; + for (auto& contig : vcf_contigs_with_variants[i]) { + contigs.push_back(contig); + } + } + // move these lists of contigs into more convenient data structures + vector> contig_groups; + unordered_map contig_to_group; + for (auto it = sample_set_contigs.begin(); it != sample_set_contigs.end(); ++it) { + for (const auto& contig : it->second) { + if (contig_to_group.count(contig)) { + cerr << "error:[IndexRegistry] Contig " << contig << " is found in multiple VCFs with different samples" << endl; + exit(1); + } + contig_to_group[contig] = contig_groups.size(); + } + contig_groups.emplace_back(move(it->second)); + } + +#ifdef debug_index_registry_recipes + cerr << "contigs by sample group" << endl; + for (int i = 0; i < contig_groups.size(); ++i) { + cerr << "group " << i << endl; + for (auto contig : contig_groups[i]) { + cerr << "\t" << contig << endl; + } + } +#endif + + // we'll greedily assign contigs to the smallest bucket (2-opt bin packing algorithm) + // modified to ensure that we can bucket contigs with the same sample sets together + + // one of the threads gets used up to do scheduling after this initial chunking + int num_threads = get_thread_count(); + // we'll let it go a bit larger so we can take advantage of dynamic scheduling + int max_num_buckets = max(contig_groups.size(), + ceil(IndexingParameters::thread_chunk_inflation_factor * num_threads)); + int num_buckets = 0; + int groups_without_bucket = contig_groups.size(); + size_t num_sample_groups = max(contig_groups.size(), 1); + // buckets of contigs, grouped by sample groups + vector>> sample_group_buckets(num_sample_groups); + // records of (total length, bucket index), grouped by sample gorups + vector, vector>, + greater>>> bucket_queues(num_sample_groups); + while 
(!seq_queue.empty()) { + int64_t length; + string seq_name; + tie(length, seq_name) = seq_queue.top(); + seq_queue.pop(); + + int64_t group = 0; + if (contig_to_group.count(seq_name)) { + // this contig has variant samples associated with it, so we need to + // group it in with them + group = contig_to_group[seq_name]; + } + else { + // this contig has no variants associated, we can put it in whichever + // bucket is smallest + for (int64_t i = 0; i < bucket_queues.size(); ++i) { + int64_t min_bucket_length = numeric_limits::max(); + if (bucket_queues[i].empty()) { + min_bucket_length = 0; + group = i; + } + else if (bucket_queues[i].top().first < min_bucket_length) { + min_bucket_length = bucket_queues[i].top().first; + group = i; + } + + } + } + + // always make sure there's enough room in our budget of buckets + // to make one for each sample group + if (bucket_queues[group].empty() || num_buckets < max_num_buckets - groups_without_bucket) { + // make a new bucket + if (bucket_queues[group].empty()) { + groups_without_bucket--; + } + auto& group_buckets = sample_group_buckets[group]; + bucket_queues[group].emplace(seq_lengths[seq_name], group_buckets.size()); + group_buckets.emplace_back(1, seq_name); + num_buckets++; + } + else { + // add to the smallest bucket + int64_t total_length, b_idx; + tie(total_length, b_idx) = bucket_queues[group].top(); + bucket_queues[group].pop(); + sample_group_buckets[group][b_idx].emplace_back(seq_name); + bucket_queues[group].emplace(total_length + length, b_idx); + } + } + + // merge the list of sample group buckets and collate with their + // FASTA of origin + vector>> buckets; + for (auto& group_buckets : sample_group_buckets) { + for (auto& bucket : group_buckets) { + buckets.emplace_back(); + auto& new_bucket = buckets.back(); + for (auto& contig : bucket) { + new_bucket.emplace_back(move(contig), seq_files[contig]); + } + } + } + + // sort the buckets in descending order by sequence length so that the + // biggest jobs get dynamically scheduled first + sort(buckets.begin(), buckets.end(), + [&](const vector>& a, const vector>& b) { + size_t len_a = 0, len_b = 0; + for (const auto& contig : a) { + len_a += seq_lengths.at(contig.first); + } + for (const auto& contig : b) { + len_b += seq_lengths.at(contig.first); + } + return len_a > len_b; + }); + + // to look bucket index up from a contig + unordered_map contig_to_idx; + for (int64_t i = 0; i < buckets.size(); ++i) { + for (auto& bucket_item : buckets[i]) { + contig_to_idx[bucket_item.first] = i; + } + } + + // sort contigs of each bucket lexicographically so that they occur in the order + // we'll discover them in the VCF + for (auto& bucket : buckets) { + sort(bucket.begin(), bucket.end()); + } + +#ifdef debug_index_registry_recipes + cerr << "assigned contigs into buckets:" << endl; + for (int i = 0; i < buckets.size(); ++i) { + cerr << "bucket " << i << endl; + for (auto& contig : buckets[i]) { + cerr << "\t" << contig.first << ": " << seq_lengths[contig.first] << endl; + } + } +#endif + + + if (buckets.size() == fasta_filenames.size() + && (!has_vcf || buckets.size() == vcf_filenames.size()) + && (!has_gff || buckets.size() == tx_filenames.size())) { + // it looks like we might have just recapitulated the original chunking, let's check to make sure + + // does each bucket come from exactly one FASTA file? 
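+            // If every FASTA maps onto exactly one bucket (and the bucket count already
+            // matches the input FASTA/VCF/GTF counts), then the "chunking" is just a
+            // relabeling of the input files, so we can alias the inputs below instead of
+            // rewriting them. The check walks each FASTA's .fai and verifies that all of
+            // its contigs landed in the same bucket.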
+ bool all_buckets_match = true; + for (int64_t i = 0; i < fasta_filenames.size() && all_buckets_match; ++i) { + FastaReference ref; + ref.open(fasta_filenames[i]); + int64_t bucket_idx = -1; + for (const auto& idx_entry : *ref.index) { + if (bucket_idx == -1) { + bucket_idx = contig_to_idx.at(idx_entry.second.name); + } + else if (contig_to_idx.at(idx_entry.second.name) != bucket_idx) { + all_buckets_match = false; + break; + } + } + } + + // TODO: shouldn't I also check the correspondence on the input GTFs/VCFs? + + if (all_buckets_match) { + // there's no need for chunking, just alias them +#ifdef debug_index_registry_recipes + cerr << "chunking matches input files, no need to re-chunk" << endl; +#endif + + if (has_gff) { + all_outputs[0] = tx_filenames; + alias_graph.register_alias(output_tx, inputs[0]); + } + + output_fasta_names = fasta_filenames; + alias_graph.register_alias(output_fasta, inputs[has_gff ? 1 : 0]); + + if (has_vcf) { + all_outputs[all_outputs.size() - 1] = vcf_filenames; + alias_graph.register_alias(output_vcf, inputs[inputs.size() - 1]); + } + return all_outputs; + } + } + + if (IndexingParameters::verbosity != IndexingParameters::None) { + cerr << "[IndexRegistry]: Chunking FASTA(s)." << endl; + } + + output_fasta_names.resize(buckets.size()); + if (has_vcf) { + all_outputs[all_outputs.size() - 1].resize(buckets.size()); + } + + // make FASTA sequences for each bucket + // the threading here gets to be pretty simple because the fai allows random access +#pragma omp parallel for schedule(dynamic, 1) + for (int64_t i = 0; i < buckets.size(); ++i) { + + auto chunk_fasta_name = plan->output_filepath(output_fasta, i, buckets.size()); + auto chunk_fai_name = chunk_fasta_name + ".fai"; + output_fasta_names[i] = chunk_fasta_name; + + ofstream outfile_fasta, outfile_fai; + init_out(outfile_fasta, chunk_fasta_name); + init_out(outfile_fai, chunk_fai_name); + for (auto& assigned_seq : buckets[i]) { + string contig = assigned_seq.first; + int64_t ref_idx = assigned_seq.second; + FastaReference ref; + ref.open(fasta_filenames[ref_idx]); + + auto entry = ref.index->entry(contig); + int64_t length = entry.length; + int64_t line_length = entry.line_blen; // the base length + + // copy over the FASTA sequence + outfile_fasta << '>' << contig << '\n'; + int64_t seq_start = outfile_fasta.tellp(); + int64_t j = 0; + while (j < length) { + int64_t end = min(j + line_length, length); + outfile_fasta << ref.getSubSequence(contig, j, end - j) << '\n'; + j = end; + } + + // add an FAI entry + outfile_fai << contig << '\t' << length << '\t' << seq_start << '\t' << line_length << '\t' << line_length + 1 << endl; + } + } + + if (IndexingParameters::verbosity != IndexingParameters::None) { + cerr << "[IndexRegistry]: Chunking VCF(s)." 
<< endl; + } + + // open all of the input VCF files + vector> input_vcf_files(vcf_filenames.size()); + vector> input_checked_out_or_finished(vcf_filenames.size()); + for (int64_t i = 0; i < input_vcf_files.size(); ++i) { + htsFile* vcf = bcf_open(vcf_filenames[i].c_str(), "r"); + if (!vcf) { + cerr << "error:[IndexRegistry] failed to open VCF " << vcf_filenames[i] << endl; + exit(1); + } + bcf_hdr_t* header = bcf_hdr_read(vcf); + bcf1_t* vcf_rec = bcf_init(); + int err_code = bcf_read(vcf, header, vcf_rec); + if (err_code == -1) { + // this vcf is empty, actually + input_checked_out_or_finished[i].store(true); + } + else if (err_code < 0) { + cerr << "error:[IndexRegistry] failed to read VCF " << vcf_filenames[i] << endl; + exit(1); + } + input_vcf_files[i] = make_tuple(vcf, header, vcf_rec); + } + + unordered_map contig_to_vcf_idx; + for (int64_t i = 0; i < vcf_contigs_with_variants.size(); ++i) { + for (const auto& contig : vcf_contigs_with_variants[i]) { + contig_to_vcf_idx[contig] = i; + } + } + + if (has_vcf) { + + auto& output_vcf_names = all_outputs.back(); + + // see if we can identify any chunked VCFs that are identical to our input VCFs + + // records of (input vcf index, bucket index) + vector> copiable_vcfs; + for (int64_t i = 0; i < buckets.size(); ++i) { + int64_t prev_vcf_idx = -1; + int64_t count = 0; + for (auto& contig : buckets[i]) { + if (contig_to_vcf_idx.count(contig.first)) { + int64_t vcf_idx = contig_to_vcf_idx[contig.first]; + if (prev_vcf_idx == -1 || prev_vcf_idx == vcf_idx) { + prev_vcf_idx = vcf_idx; + count++; + } + else { + // we've seen a second input VCF, mark a sentinel and stop looking + count = -1; + break; + } + } + } + if (prev_vcf_idx >= 0 && count == vcf_contigs_with_variants[prev_vcf_idx].size()) { + // we saw all and only contigs from one VCF, we can just copy it + copiable_vcfs.emplace_back(prev_vcf_idx, i); + } + } + +#ifdef debug_index_registry_recipes + cerr << "identified " << copiable_vcfs.size() << " copiable VCFs:" << endl; + for (const auto& copiable_vcf : copiable_vcfs) { + cerr << "\tinput " << copiable_vcf.first << " " << vcf_filenames[copiable_vcf.first] << " -> bucket " << copiable_vcf.second << endl; + } +#endif + output_vcf_names.resize(buckets.size()); + + // check if we can do a sort-of-aliasing for VCFs, since they're the most time- + // consuming part of the chunking + if (copiable_vcfs.size() == vcf_filenames.size()) { + // all of the input VCFs could be copied to 1 bucket, so we'll just alias + // them and make dummies for the rest + for (auto vcf_copy : copiable_vcfs) { + int64_t input_idx, output_idx; + tie(input_idx, output_idx) = vcf_copy; + output_vcf_names[output_idx] = vcf_filenames[input_idx]; + } + for (int64_t i = 0; i < output_vcf_names.size(); ++i) { + if (output_vcf_names[i].empty()) { + // this bucket didn't receive a VCF chunk, let's make a dummy VCF for it + auto output_vcf_name = plan->output_filepath(output_vcf, i, buckets.size()); + htsFile* vcf = bcf_open(output_vcf_name.c_str(), "wz"); + bcf_hdr_t* header = bcf_hdr_init("w"); + // this is to satisfy HaplotypeIndexer, which doesn't like sample-less VCFs + if (phased_vcf) { + int sample_add_code = bcf_hdr_add_sample(header, "dummy"); + if (sample_add_code != 0) { + cerr << "error:[IndexRegistry] error initializing VCF header" << endl; + exit(1); + } + } + int hdr_write_err_code = bcf_hdr_write(vcf, header); + if (hdr_write_err_code != 0) { + cerr << "error:[IndexRegistry] error writing VCF header to " << output_vcf_name << endl; + exit(1); + } + 
bcf_hdr_destroy(header); + int close_err_code = hts_close(vcf); + if (close_err_code != 0) { + cerr << "error:[IndexRegistry] encountered error closing VCF " << output_vcf_name << endl; + exit(1); + } + output_vcf_names[i] = output_vcf_name; + } + } + // register that this is an alias + alias_graph.register_alias(output_vcf, inputs[inputs.size() - 1]); +#ifdef debug_index_registry_recipes + cerr << "pseudo-aliased VCFs with filenames:" << endl; + for (const auto& filename : output_vcf_names) { + cerr << "\t" << filename << endl; + } +#endif + } + else { + + // trackers for whether we can write to a bucket's vcf + vector> bucket_checked_out_or_finished(buckets.size()); + for (int64_t i = 0; i < buckets.size(); ++i) { + bucket_checked_out_or_finished[i].store(false); + } + + // if we can copy over a vcf, we don't want to check it out for reading/writing + for (auto copiable_vcf : copiable_vcfs) { + input_checked_out_or_finished[copiable_vcf.first].store(true); + bucket_checked_out_or_finished[copiable_vcf.second].store(true); + } + +#ifdef debug_index_registry_recipes + cerr << "initializing chunked VCFs for output" << endl; +#endif + + // the output files + vector> bucket_vcfs(buckets.size()); + for (int64_t i = 0; i < buckets.size(); ++i) { + auto output_vcf_name = plan->output_filepath(output_vcf, i, buckets.size()); + output_vcf_names[i] = output_vcf_name; + + if (bucket_checked_out_or_finished[i].load()) { + // we can copy to make this file, so we don't need to initialize + // a file + continue; + } + + // open to write in bgzipped format + htsFile* vcf_out = bcf_open(output_vcf_name.c_str(), "wz"); + bcf_hdr_t* header_out = bcf_hdr_init("w"); + + // identify which input VCFs we'll be pulling from + unordered_set vcf_indexes; + for (const auto& contig : buckets[i]) { + if (contig_to_vcf_idx.count(contig.first)) { + vcf_indexes.insert(contig_to_vcf_idx[contig.first]); + } + } +#ifdef debug_index_registry_recipes + cerr << "bucket " << i << " will add samples from input VCFs:" << endl; + for (auto j : vcf_indexes) { + cerr << "\t" << j << endl; + } +#endif + // merge will all the input headers + unordered_set samples_added; + for (auto vcf_idx : vcf_indexes) { + + auto input_vcf_file = input_vcf_files[vcf_idx]; + bcf_hdr_t* header_in = get<1>(input_vcf_file); + header_out = bcf_hdr_merge(header_out, header_in); + if (header_out == nullptr) { + cerr << "error:[IndexRegistry] error merging VCF header" << endl; + exit(1); + } + + // add the samples from every header + for (int64_t j = 0; j < bcf_hdr_nsamples(header_in); ++j) { + const char* sample = header_in->samples[j]; + if (!samples_added.count(sample)) { + // TODO: the header has its own dictionary, so this shouldn't be necessary, + // but the khash_t isn't documented very well + samples_added.insert(sample); + // the sample hasn't been added yet + int sample_err_code = bcf_hdr_add_sample(header_out, header_in->samples[j]); + // returns a -1 if the sample is already included, which we expect + if (sample_err_code != 0) { + cerr << "error:[IndexRegistry] error adding samples to VCF header" << endl; + exit(1); + } + } + } + } + + // documentation in htslib/vcf.h says that this has to be called after adding samples + int sync_err_code = bcf_hdr_sync(header_out); + if (sync_err_code != 0) { + cerr << "error:[IndexRegistry] error syncing VCF header" << endl; + exit(1); + } + if (phased_vcf && bcf_hdr_nsamples(header_out) == 0) { + if (!vcf_indexes.empty()) { + cerr << "warning:[IndexRegistry] VCF inputs from file(s)"; + for (auto vcf_idx : 
vcf_indexes) { + cerr << " " << vcf_filenames[vcf_idx]; + } + cerr << " have been identified as phased but contain no samples. Are these valid inputs?" << endl; + } + + // let's add a dummy so that HaplotypeIndexer doesn't get mad later + int sample_add_code = bcf_hdr_add_sample(header_out, "dummy"); + if (sample_add_code != 0) { + cerr << "error:[IndexRegistry] error initializing VCF header" << endl; + exit(1); + } + // and re-sync, not sure if necessary, but it will be cheap regardless + sync_err_code = bcf_hdr_sync(header_out); + if (sync_err_code != 0) { + cerr << "error:[IndexRegistry] error syncing VCF header" << endl; + exit(1); + } + } + int hdr_write_err_code = bcf_hdr_write(vcf_out, header_out); + if (hdr_write_err_code != 0) { + cerr << "error:[IndexRegistry] error writing VCF header to " << output_vcf_name << endl; + exit(1); + } + + // remember these so that we can check them out later + bucket_vcfs[i] = make_pair(vcf_out, header_out); + } + + // the parallel iteration in here is pretty complicated because contigs from + // the input VCFs are being shuffled among the output bucket VCFs, and contigs + // need to be both read and written in lexicographic order. the mutexes here + // let the threads shift between reading and writing from different pairs of VCFs. + // hopefully this high-contention process won't cause too many problems since + // copying each contig takes up a relatively large amount of time + + // a mutex to lock the process of checking whether the next contig the thread + // needs is exposed + mutex input_vcf_mutex; + // a mutex to lock the process of switching to a new bucket + mutex output_vcf_mutex; + // to keep track of which contig in the bucket we're looking for next + vector contig_idx(buckets.size(), 0); + // how many buckets we've finished so far + atomic buckets_finished(0); + vector workers; + for (int64_t i = 0; i < num_threads; ++i) { + // Worker must not capture i; it will be out of scope! 
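+                // Roughly, each worker loops until every bucket is finished:
+                //   1. under input_vcf_mutex, flag any input VCF whose next contig is not
+                //      in the FASTAs so it can be skipped past;
+                //   2. under output_vcf_mutex, claim either a whole-file copy job from
+                //      copiable_vcfs or the next free bucket;
+                //   3. claim the input VCF whose next contig matches the bucket's next
+                //      wanted contig, stream that contig's records across, then release
+                //      both sides (or bump buckets_finished when a bucket is done).
+                // If every bucket is already checked out by other threads, the worker exits.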
+ workers.emplace_back([&]() { + int64_t bucket_idx = -1; + while (buckets_finished.load() < buckets.size()) { + // check if any of the input VCFs need to be moved past a contig that isn't + // in our reference + input_vcf_mutex.lock(); + int64_t contig_skip_idx = -1; + for (int64_t j = 0; j < input_vcf_files.size(); ++j) { + if (input_checked_out_or_finished[j].load()) { + continue; + } + + const char* chrom = bcf_hdr_id2name(get<1>(input_vcf_files[j]), + get<2>(input_vcf_files[j])->rid); + // check this index over the FASTA sequence lengths for the chromosome + if (!seq_lengths.count(chrom)) { + contig_skip_idx = j; + input_checked_out_or_finished[j].store(true); + } + } + input_vcf_mutex.unlock(); + + if (contig_skip_idx != -1) { + // we found a contig in the VCF that isn't present in the FASTA, we'll have to skip it + + auto& input_vcf_file = input_vcf_files[contig_skip_idx]; + string skip_contig = bcf_hdr_id2name(get<1>(input_vcf_file), + get<2>(input_vcf_file)->rid); + cerr << "warning:[IndexRegistry] Skipping contig " + skip_contig + ", which is found in VCF(s) but not reference.\n"; + + + // keep reading until end of file or a different contig + int read_err_code = 0; + while (read_err_code >= 0) { + string contig = bcf_hdr_id2name(get<1>(input_vcf_file), + get<2>(input_vcf_file)->rid); + if (contig != skip_contig) { + break; + } + + read_err_code = bcf_read(get<0>(input_vcf_file), get<1>(input_vcf_file), get<2>(input_vcf_file)); + } + + // check the input back out unless we've finished it + if (read_err_code >= 0) { + input_checked_out_or_finished[contig_skip_idx].store(false); + } + continue; + } + + // select an output VCF corresponding to a bucket + int64_t copy_from_idx = -1, copy_to_idx = -1; + bool found_bucket = false; + output_vcf_mutex.lock(); + if (!copiable_vcfs.empty()) { + // there are copiable VCFs remaining, do these first + tie(copy_from_idx, copy_to_idx) = copiable_vcfs.back(); + copiable_vcfs.pop_back(); + } + else { + // start iteration at 1 so we always advance to a new bucket if possible + for (int64_t j = 1; j <= buckets.size(); ++j) { + int64_t next_bucket_idx = (bucket_idx + j) % buckets.size(); + if (!bucket_checked_out_or_finished[next_bucket_idx].load()) { + bucket_checked_out_or_finished[next_bucket_idx].store(true); + bucket_idx = next_bucket_idx; + found_bucket = true; + break; + } + } + } + output_vcf_mutex.unlock(); + + if (copy_from_idx >= 0) { +#ifdef debug_index_registry_recipes + cerr << "direct copying " + vcf_filenames[copy_from_idx] + " to " + output_vcf_names[copy_to_idx] + "\n"; +#endif + // we can copy an entire file on this iteration instead of parsing + copy_file(vcf_filenames[copy_from_idx], output_vcf_names[copy_to_idx]); + if (file_exists(vcf_filenames[copy_from_idx] + ".tbi")) { + // there's also a tabix, grab that as well + copy_file(vcf_filenames[copy_from_idx] + ".tbi", output_vcf_names[copy_to_idx] + ".tbi"); + } + // this bucket is now totally finished + buckets_finished.fetch_add(1); + continue; + } + + if (!found_bucket) { + // it's now possible for all buckets to be checked out simultaneously + // by other threads, so there's no more need to have this thread running +#ifdef debug_index_registry_recipes + cerr << "thread exiting\n"; +#endif + return; + } + + auto& ctg_idx = contig_idx[bucket_idx]; + + if (!contigs_with_variants.count(buckets[bucket_idx][ctg_idx].first)) { + // this contig doesn't have variants in any of the VCFs, so we skip it + ++ctg_idx; + if (ctg_idx == buckets[bucket_idx].size()) { + 
buckets_finished.fetch_add(1); + } + else { + bucket_checked_out_or_finished[bucket_idx].store(false); + } + continue; + } + + htsFile* vcf_out = bucket_vcfs[bucket_idx].first; + bcf_hdr_t* header_out = bucket_vcfs[bucket_idx].second; + + // check if any of the VCFs' next contig is the next one we want for + // this bucket (and lock other threads out from checking simultaneously) + int64_t input_idx = -1; + input_vcf_mutex.lock(); + for (int64_t j = 0; j < input_vcf_files.size(); ++j) { + if (input_checked_out_or_finished[j].load()) { + continue; + } + + // what is the next contig in this VCF? + const char* chrom = bcf_hdr_id2name(get<1>(input_vcf_files[j]), + get<2>(input_vcf_files[j])->rid); + if (buckets[bucket_idx][ctg_idx].first == chrom) { + input_idx = j; + input_checked_out_or_finished[j].store(true); + break; + } + } + input_vcf_mutex.unlock(); + + if (input_idx < 0) { + // other threads need to get through earlier contigs until this bucket's next + // contig is exposed + bucket_checked_out_or_finished[bucket_idx].store(false); + continue; + } + + // we've checked out one of the input vcfs, now we can read from it + auto& input_vcf_file = input_vcf_files[input_idx]; + + int read_err_code = 0; + while (read_err_code >= 0) { + + const char* chrom = bcf_hdr_id2name(get<1>(input_vcf_file), get<2>(input_vcf_file)->rid); + if (buckets[bucket_idx][ctg_idx].first != chrom) { + break; + } + + // FIXME: i'm not sure how important it is to handle these malformed VCFs it is + // // read the "END" info field to see if we need to repair it (this seems to be a problem + // // in the grch38 liftover variants from 1kg) + // int32_t* end_dst = NULL; + // int num_end; + // int end_err_code = bcf_get_info_int32(get<1>(input_vcf_file), get<2>(input_vcf_file), "END", + // &end_dst, &num_end); + // if (end_err_code >= 0) { + // // there is an END tag to read + // int64_t end = *end_dst; + // // note: we can query alleles without bcf_unpack, because it will have already + // // unpacked up to info fields + // // calculate it the way the spec says to + // int64_t calc_end = get<2>(input_vcf_file)->pos + strlen(get<2>(input_vcf_file)->d.allele[0]) - 1; + // if (end != calc_end) { + // string msg = "warning:[IndexRegistry] fixing \"END\" of variant " + buckets[bucket_idx][ctg_idx].first + " " + to_string(get<2>(input_vcf_file)->pos) + " from " + to_string(end) + " to " + to_string(calc_end) + "\n"; + //#pragma omp critical + // cerr << msg; + // + // int update_err_code = bcf_update_info_int32(get<1>(input_vcf_file), get<2>(input_vcf_file), "END", + // &calc_end, 1); + // if (update_err_code < 0) { + // cerr << "error:[IndexRegistry] failed to update \"END\"" << endl; + // exit(1); + // } + // } + // free(end_dst); + // } + + bcf_translate(header_out, get<1>(input_vcf_file), get<2>(input_vcf_file)); + + int write_err_code = bcf_write(vcf_out, header_out, get<2>(input_vcf_file)); + if (write_err_code != 0) { + cerr << "error:[IndexRegistry] error writing VCF line to " << output_vcf_names[bucket_idx] << endl; + exit(1); + } + + read_err_code = bcf_read(get<0>(input_vcf_file), get<1>(input_vcf_file), get<2>(input_vcf_file)); + } + + if (read_err_code >= 0) { + // there's still more to read, it's just on different contigs + input_checked_out_or_finished[input_idx].store(false); + } + else if (read_err_code != -1) { + // we encountered a real error + cerr << "error:[IndexRegistry] error reading VCF file " << vcf_filenames[input_idx] << endl; + exit(1); + } + + // we finished this contig + ++ctg_idx; + if 
(ctg_idx == buckets[bucket_idx].size()) { + buckets_finished.fetch_add(1); + } + else { + bucket_checked_out_or_finished[bucket_idx].store(false); + } + } + }); + } + + // barrier sync + for (auto& worker : workers) { + worker.join(); + } + + // close out files + for (int64_t i = 0; i < input_vcf_files.size(); ++i) { + auto vcf_file = input_vcf_files[i]; + bcf_destroy(get<2>(vcf_file)); + bcf_hdr_destroy(get<1>(vcf_file)); + int err_code = hts_close(get<0>(vcf_file)); + if (err_code != 0) { + cerr << "error:[IndexRegistry] encountered error closing VCF " << vcf_filenames[i] << endl; + exit(1); + } + } + for (int64_t i = 0; i < bucket_vcfs.size(); ++i) { + if (!bucket_vcfs[i].second) { + // we didn't open this VCF (probably because we just copied it) + continue; + } + bcf_hdr_destroy(bucket_vcfs[i].second); + int close_err_code = hts_close(bucket_vcfs[i].first); + if (close_err_code != 0) { + cerr << "error:[IndexRegistry] encountered error closing VCF " << output_vcf_names[i] << endl; + exit(1); + } + } + } + + // TODO: move this into the same work queue as the rest of the VCF chunking? + // tabix index +#pragma omp parallel for schedule(dynamic, 1) + for (int64_t i = 0; i < buckets.size(); ++i) { + // tabix-index the bgzipped VCF we just wrote + + if (file_exists(output_vcf_names[i] + ".tbi")) { + // the tabix already exists + continue; + } + + // parameters inferred from tabix main's sourcecode + int min_shift = 0; + tbx_conf_t conf = tbx_conf_vcf; + int tabix_err_code = tbx_index_build(output_vcf_names[i].c_str(), min_shift, &conf); + if (tabix_err_code == -2) { + cerr << "error:[IndexRegistry] output VCF is not bgzipped: " << output_vcf_names[i] << endl; + exit(1); + } + else if (tabix_err_code != 0) { + cerr << "warning:[IndexRegistry] could not tabix index VCF " + output_vcf_names[i] + "\n"; + } + } + } + + if (has_gff) { + + if (IndexingParameters::verbosity != IndexingParameters::None) { + cerr << "[IndexRegistry]: Chunking GTF/GFF(s)." << endl; + } + + auto& output_gff_names = all_outputs[0]; + vector tx_files_out(buckets.size()); + for (int64_t i = 0; i < buckets.size(); ++i) { + output_gff_names.emplace_back(plan->output_filepath(output_tx, i, buckets.size())); + init_out(tx_files_out[i], output_gff_names.back()); + } + + // mutexes to lock the process of checking out for writing/reading + mutex gff_out_mutex; + + // we'll thread by input files in this case + vector tx_workers; + vector> chunk_gff_checked_out(buckets.size()); + for (int64_t i = 0; i < chunk_gff_checked_out.size(); ++i) { + chunk_gff_checked_out[i].store(false); + } + + atomic input_gffs_read(0); + for (int64_t i = 0; i < num_threads; ++i) { + // Worker must not capture i; it will be out of scope! 
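+            // Each worker claims whole input GTF/GFFs by bumping input_gffs_read, then
+            // routes every non-header line to the chunk of its contig (via contig_to_idx).
+            // Chunk files are checked in and out with chunk_gff_checked_out under
+            // gff_out_mutex, and a worker that finds its target chunk busy sleeps briefly
+            // and retries rather than blocking on it.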
+ tx_workers.emplace_back([&]() { + while (input_gffs_read.load() < tx_filenames.size()) { + + int64_t idx = input_gffs_read.fetch_add(1); + + if (idx >= tx_filenames.size()) { + break; + } + + ifstream infile_tx; + init_in(infile_tx, tx_filenames[idx]); + + ofstream tx_chunk_out; + + int64_t prev_chunk_idx = -1; + while (infile_tx.good()) { + + string line; + getline(infile_tx, line); + + stringstream line_strm(line); + string chrom; + getline(line_strm, chrom, '\t'); + if (chrom.empty() || chrom.front() == '#') { + // skip header + continue; + } + + auto it = contig_to_idx.find(chrom); + if (it == contig_to_idx.end()) { + cerr << "error:[IndexRegistry] contig " << chrom << " from GTF/GFF " << tx_filenames[idx] << " is not found in reference" << endl; + exit(1); + } + int64_t chunk_idx = it->second; + if (chunk_idx != prev_chunk_idx) { + // we're transitioning between chunks, so we need to check the chunk + // out for writing + + // release the old chunk + if (prev_chunk_idx >= 0) { + tx_chunk_out.close(); + tx_chunk_out.clear(); + chunk_gff_checked_out[prev_chunk_idx].store(false); + } + + // keep trying to check the new chunk until succeeding + bool success = false; + while (!success) { + // only one thread can try to check out at a time + gff_out_mutex.lock(); + if (!chunk_gff_checked_out[chunk_idx].load()) { + // the chunk is free to be written to + chunk_gff_checked_out[chunk_idx].store(true); + success = true; + } + gff_out_mutex.unlock(); + if (!success) { + // wait for a couple seconds to check again if we can write + // to the file + this_thread::sleep_for(chrono::seconds(1)); + } + else { + // open for writing, starting from the end + tx_chunk_out.open(output_gff_names[chunk_idx], ios_base::ate); + if (!tx_chunk_out) { + cerr << "error:[IndexRegistry] could not open " << output_gff_names[chunk_idx] << " for appending" << endl; + exit(1); + } + } + } + } + + // copy the line to the chunk + tx_chunk_out << line << '\n'; + + prev_chunk_idx = chunk_idx; + } + + // release the last chunk we were writing to + if (prev_chunk_idx >= 0) { + tx_chunk_out.close(); + chunk_gff_checked_out[prev_chunk_idx].store(false); + } + } + }); + } + + // barrier sync + for (auto& worker : tx_workers) { + worker.join(); + } + } + + + return all_outputs; + }; + + // call the meta recipe + registry.register_recipe({"Chunked GTF/GFF", "Chunked Reference FASTA", "Chunked VCF w/ Phasing"}, {"GTF/GFF", "Reference FASTA", "VCF w/ Phasing"}, + [=](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + return chunk_contigs(inputs, plan, alias_graph, constructing, true, true); + }); + registry.register_recipe({"Chunked GTF/GFF", "Chunked Reference FASTA", "Chunked VCF"}, {"GTF/GFF", "Reference FASTA", "VCF"}, + [=](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + return chunk_contigs(inputs, plan, alias_graph, constructing, true, false); + }); + registry.register_recipe({"Chunked Reference FASTA", "Chunked VCF w/ Phasing"}, {"Reference FASTA", "VCF w/ Phasing"}, + [=](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + return chunk_contigs(inputs, plan, alias_graph, constructing, false, true); + }); + registry.register_recipe({"Chunked Reference FASTA", "Chunked VCF"}, {"Reference FASTA", "VCF"}, + [=](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + return 
chunk_contigs(inputs, plan, alias_graph, constructing, false, false); + }); + registry.register_recipe({"Chunked GTF/GFF", "Chunked Reference FASTA"}, {"GTF/GFF", "Reference FASTA"}, + [=](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + return chunk_contigs(inputs, plan, alias_graph, constructing, true, false); + }); + registry.register_recipe({"Chunked Reference FASTA"}, {"Reference FASTA"}, + [=](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + return chunk_contigs(inputs, plan, alias_graph, constructing, false, false); + }); + + + //////////////////////////////////// + // VG Recipes + //////////////////////////////////// + +#ifdef debug_index_registry_setup + cerr << "registering VG recipes" << endl; +#endif + + // meta-recipe for removing variant paths + auto strip_variant_paths = [](const vector& inputs, + const IndexingPlan* plan, + const IndexGroup& constructing) { + + assert(inputs.size() == 1); + assert(constructing.size() == 1); + + auto chunk_filenames = inputs.at(0)->get_filenames(); + auto output_index = *constructing.begin(); + vector> all_outputs(constructing.size()); + + auto& output_names = all_outputs.front(); + output_names.resize(chunk_filenames.size()); + auto strip_chunk = [&](int64_t i) { + // test streams for I/O + ifstream infile; + init_in(infile, chunk_filenames[i]); + + string output_name = plan->output_filepath(output_index, i, chunk_filenames.size()); + + ofstream outfile; + init_out(outfile, output_name); + + // FIXME: this crashes as a MutablePathHandleGraph for some reason... + unique_ptr graph + = vg::io::VPKG::load_one(infile); + + // gather handles to the alt allele paths + vector alt_paths; + graph->for_each_path_handle([&](const path_handle_t& path) { + if (Paths::is_alt(graph->get_path_name(path))) { + alt_paths.push_back(path); + } + }); + + // delete them + for (auto path : alt_paths) { + graph->destroy_path(path); + } + + // and save the graph + vg::io::save_handle_graph(graph.get(), outfile); + + output_names[i] = output_name; + }; + + // approximate the time and memory use for each chunk + vector> approx_job_requirements; + for (auto& chunk_filename : chunk_filenames) { + approx_job_requirements.emplace_back(get_file_size(chunk_filename), + approx_graph_load_memory(chunk_filename)); + } + + JobSchedule schedule(approx_job_requirements, strip_chunk); + schedule.execute(plan->target_memory_usage()); + + // return the filename(s) + return all_outputs; + }; + + // strip alt allele paths from a graph that has them + registry.register_recipe({"VG"}, {"VG w/ Variant Paths"}, + [strip_variant_paths](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + + if (IndexingParameters::verbosity != IndexingParameters::None) { + cerr << "[IndexRegistry]: Stripping allele paths from VG." << endl; + } + + return strip_variant_paths(inputs, plan, constructing); + }); + + // meta-recipe for creating a VG and its segment space from a GFA + auto construct_from_gfa = [&](const vector& inputs, + const IndexingPlan* plan, + const IndexGroup& constructing) { + + if (IndexingParameters::verbosity != IndexingParameters::None) { + cerr << "[IndexRegistry]: Constructing VG graph from GFA input." 
<< endl; + } + + assert(constructing.size() == 3); + vector> all_outputs(constructing.size()); + + assert(inputs.size() == 1); + assert(constructing.size() == 3); + IndexName output_max_id = "MaxNodeID"; + assert(constructing.count(output_max_id)); + IndexName output_translation = "NamedNodeBackTranslation"; + assert(constructing.count(output_translation)); + IndexName output_index = "VG"; + assert(constructing.count(output_index)); + auto input_filenames = inputs.at(0)->get_filenames(); + if (input_filenames.size() > 1) { + cerr << "error:[IndexRegistry] Graph construction does not support multiple GFAs at this time." << endl; + exit(1); + } + auto input_filename = input_filenames.front(); + + string output_name = plan->output_filepath(output_index); + string translation_name = plan->output_filepath(output_translation); + string max_id_name = plan->output_filepath(output_max_id); + // The graph and max ID are streams we start here. + ofstream outfile, max_id_outfile; + init_out(outfile, output_name); + init_out(max_id_outfile, max_id_name); + auto graph = init_mutable_graph(); + + // make the graph from GFA, and save segment info to the translation file if there is nontrivial segment info. + try { + // TODO: this could be fragile if we repurpose this lambda for Reference GFA w/ Haplotypes + // if we're constructing from a reference GFA, we don't need anything from W lines + unordered_set ignore{PathSense::HAPLOTYPE}; + algorithms::gfa_to_path_handle_graph(input_filename, graph.get(), numeric_limits::max(), translation_name, &ignore); + } + catch (algorithms::GFAFormatError& e) { + cerr << "error:[IndexRegistry] Input GFA is not usable in VG." << endl; + cerr << e.what() << endl; + exit(1); + } + + + // Now we need to append some splits to the output file. + ofstream translation_outfile; + translation_outfile.open(translation_name, std::ios_base::app); + if (!translation_outfile) { + cerr << "error:[IndexRegistry] could not append output to " << translation_name << endl; + exit(1); + } + + handlealgs::chop(*graph, IndexingParameters::max_node_size, [&](nid_t old_id, size_t offset, size_t rev_offset, handle_t new_node) { + stringstream strm; + strm << "K\t" << old_id << "\t" << offset << "\t" << rev_offset << "\t" << graph->get_id(new_node) << std::endl; +#pragma omp critical (translation_outfile) + { + // Write each cut to a line in the translation file, after the segment names are defined. + translation_outfile << strm.str(); + } + }); + + // save the graph + vg::io::save_handle_graph(graph.get(), outfile); + // and the max id + max_id_outfile << graph->max_node_id(); + + // return the filenames + all_outputs[0].push_back(max_id_name); + all_outputs[1].push_back(translation_name); + all_outputs[2].push_back(output_name); + return all_outputs; + }; + + + + // A meta-recipe to make VG and spliced VG files using the Constructor + // Expects inputs to be ordered: FASTA, VCF[, GTF/GFF][, Insertion FASTA] + auto construct_with_constructor = [](const vector& inputs, + const IndexingPlan* plan, + const IndexGroup& constructing, + bool alt_paths, + bool has_transcripts, + bool has_variants) { + + if (IndexingParameters::verbosity != IndexingParameters::None) { + cerr << "[IndexRegistry]: Constructing"; + if (has_transcripts) { + cerr << " spliced"; + } + cerr << " VG graph from FASTA"; + if (has_variants) { + cerr << " and VCF"; + } + cerr << " input." 
<< endl; + } + + assert(constructing.size() == 2); + vector> all_outputs(constructing.size()); + auto output_max_id = *constructing.begin(); + auto output_graph = *constructing.rbegin(); + auto& max_id_names = all_outputs[0]; + auto& graph_names = all_outputs[1]; + + bool has_ins_fasta = false; + if (1 + int(has_transcripts) + int(has_variants) != inputs.size()) { + assert(2 + int(has_transcripts) + int(has_variants) == inputs.size()); + has_ins_fasta = true; + } + + // unpack the inputs + vector ref_filenames, vcf_filenames, insertions, transcripts; + { + size_t i = 0; + if (has_transcripts) { + transcripts = inputs[i++]->get_filenames(); + } + ref_filenames = inputs[i++]->get_filenames(); + if (has_variants) { + vcf_filenames = inputs[i++]->get_filenames(); + } + if (has_ins_fasta) { + insertions = inputs[i++]->get_filenames(); + } + } + + if (has_ins_fasta) { + if (insertions.size() > 1) { + cerr << "error:[IndexRegistry] can only provide one FASTA for insertion sequences" << endl; + exit(1); + } + + // make sure this FASTA has an fai index before we get into all the parallel stuff + FastaReference ins_ref; + ins_ref.open(insertions.front()); + } + + if (has_variants && ref_filenames.size() != 1 && vcf_filenames.size() != 1 && + ref_filenames.size() != vcf_filenames.size()) { + cerr << "[IndexRegistry]: When constructing graph from multiple FASTAs and multiple VCFs, the FASTAs and VCFs must be matched 1-to-1, but input contains " << ref_filenames.size() << " FASTA files and " << vcf_filenames.size() << " VCF files." << endl; + exit(1); + } + if (has_transcripts && transcripts.size() != 1 && ref_filenames.size() != 1 && + transcripts.size() != ref_filenames.size()) { + cerr << "[IndexRegistry]: When constructing graph from multiple GTF/GFFs and multiple FASTAs, the GTF/GFFs and the FASTAs must be matched 1-to-1, but input contains " << transcripts.size() << " GTF/GFF files and " << ref_filenames.size() << " FASTA files." << endl; + exit(1); + } + if (has_transcripts && has_variants && transcripts.size() != 1 && vcf_filenames.size() != 1 && + transcripts.size() != vcf_filenames.size()) { + cerr << "[IndexRegistry]: When constructing graph from multiple GTF/GFFs and multiple VCFs, the GTF/GFFs and the VCFs must be matched 1-to-1, but input contains " << transcripts.size() << " GTF/GFF files and " << vcf_filenames.size() << " VCF files." << endl; + exit(1); + } + + // are we broadcasting the transcripts from one chunk to many? 
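+        // (i.e. a single GTF/GFF supplied alongside multiple FASTA/VCF chunks). When
+        // broadcasting, the same transcript file is applied to every graph chunk, so
+        // transcripts whose reference paths are missing from a given chunk are expected
+        // and are reported as warnings rather than errors further down.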
+ bool broadcasting_txs = transcripts.size() != max(ref_filenames.size(), + vcf_filenames.size()); + + // TODO: this estimate should include splice edges too + vector> approx_job_requirements; + { + size_t i = 0; + for (auto approx_mem : each_approx_graph_memory(ref_filenames, vcf_filenames)) { + int64_t approx_time; + if (!vcf_filenames.empty() && vcf_filenames.size() != 1) { + approx_time = get_file_size(vcf_filenames[i]); + } + else { + approx_time = get_file_size(ref_filenames[i]); + } + approx_job_requirements.emplace_back(approx_time, approx_mem); + ++i; + } + } + +#ifdef debug_index_registry_recipes + cerr << "approximate chunk requirements:" << endl; + for (size_t i = 0; i < approx_job_requirements.size(); ++i) { + auto requirement = approx_job_requirements[i]; + cerr << "\tchunk " << i << " -- time: " << requirement.first << ", memory: " << requirement.second << endl; + } +#endif + graph_names.resize(max(ref_filenames.size(), vcf_filenames.size())); + vector> node_id_ranges(graph_names.size()); + auto make_graph = [&](int64_t idx) { +#ifdef debug_index_registry_recipes + cerr << "making graph chunk " << idx << endl; +#endif + + auto ref_filename = ref_filenames.size() == 1 ? ref_filenames[0] : ref_filenames[idx]; + +#ifdef debug_index_registry_recipes + cerr << "constructing graph with Constructor for ref " << ref_filename << endl; +#endif + + // init and configure the constructor + Constructor constructor; + constructor.do_svs = true; + constructor.alt_paths = alt_paths; + constructor.max_node_size = IndexingParameters::max_node_size; + constructor.show_progress = IndexingParameters::verbosity >= IndexingParameters::Debug; + + if (ref_filenames.size() != 1 && vcf_filenames.size() == 1) { + // we have multiple FASTA but only 1 VCF, so we'll limit the + // constructor to the contigs of this FASTA for this run + FastaReference ref; + ref.open(ref_filename); + for (const string& seqname : ref.index->sequenceNames) { + constructor.allowed_vcf_names.insert(seqname); + } + } + else if (!vcf_filenames.empty() && vcf_filenames.size() != 1 && ref_filenames.size() == 1) { + // we have multiple VCFs but only 1 FASTA, so we'll limit the + // constructor to the contigs of this VCF for this run + + // unfortunately there doesn't seem to be a good way to do this without + // iterating over the entire file: + for (const auto& contig : vcf_contigs(vcf_filenames[idx])) { + constructor.allowed_vcf_names.insert(contig); + } + } + + string output_name = plan->output_filepath(output_graph, idx, + max(ref_filenames.size(), vcf_filenames.size())); + ofstream outfile; + init_out(outfile, output_name); + + auto graph = init_mutable_graph(); + + vector fasta(1, ref_filename); + vector vcf; + if (!vcf_filenames.empty()) { + vcf.emplace_back(vcf_filenames.size() == 1 ? vcf_filenames[0] : vcf_filenames[idx]); + } + + // do the construction + constructor.construct_graph(fasta, vcf, insertions, graph.get()); + +#ifdef debug_index_registry_recipes + cerr << "resulting graph has " << graph->get_node_count() << " nodes" << endl; +#endif + + + if (!transcripts.empty()) { + + auto transcript_filename = transcripts[transcripts.size() == 1 ? 
0 : idx]; + +#ifdef debug_index_registry_recipes + cerr << "adding transcripts from " << transcript_filename << endl; +#endif + + ifstream infile_tx; + init_in(infile_tx, transcript_filename); + + vector path_names; + if (broadcasting_txs) { + // get the path names in case we need to report them later for debug output + graph->for_each_path_handle([&](const path_handle_t& path) { + path_names.push_back(graph->get_path_name(path)); + }); + } + + // give away ownership of the graph to the Transcriptome + Transcriptome transcriptome(move(graph)); + transcriptome.error_on_missing_path = !broadcasting_txs; + transcriptome.feature_type = IndexingParameters::gff_feature_name; + transcriptome.transcript_tag = IndexingParameters::gff_transcript_tag; + + // add the splice edges + auto dummy = unique_ptr(new gbwt::GBWT()); + size_t transcripts_added = transcriptome.add_reference_transcripts(vector({&infile_tx}), dummy, false, false); + + if (broadcasting_txs && !path_names.empty() && transcripts_added == 0 + && transcript_file_nonempty(transcripts[idx])) { + cerr << "warning:[IndexRegistry] no matching paths from transcript file " << transcript_filename << " were found in graph chunk containing the following paths:" << endl; + for (const string& path_name : path_names) { + cerr << "\t" << path_name << endl; + } + } + + node_id_ranges[idx] = make_pair(transcriptome.graph().min_node_id(), + transcriptome.graph().max_node_id()); + + // save the file + transcriptome.write_graph(&outfile); + } + else { + + node_id_ranges[idx] = make_pair(graph->min_node_id(), graph->max_node_id()); + + // save the file + vg::io::save_handle_graph(graph.get(), outfile); + } + + graph_names[idx] = output_name; + }; + + // TODO: allow contig renaming through Constructor::add_name_mapping + + // construct the jobs in parallel, trying to use multithreading while also + // restraining memory usage + JobSchedule schedule(approx_job_requirements, make_graph); + schedule.execute(plan->target_memory_usage()); + + // merge the ID spaces if we need to + vector id_increment(1, 1 - node_id_ranges[0].first); + if (graph_names.size() > 1 || id_increment.front() != 0) { + // the increments we'll need to make each ID range non-overlapping + for (int i = 1; i < node_id_ranges.size(); ++i) { + id_increment.push_back(node_id_ranges[i - 1].second + id_increment[i - 1] - node_id_ranges[i].first + 1); + } + + vector> approx_job_requirements; + for (int i = 0; i < node_id_ranges.size(); ++i) { + approx_job_requirements.emplace_back(node_id_ranges[i].second - node_id_ranges[i].first, + approx_graph_load_memory(graph_names[i])); + } + +#ifdef debug_index_registry_recipes + cerr << "computed node ID increments for chunks:" << endl; + for (int i = 0; i < id_increment.size(); ++i) { + cerr << "\t[" << node_id_ranges[i].first << ", " << node_id_ranges[i].second << "] + " << id_increment[i] << endl; + } +#endif + + // do the incrementation in parallel + auto increment_node_ids = [&](int64_t idx) { + + // load the graph + ifstream infile; + init_in(infile, graph_names[idx]); + unique_ptr graph + = vg::io::VPKG::load_one(infile); + + // adjust the IDs + graph->increment_node_ids(id_increment[idx]); + + // save back to the same file + ofstream outfile; + init_out(outfile, graph_names[idx]); + vg::io::save_handle_graph(graph.get(), outfile); + }; + + JobSchedule schedule(approx_job_requirements, increment_node_ids); + schedule.execute(plan->target_memory_usage()); + } + + // save the max node id as a simple text file + auto max_id_name = 
plan->output_filepath(output_max_id); + ofstream max_id_outfile; + init_out(max_id_outfile, max_id_name); + nid_t max_node_id = node_id_ranges.back().second + id_increment.back(); + max_id_outfile << max_node_id; + + max_id_names.push_back(max_id_name); + + // return the filename(s) + return all_outputs; + }; + + // the specific instantiations of the meta-recipe above + registry.register_recipe({"MaxNodeID", "VG w/ Variant Paths"}, {"Chunked Reference FASTA", "Chunked VCF w/ Phasing", "Insertion Sequence FASTA"}, + [construct_with_constructor](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + return construct_with_constructor(inputs, plan, constructing, true, false, true); + }); + registry.register_recipe({"MaxNodeID", "VG w/ Variant Paths"}, {"Chunked Reference FASTA", "Chunked VCF w/ Phasing"}, + [construct_with_constructor](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + return construct_with_constructor(inputs, plan, constructing, true, false, true); + }); + registry.register_recipe({"MaxNodeID", "NamedNodeBackTranslation", "VG"}, {"Reference GFA"}, + [&](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + return construct_from_gfa(inputs, plan, constructing); + }); + registry.register_recipe({"MaxNodeID", "VG"}, {"Chunked Reference FASTA", "Chunked VCF", "Insertion Sequence FASTA"}, + [construct_with_constructor](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + return construct_with_constructor(inputs, plan, constructing, false, false, true); + }); + registry.register_recipe({"MaxNodeID", "VG"}, {"Chunked Reference FASTA", "Chunked VCF"}, + [construct_with_constructor](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + return construct_with_constructor(inputs, plan, constructing, false, false, true); + }); + registry.register_recipe({"MaxNodeID", "VG"}, {"Chunked Reference FASTA"}, + [construct_with_constructor](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + return construct_with_constructor(inputs, plan, constructing, false, false, false); + }); + +#ifdef debug_index_registry_setup + cerr << "registering Spliced VG recipes" << endl; +#endif + + //////////////////////////////////// + // Spliced VG Recipes + //////////////////////////////////// + + registry.register_recipe({"Spliced VG"}, {"Spliced VG w/ Variant Paths"}, + [strip_variant_paths](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + + if (IndexingParameters::verbosity != IndexingParameters::None) { + cerr << "[IndexRegistry]: Stripping allele paths from spliced VG." 
<< endl; + } + + return strip_variant_paths(inputs, plan, constructing); + }); + + // TODO: spliced vg from GFA input + + registry.register_recipe({"Spliced MaxNodeID", "Spliced VG w/ Variant Paths"}, + {"Chunked GTF/GFF", "Chunked Reference FASTA", "Chunked VCF w/ Phasing", "Insertion Sequence FASTA"}, + [construct_with_constructor](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + return construct_with_constructor(inputs, plan, constructing, true, true, true); + }); + + registry.register_recipe({"Spliced MaxNodeID", "Spliced VG w/ Variant Paths"}, + {"Chunked GTF/GFF", "Chunked Reference FASTA", "Chunked VCF w/ Phasing"}, + [construct_with_constructor](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + return construct_with_constructor(inputs, plan, constructing, true, true, true); + }); + + registry.register_recipe({"Spliced MaxNodeID", "Spliced VG"}, + {"Chunked GTF/GFF", "Chunked Reference FASTA", "Chunked VCF", "Insertion Sequence FASTA"}, + [construct_with_constructor](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + return construct_with_constructor(inputs, plan, constructing, false, true, true); + }); + + registry.register_recipe({"Spliced MaxNodeID", "Spliced VG"}, + {"Chunked GTF/GFF", "Chunked Reference FASTA", "Chunked VCF"}, + [construct_with_constructor](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + return construct_with_constructor(inputs, plan, constructing, false, true, true); + }); + + registry.register_recipe({"Spliced MaxNodeID", "Spliced VG"}, + {"Chunked GTF/GFF", "Chunked Reference FASTA"}, + [construct_with_constructor](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + return construct_with_constructor(inputs, plan, constructing, false, true, false); + }); + + + //////////////////////////////////// + // XG Recipes + //////////////////////////////////// + +#ifdef debug_index_registry_setup + cerr << "registering XG recipes" << endl; +#endif + + // TODO: currently disabling this to ensure, but I'd prefer to make a separate + // semantic XG for a node-chopped variety and handle the pipeline differences + // with simplifications + +// registry.register_recipe({"XG"}, {"Reference GFA"}, +// [](const vector& inputs, +// const IndexingPlan* plan, +// AliasGraph& alias_graph, +// const IndexGroup& constructing) { +// if (IndexingParameters::verbosity != IndexingParameters::None) { +// cerr << "[IndexRegistry]: Constructing XG graph from GFA input." << endl; +// } +// assert(constructing.size() == 1); +// vector> all_outputs(constructing.size()); +// auto output_index = *constructing.begin(); +// auto gfa_names = inputs.front()->get_filenames(); +// if (gfa_names.size() > 1) { +// cerr << "error:[IndexRegistry] Graph construction does not support multiple GFAs at this time." 
<< endl; +// exit(1); +// } +// +// string output_name = plan->output_filepath(output_index); +// ofstream outfile; +// init_out(outfile, output_name); +// +// xg::XG xg_index; +// xg_index.from_gfa(gfa_names.front()); +// +// vg::io::save_handle_graph(&xg_index, outfile); +// +// // return the filename +// all_outputs[0].emplace_back(output_name); +// return all_outputs; +// }); + + auto make_xg_from_graph = [](const vector& inputs, + const IndexingPlan* plan, + const IndexGroup& constructing) { + + assert(inputs.size() == 1); + assert(constructing.size() == 1); + auto output_index = *constructing.begin(); + vector> all_outputs(constructing.size()); + + auto graph_filenames = inputs.at(0)->get_filenames(); + + string output_name = plan->output_filepath(output_index); + ofstream outfile; + init_out(outfile, output_name); + + xg::XG xg_index; + + if (graph_filenames.size() == 1) { + // we do the one-graph conversion directly, which is more efficient than the + // VGset option + + // test streams for I/O + ifstream infile; + init_in(infile, graph_filenames.front()); + + unique_ptr graph = vg::io::VPKG::load_one(infile); + + xg_index.from_path_handle_graph(*graph); + } + else { + // the inefficient 3-pass, multi-graph construction algorithm + + // make a mutable copy of the graph names + vector graph_files; + for (const string& graph_file : graph_filenames) { + // test for I/O while we're at it + ifstream infile; + init_in(infile, graph_file); + + graph_files.push_back(graph_file); + } + + VGset graph_set(graph_files); + graph_set.to_xg(xg_index); + } + + vg::io::save_handle_graph(&xg_index, outfile); + + // return the filename + all_outputs[0].emplace_back(output_name); + return all_outputs; + }; + + registry.register_recipe({"XG"}, {"VG"}, + [make_xg_from_graph](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + cerr << "[IndexRegistry]: Constructing XG graph from VG graph." << endl; + } + return make_xg_from_graph(inputs, plan, constructing); + }); + + registry.register_recipe({"Spliced XG"}, {"Spliced VG w/ Transcript Paths"}, + [make_xg_from_graph](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + cerr << "[IndexRegistry]: Constructing spliced XG graph from spliced VG graph." << endl; + } + return make_xg_from_graph(inputs, plan, constructing); + }); + + //////////////////////////////////// + // MaxNodeID Recipes + //////////////////////////////////// + + //////////////////////////////////// + // GBWT Recipes + //////////////////////////////////// + +#ifdef debug_index_registry_setup + cerr << "registering GBWT recipes" << endl; +#endif + + // merge multiple GBWTs if there are multiple, otherwise leave in place + auto merge_gbwts = [](const vector& gbwt_names, + const IndexingPlan* plan, + const IndexName& constructing_name) { + if (gbwt_names.size() > 1) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + cerr << "[IndexRegistry]: Merging contig GBWTs." 
<< endl; + } + // we also need to merge the GBWTs + + string merged_gbwt_name = plan->output_filepath(constructing_name); + ofstream outfile; + init_out(outfile, merged_gbwt_name); + + vector gbwt_indexes(gbwt_names.size()); + for (size_t i = 0; i < gbwt_names.size(); ++i) { + load_gbwt(gbwt_indexes[i], gbwt_names[i], IndexingParameters::verbosity >= IndexingParameters::Debug); + } + gbwt::GBWT merged(gbwt_indexes); + merged.serialize(outfile); + return merged_gbwt_name; + } + else { + // note: we don't need to register an alias here because it all happens + // internally to one index's recipe + return gbwt_names.front(); + } + }; + + // meta-recipe to make GBWTs + auto make_gbwt = [merge_gbwts](const vector& inputs, + bool include_named_paths, + const IndexingPlan* plan, + const IndexGroup& constructing) { + + assert(inputs.size() == 2); + + auto vcf_filenames = inputs[0]->get_filenames(); + auto graph_filenames = inputs[1]->get_filenames(); + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + + auto output_index = *constructing.begin(); + auto& output_names = all_outputs[0]; + + if ((graph_filenames.size() != 1 && graph_filenames.size() != vcf_filenames.size()) || + (vcf_filenames.size() != 1 && graph_filenames.size() != vcf_filenames.size())) { + cerr << "[IndexRegistry]: When constructing GBWT from multiple graphs and multiple VCFs, the graphs and VCFs must be matched 1-to-1, but input contains " << graph_filenames.size() << " graphs and " << vcf_filenames.size() << " VCF files." << endl; + exit(1); + } + if (vcf_filenames.size() == 1 && graph_filenames.size() != 1) { + // FIXME: it should at least try to join the graph chunks together + cerr << "[IndexRegistry]: GBWT construction currently does not support broadcasting 1 VCF to multiple graph chunks." << endl; + exit(1); + } + + if (IndexingParameters::verbosity >= IndexingParameters::Debug) { + gbwt::Verbosity::set(gbwt::Verbosity::BASIC); + } + else { + gbwt::Verbosity::set(gbwt::Verbosity::SILENT); + } + + int64_t target_memory_usage = plan->target_memory_usage(); + vector> approx_job_requirements; + + vector gbwt_names(vcf_filenames.size()); + unique_ptr broadcast_graph; + if (graph_filenames.size() == 1) { + // we only have one graph, so we can save time by loading it only one time + // test streams for I/O + ifstream infile; + init_in(infile, graph_filenames.front()); + + // we don't want to double-count the graph's contribution to memory in separate jobs, so we + // subtract it once from the target memory use + target_memory_usage = max(0, target_memory_usage - approx_graph_load_memory(graph_filenames.front())); + + // estimate the time and memory requirements + for (auto vcf_filename : vcf_filenames) { + approx_job_requirements.emplace_back(get_file_size(vcf_filename), approx_gbwt_memory(vcf_filename)); + } + + // load the graph + broadcast_graph = vg::io::VPKG::load_one(infile); + + } + else { + // estimate the time and memory requirements + for (int64_t i = 0; i < vcf_filenames.size(); ++i) { + approx_job_requirements.emplace_back(get_file_size(vcf_filenames[i]), + approx_gbwt_memory(vcf_filenames[i]) + approx_graph_load_memory(graph_filenames[i])); + } + + } + + // Prepare a single shared haplotype indexer, since everything on it is thread safe. 
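+ // Each per-VCF job below only reads the indexer's configuration, so a single instance can be shared by all of the jobs.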
+ // Make this critical so we don't end up with a race on the verbosity + unique_ptr haplotype_indexer; +#pragma omp critical + { + haplotype_indexer = unique_ptr(new HaplotypeIndexer()); + // HaplotypeIndexer resets this in its constructor + if (IndexingParameters::verbosity >= IndexingParameters::Debug) { + gbwt::Verbosity::set(gbwt::Verbosity::BASIC); + } + else { + gbwt::Verbosity::set(gbwt::Verbosity::SILENT); + } + } + haplotype_indexer->show_progress = IndexingParameters::verbosity >= IndexingParameters::Debug; + // from the toil-vg best practices + haplotype_indexer->force_phasing = true; + haplotype_indexer->discard_overlaps = true; + + // If we're using a single graph, we're going to need to do each VCF's + // named paths in its job, and then come back and do the rest. So we + // need to know which paths still need to be done (and whether there are any). + // So we first make a set of all the paths that need doing, and then we + // clear them out when they're done, and if any are left we know we + // need a job to do them. + unordered_set broadcast_graph_paths_to_do; + if (include_named_paths && broadcast_graph) { + broadcast_graph->for_each_path_handle([&](const path_handle_t& path_handle) { + // Look at all the paths in advance + if (broadcast_graph->is_empty(path_handle) || Paths::is_alt(broadcast_graph->get_path_name(path_handle))) { + // Skip empty paths and alt allele paths + return; + } + // Keep the rest. + broadcast_graph_paths_to_do.insert(path_handle); + }); + } + + // construct a GBWT from the i-th VCF + auto gbwt_job = [&](size_t i) { + string gbwt_name; + if (vcf_filenames.size() != 1) { + // multiple components, so make a temp file that we will merge later + gbwt_name = temp_file::create(); + } + else { + // one component, so we will actually save the output + gbwt_name = plan->output_filepath(output_index); + } + + // load the contig graph if necessary + unique_ptr contig_graph; + if (graph_filenames.size() != 1) { + ifstream infile; + init_in(infile, graph_filenames[i]); + contig_graph = vg::io::VPKG::load_one(infile); + } + + auto graph = graph_filenames.size() == 1 ? broadcast_graph.get() : contig_graph.get(); + + // Parse the VCFs for this job + vector parse_files = haplotype_indexer->parse_vcf(vcf_filenames[i], + *graph); + + // Build the GBWT from the parse files and the graph. + // For fast merging later, we need to ensure that all threads on a single contig end up in the same initial GBWT. + // So, if we have just one graph, only threads visited by the VCF can go in. + // Then at the end, if there are non-alt paths left over, we add another job to make a GBWT just of those paths. + // Otherwise, if we have one graph per job, all threads from the graph can go in. + unique_ptr gbwt_index = haplotype_indexer->build_gbwt(parse_files, + "GBWT" + std::to_string(i), + include_named_paths ? graph : nullptr, + nullptr, + include_named_paths && (bool)broadcast_graph); + + save_gbwt(*gbwt_index, gbwt_name, IndexingParameters::verbosity == IndexingParameters::Debug); + + gbwt_names[i] = gbwt_name; + + if (include_named_paths && broadcast_graph) { + // We have to check off the paths we embedded in this job.
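+ // The new GBWT's metadata records the contigs it covers; any graph path with a matching name was embedded in this job, so we can mark it as done here.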
+ for (size_t contig_number = 0; contig_number < gbwt_index->metadata.contigs(); contig_number++) { + // Go through all contig names in the metadata + string contig_name = gbwt_index->metadata.contig(contig_number); + + if (graph->has_path(contig_name)) { + // And get the graph path + path_handle_t contig_path = graph->get_path_handle(contig_name); + #pragma omp critical (broadcast_graph_paths_done) + { + // Check it off in a thread-safe way. + // TODO: Will this be too much locking and unlocking when we do transcripts? + broadcast_graph_paths_to_do.erase(contig_path); + } + } + } + } + }; + + { + // Do all the GBWT jobs + JobSchedule schedule(approx_job_requirements, gbwt_job); + schedule.execute(target_memory_usage); + } + + if (include_named_paths && broadcast_graph && !broadcast_graph_paths_to_do.empty()) { + // We're Back for One Last Job. + // We need to embed these remaining paths that weren't VCF contigs. + + // There's no VCF to load, so our memory estimate is just 0. The graph is loaded already. + // TODO: can we improve this? + approx_job_requirements.clear(); + approx_job_requirements.emplace_back(0, 0); + + // This job has exclusive use of our data structures. + auto one_last_job = [&](size_t ignored) { + // Make a temp file + string gbwt_name = temp_file::create(); + + // Make a GBWT of the remaining graph paths. + unique_ptr gbwt_index = haplotype_indexer->build_gbwt({}, + "Leftovers", + broadcast_graph.get(), + &broadcast_graph_paths_to_do); + + // And save it in the temp file + save_gbwt(*gbwt_index, gbwt_name, IndexingParameters::verbosity == IndexingParameters::Debug); + + // And add it as one final GBWT. + gbwt_names.push_back(gbwt_name); + }; + + JobSchedule schedule(approx_job_requirements, one_last_job); + schedule.execute(target_memory_usage); + } + + // merge GBWTs if necessary + output_names.push_back(merge_gbwts(gbwt_names, plan, output_index)); + // return filename + return all_outputs; + }; + + registry.register_recipe({"GBWT"}, {"Chunked VCF w/ Phasing", "VG w/ Variant Paths"}, + [make_gbwt](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + cerr << "[IndexRegistry]: Constructing GBWT from VG graph and phased VCF input." << endl; + gbwt::Verbosity::set(gbwt::Verbosity::BASIC); + } + else { + gbwt::Verbosity::set(gbwt::Verbosity::SILENT); + } + return make_gbwt(inputs, true, plan, constructing); + }); + + registry.register_recipe({"Spliced GBWT"}, {"Chunked VCF w/ Phasing", "Spliced VG w/ Variant Paths"}, + [make_gbwt](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + cerr << "[IndexRegistry]: Constructing GBWT from spliced VG graph and phased VCF input." << endl; + gbwt::Verbosity::set(gbwt::Verbosity::BASIC); + } + else { + gbwt::Verbosity::set(gbwt::Verbosity::SILENT); + } + // TODO: If we include named paths here, we then generate + // haplotype-specific transcripts following them when making the + // "Haplotype-Transcript GBWT". It's not clear that that's correct, and + // the "Spliced GBWT" never feeds into a GBZ, so we leave them out for + // now. 
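+ // The second argument is include_named_paths, so passing false keeps the graph's embedded paths out of the spliced GBWT.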
+ return make_gbwt(inputs, false, plan, constructing); + }); + + // Giraffe will prefer to use a downsampled haplotype GBWT if possible + registry.register_recipe({"Giraffe GBWT"}, {"GBWT", "XG"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + cerr << "[IndexRegistry]: Downsampling full GBWT." << endl; + } + + assert(inputs.size() == 2); + auto gbwt_filenames = inputs[0]->get_filenames(); + auto xg_filenames = inputs[1]->get_filenames(); + assert(gbwt_filenames.size() == 1); + assert(xg_filenames.size() == 1); + auto gbwt_filename = gbwt_filenames.front(); + auto xg_filename = xg_filenames.front(); + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto& output_names = all_outputs[0]; + auto sampled_gbwt_output = *constructing.begin(); + + ifstream infile_xg, infile_gbwt; + init_in(infile_xg, xg_filename); + init_in(infile_gbwt, gbwt_filename); + + auto output_name = plan->output_filepath(sampled_gbwt_output); + ofstream outfile_sampled_gbwt; + init_out(outfile_sampled_gbwt, output_name); + + auto xg_index = vg::io::VPKG::load_one(infile_xg); + auto gbwt_index = vg::io::VPKG::load_one(infile_gbwt); + + // Downsample only if it would reduce the number of haplotypes sufficiently. + size_t threshold = IndexingParameters::giraffe_gbwt_downsample * IndexingParameters::downsample_threshold; + bool downsample = (gbwt_index->hasMetadata() && gbwt_index->metadata.haplotypes() >= threshold); + + + int code; + if (downsample) { + // Downsample the haplotypes and generate a path cover of components without haplotypes. + + // We need to drop paths that are alt allele paths and not pass them + // through from a graph that has them to the synthesized GBWT. + std::function path_filter = [&xg_index](const path_handle_t& path) { + return !Paths::is_alt(xg_index->get_path_name(path)); + }; + + // clang wants this one cast to function first for some reason? + function exec = [&]() { + gbwt::GBWT cover = gbwtgraph::local_haplotypes(*xg_index, *gbwt_index, + IndexingParameters::giraffe_gbwt_downsample, + IndexingParameters::downsample_context_length, + IndexingParameters::gbwt_insert_batch_size, + IndexingParameters::gbwt_sampling_interval, + true, // Also include named paths from the graph + &path_filter, + IndexingParameters::verbosity >= IndexingParameters::Debug); + save_gbwt(cover, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); + }; + code = execute_in_fork(exec); + } + else { + // Augment the GBWT with a path cover of components without haplotypes. + if (IndexingParameters::verbosity != IndexingParameters::None) { + cerr << "[IndexRegistry]: Not enough haplotypes; augmenting the full GBWT instead." 
<< endl; + } + + code = execute_in_fork([&]() { + gbwt::DynamicGBWT dynamic_index(*gbwt_index); + gbwt_index.reset(); + gbwtgraph::augment_gbwt(*xg_index, dynamic_index, + IndexingParameters::path_cover_depth, + IndexingParameters::downsample_context_length, + IndexingParameters::gbwt_insert_batch_size, + IndexingParameters::gbwt_sampling_interval, + IndexingParameters::verbosity >= IndexingParameters::Debug); + gbwt::GBWT cover = gbwt::GBWT(dynamic_index); + save_gbwt(cover, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); + }); + } + + if (code != 0) { + IndexingParameters::gbwt_insert_batch_size *= IndexingParameters::gbwt_insert_batch_size_increase_factor; + throw RewindPlanException("[IndexRegistry]: Exceeded GBWT insert buffer size, expanding and reattempting.", {"Giraffe GBWT"}); + } + + output_names.push_back(output_name); + return all_outputs; + }); + + // do a greedy haplotype cover if we don't have haplotypes + registry.register_recipe({"Giraffe GBWT"}, {"XG"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + + if (IndexingParameters::verbosity != IndexingParameters::None) { + cerr << "[IndexRegistry]: Constructing a greedy path cover GBWT" << endl; + } + + assert(inputs.size() == 1); + auto xg_filenames = inputs[0]->get_filenames(); + assert(xg_filenames.size() == 1); + auto xg_filename = xg_filenames.front(); + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto& output_names = all_outputs[0]; + auto path_cover_output = *constructing.begin(); + + ifstream infile_xg; + init_in(infile_xg, xg_filename); + + auto output_name = plan->output_filepath(path_cover_output); + ofstream outfile_path_cover_gbwt; + init_out(outfile_path_cover_gbwt, output_name); + + auto xg_index = vg::io::VPKG::load_one(infile_xg); + + auto comp_sizes = algorithms::component_sizes(*xg_index); + size_t max_comp_size = *max_element(comp_sizes.begin(), comp_sizes.end()); + + // We need to drop paths that are alt allele paths and not pass them + // through from a graph that has them to the synthesized GBWT. 
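+ // The filter returns true for paths that should be kept, so alt allele paths are excluded from the generated cover.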
+ std::function path_filter = [&xg_index](const path_handle_t& path) { + return !Paths::is_alt(xg_index->get_path_name(path)); + }; + + // make a GBWT from a greedy path cover + int code = execute_in_fork([&]() { + gbwt::GBWT cover = gbwtgraph::path_cover_gbwt(*xg_index, + IndexingParameters::path_cover_depth, + IndexingParameters::downsample_context_length, + std::max(IndexingParameters::gbwt_insert_batch_size, 20 * max_comp_size), // buffer size recommendation from Jouni + IndexingParameters::gbwt_sampling_interval, + true, // Also include named paths from the graph + &path_filter, + IndexingParameters::verbosity >= IndexingParameters::Debug); + + save_gbwt(cover, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); + }); + + if (code != 0) { + IndexingParameters::gbwt_insert_batch_size *= IndexingParameters::gbwt_insert_batch_size_increase_factor; + throw RewindPlanException("[IndexRegistry]: Exceeded GBWT insert buffer size, expanding and reattempting.", {"Giraffe GBWT"}); + } + + output_names.push_back(output_name); + return all_outputs; + }); + + // meta-recipe to either add transcripts paths or also make HST collections + auto do_vg_rna = [merge_gbwts](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + + assert(constructing.size() == 3 || constructing.size() == 1); + bool making_hsts = constructing.size() == 3; + + if (IndexingParameters::verbosity != IndexingParameters::None) { + if (making_hsts) { + cerr << "[IndexRegistry]: Constructing haplotype-transcript GBWT and finishing spliced VG." << endl; + } + else { + cerr << "[IndexRegistry]: Finishing spliced VG." << endl; + } + if (IndexingParameters::verbosity >= IndexingParameters::Debug) { + gbwt::Verbosity::set(gbwt::Verbosity::BASIC); + } + } + else { + gbwt::Verbosity::set(gbwt::Verbosity::SILENT); + } + vector> all_outputs(constructing.size()); + IndexName output_haplo_tx, output_tx_table, output_tx_graph; + { + int i = 0; + for (auto output_index : constructing) { + if (i == 0 && making_hsts) { + output_haplo_tx = output_index; + } + else if (i == 0 && !making_hsts) { + output_tx_graph = output_index; + } + else if (i == 1) { + output_tx_graph = output_index; + } + else { + output_tx_table = output_index; + } + ++i; + } + } + //auto& haplo_tx_gbwt_names = all_outputs[0]; + auto& tx_graph_names = all_outputs[making_hsts ? 1 : 0]; + //auto& tx_table_names = all_outputs[2]; + + vector tx_filenames, gbwt_filenames, graph_filenames; + string gbwt_filename; + unique_ptr haplotype_index; + vector gbwt_chunk_names; + if (making_hsts) { + tx_filenames = inputs[0]->get_filenames(); + auto gbwt_filenames = inputs[1]->get_filenames(); + graph_filenames = inputs[2]->get_filenames(); + + assert(gbwt_filenames.size() == 1); + gbwt_filename = gbwt_filenames.front(); + + haplotype_index = vg::io::VPKG::load_one(gbwt_filename); + + // TODO: i can't find where in the building code you actually ensure this... 
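+ // The transcriptome code assumes the haplotype GBWT is bidirectional, so fail fast here rather than crash later.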
+ assert(haplotype_index->bidirectional()); + + // the HST tables + all_outputs[2].resize(graph_filenames.size()); + + gbwt_chunk_names.resize(graph_filenames.size()); + } + else { + tx_filenames = inputs[0]->get_filenames(); + graph_filenames = inputs[1]->get_filenames(); + } + + tx_graph_names.resize(graph_filenames.size()); + + auto haplo_tx_job = [&](int64_t i) { + + string tx_graph_name = plan->output_filepath(output_tx_graph, i, graph_filenames.size()); + ofstream tx_graph_outfile; + init_out(tx_graph_outfile, tx_graph_name); + + string gbwt_name, info_table_name; + if (making_hsts) { + if (graph_filenames.size() != 1) { + // multiple components, so make a temp file that we will merge later + gbwt_name = temp_file::create(); + } + else { + // one component, so we will actually save the output + gbwt_name = plan->output_filepath(output_haplo_tx, i, graph_filenames.size()); + } + } + + int64_t j = tx_filenames.size() > 1 ? i : 0; + + ifstream infile_graph, infile_tx; + init_in(infile_graph, graph_filenames[i]); + init_in(infile_tx, tx_filenames[j]); + + // are we using 1 transcript file for multiple graphs? + bool broadcasting_txs = (graph_filenames.size() != tx_filenames.size()); + + unique_ptr graph + = vg::io::VPKG::load_one(infile_graph); + + vector path_names; + if (broadcasting_txs) { + // get the path names in case we need to report them later for debug output + graph->for_each_path_handle([&](const path_handle_t& path) { + path_names.push_back(graph->get_path_name(path)); + }); + } + + Transcriptome transcriptome(move(graph)); + transcriptome.error_on_missing_path = !broadcasting_txs; + transcriptome.feature_type = IndexingParameters::gff_feature_name; + transcriptome.transcript_tag = IndexingParameters::gff_transcript_tag; + + // load up the transcripts and add edges on the reference path + size_t transcripts_added = transcriptome.add_reference_transcripts(vector({&infile_tx}), haplotype_index, false, true); + + if (broadcasting_txs && !path_names.empty() && transcripts_added == 0 + && transcript_file_nonempty(tx_filenames[j])) { + cerr << "warning:[IndexRegistry] no matching paths from transcript file " << tx_filenames[j] << " were found in graph chunk containing the following paths:" << endl; + for (const string& path_name : path_names) { + cerr << "\t" << path_name << endl; + } + } + + if (making_hsts) { + + // go back to the beginning of the transcripts + infile_tx.clear(); + infile_tx.seekg(0); + + // add edges on other haplotypes + size_t num_transcripts_projected = transcriptome.add_haplotype_transcripts(vector({&infile_tx}), *haplotype_index, false); + + // init the haplotype transcript GBWT + size_t node_width = gbwt::bit_length(gbwt::Node::encode(transcriptome.graph().max_node_id(), true)); + int code = execute_in_fork([&]() { + gbwt::GBWTBuilder gbwt_builder(node_width, + IndexingParameters::gbwt_insert_batch_size, + IndexingParameters::gbwt_sampling_interval); + // actually build it + transcriptome.add_transcripts_to_gbwt(&gbwt_builder, IndexingParameters::bidirectional_haplo_tx_gbwt, false); + + // save the haplotype transcript GBWT + gbwt_builder.finish(); + save_gbwt(gbwt_builder.index, gbwt_name, IndexingParameters::verbosity == IndexingParameters::Debug); + }); + + if (code != 0) { + IndexingParameters::gbwt_insert_batch_size *= IndexingParameters::gbwt_insert_batch_size_increase_factor; + throw RewindPlanException("[IndexRegistry]: Exceeded GBWT insert buffer size, expanding and reattempting.", + {"Haplotype-Transcript GBWT"}); + } + + // write transcript 
origin info table + info_table_name = plan->output_filepath(output_tx_table, i, graph_filenames.size()); + ofstream info_outfile; + init_out(info_outfile, info_table_name); + transcriptome.write_transcript_info(&info_outfile, *haplotype_index, false); + } + + // save the graph with the transcript paths added + transcriptome.write_graph(&tx_graph_outfile); + + tx_graph_names[i] = tx_graph_name; + + if (making_hsts) { + gbwt_chunk_names[i] = gbwt_name; + all_outputs[2][i] = info_table_name; + } + }; + + // we'll hold the gbwt in memory, so take it out of our memory budget + int64_t target_memory_usage = plan->target_memory_usage(); + if (making_hsts) { + target_memory_usage = max(0, target_memory_usage - get_file_size(gbwt_filename)); + } + + vector> approx_job_requirements; + for (int64_t i = 0; i < graph_filenames.size(); ++i) { + // FIXME: this should also include the approximate memory of the haplotype transcript + approx_job_requirements.emplace_back(get_file_size(graph_filenames[i]), + approx_graph_load_memory(graph_filenames[i])); + } + + JobSchedule schedule(approx_job_requirements, haplo_tx_job); + schedule.execute(target_memory_usage); + + if (making_hsts) { + // merge the GBWT chunks + all_outputs[0].push_back(merge_gbwts(gbwt_chunk_names, plan, output_haplo_tx)); + } + + return all_outputs; + }; + + // TODO: somewhat repetitive with non-GBZ pipeline, but also some notable differences... + auto gbz_vg_rna = [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + + assert(constructing.size() == 4 || constructing.size() == 2); + bool making_hsts = constructing.size() == 4; + assert(inputs.size() == 2 || inputs.size() == 3); + bool projecting_transcripts = (inputs.size() == 2); + if (IndexingParameters::verbosity != IndexingParameters::None) { + if (making_hsts) { + cerr << "[IndexRegistry]: Constructing haplotype-transcript GBWT and spliced graph from GBZ-format graph." << endl; + } + else { + cerr << "[IndexRegistry]: Adding splice junctions to GBZ-format graph." << endl; + } + if (IndexingParameters::verbosity >= IndexingParameters::Debug) { + gbwt::Verbosity::set(gbwt::Verbosity::BASIC); + } + } + else { + gbwt::Verbosity::set(gbwt::Verbosity::SILENT); + } + + vector> all_outputs(constructing.size()); + IndexName output_haplo_tx, output_tx_table, output_tx_graph, output_max_id; + if (making_hsts) { + int i = 0; + for (auto output_index : constructing) { + if (i == 0) { + output_haplo_tx = output_index; + } + else if (i == 1) { + output_max_id = output_index; + } + else if (i == 2) { + output_tx_graph = output_index; + } + else { + output_tx_table = output_index; + } + ++i; + } + } + else { + output_max_id = *constructing.begin(); + output_tx_graph = *constructing.rbegin(); + } + + //auto& haplo_tx_gbwt_names = all_outputs[0]; + auto& max_id_names = all_outputs[making_hsts ? 1 : 0]; + auto& tx_graph_names = all_outputs[making_hsts ? 
2 : 1]; + //auto& tx_table_names = all_outputs[2]; + + auto gbz_filenames = inputs[0]->get_filenames(); + auto tx_filenames = inputs[1]->get_filenames(); + vector haplo_tx_filenames; + if (!projecting_transcripts) { + haplo_tx_filenames = inputs[2]->get_filenames(); + } + assert(gbz_filenames.size() == 1); + auto gbz_filename = gbz_filenames.front(); + + vector infiles_tx, infiles_haplo_tx; + for (auto& tx_filename : tx_filenames) { + infiles_tx.emplace_back(); + init_in(infiles_tx.back(), tx_filename); + } + for (auto& haplo_tx_filename : haplo_tx_filenames) { + infiles_haplo_tx.emplace_back(); + init_in(infiles_haplo_tx.back(), haplo_tx_filename); + } + + string max_id_name = plan->output_filepath(output_max_id); + ofstream max_id_outfile; + init_out(max_id_outfile, max_id_name); + + string tx_graph_name = plan->output_filepath(output_tx_graph); + ofstream tx_graph_outfile; + init_out(tx_graph_outfile, tx_graph_name); + + // load, convert, and discard the GBZ + unique_ptr haplotype_index; + auto tx_graph = init_mutable_graph(); + { + unique_ptr gbz = vg::io::VPKG::load_one(gbz_filename); + // copy topology + handlealgs::copy_handle_graph(&(gbz->graph), tx_graph.get()); + // copy ref paths + gbz->graph.for_each_path_matching({PathSense::GENERIC, PathSense::REFERENCE}, {}, {}, [&](const path_handle_t& path) { + handlegraph::algorithms::copy_path(&(gbz->graph), path, tx_graph.get()); + }); + // copy the gbwt + haplotype_index = make_unique(gbz->index); + } + + // hand over the graph + Transcriptome transcriptome(move(tx_graph)); + transcriptome.error_on_missing_path = true; + transcriptome.feature_type = IndexingParameters::gff_feature_name; + transcriptome.transcript_tag = IndexingParameters::gff_transcript_tag; + + // gather the GTF file pointers + vector tx_file_ptrs; + for (auto& tx_file : infiles_tx) { + tx_file_ptrs.push_back(&tx_file); + } + for (auto& tx_file : infiles_haplo_tx) { + tx_file_ptrs.push_back(&tx_file); + } + + if (IndexingParameters::verbosity >= IndexingParameters::Debug) { + gbwt::Verbosity::set(gbwt::Verbosity::BASIC); + } + else { + gbwt::Verbosity::set(gbwt::Verbosity::SILENT); + } + + // add the splice edges and ref transcript paths + size_t transcripts_added = transcriptome.add_reference_transcripts(tx_file_ptrs, haplotype_index, + !projecting_transcripts, // use haplotypes as refs for GTF? + projecting_transcripts); // update the GBWT for the new node IDs? 
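+ // Adding splice junctions can split nodes and create new node IDs; if we are going to project transcripts onto the haplotypes, the GBWT has to be updated to those IDs so it can still be followed.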
+ + if (making_hsts) { + auto& gbwt_names = all_outputs.front(); + auto& info_table_names = all_outputs.back(); + + string gbwt_name = plan->output_filepath(output_haplo_tx); + + // add edges on other haplotypes + if (projecting_transcripts) { + + // go back to the beginning of the transcripts + for (auto& tx_file : infiles_tx) { + tx_file.clear(); + tx_file.seekg(0); + } + for (auto& tx_file : infiles_haplo_tx) { + tx_file.clear(); + tx_file.seekg(0); + } + + transcriptome.add_haplotype_transcripts(tx_file_ptrs, *haplotype_index, false); + } + + // init the haplotype transcript GBWT + size_t node_width = gbwt::bit_length(gbwt::Node::encode(transcriptome.graph().max_node_id(), true)); + int code = execute_in_fork([&]() { + gbwt::GBWTBuilder gbwt_builder(node_width, + IndexingParameters::gbwt_insert_batch_size, + IndexingParameters::gbwt_sampling_interval); + // actually build it + transcriptome.add_transcripts_to_gbwt(&gbwt_builder, IndexingParameters::bidirectional_haplo_tx_gbwt, false); + + // save the haplotype transcript GBWT + gbwt_builder.finish(); + save_gbwt(gbwt_builder.index, gbwt_name, IndexingParameters::verbosity == IndexingParameters::Debug); + }); + + if (code != 0) { + IndexingParameters::gbwt_insert_batch_size *= IndexingParameters::gbwt_insert_batch_size_increase_factor; + throw RewindPlanException("[IndexRegistry]: Exceeded GBWT insert buffer size, expanding and reattempting.", + {"Haplotype-Transcript GBWT"}); + } + + // write transcript origin info table + string info_table_name = plan->output_filepath(output_tx_table); + ofstream info_outfile; + init_out(info_outfile, info_table_name); + transcriptome.write_transcript_info(&info_outfile, *haplotype_index, false); + + gbwt_names.push_back(gbwt_name); + info_table_names.push_back(info_table_name); + } + + + // save the graph with the transcript paths added + transcriptome.write_graph(&tx_graph_outfile); + tx_graph_names.push_back(tx_graph_name); + + // write the max ID as well + max_id_outfile << transcriptome.graph().max_node_id(); + max_id_names.push_back(max_id_name); + + return all_outputs; + }; + + + auto vg_rna_gbz_graph_only = + registry.register_recipe({"Spliced MaxNodeID", "Spliced VG w/ Transcript Paths"}, {"GBZ", "GTF/GFF", "Haplotype GTF/GFF"}, + [gbz_vg_rna](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + return gbz_vg_rna(inputs, plan, alias_graph, constructing); + }); + + auto vg_rna_gbz_liftover_graph_only = + registry.register_recipe({"Spliced MaxNodeID", "Spliced VG w/ Transcript Paths"}, {"GBZ", "GTF/GFF"}, + [gbz_vg_rna](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + return gbz_vg_rna(inputs, plan, alias_graph, constructing); + }); + + auto vg_rna_gbz_full = + registry.register_recipe({"Haplotype-Transcript GBWT", "Spliced MaxNodeID", "Spliced VG w/ Transcript Paths", "Unjoined Transcript Origin Table"}, + {"GBZ", "GTF/GFF", "Haplotype GTF/GFF"}, + [gbz_vg_rna](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + return gbz_vg_rna(inputs, plan, alias_graph, constructing); + }); + + auto vg_rna_gbz_liftover_full = + registry.register_recipe({"Haplotype-Transcript GBWT", "Spliced MaxNodeID", "Spliced VG w/ Transcript Paths", "Unjoined Transcript Origin Table"}, + {"GBZ", "GTF/GFF"}, + [gbz_vg_rna](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { 
+ return gbz_vg_rna(inputs, plan, alias_graph, constructing); + }); + + auto vg_rna_graph_only = + registry.register_recipe({"Spliced VG w/ Transcript Paths"}, + {"Chunked GTF/GFF", "Spliced VG"}, + [do_vg_rna](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + + return do_vg_rna(inputs, plan, alias_graph, constructing); + }); + + auto vg_rna_full = + registry.register_recipe({"Haplotype-Transcript GBWT", "Spliced VG w/ Transcript Paths", "Unjoined Transcript Origin Table"}, + {"Chunked GTF/GFF", "Spliced GBWT", "Spliced VG"}, + [do_vg_rna](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + + return do_vg_rna(inputs, plan, alias_graph, constructing); + }); + + // if both the full and graph-only are required, only do the full + registry.register_generalization(vg_rna_full, vg_rna_graph_only); + registry.register_generalization(vg_rna_gbz_full, vg_rna_gbz_graph_only); + registry.register_generalization(vg_rna_gbz_liftover_full, vg_rna_gbz_liftover_graph_only); + + //////////////////////////////////// + // Info Table Recipes + //////////////////////////////////// + + registry.register_recipe({"Transcript Origin Table"}, {"Unjoined Transcript Origin Table"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + + if (IndexingParameters::verbosity != IndexingParameters::None) { + cerr << "[IndexRegistry]: Joining transcript origin table." << endl; + } + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto output_index = *constructing.begin(); + + assert(inputs.size() == 1); + auto input_table_names = inputs[0]->get_filenames(); + + if (input_table_names.size() == 1) { + alias_graph.register_alias(output_index, inputs[0]); + all_outputs[0] = input_table_names; + return all_outputs; + } + string output_name = plan->output_filepath(output_index); + + ofstream outfile; + init_out(outfile, output_name); + // join the tables into one + for (size_t i = 0; i < inputs[0]->get_filenames().size(); ++i) { + ifstream infile(inputs[0]->get_filenames()[i]); + string line; + if (i != 0) { + // skip the header + getline(infile, line); + } + while (infile.good()) { + getline(infile, line); + if (!line.empty()) { + outfile << line << '\n'; + } + } + } + all_outputs[0].push_back(output_name); + return all_outputs; + }); + + //////////////////////////////////// + // Pruned VG Recipes + //////////////////////////////////// + +#ifdef debug_index_registry_setup + cerr << "registering pruning recipes" << endl; +#endif + + // meta-recipe for pruning with/without GBWT + auto prune_graph = [](const vector& inputs, + const IndexingPlan* plan, + const IndexGroup& constructing) { + + // we only want to focus on two specific recipes + assert(inputs.size() == 2 || inputs.size() == 3); + bool using_haplotypes = inputs.size() == 3; + + vector graph_names; + string gbwt_name, max_node_name; + { + size_t i = 0; + if (using_haplotypes) { + auto gbwt_names = inputs[i++]->get_filenames(); + assert(gbwt_names.size() == 1); + gbwt_name = gbwt_names.front(); + } + auto max_node_id_names = inputs[i++]->get_filenames(); + assert(max_node_id_names.size() == 1); + max_node_name = max_node_id_names.front(); + graph_names = inputs[i++]->get_filenames(); + } + + if (using_haplotypes) { + assert(constructing.size() == 2); + } + else { + assert(constructing.size() == 1); + } + + vector> all_outputs(constructing.size()); + 
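+ // the first output is always the pruned graph; when unfolding with a GBWT there is a second output for the node mapping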
auto& pruned_graph_names = all_outputs[0]; + auto output_pruned_graph = *constructing.begin(); + + // test streams for I/O + ifstream infile_gbwt, infile_max_id; + init_in(infile_max_id, max_node_name); + unique_ptr gbwt_index; + if (using_haplotypes) { + init_in(infile_gbwt, gbwt_name); + gbwt_index = vg::io::VPKG::load_one(infile_gbwt); + } + + // read the max node ID (across all chunks) + nid_t max_node_id; + infile_max_id >> max_node_id; + + string mapping_name; + if (using_haplotypes) { + + auto output_mapping = *constructing.rbegin(); + + gcsa::NodeMapping mapping(max_node_id + 1); + + mapping_name = plan->output_filepath(output_mapping); + + ofstream mapping_file; + init_out(mapping_file, mapping_name); + mapping.serialize(mapping_file); + + all_outputs[1].push_back(mapping_name); + } + + pruned_graph_names.resize(graph_names.size()); + + mutex unfold_lock; + + auto prune_job = [&](int64_t i) { + ifstream infile_vg; + init_in(infile_vg, graph_names[i]); + + string vg_output_name = plan->output_filepath(output_pruned_graph, i, graph_names.size()); + + ofstream outfile_vg; + init_out(outfile_vg, vg_output_name); + + unique_ptr graph + = vg::io::VPKG::load_one(infile_vg); + + // destroy all paths, which might be made inconsistent + vector paths; + paths.reserve(graph->get_path_count()); + graph->for_each_path_handle([&](const path_handle_t& path) { + paths.push_back(path); + }); + for (auto path : paths) { + graph->destroy_path(path); + } + + // prune the graph based on topology + size_t removed_high_degree, removed_complex, removed_subgraph; + if (IndexingParameters::pruning_max_node_degree != 0) { + removed_high_degree = algorithms::remove_high_degree_nodes(*graph, IndexingParameters::pruning_max_node_degree); + } + removed_complex = algorithms::prune_complex_with_head_tail(*graph, IndexingParameters::pruning_walk_length, + IndexingParameters::pruning_max_edge_count); + removed_subgraph = algorithms::prune_short_subgraphs(*graph, IndexingParameters::pruning_min_component_size); + + + if ((removed_high_degree != 0 || removed_complex != 0 || removed_subgraph != 0) + && (!paths.empty() || using_haplotypes)) { + // we've removed from this graph but there are paths/threads we could use + // to restore the graph + + // TODO: in a single component graph, it would be more efficient to load + // an XG rather than the mutable graph for this step + ifstream infile_unpruned_vg; + init_in(infile_unpruned_vg, graph_names[i]); + + unique_ptr unpruned_graph + = vg::io::VPKG::load_one(infile_unpruned_vg); + + if (!using_haplotypes) { + // we can bring back edges on embedded paths + + // Make an empty GBWT index to pass along + gbwt::GBWT empty_gbwt; + PhaseUnfolder unfolder(*unpruned_graph, empty_gbwt, max_node_id + 1); + unfolder.restore_paths(*graph, IndexingParameters::verbosity >= IndexingParameters::Debug); + } + else { + // we can expand out complex regions using haplotypes as well as paths + + // TODO: can't do this fully in parallel because each chunk needs to modify + // the same mapping + // TODO: it's a bit inelegant that i keep overwriting the mapping... 
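+ // Holding unfold_lock across read-unfold-write keeps the shared node mapping file consistent, at the cost of serializing the unfold step across chunks.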
+ PhaseUnfolder unfolder(*unpruned_graph, *gbwt_index, max_node_id + 1); + unfold_lock.lock(); + unfolder.read_mapping(mapping_name); + unfolder.unfold(*graph, IndexingParameters::verbosity >= IndexingParameters::Debug); + unfolder.write_mapping(mapping_name); + unfold_lock.unlock(); + } + } + + vg::io::save_handle_graph(graph.get(), outfile_vg); + + pruned_graph_names[i] = vg_output_name; + }; + + int64_t target_memory_usage = plan->target_memory_usage(); + + if (using_haplotypes) { + // we only need to load the GBWT once, so we take it out of the shared budget + target_memory_usage -= get_file_size(gbwt_name); + } + vector> approx_job_requirements; + for (int64_t i = 0; i < graph_names.size(); ++i) { + // for paths, double the memory because we'll probably need to re-load the graph to restore paths + approx_job_requirements.emplace_back(get_file_size(graph_names[i]), + (using_haplotypes ? 1 : 2) * approx_graph_load_memory(graph_names[i])); + } + + JobSchedule schedule(approx_job_requirements, prune_job); + schedule.execute(target_memory_usage); + + return all_outputs; + }; + + registry.register_recipe({"Pruned VG"}, {"MaxNodeID", "VG"}, + [prune_graph](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + cerr << "[IndexRegistry]: Pruning complex regions of VG to prepare for GCSA indexing." << endl; + } + // call the meta-recipe + return prune_graph(inputs, plan, constructing); + }); + + registry.register_recipe({"Haplotype-Pruned VG", "Unfolded NodeMapping"}, {"GBWT", "MaxNodeID", "VG"}, + [prune_graph](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + cerr << "[IndexRegistry]: Pruning complex regions of VG to prepare for GCSA indexing with GBWT unfolding." << endl; + } + // call the meta-recipe + return prune_graph(inputs, plan, constructing); + }); + + registry.register_recipe({"Pruned Spliced VG"}, {"Spliced MaxNodeID", "Spliced VG w/ Transcript Paths"}, + [prune_graph](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + cerr << "[IndexRegistry]: Pruning complex regions of spliced VG to prepare for GCSA indexing." << endl; + } + // call the meta-recipe + return prune_graph(inputs, plan, constructing); + }); + + // TODO: would it be better to use the Haplotype-Transcript GBWT, or maybe to join them? + // the splice edges will be covered by the transcript paths, so it won't be too bad + registry.register_recipe({"Haplotype-Pruned Spliced VG", "Unfolded Spliced NodeMapping"}, + {"Spliced GBWT", "Spliced MaxNodeID", "Spliced VG w/ Transcript Paths"}, + [prune_graph](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + cerr << "[IndexRegistry]: Pruning complex regions of spliced VG to prepare for GCSA indexing with GBWT unfolding." 
<< endl; + } + // call the meta-recipe + return prune_graph(inputs, plan, constructing); + }); + + //////////////////////////////////// + // GCSA + LCP Recipes + //////////////////////////////////// + +#ifdef debug_index_registry_setup + cerr << "registering GCSA recipes" << endl; +#endif + + // meta-recipe for GCSA indexing with or without unfolded input + auto construct_gcsa = [](const vector& inputs, + const IndexingPlan* plan, + const IndexGroup& constructing) { + + if (IndexingParameters::verbosity != IndexingParameters::None) { + cerr << "[IndexRegistry]: Constructing GCSA/LCP indexes." << endl; + } + + assert(inputs.size() == 1 || inputs.size() == 2); + bool unfolded = inputs.size() == 2; + auto graph_filenames = inputs[0]->get_filenames(); + string mapping_filename; + if (unfolded) { + auto mapping_filenames = inputs[1]->get_filenames(); + assert(mapping_filenames.size() == 1); + mapping_filename = mapping_filenames.front(); + } + + assert(constructing.size() == 2); + vector> all_outputs(constructing.size()); + auto& gcsa_names = all_outputs[0]; + auto& lcp_names = all_outputs[1]; + + auto output_gcsa = *constructing.begin(); + auto output_lcp = *constructing.rbegin(); + + // test streams for I/O + ifstream infile_mapping; + if (unfolded) { + init_in(infile_mapping, mapping_filename); + } + string gcsa_output_name = plan->output_filepath(output_gcsa); + string lcp_output_name = plan->output_filepath(output_lcp); + ofstream outfile_gcsa; + ofstream outfile_lcp; + init_out(outfile_gcsa, gcsa_output_name); + init_out(outfile_lcp, lcp_output_name); + + if (IndexingParameters::verbosity >= IndexingParameters::Debug) { + gcsa::Verbosity::set(gcsa::Verbosity::BASIC); + } + else { + gcsa::Verbosity::set(gcsa::Verbosity::SILENT); + } + auto params = gcsa::ConstructionParameters(); + params.setSteps(IndexingParameters::gcsa_doubling_steps); + params.setLimitBytes(IndexingParameters::gcsa_size_limit); + +#ifdef debug_index_registry_recipes + cerr << "enumerating k-mers for input pruned graphs:" << endl; + for (auto& name : graph_filenames) { + cerr << "\t" << name << endl; + } +#endif + // if indexing fails, we'll rewind to whichever of these we used + IndexGroup pruned_graphs{"Pruned VG", "Pruned Spliced VG", "Haplotype-Pruned VG", "Haplotype-Pruned Spliced VG"}; + + VGset graph_set(graph_filenames); + size_t kmer_bytes = params.getLimitBytes(); + vector dbg_names; + try { + dbg_names = graph_set.write_gcsa_kmers_binary(IndexingParameters::gcsa_initial_kmer_length, kmer_bytes); + } + catch (SizeLimitExceededException& ex) { + // update pruning params + IndexingParameters::pruning_walk_length *= IndexingParameters::pruning_walk_length_increase_factor; + IndexingParameters::pruning_max_node_degree *= IndexingParameters::pruning_max_node_degree_decrease_factor; + string msg = "[IndexRegistry]: Exceeded disk use limit while generating k-mers. " + "Rewinding to pruning step with more aggressive pruning to simplify the graph."; + throw RewindPlanException(msg, pruned_graphs); + } + + // it seems to only keep the lowest 8 bits of the exit code? this is hack-y, but it gives us the correct + // code to compare to... 
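+ // Run a child that exits with the known size-limit code and record what execute_in_fork reports for it; comparing against that observed value sidesteps the 8-bit truncation of exit statuses.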
+ int size_code = execute_in_fork([](){ exit(gcsa::EXIT_SIZE_LIMIT_EXCEEDED); }); + + int code = execute_in_fork([&]() { +#ifdef debug_index_registry_recipes + cerr << "making GCSA2 at " << gcsa_output_name << " and " << lcp_output_name << " after writing de Bruijn graph files to:" << endl; + for (auto dbg_name : dbg_names) { + cerr << "\t" << dbg_name << endl; + } +#endif + + // construct the indexes (giving empty mapping name is sufficient to make + // indexing skip the unfolded code path) + gcsa::InputGraph input_graph(dbg_names, true, gcsa::Alphabet(), + mapping_filename); + gcsa::GCSA gcsa_index(input_graph, params); + gcsa::LCPArray lcp_array(input_graph, params); + +#ifdef debug_index_registry_recipes + cerr << "saving GCSA/LCP pair" << endl; +#endif + + save_gcsa(gcsa_index, gcsa_output_name, IndexingParameters::verbosity == IndexingParameters::Debug); + save_lcp(lcp_array, lcp_output_name, IndexingParameters::verbosity == IndexingParameters::Debug); + }); + + // clean up the k-mer files + for (auto dbg_name : dbg_names) { + temp_file::remove(dbg_name); + } + + if (code == size_code) { + // the indexing was not successful, presumably because of exponential disk explosion + + // update pruning params + IndexingParameters::pruning_walk_length *= IndexingParameters::pruning_walk_length_increase_factor; + IndexingParameters::pruning_max_node_degree *= IndexingParameters::pruning_max_node_degree_decrease_factor; + string msg = "[IndexRegistry]: Exceeded disk use limit while performing k-mer doubling steps. " + "Rewinding to pruning step with more aggressive pruning to simplify the graph."; + throw RewindPlanException(msg, pruned_graphs); + } + else if (code != 0) { + cerr << "[IndexRegistry]: Unrecoverable error in GCSA2 indexing." << endl; + exit(code); + } + + gcsa_names.push_back(gcsa_output_name); + lcp_names.push_back(lcp_output_name); + return all_outputs; + }; + + registry.register_recipe({"GCSA", "LCP"}, {"Haplotype-Pruned VG", "Unfolded NodeMapping"}, + [construct_gcsa](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + // execute meta recipe + return construct_gcsa(inputs, plan, constructing); + }); + + registry.register_recipe({"GCSA", "LCP"}, {"Pruned VG"}, + [construct_gcsa](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + // execute meta recipe + return construct_gcsa(inputs, plan, constructing); + }); + + registry.register_recipe({"Spliced GCSA", "Spliced LCP"}, {"Haplotype-Pruned Spliced VG", "Unfolded Spliced NodeMapping"}, + [construct_gcsa](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + // execute meta recipe + return construct_gcsa(inputs, plan, constructing); + }); + + registry.register_recipe({"Spliced GCSA", "Spliced LCP"}, {"Pruned Spliced VG"}, + [construct_gcsa](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + // execute meta recipe + return construct_gcsa(inputs, plan, constructing); + }); + + //////////////////////////////////// + // Snarls Recipes + //////////////////////////////////// + + // meta-recipe to find snarls + auto find_snarls = [](const HandleGraph& graph, + const IndexingPlan* plan, + const IndexGroup& constructing) { + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto output_snarls = *constructing.begin(); + auto& snarl_names = all_outputs[0]; + + 
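+ // the snarls will be written out as a buffered stream of Protobuf Snarl objects, with each parent snarl appearing before its children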
string output_name = plan->output_filepath(output_snarls); + ofstream outfile; + init_out(outfile, output_name); + + // find snarls + unique_ptr snarl_finder = unique_ptr(new IntegratedSnarlFinder(graph)); + SnarlManager snarl_manager = snarl_finder->find_snarls_parallel(); + + // traverse snarl tree and write them out + vector buffer; + for (auto root : snarl_manager.top_level_snarls()) { + vector stack(1, root); + while (!stack.empty()) { + const Snarl* snarl = stack.back(); + stack.pop_back(); + + buffer.push_back(*snarl); + vg::io::write_buffered(outfile, buffer, 1024); + + for (const Snarl* child_snarl : snarl_manager.children_of(snarl)) { + stack.push_back(child_snarl); + } + } + } + // flush + vg::io::write_buffered(outfile, buffer, 0); + + snarl_names.push_back(output_name); + return all_outputs; + }; + + + // TODO: disabling so that we can distinguish giraffe graphs that may have + // different node IDs from the GFA import +// registry.register_recipe({"Snarls"}, {"XG"}, +// [find_snarls](const vector& inputs, +// const IndexingPlan* plan, +// AliasGraph& alias_graph, +// const IndexGroup& constructing) { +// if (IndexingParameters::verbosity != IndexingParameters::None) { +// cerr << "[IndexRegistry]: Finding snarls in graph." << endl; +// } +// return find_snarls(inputs, plan, constructing); +// }); + + registry.register_recipe({"Spliced Snarls"}, {"Spliced XG"}, + [find_snarls](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + + if (IndexingParameters::verbosity != IndexingParameters::None) { + cerr << "[IndexRegistry]: Finding snarls in spliced graph." << endl; + } + + assert(inputs.size() == 1); + auto graph_filenames = inputs[0]->get_filenames(); + assert(graph_filenames.size() == 1); + auto graph_filename = graph_filenames.front(); + + ifstream infile; + init_in(infile, graph_filename); + unique_ptr graph = vg::io::VPKG::load_one(infile); + + return find_snarls(*graph, plan, constructing); + }); + + //////////////////////////////////// + // Distance Index Recipes + //////////////////////////////////// + + + // meta-recipe to make distance index + auto make_distance_index = [](const HandleGraph& graph, + const IndexingPlan* plan, + const IndexGroup& constructing) { + + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto dist_output = *constructing.begin(); + auto& output_names = all_outputs[0]; + + string output_name = plan->output_filepath(dist_output); + ofstream outfile; + init_out(outfile, output_name); + + SnarlDistanceIndex distance_index; + IntegratedSnarlFinder snarl_finder(graph); + fill_in_distance_index(&distance_index, &graph, &snarl_finder); + distance_index.serialize(output_name); + + output_names.push_back(output_name); + return all_outputs; + }; + + registry.register_recipe({"Giraffe Distance Index"}, {"Giraffe GBZ"}, + [make_distance_index](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + cerr << "[IndexRegistry]: Constructing distance index for Giraffe." 
<< endl; + } + + assert(inputs.size() == 1); + auto& gbz_filenames = inputs[0]->get_filenames(); + assert(gbz_filenames.size() == 1); + auto gbz_filename = gbz_filenames.front(); + + ifstream infile_gbz; + init_in(infile_gbz, gbz_filename); + unique_ptr gbz = vg::io::VPKG::load_one(infile_gbz); + + return make_distance_index(gbz->graph, plan, constructing); + }); + + registry.register_recipe({"Spliced Distance Index"}, {"Spliced XG"}, + [make_distance_index](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + cerr << "[IndexRegistry]: Constructing distance index for a spliced graph." << endl; + } + + assert(inputs.size() == 1); + auto& graph_filenames = inputs[0]->get_filenames(); + assert(graph_filenames.size() == 1); + auto graph_filename = graph_filenames.front(); + + ifstream infile_graph; + init_in(infile_graph, graph_filename); + + unique_ptr graph = vg::io::VPKG::load_one(infile_graph); + + return make_distance_index(*graph, plan, constructing); + }); + + //////////////////////////////////// + // GBZ Recipes + //////////////////////////////////// + + registry.register_recipe({"Giraffe GBZ"}, {"GBZ"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + alias_graph.register_alias(*constructing.begin(), inputs[0]); + return vector>(1, inputs.front()->get_filenames()); + }); + + registry.register_recipe({"GBZ"}, {"Reference GFA w/ Haplotypes"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + cerr << "[IndexRegistry]: Constructing a GBZ from GFA input." << endl; + } + + assert(inputs.size() == 1); + if (inputs[0]->get_filenames().size() != 1) { + cerr << "error:[IndexRegistry] Graph construction does not support multiple GFAs at this time." 
<< endl; + exit(1); + } + auto gfa_filename = inputs[0]->get_filenames().front(); + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto gbz_output = *constructing.begin(); + auto& output_names = all_outputs[0]; + + string output_name = plan->output_filepath(gbz_output); + + gbwtgraph::GFAParsingParameters params = get_best_gbwtgraph_gfa_parsing_parameters(); + // TODO: there's supposedly a heuristic to set batch size that could perform better than this global param, + // but it would be kind of a pain to update it like we do the global param + params.batch_size = IndexingParameters::gbwt_insert_batch_size; + params.sample_interval = IndexingParameters::gbwt_sampling_interval; + params.max_node_length = IndexingParameters::max_node_size; + params.show_progress = IndexingParameters::verbosity == IndexingParameters::Debug; + + int code = execute_in_fork([&]() { + + // jointly generate the GBWT and record sequences + unique_ptr gbwt_index; + unique_ptr seq_source; + tie(gbwt_index, seq_source) = gbwtgraph::gfa_to_gbwt(gfa_filename, params); + + // convert sequences into gbwt graph + gbwtgraph::GBWTGraph gbwt_graph(*gbwt_index, *seq_source); + + // save together as a GBZ + save_gbz(*gbwt_index, gbwt_graph, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); + }); + if (code != 0) { + IndexingParameters::gbwt_insert_batch_size *= IndexingParameters::gbwt_insert_batch_size_increase_factor; + throw RewindPlanException("[IndexRegistry]: Exceeded GBWT insert buffer size, expanding and reattempting.", + {"Giraffe GBZ"}); + } + + output_names.push_back(output_name); + return all_outputs; + }); + + registry.register_recipe({"Giraffe GBZ"}, {"GBWTGraph", "Giraffe GBWT"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + cerr << "[IndexRegistry]: Combining Giraffe GBWT and GBWTGraph into GBZ." << endl; + } + + assert(inputs.size() == 2); + auto gbwt_filenames = inputs[1]->get_filenames(); + auto gg_filenames = inputs[0]->get_filenames(); + assert(gbwt_filenames.size() == 1); + assert(gg_filenames.size() == 1); + auto gbwt_filename = gbwt_filenames.front(); + auto gg_filename = gg_filenames.front(); + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto gbz_output = *constructing.begin(); + auto& output_names = all_outputs[0]; + + gbwtgraph::GBZ gbz; + load_gbz(gbz, gbwt_filename, gg_filename, IndexingParameters::verbosity == IndexingParameters::Debug); + + string output_name = plan->output_filepath(gbz_output); + save_gbz(gbz, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); + + output_names.push_back(output_name); + return all_outputs; + }); + + // These used to be GBWTGraph recipes, but we don't want to produce GBWTGraphs anymore. + + registry.register_recipe({"Giraffe GBZ"}, {"Giraffe GBWT", "NamedNodeBackTranslation", "XG"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + cerr << "[IndexRegistry]: Constructing GBZ using NamedNodeBackTranslation."
<< endl; + } + + assert(inputs.size() == 3); + auto gbwt_filenames = inputs[0]->get_filenames(); + auto translation_filenames = inputs[1]->get_filenames(); + auto xg_filenames = inputs[2]->get_filenames(); + assert(gbwt_filenames.size() == 1); + assert(translation_filenames.size() == 1); + assert(xg_filenames.size() == 1); + auto gbwt_filename = gbwt_filenames.front(); + auto translation_filename = translation_filenames.front(); + auto xg_filename = xg_filenames.front(); + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto gbz_output = *constructing.begin(); + auto& output_names = all_outputs[0]; + + ifstream infile_xg; + init_in(infile_xg, xg_filename); + auto xg_index = vg::io::VPKG::load_one(infile_xg); + + ifstream infile_translation; + init_in(infile_translation, translation_filename); + // There's only one implementation we can use here at the moment, so + // don't bother with the normal loader/saver system. + FlatFileBackTranslation translation(infile_translation); + + gbwtgraph::GBZ gbz; + load_gbwt(gbz.index, gbwt_filename, IndexingParameters::verbosity == IndexingParameters::Debug); + // TODO: could add simplification to replace XG index with a gbwt::SequenceSource here + gbz.graph = gbwtgraph::GBWTGraph(gbz.index, *xg_index, &translation); + + string output_name = plan->output_filepath(gbz_output); + save_gbz(gbz, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); + + output_names.push_back(output_name); + return all_outputs; + }); + + registry.register_recipe({"Giraffe GBZ"}, {"Giraffe GBWT", "XG"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + cerr << "[IndexRegistry]: Constructing GBZ." << endl; + } + + assert(inputs.size() == 2); + auto gbwt_filenames = inputs[0]->get_filenames(); + auto xg_filenames = inputs[1]->get_filenames(); + assert(gbwt_filenames.size() == 1); + assert(xg_filenames.size() == 1); + auto gbwt_filename = gbwt_filenames.front(); + auto xg_filename = xg_filenames.front(); + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto gbz_output = *constructing.begin(); + auto& output_names = all_outputs[0]; + + ifstream infile_xg; + init_in(infile_xg, xg_filename); + auto xg_index = vg::io::VPKG::load_one(infile_xg); + + gbwtgraph::GBZ gbz; + load_gbwt(gbz.index, gbwt_filename, IndexingParameters::verbosity == IndexingParameters::Debug); + // TODO: could add simplification to replace XG index with a gbwt::SequenceSource here + gbz.graph = gbwtgraph::GBWTGraph(gbz.index, *xg_index, algorithms::find_translation(xg_index.get())); + + string output_name = plan->output_filepath(gbz_output); + save_gbz(gbz, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); + + output_names.push_back(output_name); + return all_outputs; + }); + + //////////////////////////////////// + // Minimizers Recipes + //////////////////////////////////// + + // FIXME We may not always want to store the minimizer index. Rebuilding the index may be + // faster than loading it from a network drive. + registry.register_recipe({"Minimizers"}, {"Giraffe Distance Index", "Giraffe GBZ"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + cerr << "[IndexRegistry]: Constructing minimizer index." 
<< endl; + } + + // TODO: should the distance index input be a joint simplification to avoid serializing it? + + assert(inputs.size() == 2); + auto dist_filenames = inputs[0]->get_filenames(); + auto gbz_filenames = inputs[1]->get_filenames(); + assert(dist_filenames.size() == 1); + assert(gbz_filenames.size() == 1); + auto dist_filename = dist_filenames.front(); + auto gbz_filename = gbz_filenames.front(); + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto minimizer_output = *constructing.begin(); + auto& output_names = all_outputs[0]; + + + ifstream infile_gbz; + init_in(infile_gbz, gbz_filename); + auto gbz = vg::io::VPKG::load_one(infile_gbz); + + ifstream infile_dist; + init_in(infile_dist, dist_filename); + auto distance_index = vg::io::VPKG::load_one(dist_filename); + gbwtgraph::DefaultMinimizerIndex minimizers(IndexingParameters::minimizer_k, + IndexingParameters::use_bounded_syncmers ? + IndexingParameters::minimizer_s : + IndexingParameters::minimizer_w, + IndexingParameters::use_bounded_syncmers); + + gbwtgraph::index_haplotypes(gbz->graph, minimizers, [&](const pos_t& pos) -> gbwtgraph::Payload { + return MIPayload::encode(get_minimizer_distances(*distance_index, pos)); + }); + + string output_name = plan->output_filepath(minimizer_output); + save_minimizer(minimizers, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); + + output_names.push_back(output_name); + return all_outputs; + }); + + return registry; +} + +vector VGIndexes::get_default_map_indexes() { + vector indexes{ + "XG", + "GCSA", + "LCP" + }; + return indexes; +} + +vector VGIndexes::get_default_mpmap_indexes() { + vector indexes{ + "Spliced XG", + "Spliced Distance Index", + "Spliced GCSA", + "Spliced LCP" + }; + return indexes; +} + +vector VGIndexes::get_default_rpvg_indexes() { + vector indexes{ + "Spliced XG", + "Haplotype-Transcript GBWT", + "Transcript Origin Table" + }; + return indexes; +} + +vector VGIndexes::get_default_giraffe_indexes() { + vector indexes{ + "Giraffe Distance Index", + "Giraffe GBZ", + "Minimizers" + }; + return indexes; +} + +bool IndexingPlan::is_intermediate(const IndexName& identifier) const { + if (registry->get_index(identifier)->was_provided_directly()) { + // It's not an intermediate if it is input + return false; + } + // Or if it is directly requested + return !targets.count(identifier); +} + +int64_t IndexingPlan::target_memory_usage() const { + return IndexingParameters::max_memory_proportion * registry->get_target_memory_usage(); +} + +string IndexingPlan::output_filepath(const IndexName& identifier) const { + return output_filepath(identifier, 0, 1); +} + +string IndexingPlan::output_filepath(const IndexName& identifier, size_t chunk, size_t num_chunks) const { + + string filepath; + if (registry->keep_intermediates || + (!is_intermediate(identifier) && !registry->get_index(identifier)->was_provided_directly())) { + // we're saving this file, put it at the output prefix + filepath = registry->output_prefix; + } + else { + // we're not saving this file, make it temporary + filepath = registry->get_work_dir() + "/" + sha1sum(identifier); + } + if (num_chunks > 1) { + // we add digits to make the suffix unique for this chunk (the setup disallows suffixes + // that start with digits) + filepath += "." + to_string(chunk); + } + filepath += "." 
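    // For illustration of the naming scheme (the prefix, suffix, and chunk
    // values here are hypothetical): with output_prefix "sample" and an index
    // registered with suffix "giraffe.gbz", a single-chunk saved output lands
    // at "sample.giraffe.gbz", while chunk 0 of a 2-chunk output lands at
    // "sample.0.giraffe.gbz". An intermediate index instead goes under the
    // scratch directory as get_work_dir() + "/" + sha1sum(identifier) + "." + suffix.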
+ registry->get_index(identifier)->get_suffix(); + return filepath; +} + +const vector& IndexingPlan::get_steps() const { + return steps; +} + +set IndexingPlan::dependents(const IndexName& identifier) const { + + set dependent_steps; + + // seed the successors with the query + IndexGroup successor_indexes{identifier}; + + for (const auto& step : steps) { + + // TODO: should this behavior change if some of the inputs were provided directly? + + // collect inputs and outputs + const auto& outputs = step.first; + IndexGroup involved = registry->get_recipe(step).input_group(); + involved.insert(outputs.begin(), outputs.end()); + + for (const auto& index : involved) { + if (successor_indexes.count(index)) { + // this is a step when a successor was either created or used + dependent_steps.insert(step); + // outputs are also successors + successor_indexes.insert(outputs.begin(), outputs.end()); + break; + } + } + } + return dependent_steps; +} + +IndexRegistry::~IndexRegistry() { + if (!work_dir.empty()) { + // Clean up our work directory with its temporary indexes. + temp_file::remove(work_dir); + work_dir.clear(); + } +} + +IndexRegistry::IndexRegistry(IndexRegistry&& other) : + index_registry(std::move(other.index_registry)), + recipe_registry(std::move(other.recipe_registry)), + registered_suffixes(std::move(other.registered_suffixes)), + work_dir(std::move(other.work_dir)), + output_prefix(std::move(other.output_prefix)), + keep_intermediates(std::move(other.keep_intermediates)) { + + // Make sure other doesn't delete our work dir when it goes away + other.work_dir.clear(); +} + +IndexRegistry& IndexRegistry::operator=(IndexRegistry&& other) { + index_registry = std::move(other.index_registry); + recipe_registry = std::move(other.recipe_registry); + registered_suffixes = std::move(other.registered_suffixes); + work_dir = std::move(other.work_dir); + output_prefix = std::move(other.output_prefix); + keep_intermediates = std::move(other.keep_intermediates); + + // Make sure other doesn't delete our work dir when it goes away + other.work_dir.clear(); + + return *this; +} + +void IndexRegistry::set_prefix(const string& prefix) { + this->output_prefix = prefix; +} + +string IndexRegistry::get_prefix() const { + return this->output_prefix; +} + +void IndexRegistry::set_intermediate_file_keeping(bool keep_intermediates) { + this->keep_intermediates = keep_intermediates; +} + +void IndexRegistry::make_indexes(const vector& identifiers) { + + // figure out the best plan to make the objectives from the inputs + IndexGroup identifier_group(identifiers.begin(), identifiers.end()); + auto plan = make_plan(identifier_group); + + // to keep track of which indexes are aliases of others + AliasGraph alias_graph; + + list steps_remaining(plan.get_steps().begin(), plan.get_steps().end()); + list steps_completed; + + // execute the plan + while (!steps_remaining.empty()) { + // get the next step + auto step = move(steps_remaining.front()); + steps_remaining.pop_front(); + steps_completed.push_back(step); + + // do the recipe + try { + auto recipe_results = execute_recipe(step, &plan, alias_graph); + + // the recipe executed successfully + assert(recipe_results.size() == step.first.size()); + + // record the results + auto it = step.first.begin(); + for (const auto& results : recipe_results) { + auto index = get_index(*it); + // don't overwrite directly-provided inputs + if (!index->was_provided_directly()) { + // and assign the new (or first) ones + index->assign_constructed(results); + } + ++it; + } + } + 
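        // A sketch of the retry contract that the recipes above rely on: a
        // recipe that hits a recoverable failure adjusts the relevant
        // IndexingParameters knob and throws RewindPlanException naming the
        // indexes whose construction has to be redone. For example, the
        // GFA-to-GBZ recipe does roughly
        //
        //     IndexingParameters::gbwt_insert_batch_size *=
        //         IndexingParameters::gbwt_insert_batch_size_increase_factor;
        //     throw RewindPlanException("...", {"Giraffe GBZ"});
        //
        // The handler below then moves the completed steps that produce or
        // consume the named indexes back onto the queue so they re-run with
        // the updated parameters.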
catch (RewindPlanException& ex) { + + // the recipe failed, but we can rewind and retry following the recipe with + // modified parameters (which should have been set by the exception-throwing code) + if (IndexingParameters::verbosity != IndexingParameters::None) { + cerr << ex.what() << endl; + } + // gather the recipes we're going to need to re-attempt + const auto& rewinding_indexes = ex.get_indexes(); + set dependent_recipes; + for (const auto& index_name : rewinding_indexes) { + assert(index_registry.count(index_name)); + for (const auto& recipe : plan.dependents(index_name)) { + dependent_recipes.insert(recipe); + } + } + + // move rewound steps back onto the queue + vector::iterator> to_move; + for (auto it = steps_completed.rbegin(); it != steps_completed.rend(); ++it) { + if (dependent_recipes.count(*it)) { + to_move.push_back(--it.base()); + } + } + for (auto& it : to_move) { + steps_remaining.emplace_front(*it); + steps_completed.erase(it); + } + } + } +#ifdef debug_index_registry + cerr << "finished executing recipes, resolving aliases" << endl; +#endif + + auto aliases = alias_graph.non_intermediate_aliases(&plan, keep_intermediates); + for (auto& alias_record : aliases) { + IndexName aliasee; + vector aliasors; + tie(aliasee, aliasors) = alias_record; + +#ifdef debug_index_registry + cerr << "index " << aliasee << " is aliased by:" << endl; + for (const auto& aliasor : aliasors) { + cerr << "\t" << aliasor << endl; + } +#endif + + // if the index is itself non-intermediate, it will be in the list of aliases. + // otherwise, we can alias one index by moving instead of copying + auto f = find(aliasors.begin(), aliasors.end(), aliasee); + bool can_move = f == aliasors.end() && !get_index(aliasee)->was_provided_directly(); + if (!can_move) { + // just remove the "alias" so we don't need to deal with it + std::swap(*f, aliasors.back()); + aliasors.pop_back(); + } + + const auto& aliasee_filenames = get_index(aliasee)->get_filenames(); + + // copy aliases for any that we need to (start past index 0 if we can move it) + for (size_t i = can_move; i < aliasors.size(); ++i) { + for (size_t j = 0; j < aliasee_filenames.size(); ++j) { + + auto copy_filename = plan.output_filepath(aliasors[i], j, aliasee_filenames.size()); + copy_file(aliasee_filenames[j], copy_filename); + } + } + // if we can move the aliasee (i.e. it is intermediate), then make + // one index by moving instead of copying + if (can_move) { + for (size_t j = 0; j < aliasee_filenames.size(); ++j) { + auto move_filename = plan.output_filepath(aliasors[0], j, aliasee_filenames.size()); + int code = rename(aliasee_filenames[j].c_str(), move_filename.c_str()); + if (code) { + // moving failed (maybe because the files on separate drives?) fall back on copying + copy_file(aliasee_filenames[j], move_filename); + } + } + } + } + + // Keep all the indexes around. If you want to re-use the object for a + // different set of indexes, you will need to call reset() yourself. 
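    // For illustration, a minimal sketch of how a caller typically drives this
    // method, using only the functions declared in this registry; the prefix
    // and input filename are hypothetical:
    //
    //     IndexRegistry registry = VGIndexes::get_vg_index_registry();
    //     registry.set_prefix("sample");
    //     registry.set_target_memory_usage(IndexRegistry::get_system_memory() / 2);
    //     registry.provide("Reference GFA w/ Haplotypes", "sample.gfa");
    //     registry.make_indexes(VGIndexes::get_default_giraffe_indexes());
    //     vector<string> gbz_files = registry.require("Giraffe GBZ");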
+} + +void IndexRegistry::register_index(const IndexName& identifier, const string& suffix) { + // Add this index to the registry + if (identifier.empty()) { + cerr << "error:[IndexRegistry] indexes must have a non-empty identifier" << endl; + exit(1); + } + if (suffix.empty()) { + cerr << "error:[IndexRegistry] indexes must have a non-empty suffix" << endl; + exit(1); + } + if (isdigit(suffix.front())) { + // this ensures that we can add numbers to the suffix to create a unique suffix + // for chunked workflows + cerr << "error:[IndexRegistry] suffixes cannot start with a digit" << endl; + exit(1); + } + if (index_registry.count(identifier)) { + cerr << "error:[IndexRegistry] index registry contains a duplicated identifier: " << identifier << endl; + exit(1); + } + if (registered_suffixes.count(suffix)) { + cerr << "error:[IndexRegistry] index registry contains a duplicated suffix: " << suffix << endl; + exit(1); + } + index_registry[identifier] = unique_ptr(new IndexFile(identifier, suffix)); + registered_suffixes.insert(suffix); +} + + +void IndexRegistry::provide(const IndexName& identifier, const string& filename) { + provide(identifier, vector(1, filename)); +} + +void IndexRegistry::provide(const IndexName& identifier, const vector& filenames) { + if (IndexingParameters::verbosity >= IndexingParameters::Debug) { + cerr << "[IndexRegistry]: Provided: " << identifier << endl; + } + if (!index_registry.count(identifier)) { + cerr << "error:[IndexRegistry] cannot provide unregistered index: " << identifier << endl; + exit(1); + } + get_index(identifier)->provide(filenames); +} + +bool IndexRegistry::available(const IndexName& identifier) const { + if (!index_registry.count(identifier)) { + // Index is not registered + return false; + } + const IndexFile* index = get_index(identifier); + if (!index->is_finished()) { + // Index is not made + return false; + } + return true; +} + +vector IndexRegistry::require(const IndexName& identifier) const { + if (!index_registry.count(identifier)) { + cerr << "error:[IndexRegistry] cannot require unregistered index: " << identifier << endl; + exit(1); + } + const IndexFile* index = get_index(identifier); + if (!index->is_finished()) { + cerr << "error:[IndexRegistry] do not have and did not make index: " << identifier << endl; + exit(1); + } + return index->get_filenames(); +} + +void IndexRegistry::set_target_memory_usage(int64_t bytes) { + target_memory_usage = bytes; +} + +int64_t IndexRegistry::get_target_memory_usage() const { + return target_memory_usage; +} + +// from https://stackoverflow.com/questions/2513505/how-to-get-available-memory-c-g +int64_t IndexRegistry::get_system_memory() { + int64_t pages = sysconf(_SC_PHYS_PAGES); + int64_t page_size = sysconf(_SC_PAGE_SIZE); + return pages * page_size; +} + +vector IndexRegistry::completed_indexes() const { + vector indexes; + for (const auto& index : index_registry) { + if (index.second->is_finished()) { + indexes.push_back(index.first); + } + } + return indexes; +} + +RecipeName IndexRegistry::register_recipe(const vector& identifiers, + const vector& input_identifiers, + const RecipeFunc& exec) { + + for (const IndexName& identifier : identifiers) { + if (!index_registry.count(identifier)) { + cerr << "error:[IndexRegistry] cannot register recipe for unregistered index " << identifier << endl; + exit(1); + } + } + + // test that the input identifiers are in alphabetical order + // this is an easy-to-troubleshoot check that lets us use IndexGroup's (which are ordered set) + // internally and 
still provide the vector in the same order as the input identifiers to + // the RecipeFunc and in the recipe declaration. + // i.e. this helps ensure that the order of the indexes that you code in the recipe declaration + // is the order that they will continue to be given throughout the registry + IndexGroup input_group(input_identifiers.begin(), input_identifiers.end()); + IndexGroup output_group(identifiers.begin(), identifiers.end()); + { + if (input_group.size() != input_identifiers.size()) { + cerr << "error:[IndexRegistry] recipe has duplicate inputs" << endl; + exit(1); + } + size_t i = 0; + for (const auto& sorted_identifier : input_group) { + if (sorted_identifier != input_identifiers[i]) { + cerr << "error:[IndexRegistry] recipe has inputs that are not provided in alphabetical order" << endl; + exit(1); + } + ++i; + } + if (output_group.size() != identifiers.size()) { + cerr << "error:[IndexRegistry] recipe has duplicate outputs" << endl; + exit(1); + } + i = 0; + for (const auto& sorted_identifier : output_group) { + if (sorted_identifier != identifiers[i]) { + cerr << "error:[IndexRegistry] recipe has outputs that are not provided in alphabetical order" << endl; + exit(1); + } + ++i; + } + } + + vector inputs; + for (const auto& input_identifier : input_identifiers) { + if (!index_registry.count(input_identifier)) { + cerr << "error:[IndexRegistry] cannot register recipe from unregistered index " << input_identifier << endl; + exit(1); + } + inputs.push_back(get_index(input_identifier)); + } +#ifdef debug_index_registry_setup + cerr << "registering recipe for " << to_string(output_group) << endl; + cerr << "inputs:" << endl; + for (const auto& input : inputs) { + cerr << "\t" << input->get_identifier() << endl; + } +#endif + + bool first_group_entry = !recipe_registry.count(output_group); + recipe_registry[output_group].emplace_back(inputs, exec); + RecipeName name(output_group, recipe_registry[output_group].size() - 1); + + if (output_group.size() > 1 && first_group_entry) { + // add unboxing recipes at the same priority level as the full recipe + auto it = output_group.begin(); + for (size_t i = 0; i < identifiers.size(); ++i) { +#ifdef debug_index_registry_setup + cerr << "registering unboxing recipe from " << to_string(output_group) << " to " << *it << endl; +#endif + register_recipe({*it}, identifiers, + [=](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + return vector>(1, inputs[i]->get_filenames()); + }); + ++it; + } + } + + return name; +} + +void IndexRegistry::register_generalization(const RecipeName& generalizer, const RecipeName& generalizee) { + for (const auto& index_name : generalizee.first) { + if (!generalizer.first.count(index_name)) { + cerr << "error:[IndexRegistry] registered a generalization that does not contain generalizee's output " << index_name << endl; + exit(1); + } + } + const auto& generalizer_recipe = recipe_registry.at(generalizer.first).at(generalizer.second); + const auto& generalizee_recipe = recipe_registry.at(generalizee.first).at(generalizee.second); + for (const auto& index_name : generalizee_recipe.input_group()) { + if (!generalizer_recipe.input_group().count(index_name)) { + cerr << "error:[IndexRegistry] registered a generalization that does not contain generalizee's input " << index_name << endl; + exit(1); + } + } + + generalizations[generalizee] = generalizer; +} + +IndexFile* IndexRegistry::get_index(const IndexName& identifier) { + return 
index_registry.at(identifier).get(); +} + +const IndexFile* IndexRegistry::get_index(const IndexName& identifier) const { + return index_registry.at(identifier).get(); +} + +bool IndexRegistry::all_finished(const vector& inputs) const { + IndexGroup group; + for (auto input : inputs) { + group.insert(input->get_identifier()); + } + return all_finished(group); +} + +bool IndexRegistry::all_finished(const IndexGroup& indexes) const { + bool finished = true; + for (const auto& index_name : indexes) { + if (!get_index(index_name)->is_finished()) { + finished = false; + break; + } + } + return finished; +} + +void IndexRegistry::reset() { + for (pair>& index : index_registry) { + index.second->reset(); + } +} + +string IndexRegistry::get_work_dir() { + if (work_dir.empty()) { + // Ensure the directory exists + work_dir = temp_file::create_directory(); + } + return work_dir; +} + +bool IndexRegistry::vcf_is_phased(const string& filepath) { + + if (IndexingParameters::verbosity >= IndexingParameters::Basic) { + cerr << "[IndexRegistry]: Checking for phasing in VCF(s)." << endl; + } + + + htsFile* file = hts_open(filepath.c_str(), "rb"); + if (!file) { + cerr << "error:[IndexRegistry]: Failed to open VCF file: " << filepath << endl; + exit(1); + } + bcf_hdr_t* hdr = bcf_hdr_read(file); + int phase_set_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "PS"); + // note: it seems that this is not necessary for expressing phasing after all + int num_samples = bcf_hdr_nsamples(hdr); + + // iterate over records + bcf1_t* line = bcf_init(); + bool found_phased = false; + // check about 30k non-haploid variants before concluding that the VCF isn't phased + // TODO: will there be contig ordering biases that make this a bad assumption? + constexpr int nonhap_vars_to_check = 1 << 15; + int nonhap_iter = 0; + while (bcf_read(file, hdr, line) >= 0 && nonhap_iter < nonhap_vars_to_check && !found_phased) { + if (phase_set_id >= 0) { + if (phase_set_id == BCF_HT_INT) { + // phase sets are integers + int num_phase_set_arr = 0; + int32_t* phase_sets = NULL; + int num_phase_sets = bcf_get_format_int32(hdr, line, "PS", &phase_sets, &num_phase_set_arr); + for (int i = 0; i < num_phase_sets && !found_phased; ++i) { + found_phased = phase_sets[i] != 0; + } + free(phase_sets); + } + else if (phase_set_id == BCF_HT_STR) { + // phase sets are strings + int num_phase_set_arr = 0; + char** phase_sets = NULL; + int num_phase_sets = bcf_get_format_string(hdr, line, "PS", &phase_sets, &num_phase_set_arr); + for (int i = 0; i < num_phase_sets && !found_phased; ++i) { + found_phased = strcmp(phase_sets[i], ".") != 0; + } + if (phase_sets) { + // all phase sets are concatenated in one malloc's char*, pointed to by the first pointer + free(phase_sets[0]); + } + // free the array of pointers + free(phase_sets); + } + } + + // init a genotype array + int32_t* genotypes = nullptr; + int arr_size = 0; + // and query it + int num_genotypes = bcf_get_genotypes(hdr, line, &genotypes, &arr_size); + if (num_genotypes >= 0) { + // we got genotypes, check to see if they're phased. + // We know we can't have genotypes if there are 0 samples. 
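            // How that array is laid out (an illustrative note based on htslib's
            // encoding): bcf_get_genotypes() fills a flat array of
            // num_samples * ploidy entries, each packing an allele index together
            // with the phasing flag that bcf_gt_is_phased() tests. For one diploid
            // sample with genotype "0|1" the array holds two entries and the
            // second one carries the phased flag, so the stride-by-ploidy scan
            // below reports the VCF as phased.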
+ int ploidy = num_genotypes / num_samples; + if (ploidy > 1) { + for (int i = 0; i < num_genotypes && !found_phased; i += ploidy) { + for (int j = 0; j < ploidy && !found_phased; ++j) { + if (genotypes[i + j] == bcf_int32_vector_end) { + // sample has lower ploidy + break; + } + if (bcf_gt_is_missing(genotypes[i + j])) { + continue; + } + if (bcf_gt_is_phased(genotypes[i + j])) { + // the VCF expresses phasing, we can + found_phased = true;; + } + } + } + ++nonhap_iter; + } + } + + free(genotypes); + } + if (nonhap_iter == 0 && num_samples > 0) { + // We looked at some samples and none of them had any non-haploid genotypes. + // Assume the entire VCF is haploid, which are trivially phased + found_phased = true; + } + // clean up + bcf_destroy(line); + bcf_hdr_destroy(hdr); + hts_close(file); + return found_phased; +} + +bool IndexRegistry::gfa_has_haplotypes(const string& filepath) { + if (IndexingParameters::verbosity >= IndexingParameters::Basic) { + cerr << "[IndexRegistry]: Checking for haplotype lines in GFA." << endl; + } + ifstream strm(filepath); + if (!strm) { + cerr << "error:[IndexRegistry] Could not open GFA file " << filepath << endl; + exit(1); + } + while (strm.good()) { + char line_type = strm.get(); + if (line_type == 'W') { + return true; + } + strm.ignore(numeric_limits::max(), '\n'); + } + return false; +} + +vector IndexRegistry::dependency_order() const { + +#ifdef debug_index_registry + cerr << "finding topological order in dependency graph" << endl; +#endif + + // assign each index file an index in a vector (arbitrarily) and build the dependency graph + map graph_idx; + vector graph_label; + vector> dependency_graph; + // add nodes for the index groups + for (const auto& recipe_record : recipe_registry) { + if (!graph_idx.count(recipe_record.first)) { + graph_idx[recipe_record.first] = graph_label.size(); + graph_label.push_back(recipe_record.first); + dependency_graph.emplace_back(); + } + for (auto output : recipe_record.first) { + IndexGroup singleton_output{output}; + if (!graph_idx.count(singleton_output)) { + graph_idx[singleton_output] = graph_label.size(); + graph_label.push_back(singleton_output); + dependency_graph.emplace_back(); + } + } + for (const auto& recipe : recipe_record.second) { + for (auto input : recipe.input_group()) { + IndexGroup singleton_input{input}; + if (!graph_idx.count(singleton_input)) { + graph_idx[singleton_input] = graph_label.size(); + graph_label.push_back(singleton_input); + dependency_graph.emplace_back(); + } + } + } + } + + // add nodes for the recipes and recipe edges + size_t recipe_node_start = dependency_graph.size(); + size_t recipe_num = 0; + for (const auto& recipe_record : recipe_registry) { + for (const auto& recipe : recipe_record.second) { + IndexName recipe_label = "Recipe " + to_string(recipe_num++); + //cerr << "adding edges for " << recipe_label << endl; + //cerr << "\tinputs " << to_string(recipe.input_group()) << endl; + //cerr << "\toutputs " << to_string(recipe_record.first) << endl; + graph_label.push_back({recipe_label}); + dependency_graph.emplace_back(); + if (recipe_record.first.size() == 1 && + recipe.input_group().count(*recipe_record.first.begin())) { + // this is an unboxing recipe, only link to the collective input, not individual ingredients + dependency_graph[graph_idx.at(recipe.input_group())].push_back(dependency_graph.size() - 1); + //cerr << "\tedge " << to_string(recipe.input_group()) << " -> " << recipe_label << endl; + } + else { + for (auto index_name : recipe.input_group()) { + 
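                    // Each singleton input gets an edge into this recipe's node,
                    // and the recipe node is then pointed at its output group below.
                    // For example, the {"Giraffe GBZ"} recipe registered above that
                    // takes {"GBWTGraph", "Giraffe GBWT"} contributes
                    //     {GBWTGraph} -> Recipe k,   {Giraffe GBWT} -> Recipe k,
                    //     Recipe k -> {Giraffe GBZ}
                    // for its recipe node k.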
IndexGroup singleton_input{index_name}; + dependency_graph[graph_idx.at(singleton_input)].push_back(dependency_graph.size() - 1); + //cerr << "\tedge " << index_name << " -> " << recipe_label << endl; + } + } + //cerr << "\tedge " << recipe_label << " -> " << to_string(recipe_record.first) << endl; + dependency_graph.back().push_back(graph_idx.at(recipe_record.first)); + } + } + + // deduplicate any edges + for (auto& adj : dependency_graph) { + sort(adj.begin(), adj.end()); + adj.resize(unique(adj.begin(), adj.end()) - adj.begin()); + } + +#ifdef debug_index_registry + cerr << "dependency graph:" << endl; + for (size_t i = 0; i < dependency_graph.size(); ++i) { + cerr << to_string(graph_label[i]) << endl; + for (auto j : dependency_graph[i]) { + cerr << "\t" << to_string(graph_label[j]) << endl; + } + } +#endif + + // kahn's algorithm to determine a topological order + vector in_degree(dependency_graph.size(), 0); + for (auto& adj : dependency_graph) { + for (size_t i : adj) { + ++in_degree[i]; + } + } + + vector stack; + for (size_t i = 0; i < dependency_graph.size(); ++i) { + if (in_degree[i] == 0) { + stack.push_back(i); + } + } + + vector order; + while (!stack.empty()) { + size_t i = stack.back(); + stack.pop_back(); + order.push_back(i); + for (size_t j : dependency_graph[i]) { + --in_degree[j]; + if (in_degree[j] == 0) { + stack.push_back(j); + } + } + } + + if (order.size() != dependency_graph.size()) { + cerr << "error:[IndexFile] index dependency graph is not a DAG" << endl; + +#ifdef debug_index_registry + // do DFS to find the cycle + bool found_cycle = false; + for (size_t i = 0; i < dependency_graph.size() && !found_cycle; ++i) { + + vector traversed(dependency_graph.size(), false); + vector stacked(dependency_graph.size(), false); + // records of (node, next edge to take) + vector> stack; + stack.emplace_back(i, 0); + stacked[i] = true; + while (!stack.empty()) { + if (stack.back().second == dependency_graph[stack.back().first].size()) { + traversed[stack.back().first] = false; + stack.pop_back(); + continue; + } + traversed[stack.back().first] = true; + size_t next = dependency_graph[stack.back().first][stack.back().second++]; + if (traversed[next]) { + size_t j = stack.size() - 1; + cerr << "found cycle:" << endl; + cerr << "\t" << to_string(graph_label[next]) << endl; + while (stack[j].first != next) { + cerr << "\t" << to_string(graph_label[stack[j].first]) << endl; + --j; + } + cerr << "\t" << to_string(graph_label[stack[j].first]) << endl; + found_cycle = true; + break; + } + if (!stacked[next]) { + stack.emplace_back(next, 0); + stacked[next] = true; + } + } + } +#endif + exit(1); + } + + // convert to return format + vector ordered_identifiers; + for (size_t i = 0; i < order.size(); ++i) { + if (order[i] < recipe_node_start) { + ordered_identifiers.push_back(graph_label[order[i]]); + } + } + +#ifdef debug_index_registry + cerr << "final order:" << endl; + for (const auto& identifier : ordered_identifiers) { + cerr << "\t" << to_string(identifier) << endl; + } +#endif + + return ordered_identifiers; +} + +IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { + +#ifdef debug_index_registry + cerr << "generating plan for indexes:" << endl; + for (const auto& product : end_products) { + cerr << "\t" << product << endl; + } +#endif + + // get the dependency ordering of the indexes + vector identifier_order = dependency_order(); + map dep_order_of_identifier; + for (size_t i = 0; i < identifier_order.size(); ++i) { + 
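        // Invert the topological order so we can ask how late in the dependency
        // order any index group falls; with the recipes registered above, for
        // example, {Reference GFA w/ Haplotypes} gets a smaller position than
        // {GBZ}, which in turn precedes {Giraffe GBZ} and {Minimizers}.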
dep_order_of_identifier[identifier_order[i]] = i; + } + + // TODO: I'm sure there's a more elegant implementation of this algorithm + set plan_elements; + for (const auto& product : end_products) { +#ifdef debug_index_registry + cerr << "making a plan for end product " << product << endl; +#endif + // make a singleton group for the recipe graph + IndexGroup product_group{product}; + + // records of (identifier, requesters, ordinal index of recipe selected) + vector, size_t>> plan_path; + + // map dependency priority to requesters + map, greater> queue; + + auto num_recipes = [&](const IndexGroup& indexes) { + int64_t num = 0; + if (recipe_registry.count(indexes)) { + num = recipe_registry.at(indexes).size(); + } + return num; + }; + + // update the queue to request the inputs of a recipe from the final index on the plan path + auto request_from_back = [&]() { + + // the index at the back of the plan path is making the request + auto& requester = identifier_order[get<0>(plan_path.back())]; + // get its next recipe + auto inputs = recipe_registry.at(requester).at(get<2>(plan_path.back())).input_group(); + +#ifdef debug_index_registry + cerr << "index(es) " << to_string(requester) << " can be made by a recipe requiring " << to_string(inputs) << endl; +#endif + + if (requester.size() == 1 && inputs.count(*requester.begin())) { + // this is an unboxing recipe, request the whole previous group + queue[dep_order_of_identifier[inputs]].insert(get<0>(plan_path.back())); + } + else { + // this is not an unboxing recipe, request all of the recipe inputs separately + for (auto& input_index : inputs) { + IndexGroup singleton_input{input_index}; + queue[dep_order_of_identifier[singleton_input]].insert(get<0>(plan_path.back())); + } + } + }; + + // place the final step in the plan path back in the queue + auto requeue_back = [&]() { +#ifdef debug_index_registry + cerr << "requeueing " << to_string(identifier_order[get<0>(plan_path.back())]) << ", requested by:" << endl; + for (auto d : get<1>(plan_path.back())) { + cerr << "\t" << to_string(identifier_order[d]) << endl; + } +#endif + // TODO: is this check necessary? 
+ if (!get<1>(plan_path.back()).empty()) { + queue[get<0>(plan_path.back())] = get<1>(plan_path.back()); + } + plan_path.pop_back(); + }; + + // update the queue to remove requests to the inputs of a recipe from the final index on the plan path + auto unrequest_from_back = [&]() { + + auto make_unrequest = [&](const IndexGroup& inputs) { + auto it = queue.find(dep_order_of_identifier[inputs]); + it->second.erase(get<0>(plan_path.back())); + if (it->second.empty()) { +#ifdef debug_index_registry + cerr << "\t\tremoved final request to " << to_string(identifier_order[it->first]) << ", dequeuing" << endl; +#endif + queue.erase(it); + } + }; + + auto& requester = identifier_order[get<0>(plan_path.back())]; + + if (!all_finished(requester) && recipe_registry.count(requester)) { + // this index was using a recipe, we need to update its dependencies + // that are currently in the queue + +#ifdef debug_index_registry + cerr << "retracting requests from " << to_string(requester) << ", recipe " << get<2>(plan_path.back()) << endl; +#endif + auto inputs = recipe_registry.at(requester).at(get<2>(plan_path.back())).input_group(); + +#ifdef debug_index_registry + cerr << "\tmade requests from recipe requiring " << to_string(inputs) << endl; +#endif + + if (requester.size() == 1 && inputs.count(*requester.begin())) { + // this is an unboxing recipe, unrequest the whole previous group + make_unrequest(inputs); + } + else { + // this is not an unboxing recipe, unrequest all of the recipe inputs separately + for (auto& input_index : inputs) { + IndexGroup singleton_input{input_index}; + make_unrequest(singleton_input); + } + } + } +#ifdef debug_index_registry + else { + cerr << "no need to retract requests from " << to_string(requester) << endl; + } +#endif + }; + + // init the queue + queue[dep_order_of_identifier[product_group]] = set(); + + while (!queue.empty()) { +#ifdef debug_index_registry_path_state + cerr << "new iteration, path:" << endl; + for (auto pe : plan_path) { + cerr << "\t" << to_string(identifier_order[get<0>(pe)]) << ", requesters:"; + if (get<1>(pe).empty()) { + cerr << " PLAN TARGET"; + } + else { + for (auto d : get<1>(pe)) { + cerr << " " << to_string(identifier_order[d]); + } + } + cerr << ", recipe " << get<2>(pe) << endl; + } + cerr << "state of queue:" << endl; + for (auto q : queue) { + cerr << "\t" << to_string(identifier_order[q.first]) << ", requesters:"; + if (q.second.empty()) { + cerr << " PLAN TARGET"; + } + else { + for (auto d : q.second) { + cerr << " " << to_string(identifier_order[d]); + } + } + cerr << endl; + } +#endif + + // get the latest file in the dependency order that we have left to build + auto it = queue.begin(); + plan_path.emplace_back(it->first, it->second, 0); + +#ifdef debug_index_registry + cerr << "dequeue " << to_string(identifier_order[it->first]) << " requested from:" << endl; + if (it->second.empty()) { + cerr << "\tPLAN TARGET" << endl; + } + else { + for (auto requester : it->second) { + cerr << "\t" << to_string(identifier_order[requester]) << endl; + } + } +#endif + queue.erase(it); + auto index_group = identifier_order[get<0>(plan_path.back())]; + + // TODO: am i completely sure that no clobbering will happen if only some of the + // inputs are provided? 
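            // A concrete trace of this search (a sketch using the recipes above;
            // recipe priorities elsewhere in the registry may shuffle the exact
            // choices): if only "Reference GFA w/ Haplotypes" was provided and the
            // product is "Giraffe GBZ", the loop dequeues {Giraffe GBZ} and
            // requests {GBZ} via the aliasing recipe, then dequeues {GBZ} and
            // requests the GFA via the GFA-to-GBZ recipe, and finally dequeues the
            // GFA input itself, which all_finished() reports as provided, so the
            // path completes without backtracking.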
+ if (all_finished(index_group)) { + // this index has been provided, we don't need to use a recipe +#ifdef debug_index_registry + cerr << "index has been provided as input" << endl; +#endif + continue; + } + else if (recipe_registry.count(index_group)) { + // there are recipes to make this index, add the requests for the first one + request_from_back(); + } + else { + // we've reached a file that needs to be provided but we don't have it, + // so now we backtrack until hitting something that has remaining + // lower priority recipes +#ifdef debug_index_registry + cerr << "index " << to_string(index_group) << " cannot be made from existing inputs, need to backtrack" << endl; +#endif + + // prune to requester and advance to its next recipe, as many times as necessary until + // requester has remaining un-tried recipes + // note: if we're backtracking from a data file it might not have recipes + while (get<2>(plan_path.back()) >= num_recipes(identifier_order[get<0>(plan_path.back())])) { + // there are no remaining recipes to build the last index in the plan + + if (get<1>(plan_path.back()).empty()) { + // this is the product of the plan path, and we're out of recipes for it + throw InsufficientInputException(product, *this); + } + + // remove items off the plan path until we get to the first index that requested + // this one + size_t requester = *get<1>(plan_path.back()).rbegin(); + +#ifdef debug_index_registry + cerr << "no remaining recipes for " << to_string(identifier_order[get<0>(plan_path.back())]) << ", pruning to earliest requester: " << to_string(identifier_order[requester]) << endl; +#endif + + requeue_back(); // nothing to unrequest from the first one, which is past its last recipe + while (get<0>(plan_path.back()) != requester) { + unrequest_from_back(); + requeue_back(); + } + + // advance to the next recipe + unrequest_from_back(); + ++get<2>(plan_path.back()); + +#ifdef debug_index_registry + cerr << "advance to recipe " << get<2>(plan_path.back()) << " for " << to_string(identifier_order[get<0>(plan_path.back())]) << endl; +#endif + } + + // we pulled back far enough that we found an index with a lower-priority recipe + // remaining + request_from_back(); + } + + } + +#ifdef debug_index_registry + cerr << "final plan path for index " << product << ":" << endl; + for (auto path_elem : plan_path) { + cerr << "\t" << to_string(identifier_order[get<0>(path_elem)]) << ", recipe " << get<2>(path_elem) << ", from:" << endl; + for (auto d : get<1>(path_elem)) { + cerr << "\t\t" << to_string(identifier_order[d]) << endl; + } + } +#endif + + // record the elements of this plan + for (size_t i = 0; i < plan_path.size(); ++i) { + plan_elements.emplace(identifier_order[get<0>(plan_path[i])], get<2>(plan_path[i])); + } + } + + // Now fill in the plan struct that the recipes need to know how to run. 
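    // (plan.targets will hold the requested end products, and plan.steps every
    // chosen recipe sorted by dependency order; steps whose outputs were already
    // provided, and generalizees whose generalizer was also selected, are pruned
    // below.)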
+ IndexingPlan plan; + + // Copy over the end products + std::copy(end_products.begin(), end_products.end(), std::inserter(plan.targets, plan.targets.begin())); + + // convert the aggregated plan elements into a forward ordered plan + std::copy(plan_elements.begin(), plan_elements.end(), std::back_inserter(plan.steps)); + sort(plan.steps.begin(), plan.steps.end(), [&](const RecipeName& a, const RecipeName& b) { + return dep_order_of_identifier[a.first] < dep_order_of_identifier[b.first]; + }); + +#ifdef debug_index_registry + cerr << "plan before applying generalizations:" << endl; + for (auto plan_elem : plan.steps) { + cerr << "\t" << to_string(plan_elem.first) << " " << plan_elem.second << endl; + } +#endif + + // remove generalizees if we used their generalizers + set plan_set(plan.steps.begin(), plan.steps.end()); + plan.steps.resize(remove_if(plan.steps.begin(), plan.steps.end(), [&](const RecipeName& recipe) { + return generalizations.count(recipe) && plan_set.count(generalizations.at(recipe)); + }) - plan.steps.begin()); + +#ifdef debug_index_registry + cerr << "full plan including provided files:" << endl; + for (auto plan_elem : plan.steps) { + cerr << "\t" << to_string(plan_elem.first) << " " << plan_elem.second << endl; + } +#endif + + // Now remove the input data from the plan + plan.steps.resize(remove_if(plan.steps.begin(), plan.steps.end(), [&](const RecipeName& recipe_choice) { + return all_finished(recipe_choice.first); + }) - plan.steps.begin()); + + // The plan has methods that can come back and modify the registry. + // We're not going to call any of them, but we have to hand off a non-const + // pointer to ourselves so the plan can modify us later. + plan.registry = const_cast(this); + + return plan; +} + +const IndexRecipe& IndexRegistry::get_recipe(const RecipeName& recipe_name) const { + const auto& recipes = recipe_registry.at(recipe_name.first); + assert(recipe_name.second < recipes.size()); + return recipes.at(recipe_name.second); +} + +vector> IndexRegistry::execute_recipe(const RecipeName& recipe_name, const IndexingPlan* plan, + AliasGraph& alias_graph) { + const auto& index_recipe = get_recipe(recipe_name); + if (recipe_name.first.size() > 1 || !index_recipe.input_group().count(*recipe_name.first.begin())) { + // we're not in an unboxing recipe (in which case not all of the indexes might have been + // unboxed yet, in which case they appear unfinished) + for (auto input : index_recipe.inputs) { + assert(input->is_finished()); + } + } +#ifdef debug_index_registry_recipes + cerr << "executing recipe " << recipe_name.second << " for " << to_string(recipe_name.first) << endl; +#endif + return index_recipe.execute(plan, alias_graph, recipe_name.first);; +} + +string IndexRegistry::to_dot() const { + return to_dot(vector()); +} + +string IndexRegistry::to_dot(const vector& targets) const { + + + stringstream strm; + strm << "digraph recipegraph {" << endl; + + set plan_targets; + for (const auto& target : targets) { + plan_targets.insert({target}); + } + set plan_elements; + set plan_indexes; + if (!targets.empty()) { + IndexingPlan plan; + try { + IndexGroup target_group(targets.begin(), targets.end()); + plan = make_plan(target_group); + } + catch (InsufficientInputException ex) { +#ifdef debug_index_registry + cerr << ex.what() << endl; +#endif + strm << "labelloc=\"t\";" << endl; + strm << "label=\"Insufficient input to create targets\";" << endl; + } + for (const auto& plan_elem : plan.steps) { + plan_elements.insert(plan_elem); + 
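            // (Collected so the dot output below can highlight the chosen plan.
            // In the emitted graph, index groups become box nodes I0, I1, ...,
            // recipes become circle nodes R0, R1, ... labeled by priority, edges
            // run input -> recipe -> output, and plan targets are filled light blue
            // while other plan nodes are drawn bold.)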
plan_indexes.insert(plan_elem.first); + } + } + + // gather all singletons and products of recipes, which will be the index nodes + set all_indexes; + for (const auto& index_record : index_registry) { + all_indexes.insert({index_record.first}); + } + for (const auto& recipe_record : recipe_registry) { + all_indexes.insert(recipe_record.first); + } + + map index_to_dot_id; + size_t index_idx = 0; + for (const auto& index_group : all_indexes) { + index_to_dot_id[index_group] = "I" + to_string(index_idx); + ++index_idx; + strm << index_to_dot_id[index_group] << "[label=\"" << to_string(index_group) << "\" shape=box"; + if (all_finished(index_group)) { + strm << " style=\"filled,bold\" fillcolor=lightgray"; + } + else if (plan_targets.count(index_group)) { + strm << " style=\"filled,bold\" fillcolor=lightblue"; + } + else if (plan_indexes.count(index_group)) { + strm << " style=bold"; + } + strm << "];" << endl; + } + string unselected_col = targets.empty() ? "black" : "gray33"; + size_t recipe_idx = 0; + map recipe_to_dot_id; + for (const auto& recipe_record : recipe_registry) { + const auto& recipes = recipe_record.second; + for (size_t priority_idx = 0; priority_idx < recipes.size(); ++priority_idx, ++recipe_idx) { + const auto& recipe = recipes[priority_idx]; + string recipe_dot_id = "R" + to_string(recipe_idx); + recipe_to_dot_id[RecipeName(recipe_record.first, priority_idx)] = recipe_dot_id; + bool recipe_in_plan = plan_elements.count(RecipeName(recipe_record.first, priority_idx)); + if (recipe_in_plan) { + strm << recipe_dot_id << "[label=\"" << priority_idx << "\" shape=circle style=bold];" << endl; + strm << recipe_dot_id << " -> " << index_to_dot_id[recipe_record.first] << "[style=bold];" << endl; + } + else { + strm << recipe_dot_id << "[label=\"" << priority_idx << "\" shape=circle];" << endl; + strm << recipe_dot_id << " -> " << index_to_dot_id[recipe_record.first] << " [color=" << unselected_col << "];" << endl; + } + auto input_group = recipe.input_group(); + if (recipe_record.first.size() == 1 && input_group.count(*recipe_record.first.begin())) { + // unboxing recipe, link directly to group + if (recipe_in_plan) { + strm << index_to_dot_id[input_group] << " -> " << recipe_dot_id << "[style=bold];" << endl; + } + else { + strm << index_to_dot_id[input_group] << " -> " << recipe_dot_id << " [color=" << unselected_col << "];" << endl; + } + } + else { + // not an unboxing recipe, link to singletons + for (const auto& input : input_group) { + if (recipe_in_plan) { + strm << index_to_dot_id[IndexGroup{input}] << " -> " << recipe_dot_id << "[style=bold];" << endl; + } + else { + strm << index_to_dot_id[IndexGroup{input}] << " -> " << recipe_dot_id << " [color=" << unselected_col << "];" << endl; + } + } + } + + } + } + for (const auto& generalization_record : generalizations) { + strm << recipe_to_dot_id.at(generalization_record.first) << " -> " << recipe_to_dot_id.at(generalization_record.second) << " [style=dashed color=" << unselected_col << "];" << endl; + } + strm << "}" << endl; + return strm.str(); +} + +IndexFile::IndexFile(const IndexName& identifier, const string& suffix) : identifier(identifier), suffix(suffix) { + // nothing more to do +} + +bool IndexFile::is_finished() const { + return !filenames.empty(); +} + +const IndexName& IndexFile::get_identifier() const { + return identifier; +} + +const string& IndexFile::get_suffix() const { + return suffix; +} + +const vector& IndexFile::get_filenames() const { + return filenames; +} + +void IndexFile::provide(const vector& 
filenames) { + // append all filenames + // TODO: would it be better to sometimes error check that the file isn't a duplicate? + for (const string& filename : filenames) { + this->filenames.emplace_back(filename); + } + provided_directly = true; +} + +void IndexFile::assign_constructed(const vector& filenames) { + this->filenames = filenames; + provided_directly = false; +} + +bool IndexFile::was_provided_directly() const { + return provided_directly; +} + +void IndexFile::reset() { + filenames.clear(); + provided_directly = false; +} + +IndexRecipe::IndexRecipe(const vector& inputs, + const RecipeFunc& exec) : + exec(exec), inputs(inputs) +{ + // nothing more to do +} + +vector> IndexRecipe::execute(const IndexingPlan* plan, AliasGraph& alias_graph, + const IndexGroup& constructing) const { + return exec(inputs, plan, alias_graph, constructing); +} + +IndexGroup IndexRecipe::input_group() const { + IndexGroup group; + for (auto input : inputs) { + group.insert(input->get_identifier()); + } + return group; +} + +void AliasGraph::register_alias(const IndexName& aliasor, const IndexFile* aliasee) { + assert(aliasee->get_identifier() != aliasor); + graph[aliasee->get_identifier()].emplace_back(aliasor); +} + +vector>> AliasGraph::non_intermediate_aliases(const IndexingPlan* plan, + bool keep_all) const { + +#ifdef debug_index_registry + cerr << "finding non intermediate aliases in alias graph" << endl; + for (const auto& adj : graph) { + cerr << adj.first << ":" << endl; + for (const auto& dest : adj.second) { + cerr << "\t" << dest << endl; + } + } +#endif + + vector>> aliases; + + // find the heads in the graph (the origins of aliasing chains) + unordered_set heads; + for (const auto& adj : graph) { + heads.insert(adj.first); + } + for (const auto& adj : graph) { + for (const auto& dest : adj.second) { + if (heads.count(dest)) { + heads.erase(dest); + } + } + } + + for (const auto& head : heads) { + +#ifdef debug_index_registry + cerr << "starting a DFS from head index " << head << endl; +#endif + + // do DFS out from this head to identify aliasors + vector non_inmdt_aliasors; + vector stack(1, head); + unordered_set stacked{head}; + while (!stack.empty()) { + auto here = stack.back(); + stack.pop_back(); + if (!plan->is_intermediate(here) || keep_all) { + non_inmdt_aliasors.push_back(here); + } + if (graph.count(here)) { + for (const auto& dest : graph.at(here)) { + if (!stacked.count(dest)) { + stack.push_back(dest); + stacked.insert(dest); + } + } + } + } + + if (!non_inmdt_aliasors.empty()) { + aliases.emplace_back(head, move(non_inmdt_aliasors)); + } + } +#ifdef debug_index_registry + cerr << "identified aliases" << endl; + for (const auto& alias_record : aliases) { + cerr << alias_record.first << ":" << endl; + for (auto aliasor : alias_record.second) { + cerr << "\t" << aliasor << endl; + } + } +#endif + return aliases; +} + +InsufficientInputException::InsufficientInputException(const IndexName& target, + const IndexRegistry& registry) noexcept : + runtime_error("Insufficient input to create " + target), target(target), inputs(registry.completed_indexes()) +{ + // nothing else to do + stringstream ss; + ss << "Inputs" << endl; + for (const auto& input : inputs) { + ss << "\t" << input << endl; + } + ss << "are insufficient to create target index " << target << endl; + msg = ss.str(); +} + +const char* InsufficientInputException::what() const noexcept { + return msg.c_str(); +} + + +RewindPlanException::RewindPlanException(const string& msg, const IndexGroup& rewind_to) noexcept : 
msg(msg), indexes(rewind_to) { + // nothing else to do +} + +const char* RewindPlanException::what() const noexcept { + return msg.c_str(); +} + +const IndexGroup& RewindPlanException::get_indexes() const noexcept { + return indexes; +} + +} + diff --git a/src/index_registry.hpp b/src/index_registry.hpp new file mode 100644 index 00000000000..593d8b5e6a5 --- /dev/null +++ b/src/index_registry.hpp @@ -0,0 +1,463 @@ +#ifndef VG_INDEX_REGISTRY_HPP_INCLUDED +#define VG_INDEX_REGISTRY_HPP_INCLUDED + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace vg { + +using namespace std; + +// Forward declarations +class IndexFile; +class IndexRecipe; +class InsufficientInputException; +class IndexRegistry; +class IndexingPlan; +class AliasGraph; + +/** + * A unique identifier for an Index + */ +using IndexName = string; + +/** + * A group of indexes that can be made simultaneously + */ +using IndexGroup = set; + +/** + * Names a recipe in the collection of registered recipes. + */ +using RecipeName = pair; + +/** + * Is a recipe to create the files (returned by name) associated with some + * index, from a series of input indexes, given the plan it is being + * generated for and the index being generated. + */ +using RecipeFunc = function>(const vector&, + const IndexingPlan*, + AliasGraph&, + const IndexGroup&)>; + +/** + * A struct namespace for global handling of parameters used by + * the IndexRegistry + */ +struct IndexingParameters { + // enums for categorical options + enum MutableGraphImplementation {HashGraph, PackedGraph, VG}; + enum Verbosity {None = 0, Basic = 1, Debug = 2}; + + // the actual parameters + + // the format that "VG" indexes will be created in [HashGraph] + static MutableGraphImplementation mut_graph_impl; + // the maximum node length for graphs that are created from VCFs [32] + static int max_node_size; + // during pruning, remove nodes with degree higher than this [128] // TODO: is this a good default? 
+ static int pruning_max_node_degree; + // during pruning, identify complex regions using walks of this length [24] + static int pruning_walk_length; + // during pruning, remove edges if a walk contains this many branching edges [3] + static int pruning_max_edge_count; + // during pruning, remove any isolated components with at less than this total seq length [33] + static int pruning_min_component_size; + // factor by which the pruning walk length is increased if GCSA2 indexing fails [1.5] + static double pruning_walk_length_increase_factor; + // factor by which the max degree is decreased if GCSA2 indexing fails [0.75] + static double pruning_max_node_degree_decrease_factor; + // length of the k-mers indexed in GCSA2 before any doubling steps [16] + static int gcsa_initial_kmer_length; + // number of k-mer length doubling steps in GCSA2 [4] + static int gcsa_doubling_steps; + // disk limit for temporary files in bytes [2TB] + static int64_t gcsa_size_limit; + // number of gbwt nodes inserted at a time in dynamic gbwt [100M] + static int64_t gbwt_insert_batch_size; + // factor by which the batch size is increased if construction fails [10] + static int gbwt_insert_batch_size_increase_factor; + // the sampling interval in the GBWT suffix array [1024] + static int gbwt_sampling_interval; + // should the haplotype-transcript GBWT be made bidirectional [false] + static bool bidirectional_haplo_tx_gbwt; + // the feature from column 3 of the GTF/GFF that should be added ["gene"] + static string gff_feature_name; + // transcript tag in GTF/GFF ["transcript_id"] + static string gff_transcript_tag; + // if true, minimizer index uses bounded syncmers, otherwise uses minimizers [false] + static bool use_bounded_syncmers; + // length of k-mer used in minimizer index [29] + static int minimizer_k; + // length of window if using minimizers [11] + static int minimizer_w; + // length of internal s-mer if using bounded syncmers [18] + static int minimizer_s; + // the number of paths that will make up the path cover GBWT [16] + static int path_cover_depth; + // the number of haplotypes to downsample to in giraffe's GBWT [64] + static int giraffe_gbwt_downsample; + // sample subpaths of this length (in nodes) [4] + static int downsample_context_length; + // augment the existing GBWT instead of downsampling it if the number of haplotypes is < this * giraffe_gbwt_downsample [3] + static int downsample_threshold; + // actually use this fraction of the maximum memory to give slosh for bad estmates [0.75] + static double max_memory_proportion; + // aim to have X timese as many chunks as threads [2] + static double thread_chunk_inflation_factor; + // whether indexing algorithms will log progress (if available) [Basic] + static Verbosity verbosity; +}; + +/** + * A struct namespace for standard inputs + */ +struct VGIndexes { + /// A complete index registry for VG mapping utilities + static IndexRegistry get_vg_index_registry(); + /// A list of the identifiers of the default indexes to run vg map + static vector get_default_map_indexes(); + /// A list of the identifiers of the default indexes to run vg mpmap + static vector get_default_mpmap_indexes(); + /// A list of the identifiers of the default indexes to run rpvg + static vector get_default_rpvg_indexes(); + /// A list of the identifiers of the default indexes to run vg giraffe + static vector get_default_giraffe_indexes(); +}; + +/** + * A plan for producing indexes, which knows what should be saved and what should be ephemeral. 
+ * Wants to be nested inside IndexRegistry, but you can't forward-declare a nested class. + */ +class IndexingPlan { + +public: + // The IndexRegistry is responsible for setting us up. + friend class IndexRegistry; + + IndexingPlan() = default; + ~IndexingPlan() = default; + + /// Get the suffix with which to save the given index's files. + string output_filepath(const IndexName& identifier) const; + + /// Get the suffix with which to save the given index's files. + string output_filepath(const IndexName& identifier, size_t chunk, size_t num_chunks) const; + + /// Ge the steps of the plan + const vector& get_steps() const; + + /// Returns true if the given index is to be intermediate under the given + /// plan, and false if it is to be preserved. + bool is_intermediate(const IndexName& identifier) const; + + /// TODO: is this where this function wants to live? + int64_t target_memory_usage() const; + + /// Returns the recipes in the plan that depend on this index, including the one in which + /// it was created (if any) + set dependents(const IndexName& identifier) const; + +protected: + + /// The steps to be invoked in the plan. May be empty before the plan is + /// actually planned. + vector steps; + /// The indexes to create as outputs. + set targets; + + /// The registry that the plan is using. + /// The registry must not move while the plan is in use. + /// Can't be const because we need to get_work_dir() on it, which may + /// create the work directory. + IndexRegistry* registry; +}; + + +/** + * An object that can record methods to produce indexes and design + * workflows to create a set of desired indexes. + */ +class IndexRegistry { +public: + + // IndexingPlan can't be a child class, but it needs to be able to + // get_index, so it has to be a friend. + friend class IndexingPlan; + + /// Constructor + IndexRegistry() = default; + + /// Destructor to clean up temp files. + ~IndexRegistry(); + + // Because we own temporary files and unique pointers, we should not be copied. + IndexRegistry(const IndexRegistry& other) = delete; + IndexRegistry& operator=(const IndexRegistry& other) = delete; + + // And we need to be moved carefully + IndexRegistry(IndexRegistry&& other); + IndexRegistry& operator=(IndexRegistry&& other); + + + /// Prefix for all saved outputs + void set_prefix(const string& prefix); + + /// Get the current prefix for saving output files. + string get_prefix() const; + + /// Should intermediate files be saved to the output directory + /// or the temp directory? + void set_intermediate_file_keeping(bool keep_intermediates); + + /// Register an index containing the given identifier + void register_index(const IndexName& identifier, const string& suffix); + + /// Register a recipe to produce an index using other indexes + /// or input files. Recipes registered earlier will have higher priority. + RecipeName register_recipe(const vector& identifiers, + const vector& input_identifiers, + const RecipeFunc& exec); + + /// Indicate one recipe is a broadened version of another. 
The indexes consumed and produced + /// by the generalization must be semantically identical to those of the generalizee + void register_generalization(const RecipeName& generalizer, const RecipeName& generalizee); + + /// Indicate a serialized file that contains some identified index + void provide(const IndexName& identifier, const string& filename); + + /// Indicate a list of serialized files that contains some identified index + void provide(const IndexName& identifier, const vector& filenames); + + /// Return true if the given index is available and can be require()'d, and + /// false otherwise. + bool available(const IndexName& identifier) const; + + /// Get the filename(s) associated with the given index. Aborts if the + /// index is not a known type, or if it is not provided or made. + vector require(const IndexName& identifier) const; + + /// Set the maximum memory that indexing should try to consume (note: this is + /// not strictly adhered to due to difficulties in estimating memory use) + void set_target_memory_usage(int64_t bytes); + + /// Get the maximum memory we will try to consume + int64_t get_target_memory_usage() const; + + /// Get the amount of free memory + static int64_t get_system_memory(); + + /// Get a list of all indexes that have already been completed or provided + vector completed_indexes() const; + + /// Create and execute a plan to make the indicated indexes using provided inputs + /// If provided inputs cannot create the desired indexes, throws a + /// InsufficientInputException. + /// When completed, all requested index files will be available via require(). + void make_indexes(const vector& identifiers); + + /// Returns the recipe graph in dot format + string to_dot() const; + + /// Returns the recipe graph in dot format with a plan highlighted + string to_dot(const vector& targets) const; + + /// Determine if a VCF file is phased or not + static bool vcf_is_phased(const string& filepath); + + /// Determine if a GFA has haplotypes as W-lines + static bool gfa_has_haplotypes(const string& filepath); + + /// Discard any provided or constructed indexes + void reset(); + +protected: + + /// get a topological ordering of all registered indexes in the dependency DAG + vector dependency_order() const; + + /// generate a plan to create the indexes + IndexingPlan make_plan(const IndexGroup& end_products) const; + + /// use a recipe identifier to get the recipe + const IndexRecipe& get_recipe(const RecipeName& recipe_name) const; + + /// Build the index using the recipe with the provided priority. + /// Expose the plan so that the recipe knows where it is supposed to go. + vector> execute_recipe(const RecipeName& recipe_name, const IndexingPlan* plan, + AliasGraph& alias_graph); + + /// access index file + IndexFile* get_index(const IndexName& identifier); + + /// access const index file + const IndexFile* get_index(const IndexName& identifier) const; + + bool all_finished(const vector& inputs) const; + + bool all_finished(const IndexGroup& inputs) const; + + /// Function to get and/or initialize the temporary directory in which indexes will live + string get_work_dir(); + + /// The storage struct for named indexes. Ordered so it is easier to key on index names. 
+ map> index_registry; + + /// All of the suffixes that have been registered by indexes + unordered_set registered_suffixes; + + /// The storage struct for recipes, which may make index + map> recipe_registry; + + /// Map from generalizees to generalizers + map generalizations; + + /// Temporary directory in which indexes will live + string work_dir; + + /// filepath that will prefix all saved output + string output_prefix = "index"; + + /// should intermediate files end up in the scratch or the output directory? + bool keep_intermediates = false; + + /// the max memory we will *attempt* to use + int64_t target_memory_usage = numeric_limits::max(); +}; + +/** + * An object that generically represents a serializable index or input file + */ +class IndexFile { +public: + + /// Create a new IndexFile with a unique identifier + IndexFile(const IndexName& identifier, const string& suffix); + + /// Get the globally unique identifier for this index + const IndexName& get_identifier() const; + + /// Returns the suffix to be used for this index + const string& get_suffix() const; + + /// Get the filename(s) that contain this index + const vector& get_filenames() const; + + /// Identify a serialized file that already contains this index + void provide(const vector& filenames); + + /// Assign constructed filenames to this index + void assign_constructed(const vector& filenames); + + /// Returns true if the index has already been built or provided + bool is_finished() const; + + /// Returns true if the index was provided through provide method + bool was_provided_directly() const; + + /// Discard any constructed or provided indexes + void reset(); + +private: + + // the global identifier for the + IndexName identifier; + + // the suffix it adds to output files + const string suffix; + + // the filename(s) associated with the index + vector filenames; + + // keep track of whether the index was provided directly + bool provided_directly = false; +}; + +/** + * struct that indicates a method to produce and serialize an index + */ +struct IndexRecipe { + IndexRecipe(const vector& inputs, + const RecipeFunc& exec); + // execute the recipe and return the filename(s) of the indexes created + vector> execute(const IndexingPlan* plan, AliasGraph& alias_graph, + const IndexGroup& constructing) const; + IndexGroup input_group() const; + vector inputs; + RecipeFunc exec; +}; + +/** + * Class to keep track of which indexes are aliasing other indexes + */ +class AliasGraph { +public: + AliasGraph() = default; + ~AliasGraph() = default; + + /// Record that one index is aliasing another + void register_alias(const IndexName& aliasor, const IndexFile* aliasee); + + /// Return a list of all indexes that are being aliased by non-intermediate + /// indexes. If the aliasee is non-intermediate itself, it ill be listed among the + /// aliases too. 
+ vector>> non_intermediate_aliases(const IndexingPlan* plan, + bool keep_all) const; + +private: + + // graph aliasees to their aliasors + unordered_map> graph; + +}; + + +/** + * Exception that is thrown to indicate the input data is insufficient + * to create some index(es) + */ +class InsufficientInputException : public runtime_error { +public: + InsufficientInputException() = delete; + InsufficientInputException(const IndexName& target, + const IndexRegistry& registry) noexcept; + const char* what() const noexcept; +private: + string msg; + IndexName target; + vector inputs; +}; + + +/** + * An exception that indicates that we must rewind the plan to re-create some indexes + */ +class RewindPlanException : public std::exception { +public: + + RewindPlanException() = delete; + RewindPlanException(const string& msg, const IndexGroup& rewind_to) noexcept; + ~RewindPlanException() noexcept = default; + + const char* what() const noexcept; + const IndexGroup& get_indexes() const noexcept; + +private: + + const string msg; + IndexGroup indexes; + +}; + +} + +#endif diff --git a/src/indexed_vg.cpp b/src/indexed_vg.cpp new file mode 100644 index 00000000000..366b48a1134 --- /dev/null +++ b/src/indexed_vg.cpp @@ -0,0 +1,558 @@ +/** + * \file indexed_vg.cpp + * Implementation for the IndexedVG class, which provides a HandleGraph interface to an on-disk VG file. + */ + +#include "indexed_vg.hpp" +#include "utility.hpp" +#include "vg/io/json2pb.h" + +#include + +#include + +namespace vg { + +using namespace std; + +IndexedVG::IndexedVG(string graph_filename) : vg_filename(graph_filename), index(), + cursor_streams(), cursor_pool(), cursor_pool_mutex(), group_cache(100), cache_mutex() { + + // Decide where the index ought to be stored + string index_filename = vg_filename + ".vgi"; + + ifstream index_in_stream(index_filename); + if (index_in_stream.good()) { + // We found the index, load it + index.load(index_in_stream); + } else { + // We need to build the index + + // Get the file to write the index to + ofstream index_out_stream(index_filename); + if (!index_out_stream.good()) { + // We couldn't load the index and we can't save it + throw runtime_error("Could not open index file " + index_filename + " for reading or writing"); + } + + // TODO: Show progress as we do this? + with_cursor([&](cursor_t& cursor) { + // Get a cursor to the start of the file + assert(cursor.seek_group(0)); + + // Compute the index + index.index(cursor); + + // Save the index + index.save(index_out_stream); + }); + } + +} + +void IndexedVG::print_report() const { + cerr << cursor_streams.size() << " cursors outstanding, " << cursor_pool.size() << " cursors free" << endl; + cerr << group_cache.size() << " cache entries" << endl; + // TODO: Cache hit/miss counts from the LRUcache do not appear to be + // correct (hits seem to be counted as misses). So we don't report them + // here. +} + +bool IndexedVG::has_node(id_t node_id) const { + bool id_in_graph = false; + find(node_id, [&](const CacheEntry& entry) -> bool { + // For each relevant entry (which may just have some edges to the node we are looking for) + auto found = entry.id_to_node_index.find(node_id); + if (found != entry.id_to_node_index.end()) { + // We found the node! + id_in_graph = true; + // Stop + return false; + } + + // Otherwise we don't have the node we want + return true; + }); + + return id_in_graph; +} + +// TODO: We ought to use some kind of handle packing that relates to file offsets for graph chunks contasining nodes. 
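+// Such a packing might, roughly, keep the group's file offset in the high bits,
+// a within-group node index below that, and the orientation in the low bit, so
+// lookups could seek straight to the right group. (That is only a sketch of the
+// TODO, not an implemented scheme.)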
+// For now we just use the handlegraph::number_bool_packing and hit the index every time. + +handle_t IndexedVG::get_handle(const id_t& node_id, bool is_reverse) const { + return handlegraph::number_bool_packing::pack(node_id, is_reverse); +} + +id_t IndexedVG::get_id(const handle_t& handle) const { + return handlegraph::number_bool_packing::unpack_number(handle); +} + +bool IndexedVG::get_is_reverse(const handle_t& handle) const { + return handlegraph::number_bool_packing::unpack_bit(handle); +} + +handle_t IndexedVG::flip(const handle_t& handle) const { + return handlegraph::number_bool_packing::toggle_bit(handle); +} + +size_t IndexedVG::get_length(const handle_t& handle) const { + // We don't have a more efficient way to get the length than loading the sequence + return get_sequence(handle).size(); +} + +string IndexedVG::get_sequence(const handle_t& handle) const { + + // Get the ID of the node we are looking for + id_t id = get_id(handle); + + // We will pull the sequence out into this string. + string found_sequence; + + find(id, [&](const CacheEntry& entry) -> bool { + // For each relevant entry (which may just have some edges to the node we are looking for) + auto found = entry.id_to_node_index.find(id); + if (found != entry.id_to_node_index.end()) { + // We found the node! + // Copy out its sequence + found_sequence = entry.merged_group.node(found->second).sequence(); + // Stop + return false; + } + + // Otherwise we don't have the node we want + return true; + }); + + if (get_is_reverse(handle)) { + // Reverse complement the sequence if necessary + found_sequence = reverse_complement(found_sequence); + } + + return found_sequence; +} + +bool IndexedVG::follow_edges_impl(const handle_t& handle, bool go_left, const function& iteratee) const { + // TODO: implement stopping early in the backing index! + +#ifdef debug + cerr << "Following edges " << (go_left ? "left" : "right") << " from " << get_id(handle) << " orientation " << get_is_reverse(handle) << endl; +#endif + + if (go_left) { + // Go right from our reverse version, and return flipped results + return follow_edges(flip(handle), false, [&](const handle_t& other) -> bool { + return iteratee(flip(other)); + }); + } + + // Now we only have to handle the going right case + + // If this is false, don't call the iteratee any more. + bool keep_going = true; + + // Get the ID of the node we are looking for + id_t id = get_id(handle); + bool is_reverse = get_is_reverse(handle); + + find(id, [&](const CacheEntry& entry) -> bool { + // For each CacheEntry that describes a graph group that may have edges touching the ID we are looking for + +#ifdef debug + cerr << "Relevant cache entry: " << &entry << endl; +#endif + + // Find the list of edge indices that touch this node ID + auto found = entry.id_to_edge_indices.find(id); + + if (found == entry.id_to_edge_indices.end()) { + // No relevant edges in this cache entry. Get the next potentially relevant one. 
+ return true; + } + +#ifdef debug + cerr << "Entry has " << found->second.size() << " edge indices that touch node " << id << endl; +#endif + + for (auto edge_index : found->second) { + // Look up each relevant edge in the graph + auto& edge = entry.merged_group.edge(edge_index); + +#ifdef debug + cerr << "Consider edge #" << edge_index << " in cache entry graph" << endl; +#endif + + if (edge.from() == id) { + // This edge touches us on its from end + if ( + (!edge.from_start() && !is_reverse) || // Normal down an edge case + (edge.from_start() && is_reverse) // We also end up going down the edge the same way from the edge's point of view + ) { +#ifdef debug + cerr << "Follow edge " << pb2json(edge) << " from from end" << endl; +#endif + keep_going &= iteratee(get_handle(edge.to(), edge.to_end())); + } + } + + if (!keep_going) { + return false; + } + + if (edge.to() == id) { + // This edge touches us on its to end + if ( + (edge.to_end() && !is_reverse) || // We read up this edge + (!edge.to_end() && is_reverse) // We also read up this edge + ) { +#ifdef debug + cerr << "Follow edge " << pb2json(edge) << " from to end" << endl; +#endif + keep_going &= iteratee(get_handle(edge.from(), !edge.from_start())); + } + } + + if (!keep_going) { + return false; + } + } + + return keep_going; + }); + + if (!keep_going) { +#ifdef debug + cerr << "Stopped early!" << endl; +#endif + } + + return keep_going; +} + +bool IndexedVG::for_each_handle_impl(const function& iteratee, bool parallel) const { + // We have to scan the whole graph for this + + int64_t group_vo = 0; + atomic keep_going(true); + + while(keep_going) { + // Look up the cache entry here + bool still_in_file = with_cache_entry(group_vo, [&](const CacheEntry& entry) { + + if (parallel) { + // Handle each node in the cache entry in a task + #pragma omp parallel for + for (size_t i = 0; i < entry.merged_group.node_size(); i++) { + // Show a handle for every node to the iteratee + // stopping is best effort in multithreaded mode; we don't try very hard. + if (keep_going) { + if (!iteratee(get_handle(entry.merged_group.node(i).id(), false))) { + keep_going = false; + } + } + } + } else { + // Do it single threaded + for (auto& node : entry.merged_group.node()) { + // Show a handle for every node to the iteratee + if (!iteratee(get_handle(node.id(), false))) { + keep_going = false; + } + + if (!keep_going) { + // If it is done, stop. + break; + } + } + } + + // Move on to the next cache entry for the next group + group_vo = entry.next_group; + }); + + if (!still_in_file) { + // We hit EOF + break; + } + } + + return keep_going; +} + +size_t IndexedVG::get_node_count() const { + // TODO: Add total distinct node count to the index or cache it or something. + // Right now we just scan. + + size_t count = 0; + for_each_handle([&](const handle_t& ignored) { + count++; + }); + return count; +} + +id_t IndexedVG::min_node_id() const { + // We can just seek to the start and get the first node in the first chunk with nodes. 
+ + // This holds the first real node ID we find + id_t min_node_id = numeric_limits::max(); + + // This is the virtual offset of the serialized graph group we are considering + int64_t group_vo = 0; + + while (min_node_id == numeric_limits::max()) { + // Get graph groups in order + bool still_in_file = with_cache_entry(group_vo, [&](const CacheEntry& entry) { + if (entry.merged_group.node_size() > 0) { + // This graph has a first node + min_node_id = entry.merged_group.node(0).id(); + } + + // Move on to the next cache entry for the next group + group_vo = entry.next_group; + }); + + if (!still_in_file) { + // We hit EOF + break; + } + } + + return min_node_id; +} + +id_t IndexedVG::max_node_id() const { + + // This is the max node ID observed + id_t max_observed = 0; + + // Scan locally-forward but globally-backward through the vg file + index.scan_backward([&](int64_t start_vo, int64_t past_end_vo) -> bool { + + int64_t group_vo = start_vo; + + while (group_vo < past_end_vo) { + bool still_in_file = with_cache_entry(group_vo, [&](const CacheEntry& entry) { + if (entry.merged_group.node_size() > 0) { + // We have nodes. The last one will have the highest ID. + max_observed = entry.merged_group.node(entry.merged_group.node_size() - 1).id(); + } + // Move on to the next cache entry for the next group + group_vo = entry.next_group; + }); + + if (!still_in_file) { + // We hit EOF + break; + } + } + + if (max_observed != 0) { + // We found something. Stop going back towards the beginning. + return false; + } else { + // We haven't seen any nodes yet. Keep searching + return true; + } + }); + + return max_observed; +} + +void IndexedVG::with_cursor(function callback) const { + // We'll fill this in with a cursor from the pool if we can get one, and a new one otherwise + unique_ptr obtained_cursor; + { + // Get ahold of the pool + lock_guard lock(cursor_pool_mutex); + if (!cursor_pool.empty()) { + // Grab a cursor from the pool + obtained_cursor = move(cursor_pool.front()); + cursor_pool.pop_front(); + } else { + // Open a new file stream + cursor_streams.emplace_back(vg_filename); + assert(cursor_streams.back().good()); + + // Make a cursor around it + obtained_cursor = unique_ptr(new cursor_t(cursor_streams.back())); + } + } + + // Let the callback use it + callback(*obtained_cursor.get()); + + { + // Get ahold of the pool + lock_guard lock(cursor_pool_mutex); + + // Put the cursor back in the pool + cursor_pool.emplace_back(move(obtained_cursor)); + } + + // TODO: Does this moving unique_ptrs make sense or should we copy around indexes or real pointers +} + +void IndexedVG::find(id_t id, const function& iteratee) const { + // We will set this to false if the iteratee says to stop + bool keep_going = true; + + index.find(id, [&](int64_t run_start_vo, int64_t run_past_end_vo) -> bool { + // Loop over the index and get all the VO run ranges relevant to the given ID. + + // Scan through each run + // Start at the run's start + int64_t scan_vo = run_start_vo; + + while(keep_going && scan_vo < run_past_end_vo) { + + // We are working on the group that starts at scan_vo + + // Go get it, unless scan_vo is at EOF + bool still_in_file = with_cache_entry(scan_vo, [&](const CacheEntry& entry) { + // Now we have a cache entry for the group we were looking for. + // Show it to the iteratee. + keep_going &= iteratee(entry); + + // Advance to the next group + scan_vo = entry.next_group; + }); + + // We should never hit EOF when operating on ranges from the index. + // If we do, the index is invalid. 
+ assert(still_in_file); + + // When scan_vo hits or passes the past-end for the range, we will be done with it + } + + // Keep looking if the iteratee wants to, and get a new range. + return keep_going; + }); +} + +bool IndexedVG::with_cache_entry(int64_t group_vo, const function& callback) const { + + if (group_vo == numeric_limits::max()) { + // We got the EOF sentinel. We can't seek there. + return false; + } + + // This will point to the cache entry for the group when we find or make it. + shared_ptr cache_entry; + + { + lock_guard lock(cache_mutex); + // See if it is cached. Gets a pair of the item (if found) and a flag for whether it was found + auto cache_pair = group_cache.retrieve(group_vo); + if (cache_pair.second) { + // We found it + cache_entry = move(cache_pair.first); + } + } + + if (!cache_entry) { + // If it wasn't found, load it up. We could synchronize to do + // this with the cache lock held, to stop all threads banging + // on the disk until one of them caches it. But we probably + // want to allow simultaneous reads from disk overall. + + with_cursor([&](cursor_t& cursor) { + // Try to get to the VO we are supposed to go to + auto pre_seek_group = cursor.tell_group(); + if (!cursor.seek_group(group_vo)) { + cerr << "error[vg::IndexedVG]: Could not seek from group pos " << pre_seek_group + << " to group pos " << group_vo << endl; + cerr << "Current position: group " << cursor.tell_group() + << " has_current: " << cursor.has_current() << endl; + assert(false); + } + + if (cursor.has_current()) { + // We seeked to a real thing and not EOF + + // Read the group into a cache entry + cache_entry = shared_ptr(new CacheEntry(cursor)); + } + }); + + if (cache_entry) { + // We actually found a valid group. + + lock_guard lock(cache_mutex); + // Save a copy of the shared pointer into the cache + group_cache.put(group_vo, cache_entry); + } + + } + + if (cache_entry) { + // We aren't at EOF or anything, so call the callback + callback(*cache_entry); + return true; + } + + // We didn't find it in the file. + return false; + +} + +IndexedVG::CacheEntry::CacheEntry(cursor_t& cursor) { + + // We want to cache the group we are pointed at + int64_t group_vo = cursor.tell_group(); + + while (cursor.has_current() && cursor.tell_group() == group_vo) { + // Merge the whole group together + merged_group.MergeFrom(cursor.take()); + } + + if (!cursor.has_current()) { + // We hit EOF + next_group = numeric_limits::max(); + } else { + // We found another group. + // TODO: Can we avoid deserializing its first chunk? 
+ next_group = cursor.tell_group(); + } + + // Compute the indexes + for (size_t i = 0; i < merged_group.node_size(); i++) { + // Record the index of every node + id_to_node_index[merged_group.node(i).id()] = i; + } + + for (size_t i = 0; i < merged_group.edge_size(); i++) { + // And of every edge by end node IDs + auto& edge = merged_group.edge(i); + id_to_edge_indices[edge.from()].push_back(i); + if (edge.to() != edge.from()) { + // If it's not a self loop we need to point to it from both ends + id_to_edge_indices[edge.to()].push_back(i); + } + } +} + +Graph IndexedVG::CacheEntry::query(const id_t& id) const { + Graph to_return; + + auto node_found = id_to_node_index.find(id); + if (node_found != id_to_node_index.end()) { + // We have the node in question, so send it + *to_return.add_node() = merged_group.node(node_found->second); + } + + auto edges_found = id_to_edge_indices.find(id); + if (edges_found != id_to_edge_indices.end()) { + // We have edges on it, so send them + for (auto& edge_index : edges_found->second) { + *to_return.add_edge() = merged_group.edge(edge_index); + } + } + + // TODO: Path visits + + return to_return; +} + + +} + diff --git a/src/indexed_vg.hpp b/src/indexed_vg.hpp new file mode 100644 index 00000000000..b82274dff44 --- /dev/null +++ b/src/indexed_vg.hpp @@ -0,0 +1,207 @@ +#ifndef VG_INDEXED_VG_HPP_INCLUDED +#define VG_INDEXED_VG_HPP_INCLUDED + +/** + * \file indexed_vg.hpp + * Contains an implementation of a HandleGraph backed by a sorted, indexed .vg file + */ + +#include + +#include +#include +#include + +#include "stream_index.hpp" +#include "handle.hpp" + + +namespace vg { + +using namespace std; + +/** Use a .vg file on disk with a .vgi index to provide random access to the + * graph data without loading the entire graph into memory. Sort of a + * compromise between an XG and a VG, except unlike either we don't need the + * whole graph in memory. + * + * We require that all nodes in the graph appear in ID order within their + * chunks, and that all chunks appear in ID order. So all nodes are in ID order + * in the file. + * + * Cannot be copied since internally it contains a ProtobufIterator wrapping an + * open file. Can only be moved. + * + * All operations are thread-safe to call. Internally we can't be seeking a + * cursor off to another location in the middle of looping over a run of + * matchung chunks, but we handle that ourselves. + * + * Internally, we keep a pool of cursors into the backing graph file, and each + * time we need to actually access the backing graph file we grab a cursor or + * make one if we don't have a free one. + * + * Internally we also keep a least-recently-used cache of indexed + * merged-together graph groups. The cache is keyed by group start VO. The + * cache holds shared pointers to cache entries, so that one thread can be + * evicting something from the cache while another is still working with it. + */ +class IndexedVG : public HandleGraph { + +public: + + /// Open a .vg file. If the .vg has a .vg.vgi index, it wil be loaded. If + /// not, an index will be generated and saved. 
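+    /// For example (a sketch; the file name is hypothetical):
+    ///
+    ///     IndexedVG graph("my_sorted_graph.vg");
+    ///     graph.for_each_handle([&](const handle_t& handle) {
+    ///         cout << graph.get_id(handle) << "\t" << graph.get_sequence(handle) << endl;
+    ///     });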
+ IndexedVG(string graph_filename); + + // TODO: This gets implicitly deleted and generates warning because of the + // StreamIndex member variable + // We are moveable + //IndexedVG(IndexedVG&& other) = default; + + // TODO: This gets implicitly deleted and generates warning because StreamIndex + // member variable is not movable + //IndexedVG& operator=(IndexedVG&& other) = default; + + void print_report() const; + +private: + // We are not copyable because we keep a pool of open files + IndexedVG(const IndexedVG& other) = delete; + + // TODO: This gets implicitly deleted and generates warning because of the + // StreamIndex member variable + //IndexedVG& operator=(const IndexedVG& other) = delete; + +public: + + /////////////// + // Handle Graph Interface + /////////////// + + /// Check if a node exists by ID + virtual bool has_node(id_t node_id) const; + + /// Look up the handle for the node with the given ID in the given orientation + virtual handle_t get_handle(const id_t& node_id, bool is_reverse = false) const; + + /// Get the ID from a handle + virtual id_t get_id(const handle_t& handle) const; + + /// Get the orientation of a handle + virtual bool get_is_reverse(const handle_t& handle) const; + + /// Invert the orientation of a handle (potentially without getting its ID) + virtual handle_t flip(const handle_t& handle) const; + + /// Get the length of a node + virtual size_t get_length(const handle_t& handle) const; + + /// Get the sequence of a node, presented in the handle's local forward + /// orientation. + virtual string get_sequence(const handle_t& handle) const; + + /// Loop over all the handles to next/previous (right/left) nodes. Passes + /// them to a callback which returns false to stop iterating and true to + /// continue. Returns true if we finished and false if we stopped early. + virtual bool follow_edges_impl(const handle_t& handle, bool go_left, const function& iteratee) const; + + /// Loop over all the nodes in the graph in their local forward + /// orientations, in their internal stored order. Stop if the iteratee + /// returns false. Can be told to run in parallel, in which case stopping + /// after a false return value is on a best-effort basis and iteration + /// order is not defined. + virtual bool for_each_handle_impl(const function& iteratee, bool parallel = false) const; + + /// Return the number of nodes in the graph + virtual size_t get_node_count() const; + + /// Return the smallest ID in the graph, or some smaller number if the + /// smallest ID is unavailable. Return value is unspecified if the graph is empty. + virtual id_t min_node_id() const; + + /// Return the largest ID in the graph, or some larger number if the + /// largest ID is unavailable. Return value is unspecified if the graph is empty. + virtual id_t max_node_id() const; + +protected: + /// We store the graph filename, so we can have cursors to it created on demand. + /// This is necessary to have e.g. random accesses to bits of the graph while looping over the graph as a whole. + /// The downside is we lose BGZF block cacheing between different streams of access. + string vg_filename; + + /// Index data about the vg file + StreamIndex index; + + /// Define the type we use for cursors into the backing file. + using cursor_t = StreamIndex::cursor_t; + + /// Get temporary ownership of a cursor to the backing vg file. 
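+    /// For example (a sketch), reading from the start of the backing file:
+    ///
+    ///     with_cursor([&](cursor_t& cursor) {
+    ///         cursor.seek_group(0);
+    ///         // ... read groups through the cursor ...
+    ///     });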
+ void with_cursor(function callback) const; + + /// Input streams referenced by cursors live in this list that grows forever + mutable list cursor_streams; + /// Cursors live in this free pool + mutable list> cursor_pool; + /// Access is protected by this mutex + mutable mutex cursor_pool_mutex; + + /// Represents an entry in the cache for a parsed group of graphs. + /// Has its own indexes and the virtual offset of the next group. + struct CacheEntry { + /// Make a cache entry for a group by reading a cursor at that group's start + CacheEntry(cursor_t& to_read); + + // Can be moved + CacheEntry(CacheEntry&& other) = default; + CacheEntry& operator=(CacheEntry&& other) = default; + + /// Pull out the subgraph for the given node + Graph query(const id_t& id) const; + + /// All the graphs get merged into this one + Graph merged_group; + + /// This maps from node ID to index in the merged graph. + unordered_map id_to_node_index; + + /// This maps from node ID to indexes of touched edges in the merged graph. + unordered_map> id_to_edge_indices; + + // TODO: Path visits + + /// This is the virtual offset of the next group in the file. + /// If this was the last group in the file, this is numeric_limits::max(). + int64_t next_group; + }; + + /// Wrapper around the index's find, with cacheing. Supports stopping + /// early, but doesn't do internal filtering of chunks/runs where the node + /// being queried is in a hole. Runs the iteratee on CacheEntry objects for + /// the runs that might have info on the requested node, in order. + /// Internally holds shared_ptr copies to the cache entries it is handing + /// out references to. Users must do all everything they need the + /// CacheEntry for within the callback as the reference may not be valid + /// afterwards. + void find(id_t id, const function& iteratee) const; + + /// Load or use the cached version of the CacheEntry for the given group + /// start VO. If the EOF sentinel numeric_limits::max() is passed, + /// the callback is not called and false is returned. (This is to enable + /// easy looping to scan over CacheEntries.) Passing any other past-the-end + /// VO is prohibited, and may produce an error. Handles locking the cache + /// for updates and keeping the CacheEntry reference live while the + /// callback is running. + bool with_cache_entry(int64_t group_vo, const function& callback) const; + + /// This is the cache that holds CacheEntries for groups we have already parsed and indexed. + /// We can only access the cache from one thread at a time, but the shared pointers let us + /// be working with the actual data in ther threads. + mutable LRUCache> group_cache; + /// The cache is protected with this mutex + mutable mutex cache_mutex; +}; + +} + + +#endif diff --git a/src/integrated_snarl_finder.cpp b/src/integrated_snarl_finder.cpp new file mode 100644 index 00000000000..f520f67e7be --- /dev/null +++ b/src/integrated_snarl_finder.cpp @@ -0,0 +1,1886 @@ +/// +/// \file integrated_snarl_finder.cpp +/// +/// + +#include "integrated_snarl_finder.hpp" + +#include "algorithms/three_edge_connected_components.hpp" +#include "subgraph_overlay.hpp" + +#include +#include + +#include +#include + +namespace vg { + +//#define debug + +using namespace std; + +class IntegratedSnarlFinder::MergedAdjacencyGraph { +protected: + /// Hold onto the backing RankedHandleGraph. + /// Union find index is handle rank - 1 (to make it 0-based). 
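+    /// That is, a handle h lives at union-find index graph->handle_to_rank(h) - 1,
+    /// and union-find index r maps back to graph->rank_to_handle(r + 1); see
+    /// uf_rank() and uf_handle() below.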
+    const RankedHandleGraph* graph;
+
+    /// Keep a union-find over the ranks of the merged oriented handles that
+    /// make up each component. Runs with include_children=true so we can find
+    /// all the members of each group.
+    ///
+    /// Needs to be mutable because union-find find operations do internal tree
+    /// massaging and aren't const.
+    /// TODO: this makes read operations not thread safe!
+    mutable structures::UnionFind union_find;
+
+    /// Get the rank corresponding to the given handle, in the union-find.
+    /// Our ranks are 0-based.
+    size_t uf_rank(handle_t into) const;
+
+    /// Get the handle with the given rank in union-find space.
+    /// Our ranks are 0-based.
+    handle_t uf_handle(size_t rank) const;
+
+public:
+    /// Make a MergedAdjacencyGraph representing the graph of adjacency components of the given RankedHandleGraph.
+    MergedAdjacencyGraph(const RankedHandleGraph* graph);
+
+    /// Copy a MergedAdjacencyGraph by re-doing all the merges. Uses its own internal vectorization.
+    MergedAdjacencyGraph(const MergedAdjacencyGraph& other);
+
+    /// Given handles reading into two components, a and b, merge them into a single component.
+    void merge(handle_t into_a, handle_t into_b);
+
+    /// Find the handle heading the component that the given handle is in.
+    handle_t find(handle_t into) const;
+
+    /// For each head, call the iteratee.
+    void for_each_head(const function<void(handle_t)>& iteratee) const;
+
+    /// For each item other than the head in the component headed by the given
+    /// handle, calls the iteratee with that other item. Does not call the
+    /// iteratee for single-item components.
+    void for_each_other_member(handle_t head, const function<void(handle_t)>& iteratee) const;
+
+    /// For each item, including the head, in the component headed by the given
+    /// handle, calls the iteratee with that item. Calls the iteratee for
+    /// single-item components (on the head itself).
+    void for_each_member(handle_t head, const function<void(handle_t)>& iteratee) const;
+
+    /// For each item other than the head in each component, calls the iteratee
+    /// with the head and the other item. Does not call the iteratee for
+    /// single-item components.
+    void for_each_membership(const function<void(handle_t, handle_t)>& iteratee) const;
+
+    /// In a graph where all 3-edge-connected components have had their nodes
+    /// merged, find all the cycles. Cycles are guaranteed to overlap at at
+    /// most one node, so no special handling of overlapping regions is done.
+    ///
+    /// Returns a list of cycle edge length (in bp) and an edge on each cycle
+    /// (for the longest cycle in each connected component), and a map from
+    /// each edge on the cycle to the next edge, going around each cycle in one
+    /// direction (for all cycles).
+    ///
+    /// Ignores self loops.
+    pair<vector<pair<size_t, handle_t>>, unordered_map<handle_t, handle_t>> cycles_in_cactus() const;
+
+    /// Find a path of cycles connecting two components in a Cactus graph.
+    /// Cycles are represented by the handle that brings that cycle into the component where it intersects the previous cycle.
+    /// Because the graph is a Cactus graph, cycles are a tree and intersect at at most one node.
+    /// Uses the given map of cycles, stored in one orientation only, to traverse cycles.
+    vector<handle_t> find_cycle_path_in_cactus(const unordered_map<handle_t, handle_t>& next_along_cycle, handle_t start_cactus_head, handle_t end_cactus_head) const;
+
+    /// Return the path length (total edge length in bp) and edges for the
+    /// longest path in each tree in a forest. Ignores self loops on tree nodes.
+ /// + /// Also return the map from the head of each component to the edge into + /// the child that is first along the longest path to a leaf. For + /// components not themselves on the longest leaf-leaf path in their tree, + /// these will always be dangling off/rooted by the longest leaf-leaf path + /// or longest simple cycle merged away, whichever is longer. + /// + /// Needs access to the longest simple cycles that were merged out, if any. + /// If a path in the forest doesn't match or beat the length of the cycle + /// that lives in its tree, it is omitted. + pair>>, unordered_map> longest_paths_in_forest(const vector>& longest_simple_cycles) const; + + /// Describe the graph in dot format to the given stream; + void to_dot(ostream& out) const; +}; + +void IntegratedSnarlFinder::MergedAdjacencyGraph::to_dot(ostream& out) const { + out << "digraph G {" << endl; + for_each_head([&](handle_t head) { + // Have a node for every head + out << "\tn" << graph->get_id(head) << (graph->get_is_reverse(head) ? "r" : "f") << "[shape=\"point\"];" << endl; + for_each_member(head, [&](handle_t edge) { + // For everything reading into here + if (graph->get_is_reverse(edge)) { + // If it is coming here backward (we are its start) + handle_t flipped_head = find(graph->flip(edge)); + // Only draw edges in one direction. Label them with their nodes. + out << "\tn" << graph->get_id(head) << (graph->get_is_reverse(head) ? "r" : "f") + << " -> n" << graph->get_id(flipped_head) << (graph->get_is_reverse(flipped_head) ? "r" : "f") + << " [label=" << graph->get_id(edge) << "];" << endl; + } + }); + }); + out << "}" << endl; +} + +size_t IntegratedSnarlFinder::MergedAdjacencyGraph::uf_rank(handle_t into) const { + // We need to 0-base the backing rank + return graph->handle_to_rank(into) - 1; +} + +handle_t IntegratedSnarlFinder::MergedAdjacencyGraph::uf_handle(size_t rank) const { + // We need to 1-base the rank and then get the handle. + return graph->rank_to_handle(rank + 1); +} + +IntegratedSnarlFinder::MergedAdjacencyGraph::MergedAdjacencyGraph(const RankedHandleGraph* graph) : graph(graph), + union_find(graph->get_node_count() * 2, true) { + + // TODO: we want the adjacency components that are just single edges + // between two handles (i.e. trivial snarls) to be implicit, so we don't + // have to do O(n) work for so much of the graph. But to do that we need a + // union-find that lets us declare it over a potentially large space + // without filling it all in. + + // So we do this the easy way and compute all the merges for all adjacency + // components, including tiny/numerous ones, right now. + + // If we ever change this, we should also make MergedAdjacencyGraph + // stackable, to save a copy when we want to do further merges but keep the + // old state. + + graph->for_each_edge([&](const handlegraph::edge_t& e) { + // Get the inward-facing version of the second handle + auto into_b = graph->flip(e.second); + + // Merge to create initial adjacency components + merge(e.first, into_b); + }); +} + +IntegratedSnarlFinder::MergedAdjacencyGraph::MergedAdjacencyGraph(const MergedAdjacencyGraph& other) : MergedAdjacencyGraph(other.graph) { + other.for_each_membership([&](handle_t head, handle_t member) { + // For anything in a component, other than its head, do the merge with the head. 
+ merge(head, member); + }); +} + +void IntegratedSnarlFinder::MergedAdjacencyGraph::merge(handle_t into_a, handle_t into_b) { + // Get ranks and merge + union_find.union_groups(uf_rank(into_a), uf_rank(into_b)); +} + +handle_t IntegratedSnarlFinder::MergedAdjacencyGraph::find(handle_t into) const { + // Get rank, find head, and get handle + return uf_handle(union_find.find_group(uf_rank(into))); +} + +void IntegratedSnarlFinder::MergedAdjacencyGraph::for_each_head(const function& iteratee) const { + // TODO: is this better or worse than getting the vector of vectors of the whole union-find? + // TODO: make iterating groups an actual capability that the union-find has, in O(groups). + // This lets us do it in O(total items). + + // We track if we have seen a head yet. If we haven't, we emit it and mark it seen. + vector seen_heads(union_find.size(), false); + + for (size_t i = 0; i < union_find.size(); i++) { + // For each item in the union-find + if (!seen_heads[i]) { + // If we haven't emitted it, find the head of its group + size_t head = union_find.find_group(i); + if (!seen_heads[head]) { + // If we haven't emitted that head either, say we have + seen_heads[head] = true; + // And emit its corresponding inward-facing handle + iteratee(uf_handle(head)); + } + } + } +} + +void IntegratedSnarlFinder::MergedAdjacencyGraph::for_each_other_member(handle_t head, const function& iteratee) const { + size_t head_rank = uf_rank(head); + // Find the group the head is in + vector group = union_find.group(head_rank); + for (auto& member_rank : group) { + // And go through all the members + if (member_rank != head_rank) { + // We filter out the given head. + // This function will happen to work for non-head inputs, leaving out that input, but we don't guarantee it! + iteratee(uf_handle(member_rank)); + } + } +} + +void IntegratedSnarlFinder::MergedAdjacencyGraph::for_each_member(handle_t head, const function& iteratee) const { + size_t head_rank = uf_rank(head); + // Find the group the head is in + vector group = union_find.group(head_rank); + for (auto& member_rank : group) { + // And go through all the members, including the head. + // This function will happen to work for non-head inputs, leaving out that input, but we don't guarantee it! + iteratee(uf_handle(member_rank)); + } +} + +void IntegratedSnarlFinder::MergedAdjacencyGraph::for_each_membership(const function& iteratee) const { + // We do this weird iteration because it's vaguely efficient in the union-find we use. + vector> uf_components = union_find.all_groups(); + + for (auto& component : uf_components) { + // For each component + for (size_t i = 1; i < component.size(); i++) { + // For everything other than the head, announce with the head. + iteratee(uf_handle(component[0]), uf_handle(component[i])); + } + } +} + +pair>, unordered_map> IntegratedSnarlFinder::MergedAdjacencyGraph::cycles_in_cactus() const { + // Do a DFS over all connected components of the graph + + // We will fill this in + pair>, unordered_map> to_return; + auto& longest_cycles = to_return.first; + auto& next_edge = to_return.second; + + // When we see a back edge that isn't a self loop, we jump up the stack and + // walk down it, writing the cycle relationships. We know the cycles can't + // overlap (due to merged 3 edge connected components) so we also know we + // won't ever re-walk the same part of the stack, so this is efficient. 
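+    // (For example, if the back edge reaches the component of stack frame k, then
+    // the edges followed at frames k+1 through the top of the stack, plus the back
+    // edge itself, form one simple cycle, and we record the next_edge links around
+    // it in that order before continuing the search.)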
+ + // To let us actually find the cycle paths when we see back edges, we need + // to remember stack frame index as our visit marker. No record = not + // visited, record = visited. + // + // If you see something visited already, it must still be on the stack. + // Otherwise it would have already visited you when it was on the stack. + // + // This is on heads, representing nodes. + unordered_map visited_frame; + + // We need a stack. + // Stack is actually in terms of inward edges followed. + struct DFSFrame { + handle_t here; + vector todo; + }; + + vector stack; + + for_each_head([&](handle_t component_root) { + // For every node in the graph + + if (!visited_frame.count(component_root)) { + +#ifdef debug + cerr << "Root simple cycle search at " << graph->get_id(component_root) << (graph->get_is_reverse(component_root) ? "-" : "+") << endl; +#endif + + // If it hasn't been searched yet, start a search of its connected component. + stack.emplace_back(); + stack.back().here = component_root; + + // We'll put the longest cycle start edge result here, or get rid of it if we find no cycle. + longest_cycles.emplace_back(); + auto& longest_cycle = longest_cycles.back(); + + while (!stack.empty()) { + // Until the DFS is done + auto& frame = stack.back(); + // Find the node that following this edge got us to. + auto frame_head = find(frame.here); + +#ifdef debug + cerr << "At stack frame " << stack.size() - 1 << " for edge " << graph->get_id(frame.here) << (graph->get_is_reverse(frame.here) ? "-" : "+") + << " on component " << graph->get_id(frame_head) << (graph->get_is_reverse(frame_head) ? "-" : "+") << endl; +#endif + + auto frame_it = visited_frame.find(frame_head); + if (frame_it == visited_frame.end()) { + // First visit to here. + +#ifdef debug + cerr << "\tFirst visit" << endl; +#endif + + // Mark visited at this stack level + frame_it = visited_frame.emplace_hint(frame_it, frame_head, stack.size() - 1); + + // Queue up edges + for_each_member(frame_head, [&](handle_t member) { + if (member != frame.here || stack.size() == 1) { + // If it's not just turning around and looking up + // the edge we took to get here, or if we're the + // top stack frame and we didn't come from anywhere + // anyway + + // Follow edge by flipping. But queue up the edge + // followed instead of the node reached (head), so we + // can emit the cycle later in terms of edges. + frame.todo.push_back(graph->flip(member)); + +#ifdef debug + cerr << "\t\tNeed to follow " << graph->get_id(frame.todo.back()) << (graph->get_is_reverse(frame.todo.back()) ? "-" : "+") << endl; +#endif + } + }); + } + + if (!frame.todo.empty()) { + // Now do an edge + handle_t edge_into = frame.todo.back(); + handle_t connected_head = find(edge_into); + frame.todo.pop_back(); + +#ifdef debug + cerr << "\tFollow " << graph->get_id(edge_into) << (graph->get_is_reverse(edge_into) ? "-" : "+") + << " to component " << graph->get_id(connected_head) << (graph->get_is_reverse(connected_head) ? "-" : "+") << endl; +#endif + + auto connected_it = visited_frame.find(connected_head); + + if (connected_it == visited_frame.end()) { + +#ifdef debug + cerr << "\t\tNot yet visited. Recurse!" << endl; +#endif + + // Forward edge. Recurse. + // TODO: this immediately does a lookup in the hash table again. + stack.emplace_back(); + stack.back().here = edge_into; + } else { + // Back edge + if (frame_it->second > connected_it->second) { + // We have an edge to something that was visited above + // our stack level. 
It can't be a self loop, and it + // must close a unique cycle. + +#ifdef debug + cerr << "\tBack edge up stack to frame " << connected_it->second << endl; +#endif + +#ifdef debug + cerr << "\t\tFound cycle:" << endl; +#endif + + // Walk and measure the cycle. But don't count the + // frame we arrived at because its incoming edge + // isn't actually on the cycle. + size_t cycle_length_bp = graph->get_length(edge_into); + handle_t prev_edge = edge_into; + for (size_t i = connected_it->second + 1; i < stack.size(); i++) { + // For each edge along the cycle... + +#ifdef debug + cerr << "\t\t\t" << graph->get_id(stack[i].here) << (graph->get_is_reverse(stack[i].here) ? "-" : "+") << endl; +#endif + + // Measure it + cycle_length_bp += graph->get_length(stack[i].here); + // Record the cycle membership + next_edge[prev_edge] = stack[i].here; + // Advance + prev_edge = stack[i].here; + } + // Close the cycle + next_edge[prev_edge] = edge_into; + +#ifdef debug + cerr << "\t\t\t" << graph->get_id(edge_into) << (graph->get_is_reverse(edge_into) ? "-" : "+") << endl; +#endif + +#ifdef debug + cerr << "\t\tCycle length: " << cycle_length_bp << " bp" << endl; +#endif + + if (cycle_length_bp > longest_cycle.first) { + // New longest cycle (or maybe only longest cycle). + +#ifdef debug + cerr << "\t\t\tNew longest cycle!" << endl; +#endif + + // TODO: Assumes no cycles are 0-length + longest_cycle.first = cycle_length_bp; + longest_cycle.second = edge_into; + } + } + } + } else { + // Now we're done with this stack frame. + + // Clean up + stack.pop_back(); + } + } + + if (longest_cycle.first == 0) { + // No (non-empty) nontrivial cycle found in this connected component. + // Remove its spot. + longest_cycles.pop_back(); + } + } + }); + +#ifdef debug + cerr << "Cycle links:" << endl; + for (auto& kv : next_edge) { + cerr << "\t" << graph->get_id(kv.first) << (graph->get_is_reverse(kv.first) ? "-" : "+") + << " -> " << graph->get_id(kv.second) << (graph->get_is_reverse(kv.second) ? "-" : "+") << endl; + } +#endif + + return to_return; +} + +vector IntegratedSnarlFinder::MergedAdjacencyGraph::find_cycle_path_in_cactus(const unordered_map& next_along_cycle, handle_t start_head, handle_t end_head) const { + // We fill this in with a path of cycles. + // Each cycle is the edge on that cycle leading into the node that it shares with the previous cycle. + vector cycle_path; + + // We just DFS through the cycle tree until we find one that touches the + // other node. We represent each cycle by an edge on it into the node where + // it overlaps the parent cycle, and store the current cycle and the other + // cycles to do that share nodes. + vector, bool>> cycle_stack; + + // We have a list of DFS roots we can stop early on. + vector roots; + for_each_member(start_head, [&](handle_t inbound) { + if (next_along_cycle.count(inbound)) { + // This edge is how a cycle comes into here. Consider this cycle. + roots.push_back(inbound); + } + }); + + for (auto& root : roots) { + // Root at each root + cycle_stack.emplace_back(root, vector(), false); + while (!cycle_stack.empty()) { + auto& cycle_frame = cycle_stack.back(); + if (!get<2>(cycle_frame)) { + // First visit + get<2>(cycle_frame) = true; + + // Need to fill in child cycles. 
+ for (auto it = next_along_cycle.find(get<0>(cycle_frame)); it->second != get<0>(cycle_frame); it = next_along_cycle.find(it->second)) { + // For each other edge around the cycle (in it->second) other than the one we started at + + handle_t node = find(it->second); + if (node == end_head) { + // This cycle intersects the destination. It is the last on the cycle path. + + // Copy the path on the stack over. + // Note that the first think on the path is in the + // start's component, but the last thing on the path + // isn't in the end's component. + cycle_path.reserve(cycle_stack.size()); + for (auto& f : cycle_stack) { + cycle_path.push_back(get<0>(f)); + } + + // Now the cycle path is done + return cycle_path; + } + + for_each_member(node, [&](handle_t inbound) { + // For each edge in the component it enters + if (inbound != it->second && next_along_cycle.count(inbound)) { + // This edge is a cycle coming into a node our current cycle touches. + get<1>(cycle_frame).push_back(inbound); + } + }); + } + } + if (!get<1>(cycle_frame).empty()) { + // Need to recurse on a connected cycle. + handle_t child = get<1>(cycle_frame).back(); + get<1>(cycle_frame).pop_back(); + cycle_stack.emplace_back(child, vector(), false); + } else { + // Need to clean up and return + cycle_stack.pop_back(); + } + } + } + + // If we get here, we never found a path. + // Complain! Something is wrong! + throw runtime_error("Cound not find cycle path!"); +} + +pair>>, unordered_map> IntegratedSnarlFinder::MergedAdjacencyGraph::longest_paths_in_forest( + const vector>& longest_simple_cycles) const { + + // TODO: somehow unify DFS logic with cycle-finding DFS in a way that still + // allows us to inspect our stack in each case? + + // Going up the tree, we need to track the longest path from a leaf to the + // subtree root, and the longest path between leaves in the subtree. These + // ought to overlap substantially, but either may be the real winner when + // we get to where we rooted the DFS. + + // Set up the return value + pair>>, unordered_map> to_return; + + // When we find a longest path in a connected component (tree), we put its + // length and value in here. We describe it as edges followed. + auto& longest_tree_paths = to_return.first; + + // We use this as part of our DFS scratch to record the first edge on the + // deepest path to a leaf in a subtree. The actual length of that path is + // stored in the main record for the head of the component the given edge + // reaches. If we find a longest leaf-leaf path in the tree that beats the + // simple cycle (if any), we rewrite this to be rooted somewhere along that + // leaf-leaf path. Indexed by head. + auto& deepest_child_edge = to_return.second; + + // The DFS also needs records, one per component, indexed by head. + // Absence of a record = unvisited. + struct DFSRecord { + // Remember the edge to traverse to get back to the parent, so we can + // find the path from the longest leaf-leaf path's converging node to + // the DFS root if we need it. + handle_t parent_edge; + // How long is the deepest path to a leaf from here, plus the length of + // the edge followed to here from the parent? + // Filled in when we leave the stack, by looking at deepest_child_edge. + size_t leaf_path_length = 0; + // What edge goes to the second-deepest child, if we have one, to form + // the longest leaf-leaf path converging here? + handle_t second_deepest_child_edge; + // And do we have such a second-deepest child? 
+ bool has_second_deepest_child = false; + // And what head in the graph is the convergance point of the longest + // leaf-leaf path in our subtree? If it points to us, and we don't have + // a second deepest child, there is no leaf-leaf path in our subtree. + // + // Actually filled in when we finish a node. When children write to it + // to max themselves in, they clobber it if it points back to us. + handle_t longest_subtree_path_root; + // We don't need to store this, because it's determined by the leaf + // path lengths of the best and second best children of the longest + // subtree path root, but to save a whole mess of transitive accesses + // we track the longest subtree paht length here as well. Will be 0 + // when there is no subtree leaf-leaf path. + size_t longest_subtree_path_length; + }; + unordered_map records; + + + // We need a stack. + // Stack is actually in terms of inward edges followed. + struct DFSFrame { + handle_t here; + // What edges still need to be followed + vector todo; + }; + + vector stack; + + // We have a function to try DFS from a root, if the root is unvisited. + // If root_cycle_length is nonzero, we will not rewrite deepest_child_edge + // to point towards the longest leaf-leaf path, if it isn't as long as the + // cycle or longer. + auto try_root = [&](handle_t traversal_root, size_t root_cycle_length) { + if (!records.count(traversal_root)) { + // If it hasn't been searched yet, start a search + stack.emplace_back(); + stack.back().here = traversal_root; + +#ifdef debug + cerr << "Root bridge tree traversal at " << graph->get_id(traversal_root) << (graph->get_is_reverse(traversal_root) ? "-" : "+") << endl; +#endif + + while (!stack.empty()) { + // Until the DFS is done + auto& frame = stack.back(); + // Find the node that following this edge got us to. + auto frame_head = find(frame.here); + +#ifdef debug + cerr << "At stack frame " << stack.size() - 1 << " for edge " << graph->get_id(frame.here) << (graph->get_is_reverse(frame.here) ? "-" : "+") + << " into component with head " << graph->get_id(frame_head) << (graph->get_is_reverse(frame_head) ? "-" : "+") << endl; +#endif + + auto frame_it = records.find(frame_head); + if (frame_it == records.end()) { + // First visit to here. + +#ifdef debug + cerr << "\tFirst visit. Find edges." << endl; +#endif + + // Mark visited + frame_it = records.emplace_hint(frame_it, frame_head, DFSRecord()); + // And fill it in with default references. + // Remember how to get back to the parent + frame_it->second.parent_edge = graph->flip(frame.here); + // Say there's no known leaf-leaf path converging anywhere under it yet. + frame_it->second.longest_subtree_path_root = frame_head; + + // Queue up edges + for_each_member(frame_head, [&](handle_t member) { + // Follow edge by flipping. + auto flipped = graph->flip(member); + + if (find(flipped) != frame_head) { + // Only accept non-self-loops. + +#ifdef debug + cerr << "\t\tNeed to follow " << graph->get_id(flipped) << (graph->get_is_reverse(flipped) ? "-" : "+") << endl; +#endif + + // Queue up the edge followed instead of the node + // reached (head), so we can emit the cycle later + // in terms of edges. + frame.todo.push_back(flipped); + } + }); + } + + auto& record = frame_it->second; + + if (!frame.todo.empty()) { + // Now do an edge + handle_t edge_into = frame.todo.back(); + handle_t connected_head = find(edge_into); + frame.todo.pop_back(); + +#ifdef debug + cerr << "\tFollowing " << graph->get_id(edge_into) << (graph->get_is_reverse(edge_into) ? 
"-" : "+") << endl; +#endif + + if (!records.count(connected_head)) { + // Forward edge. Recurse. + +#ifdef debug + cerr << "\t\tReaches unvisited " << graph->get_id(connected_head) << (graph->get_is_reverse(connected_head) ? "-" : "+") << "; Recurse!" << endl; +#endif + + stack.emplace_back(); + stack.back().here = edge_into; + } + } else { + // No children left. + +#ifdef debug + cerr << "\tDone with all children." << endl; +#endif + + // Did any of our children decalre themselves deepest? + // Or do we have no children. + auto deepest_child_edge_it = deepest_child_edge.find(frame_head); + + if (stack.size() > 1) { + // If we have a parent + auto& parent_frame = stack[stack.size() - 2]; + auto parent_head = find(parent_frame.here); + auto& parent_record = records[parent_head]; + + // The length of the path to a leaf will involve the edge from the parent to here. + record.leaf_path_length = graph->get_length(frame.here); + +#ifdef debug + cerr << "\t\tLength of path to deepest leaf is " << record.leaf_path_length << " bp" << endl; +#endif + + if (deepest_child_edge_it != deepest_child_edge.end()) { + // And if we have a child to go on with, we add the length of that path + record.leaf_path_length += records[find(deepest_child_edge_it->second)].leaf_path_length; + +#ifdef debug + cerr << "\t\t\tPlus length from here to leaf via " + << graph->get_id(deepest_child_edge_it->second) << (graph->get_is_reverse(deepest_child_edge_it->second) ? "-" : "+") + << " for " << record.leaf_path_length << " bp total" << endl; +#endif + + } + + // Fill in deepest_child_edge for the parent if not filled in already, or if we beat what's there. + // Also maintain parent's second_deepest_child_edge. + auto parent_deepest_child_it = deepest_child_edge.find(parent_head); + if (parent_deepest_child_it == deepest_child_edge.end()) { + +#ifdef debug + cerr << "\t\tWe are our parent's deepest child by default!" << endl; +#endif + + // Emplace in the map where we didn't find anything. + deepest_child_edge.emplace_hint(parent_deepest_child_it, parent_head, frame.here); + } else if(records[find(parent_deepest_child_it->second)].leaf_path_length < record.leaf_path_length) { + // We are longer than what's there now + +#ifdef debug + cerr << "\t\tWe are our parent's new deepest child!" << endl; +#endif + + // Demote what's there to second-best + parent_record.second_deepest_child_edge = parent_deepest_child_it->second; + parent_record.has_second_deepest_child = true; + +#ifdef debug + cerr << "\t\t\tWe demote " + << graph->get_id(parent_record.second_deepest_child_edge) << (graph->get_is_reverse(parent_record.second_deepest_child_edge) ? "-" : "+") + << " to second-deepest child" << endl; +#endif + + // Replace the value we found + parent_deepest_child_it->second = frame.here; + } else if (!parent_record.has_second_deepest_child) { + +#ifdef debug + cerr << "\t\tWe are our parent's second deepest child by default!" << endl; +#endif + + // There's no second-deepest recorded so we must be it. + parent_record.second_deepest_child_edge = frame.here; + parent_record.has_second_deepest_child = true; + } else if (records[find(parent_record.second_deepest_child_edge)].leaf_path_length < record.leaf_path_length) { + +#ifdef debug + cerr << "\t\tWe are our parent's new second deepest child!" << endl; +#endif + + // We are a new second deepest child. 
+ parent_record.second_deepest_child_edge = frame.here; + } + } + + // The length of the longest leaf-leaf path converging at or under any child (if any) is in record.longest_subtree_path_length. + + if (record.has_second_deepest_child || stack.size() == 1) { + // If there's a second incoming leaf path there's a converging leaf-leaf path here. + // If we're the root and there *isn't* a second incoming leaf-leaf path, we are ourselves a leaf. + + // Grab the length of the longest leaf-leaf path converging exactly here. + // TODO: can we not look up the deepest child's record again? + size_t longest_here_path_length = 0; + if (deepest_child_edge_it != deepest_child_edge.end()) { + longest_here_path_length += records[find(deepest_child_edge_it->second)].leaf_path_length; + } + if (record.has_second_deepest_child) { + longest_here_path_length += records[find(record.second_deepest_child_edge)].leaf_path_length; + } + +#ifdef debug + cerr << "\t\tPaths converge here with total length " << longest_here_path_length << " bp" << endl; +#endif + + if (record.longest_subtree_path_root == frame_head || longest_here_path_length > record.longest_subtree_path_length) { + +#ifdef debug + cerr << "\t\t\tNew longest path in subtree!" << endl; +#endif + + // If there's no path from a child, or this path is + // longer, set record.longest_subtree_path_root + // (back) to frame_head to record that. + record.longest_subtree_path_root = frame_head; + // And save the length. + record.longest_subtree_path_length = longest_here_path_length; + + // Now we are the new root of the longest leaf-leaf path converging at or under us. + } + } + + if (stack.size() > 1 && record.longest_subtree_path_length > 0) { + // We have a leaf-leaf path converging at or under here, and we have a parent. + // TODO: we assume leaf-leaf paths are nonzero length here. + // TODO: save searching up the parent record again + auto& parent_frame = stack[stack.size() - 2]; + auto parent_head = find(parent_frame.here); + auto& parent_record = records[parent_head]; + + // Max our longest leaf-leaf path in against the paths contributed by previous children. + if (parent_record.longest_subtree_path_root == parent_head || + parent_record.longest_subtree_path_length < record.longest_subtree_path_length) { + +#ifdef debug + cerr << "\t\tLongest path in our subtree converging at " + << graph->get_id(record.longest_subtree_path_root) << (graph->get_is_reverse(record.longest_subtree_path_root) ? "-" : "+") + << " is the new longest path in our parent's subtree." << endl; +#endif + + // No child has contributed their leaf-leaf path so far, or ours is better. + parent_record.longest_subtree_path_root = record.longest_subtree_path_root; + parent_record.longest_subtree_path_length = record.longest_subtree_path_length; + } + } + + if (stack.size() == 1) { + // When we get back to the root + +#ifdef debug + cerr << "\t\tWe were the root of the traversal." << endl; +#endif + + if (record.longest_subtree_path_length >= root_cycle_length) { + // Either we didn't root at a cycle, or we found a longer leaf-leaf path that should be the decomposition root instead. + +#ifdef debug + cerr << "\t\t\tTree has leaf-leaf path that is as long as or longer than any cycle at root (" + << record.longest_subtree_path_length << "bp)." << endl; +#endif + + // We need to record the longest tree path. 
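// ------------------------------------------------------------------------
// Illustrative aside (not part of this patch): the bookkeeping above, in
// miniature. On an ordinary edge-weighted rooted tree, the longest
// leaf-to-leaf path converging at a node is the sum of its deepest and
// second-deepest child depths, so one DFS finds the weighted diameter.
// The recursive sketch below uses invented names and a plain adjacency
// list; the production code above does the same thing iteratively with an
// explicit stack, and also remembers which edges realize the path so the
// decomposition can later be re-rooted onto it.
#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

// children[v] lists (child, edge length) pairs; returns the deepest
// root-to-leaf length under v and maxes the best leaf-leaf path into
// best_leaf_leaf as a side effect.
static size_t deepest_leaf_depth(const std::vector<std::vector<std::pair<size_t, size_t>>>& children,
                                 size_t v, size_t& best_leaf_leaf) {
    size_t deepest = 0, second_deepest = 0;
    for (const auto& child_and_length : children[v]) {
        size_t depth = child_and_length.second +
            deepest_leaf_depth(children, child_and_length.first, best_leaf_leaf);
        if (depth > deepest) {
            second_deepest = deepest;
            deepest = depth;
        } else if (depth > second_deepest) {
            second_deepest = depth;
        }
    }
    // A leaf-leaf path converging at v uses the two deepest child paths.
    best_leaf_leaf = std::max(best_leaf_leaf, deepest + second_deepest);
    return deepest;
}
// ------------------------------------------------------------------------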
+ longest_tree_paths.emplace_back(); + longest_tree_paths.back().first = record.longest_subtree_path_length; + auto& path = longest_tree_paths.back().second; + + auto& path_root_frame = records[record.longest_subtree_path_root]; + + if (path_root_frame.has_second_deepest_child) { + // This is an actual convergence point + +#ifdef debug + cerr << "\t\t\t\tConverges at real convergence point" << endl; +#endif + + // Collect the whole path down the second deepest child + path.push_back(path_root_frame.second_deepest_child_edge); + auto path_trace_it = deepest_child_edge.find(find(path.back())); + while (path_trace_it != deepest_child_edge.end()) { + // Follow the deepest child relationships until they run out. + path.push_back(path_trace_it->second); + path_trace_it = deepest_child_edge.find(find(path.back())); + } + // Reverse what's there and flip all the edges + vector flipped; + flipped.reserve(path.size()); + for (auto path_it = path.rbegin(); path_it != path.rend(); ++path_it) { + flipped.push_back(graph->flip(*path_it)); + } + path = std::move(flipped); + } else { + // There's no second-longest path; we statted at one of the most distant leaves. +#ifdef debug + cerr << "\t\t\t\tConverges at leaf" << endl; +#endif + } + + if (deepest_child_edge.count(record.longest_subtree_path_root)) { + +#ifdef debug + cerr << "\t\t\t\tNonempty path to distinct other leaf" << endl; +#endif + + // There's a nonempty path to another leaf, + // other than the furthest one (and we aren't + // just a point). + // Trace the actual longest path from root to leaf and add it on + path.push_back(deepest_child_edge[record.longest_subtree_path_root]); + auto path_trace_it = deepest_child_edge.find(find(path.back())); + while (path_trace_it != deepest_child_edge.end()) { + // Follow the deepest child relationships until they run out. + path.push_back(path_trace_it->second); + path_trace_it = deepest_child_edge.find(find(path.back())); + } + } + + // OK now we have the longest leaf-leaf path saved. + + // We need to redo the path from the tree traversal + // root to the longest path convergence point, to + // fix up the subtree rooting information. + + // Go to the convergence point + handle_t cursor = record.longest_subtree_path_root; + + // This will be the path of edges to take from the convergence point (new root) to the traversal root (old root) + vector convergence_to_old_root; + while (cursor != frame_head) { + // Walk up the parent pointers to the traversal root and stack up the heads. + // We may get nothing if the root happened to already be on the longest leaf-leaf path. + auto& cursor_record = records[cursor]; + convergence_to_old_root.push_back(cursor_record.parent_edge); + cursor = find(cursor_record.parent_edge); + } + +#ifdef debug + cerr << "\t\t\t\tRewrite along " << convergence_to_old_root.size() << " edges..." 
<< endl; +#endif + + while (!convergence_to_old_root.empty()) { + // Then go down that stack + + // Define new child and parent + handle_t parent_child_edge = convergence_to_old_root.back(); + handle_t child_head = find(parent_child_edge); + handle_t parent_head = find(graph->flip(parent_child_edge)); + + // TODO: find a way to demote parent to child here on each iteration + auto& child_record = records[child_head]; + auto& parent_record = records[parent_head]; + + // If the deepest child of the child is actually the parent, disqualify it + deepest_child_edge_it = deepest_child_edge.find(child_head); + + if (deepest_child_edge_it != deepest_child_edge.end() && find(deepest_child_edge_it->second) == parent_head) { + // The parent was the child's deepest child. Can't have that. + if (child_record.has_second_deepest_child) { + // Promote the second deepest child. + deepest_child_edge_it->second = child_record.second_deepest_child_edge; + child_record.has_second_deepest_child = false; + } else { + // No more deepest child + deepest_child_edge.erase(deepest_child_edge_it); + deepest_child_edge_it = deepest_child_edge.end(); + } + } + + // The child may not have had a parent before. + // So we need to fill in its longest leaf path + // length counting its new parent edge. + + // But we know all its children are done. + + // The length of the path to a leaf will involve the edge from the parent to the child + child_record.leaf_path_length = graph->get_length(parent_child_edge); + + if (deepest_child_edge_it != deepest_child_edge.end()) { + // And if we have a child to go on with, we add the length of that path + child_record.leaf_path_length += records[find(deepest_child_edge_it->second)].leaf_path_length; + } + + // Now we have to mix ourselves into the parent. + // We do it the same way as normal. Both the deepest and second-deepest child of the parent can't be the grandparent. + // So if they both beat us we can't be the real deepest child. + // If we beat the second deepest child, and the original deepest child gets disqualified for being the grandparent, we become the parent's deepest child. + // And if we beat both it doesn't matter whether either gets disqualified, because we win. + + // TODO: deduplicate code with the original DFS? + + // Fill in deepest_child_edge for the parent if not filled in already, or if we beat what's there. + // Also maintain parent's second_deepest_child_edge. + auto parent_deepest_child_it = deepest_child_edge.find(parent_head); + if (parent_deepest_child_it == deepest_child_edge.end()) { + // Emplace in the map where we didn't find anything. + deepest_child_edge.emplace_hint(parent_deepest_child_it, parent_head, parent_child_edge); + } else if(records[find(parent_deepest_child_it->second)].leaf_path_length < child_record.leaf_path_length) { + // We are longer than what's there now + + // Demote what's there to second-best + parent_record.second_deepest_child_edge = parent_deepest_child_it->second; + parent_record.has_second_deepest_child = true; + + // Replace the value we found + parent_deepest_child_it->second = parent_child_edge; + } else if (!parent_record.has_second_deepest_child) { + // There's no second-deepest recorded so we must be it. + parent_record.second_deepest_child_edge = parent_child_edge; + parent_record.has_second_deepest_child = true; + } else if (records[find(parent_record.second_deepest_child_edge)].leaf_path_length < child_record.leaf_path_length) { + // We are a new second deepest child. 
+ parent_record.second_deepest_child_edge = parent_child_edge; + } + + // Now the new child, if its path is deep enough, is the parent's new deepest or second deepest child edge. + // Go up a level, disqualify the grandparent, and see who wins. + // The new root has no parent itself, so all its edges are eligible and the one with the longest path wins. + convergence_to_old_root.pop_back(); + } + +#ifdef debug + for (auto& item : path) { + cerr << "\t\t\t\tPath visits: " + << graph->get_id(item) << (graph->get_is_reverse(item) ? "-" : "+") + << " length " << graph->get_length(item) << endl; + } +#endif + + if (path.empty()) { + // If the leaf-leaf path is empty, stick in a handle so we can actually find the single leaf in the bridge forest. + assert(longest_tree_paths.back().first == 0); + path.push_back(traversal_root); + } else { + // If anything is on the path, we shouldn't have 0 length. + assert(longest_tree_paths.back().first != 0); + } + + } + } + + // Now we're done with this stack frame. + + // Clean up + stack.pop_back(); + } + } + } + }; + + for (auto it = longest_simple_cycles.begin(); it != longest_simple_cycles.end(); ++it) { + // Try it from the head of the component that each longest input simple + // cycle got merged into. If we end up using that longest cycle to root + // this component, we will have everything pointing the right way + // already. + try_root(find(it->second), it->first); + } + + // And then try it on every head in general to mop up anything without a simple cycle in it + for_each_head([&](handle_t head) { + try_root(head, 0); + }); + + // The DFS records die with this function, but the rewritten deepest child + // edges survive and let us root snarls having only their incoming ends. + // And we have all the longest tree paths that beat their components + // rooting cycles, if any. + +#ifdef debug + cerr << "Edges to deepest children in bridge forest:" << endl; + for (auto& kv : deepest_child_edge) { + cerr << "\t" << graph->get_id(kv.first) << (graph->get_is_reverse(kv.first) ? "-" : "+") + << " -> " << graph->get_id(kv.second) << (graph->get_is_reverse(kv.second) ? "-" : "+") << endl; + } +#endif + + return to_return; +} + + + + + +//////////////////////////////////////////////////////////////////////////////////////////// + + + + +IntegratedSnarlFinder::IntegratedSnarlFinder(const HandleGraph& graph) : HandleGraphSnarlFinder(&graph) { + // Nothing to do! +} + +void IntegratedSnarlFinder::traverse_decomposition(const function& begin_chain, const function& end_chain, + const function& begin_snarl, const function& end_snarl) const { + + // Do the actual snarl finding work and then walk the bilayered tree. + +#ifdef debug + cerr << "Ranking graph handles." << endl; +#endif + + // First we need to ensure that our graph has dense handle ranks + bdsg::RankedOverlayHelper overlay_helper; + auto ranked_graph = overlay_helper.apply(graph); + +#ifdef debug + cerr << "Finding snarls." << endl; +#endif + + // We need a union-find over the adjacency components of the graph, in which we will build the cactus graph. + MergedAdjacencyGraph cactus(ranked_graph); + +#ifdef debug + cerr << "Base adjacency components:" << endl; + cactus.to_dot(cerr); +#endif + + // It magically gets the adjacency components itself. + +#ifdef debug + cerr << "Finding 3 edge connected components..." << endl; +#endif + + // Now we need to do the 3 edge connected component merging, using Tsin's algorithm. 
+ // We don't really have a good dense rank space on the adjacency components, so we use the general version. + // TODO: Somehow have a nice dense rank space on components. Can we just use backing graph ranks and hope it's dense enough? + // We represent each adjacency component (node) by its heading handle. +#ifdef debug + size_t tecc_id = 0; +#endif + // Buffer merges until the algorithm is done. + vector> merge_list; + algorithms::three_edge_connected_component_merges([&](const function& emit_node) { + // Feed all the handles that head adjacency components into the algorithm + cactus.for_each_head([&](handle_t head) { +#ifdef debug + cerr << "Three edge component node " << tecc_id << " is head " << graph->get_id(head) << (graph->get_is_reverse(head) ? "-" : "+") << endl; + tecc_id++; +#endif + emit_node(head); + }); + }, [&](handle_t node, const function& emit_edge) { + // When asked for edges, don't deduplicate or filter. We want all multi-edges. + cactus.for_each_member(node, [&](handle_t other_member) { + // For each handle in the adjacency component that this handle is heading (including the head) + + // Follow as an edge again, by flipping + handle_t member_connected_head = cactus.find(graph->flip(other_member)); + + if (member_connected_head == node && graph->get_is_reverse(other_member)) { + // For self loops, only follow them in one direction. Skip in the other. + return; + } + + // Announce it. Multi-edges are OK. + emit_edge(member_connected_head); + }); + }, [&](handle_t a, handle_t b) { + // Now we got a merge to create the 3 edge connected components. + // We can't actually do the merge now, because we can't let the merges + // be visible to the algorithm while it is working. + merge_list.emplace_back(a, b); + }); + + // Now execute the merges, since the algorithm is done looking at the graph. + for (auto& ab : merge_list) { + cactus.merge(ab.first, ab.second); + } + merge_list.clear(); + + // Now our 3-edge-connected components have been condensed, and we have a proper Cactus graph. + +#ifdef debug + cerr << "After 3ecc merging:" << endl; + cactus.to_dot(cerr); +#endif + +#ifdef debug + cerr << "Creating bridge forest..." << endl; +#endif + + // Then we need to copy the base Cactus graph so we can make the bridge forest + MergedAdjacencyGraph forest(cactus); + +#ifdef debug + cerr << "Finding simple cycles..." << endl; +#endif + + // Get cycle information: longest cycle in each connected component, and next edge along cycle for each edge (in one orientation) + pair>, unordered_map> cycles = cactus.cycles_in_cactus(); + auto& longest_cycles = cycles.first; + auto& next_along_cycle = cycles.second; + + for (auto& kv : next_along_cycle) { + // Merge along all cycles in the bridge forest + forest.merge(kv.first, kv.second); + } + +#ifdef debug + cerr << "Bridge forest:" << endl; + forest.to_dot(cerr); +#endif + +#ifdef debug + cerr << "Finding bridge edge paths..." << endl; +#endif + + // Now we find the longest path in each tree in the bridge forest, with its + // length in bases. + // + // We also find, for each bridge edge component head, the edge towards the + // deepest bridge edge tree leaf, which lets us figure out how to root + // dangly bits into chains. + // + // Make sure to root at the nodes corresponding to the collapsed longest + // cycles, if the leaf-leaf paths don't win their components. + // + // For empty leaf-leaf paths, will emit a single node "path" with a length + // of 0. 
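// ------------------------------------------------------------------------
// Illustrative aside (not part of this patch): the cactus and forest objects
// above are driven entirely through find() and merge(), i.e. they behave as
// a union-find (disjoint-set) structure over adjacency components. A
// textbook version over dense integer ids, with path halving and union by
// size, is sketched below with invented names; the real MergedAdjacencyGraph
// additionally remembers which handles read into each merged component.
#include <cstddef>
#include <numeric>
#include <utility>
#include <vector>

class DisjointSets {
public:
    explicit DisjointSets(size_t n) : parent(n), size(n, 1) {
        std::iota(parent.begin(), parent.end(), 0);
    }
    size_t find(size_t x) {
        while (parent[x] != x) {
            parent[x] = parent[parent[x]];  // path halving
            x = parent[x];
        }
        return x;
    }
    void merge(size_t a, size_t b) {
        a = find(a);
        b = find(b);
        if (a == b) return;
        if (size[a] < size[b]) std::swap(a, b);
        parent[b] = a;       // smaller component joins the larger one
        size[a] += size[b];
    }
private:
    std::vector<size_t> parent, size;
};
// ------------------------------------------------------------------------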
+ pair>>, unordered_map> forest_paths = forest.longest_paths_in_forest(longest_cycles); + auto& longest_paths = forest_paths.first; + auto& towards_deepest_leaf = forest_paths.second; + +#ifdef debug + cerr << "Sorting candidate roots..." << endl; +#endif + + // Make sure we are looking at all the cycles and leaf-leaf paths in order. + // We need basically priority queue between them. But we don't need insert so we jsut sort. + std::sort(longest_cycles.begin(), longest_cycles.end()); + std::sort(longest_paths.begin(), longest_paths.end()); + + // Now that we have computed the graphs we need, do the traversal of them. + // This modifies the structures we have computed in-place, and also does + // some extra merges in the cactus graph to make all chains cycles. + traverse_computed_decomposition(cactus, + forest, + longest_paths, + towards_deepest_leaf, + longest_cycles, + next_along_cycle, + begin_chain, + end_chain, + begin_snarl, + end_snarl); +} + +/** + * A set over the nodes in a handle graph. + * All queries automatically ignore orientation. + */ +class HandleGraphNodeSet { +private: + unordered_set visited; + const HandleGraph* graph; +public: + /** + * Make a new set over the nodes of the given graph. + */ + inline HandleGraphNodeSet(const HandleGraph* graph): graph(graph) { + // Nothing to do + } + + /** + * Get the number of nodes in the set. + */ + inline size_t size() const { + return visited.size(); + } + + /** + * Add a node to the set, given a handle to either orientation. + */ + inline void insert(const handle_t& here) { + visited.insert(graph->forward(here)); + } + + /** + * Return whether a node is in the set, given a handle to either orientation. + */ + inline bool count(const handle_t& here) const { + return visited.count(graph->forward(here)); + } +}; + +void IntegratedSnarlFinder::traverse_computed_decomposition(MergedAdjacencyGraph& cactus, + const MergedAdjacencyGraph& forest, + vector>>& longest_paths, + unordered_map& towards_deepest_leaf, + vector>& longest_cycles, + unordered_map& next_along_cycle, + const function& begin_chain, const function& end_chain, + const function& begin_snarl, const function& end_snarl) const { + + // Now, keep a set of all the edges that have found a place in the decomposition. + // Ignore handle orientation. + // Because we don't want to mess up orientations, we only access the set through accessors. + HandleGraphNodeSet visited(graph); + +#ifdef debug + cerr << "Traversing cactus graph..." << endl; +#endif + + // How many handle graph nodes need to be decomposed? + size_t to_decompose = graph->get_node_count(); + while(visited.size() < to_decompose) { + // While we haven't touched everything + +#ifdef debug + if (!longest_cycles.empty()) { + cerr << "Longest cycle: " << longest_cycles.back().first << " bp" << endl; + } + + if (!longest_paths.empty()) { + cerr << "Longest path: " << longest_paths.back().first << " bp" << endl; + } +#endif + + // We have a stack. + struct SnarlChainFrame { + // Set to true if this is a snarl being generated, and false if it is a chain. + bool is_snarl = true; + + // Set to true if the children have already been enumerated. + // If we get back to a frame, and this is true, and todo is empty, we are done with the frame. + bool saw_children = false; + + // Into and out-of edges of this snarl or chain, within its parent. + // Only set if we aren't the root frame on the stack. + pair bounds; + + // Edges denoting children to process. + // If we are a snarl, an entry may be a bridge edge reading into us. 
+ // If so, we will transform it into a cycle. + // If we are a snarl, an entry may be a cycle edge reading into us (with the next edge around the cycle reading out). + // If so, we will recurse on the chain. + // If we are a chain, an entry may be an edge reading into a child snarl. + // If so, we will find the other side of the snarl and recurse on the snarl. + vector todo; + }; + vector stack; + + if (longest_cycles.empty() || (!longest_paths.empty() && longest_cycles.back().first <= longest_paths.back().first)) { + // There should be a path still + assert(!longest_paths.empty()); + // It should not be empty. It should at least have a single bridge forest node to visit. + assert(!longest_paths.back().second.empty()); + + // We will root on a tip-tip path for its connected component, if + // not already covered, because there isn't a longer cycle. + + if (!visited.count(longest_paths.back().second.front())) { + // This connected component isn't already covered. + + handle_t first_edge = longest_paths.back().second.front(); + + if (longest_paths.back().first == 0) { + // This is a 0-length path, but we want to root the decomposition here. + // This bridge tree has no nonempty cycles and no bridge edges. It's just all one adjacency component. + // All contents spill out into the root snarl as contained nodes. + +#ifdef debug + cerr << "Single node bridge tree with no real cycles for " + << graph->get_id(first_edge) << (graph->get_is_reverse(first_edge) ? "-" : "+") << endl; + + cerr << "\tSpilling contents into root snarl." << endl; +#endif + + cactus.for_each_member(cactus.find(first_edge), [&](handle_t inbound) { + // The contents are all self loops + assert(cactus.find(inbound) == cactus.find(graph->flip(inbound))); + if (!graph->get_is_reverse(inbound)) { + // We only want them forward so each becomes only one empty chain. + +#ifdef debug + cerr << "\t\tContain edge " << graph->get_id(inbound) << (graph->get_is_reverse(inbound) ? "-" : "+") << endl; +#endif + + begin_chain(inbound); + end_chain(inbound); + + visited.insert(inbound); + } + }); + } else { + + // This is a real path between distinct bridge edge tree leaves + +#ifdef debug + cerr << "Rooting component at tip-tip path starting with " << graph->get_id(first_edge) << (graph->get_is_reverse(first_edge) ? "-" : "+") << endl; +#endif + + for (size_t i = 1; i < longest_paths.back().second.size(); i++) { + // Rewrite the deepest bridge graph leaf path map to point from one end of the tip-tip path to the other + // TODO: bump this down into the bridge path finding function + + handle_t prev_path_edge = longest_paths.back().second[i - 1]; + handle_t prev_head = forest.find(prev_path_edge); + handle_t next_path_edge = longest_paths.back().second[i]; + + towards_deepest_leaf[prev_head] = next_path_edge; + +#ifdef debug + cerr << "\tEnforce leaf path goes " << graph->get_id(prev_path_edge) << (graph->get_is_reverse(prev_path_edge) ? "-" : "+") + << " with head " << graph->get_id(prev_head) << (graph->get_is_reverse(prev_head) ? "-" : "+") + << " to next edge " << graph->get_id(next_path_edge) << (graph->get_is_reverse(next_path_edge) ? "-" : "+") << endl; +#endif + + } + + // Stack up a root/null snarl containing this bridge edge. + // Remember to queue it facing inward, toward the new new root at the start of the path. 
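// ------------------------------------------------------------------------
// Illustrative aside (not part of this patch): the begin_chain/end_chain and
// begin_snarl/end_snarl calls issued here come out as properly nested pairs,
// so a caller can rebuild the chain/snarl tree with its own stack, as the
// header documents. A minimal consumer sketch (names invented, printing only
// for demonstration):
#include <iostream>
#include <string>
#include <vector>

struct DecompositionPrinter {
    std::vector<std::string> open;  // what we are currently inside of
    void begin(const std::string& kind, long long node_id) {
        std::cout << std::string(2 * open.size(), ' ')
                  << kind << " bounded by node " << node_id << "\n";
        open.push_back(kind);
    }
    void end() {
        open.pop_back();
    }
};
// Hypothetical usage with IntegratedSnarlFinder::traverse_decomposition,
// passing begin_chain, end_chain, begin_snarl, end_snarl in that order:
//   finder.traverse_decomposition(
//       [&](handle_t h) { printer.begin("chain", graph.get_id(h)); },
//       [&](handle_t)   { printer.end(); },
//       [&](handle_t h) { printer.begin("snarl", graph.get_id(h)); },
//       [&](handle_t)   { printer.end(); });
// ------------------------------------------------------------------------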
+ stack.emplace_back(); + stack.back().is_snarl = true; + stack.back().todo.push_back(graph->flip(first_edge)); + +#ifdef debug + cerr << "\tPut cycles and self edges at tip into root snarl" << endl; +#endif + + // Find all the cycles and self edges that are also here and make sure to do them. Connectivity will be in the root snarl. + cactus.for_each_member(cactus.find(graph->flip(first_edge)), [&](handle_t inbound) { + if (inbound == graph->flip(first_edge)) { + // Skip the one bridge edge we started with + return; + } + +#ifdef debug + cerr << "\t\tLook at edge " << graph->get_id(inbound) << (graph->get_is_reverse(inbound) ? "-" : "+") << " on " << next_along_cycle.count(inbound) << " cycles" << endl; +#endif + + if (next_along_cycle.count(inbound)) { + // Put this cycle on the to do list also + +#ifdef debug + cerr << "\t\t\tLook at cycle edge " << graph->get_id(inbound) << (graph->get_is_reverse(inbound) ? "-" : "+") << endl; +#endif + + stack.back().todo.push_back(inbound); + } else if (cactus.find(inbound) == cactus.find(graph->flip(inbound)) && !graph->get_is_reverse(inbound)) { + // Self loop. + // We only want them forward so each becomes only one empty chain. + +#ifdef debug + cerr << "\t\t\tContain edge " << graph->get_id(inbound) << (graph->get_is_reverse(inbound) ? "-" : "+") << endl; +#endif + + begin_chain(inbound); + end_chain(inbound); + + visited.insert(inbound); + } + }); + } + } + + longest_paths.pop_back(); + } else { + // We will root on a cycle for its component, if not already covered. + + if (!visited.count(longest_cycles.back().second)) { + // This connected component hasn't been done yet. + +#ifdef debug + cerr << "Rooting component at cycle for " << graph->get_id(longest_cycles.back().second) << endl; +#endif + + // We have an edge on the longest cycle. But it may be reading into and out of nodes that also contains other cycles, bridge edges, and so on. + // If we declare this longest cycle to be a chain, we need to make sure that both those nodes become snarls in a chain. + // So we introduce a chain that starts and ends with this edge. + // We can't quite articulate that as a todo list entry, so we forge two stack frames. + + // Stack up a root/null snarl containing this cycle as a chain. + stack.emplace_back(); + stack.back().is_snarl = true; + + // Stack up a frame for doing the chain, with the cycle-closing edge as both ends. + stack.emplace_back(); + stack.back().is_snarl = false; + stack.back().bounds = make_pair(longest_cycles.back().second, longest_cycles.back().second); + + // We'll find all the self edges OK when we look in the first/last snarls on the chain. + } + + longest_cycles.pop_back(); + } + + while (!stack.empty()) { + auto& frame = stack.back(); + +#ifdef debug + cerr << "At stack frame " << stack.size() - 1 << " for "; + if (stack.size() == 1) { + cerr << "root"; + } else { + cerr << (frame.is_snarl ? "snarl" : "chain") << " " << graph->get_id(frame.bounds.first) << (graph->get_is_reverse(frame.bounds.first) ? "-" : "+") + << " to " << graph->get_id(frame.bounds.second) << (graph->get_is_reverse(frame.bounds.second) ? "-" : "+"); + } + cerr << endl; +#endif + + if (stack.size() > 1 && !frame.saw_children) { + // We need to queue up the children; this is the first time we are doing this frame. + frame.saw_children = true; + +#ifdef debug + cerr << "\tAnnouncing entry..." << endl; +#endif + + // Announce entering this snarl or chain in the traversal + (frame.is_snarl ? 
begin_snarl : begin_chain)(frame.bounds.first); + +#ifdef debug + cerr << "\tLooking for children..." << endl; +#endif + + if (frame.is_snarl) { + + // Visit the start and end of the snarl, for decomposition purposes. + visited.insert(frame.bounds.first); + visited.insert(frame.bounds.second); + // TODO: register as part of snarl in index + + // Make sure this isn't trying to be a unary snarl + assert(frame.bounds.first != frame.bounds.second); + + // For a snarl, we need to find all the bridge edges and all the incoming cycle edges + cactus.for_each_member(cactus.find(frame.bounds.first), [&](handle_t inbound) { + + if (inbound == frame.bounds.first || graph->flip(inbound) == frame.bounds.second) { + // This is our boundary; don't follow it as contents. +#ifdef debug + cerr << "\t\tStay inside snarl-bounding edge " << graph->get_id(inbound) << (graph->get_is_reverse(inbound) ? "-" : "+") << endl; +#endif + } else if (forest.find(graph->flip(inbound)) != forest.find(inbound)) { + // This is a bridge edge. The other side is a different component in the bridge graph. + +#ifdef debug + cerr << "\t\tLook at bridge edge " << graph->get_id(inbound) << (graph->get_is_reverse(inbound) ? "-" : "+") << endl; +#endif + + frame.todo.push_back(inbound); + } else if (next_along_cycle.count(inbound)) { + // This edge is the incoming edge for a cycle. Queue it up. + +#ifdef debug + cerr << "\t\tLook at cycle edge " << graph->get_id(inbound) << (graph->get_is_reverse(inbound) ? "-" : "+") << endl; +#endif + frame.todo.push_back(inbound); + } else if (cactus.find(graph->flip(inbound)) == cactus.find(inbound) && !graph->get_is_reverse(inbound)) { + // Count all self edges as empty chains, but only in one orientation. + +#ifdef debug + cerr << "\t\tContain edge " << graph->get_id(inbound) << (graph->get_is_reverse(inbound) ? "-" : "+") << endl; +#endif + + begin_chain(inbound); + end_chain(inbound); + + visited.insert(inbound); + } + }); + } else { + // For a chain, we need to queue up all the edges reading into child snarls, paired with the edges reading out of them. + // We know we're a cycle that can be followed. + handle_t here = frame.bounds.first; + unordered_set seen; + size_t region_start = frame.todo.size(); + do { + +#ifdef debug + cerr << "\t\tLook at cycle edge " << graph->get_id(here) << (graph->get_is_reverse(here) ? "-" : "+") << endl; +#endif + + // We shouldn't loop around unless we hit the end of the chain. + assert(!seen.count(here)); + seen.insert(here); + + // Queue up + frame.todo.push_back(here); + here = next_along_cycle.at(here); + // TODO: when processing entries, we're going to look them up in next_along_cycle again. + // Can we dispense with the todo list and create stack frames directly? + + // Keep going until we come to the end. + // We do this as a do-while because the start may be the end but we still want to go around the cycle. + } while (here != frame.bounds.second); + + // Now we have put all the snarls in the chain on the to + // do list. But we process the to do list from the end, so + // as is we're going to traverse them backward along the + // chain. We want to see them forward along the chain + // instead, so reverse this part of the vector. + // TODO: should we make the to do list a list? That would + // save a reverse but require a bunch of allocations and + // pointer follows. 
+ std::reverse(frame.todo.begin() + region_start, frame.todo.end()); + } + + } + + if (!frame.todo.empty()) { + // Until we run out of edges to work on + handle_t task = frame.todo.back(); + frame.todo.pop_back(); + + if (frame.is_snarl) { + // May have a bridge edge or a cycle edge, both inbound. + auto next_along_cycle_it = next_along_cycle.find(task); + if (next_along_cycle_it != next_along_cycle.end()) { + // To handle a cycle in the current snarl + +#ifdef debug + cerr << "\tHandle cycle edge " << graph->get_id(task) << (graph->get_is_reverse(task) ? "-" : "+") << endl; +#endif + + // We have the incoming edge, so find the outgoing edge along the same cycle + handle_t outgoing = next_along_cycle_it->second; + +#ifdef debug + cerr << "\t\tEnds chain starting at " << graph->get_id(outgoing) << (graph->get_is_reverse(outgoing) ? "-" : "+") << endl; +#endif + +#ifdef debug + cerr << "\t\t\tRecurse on chain " << graph->get_id(outgoing) << (graph->get_is_reverse(outgoing) ? "-" : "+") << " to " + << graph->get_id(task) << (graph->get_is_reverse(task) ? "-" : "+") << endl; +#endif + + if (stack.size() > 1) { + // We have boundaries. Make sure we don't try and + // do a chain that starts or ends with our + // boundaries. That's impossible. + assert(frame.bounds.first != outgoing); + assert(frame.bounds.second != task); + } + + // Recurse on the chain bounded by those edges, as a child + stack.emplace_back(); + stack.back().is_snarl = false; + stack.back().bounds = make_pair(outgoing, task); + + } else { + // To handle a bridge edge in the current snarl: + +#ifdef debug + cerr << "\tHandle bridge edge " << graph->get_id(task) << (graph->get_is_reverse(task) ? "-" : "+") << endl; +#endif + + // Flip it to look out + handle_t edge = graph->flip(task); +#ifdef debug + cerr << "\t\tWalk edge " << graph->get_id(edge) << (graph->get_is_reverse(edge) ? "-" : "+") << endl; +#endif + // Track the head in the Cactus graph for the bridge edges we walk. + handle_t cactus_head = cactus.find(edge); + // And track where its bridge forest component points to as towards the deepest leaf. + auto deepest_it = towards_deepest_leaf.find(forest.find(cactus_head)); + while (deepest_it != towards_deepest_leaf.end()) { + // Follow its path down bridge graph heads, to the + // deepest bridge graph leaf head (which has no + // deeper child) + + // See what our next bridge edge comes out of in the Cactus graph + handle_t next_back_head = cactus.find(graph->flip(deepest_it->second)); + +#ifdef debug + cerr << "\t\t\tHead: " << graph->get_id(cactus_head) << (graph->get_is_reverse(cactus_head) ? "-" : "+") << endl; + cerr << "\t\t\tNext edge back head: " << graph->get_id(next_back_head) << (graph->get_is_reverse(next_back_head) ? "-" : "+") << endl; +#endif + + if (cactus_head != next_back_head) { + // We skipped over a run of interlinked cycle in the bridge tree. + + // We need to find a path of cycles to complete the path in the bridge tree. + + // Each cycle needs to be cut into two pieces + // that can be alternatives in the snarl. + +#ifdef debug + cerr << "\t\t\tFind skipped cycle path" << endl; +#endif + + vector cycle_path = cactus.find_cycle_path_in_cactus(next_along_cycle, cactus_head, next_back_head); + + while (!cycle_path.empty()) { + // Now pop stuff off the end of the path and + // merge it with the component next_back_head + // is in, making sure to pinch off the cycles + // we cut as we do it. + + // Walk the cycle (again) to find where it hits the end component. + // TODO: Save the first traversal we did! 
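// ------------------------------------------------------------------------
// Illustrative aside (not part of this patch): next_along_cycle stores every
// cycle as a successor map (edge -> next edge around that same cycle).
// Swapping the successors of two entries that lie on one cycle pinches it
// into two cycles, which is what the std::swap on the two map entries below
// accomplishes. A tiny demonstration over plain ints:
#include <cassert>
#include <unordered_map>
#include <utility>

inline void pinch_cycle_demo() {
    // One cycle: 1 -> 2 -> 3 -> 4 -> 1
    std::unordered_map<int, int> next_along{{1, 2}, {2, 3}, {3, 4}, {4, 1}};
    // Exchange the successors of 2 and 4...
    std::swap(next_along[2], next_along[4]);
    // ...and the single cycle becomes two: 1 -> 2 -> 1 and 3 -> 4 -> 3.
    assert(next_along[1] == 2 && next_along[2] == 1);
    assert(next_along[3] == 4 && next_along[4] == 3);
}
// ------------------------------------------------------------------------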
+ auto through_path_member = next_along_cycle.find(cycle_path.back()); + auto through_end = through_path_member; + do { + // Follow the cycle until we reach the edge going into the end component. + through_end = next_along_cycle.find(through_end->second); + } while (cactus.find(through_end->first) != cactus.find(next_back_head)); + + // Now pinch the cycle + +#ifdef debug + cerr << "\t\t\tPinch cycle between " << graph->get_id(cycle_path.back()) << (graph->get_is_reverse(cycle_path.back()) ? "-" : "+") + << " and " << graph->get_id(through_end->first) << (graph->get_is_reverse(through_end->first) ? "-" : "+") << endl; +#endif + + // Merge the two components where the bridge edges attach, to close the two new cycles. + cactus.merge(cycle_path.back(), next_back_head); + +#ifdef debug + cerr << "\t\t\t\tExchange successors of " << graph->get_id(through_path_member->first) << (graph->get_is_reverse(through_path_member->first) ? "-" : "+") + << " and " << graph->get_id(through_end->first) << (graph->get_is_reverse(through_end->first) ? "-" : "+") << endl; +#endif + + // Exchange their destinations to pinch the cycle in two. + std::swap(through_path_member->second, through_end->second); + + if (through_path_member->first == through_path_member->second) { + // Now a self loop cycle. Delete the cycle. + +#ifdef debug + cerr << "\t\t\t\t\tDelete self loop cycle " << graph->get_id(through_path_member->first) << (graph->get_is_reverse(through_path_member->first) ? "-" : "+") << endl; +#endif + + // Won't affect other iterators. + next_along_cycle.erase(through_path_member); + } + + if (through_end->first == through_end->second) { + // Now a self loop cycle. Delete the cycle. + +#ifdef debug + cerr << "\t\t\t\t\tDelete self loop cycle " << graph->get_id(through_end->first) << (graph->get_is_reverse(through_end->first) ? "-" : "+") << endl; +#endif + + // Won't affect other iterators. + next_along_cycle.erase(through_end); + } + + // And pop it off and merge the end (which now includes it) with whatever came before it on the path. + cycle_path.pop_back(); + } + } + + // Record the new cycle we are making from this bridge path + next_along_cycle[edge] = deepest_it->second; + + // Advance along the bridge tree path. + edge = deepest_it->second; +#ifdef debug + cerr << "\t\tWalk edge " << graph->get_id(edge) << (graph->get_is_reverse(edge) ? "-" : "+") << endl; +#endif + cactus_head = cactus.find(edge); + deepest_it = towards_deepest_leaf.find(forest.find(cactus_head)); + } + + // When you get to the end + + if (edge == graph->flip(task)) { + // It turns out there's only one edge here. + // It is going to become a contained self-loop, instead of a real cycle + + // Record we visited it. + visited.insert(edge); + +#ifdef debug + cerr << "\t\tContain new self-loop " << graph->get_id(edge) << (graph->get_is_reverse(edge) ? "-" : "+") << endl; +#endif + + // Register as part of snarl in index + begin_chain(graph->forward(edge)); + end_chain(graph->forward(edge)); + } else { + // Close the cycle we are making out of the bridge + // forest path. + // The last edge crossed currently reads into the end + // component, but will read into us after the merge. + // The cycle comes in through there and leaves backward + // through the inbound bridge edge we started with. + next_along_cycle[edge] = graph->flip(task); + +#ifdef debug + cerr << "\t\tClose cycle between " << graph->get_id(edge) << (graph->get_is_reverse(edge) ? "-" : "+") + << " and " << graph->get_id(task) << (graph->get_is_reverse(task) ? 
"-" : "+") << endl; +#endif + + } + + // Merge the far end of the last bridge edge (which may have cycles on it) into the current snarl + + // First find all the new cycles this brings along. + // It can't bring any bridge edges. + // This will detect the cycle we just created. + cactus.for_each_member(cactus_head, [&](handle_t inbound) { + // TODO: deduplicate with snarl setup + if (next_along_cycle.count(inbound)) { + +#ifdef debug + cerr << "\t\tInherit cycle edge " << graph->get_id(inbound) << (graph->get_is_reverse(inbound) ? "-" : "+") << endl; +#endif + + // This edge is the incoming edge for a cycle. Queue it up. + frame.todo.push_back(inbound); + } else if (cactus.find(graph->flip(inbound)) == cactus.find(inbound) && !graph->get_is_reverse(inbound)) { + +#ifdef debug + cerr << "\t\tInherit contained edge " << graph->get_id(inbound) << (graph->get_is_reverse(inbound) ? "-" : "+") << endl; +#endif + + // Count all self edges as empty chains, but only from one side. + begin_chain(inbound); + end_chain(inbound); + + visited.insert(inbound); + } + }); + + // Then do the actual merge. + cactus.merge(edge, task); + + // Now we've queued up the cycle we just made out of + // the bridge edges, along with any cycles we picked up + // from the end of the bridge tree path. + } + } else { + +#ifdef debug + cerr << "\tHandle cycle edge " << graph->get_id(task) << (graph->get_is_reverse(task) ? "-" : "+") << endl; +#endif + + // We're a chain, and WLOG a chain that represents a cycle. + // We have an edge. + // We need to find the other edge that defines the snarl, and recurse into the snarl. + handle_t out_edge = next_along_cycle.at(task); + +#ifdef debug + cerr << "\t\tRecurse on snarl " << graph->get_id(task) << (graph->get_is_reverse(task) ? "-" : "+") << " to " + << graph->get_id(out_edge) << (graph->get_is_reverse(out_edge) ? "-" : "+")<< endl; +#endif + + stack.emplace_back(); + stack.back().is_snarl = true; + stack.back().bounds = make_pair(task, out_edge); + } + + } else { + // Now we have finished a stack frame! + + if (stack.size() > 1) { + // We have bounds + +#ifdef debug + cerr << "\tAnnouncing exit..." << endl; +#endif + + // Announce leaving this snarl or chain in the traversal + (frame.is_snarl ? end_snarl : end_chain)(frame.bounds.second); + + } + +#ifdef debug + cerr << "\tReturn to parent frame" << endl; +#endif + + stack.pop_back(); + } + + + } + } + +} + +SnarlManager IntegratedSnarlFinder::find_snarls_parallel() { + + vector> weak_components = handlealgs::weakly_connected_components(graph); + vector snarl_managers(weak_components.size()); + + #pragma omp parallel for schedule(dynamic, 1) + for (size_t i = 0; i < weak_components.size(); ++i) { + const HandleGraph* subgraph; + if (weak_components.size() == 1) { + subgraph = graph; + } else { + // turn the component into a graph + subgraph = new SubgraphOverlay(graph, &weak_components[i]); + } + IntegratedSnarlFinder finder(*subgraph); + // find the snarls without building the index + snarl_managers[i] = finder.find_snarls_unindexed(); + if (weak_components.size() != 1) { + // delete our component graph overlay + delete subgraph; + } + } + + // merge the managers into the biggest one. 
+ size_t biggest_snarl_idx = 0; + for (size_t i = 1; i < snarl_managers.size(); ++i) { + if (snarl_managers[i].num_snarls() > snarl_managers[biggest_snarl_idx].num_snarls()) { + biggest_snarl_idx = i; + } + } + for (size_t i = 0; i < snarl_managers.size(); ++i) { + if (i != biggest_snarl_idx) { + snarl_managers[i].for_each_snarl_unindexed([&](const Snarl* snarl) { + snarl_managers[biggest_snarl_idx].add_snarl(*snarl); + }); + } + } + snarl_managers[biggest_snarl_idx].finish(); + return std::move(snarl_managers[biggest_snarl_idx]); +} + +} diff --git a/src/integrated_snarl_finder.hpp b/src/integrated_snarl_finder.hpp new file mode 100644 index 00000000000..e27ff62b6b0 --- /dev/null +++ b/src/integrated_snarl_finder.hpp @@ -0,0 +1,102 @@ +/// +/// \file integrated_snarl_finder.hpp +/// +/// Defines a widget for finding snarls using an internal implementation of Cactus graphs over HandleGraphs +/// + +#ifndef VG_INTEGRATED_SNARL_FINDER_HPP_INCLUDED +#define VG_INTEGRATED_SNARL_FINDER_HPP_INCLUDED + +#include "snarls.hpp" + +#include +#include +#include +#include + +namespace vg { + +using namespace std; + + +/** + * Class for finding all snarls using an integrated Cactus graph construction + * algorithm. + * + * Does not produce any unary snarls. May leave edges in the root snarl, at the + * ends of top-level chains. + * + * Does not (yet) use paths for rooting. Roots the decomposition at the simple + * cycle or bridge tree path with the most bases of fixed sequence. + */ +class IntegratedSnarlFinder : public HandleGraphSnarlFinder { +private: + // Forward-declare this member type we use inside some functions. + + /** + * Represents a graph that starts as the graph of adjacency components in + * a HandleGraph, and which can be further merged. Can be used to + * represent a cactus graph or a bridge forest. + * + * Represents a graph of "components". Each component contains some number + * of handles from the backing graph, all reading into the component. Each + * handle connects that component to another component: the component that + * contains the flipped version of the handle. Each component is + * identified by a "head" handle. + */ + class MergedAdjacencyGraph; + + /** + * Find all the snarls, given the Cactus graph, the bridge forest, the + * longest paths and cycles, and the towards-leaf/around-cycle information + * needed to follow them. + */ + void traverse_computed_decomposition(MergedAdjacencyGraph& cactus, + const MergedAdjacencyGraph& forest, + vector>>& longest_paths, + unordered_map& towards_deepest_leaf, + vector>& longest_cycles, + unordered_map& next_along_cycle, + const function& begin_chain, const function& end_chain, + const function& begin_snarl, const function& end_snarl) const; + +public: + /** + * Make a new IntegratedSnarlFinder to find snarls in the given graph. + */ + IntegratedSnarlFinder(const HandleGraph& graph); + + /** + * Find all the snarls of weakly connected components in parallel. + */ + virtual SnarlManager find_snarls_parallel(); + + /** + * Visit all snarls and chains, including trivial snarls and single-node + * empty chains. + * + * Calls begin_chain and end_chain when entrering and exiting chains in the + * traversal. Within each chain, calls begin_snarl and end_snarl when + * entering and exiting each snarl, in order. The caller is intended to + * maintain its own stack to match up begin and end events. + * + * Each begin/end call receives the handle reading into/out of the snarl or + * chain. 
+ * + * Both empty and cyclic chains have the in and out handles the same. + * They are distinguished by context; empty chains have no shild snarls, + * while cyclic chains do. + * + * Roots the decomposition at a global snarl with no bounding nodes, for + * which begin_snarl is not called. So the first call will be begin_chain. + * + * Start handles are inward facing and end handles are outward facing. + * Snarls must be oriented forward in their chains. + */ + void traverse_decomposition(const function& begin_chain, const function& end_chain, + const function& begin_snarl, const function& end_snarl) const; +}; + +} + +#endif diff --git a/src/io/converted_hash_graph.hpp b/src/io/converted_hash_graph.hpp new file mode 100644 index 00000000000..06c9c028a6a --- /dev/null +++ b/src/io/converted_hash_graph.hpp @@ -0,0 +1,23 @@ +#ifndef VG_IO_CONVERTED_HASH_GRAPH_HPP_INCLUDED +#define VG_IO_CONVERTED_HASH_GRAPH_HPP_INCLUDED + +#include + +namespace vg { + +namespace io { + +/** + * Define a type that inherits HashGraph so we can tell the difference between + * a real HashGraph and a HashGraph converted at load time. We care about this + * so that vg stats can tell you the original input file format. + */ +class ConvertedHashGraph : public bdsg::HashGraph { + using bdsg::HashGraph::HashGraph; +}; + +} + +} + +#endif diff --git a/src/json_stream_helper.hpp b/src/io/json_stream_helper.hpp similarity index 93% rename from src/json_stream_helper.hpp rename to src/io/json_stream_helper.hpp index fac69b1bd91..994c9a8fa5e 100644 --- a/src/json_stream_helper.hpp +++ b/src/io/json_stream_helper.hpp @@ -1,16 +1,16 @@ -#ifndef VG_JSON_STREAM_HELPER_HPP_INCLUDED -#define VG_JSON_STREAM_HELPER_HPP_INCLUDED +#ifndef VG_IO_JSON_STREAM_HELPER_HPP_INCLUDED +#define VG_IO_JSON_STREAM_HELPER_HPP_INCLUDED #include #include #include -#include "stream.hpp" -#include "json2pb.h" +#include +#include "vg/io/json2pb.h" namespace vg { -namespace stream { +namespace io { // It's handy to be able to stream in JSON via vg view for testing. // This helper class takes this functionality from vg view -J and @@ -103,7 +103,7 @@ inline int64_t JSONStreamHelper::write(std::ostream& out, bool json_out, } if (!good || buf.size() >= buf_size) { if (!json_out) { - stream::write(out, buf.size(), lambda); + vg::io::write(out, buf.size(), lambda); } else { for (int i = 0; i < buf.size(); ++i) { out << pb2json(buf[i]); @@ -115,7 +115,7 @@ inline int64_t JSONStreamHelper::write(std::ostream& out, bool json_out, } if (!json_out) { - stream::finish(out); + vg::io::finish(out); } out.flush(); diff --git a/src/io/load_proto_to_graph.cpp b/src/io/load_proto_to_graph.cpp new file mode 100644 index 00000000000..93ad159444b --- /dev/null +++ b/src/io/load_proto_to_graph.cpp @@ -0,0 +1,402 @@ +/** + * \file load_proto_to_graph.cpp + * Implementation for backend-agnostic Protobuf loading. + */ + +#include "load_proto_to_graph.hpp" + +#include "../hash_map.hpp" +#include "../handle.hpp" +#include "../crash.hpp" + +#include "load_proto_to_graph.hpp" + +#include "vg/io/json2pb.h" + +#include +#include +#include +#include + +#include +#include + +//#define debug + +namespace vg { + +namespace io { + +using namespace std; +using namespace vg; + +void load_proto_to_graph(vg::MutablePathMutableHandleGraph* destination, const vg::io::message_sender_function_t& for_each_message) { + + load_proto_to_graph(destination, [&](const function& process_chunk) { + // Now we can give all the deserialized Graph chunks to process_chunk. 
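// ------------------------------------------------------------------------
// Illustrative aside (not part of this patch): this overload is a callback
// adapter. It turns "call me with each serialized string" into "call me with
// each parsed chunk" by parsing inside the wrapper and forwarding the result.
// The sketch below shows the shape generically; Chunk and parse_chunk are
// invented stand-ins for the Protobuf type and its parser.
#include <functional>
#include <string>

struct Chunk {
    std::string payload;
};

// Hypothetical parser; the real code deserializes a Protobuf Graph here.
inline bool parse_chunk(Chunk& out, const std::string& bytes) {
    out.payload = bytes;
    return true;
}

inline void adapt_messages(
        const std::function<void(const std::function<void(const std::string&)>&)>& for_each_message,
        const std::function<void(Chunk&)>& process_chunk) {
    for_each_message([&](const std::string& serialized) {
        Chunk chunk;
        if (parse_chunk(chunk, serialized)) {
            process_chunk(chunk);
        }
    });
}
// ------------------------------------------------------------------------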
+ for_each_message([&](const string& serialized_graph) { + // For each Graph, unpack it + Graph g; + if (!ProtobufIterator::parse_from_string(g, serialized_graph)) { + // TODO: make this an exception if we ever want to be allowed to continue from this. + cerr << "error[load_proto_to_graph]: invalid Graph message" << endl; + exit(1); + } + + // And send it along. + process_chunk(g); + }); + }); +} + +void load_proto_to_graph(vg::MutablePathMutableHandleGraph* destination, const function&)>& chunk_sender) { + + // This holds edges we couldn't make when we saw them because both end nodes didn't exist. + // We organize them by the ID of the node they were waiting for. + hash_map>> deferred_edges; + + // This represents a path in progress + struct path_record_t { + size_t min_rank_added = 0; + size_t max_rank_added = 0; + map> earlier_visits_by_rank; + map> later_visits_by_rank; + }; + + // This holds, for each path, the min and max ranks added to the + // graph, and the trees of everything not added yet above and below those + // ranks. + hash_map paths_in_progress; + + // We can only deal with one chunk coming in at a time, but we have + // ambitions for a parallel Constructor. So make sure to only handle one + // chunk at a time. + mutex chunk_mutex; + + chunk_sender([&](Graph& g) { + // For each Graph chunk... + + // Handle one at a time + lock_guard chunk_guard(chunk_mutex); + + // Within this chunk, we keep a node to handle cache + unordered_map node_to_handle; + + // Define a way to get a handle from a cached handle, or the graph, or to fail. + auto get_handle = [&](nid_t id, bool is_reverse, handle_t& dest) { + auto handle_iter = node_to_handle.find(id); + if (handle_iter == node_to_handle.end()) { + // Handle is not cached. + if (destination->has_node(id)) { + // But we do have the node in the destination graph already. + dest = destination->get_handle(id, is_reverse); + crash_unless(destination->get_id(dest) == id); + crash_unless(destination->get_is_reverse(dest) == is_reverse); + return true; + } else { + // The node doesn't exist yet. + return false; + } + } else { + // Handle is cached. All we have to do is get the correct orientation. + dest = is_reverse ? destination->flip(handle_iter->second) : handle_iter->second; + crash_unless(destination->get_id(dest) == id); + crash_unless(destination->get_is_reverse(dest) == is_reverse); + return true; + } + }; + + for (auto& n : g.node()) { + // Create all the nodes + +#ifdef debug + cerr << "Found node " << n.id() << endl; +#endif + + node_to_handle[n.id()] = destination->create_handle(n.sequence(), n.id()); + + auto edges_waiting = deferred_edges.find(n.id()); + if (edges_waiting != deferred_edges.end()) { + +#ifdef debug + cerr << "Node has deferred edges waiting on it" << endl; +#endif + + // There were edges waiting for this node. Make them and get them out of our memory. + for (auto& edge : edges_waiting->second) { + +#ifdef debug + cerr << "Make deferred edge " << get<0>(edge) << " " << get<1>(edge) + << " " << get<2>(edge) << " " << get<3>(edge) << endl; +#endif + + // For each edge that only lacked this node, get the handles. + handle_t from_handle; + handle_t to_handle; + // We can assert because if we did our bookkepping right both nodes exist now. + crash_unless(get_handle(get<0>(edge), get<1>(edge), from_handle)); + crash_unless(get_handle(get<2>(edge), get<3>(edge), to_handle)); + // Make the edge + destination->create_edge(from_handle, to_handle); + } + + // Forget all those edges we just made. 
+ deferred_edges.erase(edges_waiting); + } + } + + for (auto& e : g.edge()) { + // For each edge + + // See if we have the handle for each end cached + auto from_handle_iter = node_to_handle.find(e.from()); + auto to_handle_iter = node_to_handle.find(e.to()); + + // Get the actual handle for each end, from the cache or graph, if available. + // If not available, defer the edge. + handle_t from_handle; + handle_t to_handle; + if (!get_handle(e.from(), e.from_start(), from_handle)) { + // From node isn't made yet. Defer the edge. + +#ifdef debug + cerr << "Defer edge " << e.from() << " " << e.from_start() << " " + << e.to() << " " << e.to_end() << " on " << e.from() << endl; +#endif + + deferred_edges[e.from()].emplace_back(e.from(), e.from_start(), e.to(), e.to_end()); + // Don't do the edge now. + continue; + } else if (!get_handle(e.to(), e.to_end(), to_handle)) { + // To node isn't made yet. Defer the edge. + +#ifdef debug + cerr << "Defer edge " << e.from() << " " << e.from_start() << " " + << e.to() << " " << e.to_end() << " on " << e.to() << endl; +#endif + + deferred_edges[e.to()].emplace_back(e.from(), e.from_start(), e.to(), e.to_end()); + // Don't do the edge now. + continue; + } + +#ifdef debug + cerr << "Make edge " << e.from() << " " << e.from_start() << " " + << e.to() << " " << e.to_end() << endl; +#endif + + // Make the edge + destination->create_edge(from_handle, to_handle); + } + + // For paths, we have append_step and prepend_step, so we can add path + // stuff to the graph as long as we have a continuous rank range. + // If we get stuff not in a contiguous rank range, we collate it with a map. + + // Path visits can only occur in or after chunks their nodes and edges are in. + + for (auto& p : g.path()) { + // We're going to find or place our path in our index of records. + path_handle_t path; + if (!destination->has_path(p.name())) { + // We need to create a new path! +#ifdef debug + cerr << "Found new path " << p.name() << endl; +#endif + path = destination->create_path_handle(p.name(), p.is_circular()); + } else { + // We need to extend an existing path +#ifdef debug + cerr << "Found existing path " << p.name() << endl; +#endif + path = destination->get_path_handle(p.name()); + + if (p.is_circular() && !destination->get_is_circular(path)) { + // If it ever shows up as circular, make it curcular + destination->set_circularity(path, true); + } + } + + // Find or make the record for the path in progress. + // + // We only need to read records for paths that are split across + // chunks, but we always need to write records in case we need to + // read them. + auto& record = paths_in_progress[path]; + + for (auto& m : p.mapping()) { + // For each mapping in the path part for this chunk, in order + if (m.rank() != 0) { + +#ifdef debug + cerr << "Found ranked mapping (" << m.rank() << ")" << endl; +#endif + + // If it has a rank + if (record.min_rank_added == 0) { + // If it is the first rank, put it in. 
+ +#ifdef debug + cerr << "Add as first mapping" << endl; +#endif + + handle_t visited; + crash_unless(get_handle(m.position().node_id(), m.position().is_reverse(), visited)); + destination->append_step(path, visited); + + // And save its rank as the only one added + record.min_rank_added = m.rank(); + record.max_rank_added = record.min_rank_added; + } else if(record.max_rank_added + 1 == m.rank()) { + // If it is adjacent to the previous rank on the high side (most common case) + +#ifdef debug + cerr << "Add as adjacent mapping" << endl; +#endif + + // Add it + handle_t visited; + if (!get_handle(m.position().node_id(), m.position().is_reverse(), visited)) { + throw std::runtime_error("Could not find node " + std::to_string(m.position().node_id()) + " " + (m.position().is_reverse() ? "-" : "+") + " to add mapping " + pb2json(m) + " adjacent to existing mapping at rank " + std::to_string(record.max_rank_added)); + } + destination->append_step(path, visited); + + // And update the ranks + record.max_rank_added = m.rank(); + } else { + // If it isn't adjacent on the high side, stick it in the appropriate ordered map. + // We can resolve these later. + +#ifdef debug + cerr << "Save for later" << endl; +#endif + + if (m.rank() >= record.min_rank_added && m.rank() <= record.max_rank_added) { + // Prohibit duplicate ranks in our contiguous, added region. + // Note that we may miss them if they are not in the contiguous region when we see them. + cerr << "error[load_proto_to_graph]: duplicate rank " << m.rank() << " in path " << p.name() << endl; + exit(1); + } + + // Decide on which side of the contiguous region we go + auto& dest_map = (m.rank() < record.min_rank_added) ? record.earlier_visits_by_rank : record.later_visits_by_rank; + // Add the visit in + dest_map.emplace(m.rank(), make_pair(m.position().node_id(), m.position().is_reverse())); + } + } else { + // If it has no rank, just stick it into the path when it occurs in the file. + // Mixing ranked and unranked in the same path is undefined behavior. + +#ifdef debug + cerr << "Found unranked ranked mapping" << endl; +#endif + + // Make sure we have the node we are visiting + handle_t visited; + crash_unless(get_handle(m.position().node_id(), m.position().is_reverse(), visited)); + + // Make a step to it. + destination->append_step(path, visited); + } + } + + // Now resolve any out-of-order ranked mappings that we can. Maybe we filled in a gap. + { + auto it = record.earlier_visits_by_rank.rbegin(); + while(it != record.earlier_visits_by_rank.rend() && it->first + 1 == record.min_rank_added) { + // We have something to prepend + +#ifdef debug + cerr << "Resolve earlier mapping (" << it->first << ")" << endl; +#endif + + // Get the handle + handle_t visited; + crash_unless(get_handle(it->second.first, it->second.second, visited)); + + // Prepend it + destination->prepend_step(path, visited); + + // Update rank + record.min_rank_added = it->first; + + // Drop the visit. Because we have a reverse iterator, we + // need to get the next thing (possibly rend) and convert + // to a forward iterator. 
See + record.earlier_visits_by_rank.erase(std::next(it).base()); + + // Find the next one + it = record.earlier_visits_by_rank.rbegin(); + } + } + { + auto it = record.later_visits_by_rank.begin(); + while(it != record.later_visits_by_rank.end() && it->first == record.max_rank_added + 1) { + // We have something to append + +#ifdef debug + cerr << "Resolve later mapping (" << it->first << ")" << endl; +#endif + + // Get the handle + handle_t visited; + crash_unless(get_handle(it->second.first, it->second.second, visited)); + + // Prepend it + destination->append_step(path, visited); + + // Update rank + record.max_rank_added = it->first; + + // Drop the visit + record.later_visits_by_rank.erase(it); + + // Find the next one + it = record.later_visits_by_rank.begin(); + } + } + } + }); + + if (!deferred_edges.empty()) { + // If there are any deferred edges left, we are missing a node. + // Sometimes we have to deal with dangling edges. We just remove them. + cerr << "warning[load_proto_to_graph]: dangling edges on missing node " << deferred_edges.begin()->first + << " and " << (deferred_edges.size() - 1) << " other missing nodes removed" << endl; + } + + // Now make all the path steps we didn't make yet, allowing for rank gaps + + for (auto& path_and_record : paths_in_progress) { + // For each path and record that we touched. + // TODO: this could be a lot of them when we have alt paths! + auto& path = path_and_record.first; + auto& record = path_and_record.second; + + for (auto it = record.earlier_visits_by_rank.rbegin(); it != record.earlier_visits_by_rank.rend(); ++it) { + // For each earlier thing we still have, add it +#ifdef debug + cerr << "Resolve final earlier mapping (" << it->first << ")" << endl; +#endif + handle_t visited = destination->get_handle(it->second.first, it->second.second); + destination->prepend_step(path, visited); + } + record.earlier_visits_by_rank.clear(); + + for (auto it = record.later_visits_by_rank.begin(); it != record.later_visits_by_rank.end(); ++it) { + // For each earlier thing we still have, add it +#ifdef debug + cerr << "Resolve final later mapping (" << it->first << ")" << endl; +#endif + handle_t visited = destination->get_handle(it->second.first, it->second.second); + destination->append_step(path, visited); + } + record.later_visits_by_rank.clear(); + } + + // Now we're done! +} + +} + +} diff --git a/src/io/load_proto_to_graph.hpp b/src/io/load_proto_to_graph.hpp new file mode 100644 index 00000000000..1b5ae0044c0 --- /dev/null +++ b/src/io/load_proto_to_graph.hpp @@ -0,0 +1,44 @@ +#ifndef VG_IO_LOAD_PROTO_TO_GRAPH_HPP_INCLUDED +#define VG_IO_LOAD_PROTO_TO_GRAPH_HPP_INCLUDED + +/** + * \file load_proto_to_graph.hpp + * Read VG Protobuf into any MutablePathMutableHandleGraph. + * Also useful for converting streams of Protobuf Graph objects into a MutablePathMutableHandleGraph. + */ + +#include "../handle.hpp" +#include +#include + +namespace vg { + +namespace io { + +using namespace std; +using namespace vg; + +/** + * Read all string messages supplied by the given message sender as Protobuf + * Graph objects, and create the specified graph in the destination graph. + * + * Paths need to be cached until the end for ranks to be respected. + */ +void load_proto_to_graph(vg::MutablePathMutableHandleGraph* destination, const vg::io::message_sender_function_t& for_each_message); + +/** + * Call the given function with a callback which it can call with a series of + * Protobuf Graph objects, possibly in multiple threads. 
The Protobuf Graph + * objects may have dangling edges. + * + * Resolves all the dangling edges and writes all the graph data into the given + * MutablePathMutableHandleGraph, with the destination graph being protected + * from concurrent modification. + */ +void load_proto_to_graph(vg::MutablePathMutableHandleGraph* destination, const function&)>& chunk_sender); + +} + +} + +#endif diff --git a/src/io/register_libvg_io.cpp b/src/io/register_libvg_io.cpp new file mode 100644 index 00000000000..94c0a643a8c --- /dev/null +++ b/src/io/register_libvg_io.cpp @@ -0,0 +1,54 @@ +/** + * \file register_libvg_io.hpp + * Includes calls to register all libvg types with libvgio. + */ + +// Keep these includes in alphabetical order. + +#include "register_loader_saver_distance_index.hpp" +#include "register_loader_saver_gbwt.hpp" +#include "register_loader_saver_r_index.hpp" +#include "register_loader_saver_gbwtgraph.hpp" +#include "register_loader_saver_gbz.hpp" +#include "register_loader_saver_gbzgraph.hpp" +#include "register_loader_saver_gcsa.hpp" +#include "register_loader_saver_lcp.hpp" +#include "register_loader_saver_minimizer.hpp" +#include "register_loader_saver_snarl_manager.hpp" +#include "register_loader_saver_vg.hpp" +#include "register_loader_saver_xg.hpp" +#include "register_loader_saver_packed_graph.hpp" +#include "register_loader_saver_hash_graph.hpp" +#include "register_loader_saver_gfa.hpp" + +#include "register_libvg_io.hpp" + + +namespace vg { + +namespace io { + +using namespace std; + +bool register_libvg_io() { + register_loader_saver_distance_index(); + register_loader_saver_gbwt(); + register_loader_saver_r_index(); + register_loader_saver_gbwtgraph(); + register_loader_saver_gbz(); + register_loader_saver_gbzgraph(); + register_loader_saver_gcsa(); + register_loader_saver_lcp(); + register_loader_saver_minimizer(); + register_loader_saver_snarl_manager(); + register_loader_saver_vg(); + register_loader_saver_gfa(); + register_loader_saver_xg(); + register_loader_saver_packed_graph(); + register_loader_saver_hash_graph(); + return true; +} + +} + +} diff --git a/src/io/register_libvg_io.hpp b/src/io/register_libvg_io.hpp new file mode 100644 index 00000000000..e874ebb7a77 --- /dev/null +++ b/src/io/register_libvg_io.hpp @@ -0,0 +1,28 @@ +#ifndef VG_IO_REGISTER_LIBVG_IO_HPP_INCLUDED +#define VG_IO_REGISTER_LIBVG_IO_HPP_INCLUDED + +/** + * \file register_libvg_io.hpp + * Includes a function to call to register IO handlers for libvg types. + */ + +namespace vg { + +namespace io { + +using namespace std; + +/** + * Register libvg types with libvgio. + * Must be called by library users before doing IO. + * Does not magically statically call itself. + * TODO: work out a way it can. + * Returns true on success. + */ +bool register_libvg_io(); + +} + +} + +#endif diff --git a/src/io/register_loader_saver_distance_index.cpp b/src/io/register_loader_saver_distance_index.cpp new file mode 100644 index 00000000000..54245956a92 --- /dev/null +++ b/src/io/register_loader_saver_distance_index.cpp @@ -0,0 +1,49 @@ +/** + * \file register_loader_saver_distance_index.cpp + * Defines IO for an XG index from stream files. + */ + +#include +#include "register_loader_saver_distance_index.hpp" + +#include "../snarl_distance_index.hpp" + +namespace vg { + +namespace io { + +using namespace std; +using namespace vg::io; + +void register_loader_saver_distance_index() { + // The distance index header is just a text string. 
We need to make sure + // this looks like a bare distance index file if we are going to load it + // without type-tagged message deserialization. + + bdsg::SnarlDistanceIndex empty; + Registry::register_bare_loader_saver_with_magic_and_filename("DISTANCE2", empty.get_prefix(), + [](istream& input, const string& filename) -> void* { + // Allocate an index and hand it the stream + SnarlDistanceIndex* index = new SnarlDistanceIndex(); + if (!filename.empty()) { + index->deserialize(filename); + } else { + index->deserialize(input); + } + + // Return it so the caller owns it. + return (void*) index; + }, + [](const void* index_void, ostream& output) { + // Cast to SnarlDistanceIndex and serialize to the stream. + assert(index_void != nullptr); + throw std::runtime_error( "warning [vpkg::save]: save the distance index directly with serialize() instead of with vpkg"); + + //((const SnarlDistanceIndex*) index_void)->serialize(output); + }); +} + +} + +} + diff --git a/src/io/register_loader_saver_distance_index.hpp b/src/io/register_loader_saver_distance_index.hpp new file mode 100644 index 00000000000..41f67345825 --- /dev/null +++ b/src/io/register_loader_saver_distance_index.hpp @@ -0,0 +1,21 @@ +#ifndef VG_IO_REGISTER_LOADER_SAVER_DISTANCE_INDEX_HPP_INCLUDED +#define VG_IO_REGISTER_LOADER_SAVER_DISTANCE_INDEX_HPP_INCLUDED + +/** + * \file register_loader_saver_distance_index.hpp + * Defines IO for a DistanceIndex from stream files. + */ + +namespace vg { + +namespace io { + +using namespace std; + +void register_loader_saver_distance_index(); + +} + +} + +#endif diff --git a/src/io/register_loader_saver_gbwt.cpp b/src/io/register_loader_saver_gbwt.cpp new file mode 100644 index 00000000000..6f5c934418a --- /dev/null +++ b/src/io/register_loader_saver_gbwt.cpp @@ -0,0 +1,58 @@ +/** + * \file register_loader_saver_gbwt.cpp + * Defines IO for a GBWT index from stream files. + */ + +#include +#include "register_loader_saver_gbwt.hpp" + +#include +#include + +namespace vg { + +namespace io { + +using namespace std; +using namespace vg::io; + +void register_loader_saver_gbwt() { + // GBWT and DynamicGBWT can both load/save the same format. + std::uint32_t magic_number = gbwt::GBWTHeader::TAG; + std::string magic_string(reinterpret_cast(&magic_number), sizeof(magic_number)); + + Registry::register_bare_loader_saver_with_magic("GBWT", magic_string, [](istream& input) -> void* { + // Allocate a GBWT + gbwt::GBWT* index = new gbwt::GBWT(); + + // Load it + index->load(input); + + // Return it so the caller owns it. + return (void*) index; + }, [](const void* index_void, ostream& output) { + // Cast to GBWT and serialize to the stream. + assert(index_void != nullptr); + ((const gbwt::GBWT*) index_void)->simple_sds_serialize(output); + }); + + Registry::register_bare_loader_saver_with_magic("GBWT", magic_string, [](istream& input) -> void* { + // Allocate a DynamicGBWT + gbwt::DynamicGBWT* index = new gbwt::DynamicGBWT(); + + // Load it + index->load(input); + + // Return it so the caller owns it. + return (void*) index; + }, [](const void* index_void, ostream& output) { + // Cast to DynamicGBWT and serialize to the stream. 
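+ // Both registrations share the same GBWT magic and write the simple-sds
+ // format, so files stay interchangeable between the static and dynamic types.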
+ assert(index_void != nullptr); + ((const gbwt::DynamicGBWT*) index_void)->simple_sds_serialize(output); + }); +} + +} + +} + diff --git a/src/io/register_loader_saver_gbwt.hpp b/src/io/register_loader_saver_gbwt.hpp new file mode 100644 index 00000000000..59d5e047623 --- /dev/null +++ b/src/io/register_loader_saver_gbwt.hpp @@ -0,0 +1,21 @@ +#ifndef VG_IO_REGISTER_LOADER_SAVER_GBWT_HPP_INCLUDED +#define VG_IO_REGISTER_LOADER_SAVER_GBWT_HPP_INCLUDED + +/** + * \file register_loader_saver_gbwt.hpp + * Defines IO for a GBWT index from stream files. + */ + +namespace vg { + +namespace io { + +using namespace std; + +void register_loader_saver_gbwt(); + +} + +} + +#endif diff --git a/src/io/register_loader_saver_gbwtgraph.cpp b/src/io/register_loader_saver_gbwtgraph.cpp new file mode 100644 index 00000000000..f258db0dfc0 --- /dev/null +++ b/src/io/register_loader_saver_gbwtgraph.cpp @@ -0,0 +1,43 @@ +/** + * \file register_loader_saver_gbwtgraph.cpp + * Defines IO for GBWTGraph from stream files. + */ + +#include +#include "register_loader_saver_gbwtgraph.hpp" + +#include + +#include + +namespace vg { + +namespace io { + +using namespace std; +using namespace vg::io; + +void register_loader_saver_gbwtgraph() { + + // Use the `SerializableHandleGraph` magic number. + gbwtgraph::GBWTGraph empty; + std::uint32_t magic_number = htonl(empty.get_magic_number()); + std::string magic_string(reinterpret_cast(&magic_number), sizeof(magic_number)); + + Registry::register_bare_loader_saver_with_magic("GBWTGraph", magic_string, [](istream& input) -> void* { + gbwtgraph::GBWTGraph* graph = new gbwtgraph::GBWTGraph(); + graph->deserialize(input); + + // Return the graph so the caller owns it. + return static_cast(graph); + }, [](const void* graph_void, ostream& output) { + assert(graph_void != nullptr); + // Serialize in the SDSL format, which is larger than the simple-sds format but faster to load. + // If we want to use the simple-sds format, we can serialize GBZ instead. + static_cast(graph_void)->serialize(output); + }); +} + +} + +} diff --git a/src/io/register_loader_saver_gbwtgraph.hpp b/src/io/register_loader_saver_gbwtgraph.hpp new file mode 100644 index 00000000000..98322b58461 --- /dev/null +++ b/src/io/register_loader_saver_gbwtgraph.hpp @@ -0,0 +1,21 @@ +#ifndef VG_IO_REGISTER_LOADER_SAVER_GBWTGRAPH_HPP_INCLUDED +#define VG_IO_REGISTER_LOADER_SAVER_GBWTGRAPH_HPP_INCLUDED + +/** + * \file register_loader_saver_gbwtgraph.hpp + * Defines IO for GBWTGraph from stream files. + */ + +namespace vg { + +namespace io { + +using namespace std; + +void register_loader_saver_gbwtgraph(); + +} + +} + +#endif diff --git a/src/io/register_loader_saver_gbz.cpp b/src/io/register_loader_saver_gbz.cpp new file mode 100644 index 00000000000..e18dec99f0a --- /dev/null +++ b/src/io/register_loader_saver_gbz.cpp @@ -0,0 +1,35 @@ +/** + * \file register_loader_saver_gbz.cpp + * Defines IO for GBZ from stream files. 
+ */ + +#include +#include "register_loader_saver_gbz.hpp" + +#include + +namespace vg { + +namespace io { + +using namespace std; +using namespace vg::io; + +void register_loader_saver_gbz() { + std::uint32_t magic_number = gbwtgraph::GBZ::Header::TAG; + std::string magic_string(reinterpret_cast(&magic_number), sizeof(magic_number)); + + Registry::register_bare_loader_saver_with_magic("GBZ", magic_string, [](std::istream& input) -> void* { + gbwtgraph::GBZ* result = new gbwtgraph::GBZ(); + result->simple_sds_load(input); + return reinterpret_cast(result); + }, [](const void* gbz_void, std::ostream& output) { + assert(gbz_void != nullptr); + const gbwtgraph::GBZ* gbz = reinterpret_cast(gbz_void); + gbz->simple_sds_serialize(output); + }); +} + +} + +} diff --git a/src/io/register_loader_saver_gbz.hpp b/src/io/register_loader_saver_gbz.hpp new file mode 100644 index 00000000000..45bf8d9b8c4 --- /dev/null +++ b/src/io/register_loader_saver_gbz.hpp @@ -0,0 +1,21 @@ +#ifndef VG_IO_REGISTER_LOADER_SAVER_GBZ_HPP_INCLUDED +#define VG_IO_REGISTER_LOADER_SAVER_GBZ_HPP_INCLUDED + +/** + * \file register_loader_saver_gbz.hpp + * Defines IO for GBZ from stream files. + */ + +namespace vg { + +namespace io { + +using namespace std; + +void register_loader_saver_gbz(); + +} + +} + +#endif diff --git a/src/io/register_loader_saver_gbzgraph.cpp b/src/io/register_loader_saver_gbzgraph.cpp new file mode 100644 index 00000000000..93855fcf4b9 --- /dev/null +++ b/src/io/register_loader_saver_gbzgraph.cpp @@ -0,0 +1,36 @@ +/** + * \file register_loader_saver_gbzgraph.cpp + * Defines IO for GBZ in a handle graph proxy wrapper from stream files. + */ + +#include +#include "register_loader_saver_gbzgraph.hpp" + +#include +#include "../gbzgraph.hpp" + +namespace vg { + +namespace io { + +using namespace std; +using namespace vg::io; + +void register_loader_saver_gbzgraph() { + std::uint32_t magic_number = gbwtgraph::GBZ::Header::TAG; + std::string magic_string(reinterpret_cast(&magic_number), sizeof(magic_number)); + + Registry::register_bare_loader_saver_with_magic("GBZ", magic_string, [](std::istream& input) -> void* { + GBZGraph* result = new GBZGraph(); + result->gbz.simple_sds_load(input); + return reinterpret_cast(result); + }, [](const void* gbzgraph_void, std::ostream& output) { + assert(gbzgraph_void != nullptr); + const GBZGraph* gbz_graph = reinterpret_cast(gbzgraph_void); + gbz_graph->gbz.simple_sds_serialize(output); + }); +} + +} + +} diff --git a/src/io/register_loader_saver_gbzgraph.hpp b/src/io/register_loader_saver_gbzgraph.hpp new file mode 100644 index 00000000000..06df8de3d68 --- /dev/null +++ b/src/io/register_loader_saver_gbzgraph.hpp @@ -0,0 +1,21 @@ +#ifndef VG_IO_REGISTER_LOADER_SAVER_GBZGRAPH_HPP_INCLUDED +#define VG_IO_REGISTER_LOADER_SAVER_GBZGRAPH_HPP_INCLUDED + +/** + * \file register_loader_saver_gbzgraph.hpp + * Defines IO for GBZ from stream files, as a handle graph. + */ + +namespace vg { + +namespace io { + +using namespace std; + +void register_loader_saver_gbzgraph(); + +} + +} + +#endif diff --git a/src/io/register_loader_saver_gcsa.cpp b/src/io/register_loader_saver_gcsa.cpp new file mode 100644 index 00000000000..b5de1fe8017 --- /dev/null +++ b/src/io/register_loader_saver_gcsa.cpp @@ -0,0 +1,41 @@ +/** + * \file register_loader_saver_gcsa.cpp + * Defines IO for a GCSA index from stream files. 
+ */ + +#include +#include "register_loader_saver_gcsa.hpp" + +#include + +namespace vg { + +namespace io { + +using namespace std; +using namespace vg::io; + +void register_loader_saver_gcsa() { + std::uint32_t magic_number = gcsa::GCSAHeader::TAG; + std::string magic_string(reinterpret_cast(&magic_number), sizeof(magic_number)); + + Registry::register_bare_loader_saver_with_magic("GCSA", magic_string, [](istream& input) -> void* { + // Allocate a GCSA + gcsa::GCSA* index = new gcsa::GCSA(); + + // Load it + index->load(input); + + // Return it so the caller owns it. + return (void*) index; + }, [](const void* index_void, ostream& output) { + // Cast to GCSA and serialize to the stream. + assert(index_void != nullptr); + ((const gcsa::GCSA*) index_void)->serialize(output); + }); +} + +} + +} + diff --git a/src/io/register_loader_saver_gcsa.hpp b/src/io/register_loader_saver_gcsa.hpp new file mode 100644 index 00000000000..81d9762267e --- /dev/null +++ b/src/io/register_loader_saver_gcsa.hpp @@ -0,0 +1,21 @@ +#ifndef VG_IO_REGISTER_LOADER_SAVER_GCSA_HPP_INCLUDED +#define VG_IO_REGISTER_LOADER_SAVER_GCSA_HPP_INCLUDED + +/** + * \file register_loader_saver_gcsa.hpp + * Defines IO for a GCSA index from stream files. + */ + +namespace vg { + +namespace io { + +using namespace std; + +void register_loader_saver_gcsa(); + +} + +} + +#endif diff --git a/src/io/register_loader_saver_gfa.cpp b/src/io/register_loader_saver_gfa.cpp new file mode 100644 index 00000000000..bf2e158eed7 --- /dev/null +++ b/src/io/register_loader_saver_gfa.cpp @@ -0,0 +1,137 @@ +/** + * \file register_loader_saver_gfa.cpp + */ + +#include +#include "register_loader_saver_gfa.hpp" +#include "algorithms/gfa_to_handle.hpp" +#include "gfa.hpp" + +#include "handle.hpp" +#include "bdsg/packed_graph.hpp" +#include "save_handle_graph.hpp" + +#include + +namespace vg { + +namespace io { + +using namespace std; +using namespace vg::io; + +// derived from Registry::sniff_magic() in libvgio/src/registry.cpp +// note that we have a limited number of characters to work with so +// there's no way to do something extremely precise or general. +// this only works when we can seek around a little bet, which the calling +// logic in libvgio verifies... +static bool sniff_gfa(istream& stream) { + if (!stream) { + // Can't read anything, so obviously it can't match. + return false; + } + + // Work out how many characters to try and sniff. + // We require that our C++ STL can do this much ungetting, even though the + // standard guarantees absolutely no ungetting. + const size_t to_sniff = 8; + + // Allocate a buffer + char buffer[to_sniff]; + // Have a cursor in the buffer + size_t buffer_used = 0; + + while (stream.peek() != EOF && buffer_used < to_sniff) { + // Until we fill the buffer or would hit EOF, fill the buffer + buffer[buffer_used] = (char) stream.get(); + buffer_used++; + } + + for (size_t i = 0; i < buffer_used; i++) { + // Now unget all the characters again. + // C++11 says we can unget from EOF. + stream.unget(); + if (!stream) { + // We did something the stream disliked. + throw runtime_error("Ungetting failed after " + to_string(i) + " characters"); + } + } + + // Now all the characters are back in the stream. + + if (!stream) { + // We reached EOF when sniffing the magic. We managed to unget + // everything (maybe the file is empty). But we need to clear errors on + // the stream so it is like it was when we started. 
+ stream.clear(); + } + + if (buffer_used < 2) { + // Todo: GFAs can be empty -- may want to return true for empty files? + // As it stands, we require "H\n" at the very least. + return false; + } + + // We are not accepting leading whitespaces. There is no way to do this generally + // with our limited buffer. Also, this check is not bulletproof, so best to sniff gfas + // as a last resort. + + // Check for a header line + if (buffer[0] == 'H' && + (buffer[1] == '\n' || + (buffer[1] == '\t' && buffer_used >= 8 && strncmp(buffer + 2, "VN:Z:", 5) == 0))) { + return true; + } + + // Check for any other type of line, looking for a + if ((buffer[0] == 'S' || buffer[0] == 'L' || buffer[0] == 'P' || buffer[0] == 'W') && + buffer[1] == '\t' && buffer_used > 3 && isprint(buffer[2])) { + return true; + } + + return false; +} + +void register_loader_saver_gfa() { + Registry::register_bare_loader_saver_with_header_check("GFA", sniff_gfa, [](istream& input, const string& filename) -> void* { + // Allocate a PackedGraph + GFAHandleGraph* gfa_graph = new GFAHandleGraph(); + + try { + + if (!filename.empty() && filename != "-") { + // Load it from a file + algorithms::gfa_to_path_handle_graph(filename, gfa_graph, &gfa_graph->gfa_id_space); + } else { + // Load it from the stream, falling back to temp file if necessary + algorithms::gfa_to_path_handle_graph(input, gfa_graph, &gfa_graph->gfa_id_space); + } + // Make sure the node ID to sequence space translation is ready if anybody wants it. + gfa_graph->gfa_id_space.invert_translation(); + + } catch (algorithms::GFAFormatError& e) { + // There is something wrong with the input GFA file. + // Explain that without a long stack trace, and bail. + cerr << "error[register_loader_saver_gfa] GFA "; + if (!filename.empty() && filename != "-") { + cerr << "file " << filename; + } else { + cerr << "stream"; + } + cerr << " cannot be loaded: " << e.what() << endl; + exit(1); + } + + // Return it so the caller owns it. + return (void*) gfa_graph; + }, [](const void* gfa_graph_void, ostream& output) { + // Cast to GFAHandleGraph and serialize to the stream. + assert(gfa_graph_void != nullptr); + graph_to_gfa((const GFAHandleGraph*)gfa_graph_void, output); + }); +} + +} + +} + diff --git a/src/io/register_loader_saver_gfa.hpp b/src/io/register_loader_saver_gfa.hpp new file mode 100644 index 00000000000..5ce6a9977fe --- /dev/null +++ b/src/io/register_loader_saver_gfa.hpp @@ -0,0 +1,22 @@ +#ifndef VG_IO_REGISTER_LOADER_SAVER_GFA_HPP_INCLUDED +#define VG_IO_REGISTER_LOADER_SAVER_GFA_HPP_INCLUDED + +/** + * \file register_loader_saver_gfa.hpp + * Defines IO for a graph in GFA format. It's best if the GFA is "canonical" + * with S lines before L lines before P lines. + */ + +namespace vg { + +namespace io { + +using namespace std; + +void register_loader_saver_gfa(); + +} + +} + +#endif diff --git a/src/io/register_loader_saver_hash_graph.cpp b/src/io/register_loader_saver_hash_graph.cpp new file mode 100644 index 00000000000..23dfaa603f2 --- /dev/null +++ b/src/io/register_loader_saver_hash_graph.cpp @@ -0,0 +1,68 @@ +/** + * \file register_loader_saver_hash_graph.cpp + * Defines IO for a HashGraph from stream files. 
+ */ + +#include +#include +#include "register_loader_saver_hash_graph.hpp" +#include "load_proto_to_graph.hpp" +#include "converted_hash_graph.hpp" + +#include "../handle.hpp" +#include + + +namespace vg { + +namespace io { + +using namespace std; +using namespace vg::io; + +void register_loader_saver_hash_graph() { + + // Convert the HashGraph SerializableHandleGraph magic number to a string + bdsg::HashGraph empty; + // Make sure it is in network byte order + uint32_t new_magic_number = htonl(empty.get_magic_number()); + // Load all 4 characters of it into a string + string new_magic((char*)&new_magic_number, 4); + + Registry::register_bare_loader_saver_with_magic("HashGraph", new_magic, [](istream& input) -> void* { + // Allocate a HashGraph + bdsg::HashGraph* hash_graph = new bdsg::HashGraph(); + + // Load it + hash_graph->deserialize(input); + + // Return it so the caller owns it. + return (void*) hash_graph; + }, [](const void* hash_graph_void, ostream& output) { + // Cast to HashGraph and serialize to the stream. + assert(hash_graph_void != nullptr); + ((const bdsg::HashGraph*) hash_graph_void)->serialize(output); + }); + + // Also register to be able to load Protobuf, by converting to a hash graph on input, if vg::VG is not required. + // The default implementation for a VG loaded from a file as a handle graph will now be a HashGraph. + Registry::register_loader( + vector{"VG", ""}, + [](const message_sender_function_t& for_each_message) -> void* { + + // Allocate a HashGraph that's really a ConvertedHashGraph, to mark + // that we converted form Protobuf. + bdsg::HashGraph* hash_graph = new vg::io::ConvertedHashGraph(); + + // Load into it + load_proto_to_graph(hash_graph, for_each_message); + + // Return it so the caller owns it. + return (void*) hash_graph; + }); +} + +} + +} + diff --git a/src/io/register_loader_saver_hash_graph.hpp b/src/io/register_loader_saver_hash_graph.hpp new file mode 100644 index 00000000000..976e599e29d --- /dev/null +++ b/src/io/register_loader_saver_hash_graph.hpp @@ -0,0 +1,21 @@ +#ifndef VG_IO_REGISTER_LOADER_SAVER_HASH_GRAPH_HPP_INCLUDED +#define VG_IO_REGISTER_LOADER_SAVER_HASH_GRAPH_HPP_INCLUDED + +/** + * \file register_loader_saver_hash_graph.hpp + * Defines IO for a HashGraph from stream files. + */ + +namespace vg { + +namespace io { + +using namespace std; + +void register_loader_saver_hash_graph(); + +} + +} + +#endif diff --git a/src/io/register_loader_saver_lcp.cpp b/src/io/register_loader_saver_lcp.cpp new file mode 100644 index 00000000000..a8a77c022f8 --- /dev/null +++ b/src/io/register_loader_saver_lcp.cpp @@ -0,0 +1,41 @@ +/** + * \file register_loader_saver_lcp.cpp + * Defines IO for a GCSA LCPArray from stream files. + */ + +#include + +#include +#include + +namespace vg { + +namespace io { + +using namespace std; +using namespace vg::io; + +void register_loader_saver_lcp() { + std::uint32_t magic_number = gcsa::LCPHeader::TAG; + std::string magic_string(reinterpret_cast(&magic_number), sizeof(magic_number)); + + Registry::register_bare_loader_saver_with_magic("LCP", magic_string, [](istream& input) -> void* { + // Allocate an LCPArray + gcsa::LCPArray* index = new gcsa::LCPArray(); + + // Load it + index->load(input); + + // Return it so the caller owns it. + return (void*) index; + }, [](const void* index_void, ostream& output) { + // Cast to LCP and serialize to the stream. 
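+ // Guard against a null pointer, as the other registered savers do.
+ assert(index_void != nullptr);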
+ ((const gcsa::LCPArray*) index_void)->serialize(output); + }); + +} + +} + +} + diff --git a/src/io/register_loader_saver_lcp.hpp b/src/io/register_loader_saver_lcp.hpp new file mode 100644 index 00000000000..56ced93ba74 --- /dev/null +++ b/src/io/register_loader_saver_lcp.hpp @@ -0,0 +1,21 @@ +#ifndef VG_IO_REGISTER_LOADER_SAVER_LCP_HPP_INCLUDED +#define VG_IO_REGISTER_LOADER_SAVER_LCP_HPP_INCLUDED + +/** + * \file register_loader_saver_lcp.hpp + * Defines IO for an LCP array from stream files. + */ + +namespace vg { + +namespace io { + +using namespace std; + +void register_loader_saver_lcp(); + +} + +} + +#endif diff --git a/src/io/register_loader_saver_minimizer.cpp b/src/io/register_loader_saver_minimizer.cpp new file mode 100644 index 00000000000..f52b1a10bc6 --- /dev/null +++ b/src/io/register_loader_saver_minimizer.cpp @@ -0,0 +1,37 @@ +/** + * \file register_loader_saver_minimizer.cpp + * Defines IO for a minimizer index from stream files. + */ + +#include +#include "register_loader_saver_minimizer.hpp" + +#include + +namespace vg { + +namespace io { + +using namespace std; +using namespace vg::io; + +void register_loader_saver_minimizer() { + std::uint32_t magic_number = gbwtgraph::MinimizerHeader::TAG; + std::string magic_string(reinterpret_cast(&magic_number), sizeof(magic_number)); + + Registry::register_bare_loader_saver_with_magic("MinimizerIndex", magic_string, [](istream& input) -> void* { + gbwtgraph::DefaultMinimizerIndex* index = new gbwtgraph::DefaultMinimizerIndex(); + index->deserialize(input); + + // Return the index so the caller owns it. + return static_cast(index); + }, [](const void* index_void, ostream& output) { + assert(index_void != nullptr); + static_cast(index_void)->serialize(output); + }); +} + +} + +} + diff --git a/src/io/register_loader_saver_minimizer.hpp b/src/io/register_loader_saver_minimizer.hpp new file mode 100644 index 00000000000..99ecb55fa83 --- /dev/null +++ b/src/io/register_loader_saver_minimizer.hpp @@ -0,0 +1,21 @@ +#ifndef VG_IO_REGISTER_LOADER_SAVER_MINIMIZER_HPP_INCLUDED +#define VG_IO_REGISTER_LOADER_SAVER_MINIMIZER_HPP_INCLUDED + +/** + * \file register_loader_saver_minimizer.hpp + * Defines IO for minimizer index from stream files. + */ + +namespace vg { + +namespace io { + +using namespace std; + +void register_loader_saver_minimizer(); + +} + +} + +#endif diff --git a/src/io/register_loader_saver_packed_graph.cpp b/src/io/register_loader_saver_packed_graph.cpp new file mode 100644 index 00000000000..09402b0a958 --- /dev/null +++ b/src/io/register_loader_saver_packed_graph.cpp @@ -0,0 +1,47 @@ +/** + * \file register_loader_saver_packed_graph.cpp + * Defines IO for a PackedGraph from stream files. 
+ */ +#include +#include +#include "register_loader_saver_packed_graph.hpp" + +#include "handle.hpp" +#include "bdsg/packed_graph.hpp" + +namespace vg { + +namespace io { + +using namespace std; +using namespace vg::io; + +void register_loader_saver_packed_graph() { + + // Convert the PackedGraph SerializableHandleGraph magic number to a string + bdsg::PackedGraph empty; + // Make sure it is in network byte order + uint32_t new_magic_number = htonl(empty.get_magic_number()); + // Load all 4 characters of it into a string + string new_magic((char*)&new_magic_number, 4); + + Registry::register_bare_loader_saver_with_magic("PackedGraph", new_magic, [](istream& input) -> void* { + // Allocate a PackedGraph + bdsg::PackedGraph* packed_graph = new bdsg::PackedGraph(); + + // Load it + packed_graph->deserialize(input); + + // Return it so the caller owns it. + return (void*) packed_graph; + }, [](const void* packed_graph_void, ostream& output) { + // Cast to PackedGraph and serialize to the stream. + assert(packed_graph_void != nullptr); + ((const bdsg::PackedGraph*) packed_graph_void)->serialize(output); + }); +} + +} + +} + diff --git a/src/io/register_loader_saver_packed_graph.hpp b/src/io/register_loader_saver_packed_graph.hpp new file mode 100644 index 00000000000..c83b41498c4 --- /dev/null +++ b/src/io/register_loader_saver_packed_graph.hpp @@ -0,0 +1,21 @@ +#ifndef VG_IO_REGISTER_LOADER_SAVER_PACKED_GRAPH_HPP_INCLUDED +#define VG_IO_REGISTER_LOADER_SAVER_PACKED_GRAPH_HPP_INCLUDED + +/** + * \file register_loader_saver_packed_graph.hpp + * Defines IO for a PackedGraph from stream files. + */ + +namespace vg { + +namespace io { + +using namespace std; + +void register_loader_saver_packed_graph(); + +} + +} + +#endif diff --git a/src/io/register_loader_saver_r_index.cpp b/src/io/register_loader_saver_r_index.cpp new file mode 100644 index 00000000000..b3d6c84a842 --- /dev/null +++ b/src/io/register_loader_saver_r_index.cpp @@ -0,0 +1,41 @@ +/** + * \file register_loader_saver_r_index.cpp + * Defines IO for an r-index from stream files. + */ + +#include +#include "register_loader_saver_r_index.hpp" + +#include + +namespace vg { + +namespace io { + +using namespace std; +using namespace vg::io; + +void register_loader_saver_r_index() { + std::uint32_t magic_number = gbwt::FastLocate::Header::TAG; + std::string magic_string(reinterpret_cast(&magic_number), sizeof(magic_number)); + + Registry::register_bare_loader_saver_with_magic("R-INDEX", magic_string, [](istream& input) -> void* { + // Allocate an r-index + gbwt::FastLocate* index = new gbwt::FastLocate(); + + // Load it + index->load(input); + + // Return it so the caller owns it. + return (void*) index; + }, [](const void* index_void, ostream& output) { + // Cast to r-index and serialize to the stream. + assert(index_void != nullptr); + ((const gbwt::FastLocate*) index_void)->serialize(output); + }); +} + +} + +} + diff --git a/src/io/register_loader_saver_r_index.hpp b/src/io/register_loader_saver_r_index.hpp new file mode 100644 index 00000000000..a5217789215 --- /dev/null +++ b/src/io/register_loader_saver_r_index.hpp @@ -0,0 +1,21 @@ +#ifndef VG_IO_REGISTER_LOADER_SAVER_R_INDEX_HPP_INCLUDED +#define VG_IO_REGISTER_LOADER_SAVER_R_INDEX_HPP_INCLUDED + +/** + * \file register_loader_saver_r_index.hpp + * Defines IO for an r-index from stream files. 
+ */ + +namespace vg { + +namespace io { + +using namespace std; + +void register_loader_saver_r_index(); + +} + +} + +#endif diff --git a/src/io/register_loader_saver_snarl_manager.cpp b/src/io/register_loader_saver_snarl_manager.cpp new file mode 100644 index 00000000000..cc12b367c24 --- /dev/null +++ b/src/io/register_loader_saver_snarl_manager.cpp @@ -0,0 +1,59 @@ +/** + * \file register_loader_saver_snarl_manager.cpp + * Defines IO for a SnarlManager from stream files. + */ + +#include +#include +#include "register_loader_saver_snarl_manager.hpp" + +#include "../snarls.hpp" + +namespace vg { + +namespace io { + +using namespace std; +using namespace vg::io; + +void register_loader_saver_snarl_manager() { + Registry::register_loader_saver(vector{"SNARL", ""}, [](const message_sender_function_t& for_each_message) -> void* { + SnarlManager* manager = new SnarlManager([&](const function& consume_snarl) { + // Call the source function with a function that deserializes each message and feeds it to the SnarlManager. + for_each_message([&](const string& serialized_snarl) { + // Parse the message to a Snarl + Snarl s; + if (!ProtobufIterator::parse_from_string(s, serialized_snarl)) { + // Handle bad graphs. + // TODO: make this an exception if we ever want to be allowed to continue from this. + cerr << "error[register_loader_saver_snarl_manager]: invalid Snarl message" << endl; + exit(1); + } + + // Feed the Snarl to the SnarlManager. + // TODO: Is all this callback-and-callforth better than just a loop somewhere? + // TODO: Unify with the VG load logic to avoid duplicating deserialization + consume_snarl(s); + }); + }); + + return (void*) manager; + }, [](const void* manager_void, const message_consumer_function_t& send_message) { + // Cast to SnarlManager and serialize to the consumer. + assert(manager_void != nullptr); + + const SnarlManager* manager = (const SnarlManager*) manager_void; + + manager->for_each_snarl_preorder([&](const Snarl* snarl) { + // Serialize and emit each snarl + string s; + snarl->SerializeToString(&s); + send_message(s); + }); + }); +} + +} + +} + diff --git a/src/io/register_loader_saver_snarl_manager.hpp b/src/io/register_loader_saver_snarl_manager.hpp new file mode 100644 index 00000000000..48569c8be78 --- /dev/null +++ b/src/io/register_loader_saver_snarl_manager.hpp @@ -0,0 +1,21 @@ +#ifndef VG_IO_REGISTER_LOADER_SAVER_SNARL_MANAGER_HPP_INCLUDED +#define VG_IO_REGISTER_LOADER_SAVER_SNARL_MANAGER_HPP_INCLUDED + +/** + * \file register_loader_saver_snarl_manager.hpp + * Defines IO for a SnarlManager index from stream files. + */ + +namespace vg { + +namespace io { + +using namespace std; + +void register_loader_saver_snarl_manager(); + +} + +} + +#endif diff --git a/src/io/register_loader_saver_vg.cpp b/src/io/register_loader_saver_vg.cpp new file mode 100644 index 00000000000..0e9274705b3 --- /dev/null +++ b/src/io/register_loader_saver_vg.cpp @@ -0,0 +1,74 @@ +/** + * \file register_loader_saver_vg.cpp + * Defines IO for a VG graph from stream files of Graph objects. + */ + +#include +#include +#include "register_loader_saver_vg.hpp" + +#include "../vg.hpp" + +namespace vg { + +namespace io { + +using namespace std; +using namespace vg::io; + +void register_loader_saver_vg() { + // We register for "" so we can handle untagged old-style vg files and make them into HandleGraphs + Registry::register_loader_saver(vector{"VG", ""}, + [](const message_sender_function_t& for_each_message) -> void* { + // We have a bit of a control problem. 
+ // The source function wants to drive; we give it a function of strings, and it calls it with all the strings in turn. + + // But the VG also wants to drive; we give it a function to fill Graph objects, and it calls it until it runs out. + // So we use a new constructor of the VG that we get to drive. + + // Allocate a VG and have it call a callback to request all graph chunks. + VG* vg_graph = new VG([&](const function& consume_graph) { + // Call the source function with a function that deserializes each message and feeds it to the graph. + for_each_message([&](const string& serialized_graph) { + // Parse the message to a Graph + Graph g; + if (!ProtobufIterator::parse_from_string(g, serialized_graph)) { + // Handle bad graphs. + // TODO: make this an exception if we ever want to be allowed to continue from this. + cerr << "error[register_loader_saver_vg]: invalid Graph message" << endl; + exit(1); + } + + // Feed the Graph to the VG. + // TODO: Is all this callback-and-callforth better than just a loop somewhere? + consume_graph(g); + }); + }); + + // Return it so the caller owns it. + return (void*) vg_graph; + }, [](const void* vg_void, const message_consumer_function_t& send_message) { + assert(vg_void != nullptr); + + // Cast to a VG + // TODO: VG has to do some non-const syncing before it can serialize, so we hackily make a non-const pointer! + // Fix this when VG learns to serialize itself without modification! + VG& vg_graph = *((VG*) vg_void); + + // Chunk the VG to graphs and spit them out as strings using the given send_message function. + vg_graph.serialize_to_function([&](const Graph& chunk) { + // Make the Graph into a string + string s; + chunk.SerializeToString(&s); + + // Ship it. + // TODO: make this more move-y. + send_message(s); + }); + }); +} + +} + +} + diff --git a/src/io/register_loader_saver_vg.hpp b/src/io/register_loader_saver_vg.hpp new file mode 100644 index 00000000000..1384afdc178 --- /dev/null +++ b/src/io/register_loader_saver_vg.hpp @@ -0,0 +1,21 @@ +#ifndef VG_IO_REGISTER_LOADER_SAVER_VG_HPP_INCLUDED +#define VG_IO_REGISTER_LOADER_SAVER_VG_HPP_INCLUDED + +/** + * \file register_loader_saver_vg.hpp + * Defines IO for a VG graph from stream files of Graph objects. + */ + +namespace vg { + +namespace io { + +using namespace std; + +void register_loader_saver_vg(); + +} + +} + +#endif diff --git a/src/io/register_loader_saver_xg.cpp b/src/io/register_loader_saver_xg.cpp new file mode 100644 index 00000000000..634133114a4 --- /dev/null +++ b/src/io/register_loader_saver_xg.cpp @@ -0,0 +1,50 @@ +/** + * \file register_loader_saver_xg.cpp + * Defines IO for an XG index from stream files. + */ + +#include +#include +#include "register_loader_saver_xg.hpp" + +#include "handle.hpp" +#include "xg.hpp" + +namespace vg { + +namespace io { + +using namespace std; +using namespace vg::io; + +void register_loader_saver_xg() { + // Convert the XG SerializableHandleGraph magic number to a string + xg::XG empty; + // Make sure it is in network byte order + uint32_t new_magic_number = htonl(empty.get_magic_number()); + // Load all 4 characters of it into a string + string new_magic((char*)&new_magic_number, 4); + + // Register to load with either old or new SerializableHandleGraph-managed + // XG magic number sequences, in addition to the tag. 
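+ // Here "XG" is the legacy bare magic string from older XG files, while
+ // new_magic is the current 4-byte SerializableHandleGraph magic number in
+ // network byte order, computed above.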
+ Registry::register_bare_loader_saver_with_magics("XG", + {"XG", new_magic}, [](istream& input) -> void* { + // Allocate an XG + xg::XG* index = new xg::XG(); + + // Load it + index->deserialize(input); + + // Return it so the caller owns it. + return (void*) index; + }, [](const void* index_void, ostream& output) { + // Cast to XG and serialize to the stream. + assert(index_void != nullptr); + ((const xg::XG*) index_void)->serialize(output); + }); +} + +} + +} + diff --git a/src/io/register_loader_saver_xg.hpp b/src/io/register_loader_saver_xg.hpp new file mode 100644 index 00000000000..15360cadf59 --- /dev/null +++ b/src/io/register_loader_saver_xg.hpp @@ -0,0 +1,21 @@ +#ifndef VG_IO_REGISTER_LOADER_SAVER_XG_HPP_INCLUDED +#define VG_IO_REGISTER_LOADER_SAVER_XG_HPP_INCLUDED + +/** + * \file register_loader_saver_xg.hpp + * Defines IO for an XG index from stream files. + */ + +namespace vg { + +namespace io { + +using namespace std; + +void register_loader_saver_xg(); + +} + +} + +#endif diff --git a/src/io/save_handle_graph.hpp b/src/io/save_handle_graph.hpp new file mode 100644 index 00000000000..bee5b1297c5 --- /dev/null +++ b/src/io/save_handle_graph.hpp @@ -0,0 +1,105 @@ +#ifndef VG_IO_SAVE_HANDLE_GRAPH_HPP_INCLUDED +#define VG_IO_SAVE_HANDLE_GRAPH_HPP_INCLUDED + +/** + * \file save_handle_graph.hpp + * Use vpkg to serialize a HandleGraph object + */ + +#include +#include "bdsg/packed_graph.hpp" +#include "bdsg/hash_graph.hpp" +#include "vg.hpp" +#include "xg.hpp" +#include +#include +#include "gfa.hpp" +#include + +// TODO: move GFAIDMapInfo out of here +#include "../algorithms/gfa_to_handle.hpp" + +namespace vg { + +/// Use to load in GFAs and remember where they came from +class GFAHandleGraph : public bdsg::PackedGraph { +public: + GFAHandleGraph() : bdsg::PackedGraph() {} + virtual ~GFAHandleGraph() = default; + + /// Store the translation from graph ID space (as initially read in) back to + /// GFA ID space. + /// Won't be useful if the graph is modified. + vg::algorithms::GFAIDMapInfo gfa_id_space; +}; + +namespace io { + +using namespace std; + + +/** + * Save a handle graph. + * Todo: should this be somewhere else (ie in vgio with new types registered?) + */ +inline void save_handle_graph(HandleGraph* graph, ostream& os) { + + if (dynamic_cast(graph) != nullptr) { + // We loaded a GFA into a handle graph, want to write back to GFA + graph_to_gfa(dynamic_cast(graph), os); + } else if (dynamic_cast(graph) != nullptr) { + // SerializableHandleGraphs are all serialized bare, without VPKG framing, for libbdsg compatibility. + dynamic_cast(graph)->serialize(os); + } else if (dynamic_cast(graph) != nullptr) { + // vg::VG doesn't use a magic number and isn't a SerializableHandleGraph + vg::io::VPKG::save(*dynamic_cast(graph), os); + } else { + throw runtime_error("Internal error: unable to serialize graph"); + } +} + +inline void save_handle_graph(HandleGraph* graph, const string& dest_path) { + if (dynamic_cast(graph) != nullptr) { + // We loaded a GFA into a handle graph, want to write back to GFA + ofstream os(dest_path); + if (!os) { + throw runtime_error("error[save_handle_graph]: Unable to write to: " + dest_path); + } + graph_to_gfa(dynamic_cast(graph), os); + } else if (dynamic_cast(graph) != nullptr) { + // SerializableHandleGraphs are all serialized bare, without VPKG framing, for libbdsg compatibility. 
+ dynamic_cast(graph)->serialize(dest_path); + } else if (dynamic_cast(graph) != nullptr) { + // vg::VG doesn't use a magic number and isn't a SerializableHandleGraph + vg::io::VPKG::save(*dynamic_cast(graph), dest_path); + } else { + throw runtime_error("Internal error: unable to serialize graph"); + } +} + +// Check that output format specifier is a valid graph type +inline bool valid_output_format(const string& fmt_string) { + return fmt_string == "vg" || fmt_string == "pg" || fmt_string == "hg" || fmt_string == "gfa"; +} + +// Create a new graph (of handle graph type T) where the implementation is chosen using the format string +template +unique_ptr new_output_graph(const string& fmt_string) { + if (fmt_string == "vg") { + return make_unique(); + } else if (fmt_string == "pg") { + return make_unique(); + } else if (fmt_string == "hg") { + return make_unique(); + } else if (fmt_string == "gfa") { + return make_unique(); + } else { + return unique_ptr(); + } +} + +} + +} + +#endif diff --git a/src/job_schedule.cpp b/src/job_schedule.cpp new file mode 100644 index 00000000000..a9d85c96365 --- /dev/null +++ b/src/job_schedule.cpp @@ -0,0 +1,89 @@ +/** + * \file job_schedule.cpp: implements JobSchedule + */ +#include "job_schedule.hpp" + +#include +#include +#include +#include + +#include "utility.hpp" + +namespace vg { + +using namespace std; + +JobSchedule::JobSchedule(const vector>& job_requirements, + const function& job_func) + : job_func(job_func) +{ + for (int64_t i = 0; i < job_requirements.size(); ++i) { + queue.emplace_back(job_requirements[i].second, i); + } + // sort in decreasing order by time required + queue.sort([&](const pair& a, + const pair& b) { + return job_requirements[a.second].first > job_requirements[b.second].first; + }); +} + +void JobSchedule::execute(int64_t target_memory_usage) { + + atomic est_memory_usage(0); + mutex queue_lock; + int num_threads = get_thread_count(); + vector workers; + for (int i = 0; i < num_threads; ++i) { + workers.emplace_back([&]() { + while (true) { + + int64_t job_memory = -1, job_idx = -1; + queue_lock.lock(); + if (queue.empty()) { + // the queue emptied out while we were waiting + queue_lock.unlock(); + break; + } + if (est_memory_usage.load() == 0) { + // even if we don't have the memory budget to do this job, we're + // going to have to at some point and the memory situation will + // never get any better than this + tie(job_memory, job_idx) = queue.front(); + queue.pop_front(); + est_memory_usage.fetch_add(job_memory); + } + else { + // find the longest-running job that can be done with the available + // memory budget + for (auto it = queue.begin(); it != queue.end(); ++it) { + if (it->first + est_memory_usage.load() <= target_memory_usage) { + tie(job_memory, job_idx) = *it; + queue.erase(it); + est_memory_usage.fetch_add(job_memory); + break; + } + } + } + queue_lock.unlock(); + + if (job_idx == -1) { + // there's nothing we can do right now, so back off a second + // before trying again + this_thread::sleep_for(chrono::seconds(1)); + } + else { + // we think we have enough memory available to attempt this job + job_func(job_idx); + est_memory_usage.fetch_sub(job_memory); + } + } + }); + } + // barrier sync + for (auto& worker : workers) { + worker.join(); + } +} +} + diff --git a/src/job_schedule.hpp b/src/job_schedule.hpp new file mode 100644 index 00000000000..aeaf803df39 --- /dev/null +++ b/src/job_schedule.hpp @@ -0,0 +1,46 @@ +#ifndef VG_JOB_SECHEDULE_HPP_INCLUDED +#define VG_JOB_SECHEDULE_HPP_INCLUDED + +/** \file + * 
job_schedule.hpp: defines JobSchedule + */ +#include +#include +#include +#include +#include +#include + +namespace vg { + +using namespace std; + +/* + * A parallel job scheduler that tries to (if possible) respect a + * cap on memory usage. Works best with a moderate number of + * relatively large jobs. + */ +class JobSchedule { +public: + + // job requirements are given in pairs of (time estimate, memory estimate) + // with the memory estimate being in bytes, and the time estimate in + // arbitrary units + // the job function should execute the i-th job when called + JobSchedule(const vector>& job_requirements, + const function& job_func); + ~JobSchedule() = default; + + // execute the job schedule with a target maximum memory usage + void execute(int64_t target_memory_usage); + +private: + + function job_func; + list> queue; + +}; + +} + +#endif diff --git a/src/json2pb.cpp b/src/json2pb.cpp deleted file mode 100644 index d79a99b8454..00000000000 --- a/src/json2pb.cpp +++ /dev/null @@ -1,90 +0,0 @@ -/// \file json2pb.cpp -/// Replicate the json2pb interface on top of Protobuf 3's built-in JSON code. - -// Some code is derived from `json2pb`: - -/* - * Copyright (c) 2013 Pavel Shramov - * - * json2pb is free software; you can redistribute it and/or modify - * it under the terms of the MIT license. See LICENSE for details. - */ - -#include - -#include -#include - -#include -#include - -using google::protobuf::Message; - -int json_dump_std_string(const char *buf, size_t size, void *data) -{ - std::string *s = (std::string *) data; - s->append(buf, size); - return 0; -} - -void json2pb(Message &msg, FILE* fp) { - // JSON is self-delimiting, but that's not useful if we don't know how to - // parse it. - - // Protobuf has a JSON parser in it, but it's not exposed in a form that is - // useful to us for pulling out single messages; it only wants to stream. - - // We continue using Jansson and parse to Jansson, then back to string, and - // then to Protobuf. - - // Parse one JSON message from the file. - json_t *root; - json_error_t error; - - root = json_loadf(fp, JSON_DISABLE_EOF_CHECK, &error); - - if (!root) - throw std::runtime_error(std::string("Load failed: ") + error.text); - - if (!json_is_object(root)) - throw std::runtime_error("Malformed JSON: not an object"); - - // Dump back to string - std::string r; - json_dump_callback(root, json_dump_std_string, &r, 0); - - json_decref(root); - - // Parse to Protobuf. - json2pb(msg, r); -} - -void json2pb(Message &msg, const std::string& buf) { - auto status = google::protobuf::util::JsonStringToMessage(buf, &msg); - - if (!status.ok()) { - // This generally will happen if someone feeds in the wrong type of JSON. - // TODO: It would be nice to be able to find the neme of the offending non-existent field. 
- throw std::runtime_error("Could not deserialize " + msg.GetTypeName() + ": " + status.ToString()); - } -} - -void json2pb(Message &msg, const char *buf, size_t size) { - std::string buffer(buf, size); - json2pb(msg, buffer); -} - -std::string pb2json(const Message &msg) { - // Set options to preserve field names and not camel case them - google::protobuf::util::JsonPrintOptions opts; - opts.preserve_proto_field_names = true; - - std::string buffer; - auto status = google::protobuf::util::MessageToJsonString(msg, &buffer, opts); - - if (!status.ok()) { - throw std::runtime_error("Could not serialize " + msg.GetTypeName() + ": " + status.ToString()); - } - - return buffer; -} diff --git a/src/json2pb.h b/src/json2pb.h deleted file mode 100644 index b1fd7420dec..00000000000 --- a/src/json2pb.h +++ /dev/null @@ -1,33 +0,0 @@ -/// \file json2pb.h -/// Originally from `json2pb` by Pavel Shramov. -/// The only part of that library that remains is the interface; the -/// implementation is now using the provided JSON code in Protobuf. -/// This file also contains a bunch of new vg-specific code. - -/* - * Copyright (c) 2013 Pavel Shramov - * - * json2pb is free software; you can redistribute it and/or modify - * it under the terms of the MIT license. See LICENSE for details. - */ - -#ifndef VG_JSON2PB_H_INCLUDED -#define VG_JSON2PB_H_INCLUDED - -#include -#include - -namespace google { -namespace protobuf { -class Message; -} -} - -void json2pb(google::protobuf::Message &msg, const std::string& buf); -void json2pb(google::protobuf::Message &msg, FILE* file); -void json2pb(google::protobuf::Message &msg, const char *buf, size_t size); -std::string pb2json(const google::protobuf::Message &msg); - - - -#endif//VG_JSON2PB_H_INCLUDED diff --git a/src/kff.cpp b/src/kff.cpp new file mode 100644 index 00000000000..08f2258c07f --- /dev/null +++ b/src/kff.cpp @@ -0,0 +1,279 @@ +#include "kff.hpp" + +namespace vg { + +//------------------------------------------------------------------------------ + +bool kff_is_trivial(const uint8_t* encoding) { + for (size_t i = 0; i < 4; i++) { + if (encoding[i] != i) { + return false; + } + } + return true; +} + +std::string kff_invert(const uint8_t* encoding) { + std::string result(4, ' '); + result[encoding[0]] = 'A'; + result[encoding[1]] = 'C'; + result[encoding[2]] = 'G'; + result[encoding[3]] = 'T'; + return result; +} + +kff_recoding_t kff_recoding(const uint8_t* encoding) { + kff_recoding_t result; + for (size_t i = 0; i < 4; i++) { + result.data[encoding[i]] = i; + } + return result; +} + +uint64_t kff_parse(const uint8_t* data, size_t bytes) { + uint64_t value = 0; + size_t shift = 8 * bytes; + for (size_t i = 0; i < bytes; i++) { + shift -= 8; + value |= static_cast(data[i]) << shift; + } + return value; +} + +//------------------------------------------------------------------------------ + +// Encode up to 4 characters in one byte. +uint8_t kff_encode(const std::string& kmer, size_t start, size_t limit, const uint8_t* encoding) { + uint8_t val = 0; + for (size_t i = start; i < limit; i++) { + val <<= 2; + auto packed = gbwtgraph::KmerEncoding::CHAR_TO_PACK[static_cast(kmer[i])]; + if (packed < 4) { + val |= encoding[packed]; + } + } + return val; +} + +std::vector kff_encode(const std::string& kmer, const uint8_t* encoding) { + std::vector result; + result.reserve(kff_bytes(kmer.length())); + + // If k is not a multiple of 4, KFF adds the padding to the high-order bits + // of the first byte. 
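+ // For example, with k = 6 ("ACGTAC") the first byte holds only "AC" in its
+ // low-order bits, with the two high-order base slots left as padding, and
+ // the second byte holds "GTAC".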
+ size_t remainder = kmer.length() & 3; + if (remainder > 0) { + result.push_back(kff_encode(kmer, 0, remainder, encoding)); + } + for (size_t i = remainder; i < kmer.length(); i += 4) { + result.push_back(kff_encode(kmer, i, i + 4, encoding)); + } + + return result; +} + +// Decode up to 4 characters from one byte +void kff_decode(uint8_t byte, size_t chars, const std::string& decoding, std::string& output) { + size_t offset = 2 * chars; + for (size_t i = 0; i < chars; i++) { + offset -= 2; + output.push_back(decoding[(byte >> offset) & 3]); + } +} + +std::string kff_decode(const uint8_t* kmer, size_t k, const std::string& decoding) { + std::string result; + result.reserve(k); + + size_t bytes = kff_bytes(k); + size_t chars = k & 3; + if (chars == 0) { + chars = 4; + } + for (size_t i = 0; i < bytes; i++) { + kff_decode(kmer[i], chars, decoding, result); + chars = 4; + } + + return result; +} + +//------------------------------------------------------------------------------ + +// Recode up to 4 characters in one byte. +uint8_t kff_recode(gbwtgraph::Key64::value_type kmer, size_t k, size_t chars, const uint8_t* encoding) { + size_t offset = 2 * k; + uint8_t val = 0; + for (size_t i = 0; i < chars; i++) { + offset -= 2; + val = (val << 2) | encoding[(kmer >> offset) & 3]; + } + return val; +} + +std::vector kff_recode(gbwtgraph::Key64::value_type kmer, size_t k, const uint8_t* encoding) { + std::vector result; + result.reserve(kff_bytes(k)); + + size_t remainder = k & 3; + if (remainder > 0) { + result.push_back(kff_recode(kmer, k, remainder, encoding)); + } + for (size_t i = remainder; i < k; i += 4) { + result.push_back(kff_recode(kmer, k - i, 4, encoding)); + } + + return result; +} + +gbwtgraph::Key64::value_type kff_recode(const uint8_t* kmer, size_t k, kff_recoding_t recoding) { + gbwtgraph::Key64::value_type result = 0; + + size_t bytes = kff_bytes(k); + size_t chars = k & 3; + if (chars == 0) { + chars = 4; + } + for (size_t i = 0; i < bytes; i++) { + size_t offset = 2 * chars; + for (size_t j = 0; j < chars; j++) { + offset -= 2; + result = (result << 2) | recoding.data[(kmer[i] >> offset) & 3]; + } + chars = 4; + } + + return result; +} + +gbwtgraph::Key64::value_type kff_recode_trivial(const uint8_t* kmer, size_t k, size_t bytes) { + gbwtgraph::Key64::value_type result = 0; + for (size_t i = 0; i < bytes; i++) { + result = (result << 8) | kmer[i]; + } + return result & sdsl::bits::lo_set[2 * k]; +} + +std::vector kff_recode(const uint8_t* kmers, size_t n, size_t k, kff_recoding_t recoding) { + std::vector result; + result.reserve(n); + + size_t total_chars = n + k - 1; + size_t bytes = kff_bytes(total_chars); + size_t chars = total_chars & 3; + if (chars == 0) { + chars = 4; + } + + gbwtgraph::Key64::value_type curr = 0; + for (size_t i = 0, processed = 0; i < bytes; i++) { + size_t offset = 2 * chars; + for (size_t j = 0; j < chars; j++) { + offset -= 2; + curr = (curr << 2) | recoding.data[(kmers[i] >> offset) & 3]; + processed++; + if (processed >= k) { + result.push_back(curr & sdsl::bits::lo_set[2 * k]); + } + } + chars = 4; + } + + return result; +} + +//------------------------------------------------------------------------------ + +uint8_t kff_get(const uint8_t* kmer, size_t i) { + size_t byte = i / 4; + size_t offset = 3 - (i & 3); + return (kmer[byte] >> (2 * offset)) & 3; +} + +void kff_set(std::vector& kmer, size_t i, uint8_t value) { + size_t byte = i / 4; + size_t offset = 3 - (i & 3); + kmer[byte] |= value << (2 * offset); +} + +std::vector 
kff_reverse_complement(const uint8_t* kmer, size_t k, const uint8_t* encoding) { + uint8_t complement[4]; + complement[encoding[0]] = encoding[3]; + complement[encoding[1]] = encoding[2]; + complement[encoding[2]] = encoding[1]; + complement[encoding[3]] = encoding[0]; + + size_t offset = (4 - (k & 3)) & 3; + std::vector result(kff_bytes(k), 0); + for (size_t i = 0; i < k; i++) { + kff_set(result, 4 * result.size() - 1 - i, complement[kff_get(kmer, i + offset)]); + } + return result; +} + +//------------------------------------------------------------------------------ + +ParallelKFFReader::ParallelKFFReader(const std::string& filename) : + reader(filename) +{ + this->k = this->reader.get_var("k"); + if (this->k > gbwtgraph::Key64::KMER_MAX_LENGTH) { + throw std::runtime_error("ParallelKFFReader: file " + filename + " contains " + std::to_string(this->k) + + "-mers; cannot use k > " + std::to_string(gbwtgraph::Key64::KMER_MAX_LENGTH)); + } + + this->max_kmers_per_block = this->reader.get_var("max"); + this->data_bytes = this->reader.get_var("data_size"); + + std::uint8_t* buf = this->reader.get_encoding(); + for (size_t i = 0; i < 4; i++) { + this->encoding[i] = buf[i]; + } + this->recoding = kff_recoding(this->encoding); +} + +std::vector> ParallelKFFReader::read(size_t n) { + std::vector> result; + result.reserve(n); + + std::lock_guard lock(this->mtx); + + while (!this->buffer.empty() && result.size() < n) { + result.push_back(this->buffer.front()); + this->buffer.pop_front(); + } + if (result.size() >= n) { + return result; + } + + // Because we read kmers by blocks, we have to preallocate the buffers. + uint8_t* block = new uint8_t[kff_bytes(this->max_kmers_per_block + this->k - 1)]; + uint8_t* data = new uint8_t[this->max_kmers_per_block * this->data_bytes]; + size_t kmer_bytes = kff_bytes(this->k); + bool trivial_encoding(kff_is_trivial(this->encoding)); + while (this->reader.has_next() && result.size() < n) { + size_t block_size = this->reader.next_block(block, data); + if (block_size > 1) { + std::vector kmers = kff_recode(block, block_size, this->k, this->recoding); + for (size_t i = 0; i < block_size; i++) { + std::pair kmer(kmers[i], kff_parse(data + i * data_bytes, data_bytes)); + if (result.size() < n) { + result.push_back(kmer); + } else { + buffer.push_back(kmer); + } + } + } else { + kmer_type kmer = (trivial_encoding ? kff_recode_trivial(block, this->k, kmer_bytes) : kff_recode(block, this->k, this->recoding)); + result.push_back({ kmer, kff_parse(data, data_bytes) }); + } + } + delete[] block; block = nullptr; + delete[] data; data = nullptr; + + return result; +} + +//------------------------------------------------------------------------------ + +} // namespace vg \ No newline at end of file diff --git a/src/kff.hpp b/src/kff.hpp new file mode 100644 index 00000000000..1b5e03fa5fc --- /dev/null +++ b/src/kff.hpp @@ -0,0 +1,128 @@ +#ifndef VG_KFF_HPP_INCLUDED +#define VG_KFF_HPP_INCLUDED + +/** \file + * Tools for working with the Kmer File Format (KFF). + */ + +#include +#include + +#include + +#include + +namespace vg { + +//------------------------------------------------------------------------------ + +/// A mapping of character values from KFF encoding to minimizer index encoding. +struct kff_recoding_t { + uint8_t data[4]; +}; + +/// Returns the number of bytes required for a kmer in KFF format. +inline size_t kff_bytes(size_t k) { + return (k + 3) / 4; +} + +/// Returns `true` if the encoding is trivial (0, 1, 2, 3). 
+bool kff_is_trivial(const uint8_t* encoding); + +/// Inverts the KFF encoding into a packed -> char table. +std::string kff_invert(const uint8_t* encoding); + +/// Returns a recoding for the given encoding. +kff_recoding_t kff_recoding(const uint8_t* encoding); + +/// Parses a big-endian integer from KFF data. +uint64_t kff_parse(const uint8_t* data, size_t bytes); + +//------------------------------------------------------------------------------ + +/// Encodes a kmer in KFF format according to the given encoding. +/// Non-ACGT characters are encoded as 0s. +std::vector kff_encode(const std::string& kmer, const uint8_t* encoding); + +/// Decodes a kmer in KFF format according to the given encoding. +std::string kff_decode(const uint8_t* kmer, size_t k, const std::string& decoding); + +//------------------------------------------------------------------------------ + +/// Recodes a kmer from a minimizer index in KFF format according to the given encoding. +std::vector kff_recode(gbwtgraph::Key64::value_type kmer, size_t k, const uint8_t* encoding); + +/// Recodes a KFF kmer in the minimizer index format according to the given encoding. +/// Will fail silently if `k` is too large or `recoding` is not from `kff_recoding()`. +gbwtgraph::Key64::value_type kff_recode(const uint8_t* kmer, size_t k, kff_recoding_t recoding); + +/// Recodes a KFF kmer in the minimizer index format, assuming that the encoding is +/// the same. Will fail silently if `k` or `bytes` is too large. +gbwtgraph::Key64::value_type kff_recode_trivial(const uint8_t* kmer, size_t k, size_t bytes); + +/// Recodes `n` KFF kmers in the minimizer index format according to the given encoding. +/// Will fail silently if `k` is too large or `recoding` is not from `kff_recoding()`. +std::vector kff_recode(const uint8_t* kmers, size_t n, size_t k, kff_recoding_t recoding); + +//------------------------------------------------------------------------------ + +/// Returns the reverse complement of a KFF kmer. +std::vector kff_reverse_complement(const uint8_t* kmer, size_t k, const uint8_t* encoding); + +/// Returns the reverse complement of a minimizer index kmer. +inline gbwtgraph::Key64::value_type minimizer_reverse_complement(gbwtgraph::Key64::value_type kmer, size_t k) { + return gbwtgraph::Key64(kmer).reverse_complement(k).get_key(); +} + +//------------------------------------------------------------------------------ + +/** + * A wrapper over `Kff_reader` that allows reading kmers safely from multiple threads. + */ +class ParallelKFFReader { +public: + typedef gbwtgraph::Key64::value_type kmer_type; + + /// Creates a new reader for the given file. Throws `std::runtime_error` if the + /// sanity checks fail. + ParallelKFFReader(const std::string& filename); + + /// Reads the next `n` kmers and counts from the file. This can be called safely + /// from multiple threads. If the returned vector contains fewer than `n` kmers + /// this indicates that the reader has reached the end of the file. + std::vector> read(size_t n); + + /// KFF reader. + Kff_reader reader; + + /// Buffer from unused kmers from the latest block. + std::deque> buffer; + + /// Mutex for accessing `reader` and `buffer`. + std::mutex mtx; + + /// Length of the kmers. + size_t k; + + /// Maximum number of kmers per block. + size_t max_kmers_per_block; + + /// Number of bytes reserved for each kmer count. + size_t data_bytes; + + /// Encoding used for the kmers. + std::uint8_t encoding[4]; + + /// Recoding from KFF kmers to minimizer index kmers. 
+ kff_recoding_t recoding; + +private: + ParallelKFFReader(const ParallelKFFReader&) = delete; + ParallelKFFReader& operator= (const ParallelKFFReader&) = delete; +}; + +//------------------------------------------------------------------------------ + +} // namespace vg + +#endif // VG_KFF_HPP_INCLUDED diff --git a/src/kmer.cpp b/src/kmer.cpp index d6030d42ccf..52d63713096 100644 --- a/src/kmer.cpp +++ b/src/kmer.cpp @@ -1,173 +1,223 @@ #include "kmer.hpp" +#include + +//#define debug + namespace vg { + +const string SizeLimitExceededException::msg = "error: exceeded limit of size on disk"; + +const char* SizeLimitExceededException::what() const throw() { + return msg.c_str(); +} + void for_each_kmer(const HandleGraph& graph, size_t k, const function& lambda, - id_t head_id, id_t tail_id) { + id_t head_id, id_t tail_id, atomic* stop_flag) { // for each position on the forward and reverse of the graph // TODO -- add parallel interface in handlegraph bool using_head_tail = head_id + tail_id > 0; +#ifdef debug + cerr << "Looping over kmers" << endl; +#endif graph.for_each_handle([&](const handle_t& h) { - // for the forward and reverse of this handle - // walk k bases from the end, so that any kmer starting on the node will be represented in the tree we build - for (auto handle_is_rev : { false, true }) { - //cerr << "###########################################" << endl; - handle_t handle = handle_is_rev ? graph.flip(h) : h; - list kmers; - // for each position in the node, set up a kmer with that start position and the node end or kmer length as the end position - // determine next positions - id_t handle_id = graph.get_id(handle); - size_t handle_length = graph.get_length(handle); - string handle_seq = graph.get_sequence(handle); - for (size_t i = 0; i < handle_length; ++i) { - pos_t begin = make_pos_t(handle_id, handle_is_rev, i); - pos_t end = make_pos_t(handle_id, handle_is_rev, min(handle_length, i+k)); - kmer_t kmer = kmer_t(handle_seq.substr(offset(begin), offset(end)-offset(begin)), begin, end, handle); - // determine previous context - // if we are running with head/tail nodes, we'll need to do some trickery to eliminate the reverse complement versions of both - if (i == 0) { - // look at the previous nodes - graph.follow_edges(handle, true, [&](const handle_t& prev) { - size_t prev_length = graph.get_length(prev); - kmer.prev_pos.emplace_back(graph.get_id(prev), graph.get_is_reverse(prev), prev_length-1); - kmer.prev_char.emplace_back(graph.get_sequence(prev).substr(prev_length-1, 1)[0]); - }); - // if we're on the forward head or reverse tail, we need to point to the end of the opposite node - if (kmer.prev_pos.empty() && using_head_tail) { - if (id(begin) == head_id) { - kmer.prev_pos.emplace_back(tail_id, false, 0); - kmer.prev_char.emplace_back(graph.get_sequence(graph.get_handle(tail_id, false))[0]); - } else if (id(begin) == tail_id) { - kmer.prev_pos.emplace_back(head_id, true, 0); - kmer.prev_char.emplace_back(graph.get_sequence(graph.get_handle(head_id, true))[0]); - } +#ifdef debug + cerr << "Process handle " << graph.get_id(h) << endl; +#endif + // for the forward and reverse of this handle + // walk k bases from the end, so that any kmer starting on the node will be represented in the tree we build + for (auto handle_is_rev : { false, true }) { + handle_t handle = handle_is_rev ? 
graph.flip(h) : h; + list kmers; + // for each position in the node, set up a kmer with that start position and the node end or kmer length as the end position + // determine next positions + id_t handle_id = graph.get_id(handle); + size_t handle_length = graph.get_length(handle); + string handle_seq = graph.get_sequence(handle); + for (size_t i = 0; i < handle_length; ++i) { + pos_t begin = make_pos_t(handle_id, handle_is_rev, i); + pos_t end = make_pos_t(handle_id, handle_is_rev, min(handle_length, i+k)); + kmer_t kmer = kmer_t(handle_seq.substr(offset(begin), offset(end)-offset(begin)), begin, end, handle); + // determine previous context + // if we are running with head/tail nodes, we'll need to do some trickery to eliminate the reverse complement versions of both + if (i == 0) { + // look at the previous nodes + graph.follow_edges(handle, true, [&](const handle_t& prev) { + size_t prev_length = graph.get_length(prev); + kmer.prev_pos.emplace_back(graph.get_id(prev), graph.get_is_reverse(prev), prev_length-1); + kmer.prev_char.emplace_back(graph.get_sequence(prev).substr(prev_length-1, 1)[0]); + if (stop_flag) { + // stop if it evaluates to true + return !stop_flag->load(); + } + else { + // always keep going + return true; + } + }); + // if we're on the forward head or reverse tail, we need to point to the end of the opposite node + if (kmer.prev_pos.empty() && using_head_tail) { + if (id(begin) == head_id) { + kmer.prev_pos.emplace_back(tail_id, false, 0); + kmer.prev_char.emplace_back(graph.get_sequence(graph.get_handle(tail_id, false))[0]); + } else if (id(begin) == tail_id) { + kmer.prev_pos.emplace_back(head_id, true, 0); + kmer.prev_char.emplace_back(graph.get_sequence(graph.get_handle(head_id, true))[0]); } - } else { - // the previous is in this node - kmer.prev_pos.emplace_back(handle_id, handle_is_rev, i-1); - kmer.prev_char.emplace_back(handle_seq[i-1]); } - if (kmer.seq.size() < k) { - kmer.seq.reserve(k); // may reduce allocation costs - // follow edges if we haven't completed the kmer here - graph.follow_edges(kmer.curr, false, [&](const handle_t& next) { - kmers.push_back(kmer); - auto& todo = kmers.back(); - todo.curr = next; - }); - } else { + } else { + // the previous is in this node + kmer.prev_pos.emplace_back(handle_id, handle_is_rev, i-1); + kmer.prev_char.emplace_back(handle_seq[i-1]); + } + if (kmer.seq.size() < k) { + kmer.seq.reserve(k); // may reduce allocation costs + // follow edges if we haven't completed the kmer here + graph.follow_edges(kmer.curr, false, [&](const handle_t& next) { kmers.push_back(kmer); - } + auto& todo = kmers.back(); + todo.curr = next; + if (stop_flag) { + // stop if it evaluates to true + return !stop_flag->load(); + } + else { + // always keep going + return true; + } + }); + } else { + kmers.push_back(kmer); } - - // now expand the kmers until they reach k - while (!kmers.empty()) { - // first we check which ones have reached length k in the current handle; for each of these we run lambda and remove them from our list - auto kmers_end = kmers.end(); - for (list::iterator q = kmers.begin(); q != kmers_end; ++q) { - auto& kmer = *q; - // did we reach our target length? 
- if (kmer.seq.size() == k) { - // TODO here check if we are at the beginning of the reverse head or the beginning of the forward tail and would need special handling - // establish the context - handle_t end_handle = graph.get_handle(id(kmer.end), is_rev(kmer.end)); - size_t end_length = graph.get_length(end_handle); - if (offset(kmer.end) == end_length) { - // have to check which nodes are next - graph.follow_edges(kmer.curr, false, [&](const handle_t& next) { - kmer.next_pos.emplace_back(graph.get_id(next), graph.get_is_reverse(next), 0); - kmer.next_char.emplace_back(graph.get_sequence(next)[0]); - }); - if (kmer.next_pos.empty() && using_head_tail) { - if (id(kmer.begin) == head_id) { - kmer.next_pos.emplace_back(tail_id, true, 0); - kmer.next_char.emplace_back(graph.get_sequence(graph.get_handle(tail_id, true))[0]); - } else if (id(kmer.begin) == tail_id) { - kmer.next_pos.emplace_back(head_id, false, 0); - kmer.next_char.emplace_back(graph.get_sequence(graph.get_handle(head_id, false))[0]); - } - //cerr << "done head or tail" << endl; + + if (stop_flag && stop_flag->load()) { + break; + } + } + + // now expand the kmers until they reach k + while (!kmers.empty()) { + // first we check which ones have reached length k in the current handle; for each of these we run lambda and remove them from our list + auto kmers_end = kmers.end(); + for (list::iterator q = kmers.begin(); q != kmers_end; ++q) { + auto& kmer = *q; + // did we reach our target length? + if (kmer.seq.size() == k) { + // TODO here check if we are at the beginning of the reverse head or the beginning of the forward tail and would need special handling + // establish the context + handle_t end_handle = graph.get_handle(id(kmer.end), is_rev(kmer.end)); + size_t end_length = graph.get_length(end_handle); + if (offset(kmer.end) == end_length) { + // have to check which nodes are next + graph.follow_edges(kmer.curr, false, [&](const handle_t& next) { + kmer.next_pos.emplace_back(graph.get_id(next), graph.get_is_reverse(next), 0); + kmer.next_char.emplace_back(graph.get_sequence(next)[0]); + }); + if (kmer.next_pos.empty() && using_head_tail) { + if (id(kmer.begin) == head_id) { + kmer.next_pos.emplace_back(tail_id, true, 0); + kmer.next_char.emplace_back(graph.get_sequence(graph.get_handle(tail_id, true))[0]); + } else if (id(kmer.begin) == tail_id) { + kmer.next_pos.emplace_back(head_id, false, 0); + kmer.next_char.emplace_back(graph.get_sequence(graph.get_handle(head_id, false))[0]); } - } else { - // on node - kmer.next_pos.push_back(kmer.end); - kmer.next_char.push_back(graph.get_sequence(end_handle)[offset(kmer.end)]); + //cerr << "done head or tail" << endl; } - // if we have head and tail ids set, iterate through our positions and do the flip - if (using_head_tail) { - // flip the beginning - if (id(kmer.begin) == head_id && is_rev(kmer.begin)) { - get_id(kmer.begin) = tail_id; - get_is_rev(kmer.begin) = false; - } else if (id(kmer.begin) == tail_id && is_rev(kmer.begin)) { - get_id(kmer.begin) = head_id; - get_is_rev(kmer.begin) = false; - } - // flip the nexts - for (auto& pos : kmer.next_pos) { - if (id(pos) == head_id && is_rev(pos)) { - get_id(pos) = tail_id; - get_is_rev(pos) = false; - } else if (id(pos) == tail_id && is_rev(pos)) { - get_id(pos) = head_id; - get_is_rev(pos) = false; - } - } - // if we aren't both from and to a head/tail node, emit - /* - if (!((offset(kmer.begin) == 0 - && id(kmer.begin) == head_id - && kmer.next_pos.size() == 1 - && id(kmer.next_pos.front()) == tail_id) - || 
(offset(kmer.begin) == 0 - && id(kmer.begin) == tail_id - && kmer.next_pos.size() == 1 - && id(kmer.next_pos.front()) == head_id))) { - lambda(kmer); - } - */ - if (kmer.prev_pos.size() == 1 && kmer.next_pos.size() == 1 - && (offset(kmer.begin) == 0) - && (id(kmer.begin) == head_id || id(kmer.begin) == tail_id) - && (id(kmer.prev_pos.front()) == head_id || id(kmer.prev_pos.front()) == tail_id) - && (id(kmer.next_pos.front()) == head_id || id(kmer.next_pos.front()) == tail_id)) { - // skip - } else { - lambda(kmer); + } else { + // on node + kmer.next_pos.push_back(kmer.end); + kmer.next_char.push_back(graph.get_sequence(end_handle)[offset(kmer.end)]); + } + // if we have head and tail ids set, iterate through our positions and do the flip + if (using_head_tail) { + // flip the beginning + if (id(kmer.begin) == head_id && is_rev(kmer.begin)) { + get_id(kmer.begin) = tail_id; + get_is_rev(kmer.begin) = false; + } else if (id(kmer.begin) == tail_id && is_rev(kmer.begin)) { + get_id(kmer.begin) = head_id; + get_is_rev(kmer.begin) = false; + } + // flip the nexts + for (auto& pos : kmer.next_pos) { + if (id(pos) == head_id && is_rev(pos)) { + get_id(pos) = tail_id; + get_is_rev(pos) = false; + } else if (id(pos) == tail_id && is_rev(pos)) { + get_id(pos) = head_id; + get_is_rev(pos) = false; } + } + // if we aren't both from and to a head/tail node, emit + /* + if (!((offset(kmer.begin) == 0 + && id(kmer.begin) == head_id + && kmer.next_pos.size() == 1 + && id(kmer.next_pos.front()) == tail_id) + || (offset(kmer.begin) == 0 + && id(kmer.begin) == tail_id + && kmer.next_pos.size() == 1 + && id(kmer.next_pos.front()) == head_id))) { + lambda(kmer); + } + */ + if (kmer.prev_pos.size() == 1 && kmer.next_pos.size() == 1 + && (offset(kmer.begin) == 0) + && (id(kmer.begin) == head_id || id(kmer.begin) == tail_id) + && (id(kmer.prev_pos.front()) == head_id || id(kmer.prev_pos.front()) == tail_id) + && (id(kmer.next_pos.front()) == head_id || id(kmer.next_pos.front()) == tail_id)) { + // skip } else { - // now pass the kmer and its context to our callback lambda(kmer); } + } else { + // now pass the kmer and its context to our callback + lambda(kmer); + } + q = kmers.erase(q); + } else { + // do we finish in the current node? + id_t curr_id = graph.get_id(kmer.curr); + size_t curr_length = graph.get_length(kmer.curr); + bool curr_is_rev = graph.get_is_reverse(kmer.curr); + string curr_seq = graph.get_sequence(kmer.curr); + size_t take = min(curr_length, k-kmer.seq.size()); + kmer.end = make_pos_t(curr_id, curr_is_rev, take); + kmer.seq.append(curr_seq.substr(0,take)); + if (kmer.seq.size() < k) { + // if not, we need to expand through the node then follow on + graph.follow_edges(kmer.curr, false, [&](const handle_t& next) { + kmers.push_back(kmer); + auto& todo = kmers.back(); + todo.curr = next; + }); q = kmers.erase(q); } else { - // do we finish in the current node? 
- id_t curr_id = graph.get_id(kmer.curr); - size_t curr_length = graph.get_length(kmer.curr); - bool curr_is_rev = graph.get_is_reverse(kmer.curr); - string curr_seq = graph.get_sequence(kmer.curr); - size_t take = min(curr_length, k-kmer.seq.size()); - kmer.end = make_pos_t(curr_id, curr_is_rev, take); - kmer.seq.append(curr_seq.substr(0,take)); - if (kmer.seq.size() < k) { - // if not, we need to expand through the node then follow on - graph.follow_edges(kmer.curr, false, [&](const handle_t& next) { - kmers.push_back(kmer); - auto& todo = kmers.back(); - todo.curr = next; - }); - q = kmers.erase(q); - } else { - if (kmer.seq.size() > k) { - assert(kmer.seq.size() <= k); - } + if (kmer.seq.size() > k) { + assert(kmer.seq.size() <= k); } } } } + if (stop_flag && stop_flag->load()) { + break; + } } - }, true); + if (stop_flag && stop_flag->load()) { + break; + } + } + if (stop_flag) { + // stop if it evaluates to true + return !stop_flag->load(); + } + else { + // always keep going + return true; + } + }, true); } ostream& operator<<(ostream& out, const kmer_t& kmer) { @@ -200,7 +250,7 @@ void kmer_to_gcsa_kmers(const kmer_t& kmer, const gcsa::Alphabet& alpha, const f if (offset(kmer.begin) >= 1024) { #pragma omp critical (error) { - cerr << "Found kmer with offset >= 1024. GCSA2 cannot handle nodes greater than 1024 bases long. " + cerr << "error: Found kmer with offset >= 1024. GCSA2 cannot handle nodes greater than 1024 bases long. " << "To enable indexing, modify your graph using `vg mod -X 256 x.vg >y.vg`. " << kmer << endl; exit(1); @@ -233,20 +283,30 @@ void write_gcsa_kmers(const HandleGraph& graph, int kmer_size, ostream& out, siz thread_outputs.resize(omp_get_num_threads()); } } + + // we can't throw from within an OMP block, so instead we have to use some machinery to flag when + // we need to throw + atomic size_limit_exceeded(0); + // This handles the buffered writing for each thread size_t buffer_limit = 1e5; // max 100k kmers per buffer size_t total_bytes = 0; auto handle_kmers = [&](vector& kmers, bool more) { if (!more || kmers.size() > buffer_limit) { size_t bytes_required = kmers.size() * sizeof(gcsa::KMer) + sizeof(gcsa::GraphFileHeader); -#pragma omp critical (gcsa_kmer_out) +#pragma omp critical { - if (total_bytes + bytes_required > size_limit) { - cerr << "error: [write_gcsa_kmers()] size limit exceeded" << endl; - exit(EXIT_FAILURE); + if (!size_limit_exceeded.load()) { + // we didn't exceed the size limit while waiting for the critical block + if (total_bytes + bytes_required > size_limit) { + cerr << "error: [write_gcsa_kmers()] size limit of " << size_limit << " bytes exceeded" << endl; + size_limit_exceeded.store(1); + } + else { + gcsa::writeBinary(out, kmers, kmer_size); + total_bytes += bytes_required; + } } - gcsa::writeBinary(out, kmers, kmer_size); - total_bytes += bytes_required; } kmers.clear(); } @@ -260,11 +320,17 @@ void write_gcsa_kmers(const HandleGraph& graph, int kmer_size, ostream& out, siz handle_kmers(thread_output, true); }; // Run on each KmerPosition. This populates start_end_id, if it was 0, before calling convert_kmer. 
- for_each_kmer(graph, kmer_size, convert_kmer, head_id, tail_id); + for_each_kmer(graph, kmer_size, convert_kmer, head_id, tail_id, &size_limit_exceeded); for(auto& thread_output : thread_outputs) { // Flush our buffers handle_kmers(thread_output, false); } + // did we end execution because we hit the size limit + if (size_limit_exceeded.load()) { + throw SizeLimitExceededException(); + } + + // FIXME: we seem to use this behavior in VGset, but this is not good semantics size_limit = total_bytes; } @@ -274,7 +340,14 @@ string write_gcsa_kmers_to_tmpfile(const HandleGraph& graph, int kmer_size, size string tmpfile = temp_file::create(base_file_name); ofstream out(tmpfile); // write the kmers to the temporary file - write_gcsa_kmers(graph, kmer_size, out, size_limit, head_id, tail_id); + try { + write_gcsa_kmers(graph, kmer_size, out, size_limit, head_id, tail_id); + } + catch (SizeLimitExceededException& ex) { + out.close(); + temp_file::remove(tmpfile); + throw ex; + } out.close(); return tmpfile; } diff --git a/src/kmer.hpp b/src/kmer.hpp index b8bb279195e..73ea374cbf7 100644 --- a/src/kmer.hpp +++ b/src/kmer.hpp @@ -1,9 +1,10 @@ #ifndef VG_KMER_HPP_INCLUDED #define VG_KMER_HPP_INCLUDED -#include "vg.pb.h" +#include #include -#include "json2pb.h" +#include +#include "vg/io/json2pb.h" #include "handle.hpp" #include "position.hpp" #include "gcsa/gcsa.h" @@ -40,10 +41,27 @@ struct kmer_t { vector next_char; }; +/** + * Exception that indicates that a limit on disk size has been exceeded + */ +class SizeLimitExceededException : public std::exception { +public: + + SizeLimitExceededException() noexcept = default; + ~SizeLimitExceededException() noexcept = default; + + const char* what() const noexcept; +private: + + static const string msg; +}; + /// Iterate over all the kmers in the graph, running lambda on each +/// If the stop flag is included, stop execution if it ever evaluates to true void for_each_kmer(const HandleGraph& graph, size_t k, const function& lambda, - id_t head_id = 0, id_t tail_id = 0); + id_t head_id = 0, id_t tail_id = 0, + atomic* stop_flag = nullptr); /// Print a kmer_t to a stream. ostream& operator<<(ostream& out, const kmer_t& kmer); @@ -62,7 +80,8 @@ gcsa::byte_type encode_chars(const vector& chars, const gcsa::Alphabet& al void write_gcsa_kmers(const HandleGraph& graph, int kmer_size, ostream& out, size_t& size_limit, id_t head_id, id_t tail_id); /// Open a tempfile and write the kmers to it. The calling context should remove it -/// with temp_file::remove(). +/// with temp_file::remove(). In the case that the size limit is exceeded, throws a +/// SizeLimitExceededException and deletes the temp file. string write_gcsa_kmers_to_tmpfile(const HandleGraph& graph, int kmer_size, size_t& size_limit, id_t head_id, id_t tail_id, const string& base_file_name = "vg-kmers-tmp-"); diff --git a/src/kmp.cpp b/src/kmp.cpp new file mode 100644 index 00000000000..2e417c4ac4e --- /dev/null +++ b/src/kmp.cpp @@ -0,0 +1,60 @@ +#include "kmp.hpp" + +#include +#include + +namespace vg { + +// TODO: these functions are very similar, could possibly merge into one? 
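// Illustrative usage sketch, hedged: it assumes the prefix-suffix table is a
// vector<size_t>, matching the declarations in kmp.hpp, and is not called anywhere
// in this file. Build the table once per pattern, then reuse it to scan any number
// of texts; kmp_search returns string::npos when the pattern does not occur.
static size_t example_kmp_usage() {
    string text = "GATTACAGATTACA";
    string pattern = "TACA";
    // Preprocess the pattern once.
    vector<size_t> table = make_prefix_suffix_table(pattern.c_str(), pattern.size());
    // First occurrence of "TACA" in the text starts at index 3.
    return kmp_search(text.c_str(), text.size(), pattern.c_str(), pattern.size(), table);
}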
+ +vector make_prefix_suffix_table(const char* pattern, size_t len) { + + vector table(len, 0); + + for (size_t i = 1, j = 0; i < len;) { + if (pattern[i] == pattern[j]) { + ++j; + table[i] = j; + ++i; + } + else { + if (j != 0) { + j = table[j - 1]; + } + else { + table[i] = 0; + ++i; + } + } + } + + return table; +} + +size_t kmp_search(const char* text, size_t text_len, + const char* pattern, size_t pattern_len, + const vector& prefix_suffix_table) { + if (text_len >= pattern_len) { + for (size_t i = 0, j = 0, last = text_len - pattern_len; i - j <= last;) { + if (text[i] == pattern[j]) { + ++i; + ++j; + if (j == pattern_len) { + return i - pattern_len; + } + } + else { + if (j != 0) { + j = prefix_suffix_table[j - 1]; + } + else { + ++i; + } + } + } + } + return string::npos; +} + + +} diff --git a/src/kmp.hpp b/src/kmp.hpp new file mode 100644 index 00000000000..0dbbf07d572 --- /dev/null +++ b/src/kmp.hpp @@ -0,0 +1,23 @@ +#ifndef VG_KMP_HPP_INCLUDED +#define VG_KMP_HPP_INCLUDED + +// kmp.hpp: Knuth-Morris-Pratt algorithm + +#include + +namespace vg { + +using namespace std; + +// preprocess search pattern +vector make_prefix_suffix_table(const char* pattern, size_t len); + +// return index of first match or string::npos if there is no match +size_t kmp_search(const char* text, size_t text_len, + const char* pattern, size_t pattern_len, + const vector& prefix_suffix_table); + + +} + +#endif diff --git a/src/main.cpp b/src/main.cpp index 70f9b23ebc1..dc87d7381ad 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,6 +14,8 @@ #include "utility.hpp" #include "crash.hpp" #include "preflight.hpp" +#include "config/allocator_config.hpp" +#include "io/register_libvg_io.hpp" // New subcommand system provides all the subcommands that used to live here #include "subcommand/subcommand.hpp" @@ -40,23 +42,38 @@ void vg_help(char** argv) { }); cerr << endl << "For more commands, type `vg help`." << endl; + cerr << "For technical support, please visit: https://www.biostars.org/tag/vg/" << endl << endl; } // We make sure to compile main for the lowest common denominator architecture. -// This works on GCC and Clang. But we have to decalre main and then define it. -int main(int argc, char *argv[]) __attribute__((__target__("arch=x86-64"))); +// This macro is defined in the preflight header on supported compiler setups. +// But to use it we have to declare and then define main. +int main(int argc, char *argv[]) VG_PREFLIGHT_EVERYWHERE; int main(int argc, char *argv[]) { // Make sure the system meets system requirements (i.e. has all the instructions we need) preflight_check(); - + + // Make sure we configure the memory allocator appropriately for our environment + configure_memory_allocator(); + // Set up stack trace support from crash.hpp enable_crash_handling(); - - // set a higher value for tcmalloc warnings - setenv("TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD", "1000000000000000", 1); + // Determine a sensible default number of threads and apply it. + choose_good_thread_count(); + + // Determine temp directory from environment variables. + temp_file::set_system_dir(); + + // Tell the IO library about libvg types. + // TODO: Make a more generic libvg startup function? 
+ if (!vg::io::register_libvg_io()) { + cerr << "error[vg]: Could not register libvg types with libvgio" << endl; + return 1; + } + if (argc == 1) { vg_help(argv); return 1; @@ -65,6 +82,9 @@ int main(int argc, char *argv[]) { auto* subcommand = vg::subcommand::Subcommand::get(argc, argv); if (subcommand != nullptr) { // We found a matching subcommand, so run it + if (subcommand->get_category() == vg::subcommand::CommandCategory::DEPRECATED) { + cerr << endl << "WARNING:[vg] Subcommand '" << argv[1] << "' is deprecated and is no longer being actively maintained. Future releases may eliminate it entirely." << endl << endl; + } return (*subcommand)(argc, argv); } else { // No subcommand found diff --git a/src/mapper.cpp b/src/mapper.cpp index 49426474842..fd9641489c9 100644 --- a/src/mapper.cpp +++ b/src/mapper.cpp @@ -1,21 +1,37 @@ #include #include "mapper.hpp" -#include "haplotypes.hpp" +#include "graph.hpp" #include "annotation.hpp" +#include "statistics.hpp" +#include "path.hpp" +#include "entropy.hpp" +#include "alignment.hpp" +#include "translator.hpp" +#include "algorithms/subgraph.hpp" +#include "algorithms/nearest_offsets_in_paths.hpp" +#include "algorithms/jump_along_path.hpp" +#include "algorithms/approx_path_distance.hpp" +#include "algorithms/path_string.hpp" +#include "algorithms/alignment_path_offsets.hpp" #include "algorithms/extract_containing_graph.hpp" //#define debug_mapper +//#define debug_strip_match + +using namespace vg::io; + namespace vg { // init the static memo thread_local vector BaseMapper::adaptive_reseed_length_memo; -BaseMapper::BaseMapper(xg::XG* xidex, +BaseMapper::BaseMapper(PathPositionHandleGraph* xidex, gcsa::GCSA* g, gcsa::LCPArray* a, haplo::ScoreProvider* haplo_score_provider) : - xindex(xidex) + AlignerClient(estimate_gc_content(g)) + , xindex(xidex) , gcsa(g) , lcp(a) , haplo_score_provider(haplo_score_provider) @@ -26,17 +42,16 @@ BaseMapper::BaseMapper(xg::XG* xidex, , adaptive_reseed_diff(true) , adaptive_diff_exponent(0.065) , hit_max(0) - , alignment_threads(1) - , qual_adj_aligner(nullptr) - , regular_aligner(nullptr) - , adjust_alignments_for_base_quality(false) , mapping_quality_method(Approx) , max_mapping_quality(60) , strip_bonuses(false) , assume_acyclic(false) + , exclude_unaligned(false) { - init_aligner(default_match, default_mismatch, default_gap_open, - default_gap_extension, default_full_length_bonus); + + if (xindex != nullptr) { + avg_node_length = xindex->get_total_length() / xindex->get_node_count(); + } // TODO: removing these consistency checks because we seem to have violated them pretty wontonly in // the code base already by changing the members directly when they were still public @@ -44,7 +59,7 @@ BaseMapper::BaseMapper(xg::XG* xidex, // // allow a default constructor with no indexes // if (xidex || g || a) { // if(xidex == nullptr) { -// // We need an XG graph. +// // We need an PathPositionHandleGraph graph. // cerr << "error:[vg::Mapper] cannot create an xg-based Mapper with null xg index" << endl; // exit(1); // } @@ -67,12 +82,6 @@ BaseMapper::BaseMapper(void) : BaseMapper(nullptr, nullptr, nullptr) { // Nothing to do. Default constructed and can't really do anything. } -BaseMapper::~BaseMapper(void) { - - clear_aligners(); - -} - // Use the GCSA2 index to find super-maximal exact matches. 
vector BaseMapper::find_mems_simple(string::const_iterator seq_begin, @@ -86,7 +95,6 @@ BaseMapper::find_mems_simple(string::const_iterator seq_begin, exit(1); } - string::const_iterator cursor = seq_end; vector mems; // an empty sequence matches the entire bwt @@ -113,56 +121,53 @@ BaseMapper::find_mems_simple(string::const_iterator seq_begin, // the temporary MEM we'll build up in this process auto full_range = gcsa::range_type(0, gcsa->size() - 1); - MaximalExactMatch match(cursor, cursor, full_range); - gcsa::range_type last_range = match.range; - --cursor; // start off looking at the last character in the query + string::const_iterator curr_end = seq_end; + string::const_iterator cursor = seq_end - 1; + // start off looking at the last character in the query + gcsa::range_type range = accelerate_mem_query(seq_begin, cursor); while (cursor >= seq_begin) { // hold onto our previous range - last_range = match.range; + auto last_range = range; // execute one step of LF mapping - match.range = gcsa->LF(match.range, gcsa->alpha.char2comp[*cursor]); - if (gcsa::Range::empty(match.range) - || max_mem_length && match.end-cursor > max_mem_length - || match.end-cursor > gcsa->order()) { + range = gcsa->LF(range, gcsa->alpha.char2comp[*cursor]); + if (gcsa::Range::empty(range) + || (max_mem_length && curr_end - cursor > max_mem_length) + || curr_end - cursor > gcsa->order()) { // break on N; which for DNA we assume is non-informative // this *will* match many places in assemblies; this isn't helpful if (*cursor == 'N' || last_range == full_range) { // we mismatched in a single character // there is no MEM here - match.begin = cursor+1; - match.range = last_range; - mems.push_back(match); - match.end = cursor; - match.range = full_range; + mems.emplace_back(cursor + 1, curr_end, last_range); + curr_end = cursor; + range = full_range; --cursor; } else { // we've exhausted our BWT range, so the last match range was maximal // or: we have exceeded the order of the graph (FPs if we go further) // we have run over our parameter-defined MEM limit // record the last MEM - match.begin = cursor+1; - match.range = last_range; - mems.push_back(match); + mems.emplace_back(cursor + 1, curr_end, last_range); // set up the next MEM using the parent node range // length of last MEM, which we use to update our end pointer for the next MEM - size_t last_mem_length = match.end - match.begin; + int64_t last_mem_length = (curr_end - cursor) - 1; // get the parent suffix tree node corresponding to the parent of the last MEM's STNode gcsa::STNode parent = lcp->parent(last_range); // change the end for the next mem to reflect our step size size_t step_size = last_mem_length - parent.lcp(); - match.end = mems.back().end-step_size; + curr_end = curr_end - step_size; // and set up the next MEM using the parent node range - match.range = parent.range(); + range = parent.range(); } } else { - // we are matching - match.begin = cursor; // just step to the next position --cursor; } } // if we have a non-empty MEM at the end, record it - if (match.end - match.begin > 0) mems.push_back(match); + if (curr_end > seq_begin) { + mems.emplace_back(seq_begin, curr_end, range); + } // find the SMEMs from the mostly-SMEM and some MEM list we've built // FIXME: un-hack this (it shouldn't be needed!) 
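// A hedged aside on the core primitive this function is built around: one gcsa->LF()
// step extends the current match by a single character to the left, and an empty range
// means the extended string no longer occurs in the index. A minimal sketch (ignoring
// the max_mem_length and gcsa->order() guards used above) of counting the exact
// occurrences of a full pattern by backward search:
//
//     gcsa::range_type range(0, gcsa->size() - 1);   // empty query matches the whole BWT
//     for (auto it = pattern.rbegin(); it != pattern.rend() && !gcsa::Range::empty(range); ++it) {
//         range = gcsa->LF(range, gcsa->alpha.char2comp[*it]);
//     }
//     size_t hits = gcsa::Range::empty(range) ? 0 : gcsa->count(range);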
@@ -216,8 +221,8 @@ BaseMapper::find_mems_simple(string::const_iterator seq_begin, vector reseeded; for (auto& mem : mems) { // reseed if we have a long singular match - if (mem.length() >= reseed_length - && mem.match_count == 1 + if ((mem.length() >= reseed_length + && mem.match_count == 1) // or if we only have one mem for the entire read (even if it may have many matches) || mems.size() == 1) { // reseed at midway between here and the min mem length and at the min mem length @@ -225,7 +230,6 @@ BaseMapper::find_mems_simple(string::const_iterator seq_begin, int reseeds = 0; while (reseeds == 0 && reseed_to >= min_mem_length) { #ifdef debug_mapper -#pragma omp critical cerr << "reseeding " << mem.sequence() << " with " << reseed_to << endl; #endif vector remems = find_mems_simple(mem.begin, @@ -258,6 +262,655 @@ BaseMapper::find_mems_simple(string::const_iterator seq_begin, return mems; } +vector BaseMapper::find_fanout_mems(string::const_iterator seq_begin, + string::const_iterator seq_end, + string::const_iterator qual_begin, + int max_fans_out, + char max_fanout_base_quality, + vector>>* mem_fanout_breaks) { + +#ifdef debug_mapper + cerr << "find_fanout_mems: sequence " << string(seq_begin, seq_end) << ", max fan-out quality " << (int) max_fanout_base_quality << endl; +#endif + + vector> qual_pos; + qual_pos.reserve(seq_end - seq_begin); + for (auto seq_it = seq_begin, qual_it = qual_begin; seq_it != seq_end; ++seq_it, ++qual_it) { + qual_pos.emplace_back(*(qual_it), seq_it); + } + make_heap(qual_pos.begin(), qual_pos.end(), greater()); + + // find the k lowest base qualities and marking them as the fan out positions (breaking ties + // by moving to start of read) + vector do_fanout(seq_end - seq_begin, false); + while (!qual_pos.empty() && qual_pos.front().first < max_fanout_base_quality + && do_fanout.size() - qual_pos.size() < max_fans_out) { + + do_fanout[qual_pos.front().second - seq_begin] = true; + pop_heap(qual_pos.begin(), qual_pos.end(), greater()); + qual_pos.pop_back(); + } + +#ifdef debug_mapper + cerr << "chose fan-out positions: "; + for (size_t i = 0; i < do_fanout.size(); ++i) { + if (do_fanout[i]) { + cerr << i << " "; + } + } + cerr << endl; +#endif + + // a struct to hold the information needed to continue a search from a fan-out + struct FanOutSearch { + // the SA range going into the search + gcsa::range_type prev_range; + // where the search originally started + string::const_iterator match_end; + // where the search will continue from when this is dequeued + string::const_iterator cursor; + // the char we should use at the first search + char fanout_char; + // the where the MEM will need to be broken up + deque> fanout_breaks; + // record whether the iteration before the + bool prev_iter_jumped_lcp; + + FanOutSearch(gcsa::range_type prev_range, + string::const_iterator seq_begin, + string::const_iterator match_end, + string::const_iterator cursor, + char fanout_char, + bool prev_iter_jumped_lcp, + const deque>& prev_fanout_breaks) + : prev_range(prev_range), match_end(match_end), cursor(cursor), fanout_char(fanout_char), + prev_iter_jumped_lcp(prev_iter_jumped_lcp), fanout_breaks(prev_fanout_breaks) { + if (cursor >= seq_begin && *cursor != fanout_char) { + fanout_breaks.emplace_front(cursor, fanout_char); + } + } + FanOutSearch(gcsa::range_type prev_range, + string::const_iterator seq_begin, + string::const_iterator match_end, + string::const_iterator cursor, + char fanout_char) + : prev_range(prev_range), match_end(match_end), cursor(cursor), 
fanout_char(fanout_char), prev_iter_jumped_lcp(false) { + if (*cursor != fanout_char) { + fanout_breaks.emplace_front(cursor, fanout_char); + } + } + }; + + gcsa::range_type full_range = gcsa::range_type(0, gcsa->size() - 1); + + // initialize the stack of search problems + vector stack; + if (seq_begin < seq_end) { + // walk backwards from end until finding a non-N base or a base where we want to fan out + auto start_cursor = seq_end - 1; + while (start_cursor >= seq_begin && *start_cursor == 'N' && !do_fanout[start_cursor - seq_begin]) { + seq_end = start_cursor; + --start_cursor; + } + if (start_cursor >= seq_begin) { + if (do_fanout[start_cursor - seq_begin]) { + // fan out at the first base + for (char ch : {'A', 'C', 'G', 'T'}) { + stack.emplace_back(full_range, seq_begin, seq_end, start_cursor, ch); + } + } + else { + // don't fan out at the first base + stack.emplace_back(full_range, seq_begin, seq_end, start_cursor, *start_cursor); + } + } + } + + vector>>> search_results; + + while (!stack.empty()) { + + auto cursor = stack.back().cursor; + auto match_end = stack.back().match_end; + auto range = stack.back().prev_range; + auto fanout_char = stack.back().fanout_char; + auto fanout_breaks = move(stack.back().fanout_breaks); + auto prev_iter_jumped_lcp = stack.back().prev_iter_jumped_lcp; + stack.pop_back(); + +#ifdef debug_mapper + cerr << "starting a fanout search with cursor at " << (cursor - seq_begin) << " and fanout char of " << fanout_char << endl; + cerr << "current suffix: " << string(cursor, match_end) << endl; + cerr << "breaks: " << endl; + for (auto r : fanout_breaks) { + cerr << "\t" << (r.first - seq_begin) << " -> " << r.second << endl; + } +#endif + + + bool use_fanout_char = true; + + while (cursor >= seq_begin) { + + // we only use the fan out character on the first iteration of a fan out search + char ch = use_fanout_char ? fanout_char : *cursor; + +#ifdef debug_mapper + cerr << "LF iter at cursor " << (cursor - seq_begin) << " char " << ch << ", fanout? 
" << use_fanout_char << endl; +#endif + + if (!use_fanout_char && do_fanout[cursor - seq_begin]) { +#ifdef debug_mapper + cerr << "aborting to search to queue up fan-out searches" << endl; +#endif + // fan out into all possiblities + for (char nt : {'A', 'C', 'G', 'T'}) { + stack.emplace_back(range, seq_begin, match_end, cursor, nt, prev_iter_jumped_lcp, fanout_breaks); + } + // stop the current search + break; + } + + // hold onto our previous range + auto prev_range = range; + + // execute one step of LF mapping + range = gcsa->LF(range, gcsa->alpha.char2comp[ch]); + + if (gcsa::Range::empty(range) || match_end - cursor > gcsa->order() || ch == 'N') { + +#ifdef debug_mapper + cerr << "terminating search at end of a MEM" << endl; +#endif + + // we've exhausted our BWT range, so the last match range was maximal + // or we have exceeded the order of the graph (FPs if we go further) + + if (cursor + 1 == match_end || ch == 'N') { +#ifdef debug_mapper + cerr << "skipping past a full index mismatch or N" << endl; +#endif + // avoid getting caught in infinite loop when a single character mismatches + // entire index (b/c then advancing the LCP doesn't move the search forward + // at all, need to move the cursor instead) + + if (!fanout_breaks.empty() && match_end - cursor - 1 <= fanout_length_threshold) { + // this fan-out search didn't find a long enough match for us to think + // that these are non-random matches + break; + } + + search_results.emplace_back(prev_range, cursor + 1, match_end, fanout_breaks); + + match_end = cursor; + range = full_range; + fanout_breaks.clear(); + --cursor; + + prev_iter_jumped_lcp = false; + } + else { +#ifdef debug_mapper + cerr << "reached end of match" << endl; +#endif + auto match_begin = cursor + 1; + if (!fanout_breaks.empty() && match_end - match_begin <= fanout_length_threshold) { + // this fan-out search didn't find a long enough match for us to think + // that these are non-random matches +#ifdef debug_mapper + cerr << "match length of " << (match_end - match_begin) << " is below threshold " << fanout_length_threshold << ", aborting search" << endl; +#endif + break; + } + + + // record the last MEM, but check to make sure were not actually still searching + // for the end of the next MEM + if (!prev_iter_jumped_lcp) { +#ifdef debug_mapper + cerr << "emitting a search result" << endl; +#endif + search_results.emplace_back(prev_range, match_begin, match_end, fanout_breaks); + } + + // init this outside of the if condition so that we can avoid calling the + // expensive LCP::parent function twice in the code path that checks max LCP + gcsa::STNode parent = lcp->parent(prev_range); + + // set the MEM to be the longest prefix that is shared with another MEM + match_end = match_begin + parent.lcp(); + // and set up the next MEM using the parent node range + range = parent.range(); + // forget about any fan outs that happened after the end of the new match + while (!fanout_breaks.empty() && fanout_breaks.back().first >= match_end) { + fanout_breaks.pop_back(); + } + prev_iter_jumped_lcp = true; + +#ifdef debug_mapper + cerr << "after jumping LCP, suffix is " << string(cursor + 1, match_end) << endl; +#endif + + if (use_fanout_char) { + // this match ended on the fanout character, so we may need to remove + // that fan-out break from the search result + if (!get<3>(search_results.back()).empty() && + get<3>(search_results.back()).front().first < get<1>(search_results.back())) { + get<3>(search_results.back()).pop_front(); + } + // and skip the part of the 
iteration where we mark ourselves as being + // past the fan-out character + continue; + } + + } + } + else { + prev_iter_jumped_lcp = false; + // just step to the next position + --cursor; + } + + use_fanout_char = false; + } + + if (cursor < seq_begin) { +#ifdef debug_mapper + cerr << "terminating search at start of the read" << endl; +#endif + search_results.emplace_back(range, seq_begin, match_end, fanout_breaks); + } + } + + // filter out redundant MEMs, which are common in this algorithm (often when the + // larger MEM includes a correct mismatch from a fan-out). locate() is the most + // expensive part of this algorithm, so it's worth expending some effort to reduce + // calls to it + + // first order by read interval start and then by decreasing match size and range + // size, this ensures that a search result can only be fully redundant with another + // one that is earlier in the vector + sort(search_results.begin(), search_results.end(), + [](const tuple>>& a, + const tuple>>& b) { + return (get<1>(a) < get<1>(b) + || (get<1>(a) == get<1>(b) && get<2>(a) > get<2>(b)) + || (get<1>(a) == get<1>(b) && get<2>(a) == get<2>(b) + && gcsa::Range::length(get<0>(a)) > gcsa::Range::length(get<0>(b)))); + }); + + size_t res_removed_so_far = 0; + for (size_t i = 0, j = 1; j < search_results.size(); ++j) { + // advance the starting index past the search results that cannot contain + // this one + while (get<2>(search_results[i]) < get<1>(search_results[j])) { + ++i; + } + bool redundant = false; + for (size_t k = i; k < j && !redundant; ++k) { + // does the other result cover the entire read interval and all of its + // locations in the graph? + redundant = (get<2>(search_results[k]) >= get<2>(search_results[j]) + && get<0>(search_results[k]).first <= get<0>(search_results[j]).first + && get<0>(search_results[k]).second >= get<0>(search_results[j]).second); + } + + if (redundant) { + ++res_removed_so_far; + } + else if (res_removed_so_far) { + search_results[j - res_removed_so_far] = move(search_results[j]); + } + } + + if (res_removed_so_far) { +#ifdef debug_mapper + cerr << "removing " << res_removed_so_far << " redundant search results" << endl; +#endif + search_results.resize(search_results.size() - res_removed_so_far); + } + + vector mems; + mems.reserve(search_results.size()); + if (mem_fanout_breaks) { + mem_fanout_breaks->reserve(search_results.size()); + } + for (auto it = search_results.begin(), end = search_results.end(); it != end; ++it) { + + const auto& search_result = *it; + + if (get<2>(search_result) - get<1>(search_result) < min_mem_length) { + continue; + } + + // there are no mismatches in the search, so the whole thing can become 1 MEM + mems.emplace_back(get<1>(search_result), get<2>(search_result), get<0>(search_result), + gcsa->count(get<0>(search_result))); + if (!hard_hit_max || mems.back().match_count < hard_hit_max) { + if (hit_max) { + gcsa->locate(mems.back().range, hit_max, mems.back().nodes); + + } + else { + gcsa->locate(mems.back().range, mems.back().nodes); + } + } + +#ifdef debug_mapper + cerr << "created MEM " << mems.back().sequence() << ", filled " << mems.back().nodes.size() << " of " << mems.back().match_count << " hits" << endl; + for (auto n : mems.back().nodes) { + cerr << "\t" << make_pos_t(n) << endl; + } +#endif + + // keep track of the initial number of hits we query in case the nodes vector is + // modified later (e.g. 
by prefiltering) + mems.back().queried_count = mems.back().nodes.size(); + + if (mem_fanout_breaks) { + // we're communicating fan-out breaks to the calling environment rather than + // breaking them into exact matches right now + +#ifdef debug_mapper + cerr << "recording fanout breaks:" << endl; + for (auto b : get<3>(search_result)) { + cerr << "\t" << (b.first - seq_begin) << ": " << *b.first << " -> " << b.second << endl; + } +#endif + mem_fanout_breaks->emplace_back(move(get<3>(search_result))); + } + else if (!get<3>(search_result).empty()) { +#ifdef debug_mapper + cerr << "need to break apart fanout MEM" << endl; +#endif + + // find the paths that each hit took + vector> paths; + paths.reserve(mems.back().nodes.size()); + for (gcsa::node_type pos : mems.back().nodes) { + paths.emplace_back(walk_fanout_path(get<1>(search_result), + get<2>(search_result), + get<3>(search_result), + pos)); + } + + // records of (pos index, bases past pos offset) + vector> path_positions(paths.size(), pair(0, 0)); + + for (auto fanout_break : get<3>(search_result)) { +#ifdef debug_mapper + cerr << "breaking fanout mems at " << (fanout_break.first - seq_begin) << " -> " << fanout_break.second << endl; +#endif + + // split up the match into two segments + mems.back().end = fanout_break.first; + if (fanout_break.first + 1 == seq_end) { + break; + } + mems.emplace_back(fanout_break.first + 1, get<2>(search_result), + mems.back().range, mems.back().match_count); + mems.back().queried_count = mems[mems.size() - 2].queried_count; + + // walk forward the specified length along each of the match paths + size_t dist_to_walk = mems.back().begin - mems[mems.size() - 2].begin; +#ifdef debug_mapper + cerr << "need to walk " << dist_to_walk << " from previous match positions" << endl; +#endif + for (size_t i = 0; i < paths.size(); ++i) { + + pair& path_pos = path_positions[i]; + + size_t left_to_walk = dist_to_walk; + + pos_t path_step = paths[i][path_pos.first]; +#ifdef debug_mapper + cerr << "starting a walk " << path_pos.second << " past " << path_step << endl; +#endif + size_t node_len = xindex->get_length(xindex->get_handle(id(path_step))); + size_t remain_len = node_len - offset(path_step) - path_pos.second; + while (remain_len < left_to_walk) { + left_to_walk -= remain_len; + ++path_pos.first; + path_pos.second = 0; + remain_len = xindex->get_length(xindex->get_handle(id(paths[i][path_pos.first]))); + } + + path_pos.second += left_to_walk; + path_step = paths[i][path_pos.first]; + +#ifdef debug_mapper + cerr << "\tadvance " << paths[i].front() << " to " << pos_t(id(path_step), is_rev(path_step), offset(path_step) + path_pos.second) << endl; +#endif + + // add the result as a position for this MEM + mems.back().nodes.emplace_back(gcsa::Node::encode(id(path_step), + offset(path_step) + path_pos.second, + is_rev(path_step))); + + } + } + } + } + + if (mems.size() == 1 && mems.front().length() >= mem_reseed_length && !mems.front().nodes.empty() && + (mem_fanout_breaks ? 
mem_fanout_breaks->front().empty() : + find(mems.front().begin, mems.front().end, 'N') == mems.front().end)) { + // try looking for reseed MEMs even if base qualities were high, just in case + + int min_sub_mem_length = max(ceil(fast_reseed_length_diff * mems.front().length()), min_mem_length); + + vector>> sub_mems; + find_sub_mems_fast(mems, 0, 1, 0, mems.front().end, mems.front().begin, min_sub_mem_length, sub_mems); + + if (!sub_mems.empty()) { + // we found a reseed sub-MEM + + // consolidate the sub MEMs into the MEM vector and record the parent relationships + mems.reserve(mems.size() + sub_mems.size()); + vector>> containment_graph(1); + containment_graph.reserve(mems.size() + sub_mems.size()); + for (auto& sub_mem_and_parents : sub_mems) { + mems.emplace_back(move(sub_mem_and_parents.first)); + if (mem_fanout_breaks) { + mem_fanout_breaks->emplace_back(); + } + containment_graph.emplace_back(0, move(sub_mem_and_parents.second)); + } + + // try to remove redundant sub-MEMs + if (prefilter_redundant_hits) { + prefilter_redundant_sub_mems(mems, containment_graph); + } + } + } + + auto cmp = [](const MaximalExactMatch& m1, const MaximalExactMatch& m2) { + if (m1.begin < m2.begin) { + return true; + } + else if (m1.begin == m2.begin) { + if (m1.end < m2.end) { + return true; + } + else if (m1.end == m2.end) { + return m1.nodes < m2.nodes; + } + } + return false; + }; + + // put in lexicographic order + if (!is_sorted(mems.begin(), mems.end(), cmp)) { + vector order(mems.size(), 0); + for (size_t i = 1; i < order.size(); ++i) { + order[i] = i; + } + stable_sort(order.begin(), order.end(), [&](size_t i, size_t j) { return cmp(mems[i], mems[j]); }); + vector index(order.size()); + for (size_t i = 0; i < order.size(); ++i) { + index[order[i]] = i; + } + for (size_t i = 0; i < index.size(); ++i) { + while (index[i] != i) { + swap(mems[i], mems[index[i]]); + if (mem_fanout_breaks) { + swap((*mem_fanout_breaks)[i], (*mem_fanout_breaks)[index[i]]); + } + swap(index[i], index[index[i]]); + } + } + } + // remove null matches and duplicates + size_t num_removed_so_far = 0; + for (size_t i = 0; i < mems.size(); ++i) { + if (mems[i].begin == mems[i].end || + (i >= num_removed_so_far + 1 && mems[i] == mems[i - num_removed_so_far - 1])) { + // MEM is either empty or non-unique + ++num_removed_so_far; + } + else if (num_removed_so_far) { + // move the non-removed MEM past the removed ones + mems[i - num_removed_so_far] = move(mems[i]); + if (mem_fanout_breaks) { + (*mem_fanout_breaks)[i - num_removed_so_far] = move((*mem_fanout_breaks)[i]); + } + } + } + + if (num_removed_so_far) { + mems.resize(mems.size() - num_removed_so_far); + if (mem_fanout_breaks) { + mem_fanout_breaks->resize(mem_fanout_breaks->size() - num_removed_so_far); + } + } + + return mems; +} + +vector BaseMapper::walk_fanout_path(string::const_iterator begin, + string::const_iterator end, + const deque>& fanout_breaks, + gcsa::node_type pos) { +#ifdef debug_mapper + cerr << "beginning walk of fan-out path for sequence " << string(begin, end) << endl; + cerr << "breaks:" << endl; + for (auto b : fanout_breaks) { + cerr << (b.first - begin) << " -> " << b.second << endl; + } +#endif + vector path; + + pos_t start_pos = make_pos_t(pos); + handle_t start_handle = xindex->get_handle(id(start_pos), is_rev(start_pos)); + + // records of (read pos, break pos, (node-offset records), next record to check) + vector>::const_iterator, + vector>, + size_t>> stack; + stack.emplace_back(begin, + fanout_breaks.begin(), + vector>(1, 
make_pair(start_handle, offset(start_pos))), + 0); + + while (!stack.empty()) { + + auto& stack_record = stack.back(); + +#ifdef debug_mapper + cerr << "dequeueing stack record at relative idx " << (get<0>(stack_record) - begin) << ", option idx " << get<3>(stack_record) << " of " << get<2>(stack_record).size() << endl; +#endif + + if (get<3>(stack_record) == get<2>(stack_record).size()) { +#ifdef debug_mapper + cerr << "popping stack record" << endl; +#endif + + // we've already traversed all of this nodes edges without finding a match + stack.pop_back(); + + continue; + } + + + // get the next position we're searching from + handle_t handle; + size_t off; + tie(handle, off) = get<2>(stack_record)[get<3>(stack_record)++]; + +#ifdef debug_mapper + cerr << "offset " << off << " on node " << xindex->get_id(handle) << ": " << xindex->get_sequence(handle) << endl; +#endif + + auto read_it = get<0>(stack_record); + auto next_break = get<1>(stack_record); + + size_t node_len = xindex->get_length(handle); + + // check for a match along this node + while (off < node_len && read_it != end) { + // get either the read char or the fanout char + char rch; + if (next_break != fanout_breaks.end() && read_it == next_break->first) { + rch = next_break->second; + ++next_break; + } + else { + rch = *read_it; + } +#ifdef debug_mapper + cerr << "\tlooking for match of " << rch << " to " << xindex->get_base(handle, off) << " at offset " << off << " on node length " << node_len << endl; +#endif + + if (rch != xindex->get_base(handle, off)) { +#ifdef debug_mapper + cerr << "\tdoes not match" << endl; +#endif + // this isn't a matching path + break; + } + ++read_it; + ++off; + } + + if (read_it == end) { + // we've finished walking out the match, the stack encodes + // the path we took to get here +#ifdef debug_mapper + cerr << "\tfound full length match" << endl; +#endif + path.reserve(stack.size()); + for (const auto& search_record : stack) { + handle_t h; + size_t o; + tie(h, o) = get<2>(search_record)[get<3>(search_record) - 1]; + path.emplace_back(xindex->get_id(h), xindex->get_is_reverse(h), o); + } + break; + } + else if (off == node_len) { + // we walked to the end of the node, queue up the next nodes +#ifdef debug_mapper + cerr << "\treached end of node" << endl; +#endif + stack.emplace_back(read_it, next_break, vector>(), 0); + xindex->follow_edges(handle, false, [&](const handle_t& next) { + get<2>(stack.back()).emplace_back(next, 0); + }); + } + } + +#ifdef debug_mapper + cerr << "walked path:"; + for (auto p : path) { + cerr << " " << p; + } + cerr << endl; +#endif + + return path; +} + // Use the GCSA2 index to find super-maximal exact matches (and optionally sub-MEMs). 
vector BaseMapper::find_mems_deep(string::const_iterator seq_begin, string::const_iterator seq_end, @@ -272,15 +925,7 @@ vector BaseMapper::find_mems_deep(string::const_iterator seq_ bool record_max_lcp, int reseed_below) { #ifdef debug_mapper -#pragma omp critical - { - cerr << "find_mems: sequence "; - for (auto iter = seq_begin; iter != seq_end; iter++) { - cerr << *iter; - } - cerr << ", max mem length " << max_mem_length << ", min mem length " << - min_mem_length << ", reseed length " << reseed_length << endl; - } + cerr << "find_mems: sequence " << string(seq_begin, seq_end) << ", max mem length " << max_mem_length << ", min mem length " << min_mem_length << ", reseed length " << reseed_length << endl; #endif if (!gcsa) { @@ -289,16 +934,14 @@ vector BaseMapper::find_mems_deep(string::const_iterator seq_ } if (min_mem_length > reseed_length && reseed_length) { - cerr << "error:[vg::Mapper] minimimum reseed length for MEMs cannot be less than minimum MEM length" << endl; + cerr << "error:[vg::Mapper] minimum reseed length for MEMs cannot be less than minimum MEM length" << endl; exit(1); } vector mems; - gcsa::range_type full_range = gcsa::range_type(0, gcsa->size() - 1); - // an empty sequence matches the entire bwt if (seq_begin == seq_end) { - mems.push_back(MaximalExactMatch(seq_begin, seq_end, full_range)); + mems.push_back(MaximalExactMatch(seq_begin, seq_end, gcsa::range_type(0, gcsa->size() - 1))); } // find SMEMs using GCSA+LCP array @@ -318,12 +961,10 @@ vector BaseMapper::find_mems_deep(string::const_iterator seq_ // next position we will extend matches to string::const_iterator cursor = seq_end - 1; - - // range of the last iteration - gcsa::range_type last_range = full_range; - - // the temporary MEM we'll build up in this process - MaximalExactMatch match(cursor, seq_end, full_range); + // the end of the current MEM + string::const_iterator curr_end = seq_end; + // range of the current iteration + gcsa::range_type range = accelerate_mem_query(seq_begin, cursor); // did we move the cursor or the end of the match last iteration? 
bool prev_iter_jumped_lcp = false; @@ -331,7 +972,6 @@ vector BaseMapper::find_mems_deep(string::const_iterator seq_ int filtered_mems = 0; int total_mems = 0; int max_lcp = 0; - size_t mem_length = 0; vector lcp_maxima; @@ -341,36 +981,33 @@ vector BaseMapper::find_mems_deep(string::const_iterator seq_ // break the MEM on N; which for DNA we assume is non-informative // this *will* match many places in assemblies, but it isn't helpful if (*cursor == 'N') { - match.begin = cursor + 1; - - mem_length = match.length(); - - if (mem_length >= min_mem_length) { + auto curr_begin = cursor + 1; + if (curr_end - curr_begin >= min_mem_length) { - mems.push_back(match); + mems.emplace_back(curr_begin, curr_end, range); + mems.back().match_count = gcsa->count(range); + mems.back().primary = true; + lcp_maxima.push_back(max_lcp); #ifdef debug_mapper -#pragma omp critical - { - vector locations; - if (hit_max) { - gcsa->locate(match.range, hit_max, locations); - } else { - gcsa->locate(match.range, locations); - } - cerr << "adding MEM " << match.sequence() << " at positions "; - for (auto nt : locations) { - cerr << make_pos_t(nt) << " "; - } - cerr << endl; + vector locations; + if (hit_max) { + gcsa->locate(mems.back().range, hit_max, locations); + } else { + gcsa->locate(mems.back().range, locations); + } + cerr << "adding MEM " << mems.back().sequence() << " after hitting N at positions "; + for (auto nt : locations) { + cerr << make_pos_t(nt) << " "; } + cerr << endl; #endif } - match.end = cursor; - match.range = full_range; + curr_end = cursor; --cursor; + range = accelerate_mem_query(seq_begin, cursor); prev_iter_jumped_lcp = false; @@ -381,73 +1018,114 @@ vector BaseMapper::find_mems_deep(string::const_iterator seq_ } // hold onto our previous range - last_range = match.range; + auto last_range = range; // execute one step of LF mapping - match.range = gcsa->LF(match.range, gcsa->alpha.char2comp[*cursor]); + range = gcsa->LF(range, gcsa->alpha.char2comp[*cursor]); - if (gcsa::Range::empty(match.range) - || (max_mem_length && match.end - cursor > max_mem_length) - || match.end - cursor > gcsa->order()) { + if (gcsa::Range::empty(range) + || (max_mem_length && curr_end - cursor > max_mem_length) + || curr_end - cursor > gcsa->order()) { // we've exhausted our BWT range, so the last match range was maximal // or: we have exceeded the order of the graph (FPs if we go further) // or: we have run over our parameter-defined MEM limit - if (cursor + 1 == match.end) { + if (cursor + 1 == curr_end) { // avoid getting caught in infinite loop when a single character mismatches // entire index (b/c then advancing the LCP doesn't move the search forward // at all, need to move the cursor instead) - match.begin = cursor + 1; - match.range = last_range; + auto curr_begin = cursor + 1; - if (match.end - match.begin >= min_mem_length) { - mems.push_back(match); + if (curr_end - curr_begin >= min_mem_length) { + mems.emplace_back(curr_begin, curr_end, last_range); + mems.back().match_count = gcsa->count(mems.back().range); + mems.back().primary = true; lcp_maxima.push_back(max_lcp); } - match.end = cursor; - match.range = full_range; + curr_end = cursor; --cursor; + range = accelerate_mem_query(seq_begin, cursor); // don't reseed in empty MEMs prev_iter_jumped_lcp = false; max_lcp = 0; } else { - match.begin = cursor + 1; - match.range = last_range; - mem_length = match.end - match.begin; + auto curr_begin = cursor + 1; + // record the last MEM, but check to make sure were not actually still searching // for 
the end of the next MEM - if (mem_length >= min_mem_length && !prev_iter_jumped_lcp) { - mems.push_back(match); + bool add_mem = (curr_end - curr_begin >= min_mem_length && !prev_iter_jumped_lcp); + if (add_mem) { + mems.emplace_back(curr_begin, curr_end, last_range); + mems.back().match_count = gcsa->count(mems.back().range); + mems.back().primary = true; lcp_maxima.push_back(max_lcp); #ifdef debug_mapper -#pragma omp critical - { - vector locations; - if (hit_max) { - gcsa->locate(match.range, hit_max, locations); - } else { - gcsa->locate(match.range, locations); - } - cerr << "adding MEM " << match.sequence() << " at positions "; - for (auto nt : locations) { - cerr << make_pos_t(nt) << " "; - } - cerr << endl; + vector locations; + if (hit_max) { + gcsa->locate(mems.back().range, hit_max, locations); + } else { + gcsa->locate(mems.back().range, locations); + } + cerr << "adding MEM " << mems.back().sequence() << " after hitting an empty extension at positions "; + for (auto nt : locations) { + cerr << make_pos_t(nt) << " "; } + cerr << endl; #endif } - // get the parent suffix tree node corresponding to the parent of the last MEM's STNode - gcsa::STNode parent = lcp->parent(last_range); - // set the MEM to be the longest prefix that is shared with another MEM - match.end = match.begin + parent.lcp(); + // init this outside of the if condition so that we can avoid calling the + // expensive LCP::parent function twice in the code path that checks max LCP + gcsa::STNode parent; + + if (add_mem && use_greedy_mem_restarts + && curr_end - curr_begin >= greedy_restart_min_length + && mems.back().match_count <= greedy_restart_max_count) { + // the current match was relatively long and unique, so we think + // that we can get away with moving past it rather than looking + // for MEMs that overlap it on its left side + bool do_greedy_restart = true; + if (greedy_restart_max_lcp) { + // we also want to check whether this MEM has a long LCP (which + // would indicate that there is another match overlapping it) + parent = lcp->parent(last_range); + do_greedy_restart = (parent.lcp() <= greedy_restart_max_lcp); +#ifdef debug_mapper + cerr << "greedy restart aborted because of LCP of length " << parent.lcp() << endl; +#endif + } + if (do_greedy_restart) { +#ifdef debug_mapper + cerr << "doing greedy restart for next search iteration" << endl; +#endif + + // set up the search at the left end of the current MEM or one + // base further (which will create one fewer noise MEM if the current + // match was ended by a base substitution, but will make an artificially + // short MEM if it was ended by a deletion) + curr_end = greedy_restart_assume_substitution ? 
cursor : cursor + 1; + cursor = curr_end - 1; + range = accelerate_mem_query(seq_begin, cursor); + // we don't worry that we might still be jumping through prefixes + // of the current MEM because we've moved completely past it + prev_iter_jumped_lcp = false; + continue; + } + } + else { + // get the parent suffix tree node corresponding to the parent of the last MEM's STNode + parent = lcp->parent(last_range); + } + + // set the match to be the longest prefix that is shared with another MEM + curr_end = cursor + 1 + parent.lcp(); // and set up the next MEM using the parent node range - match.range = parent.range(); + range = parent.range(); // record our max lcp if (record_max_lcp) max_lcp = (int)parent.lcp(); prev_iter_jumped_lcp = true; @@ -455,8 +1133,7 @@ vector BaseMapper::find_mems_deep(string::const_iterator seq_ } else { prev_iter_jumped_lcp = false; - if (record_max_lcp) max_lcp = max(max_lcp, (int)lcp->parent(match.range).lcp()); - ++mem_length; + if (record_max_lcp) max_lcp = max(max_lcp, (int)lcp->parent(range).lcp()); // just step to the next position --cursor; } @@ -465,52 +1142,37 @@ vector BaseMapper::find_mems_deep(string::const_iterator seq_ // contains multiple non SMEM hits so that the iteration will loop through the LCP routine multiple // times before escaping out of the loop? - // if we have a MEM at the beginning of the read, record it - match.begin = seq_begin; - mem_length = match.end - match.begin; - if (mem_length >= min_mem_length) { - if (record_max_lcp) max_lcp = (int)lcp->parent(match.range).lcp(); - mems.push_back(match); - lcp_maxima.push_back(max_lcp); -#ifdef debug_mapper -#pragma omp critical - { - vector locations; - if (hit_max) { - gcsa->locate(match.range, hit_max, locations); - } else { - gcsa->locate(match.range, locations); - } - cerr << "adding MEM " << match.sequence() << " at positions "; - for (auto nt : locations) { - cerr << make_pos_t(nt) << " "; - } - cerr << endl; + // if we have a MEM at the beginning of the read, record it + if (curr_end - seq_begin >= min_mem_length) { + if (record_max_lcp) max_lcp = (int)lcp->parent(range).lcp(); + mems.emplace_back(seq_begin, curr_end, range); + mems.back().match_count = gcsa->count(mems.back().range); + mems.back().primary = true; + lcp_maxima.push_back(max_lcp); +#ifdef debug_mapper + vector locations; + if (hit_max) { + gcsa->locate(mems.back().range, hit_max, locations); + } else { + gcsa->locate(mems.back().range, locations); } + cerr << "adding MEM " << mems.back().sequence() << " after hitting beginning of read at positions "; + for (auto nt : locations) { + cerr << make_pos_t(nt) << " "; + } + cerr << endl; #endif } if (record_max_lcp) longest_lcp = lcp_maxima.empty() ? 0 : *max_element(lcp_maxima.begin(), lcp_maxima.end()); assert(!record_max_lcp || lcp_maxima.size() == mems.size()); - - // fill the MEMs' node lists and indicate they are primary MEMs - for (MaximalExactMatch& mem : mems) { - // invalid mem - if (mem.begin < seq_begin || mem.end > seq_end) continue; - mem.match_count = gcsa->count(mem.range); - mem.primary = true; - // keep track of the initial number of hits we query in case the nodes vector is - // modified later (e.g. 
by prefiltering) - mem.queried_count = mem.nodes.size(); - } // the graph of containment relationships between MEMs by index, also keeps track of what the sub-MEM min // length the children should be vector>> sub_mem_containment_graph(mems.size()); if (reseed_length) { - // record what the minimum length of sub-MEMs they contain should be according to the parameters for (size_t i = 0; i < mems.size(); i++) { if (use_diff_based_fast_reseed && adaptive_reseed_diff) { @@ -540,6 +1202,7 @@ vector BaseMapper::find_mems_deep(string::const_iterator seq_ int min_sub_mem_length = sub_mem_containment_graph[i].first; // should we look for sub-MEMs from this MEM? + // TODO: are the parentheses correct on the LCP heuristic if (mem.length() >= min_mem_length && mem.length() > min_sub_mem_length && (use_lcp_reseed_heuristic && record_max_lcp @@ -585,6 +1248,17 @@ vector BaseMapper::find_mems_deep(string::const_iterator seq_ } } + // determine the shortest length + size_t min_mem_query_length = 0; + if (filter_short_mems) { + // we will filter out MEMs that are short relative to the longest MEMs + size_t max_mem_length = 0; + for (const auto& mem : mems) { + max_mem_length = max(max_mem_length, mem.length()); + } + min_mem_query_length = max_mem_length * short_mem_filter_factor; + } + // query the locations of the hits // note: iterate in reverse so we remove the parent count from the children MEMs before decrementing // the parent count itself @@ -599,29 +1273,27 @@ vector BaseMapper::find_mems_deep(string::const_iterator seq_ } } - if (mem.match_count > 0) { - if (hit_max) { - gcsa->locate(mem.range, hit_max, mem.nodes); - - } else { - gcsa->locate(mem.range, mem.nodes); + if (mem.match_count > 0 && mem.length() >= min_mem_query_length) { + if (!hard_hit_max || mem.match_count < hard_hit_max) { + if (hit_max) { + gcsa->locate(mem.range, hit_max, mem.nodes); + + } else { + gcsa->locate(mem.range, mem.nodes); + } + // keep track of the initial number of hits we query in case the nodes vector is + // modified later (e.g. by prefiltering) + mem.queried_count = mem.nodes.size(); } - // keep track of the initial number of hits we query in case the nodes vector is - // modified later (e.g. by prefiltering) - mem.queried_count = mem.nodes.size(); - - filtered_mems += mem.match_count - mem.nodes.size(); - total_mems += mem.nodes.size(); } + filtered_mems += mem.match_count - mem.nodes.size(); + total_mems += mem.nodes.size(); #ifdef debug_mapper -#pragma omp critical - { - cerr << "MEM " << mem.sequence() << " has " << mem.match_count << " hits: "; - for (auto nt : mem.nodes) { - cerr << make_pos_t(nt) << ", "; - } - cerr << endl; + cerr << "MEM " << mem.sequence() << " has " << mem.match_count << " hits: "; + for (auto nt : mem.nodes) { + cerr << make_pos_t(nt) << ", "; } + cerr << endl; #endif } @@ -671,14 +1343,11 @@ void BaseMapper::find_sub_mems(const vector& mems, const MaximalExactMatch& mem = mems[mem_idx]; #ifdef debug_mapper -#pragma omp critical - { - cerr << "find_sub_mems: sequence "; - for (auto iter = mem.begin; iter != mem.end; iter++) { - cerr << *iter; - } - cerr << ", min mem length " << min_sub_mem_length << endl; + cerr << "find_sub_mems: sequence "; + for (auto iter = mem.begin; iter != mem.end; iter++) { + cerr << *iter; } + cerr << ", min mem length " << min_sub_mem_length << endl; #endif // how many times does the parent MEM occur in the index? 
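For readers following the MEM-search hunks above: the sketch below is an illustrative, self-contained brute-force analogue of the right-to-left search that `find_mems_deep` performs, with a plain substring counter standing in for the GCSA2/LCP index. Every name in it (`count_occurrences`, `find_matches_brute`, `Match`) is hypothetical and not part of this patch; the real code extends matches with `gcsa->LF()`, shortens failed matches via `lcp->parent()` rather than restarting from scratch, and applies the reseeding and filtering logic shown above.

```
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

struct Match { size_t begin, end, count; };   // half-open [begin, end) in the query

// Brute-force stand-in for gcsa->count(range).
static size_t count_occurrences(const std::string& ref, const std::string& s) {
    size_t n = 0;
    for (size_t pos = ref.find(s); pos != std::string::npos; pos = ref.find(s, pos + 1)) ++n;
    return n;
}

// Scan the query right to left; grow each match while it still occurs in the
// reference, record it once it can no longer grow, then restart the next
// search at the left end of the match (roughly the greedy-restart behaviour
// added above, with greedy_restart_assume_substitution off).
static std::vector<Match> find_matches_brute(const std::string& ref,
                                             const std::string& query,
                                             size_t min_length) {
    std::vector<Match> matches;
    size_t end = query.size();
    while (end > 0) {
        size_t begin = end;
        while (begin > 0 &&
               count_occurrences(ref, query.substr(begin - 1, end - begin + 1)) > 0) {
            --begin;                            // extend one base to the left
        }
        if (end - begin >= min_length) {
            matches.push_back({begin, end,
                               count_occurrences(ref, query.substr(begin, end - begin))});
        }
        // if a single base matched nowhere, step past it; otherwise restart
        // to the left of the match we just closed out
        end = (begin == end) ? end - 1 : begin;
    }
    std::reverse(matches.begin(), matches.end());  // report left to right
    return matches;
}

int main() {
    for (const Match& m : find_matches_brute("GATTACAGATTACA", "TTACAC", 3)) {
        std::cout << m.begin << ".." << m.end << " x" << m.count << "\n";
    }
}
```

The `end = begin` restart in this simplification discards matches that overlap the one just recorded; the LCP-parent jump in the real code keeps them, which is why the patch only takes the greedy shortcut when the current match is long and low-count.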
@@ -716,25 +1385,21 @@ void BaseMapper::find_sub_mems(const vector& mems, sub_mems_out.emplace_back(MaximalExactMatch(sub_mem_begin, sub_mem_end, last_range), vector(1, mem_idx)); #ifdef debug_mapper -#pragma omp critical - { - vector locations; - if (hit_max) { - gcsa->locate(last_range, hit_max, locations); - } else { - gcsa->locate(last_range, locations); - } - cerr << "adding sub-MEM "; - for (auto iter = sub_mem_begin; iter != sub_mem_end; iter++) { - cerr << *iter; - } - cerr << " at positions "; - for (auto nt : locations) { - cerr << make_pos_t(nt) << " "; - } - cerr << endl; - + vector locations; + if (hit_max) { + gcsa->locate(last_range, hit_max, locations); + } else { + gcsa->locate(last_range, locations); + } + cerr << "adding sub-MEM "; + for (auto iter = sub_mem_begin; iter != sub_mem_end; iter++) { + cerr << *iter; + } + cerr << " at positions "; + for (auto nt : locations) { + cerr << make_pos_t(nt) << " "; } + cerr << endl; #endif // identify all previous MEMs that also contain this sub-MEM for (int64_t i = mem_idx - 1; i >= parent_layer_begin; --i) { @@ -750,14 +1415,11 @@ void BaseMapper::find_sub_mems(const vector& mems, } #ifdef debug_mapper else { -#pragma omp critical - { - cerr << "minimally more frequent MEM is too short "; - for (auto iter = sub_mem_begin; iter != sub_mem_end; iter++) { - cerr << *iter; - } - cerr << endl; + cerr << "minimally more frequent MEM is too short "; + for (auto iter = sub_mem_begin; iter != sub_mem_end; iter++) { + cerr << *iter; } + cerr << endl; } #endif @@ -781,28 +1443,22 @@ void BaseMapper::find_sub_mems(const vector& mems, sub_mems_out.emplace_back(MaximalExactMatch(mem.begin, sub_mem_end, range), vector(1, mem_idx)); #ifdef debug_mapper -#pragma omp critical - { - cerr << "adding sub-MEM "; - for (auto iter = mem.begin; iter != sub_mem_end; iter++) { - cerr << *iter; - } - cerr << endl; + cerr << "adding sub-MEM "; + for (auto iter = mem.begin; iter != sub_mem_end; iter++) { + cerr << *iter; } + cerr << endl; #endif // note: this sub MEM is at the far left side of the parent MEM, so we don't need to // check whether earlier MEMs contain it as well } #ifdef debug_mapper else { -#pragma omp critical - { - cerr << "minimally more frequent MEM is too short "; - for (auto iter = mem.begin; iter != sub_mem_end; iter++) { - cerr << *iter; - } - cerr << endl; + cerr << "minimally more frequent MEM is too short "; + for (auto iter = mem.begin; iter != sub_mem_end; iter++) { + cerr << *iter; } + cerr << endl; } #endif @@ -834,7 +1490,6 @@ void BaseMapper::find_sub_mems_fast(const vector& mems, vector>>& sub_mems_out) { #ifdef debug_mapper -#pragma omp critical cerr << "find_sub_mems_fast: mem "; for (auto iter = mems[mem_idx].begin; iter != mems[mem_idx].end; iter++) { cerr << *iter; @@ -861,17 +1516,12 @@ void BaseMapper::find_sub_mems_fast(const vector& mems, string::const_iterator probe_string_begin = probe_string_end - min_sub_mem_length; #ifdef debug_mapper -#pragma omp critical - cerr << "probe string is mem[" << probe_string_begin - mems[mem_idx].begin << ":" << probe_string_end - mems[mem_idx].begin << "] "; - for (auto iter = probe_string_begin; iter != probe_string_end; iter++) { - cerr << *iter; - } - cerr << endl; + cerr << "probe string is mem[" << probe_string_begin - mems[mem_idx].begin << ":" << probe_string_end - mems[mem_idx].begin << "] " << string(probe_string_begin, probe_string_end) << endl; #endif // set up LF searching string::const_iterator cursor = probe_string_end - 1; - gcsa::range_type range = 
gcsa::range_type(0, gcsa->size() - 1); + gcsa::range_type range = accelerate_mem_query(probe_string_begin, cursor); // check if the probe substring is more frequent than the SMEM its contained in bool probe_string_more_frequent = true; @@ -886,6 +1536,9 @@ void BaseMapper::find_sub_mems_fast(const vector& mems, (relative_idx >= sub_mem_thinning_burn_in && (relative_idx - sub_mem_thinning_burn_in) % sub_mem_count_thinning == 0)) { if ((use_approx_sub_mem_count ? gcsa::Range::length(range) : gcsa->count(range)) <= parent_range_count) { +#ifdef debug_mapper + cerr << "partial probe only occurs in parent mem[" << cursor - mems[mem_idx].begin << ":" << probe_string_end - mems[mem_idx].begin << "] " << string(cursor, probe_string_end) << endl; +#endif probe_string_more_frequent = false; break; } @@ -929,25 +1582,28 @@ void BaseMapper::find_sub_mems_fast(const vector& mems, // it here, the binary search is guaranteed to LF along the full sub-MEM in some iteration) gcsa::range_type sub_mem_range = range; + // we'll keep track of the count of this sub-MEM during this loop and also remember it + // for later + size_t sub_mem_count = 0; + // iterate until inteveral contains only one index while (right_search_bound > left_search_bound) { string::const_iterator middle = left_search_bound + (right_search_bound - left_search_bound + 1) / 2; #ifdef debug_mapper -#pragma omp critical - { - cerr << "checking extension mem[" << probe_string_begin - mems[mem_idx].begin << ":" << middle - mems[mem_idx].begin << "] "; - for (auto iter = probe_string_begin; iter != middle; iter++) { - cerr << *iter; - } - cerr << endl; + cerr << "checking extension mem[" << probe_string_begin - mems[mem_idx].begin << ":" << middle - mems[mem_idx].begin << "] "; + for (auto iter = probe_string_begin; iter != middle; iter++) { + cerr << *iter; } + cerr << endl; #endif // set up LF searching cursor = middle - 1; - range = gcsa::range_type(0, gcsa->size() - 1); + range = accelerate_mem_query(probe_string_begin, cursor); + + size_t extension_mem_count = 0; // check if there is an independent occurrence of this substring outside of the SMEM bool contained_in_independent_match = true; @@ -959,8 +1615,11 @@ void BaseMapper::find_sub_mems_fast(const vector& mems, // burn in parameter int64_t relative_idx = middle - cursor - 1; if (cursor == probe_string_begin || - (relative_idx >= sub_mem_thinning_burn_in && (relative_idx - sub_mem_thinning_burn_in) % sub_mem_count_thinning == 0)) { - if ((use_approx_sub_mem_count ? gcsa::Range::length(range) : gcsa->count(range)) <= parent_range_count) { + (relative_idx >= sub_mem_thinning_burn_in + && (relative_idx - sub_mem_thinning_burn_in) % sub_mem_count_thinning == 0)) { + + extension_mem_count = use_approx_sub_mem_count ? 
gcsa::Range::length(range) : gcsa->count(range); + if (extension_mem_count <= parent_range_count) { // this probe is too long and it no longer is contained in the indendent hit // that we detected contained_in_independent_match = false; @@ -978,6 +1637,9 @@ void BaseMapper::find_sub_mems_fast(const vector& mems, // update the range of matches (this is the longest match we've verified so far) sub_mem_range = range; + + // this current count is the count of the longest match we've verified so far + sub_mem_count = extension_mem_count; } else { // the end of the sub-MEM must be to the left @@ -997,12 +1659,9 @@ void BaseMapper::find_sub_mems_fast(const vector& mems, cerr << "current: " << MaximalExactMatch(probe_string_begin, right_search_bound, sub_mem_range).sequence() << endl; #endif - // the count of the current sub-MEM - size_t current_count = use_approx_sub_mem_count ? gcsa::Range::length(sub_mem_range) : gcsa->count(sub_mem_range); - // get the GCSA range of the current sub-MEM extended one base past the end of the current parent MEM cursor = right_search_bound; - range = gcsa::range_type(0, gcsa->size() - 1); + range = accelerate_mem_query(probe_string_begin, cursor); size_t extended_count = numeric_limits::max(); bool contained_in_independent_match = true; while (cursor >= probe_string_begin) { @@ -1011,7 +1670,7 @@ void BaseMapper::find_sub_mems_fast(const vector& mems, if (cursor == probe_string_begin || (relative_idx >= sub_mem_thinning_burn_in && (relative_idx - sub_mem_thinning_burn_in) % sub_mem_count_thinning == 0)) { extended_count = use_approx_sub_mem_count ? gcsa::Range::length(range) : gcsa->count(range); - if (extended_count < current_count) { + if (extended_count < sub_mem_count) { // the count of the full lengthened sub-MEM will be fewer than the one we just found break; } @@ -1021,18 +1680,17 @@ void BaseMapper::find_sub_mems_fast(const vector& mems, } #ifdef debug_mapper - cerr << "lengthened sub-MEM has count " << extended_count << ", compared to current count " << current_count << endl; + cerr << "lengthened sub-MEM has count " << extended_count << ", compared to current count " << sub_mem_count << endl; #endif // since the current sub-MEM is contained in the lengthened one, the lengthened sub-MEM can only have the same number // of hits or fewer. 
if it has the same number then the current sub-MEM is artificially truncated - not_redundant = (extended_count < current_count); + not_redundant = (extended_count < sub_mem_count); } if (not_redundant) { #ifdef debug_mapper -#pragma omp critical cerr << "final sub-MEM is mem[" << probe_string_begin - mems[mem_idx].begin << ":" << right_search_bound - mems[mem_idx].begin << "] "; for (auto iter = probe_string_begin; iter != right_search_bound; iter++) { cerr << *iter; @@ -1043,6 +1701,11 @@ void BaseMapper::find_sub_mems_fast(const vector& mems, // record the sub-MEM sub_mems_out.emplace_back(MaximalExactMatch(probe_string_begin, right_search_bound, sub_mem_range), vector(1, mem_idx)); + // annotate the sub-MEM + // note: the count will not be accurate if we were using the approximate algorithm, + // but we will handle that at the end of the function + sub_mems_out.back().first.match_count = sub_mem_count; + sub_mems_out.back().first.primary = false; // identify all previous MEMs that also contain this sub-MEM for (int64_t i = mem_idx - 1; i >= parent_layer_begin; --i) { @@ -1085,19 +1748,178 @@ void BaseMapper::find_sub_mems_fast(const vector& mems, } } - // annotate the MEMs - for (pair>& sub_mem_and_parents : sub_mems_out) { - // count in entire range, including parents - sub_mem_and_parents.first.match_count = gcsa->count(sub_mem_and_parents.first.range); - // mark this is a sub-MEM - sub_mem_and_parents.first.primary = false; + if (use_approx_sub_mem_count) { + // we didn't compute the exact count when looking for the sub-MEMs so now + // we have to + for (pair>& sub_mem_and_parents : sub_mems_out) { + // count in entire range, including parents + sub_mem_and_parents.first.match_count = gcsa->count(sub_mem_and_parents.first.range); + } } + // fast algorithm produces sub-MEMs left-to-right, switch the order so that they remain right-to-left // within the layer (as this algorithm expects in recursive calls) reverse(sub_mems_out.begin(), sub_mems_out.end()); } +vector BaseMapper::find_stripped_matches(string::const_iterator seq_begin, + string::const_iterator seq_end, + size_t strip_length, size_t max_match_length, + size_t target_count) { + if (!gcsa) { + throw runtime_error("error:[vg::Mapper] a GCSA2 index is required to query matches"); + } + if (strip_length <= 0) { + throw runtime_error("error:[vg::Mapper] strip match length must be positive, set to " + to_string(strip_length)); + } + if (target_count <= 0) { + throw runtime_error("error:[vg::Mapper] target match count must be positive, set to " + to_string(target_count)); + } + +#ifdef debug_strip_match + cerr << "starting stripped match algorithm" << endl; + cerr << "\tstrip length:" << strip_length << endl; + cerr << "\tmax match length:" << max_match_length << endl; + cerr << "\ttarget count:" << target_count << endl; + cerr << "\tsequence:" << string(seq_begin, seq_end) << endl; + +#endif + + // init the return value + vector matches; + + if (seq_end != seq_begin) { + // we are not in the empty string + int64_t seq_len = seq_end - seq_begin; + int64_t num_strips = (seq_len - 1) / strip_length + 1; + for (int64_t strip_num = 0; strip_num < num_strips; ++strip_num) { + +#ifdef debug_strip_match + cerr << "strip number " << strip_num << " of " << num_strips << endl; +#endif + + // the end of other strip match we will find + auto strip_end = seq_end - strip_num * strip_length; + // empty string starts matching entire index + // note: because the stopping conditions are different here we can't + // accelerate this MEM init query 
without changing results + auto range = gcsa::range_type(0, gcsa->size() - 1); + // a pointer to the next char we will match to + auto cursor = strip_end - 1; + + while (cursor >= seq_begin && + (!max_match_length || strip_end - cursor <= max_match_length)) { + + if (*cursor == 'N') { + // N matches are uninformative, so we don't want to match them + break; + } + + // match one more char + auto next_range = gcsa->LF(range, gcsa->alpha.char2comp[*cursor]); + +#ifdef debug_strip_match + cerr << "\tgot next range which is length " << gcsa::Range::length(next_range) << " and " << (gcsa::Range::empty(next_range) ? "" : "not ") << "empty" << endl; +#endif + + if (gcsa::Range::empty(next_range)) { + // we've gone too far, there are no more hits + break; + } + + // the match was successful, advance to the range and move the cursor + range = next_range; + --cursor; + + if (target_count && gcsa::Range::length(next_range) <= target_count) { + // the (approximate) count is below the specified limit, so + // we've found reasonably unique sequence, stop looking for more + break; + } + } + + if (cursor + 1 == strip_end) { + // edge case where one char mismatches the entire index, don't bother + // with this + continue; + } + + if (!matches.empty()) { + if (matches.back().begin <= cursor + 1 && + matches.back().end >= strip_end) { + // this match is entirely contained within the larger match + // of the previous strip, so it's not likely to give us any + // new information + // also, filtering this helps us maintain reverse lexicographic + // order + continue; + } + } + +#ifdef debug_strip_match + cerr << "adding match of sequence " << string(cursor + 1, strip_end) << " and " << gcsa->count(range) << " hits" << endl; +#endif + + matches.emplace_back(cursor + 1, strip_end, range, gcsa->count(range)); + matches.back().primary = true; + + if (cursor < seq_begin) { + // all further hits will be contained in ones we've already seen + break; + } + } + } + + // matches are queried in reverse lexicographic order, flip them around + reverse(matches.begin(), matches.end()); + + for (MaximalExactMatch& match : matches) { + // figure out how many occurrences there are + match.match_count = gcsa->count(match.range); + if (!hard_hit_max || match.match_count < hard_hit_max) { + // the total number of hits is low enough that we think it's at least + // potentially worth querying hits + if (hit_max) { + // we may want to subsample + gcsa->locate(match.range, hit_max, match.nodes); + + } else { + // we won't subsample down to a prespecified maximum + gcsa->locate(match.range, match.nodes); + } + } + match.queried_count = match.nodes.size(); + } + + + return matches; +} + +gcsa::range_type BaseMapper::accelerate_mem_query(string::const_iterator begin, + string::const_iterator& cursor) const { + + if (!accelerator + || cursor - begin < accelerator->length() - 1 + || find(cursor - accelerator->length() + 1, cursor + 1, 'N') <= cursor) { + // we don't have an accelerator, the string we're querying is too short, + // or there's an N (which we can't handle). 
decline to accelerate + return gcsa::range_type(0, gcsa->size() - 1); + } + + auto range = accelerator->memoized_LF(cursor); + if (gcsa::Range::empty(range)) { + // the acceleration lead to an empty range, so we might have been able to + // get a non-empty one by actually doing each LF + return gcsa::range_type(0, gcsa->size() - 1); + } + else { + // success, update the cursor and return + cursor -= accelerator->length(); + return range; + } +} + void BaseMapper::prefilter_redundant_sub_mems(vector& mems, vector>>& sub_mem_containment_graph) { @@ -1305,7 +2127,7 @@ void BaseMapper::rescue_high_count_order_length_mems(vector& for (size_t i = 0; i < mems.size(); i++) { if (mems[i].nodes.empty()) { unfilled_mem_ranges.emplace_back(i, 0); - while (i < mems.size() ? mems[i].nodes.empty() : false) { + while (i < mems.size() && mems[i].nodes.empty()) { i++; } unfilled_mem_ranges.back().second = i; @@ -1355,289 +2177,14 @@ size_t BaseMapper::get_adaptive_min_reseed_length(size_t parent_mem_length) { return adaptive_reseed_length_memo[parent_mem_length]; } -void BaseMapper::first_hit_positions_by_index(MaximalExactMatch& mem, - vector>& positions_by_index_out) { - // find the hit to the first index in the parent MEM's range - vector all_first_hits; - gcsa->locate(mem.range.first, all_first_hits, true, false); - - // find where in the graph the first hit of the parent MEM is at each index - mem_positions_by_index(mem, make_pos_t(all_first_hits[0]), positions_by_index_out); - - // in case the first hit occurs in more than one place, accumulate all the hits - if (all_first_hits.size() > 1) { - for (size_t i = 1; i < all_first_hits.size(); i++) { - vector> temp_positions_by_index; - mem_positions_by_index(mem, make_pos_t(all_first_hits[i]), - temp_positions_by_index); - - for (size_t i = 0; i < positions_by_index_out.size(); i++) { - for (const pos_t& pos : temp_positions_by_index[i]) { - positions_by_index_out[i].insert(pos); - } - } - } - } -} - -void BaseMapper::fill_nonredundant_sub_mem_nodes(vector& parent_mems, - vector > >::iterator sub_mem_records_begin, - vector > >::iterator sub_mem_records_end) { - - - // for each MEM, a vector of the positions that it touches at each index along the MEM - vector>> positions_by_index(parent_mems.size()); - - for (auto iter = sub_mem_records_begin; iter != sub_mem_records_end; iter++) { - - pair >& sub_mem_and_parents = *iter; - - MaximalExactMatch& sub_mem = sub_mem_and_parents.first; - vector& parent_idxs = sub_mem_and_parents.second; - - // how many total hits does each parent MEM have? - vector num_parent_hits; - // positions their first hits of the parent MEM takes at the start position of the sub-MEM - vector*> first_parent_mem_hit_positions; - for (size_t parent_idx : parent_idxs) { - // get the parent MEM - MaximalExactMatch& parent_mem = parent_mems[parent_idx]; - num_parent_hits.push_back(gcsa->count(parent_mem.range)); - - if (positions_by_index[parent_idx].empty()) { - // the parent MEM's positions by index haven't been calculated yet, so do it - - first_hit_positions_by_index(parent_mem, positions_by_index[parent_idx]); - - } - - // the index along the parent MEM that sub MEM starts - size_t offset = sub_mem.begin - parent_mem.begin; - first_parent_mem_hit_positions.push_back(&(positions_by_index[parent_idx][offset])); - } - - for (gcsa::size_type i = sub_mem.range.first; i <= sub_mem.range.second; i++) { - // TODO: what if this range is too big? 
- - - // add the locations of the hits, but do not remove duplicates yet - vector hits; - gcsa->locate(i, hits, true, false); - - // the number of subsequent hits (including these) that are inside a parent MEM - size_t parent_hit_jump = 0; - for (gcsa::node_type node : hits) { - // look for the hit in each parent MEM - for (size_t j = 0; j < first_parent_mem_hit_positions.size(); j++) { - if (first_parent_mem_hit_positions[j]->count(make_pos_t(node))) { - // this hit is also a node on a path of the first occurrence of the parent MEM - // that means that this is the first index of the sub-range that corresponds - // to the parent MEM's hits - - // calculate how many more positions to jump - parent_hit_jump = num_parent_hits[j]; - break; - } - } - } - - if (parent_hit_jump > 0) { - // we're at the start of an interval of parent hits, skip the rest of it - i += (parent_hit_jump - 1); - } - else { - // these are nonredundant sub MEM hits, add them - for (gcsa::node_type node : hits) { - sub_mem.nodes.push_back(node); - } - } - } - - // remove duplicates (copied this functionality from the gcsa locate function, but - // I don't actually know what it's purpose is) - gcsa::removeDuplicates(sub_mem.nodes, false); - } -} - -void BaseMapper::mem_positions_by_index(MaximalExactMatch& mem, pos_t hit_pos, - vector>& positions_by_index_out) { - - // this is a specialized DFS that keeps track of both the distance along the MEM - // and the position(s) in the graph in the stack by adding all of the next reachable - // positions in a layer (i.e. vector) in the stack at the end of each iteration. - // it also keeps track of whether a position in the graph matched to a position along - // the MEM can potentially be extended to the full MEM to avoid combinatorially checking - // all paths through bubbles - - size_t mem_length = std::distance(mem.begin, mem.end); - - // indicates a pairing of this graph position and this MEM index could be extended to a full match - positions_by_index_out.clear(); - positions_by_index_out.resize(mem_length); - - // indicates a pairing of this graph position and this MEM index could not be extended to a full match - vector> false_pos_by_mem_index(mem_length); - - // each record indicates the next edge index to traverse, the number of edges that - // cannot reach a MEM end, and the positions along each edge out - vector, vector > > pos_stack; - pos_stack.push_back(make_pair(make_pair((size_t) 0 , (size_t) 0), vector{hit_pos})); - - while (!pos_stack.empty()) { - size_t mem_idx = pos_stack.size() - 1; - - // which edge are we going to search out of this node next? 
- size_t next_idx = pos_stack.back().first.first; - - if (next_idx >= pos_stack.back().second.size()) { - // we have traversed all of the edges out of this position - - size_t num_misses = pos_stack.back().first.second; - bool no_full_matches_possible = (num_misses == pos_stack.back().second.size()); - - // backtrack to previous node - pos_stack.pop_back(); - - // if necessary, mark the edge into this node as a miss - if (no_full_matches_possible && !pos_stack.empty()) { - // all of the edges out failed to reach the end of a MEM, this position is a dead end - - // get the position that traversed into the layer we just popped off - pos_t prev_graph_pos = pos_stack.back().second[pos_stack.back().first.first - 1]; - - // unlabel this node as a potential hit and instead mark it as a miss - positions_by_index_out[mem_idx].erase(prev_graph_pos); - false_pos_by_mem_index[mem_idx].insert(prev_graph_pos); - - // increase the count of misses in this layer - pos_stack.back().first.second++; - } - - // skip the forward search on this iteration - continue; - } - - // increment to the next edge - pos_stack.back().first.first++; - - pos_t graph_pos = pos_stack.back().second[next_idx]; - - - // did we already find a MEM through this position? - if (positions_by_index_out[mem_idx].count(graph_pos)) { - // we don't need to check the same MEM suffix again - continue; - } - - // did we already determine that you can't reach a MEM through this position? - if (false_pos_by_mem_index[mem_idx].count(graph_pos)) { - // increase the count of misses in this layer - pos_stack.back().first.second++; - - // we don't need to check the same MEM suffix again - continue; - } - - // does this graph position match the MEM? - if (*(mem.begin + mem_idx) != xg_pos_char(graph_pos, xindex)) { - // mark this node as a miss - false_pos_by_mem_index[mem_idx].insert(graph_pos); - - // increase the count of misses in this layer - pos_stack.back().first.second++; - } - else { - // mark this node as a potential hit - positions_by_index_out[mem_idx].insert(graph_pos); - - // are we finished with the MEM? - if (mem_idx < mem_length - 1) { - - // add a layer onto the stack for all of the edges out - pos_stack.push_back(make_pair(make_pair((size_t) 0 , (size_t) 0), - vector())); - - // fill the layer with the next positions - vector& nexts = pos_stack.back().second; - for (const pos_t& next_graph_pos : positions_bp_from(graph_pos, 1, false)) { - nexts.push_back(next_graph_pos); - } - } - } - } -} - - -set BaseMapper::positions_bp_from(pos_t pos, int distance, bool rev) { - return xg_positions_bp_from(pos, distance, rev, xindex); -} - -char BaseMapper::pos_char(pos_t pos) { - return xg_pos_char(pos, xindex); -} - -map BaseMapper::next_pos_chars(pos_t pos) { - return xg_next_pos_chars(pos, xindex); -} - -void BaseMapper::set_alignment_threads(int new_thread_count) { - alignment_threads = new_thread_count; -} - -bool BaseMapper::has_fixed_fragment_length_distr() { +bool PairedEndMapper::has_fixed_fragment_length_distr() { return fragment_length_distr.is_finalized(); } -void BaseMapper::force_fragment_length_distr(double mean, double stddev) { +void PairedEndMapper::force_fragment_length_distr(double mean, double stddev) { fragment_length_distr.force_parameters(mean, stddev); } - -BaseAligner* BaseMapper::get_aligner(bool have_qualities) const { - return (have_qualities && adjust_alignments_for_base_quality) ? 
- (BaseAligner*) qual_adj_aligner : - (BaseAligner*) regular_aligner; -} - -QualAdjAligner* BaseMapper::get_qual_adj_aligner() const { - assert(qual_adj_aligner != nullptr); - return qual_adj_aligner; -} - -Aligner* BaseMapper::get_regular_aligner() const { - assert(regular_aligner != nullptr); - return regular_aligner; -} - -void BaseMapper::clear_aligners(void) { - delete qual_adj_aligner; - delete regular_aligner; - qual_adj_aligner = nullptr; - regular_aligner = nullptr; -} - -void BaseMapper::init_aligner(int8_t match, int8_t mismatch, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus, uint32_t max_gap_length) { - // hacky, find max score so that scaling doesn't change score - int8_t max_score = match; - if (mismatch > max_score) max_score = mismatch; - if (gap_open > max_score) max_score = gap_open; - if (gap_extend > max_score) max_score = gap_extend; - - double gc_content = estimate_gc_content(); - - qual_adj_aligner = new QualAdjAligner(match, mismatch, gap_open, gap_extend, full_length_bonus, - max_score, 255, gc_content); - regular_aligner = new Aligner(match, mismatch, gap_open, gap_extend, full_length_bonus, gc_content, max_gap_length); -} - -void BaseMapper::load_scoring_matrix(std::ifstream& matrix_stream){ - matrix_stream.clear(); - matrix_stream.seekg(0); - if(regular_aligner) get_regular_aligner()->load_scoring_matrix(matrix_stream); - matrix_stream.clear(); - matrix_stream.seekg(0); - if(qual_adj_aligner) get_qual_adj_aligner()->load_scoring_matrix(matrix_stream); -} - + void BaseMapper::apply_haplotype_consistency_scores(const vector& alns) { if (haplo_score_provider == nullptr) { // There's no haplotype data available, so we can't add consistency scores. @@ -1654,19 +2201,24 @@ void BaseMapper::apply_haplotype_consistency_scores(const vector& al // It won't matter either way return; } - - size_t haplotype_count = xindex->get_haplotype_count(); - - if (haplotype_count == 0) { - // The XG apparently has no path database information. Maybe it wasn't built with the GBWT? - throw runtime_error("Cannot score any haplotypes with a 0 haplotype count; does the XG contain the path database?"); + + // Work out the population size. Try the score provider and then fall back to the xg. + auto haplotype_count = haplo_score_provider->get_haplotype_count(); + if (haplotype_count == -1) { + // The score provider doesn't have a haplotype count. + haplotype_count = 0; + } + + if (haplotype_count == 0 || haplotype_count == -1) { + // We really should have a haplotype count + throw runtime_error("Cannot score any haplotypes with a 0 or -1 haplotype count; are haplotypes available?"); } // We don't look at strip_bonuses here, because we need these bonuses added // always in order to choose between alignments. // Build Yohei's recombination probability calculator. Feed it the haplotype - // count from the XG index that was generated alongside the GBWT. + // count from the PathPositionHandleGraph index that was generated alongside the GBWT. haplo::haploMath::RRMemo haplo_memo(recombination_penalty, haplotype_count); // This holds all the computed haplotype logprobs @@ -1695,6 +2247,12 @@ void BaseMapper::apply_haplotype_consistency_scores(const vector& al bool path_valid; std::tie(haplotype_logprob, path_valid) = haplo_score_provider->score(aln->path(), haplo_memo); + if (std::isnan(haplotype_logprob) && path_valid) { + // This shouldn't happen. Bail out on haplotype adjustment for this read and warn. 
+ cerr << "warning:[vg::Mapper]: NAN population score obtained for read with ostensibly successful query. Changing to failure." << endl; + path_valid = false; + } + if (!path_valid) { // Our path does something the scorer doesn't like. // Bail out of applying haplotype scores. @@ -1722,10 +2280,12 @@ void BaseMapper::apply_haplotype_consistency_scores(const vector& al // We actually did rescore this one // This is a score "penalty" because it is usually negative. But positive = more score. + // Convert to points, raise to haplotype consistency exponent power. double score_penalty = haplotype_consistency_exponent * (haplotype_logprobs[i] / aligner->log_base); - - // Convert to points, raise to haplotype consistency exponent power, and apply - alns[i]->set_score(max((int64_t) 0, alns[i]->score() + (int64_t) round(score_penalty))); + + // Apply "penalty" + int64_t old_score = alns[i]->score(); + alns[i]->set_score(max((int64_t) 0, old_score + (int64_t) round(score_penalty))); // Note that we successfully corrected the score set_annotation(alns[i], "haplotype_score_used", true); // And save the score penalty/bonus @@ -1733,14 +2293,15 @@ void BaseMapper::apply_haplotype_consistency_scores(const vector& al if (debug) { cerr << "Alignment statring at " << alns[i]->path().mapping(0).position().node_id() - << " got logprob " << haplotype_logprobs[i] << " moving score " << score_penalty - << " from " << alns[i]->score() - score_penalty << " to " << alns[i]->score() << endl; + << " got logprob " << haplotype_logprobs[i] << " vs " << haplotype_count + << " haplotypes, moving score by " << score_penalty + << " from " << old_score << " to " << alns[i]->score() << endl; } } } } -double BaseMapper::estimate_gc_content(void) { +double BaseMapper::estimate_gc_content(const gcsa::GCSA* gcsa) { uint64_t at = 0, gc = 0; @@ -1758,7 +2319,11 @@ double BaseMapper::estimate_gc_content(void) { int BaseMapper::random_match_length(double chance_random) { if (xindex) { - size_t length = xindex->seq_length; + // sum up the sequence length + size_t length = 0; + xindex->for_each_handle([&](const handle_t& handle) { + length += xindex->get_length(handle); + }); return ceil(- (log(1.0 - pow(pow(1.0-chance_random, -1), (-1.0/length))) / log(4.0))); } else { return 0; @@ -1766,30 +2331,35 @@ int BaseMapper::random_match_length(double chance_random) { } void BaseMapper::set_alignment_scores(int8_t match, int8_t mismatch, int8_t gap_open, int8_t gap_extend, - int8_t full_length_bonus, double haplotype_consistency_exponent, uint32_t max_gap_length) { + int8_t full_length_bonus, double haplotype_consistency_exponent) { - // clear the existing aligners and recreate them - if (regular_aligner || qual_adj_aligner) { - clear_aligners(); - } - init_aligner(match, mismatch, gap_open, gap_extend, full_length_bonus, max_gap_length); + AlignerClient::set_alignment_scores(match, mismatch, gap_open, gap_extend, full_length_bonus); + + // Save the consistency exponent + this->haplotype_consistency_exponent = haplotype_consistency_exponent; +} + +void BaseMapper::set_alignment_scores(istream& matrix_stream, int8_t gap_open, int8_t gap_extend, + int8_t full_length_bonus, double haplotype_consistency_exponent) { + + AlignerClient::set_alignment_scores(matrix_stream, gap_open, gap_extend, full_length_bonus); // Save the consistency exponent this->haplotype_consistency_exponent = haplotype_consistency_exponent; } -void BaseMapper::set_fragment_length_distr_params(size_t maximum_sample_size, size_t reestimation_frequency, +void 
PairedEndMapper::set_fragment_length_distr_params(size_t maximum_sample_size, size_t reestimation_frequency, double robust_estimation_fraction) { if (fragment_length_distr.is_finalized()) { - cerr << "warning:[vg::Mapper] overwriting a fragment length distribution that has already been estimated" << endl; + cerr << "warning:[vg::PairedEndMapper] overwriting a fragment length distribution that has already been estimated" << endl; } fragment_length_distr = FragmentLengthDistribution(maximum_sample_size, reestimation_frequency, robust_estimation_fraction); } -Mapper::Mapper(xg::XG* xidex, +Mapper::Mapper(PathPositionHandleGraph* xidex, gcsa::GCSA* g, gcsa::LCPArray* a, haplo::ScoreProvider* haplo_score_provider) : @@ -1803,7 +2373,6 @@ Mapper::Mapper(xg::XG* xidex, , max_softclip_iterations(10) , min_identity(0) , max_target_factor(128) - , max_query_graph_ratio(128) , extra_multimaps(512) , band_multimaps(4) , always_rescue(false) @@ -1842,94 +2411,146 @@ Mapper::~Mapper(void) { */ } -double Mapper::graph_entropy(void) { - const size_t seq_bytes = xindex->sequence_bit_size() / 8; - char* seq = (char*) xindex->sequence_data(); - return entropy(seq, seq_bytes); -} // todo add options for aligned global and pinned Alignment Mapper::align_to_graph(const Alignment& aln, - Graph& graph, - size_t max_query_graph_ratio, + HandleGraph& graph, + bool do_flip, bool traceback, - bool acyclic_and_sorted, bool pinned_alignment, bool pin_left, bool banded_global, bool keep_bonuses) { // do not use X-drop alignment when MEMs is not available vector mems; - return align_to_graph(aln, graph, mems, max_query_graph_ratio, traceback, acyclic_and_sorted, pinned_alignment, pin_left, banded_global, false, keep_bonuses); + return align_to_graph(aln, graph, mems, do_flip, traceback, pinned_alignment, pin_left, banded_global, false, keep_bonuses); } + Alignment Mapper::align_to_graph(const Alignment& aln, - Graph& graph, + HandleGraph& graph, const vector& mems, - size_t max_query_graph_ratio, + bool do_flip, bool traceback, - bool acyclic_and_sorted, bool pinned_alignment, bool pin_left, bool banded_global, int xdrop_alignment, bool keep_bonuses) { - // check if we need to make a vg graph to handle this graph - Alignment aligned; - if (!acyclic_and_sorted) { //!is_id_sortable(graph) || has_inversion(graph)) { - VG vg; - vg.extend(graph); - if (aln.quality().empty() || !adjust_alignments_for_base_quality) { - aligned = vg.align(aln, - get_regular_aligner(), - mems, - traceback, - acyclic_and_sorted, - max_query_graph_ratio, - pinned_alignment, - pin_left, - banded_global, - 0, // band padding override - aln.sequence().size(), - 0, // unroll_length - xdrop_alignment, - xdrop_alignment && alignment_threads > 1); - } else { - aligned = vg.align_qual_adjusted(aln, - get_qual_adj_aligner(), - mems, - traceback, - acyclic_and_sorted, - max_query_graph_ratio, - pinned_alignment, - pin_left, - banded_global, - 0, // band padding override - aln.sequence().size()); - // 0, // unroll_length - // xdrop_alignment*/); + + // the longest path we could possibly align to (full gap and a full sequence) + size_t target_length = aln.sequence().size() + get_aligner()->longest_detectable_gap(aln); + + // copy our alignment, which we'll then modify + Alignment aligned = aln; + + // convert from bidirected to directed + unordered_map > node_trans; + bdsg::HashGraph align_graph; + + // check if we can get away with using only one strand of the graph + bool use_single_stranded = handlealgs::is_single_stranded(&graph); + bool mem_strand = 
false; + if (use_single_stranded) { + if (mems.size()) { + mem_strand = gcsa::Node::rc(mems[0].nodes.front()); + for (size_t i = 1; i < mems.size(); i++) { + if (gcsa::Node::rc(mems[0].nodes.front()) != mem_strand) { + use_single_stranded = false; + break; + } + } + } else if (do_flip) { + mem_strand = true; } - } else { - // we've got an id-sortable graph and we can directly align with gssw - aligned = aln; - if (banded_global) { - size_t max_span = aln.sequence().size(); - size_t band_padding_override = 0; - bool permissive_banding = (band_padding_override == 0); - size_t band_padding = permissive_banding ? max(max_span, (size_t) 1) : band_padding_override; - get_aligner(!aln.quality().empty())->align_global_banded(aligned, graph, band_padding, false); - } else if (pinned_alignment) { - get_aligner(!aln.quality().empty())->align_pinned(aligned, graph, pin_left); - } else if (xdrop_alignment) { - // directly call alignment function without node translation - // cerr << "X-drop alignment, (" << xdrop_alignment << "), rev(" << ((xdrop_alignment == 1) ? false : true) << ")" << endl; - get_aligner(!aln.quality().empty())->align_xdrop(aligned, graph, mems, (xdrop_alignment == 1) ? false : true, alignment_threads > 1); + } + bool flipped_alignment = false; + if (use_single_stranded) { + if (mem_strand && !xdrop_alignment) { + aligned.set_sequence(reverse_complement(aligned.sequence())); + if (!aligned.quality().empty()) { + reverse(aligned.mutable_quality()->begin(), + aligned.mutable_quality()->end()); + } + flipped_alignment = true; + } + if (mem_strand && xdrop_alignment) { + // TODO -- investigate if reversing the mems is cheaper + // xdrop requires that we reverse complement the mems or the graph + handlealgs::reverse_complement_graph(&graph, &align_graph); + node_trans.reserve(align_graph.get_node_count()); + align_graph.for_each_handle([&](const handle_t& handle) { + node_trans[align_graph.get_id(handle)] = make_pair(align_graph.get_id(handle), true); + }); } else { - get_aligner(!aln.quality().empty())->align(aligned, graph, traceback, false); + // if we are using only the forward strand of the current graph, a make trivial node translation so + // the later code's expectations are met + // TODO: can we do this without the copy constructor? 
+ //align_graph = graph; + node_trans.reserve(graph.get_node_count()); + graph.for_each_handle([&](const handle_t& handle) { + handle_t x = align_graph.create_handle(graph.get_sequence(handle), graph.get_id(handle)); + assert(align_graph.get_id(x) == graph.get_id(handle)); + node_trans[graph.get_id(handle)] = make_pair(graph.get_id(handle), false); + return true; + }); + graph.for_each_edge([&](const edge_t& edge) { + align_graph.create_edge(graph.get_handle(graph.get_id(edge.first), graph.get_is_reverse(edge.first)), + graph.get_handle(graph.get_id(edge.second), graph.get_is_reverse(edge.second))); + }); } } + else { + auto node_trans_tmp = handlealgs::split_strands(&graph, &align_graph); + node_trans.reserve(node_trans_tmp.size()); + for (const auto& trans : node_trans_tmp) { + node_trans[align_graph.get_id(trans.first)] = make_pair(graph.get_id(trans.second), + graph.get_is_reverse(trans.second)); + } + } + + // if necessary, convert from cyclic to acylic + if (!handlealgs::is_directed_acyclic(&align_graph)) { + // make a dagified graph and translation + bdsg::HashGraph dagified; + unordered_map dagify_trans = handlealgs::dagify(&align_graph, &dagified, target_length); + // replace the original with the dagified ones + align_graph = move(dagified); + node_trans = overlay_node_translations(dagify_trans, node_trans); + } + + if (banded_global) { + // the banded global alignment no longer constructs an internal representation of the graph + // for topological sorting, etc., instead counting on the HandleGraph to do that itself. accordingly + // we need a more serious/performant implementation of a graph here + size_t max_span = aln.sequence().size(); + size_t band_padding_override = 0; + bool permissive_banding = (band_padding_override == 0); + size_t band_padding = permissive_banding ? 
max(max_span, (size_t) 1) : band_padding_override; + get_aligner(!aln.quality().empty())->align_global_banded(aligned, align_graph, band_padding, false); + } else if (pinned_alignment) { + get_aligner(!aln.quality().empty())->align_pinned(aligned, align_graph, pin_left); + } else if (xdrop_alignment) { + get_aligner(!aln.quality().empty())->align_xdrop(aligned, align_graph, + translate_mems(mems, node_trans), + xdrop_alignment != 1, + max_xdrop_gap_length); + } else { + get_aligner(!aln.quality().empty())->align(aligned, align_graph, traceback); + } if (traceback && !keep_bonuses && aligned.score()) { remove_full_length_bonuses(aligned); } + // un-reverse complement the alignment + if (flipped_alignment) { + aligned = reverse_complement_alignment( + aligned, + (function) ([&](int64_t id) { + return align_graph.get_length(align_graph.get_handle(id)); + })); + } + if (node_trans.size()) { + translate_oriented_node_ids(*aligned.mutable_path(), node_trans); + } return aligned; } @@ -1939,66 +2560,14 @@ Alignment Mapper::align(const string& seq, int kmer_size, int stride, int max_me return align(aln, kmer_size, stride, max_mem_length, band_width, band_overlap, xdrop_alignment); } -pos_t Mapper::likely_mate_position(const Alignment& aln, bool is_first_mate) { - bool aln_is_rev = aln.path().mapping(0).position().is_reverse(); - int64_t aln_pos = approx_alignment_position(aln); - //if (debug) cerr << "aln pos " << aln_pos << endl; - // can't find the alignment position - if (aln_pos < 0) return make_pos_t(0, false, 0); - bool same_orientation = frag_stats.cached_fragment_orientation_same; - bool forward_direction = frag_stats.cached_fragment_direction; - int64_t delta = frag_stats.cached_fragment_length_mean; - // which way is our delta? - // we are on the forward strand - id_t target; - if (forward_direction) { - if (is_first_mate) { - if (!aln_is_rev) { - target = node_approximately_at(aln_pos + delta); - } else { - target = node_approximately_at(aln_pos - delta); - } - } else { - if (!aln_is_rev) { - target = node_approximately_at(aln_pos + delta); - } else { - target = node_approximately_at(aln_pos - delta); - } - } - } else { - if (is_first_mate) { - if (!aln_is_rev) { - target = node_approximately_at(aln_pos - delta); - } else { - target = node_approximately_at(aln_pos + delta); - } - } else { - if (!aln_is_rev) { - target = node_approximately_at(aln_pos - delta); - } else { - target = node_approximately_at(aln_pos + delta); - } - } - } - if (same_orientation) { - return make_pos_t(target, aln_is_rev, 0); - } else { - return make_pos_t(target, !aln_is_rev, 0); - } -} - -map > > Mapper::alignment_path_offsets(const Alignment& aln, bool just_min, bool nearby) const { - return xg_alignment_path_offsets(aln, just_min, nearby, xindex); -} - vector Mapper::likely_mate_positions(const Alignment& aln, bool is_first_mate) { - // fallback to approx when we don't have paths - if (xindex->path_count == 0) { - return { likely_mate_position(aln, is_first_mate) }; + // when we don't have paths we can't properly estimate the mate position + if (xindex->get_path_count() == 0) { + return { }; } - map > > offsets; + unordered_map > > offsets; for (auto& mapping : aln.path().mapping()) { - auto pos_offs = xindex->nearest_offsets_in_paths(make_pos_t(mapping.position()), aln.sequence().size()); + auto pos_offs = algorithms::nearest_offsets_in_paths(xindex, make_pos_t(mapping.position()), aln.sequence().size()); for (auto& p : pos_offs) { if (offsets.find(p.first) == offsets.end()) { offsets[p.first] = p.second; 
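As context for the mate-rescue hunks around `likely_mate_positions`: the snippet below is a minimal, hypothetical distillation of the fragment-length arithmetic used here, i.e. project the aligned read onto a reference path, step one mean fragment length in the library's direction, clamp to the path, and choose the mate strand from the cached pair orientation. It mirrors the simple fallback logic being removed above; the retained code does the equivalent per path via `nearest_offsets_in_paths` and the estimated fragment length distribution, and none of the names below appear in the patch itself.

```
#include <algorithm>
#include <cstdint>
#include <iostream>

struct MatePos { int64_t path_offset; bool is_reverse; };

MatePos expected_mate_position(int64_t read_offset, bool read_is_reverse,
                               int64_t mean_fragment_length, int64_t path_length,
                               bool same_orientation, bool forward_direction) {
    // Step downstream of the read for forward-direction libraries and upstream
    // otherwise; a reverse-strand read flips which way "downstream" points.
    int64_t step = (forward_direction != read_is_reverse) ? mean_fragment_length
                                                          : -mean_fragment_length;
    // Keep the predicted position on the path.
    int64_t mate = std::min(std::max<int64_t>(0, read_offset + step), path_length - 1);
    // Same-orientation (FF/RR) pairs keep the read's strand; FR/RF pairs flip it.
    bool mate_rev = same_orientation ? read_is_reverse : !read_is_reverse;
    return {mate, mate_rev};
}

int main() {
    MatePos m = expected_mate_position(10000, false, 450, 1000000,
                                       /*same_orientation=*/false,
                                       /*forward_direction=*/true);
    std::cout << m.path_offset << (m.is_reverse ? " -" : " +") << "\n";  // 10450 -
}
```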
@@ -2014,7 +2583,7 @@ vector Mapper::likely_mate_positions(const Alignment& aln, bool is_first_ for (auto& seq : offsets) { // find the likely position // then direction - auto& seq_name = seq.first; + const std::string& seq_name = xindex->get_path_name(seq.first); for (auto& p : seq.second) { size_t path_pos = p.first; bool on_reverse_path = p.second; @@ -2048,11 +2617,16 @@ vector Mapper::likely_mate_positions(const Alignment& aln, bool is_first_ } } } - mate_pos = max((int64_t)0, mate_pos); - pos_t target = xindex->graph_pos_at_path_position(seq_name, mate_pos); + path_handle_t path_handle = xindex->get_path_handle(seq_name); + mate_pos = min(max((int64_t)0, mate_pos), (int64_t)xindex->get_path_length(path_handle)-1); + //pos_t target = xindex->graph_pos_at_path_position(seq_name, mate_pos); + step_handle_t step = xindex->get_step_at_position(path_handle, mate_pos); + size_t pos_of_step = xindex->get_position_of_step(step); + handle_t handle = xindex->get_handle_of_step(step); + pos_t target = make_pos_t(xindex->get_id(handle), xindex->get_is_reverse(handle), mate_pos-pos_of_step); // what orientation should we use - if (same_orientation && on_reverse_path - || !same_orientation && !on_reverse_path) { + if ((same_orientation && on_reverse_path) + || (!same_orientation && !on_reverse_path)) { target = reverse(target, get_node_length(id(target))); } likely.push_back(target); @@ -2108,7 +2682,7 @@ pair Mapper::pair_rescue(Alignment& mate1, Alignment& mate2, return make_pair(false, false); } if (mate_positions.empty()) return make_pair(false, false); // can't rescue because the selected mate is unaligned - Graph graph; + bdsg::HashGraph graph; set orientations; #ifdef debug_rescue if (debug) cerr << "got " << mate_positions.size() << " mate positions" << endl; @@ -2123,30 +2697,25 @@ pair Mapper::pair_rescue(Alignment& mate1, Alignment& mate2, (int64_t)max((double)frag_stats.cached_fragment_length_stdev * 10.0, mate1.sequence().size() * 3.0))); //cerr << "Getting at least " << get_at_least << endl; - graph.MergeFrom(xindex->graph_context_id(mate_pos, get_at_least/2)); - graph.MergeFrom(xindex->graph_context_id(reverse(mate_pos, get_node_length(id(mate_pos))), get_at_least/2)); - //if (debug) cerr << "rescue got graph " << pb2json(graph) << endl; + // TODO it may be possible to make this sugraph smaller with little penalty in terms of accuracy + algorithms::extract_context(*xindex, graph, xindex->get_handle(id(mate_pos), is_rev(mate_pos)), offset(mate_pos), get_at_least); // if we're reversed, align the reverse sequence and flip it back // align against it } - sort_by_id_dedup_and_clean(graph); - bool acyclic_and_sorted = is_id_sortable(graph) && !has_inversion(graph); - //VG g; g.extend(graph);string h = g.hash(); - //g.serialize_to_file("rescue-" + h + ".vg"); int max_mate1_score = mate1.score(); int max_mate2_score = mate2.score(); for (auto& orientation : orientations) { if (rescue_off_first) { - Alignment aln2 = align_maybe_flip(mate2, graph, orientation, traceback, acyclic_and_sorted, false, xdrop_alignment); + Alignment aln2 = align_maybe_flip(mate2, graph, orientation, traceback, false, xdrop_alignment); tried2 = true; - //write_alignment_to_file(aln2, "rescue-" + h + ".gam"); + //vg::io::write_to_file(aln2, "rescue-" + h + ".gam"); #ifdef debug_rescue if (debug) cerr << "aln2 score/ident vs " << aln2.score() << "/" << aln2.identity() << " vs " << mate2.score() << "/" << mate2.identity() << endl; #endif if (aln2.score() > max_mate2_score && (double)aln2.score()/perfect_score > 
min_threshold && pair_consistent(mate1, aln2, accept_pval)) { if (!traceback) { // now get the traceback - aln2 = align_maybe_flip(mate2, graph, orientation, true, acyclic_and_sorted, false, xdrop_alignment); + aln2 = align_maybe_flip(mate2, graph, orientation, true, false, xdrop_alignment); } #ifdef debug_rescue if (debug) cerr << "rescued aln2 " << pb2json(aln2) << endl; @@ -2157,16 +2726,16 @@ pair Mapper::pair_rescue(Alignment& mate1, Alignment& mate2, rescued2 = true; } } else if (rescue_off_second) { - Alignment aln1 = align_maybe_flip(mate1, graph, orientation, traceback, acyclic_and_sorted, false, xdrop_alignment); + Alignment aln1 = align_maybe_flip(mate1, graph, orientation, traceback, false, xdrop_alignment); tried1 = true; - //write_alignment_to_file(aln1, "rescue-" + h + ".gam"); + //vg::io::write_to_file(aln1, "rescue-" + h + ".gam"); #ifdef debug_rescue if (debug) cerr << "aln1 score/ident vs " << aln1.score() << "/" << aln1.identity() << " vs " << mate1.score() << "/" << mate1.identity() << endl; #endif if (aln1.score() > max_mate1_score && (double)aln1.score()/perfect_score > min_threshold && pair_consistent(aln1, mate2, accept_pval)) { if (!traceback) { // now get the traceback - aln1 = align_maybe_flip(mate1, graph, orientation, true, acyclic_and_sorted, false, xdrop_alignment); + aln1 = align_maybe_flip(mate1, graph, orientation, true, false, xdrop_alignment); } #ifdef debug_rescue if (debug) cerr << "rescued aln1 " << pb2json(aln1) << endl; @@ -2212,13 +2781,13 @@ bool Mapper::pair_consistent(Alignment& aln1, double pval) { if (!(aln1.score() && aln2.score())) return false; bool length_ok = false; - if (xindex->path_count == 0) { + if (xindex->get_path_count() == 0) { // use the approximate distance //cerr << "using approx distance" << endl; int len = approx_fragment_length(aln1, aln2); - if (frag_stats.fragment_size && len > 0 && (pval > 0 && frag_stats.fragment_length_pval(len) > pval - || len < frag_stats.fragment_size) - || !frag_stats.fragment_size && len > 0 && len < frag_stats.fragment_max) { + if ((frag_stats.fragment_size && len > 0 && ((pval > 0 && frag_stats.fragment_length_pval(len) > pval) + || len < frag_stats.fragment_size)) + || (!frag_stats.fragment_size && len > 0 && len < frag_stats.fragment_max)) { length_ok = true; } bool aln1_is_rev = aln1.path().mapping(0).position().is_reverse(); @@ -2226,14 +2795,14 @@ bool Mapper::pair_consistent(Alignment& aln1, bool same_orientation = frag_stats.cached_fragment_orientation_same; // XXX todo //bool direction_ok = frag_stats.cached_fragment_direction && - bool orientation_ok = same_orientation && aln1_is_rev == aln2_is_rev - || !same_orientation && aln1_is_rev != aln2_is_rev; + bool orientation_ok = (same_orientation && aln1_is_rev == aln2_is_rev) + || (!same_orientation && aln1_is_rev != aln2_is_rev); return length_ok && orientation_ok; } else { // use the distance induced by the graph paths // won't recompute if we already have refpos - annotate_with_initial_path_positions(aln1); - annotate_with_initial_path_positions(aln2); + algorithms::annotate_with_initial_path_positions(*xindex, aln1); + algorithms::annotate_with_initial_path_positions(*xindex, aln2); map > > offsets1 = alignment_refpos_to_path_offsets(aln1); map > > offsets2 = alignment_refpos_to_path_offsets(aln2); auto pos_consistent = @@ -2244,7 +2813,7 @@ bool Mapper::pair_consistent(Alignment& aln1, bool fwd2 = p2.second; int64_t len = pos2 - pos1; if (frag_stats.fragment_size) { - bool orientation_ok = frag_stats.cached_fragment_orientation_same 
&& fwd1 == fwd2 || fwd1 != fwd2; + bool orientation_ok = (frag_stats.cached_fragment_orientation_same && fwd1 == fwd2) || fwd1 != fwd2; bool direction_ok = frag_stats.cached_fragment_direction && (!fwd1 && len >= 0 || fwd1 && len <= 0) || (fwd1 && len >= 0 || !fwd1 && len <= 0); bool length_ok = frag_stats.fragment_length_pval(abs(len)) > pval;//|| pval == 0 && abs(len) < frag_stats.fragment_size; @@ -2292,8 +2861,6 @@ pair, vector> Mapper::align_paired_multi( read2.set_sequence(second_mate.sequence()); read2.set_quality(second_mate.quality()); - double avg_node_len = average_node_length(); - auto aligner = get_aligner(!read1.quality().empty()); int8_t match = aligner->match; int8_t gap_extension = aligner->gap_extension; @@ -2330,7 +2897,7 @@ pair, vector> Mapper::align_paired_multi( max_mem_length, min_mem_length, mem_reseed_length, - false, true, true, false); + false, true, true, true); // Make sure to actually fill in the longest LCP. vector mems2 = find_mems_deep(read2.sequence().begin(), read2.sequence().end(), @@ -2339,7 +2906,7 @@ pair, vector> Mapper::align_paired_multi( max_mem_length, min_mem_length, mem_reseed_length, - false, true, true, false); + false, true, true, true); // Make sure to actually fill in the longest LCP. double mq_cap1, mq_cap2; mq_cap1 = mq_cap2 = max_mapping_quality; @@ -2408,11 +2975,6 @@ pair, vector> Mapper::align_paired_multi( // if we have a cached fragment orientation, use it to pick the min distance with the correct path relative orientation int64_t approx_dist = (!frag_stats.fragment_size ? min(d.first, d.second) : (frag_stats.cached_fragment_orientation_same ? d.first : d.second)); - /*// could be expensive for pairs - if (approx_dist < max_length) { - approx_dist = min(approx_dist, graph_distance(m1_pos, m2_pos, max_length)); - } - */ if (approx_dist >= max_length) { // too far apart or wrong path/pair relative orientation return -std::numeric_limits::max(); @@ -2469,8 +3031,8 @@ pair, vector> Mapper::align_paired_multi( [&](pos_t n) -> int64_t { return approx_position(n); }, - [&](pos_t n) -> map > > { - return xindex->offsets_in_paths(n); + [&](pos_t n) -> unordered_map > > { + return algorithms::nearest_offsets_in_paths(xindex, n, -1); }, transition_weight, band_width); @@ -2657,7 +3219,7 @@ pair, vector> Mapper::align_paired_multi( auto& aln1 = p->first; auto& aln2 = p->second; auto approx_frag_lengths = min_pair_fragment_length(aln1, aln2); - frag_stats.save_frag_lens_to_alns(aln1, aln2, approx_frag_lengths, pair_consistent(aln1, aln2, 1e-6)); + frag_stats.save_frag_lens_to_alns(aln1, aln2, approx_frag_lengths, xindex, pair_consistent(aln1, aln2, 1e-6)); } // sort the aligned pairs by score sort_shuffling_ties(aln_ptrs.begin(), aln_ptrs.end(), @@ -2895,7 +3457,6 @@ pair, vector> Mapper::align_paired_multi( // we then set the fragment_size cutoff using the moments of the estimated distribution bool imperfect_pair = false; for (int i = 0; i < min(results.first.size(), results.second.size()); ++i) { - if (retrying) break; auto& aln1 = results.first.at(i); auto& aln2 = results.second.at(i); //double ident1 = (double) aln1.score() / max_possible_score; @@ -2904,17 +3465,29 @@ pair, vector> Mapper::align_paired_multi( for (int j = 0; j < aln1.fragment_size(); ++j) { length = min(length, abs(aln1.fragment(j).length())); } - if (results.first.size() == 1 + bool consistent = ((frag_stats.fragment_size && length < frag_stats.fragment_size && pair_consistent(aln1, aln2, 1e-3)) + || (!frag_stats.fragment_size && length < frag_stats.fragment_max)); + if 
(!retrying + && consistent + && results.first.size() == 1 && results.second.size() == 1 && results.first.front().identity() > frag_stats.perfect_pair_identity_threshold - && results.second.front().identity() > frag_stats.perfect_pair_identity_threshold - && (frag_stats.fragment_size && length < frag_stats.fragment_size && pair_consistent(aln1, aln2, 1e-3) - || !frag_stats.fragment_size && length < frag_stats.fragment_max)) { // hard cutoff + && results.second.front().identity() > frag_stats.perfect_pair_identity_threshold) { // hard cutoff //cerr << "aln\tperfect alignments" << endl; frag_stats.record_fragment_configuration(aln1, aln2, this); - } else if (!frag_stats.fragment_size) { + } else if (!retrying && !frag_stats.fragment_size) { + // mark this pair to be buffered and remapped imperfect_pair = true; } + + if (consistent) { + set_annotation(aln1, "proper_pair", true); + set_annotation(aln2, "proper_pair", true); + } + else { + set_annotation(aln1, "proper_pair", false); + set_annotation(aln2, "proper_pair", false); + } } if (!retrying && imperfect_pair && frag_stats.fragment_max) { @@ -2925,14 +3498,14 @@ pair, vector> Mapper::align_paired_multi( queued_resolve_later = true; } - if(results.first.empty()) { + if(results.first.empty() && !exclude_unaligned) { results.first.push_back(read1); auto& aln = results.first.back(); aln.clear_path(); aln.clear_score(); aln.clear_identity(); } - if(results.second.empty()) { + if(results.second.empty() && !exclude_unaligned) { results.second.push_back(read2); auto& aln = results.second.back(); aln.clear_path(); @@ -2957,44 +3530,28 @@ pair, vector> Mapper::align_paired_multi( aln.set_fragment_length_distribution(fragment_dist.str()); } - // if we have references, annotate the alignments with their reference positions - annotate_with_initial_path_positions(results.first); - annotate_with_initial_path_positions(results.second); + if (!results.first.empty() && !results.second.empty()) { + // if we have references, annotate the alignments with their reference positions + algorithms::annotate_with_initial_path_positions(*xindex, results.first); + algorithms::annotate_with_initial_path_positions(*xindex, results.second); - chrono::high_resolution_clock::time_point t2 = chrono::high_resolution_clock::now(); - auto used_time = chrono::duration_cast(t2 - t1).count(); - results.first.front().set_time_used(used_time); - results.second.front().set_time_used(used_time); + chrono::high_resolution_clock::time_point t2 = chrono::high_resolution_clock::now(); + auto used_time = chrono::duration_cast(t2 - t1).count(); + results.first.front().set_time_used(used_time); + results.second.front().set_time_used(used_time); + } return results; } -void Mapper::annotate_with_initial_path_positions(vector& alns) const { - for (auto& aln : alns) annotate_with_initial_path_positions(aln); -} - -void Mapper::annotate_with_initial_path_positions(Alignment& aln) const { - if (!aln.refpos_size()) { - auto init_path_positions = alignment_path_offsets(aln); - for (const pair > >& pos_record : init_path_positions) { - for (auto& pos : pos_record.second) { - Position* refpos = aln.add_refpos(); - refpos->set_name(pos_record.first); - refpos->set_offset(pos.first); - refpos->set_is_reverse(pos.second); - } - } - } -} - double Mapper::compute_cluster_mapping_quality(const vector >& clusters, int read_length) { if (clusters.size() == 0) { return 0; } if (clusters.size() == 1) { - return { (double)max_cluster_mapping_quality }; + return (double) max_cluster_mapping_quality; } vector 
weights; for (auto& cluster : clusters) { @@ -3035,11 +3592,6 @@ double Mapper::compute_cluster_mapping_quality(const vectorseq_length / (double) xindex->node_count; -} - int sub_overlaps_of_first_aln(const vector& alns, float overlap_fraction) { // take the first // now look at the rest and measure overlap @@ -3138,7 +3690,6 @@ Mapper::align_mem_multi(const Alignment& aln, if (debug) cerr << "maybe_mq " << aln.name() << " " << maybe_mq << " " << total_multimaps << " " << mem_max_length << " " << longest_lcp << " " << total_multimaps << " " << mem_read_ratio << " " << fraction_filtered << " " << max_possible_mq << " " << total_multimaps << endl; - double avg_node_len = average_node_length(); // go through the ordered single-hit MEMs // build the clustering model // find the alignments that are the best-scoring walks through it @@ -3166,11 +3717,11 @@ Mapper::align_mem_multi(const Alignment& aln, vector > clusters; if (total_multimaps) { MEMChainModel chainer({ aln.sequence().size() }, { mems }, - [&](pos_t n) { + [&](pos_t n) -> int64_t { return approx_position(n); }, - [&](pos_t n) -> map > > { - return xindex->offsets_in_paths(n); + [&](pos_t n) -> unordered_map > > { + return algorithms::nearest_offsets_in_paths(xindex, n, -1); }, transition_weight, aln.sequence().size()); @@ -3198,15 +3749,6 @@ Mapper::align_mem_multi(const Alignment& aln, size_t offset = gcsa::Node::offset(node); bool is_rev = gcsa::Node::rc(node); cerr << "|" << id << (is_rev ? "-" : "+") << ":" << offset << "," << mem.fragment << ","; - /* - for (auto& ref : node_positions_in_paths(gcsa::Node::encode(id, 0, is_rev))) { - auto& name = ref.first; - for (auto pos : ref.second) { - //cerr << name << (is_rev?"-":"+") << pos + offset; - cerr << "|" << id << (is_rev ? "-" : "+") << ":" << offset << ","; - } - } - */ } cerr << mem.sequence() << " "; } @@ -3331,7 +3873,7 @@ Mapper::align_mem_multi(const Alignment& aln, filter_and_process_multimaps(alns, keep_multimaps); // if we didn't get anything, return an unaligned version of our input - if (alns.empty()) { + if (alns.empty() && !exclude_unaligned) { alns.push_back(aln); auto& unaligned = alns.back(); unaligned.clear_path(); @@ -3342,58 +3884,35 @@ Mapper::align_mem_multi(const Alignment& aln, return alns; } -Alignment Mapper::align_maybe_flip(const Alignment& base, Graph& graph, bool flip, bool traceback, bool acyclic_and_sorted, bool banded_global, bool xdrop_alignment) { +Alignment Mapper::align_maybe_flip(const Alignment& base, HandleGraph& graph, bool flip, bool traceback, bool banded_global, bool xdrop_alignment) { // do not use X-drop alignment when seed position is not available vector mems; - return(align_maybe_flip(base, graph, mems, flip, traceback, acyclic_and_sorted, banded_global, xdrop_alignment)); + return(align_maybe_flip(base, graph, mems, flip, traceback, banded_global, xdrop_alignment)); } -Alignment Mapper::align_maybe_flip(const Alignment& base, Graph& graph, const vector& mems, bool flip, bool traceback, bool acyclic_and_sorted, bool banded_global, bool xdrop_alignment) { +Alignment Mapper::align_maybe_flip(const Alignment& base, HandleGraph& graph, const vector& mems, bool flip, bool traceback, bool banded_global, bool xdrop_alignment) { Alignment aln = base; - map node_length; // do not modify aln.sequence() in X-drop alignment so that seed position is calculated by mem.begin - aln.sequence().begin() - if (flip) { - for (auto& node : graph.node()) { - node_length[node.id()] = node.sequence().size(); - } - 
aln.set_sequence(reverse_complement(base.sequence())); - if (!base.quality().empty()) { - aln.set_quality(base.quality()); - reverse(aln.mutable_quality()->begin(), - aln.mutable_quality()->end()); - } - } else { - aln.set_sequence(base.sequence()); - if (!base.quality().empty()) { - aln.set_quality(base.quality()); - } + aln.set_sequence(base.sequence()); + if (!base.quality().empty()) { + aln.set_quality(base.quality()); } bool pinned_alignment = false; bool pinned_reverse = false; - aln = align_to_graph(aln, graph, mems, - max_query_graph_ratio, + flip, traceback, - acyclic_and_sorted, pinned_alignment, pinned_reverse, banded_global, - xdrop_alignment ? (flip ? 2 : 1) : 0, + xdrop_alignment ? 1 : 0, include_full_length_bonuses); - if (strip_bonuses && !banded_global && traceback) { // We want to remove the bonuses aln.set_score(get_aligner()->remove_bonuses(aln)); } - if (flip) { - aln = reverse_complement_alignment( - aln, - (function) ([&](int64_t id) { - return node_length[id]; - })); - } return aln; } @@ -3416,126 +3935,16 @@ double Mapper::compute_uniqueness(const Alignment& aln, const vector& mems, bool traceback, bool xdrop_alignment) { - // check if we can just fill out the alignment with exact matches - /* - if (cluster_coverage(mems) == aln.sequence().size()) { - Alignment walked = mems_to_alignment(aln, mems); - assert(walked.identity() == 1); - return walked; - } - */ - // poll the mems to see if we should flip - int count_fwd = 0, count_rev = 0; - for (auto& mem : mems) { - bool is_rev = gcsa::Node::rc(mem.nodes.front()); - if (is_rev) { - ++count_rev; - } else { - ++count_fwd; - } - } - // get the graph with cluster.hpp's cluster_subgraph - Graph graph = cluster_subgraph_walk(*xindex, aln, mems, 1); - bool acyclic_and_sorted = is_id_sortable(graph) && !has_inversion(graph); - // and test each direction for which we have MEM hits - Alignment aln_fwd; - Alignment aln_rev; - // try both ways if we're not sure if we are acyclic - if (count_fwd || !acyclic_and_sorted) { - aln_fwd = align_maybe_flip(aln, graph, mems, false, traceback, acyclic_and_sorted, false, xdrop_alignment); - } - if (count_rev || !acyclic_and_sorted) { - aln_rev = align_maybe_flip(aln, graph, mems, true, traceback, acyclic_and_sorted, false, xdrop_alignment); - } - // TODO check if we have soft clipping on the end of the graph and if so try to expand the context - if (aln_fwd.score() + aln_rev.score() == 0) { - // abject failure, nothing aligned with score > 0 - Alignment result = aln; - result.clear_path(); - result.clear_score(); - return result; - } else if (aln_rev.score() > aln_fwd.score()) { - // reverse won - return aln_rev; - } else { - // forward won - return aln_fwd; - } -} - -VG Mapper::cluster_subgraph_strict(const Alignment& aln, const vector& mems) { -#ifdef debug_mapper -#pragma omp critical - { - if (debug) { - cerr << "Getting a cluster graph for " << mems.size() << " MEMs" << endl; - } - } -#endif - - // As in the multipath aligner, we work out how far we can get from a MEM - // with gaps and use that for how much graph to grab. - vector positions; - vector forward_max_dist; - vector backward_max_dist; - - positions.reserve(mems.size()); - forward_max_dist.reserve(mems.size()); - backward_max_dist.reserve(mems.size()); - - // What aligner are we using? 
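As the comment above says, the (now removed) cluster_subgraph_strict sizes its extraction window by how far an alignment could still reach from each MEM once gaps are allowed: the longest detectable gap at the MEM boundary plus the remaining read length on that side. The arithmetic in isolation, with a made-up stand-in for the aligner's longest_detectable_gap bound:

```
#include <algorithm>
#include <cstddef>
#include <iostream>

// Hypothetical stand-in for Aligner::longest_detectable_gap: the longest gap
// that could still appear in a positive-scoring alignment at this read offset.
// (Purely illustrative; not vg's actual scoring-derived bound.)
size_t longest_detectable_gap_stub(size_t read_len, size_t offset) {
    size_t flank = std::min(offset, read_len - offset);
    return 10 + flank / 4;
}

struct SeedReach {
    size_t forward_max_dist;   // how far past the MEM to extract
    size_t backward_max_dist;  // how far before the MEM to extract
};

// Mirrors the removed logic: search far enough in each direction that any hit
// reachable without soft clipping is still inside the extracted graph.
SeedReach reach_for_mem(size_t read_len, size_t mem_begin, size_t mem_end) {
    SeedReach r;
    r.forward_max_dist  = longest_detectable_gap_stub(read_len, mem_end)   + (read_len - mem_begin);
    r.backward_max_dist = longest_detectable_gap_stub(read_len, mem_begin) + mem_begin;
    return r;
}

int main() {
    // A 150 bp read with a MEM covering read offsets [40, 75).
    SeedReach r = reach_for_mem(150, 40, 75);
    std::cout << "extract " << r.backward_max_dist << " bp upstream and "
              << r.forward_max_dist << " bp downstream of the seed\n";
}
```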
- BaseAligner* aligner = get_aligner(); - - for (const auto& mem : mems) { - // get the start position of the MEM - assert(!mem.nodes.empty()); - positions.push_back(make_pos_t(mem.nodes.front())); - - // search far enough away to get any hit detectable without soft clipping - forward_max_dist.push_back(aligner->longest_detectable_gap(aln, mem.end) - + (aln.sequence().end() - mem.begin)); - backward_max_dist.push_back(aligner->longest_detectable_gap(aln, mem.begin) - + (mem.begin - aln.sequence().begin())); - } - - // Extract the graph - VG graph; - algorithms::extract_containing_graph(xindex, &graph, positions, forward_max_dist, backward_max_dist); - - graph.remove_orphan_edges(); - -#ifdef debug_mapper -#pragma omp critical - { - if (debug) { - cerr << "\tFound " << graph.node_count() << " nodes " << graph.min_node_id() << " - " << graph.max_node_id() - << " and " << graph.edge_count() << " edges" << endl; - } - } -#endif - return graph; -} - -VG Mapper::alignment_subgraph(const Alignment& aln, int context_size) { - set nodes; - auto& path = aln.path(); - for (int i = 0; i < path.mapping_size(); ++i) { - nodes.insert(path.mapping(i).position().node_id()); - } - VG graph; - for (auto& node : nodes) { - *graph.graph.add_node() = xindex->node(node); - } - xindex->expand_context(graph.graph, max(1, context_size), false); // get connected edges - graph.rebuild_indexes(); - return graph; + bdsg::HashGraph graph = cluster_subgraph_containing(*xindex, aln, mems, get_aligner()); + Alignment aligned = align_maybe_flip(aln, graph, mems, false, traceback, false, xdrop_alignment); + return aligned; } // estimate the fragment length as the difference in mean positions of both alignments -map Mapper::min_pair_fragment_length(const Alignment& aln1, const Alignment& aln2) { - map lengths; - auto pos1 = alignment_path_offsets(aln1); - auto pos2 = alignment_path_offsets(aln2); +unordered_map Mapper::min_pair_fragment_length(const Alignment& aln1, const Alignment& aln2) { + unordered_map lengths; + auto pos1 = algorithms::alignment_path_offsets(*xindex, aln1, true, false); + auto pos2 = algorithms::alignment_path_offsets(*xindex, aln2, true, false); for (auto& p : pos1) { auto x = pos2.find(p.first); if (x != pos2.end()) { @@ -3566,13 +3975,15 @@ string FragmentLengthStatistics::fragment_model_str(void) { } void FragmentLengthStatistics::save_frag_lens_to_alns(Alignment& aln1, Alignment& aln2, - const map& approx_frag_lengths, bool is_consistent) { + const unordered_map& approx_frag_lengths, + PathPositionHandleGraph* xindex, + bool is_consistent) { double max_score = 0; aln1.clear_fragment(); aln2.clear_fragment(); for (auto& j : approx_frag_lengths) { Path fragment; - fragment.set_name(j.first); + fragment.set_name(xindex->get_path_name(j.first)); int length = j.second; fragment.set_length(length); *aln1.add_fragment() = fragment; @@ -3590,9 +4001,9 @@ void FragmentLengthStatistics::save_frag_lens_to_alns(Alignment& aln1, Alignment void FragmentLengthStatistics::record_fragment_configuration(const Alignment& aln1, const Alignment& aln2, Mapper* mapper) { if (fixed_fragment_model) return; assert(aln1.path().mapping(0).has_position() && aln2.path().mapping(0).has_position()); - map > lengths; - auto pos1 = mapper->alignment_path_offsets(aln1); - auto pos2 = mapper->alignment_path_offsets(aln2); + unordered_map > lengths; + auto pos1 = algorithms::alignment_path_offsets(*mapper->xindex, aln1, true, false); + auto pos2 = algorithms::alignment_path_offsets(*mapper->xindex, aln2, true, false); for (auto& p : 
pos1) { auto x = pos2.find(p.first); if (x != pos2.end()) { @@ -3614,7 +4025,7 @@ void FragmentLengthStatistics::record_fragment_configuration(const Alignment& al for (auto& chr : lengths) { int64_t length = get<0>(chr.second); if (abs(length) > fragment_max // keep out super high values - || fragment_size && abs(length) > fragment_size) continue; + || (fragment_size && abs(length) > fragment_size)) continue; bool aln1_is_rev = get<1>(chr.second); bool aln2_is_rev = get<2>(chr.second); bool same_orientation = aln1_is_rev == aln2_is_rev; @@ -3672,7 +4083,7 @@ double FragmentLengthStatistics::fragment_length_pdf(double length) { // that the value is at least as extreme as this one double FragmentLengthStatistics::fragment_length_pval(double length) { double x = abs(length-cached_fragment_length_mean)/cached_fragment_length_stdev; - return 1 - phi(-x,x); + return 1 - (Phi(x) - Phi(-x)); } bool FragmentLengthStatistics::fragment_orientation(void) { @@ -3695,120 +4106,12 @@ bool FragmentLengthStatistics::fragment_direction(void) { return count_fwd > count_rev; } -set Mapper::resolve_paired_mems(vector& mems1, - vector& mems2) { - // find the MEMs that are within estimated_fragment_size of each other - - set pairable; - - // do a wide clustering and then do all pairs within each cluster - // we will use these to determine the alignment strand - //map node_strands; - // records a mapping of id->MEMs, for cluster ranking - map > id_to_mems; - // for clustering - set ids1, ids2; - vector ids; - - // run through the mems - for (auto& mem : mems1) { - for (auto& node : mem.nodes) { - id_t id = gcsa::Node::id(node); - id_to_mems[id].push_back(&mem); - ids1.insert(id); - ids.push_back(id); - } - } - for (auto& mem : mems2) { - for (auto& node : mem.nodes) { - id_t id = gcsa::Node::id(node); - id_to_mems[id].push_back(&mem); - ids2.insert(id); - ids.push_back(id); - } - } - // remove duplicates - //std::sort(ids.begin(), ids.end()); - //ids.erase(std::unique(ids.begin(), ids.end()), ids.end()); - - // get each hit's path-relative position - map > > node_positions; - for (auto& id : ids) { - for (auto& ref : node_positions_in_paths(gcsa::Node::encode(id, 0))) { - auto& name = ref.first; - for (auto pos : ref.second) { - node_positions[name][pos].push_back(id); - } - } - } - - vector > clusters; - for (auto& g : node_positions) { - //if (g.second.empty()) continue; // should be impossible - //cerr << g.first << endl; - clusters.emplace_back(); - int prev = -1; - for (auto& x : g.second) { - auto cluster = &clusters.back(); - //auto& prev = clusters.back().back(); - auto curr = x.first; - if(debug) { - cerr << "p/c " << prev << " " << curr << endl; - } - if (prev != -1) { - if (curr - prev <= frag_stats.fragment_size) { - // in cluster -#ifdef debug_mapper -#pragma omp critical - { - if (debug) { - cerr << "in cluster" << endl; - } - } -#endif - } else { - // It's a new cluster - clusters.emplace_back(); - cluster = &clusters.back(); - } - } - //cerr << " " << x.first << endl; - for (auto& y : x.second) { - //cerr << " " << y << endl; - cluster->push_back(y); - } - prev = curr; - } - } - - for (auto& cluster : clusters) { - // for each pair of ids in the cluster - // which are not from the same read - // estimate the distance between them - // we're roughly in the expected range - bool has_first = false; - bool has_second = false; - for (auto& id : cluster) { - has_first |= ids1.count(id); - has_second |= ids2.count(id); - } - if (!has_first || !has_second) continue; - for (auto& id : cluster) { - for 
(auto& memptr : id_to_mems[id]) { - pairable.insert(memptr); - } - } - } - - return pairable; -} - // We need a function to get the lengths of nodes, in case we need to // reverse an Alignment, including all its Mappings and Positions. int64_t Mapper::get_node_length(int64_t node_id) { - // Grab the node sequence only from the XG index and get its size. + // Grab the node sequence only from the PathPositionHandleGraph index and get its size. // Make sure to use the cache - return xg_node_length(node_id, xindex); + return xindex->get_length(xindex->get_handle(node_id)); } bool Mapper::check_alignment(const Alignment& aln) { @@ -3816,17 +4119,15 @@ bool Mapper::check_alignment(const Alignment& aln) { // assert that this == the alignment if (aln.path().mapping_size()) { // get the graph corresponding to the alignment path - Graph sub; + bdsg::HashGraph sub; for (int i = 0; i < aln.path().mapping_size(); ++ i) { auto& m = aln.path().mapping(i); if (m.has_position() && m.position().node_id()) { auto id = aln.path().mapping(i).position().node_id(); - // XXXXXX this is single-threaded! - xindex->neighborhood(id, 2, sub); + algorithms::extract_id_range(*xindex, id, id, sub); } } - VG g; g.extend(sub); - auto seq = g.path_string(aln.path()); + auto seq = algorithms::path_string(sub, aln.path()); //if (aln.sequence().find('N') == string::npos && seq != aln.sequence()) { if (aln.quality().size() && aln.quality().size() != aln.sequence().size()) { cerr << "alignment quality is not the same length as its sequence" << endl @@ -3839,11 +4140,12 @@ bool Mapper::check_alignment(const Alignment& aln) { << "expect:\t" << aln.sequence() << endl << "got:\t" << seq << endl; // save alignment - write_alignment_to_file(aln, "fail-" + hash_alignment(aln) + ".gam"); + vg::io::write_to_file(aln, "fail-" + hash_alignment(aln) + ".gam"); // save graph, bigger fragment - xindex->expand_context(sub, 5, true); - VG gn; gn.extend(sub); - gn.serialize_to_file("fail-" + gn.hash() + ".vg"); + algorithms::expand_subgraph_by_steps(*xindex, sub, 5); + algorithms::add_subpaths_to_subgraph(*xindex, sub); + std::ofstream out("fail-" + hash_alignment(aln) + ".hg"); + sub.serialize(out); return false; } } @@ -3970,21 +4272,16 @@ vector Mapper::align_banded(const Alignment& read, int kmer_size, int } }; - if (alignment_threads > 1) { + // Always use OMP parallelism here; restrict threads by setting OMP thread count. 
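Long reads are split into overlapping bands, each band is aligned on its own (the parallel loop just below), and the per-band alignments are then chained back together using the transition cost that follows. The real layout comes from make_bands(); the sketch here only assumes fixed-width windows stepping by band_width - band_overlap, which is an assumption rather than vg's exact bookkeeping:

```
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

struct Band { size_t start, length; };

// Hypothetical band layout: windows of band_width stepping by
// band_width - band_overlap (assumes band_overlap < band_width).
// vg's make_bands() also records how much to strip from each band
// when merging; that bookkeeping is omitted here.
std::vector<Band> lay_out_bands(size_t read_len, size_t band_width, size_t band_overlap) {
    std::vector<Band> bands;
    size_t step = band_width - band_overlap;
    for (size_t start = 0; start < read_len; start += step) {
        size_t len = std::min(band_width, read_len - start);
        bands.push_back({start, len});
        if (start + len >= read_len) break;
    }
    return bands;
}

int main() {
    auto bands = lay_out_bands(1000, 256, 64);
    // Each band could then be aligned independently, e.g. inside the
    // "#pragma omp parallel for" loop that follows in the diff.
    for (auto& b : bands) {
        std::cout << "band at " << b.start << " length " << b.length << "\n";
    }
}
```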
#pragma omp parallel for - for (int i = 0; i < bands.size(); ++i) { - do_band(i); - } - } else { - for (int i = 0; i < bands.size(); ++i) { - do_band(i); - } + for (int i = 0; i < bands.size(); ++i) { + do_band(i); } // cost function auto transition_weight = [&](const Alignment& aln1, const Alignment& aln2, - const map > >& pos1, - const map > >& pos2, + const unordered_map > >& pos1, + const unordered_map > >& pos2, int64_t band_distance) { if (aln1.has_path() && !aln2.has_path()) { // pay a lot to go into unaligned from aligned because then we risk dropping into a random place @@ -3995,27 +4292,19 @@ vector Mapper::align_banded(const Alignment& read, int kmer_size, int } else if (!aln1.has_path() && aln2.has_path()) { return 0.0; } - auto aln1_end = make_pos_t(path_end(aln1.path())); - auto aln2_begin = make_pos_t(path_start(aln2.path())); + auto aln1_end = make_pos_t(path_end_position(aln1.path())); + auto aln2_begin = make_pos_t(path_start_position(aln2.path())); pair distances = min_oriented_distances(pos1, pos2); // consider both the forward and inversion case // counter[3]++; bench_t b; bench_init(b); bench_start(b); int64_t dist_fwd = distances.first; - if (dist_fwd < aln2.sequence().size()) { - int64_t graph_dist_fwd = graph_distance(aln1_end, aln2_begin, aln2.sequence().size()); - dist_fwd = min(graph_dist_fwd, dist_fwd); - } dist_fwd -= band_distance; int64_t dist_inv = distances.second; - if (dist_inv < aln2.sequence().size()) { - int64_t graph_dist_inv = graph_distance(aln2_begin, aln1_end, aln2.sequence().size()); - dist_inv = min(graph_dist_inv, dist_inv); - } dist_inv -= band_distance; // bench_end(b); - double fwd_score = -((double)gap_open + (double)dist_fwd * (double)gap_extension); - double inv_score = -2.0*((double)gap_open + (double)dist_inv * (double)gap_extension); + double fwd_score = -((double)(gap_open * dist_fwd != 0) + (double)dist_fwd * (double)gap_extension); + double inv_score = -2.0*((double)(gap_open * dist_fwd != 0) + (double)dist_inv * (double)gap_extension); return max(fwd_score, inv_score); }; @@ -4024,7 +4313,8 @@ vector Mapper::align_banded(const Alignment& read, int kmer_size, int AlignmentChainModel chainer(multi_alns, this, transition_weight, max_band_jump, 64, max_band_jump*2); // bench_end(bench[0]); if (debug) chainer.display(cerr); - vector alignments = chainer.traceback(read, max_multimaps, false, debug); + int total_multimaps = max(max_multimaps, extra_multimaps); + vector alignments = chainer.traceback(read, total_multimaps, false, debug); if (patch_alignments) { for (auto& aln : alignments) { // patch the alignment to deal with short unaligned regions @@ -4046,7 +4336,6 @@ vector Mapper::align_banded(const Alignment& read, int kmer_size, int compute_mapping_qualities(alignments, 0, max_mapping_quality, max_mapping_quality); filter_and_process_multimaps(alignments, max_multimaps); } - //cerr << "got alignment " << pb2json(alignments.front()) << endl; chrono::high_resolution_clock::time_point t2 = chrono::high_resolution_clock::now(); // set time for alignment alignments.front().set_time_used(chrono::duration_cast(t2 - t1).count()); @@ -4056,31 +4345,29 @@ vector Mapper::align_banded(const Alignment& read, int kmer_size, int bool Mapper::adjacent_positions(const Position& pos1, const Position& pos2) { // are they the same id, with offset differing by 1? 
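The rewritten adjacent_positions below drops the old subgraph-extraction approach and instead walks edges directly from the first position's handle, stopping as soon as the second handle is seen. The same early-exit iteration pattern, shown on a toy adjacency list rather than a real HandleGraph (the graph type and follow_edges helper here are stand-ins, not vg's API):

```
#include <cstdint>
#include <functional>
#include <iostream>
#include <unordered_map>
#include <vector>

// Toy stand-in for a handle graph: node id -> ids reachable by one edge.
using ToyGraph = std::unordered_map<int64_t, std::vector<int64_t>>;

// Mimics the shape of HandleGraph::follow_edges: call the visitor on each
// neighbor and stop early if it returns false.
void follow_edges(const ToyGraph& g, int64_t from,
                  const std::function<bool(int64_t)>& visit) {
    auto it = g.find(from);
    if (it == g.end()) return;
    for (int64_t next : it->second) {
        if (!visit(next)) return;
    }
}

// Same idea as the new Mapper::adjacent_positions: adjacent iff an edge
// leads directly from one node to the other.
bool adjacent(const ToyGraph& g, int64_t a, int64_t b) {
    bool found = false;
    follow_edges(g, a, [&](int64_t next) {
        if (next == b) {
            found = true;
            return false;  // stop iterating, we have our answer
        }
        return true;       // keep looking
    });
    return found;
}

int main() {
    ToyGraph g{{1, {2, 3}}, {2, {4}}, {3, {4}}};
    std::cout << std::boolalpha << adjacent(g, 1, 3) << " " << adjacent(g, 2, 3) << "\n";  // true false
}
```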
if (pos1.node_id() == pos2.node_id() + && pos1.is_reverse() == pos2.is_reverse() && pos1.offset() == pos2.offset()-1) { return true; } // otherwise, we're going to need to check via the index - VG graph; - // pick up a graph that's just the neighborhood of the start and end positions - int64_t id1 = pos1.node_id(); - int64_t id2 = pos2.node_id(); - if(xindex) { - // Grab the node sequence only from the XG index and get its size. - xindex->get_id_range(id1, id1, graph.graph); - xindex->get_id_range(id2, id2, graph.graph); - xindex->expand_context(graph.graph, 1, false); - graph.rebuild_indexes(); - } else { - throw runtime_error("No index to get nodes from."); - } - // now look in the graph to figure out if we are adjacent - return graph.adjacent(pos1, pos2); + // look in the graph to figure out if we are adjacent + handle_t handle1 = xindex->get_handle(pos1.node_id(), pos1.is_reverse()); + handle_t handle2 = xindex->get_handle(pos2.node_id(), pos2.is_reverse()); + bool adjacent = false; + xindex->follow_edges(handle1, false, [&](const handle_t& next) { + if (next == handle2) { + adjacent = true; + return false; + } + return true; + }); + return adjacent; } void Mapper::compute_mapping_qualities(vector& alns, double cluster_mq, double mq_estimate, double mq_cap) { if (alns.empty()) return; double max_mq = min(mq_cap, (double)max_mapping_quality); - BaseAligner* aligner = get_aligner(); + const GSSWAligner* aligner = get_aligner(); int sub_overlaps = sub_overlaps_of_first_aln(alns, mq_overlap); switch (mapping_quality_method) { case Approx: @@ -4098,7 +4385,7 @@ void Mapper::compute_mapping_qualities(pair, vector if (pair_alns.first.empty() || pair_alns.second.empty()) return; double max_mq1 = min(mq_cap1, (double)max_mapping_quality); double max_mq2 = min(mq_cap2, (double)max_mapping_quality); - BaseAligner* aligner = get_aligner(); + const GSSWAligner* aligner = get_aligner(); int sub_overlaps1 = sub_overlaps_of_first_aln(pair_alns.first, mq_overlap); int sub_overlaps2 = sub_overlaps_of_first_aln(pair_alns.second, mq_overlap); vector frag_weights; @@ -4238,6 +4525,14 @@ vector Mapper::align_multi_internal(bool compute_unpaired_quality, // note that this will in turn call align_multi_internal on fragments of the read if (aln.sequence().size() > band_width) { // TODO: banded alignment currently doesn't support mapping qualities because it only produces one alignment + int expected = 0, desired = 1; + bool exchanged = warned_about_chunking.compare_exchange_strong(expected, desired); + if (exchanged) { + stringstream strm; + strm << "warning: Thread " << omp_get_thread_num() << " encountered sequence of length " << aln.sequence().size() << ", which is longer than the non-chunked limit of " << band_width << ". Alignments may be discontiguous. To adjust this behavior, change the band width parameter. Suppressing further warnings. " << endl; + cerr << strm.str(); + } + #ifdef debug_mapper #pragma omp critical if (debug) cerr << "switching to banded alignment" << endl; @@ -4272,7 +4567,8 @@ vector Mapper::align_multi_internal(bool compute_unpaired_quality, max_mem_length, min_mem_length, mem_reseed_length, - false, true, true, false); + false, true, true, true); // Make sure to actually fill in the longest LCP. 
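A few hunks up, the new band-width warning is guarded by warned_about_chunking.compare_exchange_strong(expected, desired), so that when many threads hit over-length reads at once only the first one prints. The pattern in isolation, with a local std::atomic standing in for the (presumably atomic) class member:

```
#include <atomic>
#include <iostream>
#include <sstream>
#include <thread>
#include <vector>

std::atomic<int> warned_about_chunking{0};

void maybe_warn(size_t seq_len, size_t band_width) {
    if (seq_len <= band_width) return;
    int expected = 0, desired = 1;
    // Only the thread that flips 0 -> 1 emits the warning; every other thread
    // sees the exchange fail and stays quiet.
    if (warned_about_chunking.compare_exchange_strong(expected, desired)) {
        std::stringstream strm;
        strm << "warning: sequence of length " << seq_len
             << " exceeds the non-chunked limit of " << band_width
             << "; alignments may be discontiguous\n";
        std::cerr << strm.str();  // build the message first so output is not interleaved
    }
}

int main() {
    std::vector<std::thread> workers;
    for (int i = 0; i < 8; ++i) {
        workers.emplace_back(maybe_warn, 5000, 1024);
    }
    for (auto& t : workers) t.join();  // exactly one warning is printed
}
```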
+ // query mem hits alignments = align_mem_multi(aln, mems, cluster_mq, longest_lcp, fraction_filtered, max_mem_length, keep_multimaps, additional_multimaps_for_quality, xdrop_alignment); } @@ -4295,7 +4591,7 @@ vector Mapper::align_multi_internal(bool compute_unpaired_quality, } #endif - annotate_with_initial_path_positions(alignments); + algorithms::annotate_with_initial_path_positions(*xindex, alignments); chrono::high_resolution_clock::time_point t2 = chrono::high_resolution_clock::now(); // set time for alignment alignments.front().set_time_used(chrono::duration_cast(t2 - t1).count()); @@ -4328,17 +4624,8 @@ set gcsa_nodes_to_positions(const vector& nodes) { return positions; } - -int64_t Mapper::graph_distance(pos_t pos1, pos_t pos2, int64_t maximum) { - return xg_distance(pos1, pos2, maximum, xindex); -} - int64_t Mapper::graph_mixed_distance_estimate(pos_t pos1, pos_t pos2, int64_t maximum) { - if (maximum) { - int64_t graph_dist = graph_distance(pos1, pos2, maximum); - if (graph_dist < maximum) return graph_dist; - } - int64_t path_dist = xindex->min_approx_path_distance(id(pos1), id(pos2)); + int64_t path_dist = algorithms::min_approx_path_distance(xindex, pos1, pos2, maximum*2); int64_t approx_dist = abs(approx_distance(pos1, pos2)); return min(path_dist, approx_dist); } @@ -4346,9 +4633,9 @@ int64_t Mapper::graph_mixed_distance_estimate(pos_t pos1, pos_t pos2, int64_t ma int64_t Mapper::approx_position(pos_t pos) { // get nodes on the forward strand if (is_rev(pos)) { - pos = reverse(pos, xg_node_length(id(pos), xindex)); + pos = reverse(pos, get_node_length(id(pos))); } - return (int64_t)xg_node_start(id(pos), xindex) + (int64_t)offset(pos); + return dynamic_cast(xindex)->node_vector_offset((nid_t)id(pos)) + offset(pos); } int64_t Mapper::approx_distance(pos_t pos1, pos_t pos2) { @@ -4381,167 +4668,6 @@ int64_t Mapper::approx_fragment_length(const Alignment& aln1, const Alignment& a } } -id_t Mapper::node_approximately_at(int64_t approx_pos) { - return xindex->node_at_seq_pos( - min(xindex->seq_length, - (size_t)max(approx_pos, (int64_t)1))); -} - -// use LRU caching to get the most-recent node positions -map > Mapper::node_positions_in_paths(gcsa::node_type node) { - return xindex->position_in_paths(gcsa::Node::id(node), gcsa::Node::rc(node), gcsa::Node::offset(node)); -} - -Alignment Mapper::walk_match(const string& seq, pos_t pos) { - //cerr << "in walk match with " << seq << " " << seq.size() << " " << pos << endl; - Alignment aln; - aln.set_sequence(seq); - auto alns = walk_match(aln, seq, pos); - if (!alns.size()) { - //cerr << "no alignments returned from walk match with " << seq << " " << seq.size() << " " << pos << endl; - //assert(false); - return aln; - } - aln = alns.front(); // take the first one we found - //assert(alignment_to_length(aln) == alignment_from_length(aln)); - if (alignment_to_length(aln) != alignment_from_length(aln) - || alignment_to_length(aln) != seq.size()) { - //cerr << alignment_to_length(aln) << " is not " << seq.size() << endl; - //cerr << pb2json(aln) << endl; - //assert(false); - aln.clear_path(); - } -#ifdef debug_mapper - if (debug) { - cerr << "walk_match result " << pb2json(aln) << endl; - if (!check_alignment(aln)) { - cerr << "aln is invalid!" 
<< endl; - exit(1); - } - } -#endif - return aln; -} - -vector Mapper::walk_match(const Alignment& base, const string& seq, pos_t pos) { - //cerr << "in walk_match " << seq << " from " << pos << " with base " << pb2json(base) << endl; - // go to the position in the xg index - // and step in the direction given - // until we exhaust our sequence - // or hit another node - vector alns; - Alignment aln = base; - Path& path = *aln.mutable_path(); - Mapping* mapping = path.add_mapping(); - *mapping->mutable_position() = make_position(pos); -#ifdef debug_mapper -#pragma omp critical - if (debug) cerr << "walking match for seq " << seq << " at position " << pb2json(*mapping) << endl; -#endif - // get the first node we match - int total = 0; - size_t match_len = 0; - for (size_t i = 0; i < seq.size(); ++i) { - char c = seq[i]; - //cerr << string(base.path().mapping_size(), ' ') << pos << " @ " << i << " on " << c << endl; - auto nexts = next_pos_chars(pos); - // we can have a match on the current node - if (nexts.size() == 1 && id(nexts.begin()->first) == id(pos)) { - pos_t npos = nexts.begin()->first; - // check that the next position would match - if (i+1 < seq.size()) { - // we can't step, so we break - //cerr << "Checking if " << pos_char(npos) << " != " << seq[i+1] << endl; - if (pos_char(npos) != seq[i+1]) { -#ifdef debug_mapper -#pragma omp critical - if (debug) cerr << "MEM does not match position, returning without creating alignment" << endl; -#endif - return alns; - } - } - // otherwise we step our counters - ++match_len; - ++get_offset(pos); - } else { // or we go into the next node - // we must be going into another node - // emit the mapping for this node - //cerr << "we are going into a new node" << endl; - // finish the last node - { - // we must have matched / we already checked - ++match_len; - Edit* edit = mapping->add_edit(); - edit->set_from_length(match_len); - edit->set_to_length(match_len); - // reset our counter - match_len = 0; - } - // find the next node that matches our MEM - bool got_match = false; - if (i+1 < seq.size()) { - //cerr << "nexts @ " << i << " " << nexts.size() << endl; - for (auto& p : nexts) { - //cerr << " next : " << p.first << " " << p.second << " (looking for " << seq[i+1] << ")" << endl; - if (p.second == seq[i+1]) { - if (!got_match) { - pos = p.first; - got_match = true; - } else { - auto v = walk_match(aln, seq.substr(i+1), p.first); - if (v.size()) { - alns.reserve(alns.size() + distance(v.begin(), v.end())); - alns.insert(alns.end(), v.begin(), v.end()); - } - } - } - } - if (!got_match) { - // this matching ends here - // and we haven't finished matching - // thus this path doesn't contain the match - //cerr << "got no match" << endl; - return alns; - } - - // set up a new mapping - mapping = path.add_mapping(); - *mapping->mutable_position() = make_position(pos); - } else { - //cerr << "done!" 
<< endl; - } - } - } - if (match_len) { - Edit* edit = mapping->add_edit(); - edit->set_from_length(match_len); - edit->set_to_length(match_len); - } - alns.push_back(aln); -#ifdef debug_mapper -#pragma omp critical - if (debug) { - cerr << "walked alignment(s):" << endl; - for (auto& aln : alns) { - cerr << pb2json(aln) << endl; - } - } -#endif - //cerr << "returning " << alns.size() << endl; - return alns; -} - -// convert one mem into a set of alignments, one for each exact match -vector Mapper::mem_to_alignments(MaximalExactMatch& mem) { - vector alns; - const string seq = mem.sequence(); - for (auto& node : mem.nodes) { - pos_t pos = make_pos_t(node); - alns.emplace_back(walk_match(seq, pos)); - } - return alns; -} - Position Mapper::alignment_end_position(const Alignment& aln) { if (!aln.has_path()) { Position pos; return pos; } Alignment b; @@ -4613,12 +4739,12 @@ Alignment Mapper::patch_alignment(const Alignment& aln, int max_patch_length, bo int max_score = -std::numeric_limits::max(); for (auto& pos : band_ref_pos) { //cerr << "trying position " << pos << endl; - pos_t pos_rev = reverse(pos, xg_node_length(id(pos), xindex)); - Graph graph = xindex->graph_context_id(pos_rev, band.sequence().size()*extend_fwd); - graph.MergeFrom(xindex->graph_context_id(pos, band.sequence().size()*extend_rev)); - sort_by_id_dedup_and_clean(graph); - bool acyclic_and_sorted = is_id_sortable(graph) && !has_inversion(graph); - auto proposed_band = align_maybe_flip(band, graph, is_rev(pos), true, acyclic_and_sorted, false, xdrop_alignment); + bdsg::HashGraph graph; + algorithms::extract_context(*xindex, graph, xindex->get_handle(id(pos), is_rev(pos)), offset(pos), + band.sequence().size()*extend_fwd, true, false); + algorithms::extract_context(*xindex, graph, xindex->get_handle(id(pos), is_rev(pos)), offset(pos), + band.sequence().size()*extend_rev, false, true); + auto proposed_band = align_maybe_flip(band, graph, is_rev(pos), true, false, xdrop_alignment); if (proposed_band.score() > max_score) { band = proposed_band; max_score = band.score(); } } // TODO @@ -4631,13 +4757,13 @@ Alignment Mapper::patch_alignment(const Alignment& aln, int max_patch_length, bo int max_score = -std::numeric_limits::max(); for (auto& pos : band_ref_pos) { //cerr << "trying position " << pos << endl; - Graph graph = xindex->graph_context_id(pos, band.sequence().size()*extend_fwd); - pos_t pos_rev = reverse(pos, xg_node_length(id(pos), xindex)); - graph.MergeFrom(xindex->graph_context_id(pos_rev, band.sequence().size()*extend_rev)); - sort_by_id_dedup_and_clean(graph); + bdsg::HashGraph graph; + algorithms::extract_context(*xindex, graph, xindex->get_handle(id(pos), is_rev(pos)), offset(pos), + band.sequence().size()*extend_fwd, true, false); + algorithms::extract_context(*xindex, graph, xindex->get_handle(id(pos), is_rev(pos)), offset(pos), + band.sequence().size()*extend_rev, false, true); //cerr << "on graph " << pb2json(graph) << endl; - bool acyclic_and_sorted = is_id_sortable(graph) && !has_inversion(graph); - auto proposed_band = align_maybe_flip(band, graph, is_rev(pos), true, acyclic_and_sorted, false, xdrop_alignment); + auto proposed_band = align_maybe_flip(band, graph, is_rev(pos), true, false, xdrop_alignment); if (proposed_band.score() > max_score) { band = proposed_band; max_score = band.score(); } } } else { @@ -4645,13 +4771,13 @@ Alignment Mapper::patch_alignment(const Alignment& aln, int max_patch_length, bo int max_score = -std::numeric_limits::max(); for (auto& pos : band_ref_pos) { //cerr << "trying 
position " << pos << endl; - Graph graph = xindex->graph_context_id(pos, band.sequence().size()*extend_fwd); - pos_t pos_rev = reverse(pos, xg_node_length(id(pos), xindex)); - graph.MergeFrom(xindex->graph_context_id(pos_rev, band.sequence().size()*extend_rev)); - sort_by_id_dedup_and_clean(graph); + bdsg::HashGraph graph; + algorithms::extract_context(*xindex, graph, xindex->get_handle(id(pos), is_rev(pos)), offset(pos), + band.sequence().size()*extend_fwd, true, false); + algorithms::extract_context(*xindex, graph, xindex->get_handle(id(pos), is_rev(pos)), offset(pos), + band.sequence().size()*extend_rev, false, true); //cerr << "on graph " << pb2json(graph) << endl; - bool acyclic_and_sorted = is_id_sortable(graph) && !has_inversion(graph); - auto proposed_band = align_maybe_flip(band, graph, is_rev(pos), true, acyclic_and_sorted, false, xdrop_alignment); + auto proposed_band = align_maybe_flip(band, graph, is_rev(pos), true, false, xdrop_alignment); if (proposed_band.score() > max_score) { band = proposed_band; max_score = band.score(); } } } @@ -4674,7 +4800,6 @@ Alignment Mapper::patch_alignment(const Alignment& aln, int max_patch_length, bo && from_length >= min_cluster_length && band.identity() > min_identity) { band_ref_pos.clear(); - //cerr << "thing worked " << pb2json(band) << endl; // todo... step our position back just a little to match the banding // right now we're relying on the chunkiness of the graph to get this for us // strip back a little @@ -4695,6 +4820,12 @@ Alignment Mapper::patch_alignment(const Alignment& aln, int max_patch_length, bo } else { //cerr << "clearing the path" << endl; band.clear_path(); + /* + Edit* e = band.mutable_path()->add_mapping()->add_edit(); + e->set_to_length(band.sequence().size()); + band.clear_score(); + band.clear_identity(); + */ // TODO try to align over a bigger chunk after this } } @@ -4713,7 +4844,7 @@ Alignment Mapper::patch_alignment(const Alignment& aln, int max_patch_length, bo // walk graph to estimate next position based on assumption we are ~ homologous to the graph set next_pos; for (auto& pos : band_ref_pos) { - for (auto& next : positions_bp_from(pos, band.sequence().size(), false)) { + for (auto& next : algorithms::jump_along_closest_path(xindex, pos, band.sequence().size(), band.sequence().size())) { next_pos.insert(next); } } @@ -4730,10 +4861,13 @@ Alignment Mapper::patch_alignment(const Alignment& aln, int max_patch_length, bo } } } + /* - cerr << "done bands" << endl; - for (auto& band : bands) { - cerr << "band: " << pb2json(band) << endl; + if (debug) { + cerr << "done bands" << endl; + for (auto& band : bands) { + cerr << "band: " << pb2json(band) << endl; + } } */ patch = simplify(merge_alignments(bands)); @@ -4749,9 +4883,8 @@ Alignment Mapper::patch_alignment(const Alignment& aln, int max_patch_length, bo assert(false); } #endif - //cerr << "adding " << pb2json(patch) << endl; if (patched.path().mapping_size()) { - extend_alignment(patched, patch); + extend_alignment(patched, patch, true); } else { patched = patch; } @@ -4809,85 +4942,22 @@ void Mapper::remove_full_length_bonuses(Alignment& aln) { int32_t Mapper::score_alignment(const Alignment& aln, bool use_approx_distance) { // Find the right aligner to score with - BaseAligner* aligner = get_aligner(); + const GSSWAligner* aligner = get_aligner(); if (use_approx_distance) { // Use an approximation - return aligner->score_gappy_alignment(aln, [&](pos_t last, pos_t next, size_t max_search) { + return aligner->score_discontiguous_alignment(aln, [&](pos_t 
last, pos_t next, size_t max_search) { return approx_distance(last, next); }, strip_bonuses); } else { // Use the exact method, and if we hit the limit, fall back to the approximate method. - return aligner->score_gappy_alignment(aln, [&](pos_t last, pos_t next, size_t max_search) { + return aligner->score_discontiguous_alignment(aln, [&](pos_t last, pos_t next, size_t max_search) { return graph_mixed_distance_estimate(last, next, min(32, (int)max_search)); }, strip_bonuses); } } -// make a perfect-match alignment out of a vector of MEMs which each have only one recorded hit -// use the base alignment sequence (which the SMEMs relate to) to fill in the gaps -Alignment Mapper::mems_to_alignment(const Alignment& aln, const vector& mems) { - // base case--- empty alignment - if (mems.empty()) { - Alignment aln; return aln; - } - vector alns; - // get reference to the start and end of the sequences - string::const_iterator seq_begin = aln.sequence().begin(); - string::const_iterator seq_end = aln.sequence().end(); - // we use this to track where we need to add sequence - string::const_iterator last_end = seq_begin; - for (int i = 0; i < mems.size(); ++i) { - auto& mem = mems.at(i); - //cerr << "looking at " << mem.sequence() << endl; - // this mem is contained in the last - if (mem.end <= last_end) { - continue; - } - // handle unaligned portion between here and the last SMEM or start of read - if (mem.begin > last_end) { - alns.emplace_back(); - alns.back().set_sequence(aln.sequence().substr(last_end - seq_begin, mem.begin - last_end)); - } - Alignment aln = mem_to_alignment(mem); - // find and trim overlap with previous - if (i > 0) { - // use the end of the last mem we touched (we may have skipped several) - int overlap = last_end - mem.begin; - if (overlap > 0) { - aln = strip_from_start(aln, overlap); - } - } - alns.push_back(aln); - last_end = mem.end; - } - // handle unaligned portion at end of read - int start = last_end - seq_begin; - int length = seq_end - (seq_begin + start); - - alns.emplace_back(); - alns.back().set_sequence(aln.sequence().substr(start, length)); - - auto alnm = simplify(merge_alignments(alns)); - *alnm.mutable_quality() = aln.quality(); - alnm.set_name(aln.name()); - alnm.set_score(score_alignment(alnm)); - alnm.set_identity(identity(alnm.path())); - return alnm; -} - -// convert one mem into an alignment; validates that only one node is given -Alignment Mapper::mem_to_alignment(const MaximalExactMatch& mem) { - const string seq = mem.sequence(); - if (mem.nodes.size() > 1) { - cerr << "[vg::Mapper] warning: generating first alignment from MEM with multiple recorded hits" << endl; - } - auto& node = mem.nodes.front(); - pos_t pos = make_pos_t(node); - return walk_match(seq, pos); -} - const int balanced_stride(int read_length, int kmer_size, int stride) { double r = read_length; double k = kmer_size; @@ -4913,7 +4983,7 @@ const vector balanced_kmers(const string& seq, const int kmer_size, cons AlignmentChainModel::AlignmentChainModel( vector >& bands, Mapper* mapper, - const function > >&, const map > >&, int64_t)>& transition_weight, + const function > >&, const unordered_map > >&, int64_t)>& transition_weight, int vertex_band_width, int position_depth, int max_connections) { @@ -4928,11 +4998,11 @@ AlignmentChainModel::AlignmentChainModel( v.aln = &aln; v.band_begin = offset; v.band_idx = idx; - v.weight = aln.sequence().size() + aln.score() + aln.mapping_quality(); + v.weight = aln.score(); v.prev = nullptr; v.score = 0; - v.positions = 
mapper->alignment_path_offsets(aln); - v.positions[""].push_back(make_pair(mapper->approx_alignment_position(aln), false)); + v.positions = algorithms::alignment_path_offsets(*mapper->xindex, aln, true, false); + v.positions[handlegraph::as_path_handle(0)].push_back(make_pair(mapper->approx_alignment_position(aln), false)); model.push_back(v); // mapper->counter[1]++; } @@ -4946,11 +5016,12 @@ AlignmentChainModel::AlignmentChainModel( } for (vector::iterator v = model.begin(); v != model.end(); ++v) { for (auto u = v+1; u != model.end(); ++u) { - if (v->next_cost.size() < max_connections && u->prev_cost.size() < max_connections) { + if (v->band_idx != u->band_idx && + v->next_cost.size() < max_connections && u->prev_cost.size() < max_connections) { if (v->band_idx + vertex_band_width >= u->band_idx) { // bench_start(mapper->bench[2]); double weight = transition_weight(*v->aln, *u->aln, v->positions, u->positions, - u->band_begin - v->band_begin+v->aln->sequence().size()); + u->band_begin - (v->band_begin+v->aln->sequence().size())); // bench_end(mapper->bench[2]); if (weight > -std::numeric_limits::max()) { v->next_cost.push_back(make_pair(&*u, weight)); @@ -5066,6 +5137,7 @@ vector AlignmentChainModel::traceback(const Alignment& read, int alt_ } aln_trace[vertex.band_idx] = *vertex.aln; } + //display_dot(cerr, vertex_trace); } vector alns; for (auto& trace : traces) { @@ -5101,6 +5173,54 @@ void AlignmentChainModel::display(ostream& out) { } } +void AlignmentChainModel::display_dot(ostream& out, vector vertex_trace) { + map vertex_ids; + int i = 0; + for (auto& vertex : model) { + vertex_ids[&vertex] = ++i; + } + map in_trace; + i = 0; + for (auto& v : vertex_trace) { + in_trace[v] = ++i; + } + out << "digraph memchain {" << endl; + out << "rankdir=LR;" << endl; + for (auto& vertex : model) { + out << vertex_ids[&vertex] + << " [label=\"" + << vertex.aln->sequence() + << "\nid:" << vertex_ids[&vertex] + << " band:" << vertex.band_idx + << " weight:" << vertex.weight + << " score:" << vertex.score; + out << "\npositions: "; + for (auto& p : vertex.positions) { + for (auto& q : p.second) { + out << as_integer(p.first) << ":" << q.first << (q.second?"-":"+") << " "; + } + } + pos_t p_start = make_pos_t(path_start_position(vertex.aln->path())); + pos_t p_end = make_pos_t(path_end_position(vertex.aln->path())); + out << "\nstart: " << id(p_start) << (is_rev(p_start)?"-":"+") << ":" << offset(p_start) << " "; + out << "end: " << id(p_end) << (is_rev(p_end)?"-":"+") << ":" << offset(p_end) << " "; + out << "\""; + if (in_trace.find(&vertex) != in_trace.end()) { + out << ",color=red"; + } + out << ",shape=box];" << endl; + for (auto& p : vertex.next_cost) { + out << vertex_ids[&vertex] << " -> " << vertex_ids[p.first] << " [label=\"" << p.second << "\""; + if (in_trace.find(&vertex) != in_trace.end() && in_trace.find(p.first) != in_trace.end() && + in_trace[&vertex] - 1 == in_trace[p.first]) { + out << ",color=red,fontcolor=red"; + } + out << "];" << endl; + } + } + out << "}" << endl; +} + FragmentLengthDistribution::FragmentLengthDistribution(size_t maximum_sample_size, size_t reestimation_frequency, double robust_estimation_fraction) : @@ -5173,7 +5293,7 @@ void FragmentLengthDistribution::estimate_distribution() { mu = sum / count; double raw_var = sum_of_sqs / count - mu * mu; // apply method of moments estimation using the appropriate truncated normal distribution - double a = normal_inverse_cdf(1.0 - 0.5 * (1.0 - robust_estimation_fraction)); + double a = Phi_inv(1.0 - 0.5 * (1.0 - 
robust_estimation_fraction)); sigma = sqrt(raw_var / (1.0 - 2.0 * a * normal_pdf(a, 0.0, 1.0))); } @@ -5181,7 +5301,7 @@ double FragmentLengthDistribution::mean() const { return mu; } -double FragmentLengthDistribution::stdev() const { +double FragmentLengthDistribution::std_dev() const { return sigma; } diff --git a/src/mapper.hpp b/src/mapper.hpp index e83026b8035..cf51859e1b4 100644 --- a/src/mapper.hpp +++ b/src/mapper.hpp @@ -7,26 +7,18 @@ #include #include "omp.h" #include "vg.hpp" -#include "xg.hpp" -#include "index.hpp" +#include "bdsg/hash_graph.hpp" #include #include #include -#include "alignment.hpp" -#include "path.hpp" #include "position.hpp" -#include "xg_position.hpp" -#include "lru_cache.h" -#include "json2pb.h" -#include "entropy.hpp" -#include "gssw_aligner.hpp" +#include "vg/io/json2pb.h" +#include "aligner.hpp" #include "mem.hpp" +#include "mem_accelerator.hpp" #include "cluster.hpp" -#include "graph.hpp" -#include "translator.hpp" // TODO: pull out ScoreProvider into its own file #include "haplotypes.hpp" -#include "algorithms/topological_sort.hpp" // #define BENCH // #include "bench.h" @@ -51,7 +43,7 @@ class AlignmentChainModelVertex { vector > prev_cost; // for backward double weight; double score; - map > > positions; + unordered_map > > positions; int band_begin; int band_idx; AlignmentChainModelVertex* prev; @@ -66,13 +58,13 @@ class AlignmentChainModelVertex { class AlignmentChainModel { public: vector model; - map::iterator> > > positions; + unordered_map::iterator> > > positions; set::iterator> redundant_vertexes; vector unaligned_bands; AlignmentChainModel( vector >& bands, Mapper* mapper, - const function > >&, const map > >&, int64_t)>& transition_weight, + const function > >&, const unordered_map > >&, int64_t)>& transition_weight, int vertex_band_width = 10, int position_depth = 1, int max_connections = 30); @@ -80,6 +72,7 @@ class AlignmentChainModel { AlignmentChainModelVertex* max_vertex(void); vector traceback(const Alignment& read, int alt_alns, bool paired, bool debug); void display(ostream& out); + void display_dot(ostream& out, vector vertex_trace); void clear_scores(void); }; @@ -113,7 +106,7 @@ class FragmentLengthDistribution { double mean() const; /// Robust standard deviation of the distribution observed so far - double stdev() const; + double std_dev() const; /// Returns true if the maximum sample size has been reached, which finalizes the /// distribution estimate @@ -147,50 +140,71 @@ class FragmentLengthDistribution { void estimate_distribution(); }; - -class BaseMapper : public Progressive { - + +/** + * A class trait/mixin which defines a mapper's paired end distribution support. + * + * Doesn't actually define the paired-end mapping methods. + */ +class PairedEndMapper { public: - // Make a Mapper that pulls from an XG succinct graph and a GCSA2 kmer - // index + LCP array, and which can score reads against haplotypes using - // the given ScoreProvider. 
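Tying together the fragment-length changes in this diff: fragment_length_pval now returns a two-sided normal tail probability, 1 - (Phi(x) - Phi(-x)) with x = |len - mean| / stdev, and estimate_distribution fits the mean and std_dev robustly from the central fraction of observations. A small standalone check of the p-value arithmetic, using std::erf in place of the Phi helper the diff calls:

```
#include <cmath>
#include <cstdio>

// Standard normal CDF via the error function.
double Phi(double z) {
    return 0.5 * (1.0 + std::erf(z / std::sqrt(2.0)));
}

// Two-sided p-value for a fragment length under N(mean, stdev^2),
// matching the form used in the diff: 1 - (Phi(x) - Phi(-x)).
double fragment_length_pval(double len, double mean, double stdev) {
    double x = std::abs(len - mean) / stdev;
    return 1.0 - (Phi(x) - Phi(-x));   // equivalently erfc(x / sqrt(2))
}

int main() {
    double mean = 500.0, stdev = 50.0;
    for (double len : {500.0, 550.0, 600.0, 700.0}) {
        std::printf("len %.0f -> pval %.4f\n", len, fragment_length_pval(len, mean, stdev));
    }
    // A consistency check like the one in pair_consistent() would then require
    // fragment_length_pval(len, ...) to exceed some threshold (e.g. 1e-3).
}
```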
- BaseMapper(xg::XG* xidex, gcsa::GCSA* g, gcsa::LCPArray* a, haplo::ScoreProvider* haplo_score_provider = nullptr); - BaseMapper(void); - ~BaseMapper(void); - - double estimate_gc_content(void); - - int random_match_length(double chance_random); - - void load_scoring_matrix(std::ifstream& matrix_stream); - void set_alignment_scores(int8_t match, int8_t mismatch, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus, - double haplotype_consistency_exponent = 1, uint32_t max_gap_length = default_max_gap_length); - - // TODO: setting alignment threads could mess up the internal memory for how many threads to reset to + /// Set parameters for estimating fragment length distribution. + /// TODO: setting alignment threads after this could mess up the internal memory for how many threads to reset to void set_fragment_length_distr_params(size_t maximum_sample_size = 1000, size_t reestimation_frequency = 1000, double robust_estimation_fraction = 0.95); - - /// Set the alignment thread count, updating internal data structures that - /// are per thread. Note that this resets aligner scores to their default values! - void set_alignment_threads(int new_thread_count); - - void set_cache_size(int new_cache_size); - + /// Returns true if fragment length distribution has been fixed bool has_fixed_fragment_length_distr(); /// Use the given fragment length distribution parameters instead of /// estimating them. void force_fragment_length_distr(double mean, double stddev); + +protected: + /// Holds the actual fragment length distribution and estimation information + FragmentLengthDistribution fragment_length_distr; + +}; + +/** + * Base class for basic mapping functionality shared between the Mapper, MultipathMapper, etc. + * Handles holding on to the random access and text indexes needed for mapping operations. + */ +class BaseMapper : public AlignerClient, public PairedEndMapper { + +public: + /** + * Make a BaseMapper that pulls from an PathPositionHandleGraph succinct graph and a GCSA2 kmer + * index + LCP array, and which can score reads against haplotypes using + * the given ScoreProvider. + * + * If the GCSA and LCPArray are null, cannot do search, only alignment. + */ + BaseMapper(PathPositionHandleGraph* xidex, gcsa::GCSA* g, gcsa::LCPArray* a, haplo::ScoreProvider* haplo_score_provider = nullptr); + BaseMapper(void); + + /// We need to be able to estimate the GC content from the GCSA index in the constructor. + /// The given index may be null. + static double estimate_gc_content(const gcsa::GCSA* gcsa); + + int random_match_length(double chance_random); + + + /// Override alignment score setting to support haplotype consistency exponent + void set_alignment_scores(int8_t match, int8_t mismatch, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus, + double haplotype_consistency_exponent = 1); + /// Same, but loading a 4x4 substitution score matrix from a stream + void set_alignment_scores(istream& matrix_stream, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus, + double haplotype_consistency_exponent = 1); + // MEM-based mapping // find maximal exact matches // These are SMEMs by definition when shorter than the max_mem_length or GCSA2 order. // Designating reseed_length returns minimally-more-frequent sub-MEMs in addition to SMEMs when SMEM is >= reseed_length. // Minimally-more-frequent sub-MEMs are MEMs contained in an SMEM that have occurrences outside of the SMEM. 
- // SMEMs and sub-MEMs will be automatically filled with the nodes they contain, which the occurrences of the sub-MEMs - // that are inside SMEM hits filtered out. (filling sub-MEMs currently requires an XG index) + // SMEMs and sub-MEMs will be automatically filled with the nodes they contain vector find_mems_deep(string::const_iterator seq_begin, @@ -214,6 +228,31 @@ class BaseMapper : public Progressive { int min_mem_length = 1, int reseed_length = 0); + vector + find_stripped_matches(string::const_iterator seq_begin, + string::const_iterator seq_end, + size_t strip_length, + size_t max_match_length, + size_t target_count); + + // finds MEMs where a pre-specified number of low-quality bases are + // allowed to be any base. if the optional vector is provided, then it + // will be filled to include all of the places that each returned MEM + // mismatches the graph sequence. otherwise, the MEMs are walked out + // and split into exact matches (can be expensive) + vector + find_fanout_mems(string::const_iterator seq_begin, + string::const_iterator seq_end, + string::const_iterator qual_begin, + int max_fans_out, + char max_fanout_base_quality, + vector>>* mem_fanout_breaks = nullptr); + + vector walk_fanout_path(string::const_iterator begin, + string::const_iterator end, + const deque>& fanout_breaks, + gcsa::node_type pos); + /// identifies tracts of order-length MEMs that were unfilled because their hit count was above the max /// and fills one MEM in the tract (the one with the smallest hit count), assumes MEMs are lexicographically /// ordered by read index @@ -238,12 +277,23 @@ class BaseMapper : public Progressive { double fast_reseed_length_diff = 0.45; // how much smaller than its parent a sub-MEM can be in the fast reseed algorithm bool adaptive_reseed_diff = true; // use an adaptive length difference algorithm in reseed algorithm double adaptive_diff_exponent = 0.065; // exponent that describes limiting behavior of adaptive diff algorithm - int hit_max = 0; // ignore or MEMs with more than this many hits + int hit_max = 0; // only query at most this many hits for a MEM (0 for no limit) + int hard_hit_max = 0; // don't query any hits for MEMs with this many occurrences or more (0 for no limit) bool use_approx_sub_mem_count = false; bool prefilter_redundant_hits = true; int max_sub_mem_recursion_depth = 2; + bool use_greedy_mem_restarts = false; + int greedy_restart_min_length = 40; + int greedy_restart_max_count = 2; + int greedy_restart_max_lcp = 0; // 0 for no max + bool greedy_restart_assume_substitution = false; + bool filter_short_mems = false; + double short_mem_filter_factor = 0.45; int unpaired_penalty = 17; bool precollapse_order_length_hits = true; + double avg_node_length = 0; + size_t total_seq_length = 0; + int fanout_length_threshold = 0; // The recombination rate (negative log per-base recombination probability) for haplotype-aware mapping double recombination_penalty = 20.7; // 9 * 2.3 = 20.7 @@ -252,15 +302,16 @@ class BaseMapper : public Progressive { // Does NOT (yet) remove the haplotype consistency bonus. bool strip_bonuses; bool assume_acyclic; // the indexed graph is acyclic - bool adjust_alignments_for_base_quality; // use base quality adjusted alignments + MappingQualityMethod mapping_quality_method; // how to compute mapping qualities int max_mapping_quality; // the cap for mapping quality + bool exclude_unaligned = false; + /// Set to enable debugging messages to cerr from the mapper, so a user can understand why a read maps the way it does. 
bool debug = false; -protected: /// Locate the sub-MEMs contained in the last MEM of the mems vector that have ending positions /// before the end the next SMEM, label each of the sub-MEMs with the indices of all of the SMEMs /// that contain it @@ -284,30 +335,10 @@ class BaseMapper : public Progressive { int min_sub_mem_length, vector>>& sub_mems_out); - /// finds the nodes of sub MEMs that do not occur inside parent MEMs, each sub MEM should be associated - /// with a vector of the indices of the SMEMs that contain it in the parent MEMs vector - void fill_nonredundant_sub_mem_nodes(vector& parent_mems, - vector > >::iterator sub_mem_records_begin, - vector > >::iterator sub_mem_records_end); - - /// fills a vector where each element contains the set of positions in the graph that the - /// MEM touches at that index for the first MEM hit in the GCSA array - void first_hit_positions_by_index(MaximalExactMatch& mem, - vector>& positions_by_index_out); - - /// fills a vector where each element contains the set of positions in the graph that the - /// MEM touches at that index starting at a given hit - void mem_positions_by_index(MaximalExactMatch& mem, pos_t hit_pos, - vector>& positions_by_index_out); - - // use the xg index to get a character at a particular position (rc or foward) - char pos_char(pos_t pos); - - // the next positions and their characters following the same strand of the graph - map next_pos_chars(pos_t pos); - - // get the positions some specific distance from the given position (in the forward direction) - set positions_bp_from(pos_t pos, int distance, bool rev); + /// If possible, use the MEMAcclerator to get the initial range for a MEM and update the cursor + /// accordingly. If this is not possible, return the full GCSA2 range and leave the cursor unaltered. + gcsa::range_type accelerate_mem_query(string::const_iterator begin, + string::const_iterator& cursor) const; // Use the GCSA index to look up the sequence set sequence_positions(const string& seq); @@ -315,11 +346,6 @@ class BaseMapper : public Progressive { // Algorithm for choosing an adaptive reseed length based on the length of the parent MEM size_t get_adaptive_min_reseed_length(size_t parent_mem_length); - int alignment_threads; // how many threads will *this* mapper use. Should not be set directly. - - void init_aligner(int8_t match, int8_t mismatch, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus, uint32_t max_gap_length = default_max_gap_length); - void clear_aligners(void); - /// Score all of the alignments in the vector for haplotype consistency. If /// all of them can be scored (i.e. none of them visit nodes/edges with no /// haplotypes), adjust all of their scores to reflect haplotype @@ -331,11 +357,12 @@ class BaseMapper : public Progressive { thread_local static vector adaptive_reseed_length_memo; // xg index - xg::XG* xindex = nullptr; + PathPositionHandleGraph* xindex = nullptr; // GCSA index and its LCP array gcsa::GCSA* gcsa = nullptr; gcsa::LCPArray* lcp = nullptr; + MEMAccelerator* accelerator = nullptr; // Haplotype score provider, if any, for determining haplotype concordance haplo::ScoreProvider* haplo_score_provider = nullptr; @@ -344,25 +371,6 @@ class BaseMapper : public Progressive { // 0 = no haplotype consistency scoring done. 
// 1 = multiply in haplotype likelihood once when computing alignment score double haplotype_consistency_exponent = 1; - - FragmentLengthDistribution fragment_length_distr; - - /// Get the appropriate aligner to use, based on - /// adjust_alignments_for_base_quality. By setting have_qualities to false, - /// you can force the non-quality-adjusted aligner, for reads that lack - /// quality scores. - BaseAligner* get_aligner(bool have_qualities = true) const; - - // Sometimes you really do need the two kinds of aligners, to pass to code - // that expects one or the other. - QualAdjAligner* get_qual_adj_aligner() const; - Aligner* get_regular_aligner() const; - -private: - // GSSW aligners - QualAdjAligner* qual_adj_aligner = nullptr; - Aligner* regular_aligner = nullptr; - }; /** @@ -375,7 +383,10 @@ class FragmentLengthStatistics { void record_fragment_configuration(const Alignment& aln1, const Alignment& aln2, Mapper* mapper); string fragment_model_str(void); - void save_frag_lens_to_alns(Alignment& aln1, Alignment& aln2, const map& approx_frag_lengths, bool is_consistent); + void save_frag_lens_to_alns(Alignment& aln1, Alignment& aln2, + const unordered_map& approx_frag_lengths, + PathPositionHandleGraph* xindex, + bool is_consistent); // These functions are the authorities on the estimated parameters double fragment_length_stdev(void); @@ -458,20 +469,18 @@ class Mapper : public BaseMapper { protected: Alignment align_to_graph(const Alignment& aln, - Graph& graph, - size_t max_query_graph_ratio, + HandleGraph& graph, + bool do_flip, bool traceback, - bool acyclic_and_sorted, bool pinned_alignment = false, bool pin_left = false, bool banded_global = false, bool keep_bonuses = true); Alignment align_to_graph(const Alignment& aln, - Graph& graph, + HandleGraph& graph, const vector& mems, - size_t max_query_graph_ratio, + bool do_flip, bool traceback, - bool acyclic_and_sorted, bool pinned_alignment = false, bool pin_left = false, bool banded_global = false, @@ -481,23 +490,15 @@ class Mapper : public BaseMapper { // make the bands used in banded alignment vector make_bands(const Alignment& read, int band_width, int band_overlap, vector>& to_strip); public: - // Make a Mapper that pulls from an XG succinct graph, a GCSA2 kmer index + + // Make a Mapper that pulls from an PathPositionHandleGraph succinct graph, a GCSA2 kmer index + // LCP array, and an optional haplotype score provider. - Mapper(xg::XG* xidex, gcsa::GCSA* g, gcsa::LCPArray* a, haplo::ScoreProvider* haplo_score_provider = nullptr); + Mapper(PathPositionHandleGraph* xidex, gcsa::GCSA* g, gcsa::LCPArray* a, haplo::ScoreProvider* haplo_score_provider = nullptr); Mapper(void); ~Mapper(void); - map > node_positions_in_paths(gcsa::node_type node); - // a collection of read pairs which we'd like to realign once we have estimated the fragment_size vector > imperfect_pairs_to_retry; - double graph_entropy(void); - - // Use the xg index to get the first position of an alignment on a reference path. Thread safe. 
- void annotate_with_initial_path_positions(Alignment& aln) const; - void annotate_with_initial_path_positions(vector& alns) const; - // Return true of the two alignments are consistent for paired reads, and false otherwise bool alignments_consistent(const map& pos1, const map& pos2, @@ -511,24 +512,16 @@ class Mapper : public BaseMapper { /// use the fragment configuration statistics to rescue more precisely pair pair_rescue(Alignment& mate1, Alignment& mate2, bool& tried1, bool& tried2, int match_score, int full_length_bonus, bool traceback, bool xdrop_alignment); - set resolve_paired_mems(vector& mems1, - vector& mems2); - // uses heuristic clustering based on node id ranges to find alignment targets, and aligns vector mems_id_clusters_to_alignments(const Alignment& alignment, vector& mems, int additional_multimaps); // use mapper parameters to determine which clusters we should drop set* > clusters_to_drop(const vector >& clusters); - // takes the input alignment (with seq, etc) so we have reference to the base sequence - // for reconstruction the alignments from the SMEMs - Alignment mems_to_alignment(const Alignment& aln, const vector& mems); - Alignment mem_to_alignment(const MaximalExactMatch& mem); - /// Use the scoring provided by the internal aligner to re-score the - /// alignment, scoring gaps between nodes using graph distance from the XG + /// alignment, scoring gaps between nodes using graph distance from the PathPositionHandleGraph /// index. Can use either approximate or exact (with approximate fallback) - /// XG-based distance estimation. Will strip out bonuses if the appropriate + /// PathPositionHandleGraph-based distance estimation. Will strip out bonuses if the appropriate /// Mapper flag is set. /// Does not apply a haplotype consistency bonus, as this function is intended for alignments with large gaps. int32_t score_alignment(const Alignment& aln, bool use_approx_distance = false); @@ -538,20 +531,17 @@ class Mapper : public BaseMapper { // run through the alignment and attempt to align unaligned parts of the alignment to the graph in the region where they are anchored Alignment patch_alignment(const Alignment& aln, int max_patch_length, bool trim_internal_deletions = true, bool xdrop_alignment = false); - // Get the graph context of a particular cluster, not expanding beyond the middles of MEMs. 
- VG cluster_subgraph_strict(const Alignment& aln, const vector& mems); // for aligning to a particular MEM cluster Alignment align_cluster(const Alignment& aln, const vector& mems, bool traceback, bool xdrop_alignment = false); // compute the uniqueness metric based on the MEMs in the cluster double compute_uniqueness(const Alignment& aln, const vector& mems); // wraps align_to_graph with flipping - Alignment align_maybe_flip(const Alignment& base, Graph& graph, bool flip, bool traceback, bool acyclic_and_sorted, bool banded_global = false, bool xdrop_alignment = false); - Alignment align_maybe_flip(const Alignment& base, Graph& graph, const vector& mems, bool flip, bool traceback, bool acyclic_and_sorted, bool banded_global = false, bool xdrop_alignment = false); + Alignment align_maybe_flip(const Alignment& base, HandleGraph& graph, bool flip, bool traceback, bool banded_global = false, bool xdrop_alignment = false); + Alignment align_maybe_flip(const Alignment& base, HandleGraph& graph, const vector& mems, bool flip, bool traceback, bool banded_global = false, bool xdrop_alignment = false); bool adjacent_positions(const Position& pos1, const Position& pos2); int64_t get_node_length(int64_t node_id); bool check_alignment(const Alignment& aln); - VG alignment_subgraph(const Alignment& aln, int context_size = 1); // Align the given string and return an Alignment. Alignment align(const string& seq, @@ -608,8 +598,6 @@ class Mapper : public BaseMapper { double estimate_max_possible_mapping_quality(int length, double min_diffs, double next_min_diffs); // absolute max possible mq double max_possible_mapping_quality(int length); - // walks the graph one base at a time from pos1 until we find pos2 - int64_t graph_distance(pos_t pos1, pos_t pos2, int64_t maximum = 1e3); // takes the min of graph_distance, approx_distance, and xindex->min_approx_path_distance() int64_t graph_mixed_distance_estimate(pos_t pos1, pos_t pos2, int64_t maximum); // use the offset in the sequence array to give an approximate distance @@ -618,28 +606,15 @@ class Mapper : public BaseMapper { int64_t approx_position(pos_t pos); // get the approximate position of the alignment or return -1 if it can't be had int64_t approx_alignment_position(const Alignment& aln); - // get the full path offsets for the alignment, considering every mapping if just_first is not set - map > > alignment_path_offsets(const Alignment& aln, bool just_min = true, bool nearby = false) const; // get the end position of the alignment Position alignment_end_position(const Alignment& aln); // get the approximate distance between the starts of the alignments or return -1 if undefined int64_t approx_fragment_length(const Alignment& aln1, const Alignment& aln2); - // use the cached fragment model to estimate the likely place we'll find the mate - pos_t likely_mate_position(const Alignment& aln, bool is_first); // get a set of positions that are likely based on the fragment model and the embedded paths vector likely_mate_positions(const Alignment& aln, bool is_first); - // get the node approximately at the given offset relative to our position (offset may be negative) - id_t node_approximately_at(int64_t approx_pos); - // convert a single MEM hit into an alignment (by definition, a perfect one) - Alignment walk_match(const string& seq, pos_t pos); - vector walk_match(const Alignment& base, const string& seq, pos_t pos); - // convert the set of hits of a MEM into a set of alignments - vector mem_to_alignments(MaximalExactMatch& mem); // fargment length 
estimation - map min_pair_fragment_length(const Alignment& aln1, const Alignment& aln2); - // uses the cached information about the graph in the xg index to get an approximate node length - double average_node_length(void); + unordered_map min_pair_fragment_length(const Alignment& aln1, const Alignment& aln2); // mem mapper parameters // @@ -649,8 +624,6 @@ class Mapper : public BaseMapper { int thread_extension; // add this many nodes in id space to the end of the thread when building thread into a subgraph int max_target_factor; // the maximum multiple of the read length we'll try to align to - size_t max_query_graph_ratio; - // multimapping int max_multimaps; // soft clip resolution @@ -671,6 +644,7 @@ class Mapper : public BaseMapper { bool always_rescue; // Should rescue be attempted for all imperfect alignments? bool include_full_length_bonuses; + int max_xdrop_gap_length; bool simultaneous_pair_alignment; int max_band_jump; // the maximum length edit we can detect via banded alignment @@ -684,6 +658,8 @@ class Mapper : public BaseMapper { // Keep track of fragment length distribution statistics FragmentLengthStatistics frag_stats; + atomic warned_about_chunking{0}; + // bench_t bench[4]; // uint64_t counter[6]; }; diff --git a/src/mcmc_caller.cpp b/src/mcmc_caller.cpp new file mode 100644 index 00000000000..956123d6f3c --- /dev/null +++ b/src/mcmc_caller.cpp @@ -0,0 +1,471 @@ +#include "mcmc_caller.hpp" +#include "graph_caller.hpp" +#include "algorithms/expand_context.hpp" +#include "memoizing_graph.hpp" +#include "phased_genome.hpp" +// #define debug + +namespace vg { + + /** + * MCMCCaller : Inherits from VCFOutputCaller + */ + MCMCCaller::MCMCCaller(const PathPositionHandleGraph* path_position_handle_graph, + PhasedGenome& genome, + SnarlManager& snarl_manager, + const string& sample_name, + const vector& ref_paths, + const vector& ref_path_offsets, + const vector& ref_path_lengths, + ostream& out_stream) : + path_position_handle_graph(path_position_handle_graph), genome(genome), snarl_manager(snarl_manager), VCFOutputCaller(sample_name), + sample_name(sample_name), ref_paths(ref_paths), ref_path_offsets(ref_path_offsets), + ref_path_lengths(ref_path_lengths), out_stream(out_stream) { + + if(path_position_handle_graph == nullptr){ + cerr << "graph is empty" <& ref_paths, + const vector& contig_length_overrides) const { + + string header = VCFOutputCaller::vcf_header(pph_graph, ref_paths, contig_length_overrides); + header += "##FORMAT=\n"; + header += "##FILTER=\n"; + header += "##SAMPLE=\n"; + header += "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + sample_name; + assert(output_vcf.openForOutput(header)); + header += "\n"; + return header; + } + + + void MCMCCaller::call_top_level_snarls(bool recurse_on_fail) { + + // Used to recurse on children of parents that can't be called + vector snarl_queue; + + // Run the snarl caller on a snarl, and queue up the children if it fails + auto process_snarl = [&](const Snarl* snarl) { + // cerr << "before call_snarl"<& children = snarl_manager.children_of(snarl); +#pragma omp critical (snarl_queue) + { + snarl_queue.insert(snarl_queue.end(), children.begin(), children.end()); + } + } + }; + + // Start with the top level snarls + // snarl_manager.for_each_top_level_snarl_parallel(process_snarl); + snarl_manager.for_each_top_level_snarl(process_snarl); + + // Then recurse on any children the snarl caller failed to handle + while (!snarl_queue.empty()) { + vector cur_queue; + std::swap(snarl_queue, cur_queue); +#pragma omp 
parallel for + for (int i = 0; i < cur_queue.size(); ++i) { + process_snarl(cur_queue[i]); + } + } + + } + + bool MCMCCaller::call_snarl(const Snarl& snarl){ + // if we can't handle the snarl, then the GraphCaller framework will recurse on its children + if (!is_traversable(snarl)) { + cerr<< "snarl is not traversable" < ref_path = trav_results.first; + + + + //If it can't find any traversals, you can't output the snarl In VCF. + if(ref_path.empty()){ + // continue the loop of snarl without printing VCF file + cerr << "ref path empty" < > steps = trav_results.second; + // + pair start_and_end_pair = steps[0]; + step_handle_t first_start_step = start_and_end_pair.first; + + + function trav_string = [&](const SnarlTraversal& trav) { + string seq; + for (int i = 0; i < trav.visit_size(); ++i) { + seq += path_position_handle_graph->get_sequence(path_position_handle_graph->get_handle(trav.visit(i).node_id(), trav.visit(i).backward())); + } + return seq; + }; + + + // set ref_path name and seq + path_handle_t path_handle = path_position_handle_graph->get_path_handle_of_step(first_start_step); + string ref_path_name = path_position_handle_graph->get_path_name(path_handle); + // cerr << "ref_path_name " << ref_path_name << endl; + //TODO: check that ref_path name is equal to snarl traversal name + string ref_path_seq = trav_string(ref_trav); + + + //get haplotypes that pass snarl + vector haplos_pass_snarl = genome.get_haplotypes_with_snarl(&snarl); + + assert(!haplos_pass_snarl.empty()); + +#ifdef debug + cerr < haplo_travs = {haplos_pass_snarl.size(), SnarlTraversal()}; + vector haplo_travs; + vector> haplo_nodes; + vector genotype; + int match_ref = 0; + int not_match_ref = 1; + string prev_trav_seq = ""; + + //build SnarlTraversal obj for each haplotype that passes through snarl + for(int i = 0; i node->id(); + bool backward = iter->backward; + Visit* v = fresh_trav.add_visit(); + v->set_node_id(n_id); + v->set_backward(backward); +#ifdef debug + cerr << "node id " << n_id <& genotype, SnarlTraversal ref_trav, + const string& ref_path_name, const vector& haplo_travs) const{ + + // convert traversal to string + // function that converst SnarlTraversals to strings + // usage: trav_string(SnarlTraversal) + + function trav_string = [&](const SnarlTraversal& trav) { + string seq; + for (int i = 0; i < trav.visit_size(); ++i) { + seq += path_position_handle_graph->get_sequence(path_position_handle_graph->get_handle(trav.visit(i).node_id(), trav.visit(i).backward())); + } + return seq; + }; + + + vcflib::Variant out_variant; + + // when calling alt/alt, the reference traversal doesn't end up in called_traversals. + // this should get changed, but in the meantime we add it back here (as we need it for + // the VCF output) + // udpate: the reference traversal will be there when re-genotyping, but we can leave this logic + // in case we want to ever add an option to toggle this. 
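+    // Worked example of the deduplication below, with hypothetical sequences: given a reference allele "ACGT" and haplotype
+    // traversals spelling "ACGT" and "AGGT", allele_to_gt ends up as {"ACGT": 0, "AGGT": 1} and the emitted genotype is 0/1;
+    // two identical non-reference traversals ("AGGT", "AGGT") would instead both map to allele 1, giving 1/1.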
+ vector site_traversals; + vector site_genotype; + for (int i = 0; i < genotype.size(); ++i) { + if (genotype[i] == 0) { + // if haplo traversal matches the ref, add to container + site_traversals.push_back(haplo_travs[i]); + break; + } + } + if(site_traversals.empty()){ + //if none of the haplotypes matched, get reference SnarlTraversal + // and convert to string + site_traversals.push_back(ref_trav); + } + + out_variant.ref = trav_string(ref_trav); + + // deduplicate alleles and compute the site traversals and genotype + map allele_to_gt; + allele_to_gt[out_variant.ref] = 0; + for (int i = 0; i < genotype.size(); ++i) { + if (genotype[i] == 0) { + site_genotype.push_back(0); + } else { + string allele_string = trav_string(haplo_travs[i]); + if (allele_to_gt.count(allele_string)) { + site_genotype.push_back(allele_to_gt[allele_string]); + } else { + site_traversals.push_back(haplo_travs[i]); + site_genotype.push_back(allele_to_gt.size()); + allele_to_gt[allele_string] = site_genotype.back(); + } + } + } + + out_variant.alt.resize(allele_to_gt.size() - 1); + out_variant.alleles.resize(allele_to_gt.size()); + for (auto& allele_gt : allele_to_gt) { + if (allele_gt.second > 0) { + out_variant.alt[allele_gt.second - 1] = allele_gt.first; + } + out_variant.alleles[allele_gt.second] = allele_gt.first; + } + + // fill out the rest of the variant + out_variant.sequenceName = ref_path_name; + // +1 to convert to 1-based VCF + out_variant.position = get_ref_position(snarl, ref_path_name).first + ref_offsets.find(ref_path_name)->second + 1; + out_variant.id = std::to_string(snarl.start().node_id()) + "_" + std::to_string(snarl.end().node_id()); + out_variant.filter = "PASS"; + out_variant.updateAlleleIndexes(); + + // add the genotype + out_variant.format.push_back("GT"); + auto& genotype_vector = out_variant.samples[sample_name]["GT"]; + + stringstream vcf_gt; + for (int i = 0; i < site_genotype.size(); ++i) { + vcf_gt << site_genotype[i]; + if (i != site_genotype.size() - 1) { + vcf_gt << "/"; + } + } + genotype_vector.push_back(vcf_gt.str()); + + // clean up the alleles to not have so many common prefixes + flatten_common_allele_ends(out_variant, true); + flatten_common_allele_ends(out_variant, false); + + // add variant to list + if (!out_variant.alt.empty()) { + add_variant(out_variant); + } + + + } + + + void MCMCCaller::update_vcf_info(const Snarl& snarl, + const vector& traversals, + const vector& genotype, + const string& sample_name, + vcflib::Variant& variant) const{ + + + } + + bool MCMCCaller::is_traversable(const Snarl& snarl) { + + bool ret; + bool is_reachable = snarl.start_end_reachable(); + bool is_DAG = snarl.directed_acyclic_net_graph(); + const Visit& start = snarl.start(); + int64_t node_id = start.node_id(); + bool has_node_start = path_position_handle_graph->has_node(node_id); + bool has_node_end = path_position_handle_graph->has_node(snarl.end().node_id()); + + // we need this to be true all the way down to use the RepresentativeTraversalFinder on our snarl. 
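+    // i.e. the snarl and, recursively, every child snarl must be start-to-end reachable, have a directed acyclic net graph, and have both of its boundary nodes present in the graph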
+ if(is_reachable && is_DAG && has_node_start && has_node_end ){ + ret = true; + }else{ + ret = false; + } + + if (ret == true) { + const vector& children = snarl_manager.children_of(&snarl); + for (int i = 0; i < children.size() && ret; ++i) { + ret = is_traversable(*children[i]); + } + } + return ret; + } + + + pair MCMCCaller::get_ref_position(const Snarl& snarl, const string& ref_path_name) const { + path_handle_t path_handle = path_position_handle_graph->get_path_handle(ref_path_name); + + handle_t start_handle = path_position_handle_graph->get_handle(snarl.start().node_id(), snarl.start().backward()); + map start_steps; + path_position_handle_graph->for_each_step_on_handle(start_handle, [&](step_handle_t step) { + if (path_position_handle_graph->get_path_handle_of_step(step) == path_handle) { + start_steps[path_position_handle_graph->get_position_of_step(step)] = step; + } + }); + + handle_t end_handle = path_position_handle_graph->get_handle(snarl.end().node_id(), snarl.end().backward()); + map end_steps; + path_position_handle_graph->for_each_step_on_handle(end_handle, [&](step_handle_t step) { + if (path_position_handle_graph->get_path_handle_of_step(step) == path_handle) { + end_steps[path_position_handle_graph->get_position_of_step(step)] = step; + } + }); + + assert(start_steps.size() > 0 && end_steps.size() > 0); + step_handle_t start_step = start_steps.begin()->second; + step_handle_t end_step = end_steps.begin()->second; + bool scan_backward = path_position_handle_graph->get_is_reverse(path_position_handle_graph->get_handle_of_step(start_step)); + + // if we're on a cycle, we keep our start step and find the end step by scanning the path + if (start_steps.size() > 1 || end_steps.size() > 1) { + bool found_end = false; + if (scan_backward) { + for (step_handle_t cur_step = start_step; path_position_handle_graph->has_previous_step(end_step) && !found_end; + cur_step = path_position_handle_graph->get_previous_step(cur_step)) { + if (path_position_handle_graph->get_handle_of_step(cur_step) == end_handle) { + end_step = cur_step; + found_end = true; + } + } + assert(found_end); + } else { + for (step_handle_t cur_step = start_step; path_position_handle_graph->has_next_step(end_step) && !found_end; + cur_step = path_position_handle_graph->get_next_step(cur_step)) { + if (path_position_handle_graph->get_handle_of_step(cur_step) == end_handle) { + end_step = cur_step; + found_end = true; + } + } + assert(found_end); + } + } + + size_t start_position = start_steps.begin()->first; + size_t end_position = end_step == end_steps.begin()->second ? end_steps.begin()->first : path_position_handle_graph->get_position_of_step(end_step); + bool backward = end_position < start_position; + + return make_pair(backward ? end_position : start_position, backward); + } + + + + void MCMCCaller::flatten_common_allele_ends(vcflib::Variant& variant, bool backward) const { + if (variant.alt.size() == 0) { + return; + } + size_t min_len = variant.alleles[0].length(); + for (int i = 1; i < variant.alleles.size(); ++i) { + min_len = std::min(min_len, variant.alleles[i].length()); + } + // want to leave at least one in the reference position + if (min_len > 0) { + --min_len; + } + + bool match = true; + int shared_prefix_len = 0; + for (int i = 0; i < min_len && match; ++i) { + char c1 = std::toupper(variant.alleles[0][!backward ? i : variant.alleles[0].length() - 1 - i]); + for (int j = 1; j < variant.alleles.size() && match; ++j) { + char c2 = std::toupper(variant.alleles[j][!backward ? 
i : variant.alleles[j].length() - 1 - i]); + match = c1 == c2; + } + if (match) { + ++shared_prefix_len; + } + } + + if (!backward) { + variant.position += shared_prefix_len; + } + for (int i = 0; i < variant.alleles.size(); ++i) { + if (!backward) { + variant.alleles[i] = variant.alleles[i].substr(shared_prefix_len); + } else { + variant.alleles[i] = variant.alleles[i].substr(0, variant.alleles[i].length() - shared_prefix_len); + } + if (i == 0) { + variant.ref = variant.alleles[i]; + } else { + variant.alt[i - 1] = variant.alleles[i]; + } + } + } + +} + +// #undef debug diff --git a/src/mcmc_caller.hpp b/src/mcmc_caller.hpp new file mode 100644 index 00000000000..73365df7001 --- /dev/null +++ b/src/mcmc_caller.hpp @@ -0,0 +1,93 @@ +#ifndef VG_MCMC_CALLER_HPP_INCLUDED +#define VG_MCMC_CALLER_HPP_INCLUDED + + +#include +#include +#include +#include +#include +#include +#include +#include "handle.hpp" +#include "snarls.hpp" +#include "graph_caller.hpp" +#include "traversal_finder.hpp" +#include "phased_genome.hpp" +#include "region.hpp" + +namespace vg{ + + using namespace std; + + class MCMCCaller : public VCFOutputCaller { + public: + PhasedGenome& genome; + SnarlManager& snarl_manager; + const string sample_name = "SAMPLE"; + const vector ref_path_offsets = {}; + const vector ref_path_lengths = {}; + ostream& out_stream; + const SnarlTraversal trav; + + + MCMCCaller(const PathPositionHandleGraph* path_position_handle_graph, + PhasedGenome& genome, + SnarlManager& snarl_manager, + const string& sample_name, + const vector& ref_paths, + const vector& ref_path_offsets, + const vector& ref_path_lengths, + ostream& out_stream = cout ); + + virtual ~MCMCCaller(); + + /// Run call_snarl() on every top-level snarl in the manager. + /// For any that return false, try the children, etc. 
(when recurse_on_fail true) + /// Snarls are processed in parallel + void call_top_level_snarls(bool recurse_on_fail = true) ; + + /// print vcf header + virtual string vcf_header(const PathPositionHandleGraph& graph, const vector& ref_paths, + const vector& contig_length_overrides) const ; + + protected: + /// path position handle graph + const PathPositionHandleGraph* path_position_handle_graph; + + /// keep track of the reference paths + vector ref_paths; + + /// keep track of offsets in the reference paths + map ref_offsets; + + /// Update INFO and FORMAT fields of the called variant + void update_vcf_info(const Snarl& snarl, + const vector& traversals, + const vector& genotype, + const string& sample_name, + vcflib::Variant& variant) const; + + /// print a vcf variant + void emit_variant(const Snarl& snarl, const vector& genotype, SnarlTraversal ref_trav, const string& ref_path_name, const vector& haplo_travs) const; + + /// Call a given snarl, and print the output to out_stream + bool call_snarl(const Snarl& snarl); + + /// check if a site can be handled + bool is_traversable(const Snarl& snarl); + + /// get position of reference path + pair get_ref_position(const Snarl& snarl, const string& ref_path_name) const; + + /// clean up the alleles to not share common prefixes / suffixes + void flatten_common_allele_ends(vcflib::Variant& variant, bool backward) const; + + + }; + + + + +} +#endif diff --git a/src/mcmc_genotyper.cpp b/src/mcmc_genotyper.cpp new file mode 100644 index 00000000000..51c1b7da1a1 --- /dev/null +++ b/src/mcmc_genotyper.cpp @@ -0,0 +1,744 @@ +#include "mcmc_genotyper.hpp" +#include "subgraph.hpp" +#include +#include +#include +#include +#include "multipath_alignment.hpp" + +// #define stdout_for_performance_script +// #define debug_mcmc +// #define debug_make_snarl_graph +// #define debug_karger_stein +// #define debug_proposal_sample + +namespace vg { + + using namespace std; + + MCMCGenotyper::MCMCGenotyper(SnarlManager& snarls, VG& graph, const int n_iterations, const int seed, const int burn_in, const int frequency):snarls(snarls), graph(graph), n_iterations(n_iterations), + seed(seed), random_engine(seed), burn_in(burn_in), frequency(frequency){ + + + } + + unique_ptr MCMCGenotyper::run_genotype(const vector& reads, const double log_base) const{ + + // set a flag for invalid contents so a message is observed + bool return_optimal = false; + + // generate initial value + unique_ptr genome = generate_initial_guess(); + + double max_likelihood = 0.0; + double current_likelihood = 0.0; + double previous_likelihood = 0.0; + int haplotype_0 =0; + int haplotype_1 =1; + unique_ptr optimal; + + //stores entire set generated by karger-stein min cut + unique_ptr>> gamma(nullptr); + //stores the sites we swapped alleles at - returned by alt_proposal + unique_ptr> to_swap_back(nullptr); + int count =0; + enum sample_mode {PROPOSAL_ORIGINAL, PROPOSAL_KARGER_STEIN}; + // build markov chain using Metropolis-Hastings + for(int i = 0; i< n_iterations; i++){ + + int random_num; + + // holds the previous sample allele + double x_prev = log_target(genome, reads); + + tuple > to_receive; + int* modified_haplo; + const Snarl* modified_site; + vector* old_allele; + + if(i < burn_in){ + random_num = PROPOSAL_ORIGINAL; +#ifdef stdout_for_performance_script + cerr << "ITERATION " << i <> (karger_stein(reads, *genome))); +#ifdef stdout_for_performance_script + cerr << "building gamma " <size() << endl; +#endif + } + //generate gamma with n frequency after burn in + if(count == frequency){ 
+ gamma.reset(new vector> (karger_stein(reads, *genome))); + count = 0;//reset counter +#ifdef stdout_for_performance_script + cerr << "building gamma " <size() << endl; +#endif + } + if(gamma->size() <= 0){ +#ifdef stdout_for_performance_script + cerr << "empty gamma "< (alt_proposal_sample(*gamma, *genome))); + if(!to_swap_back){ +#ifdef stdout_for_performance_script + cerr << "to_swap_back() is empty" <print_phased_genome(); + cerr << "modified_haplo before "<< *modified_haplo <(to_receive); + // if the to_receive contents are invalid keep the new allele + // for graphs that do not contain snarls + if (*modified_haplo ==-1){ +#ifdef stdout_for_performance_script + cerr << "modified haplo is empty" <(to_receive); + old_allele = &get<2>(to_receive); + } + } + + /* + *######################################################################################## + * ACCEPT/REJECT SAMPLE + *######################################################################################## + **/ + // holds new sample allele score + double x_new = log_target(genome, reads); + + double likelihood_ratio = exp(log_base*(x_new - x_prev)); + + current_likelihood = previous_likelihood + log_base*(x_new-x_prev); + + if (current_likelihood > max_likelihood){ + max_likelihood = current_likelihood; + optimal.reset(new PhasedGenome(*genome)); + return_optimal=true; + } + + // calculate acceptance probability + double acceptance_probability = min(1.0, likelihood_ratio); + + // if u~U(0,1) > alpha, discard new allele and keep previous + auto uniform_smpl = generate_continuous_uniform(0.0,1.0); + if(uniform_smpl > acceptance_probability){ //REJECT + if(random_num==PROPOSAL_ORIGINAL){ //if rand num==0, use proposal sample swap back method + //swap back to old allele at random snarl, random haplo + genome->set_allele(*modified_site, old_allele->begin(), old_allele->end(), *modified_haplo); + } + if(random_num == PROPOSAL_KARGER_STEIN){ //or if, use alt proposal sample swap back method + //swap alleles at all sites in to_swap_back + for(auto iter = to_swap_back->begin(); iter != to_swap_back->end(); ++iter){ + const Snarl* snarl_to_swap = snarls.translate_snarl_num(*iter); + genome->swap_alleles(*snarl_to_swap, haplotype_0, haplotype_1); + } + } + +#ifdef stdout_for_performance_script + cerr << "Rejected new allele" <print_phased_genome(); +#endif + }else{ +#ifdef stdout_for_performance_script + cerr << "Accepted new allele" <print_phased_genome(); +#endif + previous_likelihood = current_likelihood; //ACCEPT + } + } + if(!return_optimal){ + // for graphs without snarls + return genome; + }else{ +#ifdef stdout_for_performance_script + cerr <<"clikelihood " << max_likelihood <print_phased_genome(); +#endif + return optimal; + } + + } + double MCMCGenotyper::log_target(unique_ptr& phased_genome, const vector& reads)const{ + + // sum of scores given the reads aligned on the haplotype + int32_t sum_scores = 0; + + // get scores for mp alignments + for(const multipath_alignment_t& mp : reads){ + sum_scores += phased_genome->optimal_score_on_genome(mp, graph); + + } + + return sum_scores; + } + + tuple > MCMCGenotyper::proposal_sample(unique_ptr& current)const{ + // get a different traversal through the snarl by uniformly choosing from all possible ways to traverse the snarl + + // bookkeeping: haplotype ID, snarl* (site that we replaced at), get_allele()) + tuple > to_return; + + int& random_haplotype = get<0>(to_return); + const Snarl*& random_snarl = get<1>(to_return); + // the random path through the snarl + vector& old_allele = 
get<2>(to_return); + + // sample uniformly between snarls + random_snarl = snarls.discrete_uniform_sample(random_engine); +#ifdef debug_proposal_sample + bool is_null = (random_snarl== nullptr); + cerr << "is null " << is_null < matched_haplotypes = current->get_haplotypes_with_snarl(random_snarl); + + + if(matched_haplotypes.empty()){ +#ifdef debug_proposal_sample + cerr << "looking for snarl starting with " << random_snarl->start() << " and snarl ending with " << random_snarl->end() <, unordered_set > contents = snarls.deep_contents(random_snarl, graph, true); + + // unpack the pair, we only care about the node_ids + unordered_set& ids = contents.first; + + // enumerate counts through nodes in snarl not the entire graph + SubHandleGraph subgraph(&graph); + + for (id_t id : ids){ + + // add each node from snarl in super graph to sub graph + subgraph.add_handle(graph.get_handle(id, false)); + } + + + // create a count_map of the subgraph + auto count_contents = handlealgs::count_walks_through_nodes(&subgraph); + + // unpack the count map from the count_contents + unordered_map& count_map = get<1>(count_contents); + + + + // create a topological order of sub graph count map + vector topological_order = handlealgs::lazier_topological_order(&subgraph); + + // we want to get just the sink handle handle + handle_t start = topological_order.back(); + handle_t source = topological_order.front(); + + // start at sink in topological + bool start_from_sink =true; + bool not_source = true; + + vector allele; + + while(not_source){ + + size_t cum_sum = 0; + vector cumulative_sum; + vector paths_to_take; + size_t count = 0; + vector handles; + + subgraph.follow_edges(start, start_from_sink, [&](const handle_t& next) { + unordered_map::iterator it; + it = count_map.find(next); // find the handle + count = it->second; // get the count + cum_sum += count; + cumulative_sum.push_back(cum_sum); + handles.push_back(next); + + }); + + // choose a random path uniformly + int l_bound = 0; + int u_bound = cumulative_sum.back()-1; + int random_num = generate_discrete_uniform(random_engine,l_bound, u_bound); + + // use the random_num to select a random_handle + int found = 0, prev = 0; + for (int i = 0; i< cumulative_sum.size() ; i++) { + // check what range the random_num falls in + if (prev <= random_num && random_num < cumulative_sum[i] ){ + found = i; // will correspond to the index of handles + break; + } + prev = cumulative_sum[i]; + } + + assert(found != -1); + + // start_ptr will point to random handle + start = handles[found]; + + // save the random path + bool position = subgraph.get_is_reverse(start); + Node* n = graph.get_node(subgraph.get_id(start)); + + + // allele should not include boundary nodes of random_snarl + if(n->id() != random_snarl->start().node_id() && n->id() != random_snarl->end().node_id() ){ + allele.push_back(NodeTraversal(n,position)); + } + + + + // check if we are at the source, if so we terminate loop + if(start == source){ + not_source = false; + } + + } + // save old allele so we can swap back to it if we need to + old_allele = current->get_allele(*random_snarl, random_haplotype); + +#ifdef debug_mcmc + cerr << "modifying haplotype " << random_num << endl; + for(auto iter = allele.begin(); iter != allele.end(); iter++ ){ + cerr << "new allele: " <<"node " << iter->node->id() << " " << iter->node->sequence() <node->id() << " " << iter->node->sequence() <set_allele(*random_snarl , allele.rbegin(), allele.rend(), random_haplotype); + + + return to_return; + + } + int 
MCMCGenotyper::generate_discrete_uniform(minstd_rand0& random_engine, int lower_bound , int upper_bound) const{ + + // choose a number randomly using discrete uniform distribution + uniform_int_distribution distribution(lower_bound, upper_bound); + int random_num = distribution(random_engine); + + return random_num; + } + double MCMCGenotyper::generate_continuous_uniform(const double a, const double b)const{ + + uniform_real_distribution distribution(a,b); + double random_num = distribution(random_engine); + + return random_num; + + } + unique_ptr MCMCGenotyper::generate_initial_guess()const{ + + unique_ptr genome(new PhasedGenome(snarls)); + vector haplotype; //will add twice + + graph.for_each_path_handle([&](const path_handle_t& path){ + // capture all variables (paths) in scope by reference + + if(!Paths::is_alt(graph.get_path_name(path))) { + // If it isn't an alt path, we want to trace it + + for (handle_t handle : graph.scan_path(path)) { + // For each occurrence from start to end + + // get the node and the node postion and add to the vector + Node* node = graph.get_node(graph.get_id(handle)); + bool position = graph.get_is_reverse(handle); + haplotype.push_back(NodeTraversal(node,position)); + + } + } + }); + // construct haplotypes + // haplotype1 = haplotype2 + genome->add_haplotype(haplotype.begin(), haplotype.end()); + genome->add_haplotype(haplotype.begin(), haplotype.end()); + + // index sites + genome->build_indices(); + + return genome; + } + unordered_map, int32_t> MCMCGenotyper::make_snarl_map(const vector& reads, PhasedGenome& phased_genome) const{ + + + unordered_map, int32_t> map; + int32_t score_after_swap,score_before_swap,diff_score; + +#ifdef debug_make_snarl_graph + cerr << "******************************************************"< snarl_set; +#ifdef debug_make_snarl_graph + vector read_nodes; +#endif + //for each pair of snarls that touches that read + for (const auto& subpath : multipath_aln.subpath()) { + if(subpath.has_path()){ + auto& path = subpath.path(); + //for every mapping in the path + for(size_t i = 0; i < path.mapping_size(); i++){ + auto& mapping = path.mapping(i); + int64_t node_id = mapping.position().node_id(); + bool is_reverse = mapping.position().is_reverse(); + +#ifdef debug_make_snarl_graph + read_nodes.push_back(node_id); +#endif + + const Snarl* snarl = snarls.into_which_snarl(node_id, is_reverse); + // insert snarls into unordered set with only unique entries + //TODO: to make it faster remove the snarls not supported by read at boundary nodes + if(snarl){ + snarl_set.insert(snarl); + } +// #ifdef debug_make_snarl_graph +// cerr <<"adding fwd_snarl " <start().node_id() <<" -> " <end().node_id() <"; + } + cerr << endl; + cerr << "snarl_set size " << snarl_set.size() <> pairs; + vector v(snarl_set.begin(), snarl_set.end()); + for(int i =0; i < v.size(); i++){ + for(int j =i+1; j < v.size(); j++){ + //check if both haplotypes visited these snarls +// #ifdef debug_make_snarl_graph +// cerr <<"Searching Pair: " <start().node_id() <<" -> " <end().node_id(); +// cerr <<" , " <start().node_id() <<" -> " <end().node_id() < haplo_ids1 = phased_genome.get_haplotypes_with_snarl(v[i]); + vector haplo_ids2 = phased_genome.get_haplotypes_with_snarl(v[j]); + +// #ifdef debug_make_snarl_graph +// if(!haplo_ids1.empty() && !haplo_ids2.empty()){ +// if((haplo_ids1[0] == 0 && haplo_ids2[0] == 0) && (haplo_ids1[1] == 1 && haplo_ids2[1] == 1)){ +// cerr << "both haplotypes overlap snarl pair" < 2 haplotypes that overlap snarl pair , skip snarl pair for that 
read + if(random_num == 0){ + //dereference the ptr + const Snarl& snarl_to_swap = *snarl_ptr.first; + //exhange alleles at first snarl in pair + phased_genome.swap_alleles(snarl_to_swap, haplotype_0, haplotype_1); + // get score after swap + score_after_swap = phased_genome.optimal_score_on_genome(multipath_aln, graph); +#ifdef debug_make_snarl_graph + // cerr << "genome after swap " << endl; + // phased_genome.print_phased_genome(); + // cerr << "score_after_swap " << score_after_swap <, int32_t> map) const{ + //TODO: find where the SnarlRecord* are being added to deque and store the index in snarls.cpp + + algorithms::Graph snarl_graph; + + for(auto snarl_pair_to_weight: map){ + pair snarl_pair = snarl_pair_to_weight.first; + const Snarl* snarl_1 = snarl_pair.first; + const Snarl* snarl_2 = snarl_pair.second; +// #ifdef debug_make_snarl_graph +// cerr <<"snarl1 start->end" <start().node_id() <<" -> " <end().node_id() <end" <start().node_id() <<" -> " <end().node_id() < 1 : " << edge_weight < snarl_2 + edge_back.other = snarl_id_1; //snarl_2 -> snarl_1 + +#ifdef debug_make_snarl_graph + cerr << "edge_fwd.other: " << edge_fwd.other < node_ids = snarl_graph.get_node_ids(); + unordered_set id_set(node_ids.begin(), node_ids.end()); + + //if snarl node already exists in graph, we add to its edges vector + if(id_set.count(snarl_id_1)){ + snarl_graph.get_node_by_id(snarl_id_1).edges.push_back(edge_fwd); + snarl_graph.get_node_by_id(snarl_id_1).weight += edge_weight; + }else{ + //else we create a new node and add the node to the graph along with the edge + snarl_node_1.weight += edge_weight; + snarl_node_1.edges.push_back(edge_fwd); + snarl_graph.add_node(snarl_id_1, snarl_node_1); + } + + //similarily for snarl 2, we check if it already exists and if so we add to its edge vector + if(id_set.count(snarl_id_2)){ + snarl_graph.get_node_by_id(snarl_id_2).edges.push_back(edge_back); + snarl_graph.get_node_by_id(snarl_id_2).weight += edge_weight; + }else{ + //else we create a new node and add the node to the graph along with the edge + snarl_node_2.edges.push_back(edge_back); + snarl_node_2.weight += edge_weight; + snarl_graph.add_node(snarl_id_2, snarl_node_2); + } + + } + + } + + return snarl_graph; + + } + + vector> MCMCGenotyper::karger_stein(const vector& reads, PhasedGenome& genome) const{ + +#ifdef debug_karger_stein + cerr << "num reads" << reads.size() <, int32_t> snarl_map = make_snarl_map(reads, genome); + + //make snarl graph + algorithms::Graph snarl_graph = make_snarl_graph(snarl_map); +#ifdef debug_karger_stein + cerr << "size of snarl map" << snarl_map.size() <> to_recv = algorithms::min_cut_decomposition(snarl_graph, seed); + +#ifdef debug_karger_stein + cerr << "min decomposition size of gamma" << to_recv.size() < MCMCGenotyper::alt_proposal_sample(vector>& gamma, PhasedGenome& genome ) const{ + + int haplotype_0 =0; + int haplotype_1 =1; + int lower_bound = 0; + int upper_bound = gamma.size(); + +#ifdef karger_stein + cerr << "rand num " << random_num< sites_to_swap = gamma[random_num]; + + // swap alleles at sites in chosen set using snarl indexes + for(auto iter = sites_to_swap.begin(); iter != sites_to_swap.end(); ++iter){ + const Snarl* snarl_to_swap = snarls.translate_snarl_num(*iter); + genome.swap_alleles(*snarl_to_swap, haplotype_0, haplotype_1); + } + + return sites_to_swap; + + } + + + + + +} + + + diff --git a/src/mcmc_genotyper.hpp b/src/mcmc_genotyper.hpp new file mode 100644 index 00000000000..6691cf09194 --- /dev/null +++ b/src/mcmc_genotyper.hpp @@ -0,0 +1,98 @@ 
+#ifndef VG_MCMC_GENOTYPER_HPP_INCLUDED +#define VG_MCMC_GENOTYPER_HPP_INCLUDED + +/** \file + * mcmc_genotyper.hpp: defines a class that implements mcmc probabilistic model for variant calling. + */ + +#include +#include +#include +#include +#include "phased_genome.hpp" +#include "multipath_alignment.hpp" +#include "algorithms/min_cut_graph.hpp" +#include "sparse_union_find.hpp" + +namespace vg { + +using namespace std; + +/** + * This class is a genotyper that uses MCMC to find two optimal paths through the graph given a set of aligned reads. + * + */ +class MCMCGenotyper{ + SnarlManager& snarls; + VG& graph; + const int n_iterations; + const int seed; + const int burn_in; //how many iterations we run only original proposal sample dist. + const int frequency; //frequency we remake gamma set + mutable minstd_rand0 random_engine; + +public: + + MCMCGenotyper(SnarlManager& snarls, VG& graph, const int n_iterations, const int seed, const int burn_in, const int frequency); + + /** + * Takes as input a collection of mapped reads stored as a vector of multipath alignments and uses + * MCMC to find two optimal paths through the graph. + * Output: phased genome + */ + unique_ptr run_genotype(const vector& reads, const double log_base) const; + + /** + * Represents the poseterior distribution function + * returns the posterir probability + */ + double log_target(unique_ptr& phased_genome, const vector& reads) const; + + /** + * Generates a proposal sample over the desired distrubution + * returns a sample from the proposal distribution + */ + tuple > proposal_sample(unique_ptr& current) const; + /** + * Generates a number randomly using the discrete uniform distribution + */ + int generate_discrete_uniform(minstd_rand0& random_engine, int lower_bound , int upper_bound) const; + + /** + * Given a range [a,b] will return a random number uniformly distributed within that range + */ + double generate_continuous_uniform(const double a, const double b) const; + + /** + * Generate a PhasedGenome to use as an initial value in M-H + * Uses the two non-alt paths from the linear reference as haplotypes + */ + unique_ptr generate_initial_guess()const; + /** + * Generate a map from a pair of snarls to an edge weight + * Uses snarls read API, reads and the optimal_score_on pahsed genome as a + * scoring scheme for the edge weight overlapping the snarl pair. 
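+     * Roughly, for each read spanning both snarls of a pair, the alleles at one of the two sites are
+     * swapped between haplotypes and the change in the read's optimal alignment score contributes to
+     * that pair's edge weight, so heavier edges mark pairs whose relative phasing the reads support
+     * most strongly.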
+ */ + unordered_map, int32_t> make_snarl_map(const vector& reads, PhasedGenome& phased_genome) const; + /** + * Generate a graph using the snarl map + */ + algorithms::Graph make_snarl_graph(unordered_map, int32_t> map) const; + + /** + * Make a snarl graph with edge weights scored by how well mapped reads support phasing of snarl + * Use an alternative proposal distribution using sets generated from karger-stein min cut algorithm + * to escape bottlenecks leading to rapid convergence + */ + vector> karger_stein(const vector& reads, PhasedGenome& genome) const; + + unordered_set alt_proposal_sample(vector>& gamma, PhasedGenome& genome) const; + + +}; + +} + + + +#endif diff --git a/src/mem.cpp b/src/mem.cpp index 1e64975a8ae..984e989be71 100644 --- a/src/mem.cpp +++ b/src/mem.cpp @@ -12,10 +12,7 @@ using namespace std; // construct the sequence of the MEM; useful in debugging string MaximalExactMatch::sequence(void) const { - string seq; //seq.resize(end-begin); - string::const_iterator c = begin; - while (c != end) seq += *c++; - return seq; + return string(begin, end); } // length of the MEM diff --git a/src/mem.hpp b/src/mem.hpp index 2f818d8f85f..a3aa06e2951 100644 --- a/src/mem.hpp +++ b/src/mem.hpp @@ -35,7 +35,7 @@ class MaximalExactMatch { int fragment; bool primary; // if not a sub-MEM std::vector nodes; - map > > positions; + unordered_map > > positions; MaximalExactMatch(string::const_iterator b, string::const_iterator e, diff --git a/src/mem_accelerator.cpp b/src/mem_accelerator.cpp new file mode 100644 index 00000000000..efb6a741cae --- /dev/null +++ b/src/mem_accelerator.cpp @@ -0,0 +1,73 @@ +/** + * \file mem_accelerator.hpp + * + * Implements an index for accelerating GCSA2 queries + */ + +#include "mem_accelerator.hpp" +#include +#include + +namespace vg { + +MEMAccelerator::MEMAccelerator(const gcsa::GCSA& gcsa_index, size_t k) : k(k) +{ + // compute the minimum width required to express the integers. + range_table.width(max(sdsl::bits::length(gcsa_index.size()), 1)); + // range table is initialized to size 2^(2k + 1) = 2 * 4^k + range_table.resize(1 << (2 * k + 1)); + + const char alphabet[5] = "ACGT"; + + // records of (next char to query, k-mer integer encoding, range) + vector> stack; + stack.emplace_back(0, 0, gcsa::range_type(0, gcsa_index.size() - 1)); + + // TODO: multithread this? 
probably would do it single threaded + // to init 128 stacks or something similar + while (!stack.empty()) { + if (stack.size() == k + 1) { + // we've walked the full k-mers + range_table[2 * get<1>(stack.back())] = get<2>(stack.back()).first; + range_table[2 * get<1>(stack.back()) + 1] = get<2>(stack.back()).second; + stack.pop_back(); + } + else if (get<0>(stack.back()) == 4) { + // we've walked all the k-mers that start with this prefix + stack.pop_back(); + } + else { + // extend the current range by the next character + auto next = get<0>(stack.back())++; + auto enc = (next << (2 * (stack.size() - 1))) | get<1>(stack.back()); + + gcsa::range_type range; + if (!gcsa::Range::empty(get<2>(stack.back()))) { + range = gcsa_index.LF(get<2>(stack.back()), + gcsa_index.alpha.char2comp[alphabet[next]]); + if (gcsa::Range::empty(range)) { + // we normalize empty ranges to an empty range that will + // fit within any bit width + range.first = 1; + range.second = 0; + } + } + else { + range = get<2>(stack.back()); + } + + stack.emplace_back(0, enc, range); + } + } +} + +gcsa::range_type MEMAccelerator::memoized_LF(string::const_iterator last) const { + int64_t enc = 0; + for (size_t i = 0; i < k; ++i) { + enc |= (encode(*last) << (i << 1)); + --last; + } + return gcsa::range_type(range_table[enc << 1], range_table[(enc << 1) | 1]); +} + +} diff --git a/src/mem_accelerator.hpp b/src/mem_accelerator.hpp new file mode 100644 index 00000000000..16e5b581559 --- /dev/null +++ b/src/mem_accelerator.hpp @@ -0,0 +1,71 @@ +/** + * \file mem_accelerator.hpp + * + * Defines an index for accelerating GCSA2 queries + */ + +#ifndef VG_MEM_ACCELERATOR_HPP_INCLUDED +#define VG_MEM_ACCELERATOR_HPP_INCLUDED + +#include +#include +#include +#include + +namespace vg { + +using namespace std; + +/* + * An auxilliary index that accelerates the initial steps of + * MEM-finding in + */ +class MEMAccelerator { +public: + + MEMAccelerator() = default; + MEMAccelerator(const gcsa::GCSA& gcsa_index, size_t k); + + // return the length of k-mers that are memoized + inline int64_t length() const; + + // look up the GCSA range that corresponds to a k-length + // string ending at the indicated position. 
client code + // is responsible for ensuring that the string being + // accessed is at least length k and consists only of ACGT + // characters + gcsa::range_type memoized_LF(string::const_iterator last) const; + +private: + + inline int64_t encode(char c) const; + + // the size k-mer we'll index + const int64_t k = 1; + // the actual table + sdsl::int_vector<> range_table; + +}; + +inline int64_t MEMAccelerator::length() const { + return k; +} + +inline int64_t MEMAccelerator::encode(char c) const { + switch (c) { + case 'A': + return 0; + case 'C': + return 1; + case 'G': + return 2; + case 'T': + return 3; + default: + return -1; + } +} + +} + +#endif diff --git a/src/memoizing_graph.cpp b/src/memoizing_graph.cpp new file mode 100644 index 00000000000..04ccd5e610d --- /dev/null +++ b/src/memoizing_graph.cpp @@ -0,0 +1,229 @@ +/** + * \file memoizing_graph.cpp: contains the implementation of MemoizingGraph + */ + + +#include "memoizing_graph.hpp" + + +namespace vg { + +using namespace std; + + MemoizingGraph::MemoizingGraph(const PathPositionHandleGraph* graph) : graph(graph) { + + } + + bool MemoizingGraph::has_node(id_t node_id) const { + bool found_node = false; + if (graph->has_node(node_id)) { + if (graph->get_length(graph->get_handle(node_id)) > 0) { + found_node = true; + } + } + return found_node; + } + + handle_t MemoizingGraph::get_handle(const id_t& node_id, bool is_reverse) const { + // we have to do some ugly stuff to keep libhandlegraph's const requirements while still + // updating memos + auto& memo = const_cast(this)->get_handle_memo; + + handle_t to_return; + auto it = memo.find(node_id); + if (it != memo.end()) { + to_return = is_reverse ? graph->flip(it->second) : it->second; + } + else if (memo.size() < max_handle_memo_size) { + handle_t handle = graph->get_handle(node_id); + memo[node_id] = handle; + to_return = is_reverse ? 
graph->flip(handle) : handle; + } + else { + to_return = graph->get_handle(node_id, is_reverse); + } + return to_return; + } + + id_t MemoizingGraph::get_id(const handle_t& handle) const { + return graph->get_id(handle); + } + + bool MemoizingGraph::get_is_reverse(const handle_t& handle) const { + return graph->get_is_reverse(handle); + } + + handle_t MemoizingGraph::flip(const handle_t& handle) const { + return graph->flip(handle); + } + + size_t MemoizingGraph::get_length(const handle_t& handle) const { + return graph->get_length(handle); + } + + string MemoizingGraph::get_sequence(const handle_t& handle) const { + return graph->get_sequence(handle); + } + + bool MemoizingGraph::follow_edges_impl(const handle_t& handle, bool go_left, + const function& iteratee) const { + + return graph->follow_edges(handle, go_left, [&](const handle_t& next) { + return iteratee(next); + }); + } + + bool MemoizingGraph::for_each_handle_impl(const function& iteratee, bool parallel) const { + return graph->for_each_handle([&](const handle_t& handle) { + return iteratee(handle); + }, parallel); + } + + size_t MemoizingGraph::get_node_count() const { + return graph->get_node_count(); + } + + id_t MemoizingGraph::min_node_id() const { + return graph->min_node_id(); + } + + id_t MemoizingGraph::max_node_id() const { + return graph->max_node_id(); + } + + size_t MemoizingGraph::get_path_count() const { + return graph->get_path_count(); + } + + bool MemoizingGraph::has_path(const std::string& path_name) const { + return graph->has_path(path_name); + } + + path_handle_t MemoizingGraph::get_path_handle(const std::string& path_name) const { + return graph->get_path_handle(path_name); + } + + std::string MemoizingGraph::get_path_name(const path_handle_t& path_handle) const { + return graph->get_path_name(path_handle); + } + + bool MemoizingGraph::get_is_circular(const path_handle_t& path_handle) const { + return graph->get_is_circular(path_handle); + } + + size_t MemoizingGraph::get_step_count(const path_handle_t& path_handle) const { + return graph->get_step_count(path_handle); + } + + handle_t MemoizingGraph::get_handle_of_step(const step_handle_t& step_handle) const { + return graph->get_handle_of_step(step_handle); + } + + path_handle_t MemoizingGraph::get_path_handle_of_step(const step_handle_t& step_handle) const { + return graph->get_path_handle_of_step(step_handle); + } + + step_handle_t MemoizingGraph::path_begin(const path_handle_t& path_handle) const { + return graph->path_begin(path_handle); + } + + step_handle_t MemoizingGraph::path_end(const path_handle_t& path_handle) const { + return graph->path_end(path_handle); + } + + step_handle_t MemoizingGraph::path_back(const path_handle_t& path_handle) const { + return graph->path_back(path_handle); + } + + step_handle_t MemoizingGraph::path_front_end(const path_handle_t& path_handle) const { + return graph->path_front_end(path_handle); + } + + bool MemoizingGraph::has_next_step(const step_handle_t& step_handle) const { + return graph->has_next_step(step_handle); + } + + bool MemoizingGraph::has_previous_step(const step_handle_t& step_handle) const { + return graph->has_previous_step(step_handle); + } + + step_handle_t MemoizingGraph::get_next_step(const step_handle_t& step_handle) const { + return graph->get_next_step(step_handle); + } + + step_handle_t MemoizingGraph::get_previous_step(const step_handle_t& step_handle) const { + return graph->get_previous_step(step_handle); + } + + bool MemoizingGraph::for_each_path_handle_impl(const std::function& iteratee) 
const { + return graph->for_each_path_handle(iteratee); + } + + bool MemoizingGraph::for_each_step_on_handle_impl(const handle_t& handle, + const std::function& iteratee) const { + return graph->for_each_step_on_handle(handle, iteratee); + } + + std::vector MemoizingGraph::steps_of_handle(const handle_t& handle, + bool match_orientation) const { + + // we have to do some ugly stuff to keep libhandlegraph's const requirements while still + // updating memos + auto& memo = const_cast(this)->steps_of_handle_memo; + + vector to_return; + auto it = memo.find(forward(handle)); + if (it != memo.end()) { + if (match_orientation) { + for (const step_handle_t& step : it->second) { + if (graph->get_is_reverse(graph->get_handle_of_step(step)) == graph->get_is_reverse(handle)) { + to_return.push_back(step); + } + } + } + else { + to_return = it->second; + } + } + else if (memo.size() < max_steps_of_handle_memo_size) { + memo[forward(handle)] = graph->steps_of_handle(handle); + if (match_orientation) { + for (const step_handle_t& step : memo[forward(handle)]) { + if (graph->get_is_reverse(graph->get_handle_of_step(step)) == graph->get_is_reverse(handle)) { + to_return.push_back(step); + } + } + } + else { + to_return = memo[forward(handle)]; + } + } + else { + to_return = graph->steps_of_handle(handle, match_orientation); + } + return to_return; + } + + bool MemoizingGraph::is_empty(const path_handle_t& path_handle) const { + return graph->is_empty(path_handle); + } + + size_t MemoizingGraph::get_path_length(const path_handle_t& path_handle) const { + return graph->get_path_length(path_handle); + } + + size_t MemoizingGraph::get_position_of_step(const step_handle_t& step) const { + return graph->get_position_of_step(step); + } + + step_handle_t MemoizingGraph::get_step_at_position(const path_handle_t& path, + const size_t& position) const { + return graph->get_step_at_position(path, position); + } + + bool MemoizingGraph::for_each_step_position_on_handle(const handle_t& handle, + const std::function& iteratee) const { + return graph->for_each_step_position_on_handle(handle, iteratee); + } +} + diff --git a/src/memoizing_graph.hpp b/src/memoizing_graph.hpp new file mode 100644 index 00000000000..04d4a51b7b2 --- /dev/null +++ b/src/memoizing_graph.hpp @@ -0,0 +1,217 @@ +#ifndef VG_MEMOIZING_GRAPH_HPP_INCLUDED +#define VG_MEMOIZING_GRAPH_HPP_INCLUDED + +/** \file + * memoizing_graph.hpp: defines a handle graph implementation memoizes the results + * of certain handle operations + */ + +#include "handle.hpp" + +#include + +namespace vg { + +using namespace std; + + /** + * A PathPositionHandleGraph implementation that memoizes the results of get_handle + * and steps_of_handle. 
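     *
     * A minimal usage sketch (illustrative only; "base_graph" and "node_id" are
     * assumed names, not part of this patch):
     *
     *     MemoizingGraph memo(&base_graph);           // wrap an existing PathPositionHandleGraph
     *     handle_t h = memo.get_handle(node_id);      // first call is forwarded to base_graph
     *     handle_t again = memo.get_handle(node_id);  // repeat calls are served from the memo
     *     auto steps = memo.steps_of_handle(h);       // steps_of_handle is memoized the same way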
+ */ + class MemoizingGraph : public PathPositionHandleGraph { + public: + + /// Initialize with a pointer to graph we want to memoize operations for + MemoizingGraph(const PathPositionHandleGraph* graph); + + /// Default constructor -- not actually functional + MemoizingGraph() = default; + + /// Default destructor + ~MemoizingGraph() = default; + + ////////////////////////// + /// HandleGraph interface + ////////////////////////// + + /// Method to check if a node exists by ID + virtual bool has_node(id_t node_id) const; + + /// Look up the handle for the node with the given ID in the given orientation + virtual handle_t get_handle(const id_t& node_id, bool is_reverse = false) const; + + /// Get the ID from a handle + virtual id_t get_id(const handle_t& handle) const; + + /// Get the orientation of a handle + virtual bool get_is_reverse(const handle_t& handle) const; + + /// Invert the orientation of a handle (potentially without getting its ID) + virtual handle_t flip(const handle_t& handle) const; + + /// Get the length of a node + virtual size_t get_length(const handle_t& handle) const; + + /// Get the sequence of a node, presented in the handle's local forward + /// orientation. + virtual string get_sequence(const handle_t& handle) const; + + /// Loop over all the handles to next/previous (right/left) nodes. Passes + /// them to a callback which returns false to stop iterating and true to + /// continue. Returns true if we finished and false if we stopped early. + virtual bool follow_edges_impl(const handle_t& handle, bool go_left, const function& iteratee) const; + + /// Loop over all the nodes in the graph in their local forward + /// orientations, in their internal stored order. Stop if the iteratee + /// returns false. Can be told to run in parallel, in which case stopping + /// after a false return value is on a best-effort basis and iteration + /// order is not defined. + virtual bool for_each_handle_impl(const function& iteratee, bool parallel = false) const; + + /// Return the number of nodes in the graph. + virtual size_t get_node_count() const; + + /// Return the smallest ID in the graph, or some smaller number if the + /// smallest ID is unavailable. Return value is unspecified if the graph is empty. + virtual id_t min_node_id() const; + + /// Return the largest ID in the graph, or some larger number if the + /// largest ID is unavailable. Return value is unspecified if the graph is empty. + virtual id_t max_node_id() const; + + //////////////////////////////////////////// + // Path handle graph interface + //////////////////////////////////////////// + + /// Returns the number of paths stored in the graph + virtual size_t get_path_count() const; + + /// Determine if a path name exists and is legal to get a path handle for. + virtual bool has_path(const std::string& path_name) const; + + /// Look up the path handle for the given path name. + /// The path with that name must exist. 
+ virtual path_handle_t get_path_handle(const std::string& path_name) const; + + /// Look up the name of a path from a handle to it + virtual std::string get_path_name(const path_handle_t& path_handle) const; + + /// Look up whether a path is circular + virtual bool get_is_circular(const path_handle_t& path_handle) const; + + /// Returns the number of node steps in the path + virtual size_t get_step_count(const path_handle_t& path_handle) const; + + /// Get a node handle (node ID and orientation) from a handle to an step on a path + virtual handle_t get_handle_of_step(const step_handle_t& step_handle) const; + + /// Returns a handle to the path that an step is on + virtual path_handle_t get_path_handle_of_step(const step_handle_t& step_handle) const; + + /// Get a handle to the first step, which will be an arbitrary step in a circular path + /// that we consider "first" based on our construction of the path. If the path is empty, + /// then the implementation must return the same value as path_end(). + virtual step_handle_t path_begin(const path_handle_t& path_handle) const; + + /// Get a handle to a fictitious position past the end of a path. This position is + /// returned by get_next_step for the final step in a path in a non-circular path. + /// Note: get_next_step will *NEVER* return this value for a circular path. + virtual step_handle_t path_end(const path_handle_t& path_handle) const; + + /// Get a handle to the last step, which will be an arbitrary step in a circular path that + /// we consider "last" based on our construction of the path. If the path is empty + /// then the implementation must return the same value as path_front_end(). + virtual step_handle_t path_back(const path_handle_t& path_handle) const; + + /// Get a handle to a fictitious position before the beginning of a path. This position is + /// return by get_previous_step for the first step in a path in a non-circular path. + /// Note: get_previous_step will *NEVER* return this value for a circular path. + virtual step_handle_t path_front_end(const path_handle_t& path_handle) const; + + /// Returns true if the step is not the last step in a non-circular path. + virtual bool has_next_step(const step_handle_t& step_handle) const; + + /// Returns true if the step is not the first step in a non-circular path. + virtual bool has_previous_step(const step_handle_t& step_handle) const; + + /// Returns a handle to the next step on the path. If the given step is the final step + /// of a non-circular path, this method has undefined behavior. In a circular path, + /// the "last" step will loop around to the "first" step. + virtual step_handle_t get_next_step(const step_handle_t& step_handle) const; + + /// Returns a handle to the previous step on the path. If the given step is the first + /// step of a non-circular path, this method has undefined behavior. In a circular path, + /// it will loop around from the "first" step (i.e. the one returned by path_begin) to + /// the "last" step. + virtual step_handle_t get_previous_step(const step_handle_t& step_handle) const; + + protected: + + /// Execute a function on each path in the graph. If it returns false, stop + /// iteration. Returns true if we finished and false if we stopped early. + virtual bool for_each_path_handle_impl(const std::function& iteratee) const; + + /// Execute a function on each step of a handle in any path. If it + /// returns false, stop iteration. Returns true if we finished and false if + /// we stopped early. 
+ virtual bool for_each_step_on_handle_impl(const handle_t& handle, + const std::function& iteratee) const; + + public: + + /// Returns a vector of all steps of a node on paths. Optionally restricts to + /// steps that match the handle in orientation. + virtual std::vector steps_of_handle(const handle_t& handle, + bool match_orientation = false) const; + + /// Returns true if the given path is empty, and false otherwise + virtual bool is_empty(const path_handle_t& path_handle) const; + + + //////////////////////////////////////////////////////////////////////////// + // Path position handle graph interface + //////////////////////////////////////////////////////////////////////////// + + /// Returns the length of a path measured in bases of sequence. + virtual size_t get_path_length(const path_handle_t& path_handle) const; + + /// Returns the position along the path of the beginning of this step measured in + /// bases of sequence. In a circular path, positions start at the step returned by + /// path_begin(). + virtual size_t get_position_of_step(const step_handle_t& step) const; + + /// Returns the step at this position, measured in bases of sequence starting at + /// the step returned by path_begin(). If the position is past the end of the + /// path, returns path_end(). + virtual step_handle_t get_step_at_position(const path_handle_t& path, + const size_t& position) const; + + protected: + + /// Execute an itteratee on each step and its path relative position and orientation + /// on a handle in any path. Iteration will stop early if the iteratee returns false. + /// This method returns false if iteration was stopped early, else true. + virtual bool for_each_step_position_on_handle(const handle_t& handle, + const std::function& iteratee) const; + + public: + + /// The largest number of calls to get_handle we will memoize + size_t max_handle_memo_size = 500; + + /// The largest number of calls to steps_of_handle we will memoize + size_t max_steps_of_handle_memo_size = 500; + + private: + /// The graph we're memoizing operations for + const PathPositionHandleGraph* graph = nullptr; + + /// Memo for get_handle + unordered_map get_handle_memo; + + /// Memo for steps_of_handle + unordered_map> steps_of_handle_memo; + }; +} + +#endif diff --git a/src/memusage.cpp b/src/memusage.cpp new file mode 100644 index 00000000000..eba194939a2 --- /dev/null +++ b/src/memusage.cpp @@ -0,0 +1,98 @@ +#include "memusage.hpp" + +#include +#include +#include + +#include +#include + +namespace vg { + +using namespace std; + +string get_proc_status_value(const string& name) { + + ifstream status_file("/proc/self/status"); + + string line; + while (status_file.good()) { + // Grab each line + getline(status_file, line); + + // Find the first colon + size_t colon_pos = line.find(':'); + if (colon_pos == string::npos) { + // No colon found. Try the next line. 
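            // (Illustrative aside, not part of the original patch: well-formed lines in
            // /proc/self/status look like "VmPeak:   123456 kB", so a call such as
            // get_proc_status_value("VmPeak") is expected to return "123456 kB", which the
            // callers further down parse numerically with a stringstream.)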
+ continue; + } + + if (line.substr(0, colon_pos) != name) { + // This isn't what we care about + continue; + } + + if (line.size() == colon_pos + 1) { + // There's nothing after the colon + continue; + } + + // Find the first non-whitespace after the colon + size_t value_pos = line.find_first_not_of(" \t", colon_pos + 1); + if (value_pos == string::npos) { + // No value + continue; + } + + // Get the value text and return it + return line.substr(value_pos); + } + + return ""; + +} + +size_t get_max_rss_kb() { + // This isn't in /proc, we have to get it ourselves + struct rusage usage; + getrusage(RUSAGE_SELF, &usage); + + return usage.ru_maxrss; +} + + +size_t get_max_vmem_kb() { + string value = get_proc_status_value("VmPeak"); + + if (value == "") { + return 0; + } + + stringstream sstream(value); + + size_t result = 0; + + sstream >> result; + + return result; + +} + +size_t get_current_vmem_kb() { + string value = get_proc_status_value("VmSize"); + + if (value == "") { + return 0; + } + + stringstream sstream(value); + + size_t result = 0; + + sstream >> result; + + return result; +} + + +} diff --git a/src/memusage.hpp b/src/memusage.hpp new file mode 100644 index 00000000000..8543cea95c5 --- /dev/null +++ b/src/memusage.hpp @@ -0,0 +1,30 @@ +#ifndef VG_MEMUSAGE_HPP_INCLUDED +#define VG_MEMUSAGE_HPP_INCLUDED + +#include + +/** + * \file memusage.hpp + * Defines an interface to /proc/self/status and other status interfaces for debugging. + */ + +namespace vg { + +using namespace std; + +/// Get the string value for a field in /proc/self/status by name, or "" if unsupported or not found. +string get_proc_status_value(const string& name); + +/// Get the max RSS usage ever, in kb, or 0 if unsupported. +size_t get_max_rss_kb(); + +/// Get the max virtual memory size ever, in kb, or 0 if unsupported. +size_t get_max_vmem_kb(); + +/// Get the current virtual memory size, in kb, or 0 if unsupported. +size_t get_current_vmem_kb(); + + +} + +#endif diff --git a/src/minimizer_mapper.cpp b/src/minimizer_mapper.cpp new file mode 100644 index 00000000000..ae2991c0238 --- /dev/null +++ b/src/minimizer_mapper.cpp @@ -0,0 +1,4746 @@ +/** + * \file minimizer_mapper.cpp + * Defines the code for the minimizer-and-GBWT-based mapper. 
+ */ + +#include "minimizer_mapper.hpp" + +#include "crash.hpp" +#include "annotation.hpp" +#include "path_subgraph.hpp" +#include "multipath_alignment.hpp" +#include "split_strand_graph.hpp" +#include "subgraph.hpp" +#include "statistics.hpp" +#include "algorithms/count_covered.hpp" +#include "algorithms/intersect_path_offsets.hpp" +#include "algorithms/extract_containing_graph.hpp" +#include "algorithms/extract_connecting_graph.hpp" +#include "algorithms/chain_items.hpp" + +#include +#include +#include + +#include +#include +#include +#include + +// Turn on debugging prints +//#define debug +// Turn on printing of minimizer fact tables +//#define print_minimizer_table +// Dump local graphs that we align against +//#define debug_dump_graph +// Dump fragment length distribution information +//#define debug_fragment_distr +//Do a brute force check that clusters are correct +//#define debug_validate_clusters +// Make sure by-index references are correct +//#define debug_validate_index_references + +namespace vg { + +using namespace std; + +MinimizerMapper::MinimizerMapper(const gbwtgraph::GBWTGraph& graph, + const gbwtgraph::DefaultMinimizerIndex& minimizer_index, + SnarlDistanceIndex* distance_index, + const PathPositionHandleGraph* path_graph) : + path_graph(path_graph), minimizer_index(minimizer_index), + distance_index(distance_index), + clusterer(distance_index, &graph), + gbwt_graph(graph), + extender(new GaplessExtender(gbwt_graph, *(get_regular_aligner()))), + fragment_length_distr(1000,1000,0.95) { + + // The GBWTGraph needs a GBWT + crash_unless(graph.index != nullptr); +} + +void MinimizerMapper::set_alignment_scores(const int8_t* score_matrix, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus) { + // Clear the extender before the aligners go away + extender.reset(); + // Call the base class method and remake the aligners + AlignerClient::set_alignment_scores(score_matrix, gap_open, gap_extend, full_length_bonus); + // Remake the extender with new references + extender.reset(new GaplessExtender(gbwt_graph, *(get_regular_aligner()))); +} + +//----------------------------------------------------------------------------- + +string MinimizerMapper::log_name() { + return "T" + to_string(omp_get_thread_num()) + ":\t"; +} + +string MinimizerMapper::log_alignment(const Alignment& aln) { + if (aln.sequence().size() < LONG_LIMIT && aln.path().mapping_size() < LONG_LIMIT/32 ) { + // Log as a short alignment + return pb2json(aln); + } else { + // Log as a long alignment + + stringstream ss; + ss << log_alignment(aln.path(), true); + ss << " score " << aln.score(); + return ss.str(); + } +} + +string MinimizerMapper::log_alignment(const Path& path, bool force_condensed) { + if (path.mapping_size() < LONG_LIMIT/32 && !force_condensed) { + // Log as a short alignment + return pb2json(path); + } else { + // Log as a long alignment + + // Turn it into one big CIGAR string + vector> cigar; + for (auto& mapping : path.mapping()) { + mapping_cigar(mapping, cigar); + } + + // And then put that + stringstream ss; + if (!cigar.empty()) { + ss << cigar_string(cigar); + } else { + ss << ""; + } + if (path.mapping_size() > 0) { + ss << "@" << path.mapping(0).position().node_id() + << (path.mapping(0).position().is_reverse() ? '-' : '+'); + } + return ss.str(); + + // TODO: Include visited nodes as collapsed ranges of consecutive IDs? 
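        // (Illustrative aside with made-up values: the condensed form produced here looks
        // like "2819M1I377M@5823+", i.e. the whole-path CIGAR followed by the starting
        // node ID and orientation; the Alignment overload above then appends " score N".)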
+ } +} + +string MinimizerMapper::log_bits(const std::vector& bits) { + stringstream ss; + + // We're going to take groups of 8 bits into here, and then emit them as + // 8-dot Braille characters. + unsigned char scratch = 0; + for (size_t i = 0; i < bits.size(); i++) { + // We want earlier bits to be top and left aligned, and this is easier + // if we make later bits the high bits and push earlier bits to the low + // bits. + // + // So shift everything down and put in at the highest place. + scratch = (scratch >> 1) | ((unsigned char)bits[i] << 7); + if (i % 8 == 7 || i + 1 == bits.size()) { + // Time to ship it + + // Make sure to finish aligning the first bit down if this is a + // partial byte. + scratch = scratch >> (7 - i % 8); + + // We want our layout to be (numbering LSB to MSB): + // + // 0 4 + // 1 5 + // 2 6 + // 3 7 + // + // Unicode Braille is laid out: + // + // 0 3 + // 1 4 + // 2 5 + // 6 7 + // + // So we have to: + // - Keep 012 and 7 in place + // - Move 3 up by 3 to become 6 + // - Move 456 down by 1 to become 345 + scratch = (scratch & 0b10000111) | + ((scratch & 0b00001000) << 3) | + ((scratch & 0b01110000) >> 1); + + // Braille block codepoints would be 0x2800 + scratch + + // Encode as UTF-8. These are all above U+0800 and below U+FFFF so + // we need 3 bytes. + // First 4 bits are always 0xe. + // First 4 data bits (0x2) go in low nibble of first byte. + ss << (char) 0xe2; + // Next two bits are always 0b10. + // Next 4 data bits (0x8) and high 2 bits of scratch go in next byte. + ss << (char) ((0b10 << 6) | (0x80 >> 2) | (scratch >> 6)); + // Next two bits are also always 0b10. + // Low 6 bits of scratch go in last byte. + ss << (char) ((0b10 << 6) | (scratch & 0b00111111)); + + // Clear out since the last cycle may be partial and we don't want + // stray bits. + scratch = 0; + } + } + return ss.str(); +} + +void MinimizerMapper::dump_chaining_problem(const std::vector& anchors, const std::vector& cluster_seeds_sorted, const HandleGraph& graph) { + ProblemDumpExplainer exp; + + // We need to keep track of all the points we want in our problem subgraph. 
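    // (Illustrative sketch, not from the original patch, of the JSON this dump emits,
    //  going by the keys written below; the values are made up and the pos_t / subgraph
    //  encodings are abbreviated as {...}:
    //
    //    {"items": [{"read_start": 10, "read_end": 41,
    //                "graph_start": {...}, "graph_end": {...}}, ...],
    //     "subgraph": {...}} )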
+ std::vector seed_positions; + seed_positions.reserve(cluster_seeds_sorted.size()); + + exp.object_start(); + + // Save all the items + exp.key("items"); + exp.array_start(); + for (auto& index : cluster_seeds_sorted) { + exp.object_start(); + exp.key("read_start"); + exp.value(anchors[index].read_start()); + exp.key("read_end"); + exp.value(anchors[index].read_end()); + + pos_t graph_start = anchors[index].graph_start(); + seed_positions.push_back(graph_start); + exp.key("graph_start"); + exp.value(graph_start); + exp.key("graph_end"); + exp.value(anchors[index].graph_end()); + + exp.object_end(); + } + exp.array_end(); + + // Get the subgraph for the cluster + HashGraph subgraph; + algorithms::extract_containing_graph(&graph, &subgraph, seed_positions, 10000); + exp.key("subgraph"); + exp.value(subgraph); + + exp.object_end(); +} + +void MinimizerMapper::dump_debug_sequence(ostream& out, const string& sequence, size_t start_offset, size_t length_limit) { + int digits_needed = (int) ceil(log10(std::min(sequence.size(), start_offset + length_limit))); + for (int digit = digits_needed - 1; digit >= 0; digit--) { + out << log_name(); + for (size_t i = start_offset; i < std::min(sequence.size(), start_offset + length_limit); i++) { + // Output the correct digit for this place in this number + out << (char) ('0' + (uint8_t) floor(i % (int) round(pow(10, digit + 1)) / pow(10, digit))); + } + out << endl; + } + out << log_name() << sequence.substr(start_offset, std::min(sequence.size() - start_offset, length_limit)) << endl; +} + +void MinimizerMapper::dump_debug_extension_set(const HandleGraph& graph, const Alignment& aln, const vector& extended_seeds) { + + if (aln.sequence().size() >= LONG_LIMIT) { + // Describe the extensions, because the read is huge + cerr << log_name() << "<" << extended_seeds.size() << " extensions>" << endl; + } else { + // Show a diagram + dump_debug_sequence(cerr, aln.sequence()); + + for (auto& ext : extended_seeds) { + // For each extension + + cerr << log_name(); + + for (size_t i = 0; i < ext.read_interval.first; i++) { + // Space until it starts + cerr << ' '; + } + + for (size_t i = ext.read_interval.first; i < ext.read_interval.second; i++) { + if (std::find(ext.mismatch_positions.begin(), ext.mismatch_positions.end(), i) != ext.mismatch_positions.end()) { + // Has an error here + cerr << "*"; + } else { + // A match + cerr << aln.sequence()[i]; + } + } + cerr << " @"; + for (const handle_t& h : ext.path) { + cerr << " " << graph.get_id(h); + } + cerr << endl; + } + } +} + +void MinimizerMapper::dump_debug_minimizers(const VectorView& minimizers, const string& sequence, const vector* to_include, size_t start_offset, size_t length_limit) { + + // Work out what region we are actually looking at + size_t region_start = start_offset; + size_t region_length = std::min(sequence.size() - start_offset, length_limit); + + if (region_length >= LONG_LIMIT) { + // Describe the minimizers, because the read is huge + size_t minimizer_count = to_include ? 
to_include->size() : minimizers.size(); + if (minimizer_count < MANY_LIMIT) { + auto print_minimizer = [&](size_t i) { + cerr << log_name() << "Minimizer " << i << ": " << minimizers[i].forward_sequence() << "@" << minimizers[i].forward_offset() << " with " << minimizers[i].hits << " hits" << endl; + }; + + if (to_include) { + for (auto& i : *to_include) { + print_minimizer(i); + } + } else { + for (size_t i = 0; i < minimizers.size(); i++) { + print_minimizer(i); + } + } + } else { + if (region_start == 0 && length_limit == sequence.size()) { + // Report as if we have a count + cerr << log_name() << "<" << minimizer_count << " minimizers>" << endl; + } else { + // We don't know how many minimizers are actually in the region + cerr << log_name() << "" << endl; + } + } + } else { + // Draw a diagram + dump_debug_sequence(cerr, sequence, region_start, region_length); + + vector all; + if (to_include == nullptr) { + // Synthesize a list of all minimizers + to_include = &all; + for (size_t i = 0; i < minimizers.size(); i++) { + all.push_back(i); + } + + // Sort minimizer subset so we go through minimizers in increasing order of start position + std::sort(all.begin(), all.end(), [&](size_t a, size_t b) { + // Return true if a must come before b, and false otherwise + return minimizers[a].forward_offset() < minimizers[b].forward_offset(); + }); + } + + // Dump minimizers + for (auto& index : *to_include) { + // For each minimizer + auto& m = minimizers[index]; + + if (m.forward_offset() < region_start || m.forward_offset() - region_start + m.length > region_length) { + // Minimizer itself reaches out of bounds, so hide it + continue; + } + + // Compute its traits relative to the region we are interested in showing + size_t relative_agglomeration_start = (m.agglomeration_start < region_start) ? (size_t)0 : m.agglomeration_start - region_start; + size_t relative_forward_offset = m.forward_offset() - region_start; + size_t capped_agglomeration_length = (relative_agglomeration_start + m.agglomeration_length > region_length) ? (region_length - relative_agglomeration_start) : m.agglomeration_length; + + cerr << log_name(); + + + for (size_t i = 0; i < relative_agglomeration_start; i++) { + // Space until its agglomeration starts + cerr << ' '; + } + + for (size_t i = relative_agglomeration_start; i < relative_forward_offset; i++) { + // Do the beginnign of the agglomeration + cerr << '-'; + } + // Do the minimizer itself + cerr << m.value.key.decode(m.length); + for (size_t i = relative_forward_offset + m.length ; i < relative_agglomeration_start + capped_agglomeration_length; i++) { + // Do the tail end of the agglomeration + cerr << '-'; + } + + // Tag with metadata + cerr << " (#" << index << ", " << m.hits << " hits)" << endl; + } + } +} + +void MinimizerMapper::dump_debug_clustering(const Cluster& cluster, size_t cluster_number, const VectorView& minimizers, const std::vector& seeds) { + if (minimizers.size() < MANY_LIMIT) { + // There are a few minimizers overall, so describe each in the cluster individually. 
+ for (auto hit_index : cluster.seeds) { + cerr << log_name() << "Minimizer " << seeds[hit_index].source << " is present in cluster " << cluster_number << endl; + } + } else { + // Describe the seeds in aggregate + std::vector presence(minimizers.size()); + for (auto hit_index : cluster.seeds) { + presence[seeds[hit_index].source] = true; + } + cerr << log_name() << "Minimizers in cluster " << cluster_number << "\t" << log_bits(presence) << endl; + } +} +bool MinimizerMapper::validate_clusters(const std::vector>& clusters, const std::vector>& seeds, size_t read_limit, size_t fragment_limit) const { + vector> fragment_clusters; + for (size_t read_num = 0 ; read_num < clusters.size() ; read_num ++) { + auto& one_read_clusters = clusters[read_num]; + if (one_read_clusters.size() > 0) { + for (size_t cluster_num1 = 0; cluster_num1 < one_read_clusters.size(); cluster_num1++) { + // For each cluster -cluster this cluster to ensure that there is only one + vector clust = one_read_clusters[cluster_num1].seeds; + size_t fragment_cluster = one_read_clusters[cluster_num1].fragment; + if (fragment_cluster >= fragment_clusters.size()) { + fragment_clusters.resize(fragment_cluster+1); + } + + structures::UnionFind new_clusters (clust.size(), false); + for (size_t i1 = 0 ; i1 < clust.size() ; i1++) { + pos_t pos1 = seeds[read_num][clust[i1]].pos; + fragment_clusters[fragment_cluster].emplace_back(pos1); + + for (size_t cluster_num2 = 0 ; cluster_num2 < one_read_clusters.size() ; cluster_num2++) { + if (cluster_num2 != cluster_num1) { + //For each other cluster, make sure that the seeds in the other cluster are far away + vector clust2 = one_read_clusters[cluster_num2].seeds; + for (size_t i2 = 0 ; i2 < clust2.size() ; i2++) { + //And each position in each other cluster, + //make sure that this position is far away from i1 + pos_t pos2 = seeds[read_num][clust2[i2]].pos;; + size_t distance_between = unoriented_distance_between(pos1, pos2); + + if ( distance_between != std::numeric_limits::max() && + distance_between <= read_limit) { + cerr << "These should have been in the same read cluster: " ; + cerr << pos1 << " and " << pos2 << " with distance between them " << distance_between << endl; + return false; + } + + } + } + } + for (size_t i2 = 0 ; i2 < clust.size() ; i2++) { + //For each position in the same cluster, make sure it's close to something + pos_t pos2 = seeds[read_num][clust[i2]].pos; + size_t distance_between = unoriented_distance_between(pos1, pos2); + if ( distance_between != std::numeric_limits::max() && + distance_between <= read_limit) { + new_clusters.union_groups(i1, i2); + } + + } + } + auto actual_clusters = new_clusters.all_groups(); + if (actual_clusters.size() != 1) { + cerr << "These should be different read clusters: " << endl; + for (auto c : actual_clusters) { + cerr << "cluster: " ; + for (size_t i : c) { + cerr << seeds[read_num][clust[i]].pos << " "; + } + cerr << endl; + } + return false; + } + } + } + } + //Now check the fragment clusters + if (fragment_limit != 0) { + for (size_t cluster_num1 = 0; cluster_num1 < fragment_clusters.size(); cluster_num1++) { + // For each cluster -cluster this cluster to ensure that + // there is only one + vector clust = fragment_clusters[cluster_num1]; + + structures::UnionFind new_clusters (clust.size(), false); + + for (size_t i1 = 0 ; i1 < clust.size() ; i1++) { + pos_t pos1 = clust[i1]; + + for (size_t cluster_num2 = 0 ; cluster_num2 < fragment_clusters.size() ; cluster_num2++) { + if (cluster_num2 != cluster_num1) { + //For each other 
cluster + vector clust2 = fragment_clusters[cluster_num2]; + for (size_t i2 = 0 ; i2 < clust2.size() ; i2++) { + //And each position in each other cluster, + //make sure that this position is far away from i1 + pos_t pos2 = clust2[i2]; + int64_t distance_between = unoriented_distance_between(pos1, pos2); + if ( distance_between != std::numeric_limits::max() && + distance_between <= fragment_limit) { + cerr << "These should have been in the same fragment cluster: " ; + cerr << pos1 << " and " << pos2 << " with distance between " << distance_between << " and distance limit " << fragment_limit << endl; + return false; + + } + + } + } + } + for (size_t i2 = 0 ; i2 < clust.size() ; i2++) { + //For each position in the same cluster + pos_t pos2 = clust[i2]; + int64_t distance_between = unoriented_distance_between(pos1, pos2); + if ( distance_between != std::numeric_limits::max() && + distance_between <= fragment_limit) { + new_clusters.union_groups(i1, i2); + } + + } + } + auto actual_clusters = new_clusters.all_groups(); + if (actual_clusters.size() != 1) { + cerr << "These should be different fragment clusters with distance limit " << fragment_limit << endl; + for (auto c : actual_clusters) { + cerr << "cluster: " ; + for (size_t i1 : c) { + cerr << clust[i1] << " "; + } + cerr << endl; + } + return false; + } + } + } + return true; +} + + + +void MinimizerMapper::dump_debug_seeds(const VectorView& minimizers, const std::vector& seeds, const std::vector& selected_seeds) { + if (selected_seeds.size() < MANY_LIMIT) { + // There are a few seeds so describe them individually. + for (auto seed_index : selected_seeds) { + const Seed& seed = seeds[seed_index]; + const Minimizer& minimizer = minimizers[seed.source]; + cerr << log_name() << "Seed read:" << minimizer.value.offset << (minimizer.value.is_reverse ? 
'-' : '+') << " = " << seed.pos + << " from minimizer " << seed.source << "(" << minimizer.hits << "), #" << seed_index << endl; + } + } else { + // Describe the seeds in aggregate + size_t min_read_pos = std::numeric_limits::max(); + size_t max_read_pos = 0; + size_t min_hits = std::numeric_limits::max(); + size_t max_hits = 0; + for (auto seed_index : selected_seeds) { + // Compute statistics over all the seeds + const Seed& seed = seeds[seed_index]; + const Minimizer& minimizer = minimizers[seed.source]; + min_read_pos = std::min(min_read_pos, (size_t)minimizer.value.offset); + max_read_pos = std::max(max_read_pos, (size_t)minimizer.value.offset); + min_hits = std::min(min_hits, minimizer.hits); + max_hits = std::max(max_hits, minimizer.hits); + } + cerr << log_name() << selected_seeds.size() << " seeds in read:" << min_read_pos << "-" << max_read_pos << " with " << min_hits << "-" << max_hits << " hits" << endl; + } +} + +void MinimizerMapper::dump_debug_query(const Alignment& aln) { + cerr << log_name() << "Read " << aln.name() << ": " ; + if (aln.sequence().size() < LONG_LIMIT) { + cerr << aln.sequence(); + } else { + cerr << "<" << aln.sequence().size() << " bp>"; + } + cerr << endl; +} + +void MinimizerMapper::dump_debug_query(const Alignment& aln1, const Alignment& aln2) { + cerr << log_name() << "Read pair " << aln1.name() << ": "; + if (aln1.sequence().size() < LONG_LIMIT) { + cerr << aln1.sequence(); + } else { + cerr << "<" << aln1.sequence().size() << " bp>"; + } + cerr << " and " << aln2.name() << ": "; + if (aln2.sequence().size() < LONG_LIMIT) { + cerr << aln2.sequence(); + } else { + cerr << "<" << aln2.sequence().size() << " bp>"; + } + cerr << endl; +} + + +//----------------------------------------------------------------------------- + +void MinimizerMapper::map(Alignment& aln, AlignmentEmitter& alignment_emitter) { + // Ship out all the aligned alignments + alignment_emitter.emit_mapped_single(map(aln)); +} + +vector MinimizerMapper::map(Alignment& aln) { + if (align_from_chains) { + return map_from_chains(aln); + } else { + return map_from_extensions(aln); + } +} + +vector MinimizerMapper::map_from_extensions(Alignment& aln) { + + if (show_work) { + #pragma omp critical (cerr) + dump_debug_query(aln); + } + + // Make a new funnel instrumenter to watch us map this read. + Funnel funnel; + funnel.start(aln.name()); + + // Prepare the RNG for shuffling ties, if needed + LazyRNG rng([&]() { + return aln.sequence(); + }); + + + // Minimizers sorted by score in descending order. + std::vector minimizers = this->find_minimizers(aln.sequence(), funnel); + + // Find the seeds and mark the minimizers that were located. + vector seeds = this->find_seeds(minimizers, aln, funnel); + + // Cluster the seeds. Get sets of input seed indexes that go together. + if (track_provenance) { + funnel.stage("cluster"); + } + + // Find the clusters + std::vector clusters = clusterer.cluster_seeds(seeds, get_distance_limit(aln.sequence().size())); + +#ifdef debug_validate_clusters + vector> all_clusters; + all_clusters.emplace_back(clusters); + vector> all_seeds; + all_seeds.emplace_back(seeds); + validate_clusters(all_clusters, all_seeds, get_distance_limit(aln.sequence().size()), 0); +#endif + + // Determine the scores and read coverages for each cluster. + // Also find the best and second-best cluster scores. 
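    // (Worked example with made-up numbers for the cutoff logic just below: if the best
    // cluster scores 100 and cluster_score_threshold is 50, the cutoff starts at 50; if the
    // second-best cluster scores 45 and pad_cluster_score_threshold is 20, then 50 - 20 < 45,
    // so the cutoff is lowered to 45 and that second-best cluster still passes the score filter.)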
+ if (this->track_provenance) { + funnel.substage("score"); + } + double best_cluster_score = 0.0, second_best_cluster_score = 0.0; + for (size_t i = 0; i < clusters.size(); i++) { + Cluster& cluster = clusters[i]; + this->score_cluster(cluster, i, minimizers, seeds, aln.sequence().length(), funnel); + if (cluster.score > best_cluster_score) { + second_best_cluster_score = best_cluster_score; + best_cluster_score = cluster.score; + } else if (cluster.score > second_best_cluster_score) { + second_best_cluster_score = cluster.score; + } + } + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Found " << clusters.size() << " clusters" << endl; + } + } + + // We will set a score cutoff based on the best, but move it down to the + // second best if it does not include the second best and the second best + // is within pad_cluster_score_threshold of where the cutoff would + // otherwise be. This ensures that we won't throw away all but one cluster + // based on score alone, unless it is really bad. + double cluster_score_cutoff = best_cluster_score - cluster_score_threshold; + if (cluster_score_cutoff - pad_cluster_score_threshold < second_best_cluster_score) { + cluster_score_cutoff = std::min(cluster_score_cutoff, second_best_cluster_score); + } + + if (track_provenance) { + // Now we go from clusters to gapless extensions + funnel.stage("extend"); + } + + // These are the GaplessExtensions for all the clusters. + vector> cluster_extensions; + cluster_extensions.reserve(clusters.size()); + + // To compute the windows for explored minimizers, we need to get + // all the minimizers that are explored. + SmallBitset minimizer_explored(minimizers.size()); + //How many hits of each minimizer ended up in each extended cluster? + vector> minimizer_extended_cluster_count; + + size_t kept_cluster_count = 0; + + //Process clusters sorted by both score and read coverage + process_until_threshold_c(clusters.size(), [&](size_t i) -> double { + return clusters[i].coverage; + }, [&](size_t a, size_t b) -> bool { + return ((clusters[a].coverage > clusters[b].coverage) || + (clusters[a].coverage == clusters[b].coverage && clusters[a].score > clusters[b].score)); + }, cluster_coverage_threshold, min_extensions, max_extensions, rng, [&](size_t cluster_num) -> bool { + // Handle sufficiently good clusters in descending coverage order + + Cluster& cluster = clusters[cluster_num]; + if (track_provenance) { + funnel.pass("cluster-coverage", cluster_num, cluster.coverage); + funnel.pass("max-extensions", cluster_num); + } + + // First check against the additional score filter + if (cluster_score_threshold != 0 && cluster.score < cluster_score_cutoff + && kept_cluster_count >= min_extensions) { + //If the score isn't good enough and we already kept at least min_extensions clusters, + //ignore this cluster + if (track_provenance) { + funnel.fail("cluster-score", cluster_num, cluster.score); + } + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Cluster " << cluster_num << " fails cluster score cutoff" << endl; + cerr << log_name() << "Covers " << clusters[cluster_num].coverage << "/best-" << cluster_coverage_threshold << " of read" << endl; + cerr << log_name() << "Scores " << clusters[cluster_num].score << "/" << cluster_score_cutoff << endl; + } + } + return false; + } + + if (track_provenance) { + funnel.pass("cluster-score", cluster_num, cluster.score); + } + + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Cluster " << cluster_num 
<< endl; + cerr << log_name() << "Covers " << cluster.coverage << "/best-" << cluster_coverage_threshold << " of read" << endl; + cerr << log_name() << "Scores " << cluster.score << "/" << cluster_score_cutoff << endl; + } + } + + // Extend seed hits in the cluster into one or more gapless extensions + cluster_extensions.emplace_back(this->extend_cluster( + cluster, + cluster_num, + minimizers, + seeds, + aln.sequence(), + minimizer_extended_cluster_count, + funnel)); + + kept_cluster_count ++; + + return true; + + }, [&](size_t cluster_num) -> void { + // There are too many sufficiently good clusters + Cluster& cluster = clusters[cluster_num]; + if (track_provenance) { + funnel.pass("cluster-coverage", cluster_num, cluster.coverage); + funnel.fail("max-extensions", cluster_num); + } + + if (show_work) { + #pragma omp critical (cerr) + { + + cerr << log_name() << "Cluster " << cluster_num << " passes cluster cutoffs but we have too many" << endl; + cerr << log_name() << "Covers " << cluster.coverage << "/best-" << cluster_coverage_threshold << " of read" << endl; + cerr << log_name() << "Scores " << cluster.score << "/" << cluster_score_cutoff << endl; + } + } + + }, [&](size_t cluster_num) -> void { + // This cluster is not sufficiently good. + if (track_provenance) { + funnel.fail("cluster-coverage", cluster_num, clusters[cluster_num].coverage); + } + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Cluster " << cluster_num << " fails cluster coverage cutoffs" << endl; + cerr << log_name() << "Covers " << clusters[cluster_num].coverage << "/best-" << cluster_coverage_threshold << " of read" << endl; + cerr << log_name() << "Scores " << clusters[cluster_num].score << "/" << cluster_score_cutoff << endl; + } + } + }); + + std::vector cluster_extension_scores = this->score_extensions(cluster_extensions, aln, funnel); + if (track_provenance) { + funnel.stage("align"); + } + + //How many of each minimizer ends up in an extension set that actually gets turned into an alignment? + vector minimizer_extensions_count(minimizers.size(), 0); + + // Now start the alignment step. Everything has to become an alignment. + + // We will fill this with all computed alignments in estimated score order. + vector alignments; + alignments.reserve(cluster_extensions.size()); + // This maps from alignment index back to cluster extension index, for + // tracing back to minimizers for MAPQ. Can hold + // numeric_limits::max() for an unaligned alignment. + vector alignments_to_source; + alignments_to_source.reserve(cluster_extensions.size()); + + // Create a new alignment object to get rid of old annotations. + { + Alignment temp; + temp.set_sequence(aln.sequence()); + temp.set_name(aln.name()); + temp.set_quality(aln.quality()); + aln = std::move(temp); + } + + // Annotate the read with metadata + if (!sample_name.empty()) { + aln.set_sample_name(sample_name); + } + if (!read_group.empty()) { + aln.set_read_group(read_group); + } + + // We need to be able to discard a processed cluster because its score isn't good enough. + // We have more components to the score filter than process_until_threshold_b supports. + auto discard_processed_cluster_by_score = [&](size_t extension_num) -> void { + // This extension is not good enough. 
+ if (track_provenance) { + funnel.fail("extension-set", extension_num, cluster_extension_scores[extension_num]); + } + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "processed cluster " << extension_num << " failed because its score was not good enough (score=" << cluster_extension_scores[extension_num] << ")" << endl; + if (track_correctness && funnel.was_correct(extension_num)) { + cerr << log_name() << "\tCORRECT!" << endl; + dump_debug_extension_set(gbwt_graph, aln, cluster_extensions[extension_num]); + } + } + } + }; + + // Go through the gapless extension groups in score order. + process_until_threshold_b(cluster_extension_scores, + extension_set_score_threshold, min_extension_sets, max_alignments, rng, [&](size_t extension_num) -> bool { + // This extension set is good enough. + // Called in descending score order. + + if (cluster_extension_scores[extension_num] < extension_set_min_score) { + // Actually discard by score + discard_processed_cluster_by_score(extension_num); + return false; + } + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "gapless extension group " << extension_num << " is good enough (score=" << cluster_extension_scores[extension_num] << ")" << endl; + if (track_correctness && funnel.was_correct(extension_num)) { + cerr << log_name() << "\tCORRECT!" << endl; + dump_debug_extension_set(gbwt_graph, aln, cluster_extensions[extension_num]); + } + } + } + if (track_provenance) { + funnel.pass("extension-set", extension_num, cluster_extension_scores[extension_num]); + funnel.pass("max-alignments", extension_num); + funnel.processing_input(extension_num); + } + + auto& extensions = cluster_extensions[extension_num]; + + // Collect the top alignments. Make sure we have at least one always, starting with unaligned. + vector best_alignments(1, aln); + + if (GaplessExtender::full_length_extensions(extensions)) { + // We got full-length extensions, so directly convert to an Alignment. + + if (track_provenance) { + funnel.substage("direct"); + } + + //Fill in the best alignments from the extension. We know the top one is always full length and exists. + this->extension_to_alignment(extensions.front(), best_alignments.front()); + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Produced alignment directly from full length gapless extension " << extension_num << endl; + } + } + + for (auto next_ext_it = extensions.begin() + 1; next_ext_it != extensions.end() && next_ext_it->full(); ++next_ext_it) { + // For all subsequent full length extensions, make them into alignments too. + // We want them all to go on to the pairing stage so we don't miss a possible pairing in a tandem repeat. + best_alignments.emplace_back(aln); + this->extension_to_alignment(*next_ext_it, best_alignments.back()); + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Produced additional alignment directly from full length gapless extension " << (next_ext_it - extensions.begin()) << endl; + } + } + + } + + if (track_provenance) { + // Stop the current substage + funnel.substage_stop(); + } + } else if (do_dp) { + // We need to do base-level alignment. 
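            // (Editorial aside, hedged: the gapless extensions cover only part of the read
            // and contain no gaps, so find_optimal_tail_alignments() below runs dynamic
            // programming on the remaining read tails against the local graph and, per the
            // comment that follows, fills in up to two candidate alignments.)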
+ + if (track_provenance) { + funnel.substage("align"); + } + + // Do the DP and compute up to 2 alignments from the individual gapless extensions + best_alignments.emplace_back(aln); + find_optimal_tail_alignments(aln, extensions, rng, best_alignments[0], best_alignments[1]); + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Did dynamic programming for gapless extension group " << extension_num << endl; + } + } + + if (track_provenance) { + // We're done base-level alignment. Next alignment may not go through this substage. + funnel.substage_stop(); + } + } else { + // We would do base-level alignment but it is disabled. + // Leave best_alignment unaligned + } + + // Have a function to process the best alignments we obtained + auto observe_alignment = [&](Alignment& aln) { + alignments.emplace_back(std::move(aln)); + alignments_to_source.push_back(extension_num); + + if (track_provenance) { + + funnel.project(extension_num); + funnel.score(alignments.size() - 1, alignments.back().score()); + } + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Produced alignment from gapless extension group " << extension_num + << " with score " << alignments.back().score() << ": " << log_alignment(alignments.back()) << endl; + } + } + }; + + for(auto aln_it = best_alignments.begin() ; aln_it != best_alignments.end() && aln_it->score() != 0 && aln_it->score() >= best_alignments[0].score() * 0.8; ++aln_it) { + //For each additional alignment with score at least 0.8 of the best score + observe_alignment(*aln_it); + } + + + if (track_provenance) { + // We're done with this input item + funnel.processed_input(); + } + + for (size_t i = 0 ; i < minimizer_extended_cluster_count[extension_num].size() ; i++) { + minimizer_extensions_count[i] += minimizer_extended_cluster_count[extension_num][i]; + if (minimizer_extended_cluster_count[extension_num][i] > 0) { + // This minimizer is in an extended cluster that gave rise + // to at least one alignment, so it is explored. + minimizer_explored.insert(i); + } + } + + return true; + }, [&](size_t extension_num) -> void { + // There are too many sufficiently good extensions + if (track_provenance) { + funnel.pass("extension-set", extension_num, cluster_extension_scores[extension_num]); + funnel.fail("max-alignments", extension_num); + } + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "gapless extension group " << extension_num << " failed because there were too many good extensions (score=" << cluster_extension_scores[extension_num] << ")" << endl; + if (track_correctness && funnel.was_correct(extension_num)) { + cerr << log_name() << "\tCORRECT!" << endl; + dump_debug_extension_set(gbwt_graph, aln, cluster_extensions[extension_num]); + } + } + } + }, discard_processed_cluster_by_score); + + if (alignments.size() == 0) { + // Produce an unaligned Alignment + alignments.emplace_back(aln); + alignments_to_source.push_back(numeric_limits::max()); + + if (track_provenance) { + // Say it came from nowhere + funnel.introduce(); + } + } + + if (track_provenance) { + // Now say we are finding the winner(s) + funnel.stage("winner"); + } + + // Fill this in with the alignments we will output as mappings + vector mappings; + mappings.reserve(min(alignments.size(), max_multimaps)); + + // Grab all the scores in order for MAPQ computation. 
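    // (Illustrative aside: the scores vector below also records alignments dropped by the
    // max-multimaps filter, so the MAPQ from compute_max_mapping_quality() reflects every
    // competing alignment. With made-up numbers, an uncapped MAPQ of 38.7 and an
    // explored-minimizer cap of 22 would be reported as round(min(22, min(38.7, 60))) = 22.)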
+ vector scores; + scores.reserve(alignments.size()); + + process_until_threshold_a(alignments.size(), (std::function) [&](size_t i) -> double { + return alignments.at(i).score(); + }, 0, 1, max_multimaps, rng, [&](size_t alignment_num) { + // This alignment makes it + // Called in score order + + // Remember the score at its rank + scores.emplace_back(alignments[alignment_num].score()); + + // Remember the output alignment + mappings.emplace_back(std::move(alignments[alignment_num])); + + if (track_provenance) { + // Tell the funnel + funnel.pass("max-multimaps", alignment_num); + funnel.project(alignment_num); + funnel.score(funnel.latest(), scores.back()); + } + + return true; + }, [&](size_t alignment_num) { + // We already have enough alignments, although this one has a good score + + // Remember the score at its rank anyway + scores.emplace_back(alignments[alignment_num].score()); + + if (track_provenance) { + funnel.fail("max-multimaps", alignment_num); + } + }, [&](size_t alignment_num) { + // This alignment does not have a sufficiently good score + // Score threshold is 0; this should never happen + crash_unless(false); + }); + + if (track_provenance) { + funnel.substage("mapq"); + } + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Picked best alignment " << log_alignment(mappings[0]) << endl; + cerr << log_name() << "For scores"; + for (auto& score : scores) cerr << " " << score << ":" << endl; + } + } + + crash_unless(!mappings.empty()); + // Compute MAPQ if not unmapped. Otherwise use 0 instead of the 50% this would give us. + // Use exact mapping quality + double mapq = (mappings.front().path().mapping_size() == 0) ? 0 : + get_regular_aligner()->compute_max_mapping_quality(scores, false) ; + +#ifdef print_minimizer_table + double uncapped_mapq = mapq; +#endif + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "uncapped MAPQ is " << mapq << endl; + } + } + + // TODO: give SmallBitset iterators so we can use it instead of an index vector. + vector explored_minimizers; + for (size_t i = 0; i < minimizers.size(); i++) { + if (minimizer_explored.contains(i)) { + explored_minimizers.push_back(i); + } + } + // Compute caps on MAPQ. TODO: avoid needing to pass as much stuff along. + double escape_bonus = mapq < std::numeric_limits::max() ? 1.0 : 2.0; + double mapq_explored_cap = escape_bonus * faster_cap(minimizers, explored_minimizers, aln.sequence(), aln.quality()); + + // Remember the uncapped MAPQ and the caps + set_annotation(mappings.front(),"secondary_scores", scores); + set_annotation(mappings.front(), "mapq_uncapped", mapq); + set_annotation(mappings.front(), "mapq_explored_cap", mapq_explored_cap); + + // Apply the caps and transformations + mapq = round(min(mapq_explored_cap, min(mapq, 60.0))); + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Explored cap is " << mapq_explored_cap << endl; + cerr << log_name() << "MAPQ is " << mapq << endl; + } + } + + // Make sure to clamp 0-60. 
+ mappings.front().set_mapping_quality(max(min(mapq, 60.0), 0.0)); + + + if (track_provenance) { + funnel.substage_stop(); + } + + for (size_t i = 0; i < mappings.size(); i++) { + // For each output alignment in score order + auto& out = mappings[i]; + + // Assign primary and secondary status + out.set_is_secondary(i > 0); + } + + // Stop this alignment + funnel.stop(); + + // Annotate with whatever's in the funnel + funnel.annotate_mapped_alignment(mappings[0], track_correctness); + + if (track_provenance) { + if (track_correctness) { + annotate_with_minimizer_statistics(mappings[0], minimizers, seeds, seeds.size(), 0, funnel); + } + // Annotate with parameters used for the filters. + set_annotation(mappings[0], "param_hit-cap", (double) hit_cap); + set_annotation(mappings[0], "param_hard-hit-cap", (double) hard_hit_cap); + set_annotation(mappings[0], "param_score-fraction", (double) minimizer_score_fraction); + set_annotation(mappings[0], "param_max-extensions", (double) max_extensions); + set_annotation(mappings[0], "param_max-alignments", (double) max_alignments); + set_annotation(mappings[0], "param_cluster-score", (double) cluster_score_threshold); + set_annotation(mappings[0], "param_cluster-coverage", (double) cluster_coverage_threshold); + set_annotation(mappings[0], "param_extension-set", (double) extension_set_score_threshold); + set_annotation(mappings[0], "param_max-multimaps", (double) max_multimaps); + } + +#ifdef print_minimizer_table + cerr << aln.sequence() << "\t"; + for (char c : aln.quality()) { + cerr << (char)(c+33); + } + cerr << "\t" << clusters.size(); + for (size_t i = 0 ; i < minimizers.size() ; i++) { + auto& minimizer = minimizers[i]; + cerr << "\t" + << minimizer.value.key.decode(minimizer.length) << "\t" + << minimizer.forward_offset() << "\t" + << minimizer.agglomeration_start << "\t" + << minimizer.agglomeration_length << "\t" + << minimizer.hits << "\t" + << minimizer_extensions_count[i]; + if (minimizer_extensions_count[i]>0) { + crash_unless(minimizer.hits<=hard_hit_cap) ; + } + } + cerr << "\t" << uncapped_mapq << "\t" << mapq_explored_cap << "\t" << mappings.front().mapping_quality() << "\t"; + cerr << "\t"; + for (auto& score : scores) { + cerr << score << ","; + } + if (track_correctness) { + cerr << "\t" << funnel.last_correct_stage() << endl; + } else { + cerr << "\t" << "?" << endl; + } +#endif + + if (track_provenance && show_work && aln.sequence().size() < LONG_LIMIT) { + // Dump the funnel info graph. + // TODO: Add a new flag for this. + #pragma omp critical (cerr) + { + funnel.to_dot(cerr); + } + } + + return mappings; +} + +// For MinimizerMapper::map_from_chains() see ./minimizer_mapper_from_chains.cpp + +//----------------------------------------------------------------------------- + +void MinimizerMapper::pair_all(std::array, 2>& mappings) const { + for (auto r : {0, 1}) { + if (!mappings[r].empty()) { + for (auto& other : mappings[1 - r]) { + // Each read 1 needs to point to read 2 as its successor + // Each read 2 needs to point to read 1 as its predecessor + // If we started with the first read, other is the second read, so it needs to point to the first read as prev. + Alignment* ref = r == 0 ? 
other.mutable_fragment_prev() : other.mutable_fragment_next(); + ref->set_name(mappings[r].front().name()); + } + } + } +} + +pair, vector> MinimizerMapper::map_paired(Alignment& aln1, Alignment& aln2, + vector>& ambiguous_pair_buffer){ + if (fragment_length_distr.is_finalized()) { + + //If we know the fragment length distribution then we just map paired ended + return map_paired(aln1, aln2); + } else { + std::array alns {&aln1, &aln2}; + + std::array, 2> single; + std::array max_score_aln; + bool both_perfect_unique = true; + for (auto r : {0, 1}) { + //If we don't know the fragment length distribution, map the reads single ended + single[r] = std::move(map(*alns[r])); + // Check if the separately-mapped ends are both sufficiently perfect and sufficiently unique + max_score_aln[r] = get_regular_aligner()->score_exact_match(*alns[r], 0, alns[r]->sequence().size()); + both_perfect_unique = both_perfect_unique && !single[r].empty() && single[r].front().mapping_quality() == 60 && single[r].front().score() >= max_score_aln[r] * 0.85; + } + + if (both_perfect_unique) { + //Flip the second alignment to get the proper fragment distance + reverse_complement_alignment_in_place(&single[1].front(), [&](vg::id_t node_id) { + return gbwt_graph.get_length(gbwt_graph.get_handle(node_id)); + }); + int64_t dist = distance_between(single[0].front(), single[1].front()); + // And that they have an actual pair distance and set of relative orientations + + if (dist == std::numeric_limits::max() || + dist >= max_fragment_length) { + //If the distance between them is ambiguous or it it large enough that we don't think it's valid, leave them unmapped + + ambiguous_pair_buffer.emplace_back(aln1, aln2); + pair, vector> empty; + return empty; + } + + //If we're keeping this alignment, flip the second alignment back + reverse_complement_alignment_in_place(&single[1].front(), [&](vg::id_t node_id) { + return gbwt_graph.get_length(gbwt_graph.get_handle(node_id)); + }); + // If that all checks out, say they're mapped, emit them, and register their distance and orientations + fragment_length_distr.register_fragment_length(dist); + + std::array, 2> mapped_pair; + for (auto r : {0, 1}) { + mapped_pair[r].emplace_back(std::move(single[r].front())); + } + pair_all(mapped_pair); + + //TODO: This is "properly paired" if there is a path between the reads. 
Since we + //haven't finalized the distribution we can't compare it + bool properly_paired = dist != std::numeric_limits::max(); + for (auto r : {0, 1}) { + set_annotation(mapped_pair[r].back(), "proper_pair", properly_paired); + } + +#ifdef debug_fragment_distr + //Print stats about finalizing the fragment length distribution, copied from mpmap + if (fragment_length_distr.is_finalized()) { + cerr << "finalized read distribution with " << fragment_length_distr.max_sample_size() << " measurements" << endl; + cerr << "mean: " << fragment_length_distr.mean() << endl; + cerr << "std dev: " << fragment_length_distr.std_dev() << endl; + cerr << "ambiguous buffer contains pairs:" << endl; + for (pair& aln_pair : ambiguous_pair_buffer) { + cerr << "\t" << aln_pair.first.name() << ", " << aln_pair.second.name() << endl; + } + cerr << "distance measurements:" << endl; + auto iter = fragment_length_distr.measurements_begin(); + if (iter != fragment_length_distr.measurements_end()) { + cerr << *iter; + iter++; + } + for (; iter != fragment_length_distr.measurements_end(); iter++) { + cerr << ", " << *iter; + } + cerr << endl; + } +#endif + + + return {std::move(mapped_pair[0]), std::move(mapped_pair[1])}; + + } else { + // Otherwise, discard the mappings and put them in the ambiguous buffer + + ambiguous_pair_buffer.emplace_back(aln1, aln2); + return {}; + } + } +} + +// For paired-end alignment we use a bunch of structures indexed by clustering +// fragment, then read, then alignment of the read. So we define some index +// types and lookup functions to deal with these. +// TODO: Make these local classes when C++ learns to let you use template members in local classes. + +/// Type to point to an alignment of a known read +struct read_alignment_index_t { + size_t fragment; + size_t alignment; + + /// Allow looking up this index in a structure organized by fragment, + /// read, and alignment, such as in alignments, or alignment_indices. + template + auto lookup_for_read_in(bool read, NestedArray& a) const -> typename std::add_lvalue_reference::type { + return a[fragment][read][alignment]; + } + + /// Make sure that this index actually points to an alignment of the given + /// read in the given structure. Throws if not. + template + void check_for_read_in(bool read, NestedArray& a) const { + a.at(fragment).at(read).at(alignment); + } + + // Allow comparison + inline bool operator==(const read_alignment_index_t& other) { + return fragment == other.fragment && alignment == other.alignment; + }; + inline bool operator!=(const read_alignment_index_t& other) { + return !(*this == other); + }; +}; +/// Represents an unset index +const read_alignment_index_t NO_READ_INDEX = {std::numeric_limits::infinity(), std::numeric_limits::infinity()}; + +/// Type to point to an alignment of either read +/// +struct alignment_index_t { + size_t fragment; + size_t alignment; + bool read; + + /// Drop the read field + inline read_alignment_index_t without_read() { + return read_alignment_index_t {fragment, alignment}; + } + + /// Allow looking up this index in a structure organized by fragment, + /// read, and alignment, such as in alignments, or alignment_indices. 
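    /// (Illustrative example, not in the original patch: for an index with fragment = 2,
    /// alignment = 0, read = true, lookup_in(alignments) resolves to alignments[2][1][0].)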
+ template + auto lookup_in(NestedArray& a) const -> typename std::add_lvalue_reference::type { + return a[fragment][read][alignment]; + } + + // Allow comparison + inline bool operator==(const alignment_index_t& other) { + return fragment == other.fragment && alignment == other.alignment && read == other.read; + }; + inline bool operator!=(const alignment_index_t& other) { + return !(*this == other); + }; +}; +/// Represents an unset index +const alignment_index_t NO_INDEX {std::numeric_limits::max(), std::numeric_limits::max(), std::numeric_limits::max()}; + +pair, vector> MinimizerMapper::map_paired(Alignment& aln1, Alignment& aln2) { + + if (show_work) { + #pragma omp critical (cerr) + dump_debug_query(aln1, aln2); + } + + // Make sure we actually have a working fragment length distribution that the clusterer will accept. + int64_t fragment_distance_limit = fragment_length_distr.mean() + paired_distance_stdevs * fragment_length_distr.std_dev(); + if (fragment_distance_limit < get_distance_limit(aln1.sequence().size())) { + // We can't use this distribution + + if (!warned_about_bad_distribution.test_and_set()) { + // We get to print the warning + cerr << "warning[vg::giraffe]: Cannot cluster reads with a fragment distance smaller than read distance" << endl; + cerr << " Fragment length distribution: mean=" << fragment_length_distr.mean() + << ", stdev=" << fragment_length_distr.std_dev() << endl; + cerr << " Fragment distance limit: " << fragment_distance_limit + << ", read distance limit: " << get_distance_limit(aln1.sequence().size()) << endl; + cerr << "warning[vg::giraffe]: Falling back on single-end mapping" << endl; + } + + // Map single-ended and bail + std::array, 2> mapped_pair = {map(aln1), map(aln2)}; + pair_all(mapped_pair); + return {std::move(mapped_pair[0]), std::move(mapped_pair[1])}; + } + + + // Assume reads are in inward orientations on input, and + // convert to rightward orientations before mapping + // and flip the second read back before output + + aln2.clear_path(); + reverse_complement_alignment_in_place(&aln2, [&](vg::id_t node_id) { + return gbwt_graph.get_length(gbwt_graph.get_handle(node_id)); + }); + + // Lay out the alignments for looping + std::array alns{&aln1, &aln2}; + + // Make two new funnel instrumenters to watch us map this read pair. + std::array funnels; + // Start this alignment + for (auto r : {0, 1}) { + funnels[r].start(alns[r]->name()); + } + + // Annotate the original read with metadata + for (auto r : {0, 1}) { + if (!sample_name.empty()) { + alns[r]->set_sample_name(sample_name); + } + if (!read_group.empty()) { + alns[r]->set_read_group(read_group); + } + } + + // Prepare the RNG for shuffling ties, if needed + LazyRNG rng([&]() { + return aln1.sequence() + aln2.sequence(); + }); + + // Minimizers for both reads, sorted by read position. + std::array, 2> minimizers_in_read_by_read; + // Indexes of minimizers for both reads, sorted into score order, best score first + std::array, 2> minimizer_score_order_by_read; + // Minimizers for both reads, sorted by best score first. + std::array, 2> minimizers_by_read; + for (auto r : {0, 1}) { + minimizers_in_read_by_read[r] = this->find_minimizers(alns[r]->sequence(), funnels[r]); + minimizer_score_order_by_read[r] = sort_minimizers_by_score(minimizers_in_read_by_read[r]); + minimizers_by_read[r] = {minimizers_in_read_by_read[r], minimizer_score_order_by_read[r]}; + } + + // Seeds for both reads, stored in separate vectors. 
+ // These *MUST* be std::vector, because the clusterer's internal data + // structures pass around pointers to std::vector>. + // TODO: Let the clusterer use something else? + std::vector> seeds_by_read(2); + for (auto r : {0, 1}) { + seeds_by_read[r] = this->find_seeds(minimizers_by_read[r], *alns[r], funnels[r]); + } + + // Cluster the seeds. Get sets of input seed indexes that go together. + if (track_provenance) { + for (auto r : {0, 1}) { + funnels[r].stage("cluster"); + } + } + + std::vector> all_clusters = clusterer.cluster_seeds(seeds_by_read, get_distance_limit(aln1.sequence().size()), fragment_distance_limit); +#ifdef debug_validate_clusters + validate_clusters(all_clusters, seeds_by_read, get_distance_limit(aln1.sequence().size()), fragment_distance_limit); + +#endif + + + //Keep track of which fragment clusters (clusters of clusters) have read clusters from each end + + size_t max_fragment_num = 0; + for (auto r : {0, 1}) { + for (auto& cluster : all_clusters[r]) { + max_fragment_num = std::max(max_fragment_num, cluster.fragment); + } + } + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Found " << max_fragment_num + 1 << " fragment clusters" << endl; + } + } + + vector has_first_read (max_fragment_num+1, false);//For each fragment cluster, does it have a cluster for the first read + vector fragment_cluster_has_pair (max_fragment_num+1, false);//Does a fragment cluster have both reads + bool found_paired_cluster = false; + for (auto& cluster : all_clusters[0]) { + has_first_read[cluster.fragment] = true; + } + for (auto& cluster : all_clusters[1]) { + size_t fragment_num = cluster.fragment; + fragment_cluster_has_pair[fragment_num] = has_first_read[fragment_num]; + if (has_first_read[fragment_num]) { + found_paired_cluster = true; + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Fragment cluster " << fragment_num << " has read clusters from both reads" << endl; + } + } + } + } + + if (track_provenance) { + for (auto r : {0, 1}) { + funnels[r].substage("score"); + } + } + + + //Keep track of the best cluster score and coverage per end for each fragment cluster + std::array, 2> cluster_score_by_fragment; + std::array, 2> cluster_coverage_by_fragment; + for (auto r : {0, 1}) { + cluster_score_by_fragment[r].resize(max_fragment_num + 1, 0.0); + cluster_coverage_by_fragment[r].resize(max_fragment_num + 1, 0.0); + } + + // We have accessors for totals across both reads, which we use a lot later. + // TODO: Make a macro? + auto total_score = [&](size_t fragment_num) -> double { + return cluster_score_by_fragment[0][fragment_num] + cluster_score_by_fragment[1][fragment_num]; + }; + auto total_coverage = [&](size_t fragment_num) -> double { + return cluster_coverage_by_fragment[0][fragment_num] + cluster_coverage_by_fragment[1][fragment_num]; + }; + + //Get the scores of each cluster + for (auto r : {0, 1}) { + auto& aln = *alns[r]; + std::vector& clusters = all_clusters[r]; + const VectorView& minimizers = minimizers_by_read[r]; + vector& best_cluster_score = cluster_score_by_fragment[r]; + vector& best_cluster_coverage = cluster_coverage_by_fragment[r]; + + for (size_t i = 0; i < clusters.size(); i++) { + // Determine cluster score and read coverage. 
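+            // Roughly speaking, the score sums the scores of the distinct minimizers
+            // whose seeds landed in the cluster, and the coverage is the fraction of the
+            // read covered by those seeds; both feed into the fragment-cluster ranking below.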
+ Cluster& cluster = clusters[i]; + this->score_cluster(cluster, i, minimizers, seeds_by_read[r], aln.sequence().length(), funnels[r]); + size_t fragment = cluster.fragment; + best_cluster_score[fragment] = std::max(best_cluster_score[fragment], cluster.score); + best_cluster_coverage[fragment] = std::max(best_cluster_coverage[fragment], cluster.coverage); + } + } + + //For each fragment cluster, we want to know how many equivalent or better clusters we found + //We consider two fragment clusters to be "equivalent" if the sum of the best score and + //coverage for both ends is the same + + //Get a vector of the indices of fragment clusters so we can sort + vector fragment_cluster_indices_by_score (max_fragment_num + 1); + for (size_t i = 0 ; i < fragment_cluster_indices_by_score.size() ; i++) { + fragment_cluster_indices_by_score[i] = i; + } + + //Sort by the sum of the score and coverage of the best cluster for each end + sort_shuffling_ties(fragment_cluster_indices_by_score.begin(), fragment_cluster_indices_by_score.end(), [&](size_t a, size_t b) { + return total_coverage(a) + total_score(a) > total_coverage(b) + total_score(b); + }, rng); + + // How many fragment clusters are at least as good as the one at each index + vector better_cluster_count (max_fragment_num+1); + + double prev_score_sum = 0.0; + for (int rank = fragment_cluster_indices_by_score.size() - 1 ; rank >= 0 ; rank--) { + //Go through fragment clusters in descending score order and count how many equivalent or + //better clusters we found + size_t fragment_num = fragment_cluster_indices_by_score[rank]; + if (rank == fragment_cluster_indices_by_score.size()-1) { + better_cluster_count[fragment_num] = rank+1; + } else { + size_t prev_fragment_num = fragment_cluster_indices_by_score[rank+1]; + double curr_score_sum = total_coverage(fragment_num) + total_score(fragment_num); + if (curr_score_sum == prev_score_sum) { + //If this is the same as the last cluster, it has the same count + better_cluster_count[fragment_num] = better_cluster_count[prev_fragment_num]; + } else { + //Otherwise, its count is the index + better_cluster_count[fragment_num] = rank+1; + prev_score_sum = curr_score_sum; + } + } + } +#ifdef debug + for (size_t count : better_cluster_count) { + crash_unless(count >= 1); + } +#endif + + // To compute the windows that are explored, we need to get + // all the minimizers that are explored. + std::array minimizer_explored_by_read; + std::array, 2> minimizer_aligned_count_by_read; + //How many hits of each minimizer ended up in each cluster that was kept? + std::array>, 2> minimizer_kept_cluster_count_by_read; + + // To compute the windows present in any extended cluster, we need to get + // all the minimizers in any extended cluster. + + //For each fragment cluster (cluster of clusters), for each read, a vector of all alignments + the order they were fed into the funnel + //so the funnel can track them + vector, 2>> alignments; + vector, 2>> alignment_indices; + std::array best_alignment_scores {0, 0}; // The best alignment score for each end + + // We will fill this with all computed alignments in estimated score order. 
+ // alignments has one entry for each fragment cluster and an extra for unpaired alignment + alignments.resize(max_fragment_num + 2); + alignment_indices.resize(max_fragment_num + 2); + + //Now that we've scored each of the clusters, extend and align them + for (size_t read_num = 0 ; read_num < 2 ; read_num++) { + Alignment& aln = *alns[read_num]; + std::vector& clusters = all_clusters[read_num]; + const VectorView& minimizers = minimizers_by_read[read_num]; + std::vector& seeds = seeds_by_read[read_num]; + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Found " << clusters.size() << " clusters for read " << (read_num + 1) << endl; + } + } + + // Retain clusters only if their score is better than this, in addition to the coverage cutoff + double cluster_score_cutoff = 0.0, cluster_coverage_cutoff = 0.0, second_best_cluster_score = 0.0; + + //The score and coverage of the best cluster, "best" is determined first by coverage then score + pair best_cluster_coverage_score (0.0, 0.0); + for (auto& cluster : clusters) { + + if (cluster.coverage > best_cluster_coverage_score.first) { + //If this is the new best coverage, update both best coverage and best score + best_cluster_coverage_score.first = cluster.coverage; + best_cluster_coverage_score.second = cluster.score; + } else if (cluster.coverage == best_cluster_coverage_score.first) { + //If this is the same as the best coverage, get best score + best_cluster_coverage_score.second = std::max(best_cluster_coverage_score.second, cluster.score); + } + + cluster_coverage_cutoff = std::max(cluster_coverage_cutoff, cluster.coverage); + + if (cluster.score > cluster_score_cutoff) { + second_best_cluster_score = cluster_score_cutoff; + cluster_score_cutoff = cluster.score; + } else if (cluster.score > second_best_cluster_score) { + second_best_cluster_score = cluster.score; + } + } + cluster_score_cutoff -= cluster_score_threshold; + cluster_coverage_cutoff -= cluster_coverage_threshold; + + if (cluster_score_cutoff - pad_cluster_score_threshold < second_best_cluster_score) { + cluster_score_cutoff = std::min(cluster_score_cutoff, second_best_cluster_score); + } + + if (track_provenance) { + // Now we go from clusters to gapless extensions + funnels[read_num].stage("extend"); + } + + // These are the GaplessExtensions for all the clusters (and fragment cluster assignments), in cluster_indexes_in_order order. 
+ vector, size_t>> cluster_extensions; + cluster_extensions.reserve(clusters.size()); + + minimizer_explored_by_read[read_num] = SmallBitset(minimizers.size()); + minimizer_aligned_count_by_read[read_num].resize(minimizers.size(), 0); + size_t kept_cluster_count = 0; + + //Process clusters sorted by both score and read coverage + process_until_threshold_c(clusters.size(), [&](size_t i) -> double { + return clusters[i].coverage; + }, [&](size_t a, size_t b) -> bool { + //Sort clusters first by whether it was paired, then by the best coverage and score of any pair in the fragment cluster, + //then by its coverage and score + size_t fragment_a = clusters[a].fragment; + size_t fragment_b = clusters[b].fragment; + + double coverage_a = cluster_coverage_by_fragment[0][fragment_a]+cluster_coverage_by_fragment[1][fragment_a]; + double coverage_b = cluster_coverage_by_fragment[0][fragment_b]+cluster_coverage_by_fragment[1][fragment_b]; + double score_a = cluster_score_by_fragment[0][fragment_a]+cluster_score_by_fragment[1][fragment_a]; + double score_b = cluster_score_by_fragment[0][fragment_b]+cluster_score_by_fragment[1][fragment_b]; + + if (fragment_cluster_has_pair[fragment_a] != fragment_cluster_has_pair[fragment_b]) { + return fragment_cluster_has_pair[fragment_a]; + } else if (coverage_a != coverage_b){ + return coverage_a > coverage_b; + } else if (score_a != score_b) { + return score_a > score_b; + } else if (clusters[a].coverage != clusters[b].coverage){ + return clusters[a].coverage > clusters[b].coverage; + } else { + return clusters[a].score > clusters[b].score; + } + }, + 0, min_extensions, max_extensions, rng, [&](size_t cluster_num) -> bool { + // Handle sufficiently good clusters + Cluster& cluster = clusters[cluster_num]; + if (!found_paired_cluster || fragment_cluster_has_pair[cluster.fragment] || + (cluster.coverage == best_cluster_coverage_score.first && + cluster.score == best_cluster_coverage_score.second)) { + //If this cluster has a pair or if we aren't looking at pairs + //Or if it is the best cluster + + // First check against the additional score filter + if (cluster_coverage_threshold != 0 && cluster.coverage < cluster_coverage_cutoff + && kept_cluster_count >= min_extensions) { + //If the coverage isn't good enough, ignore this cluster + if (track_provenance) { + funnels[read_num].fail("cluster-coverage", cluster_num, cluster.coverage); + } + return false; + } + if (cluster_score_threshold != 0 && cluster.score < cluster_score_cutoff + && kept_cluster_count >= min_extensions) { + //If the score isn't good enough, ignore this cluster + if (track_provenance) { + funnels[read_num].pass("cluster-coverage", cluster_num, cluster.coverage); + funnels[read_num].pass("max-extensions", cluster_num); + funnels[read_num].fail("cluster-score", cluster_num, cluster.score); + } + return false; + } + if (track_provenance) { + funnels[read_num].pass("cluster-coverage", cluster_num, cluster.coverage); + funnels[read_num].pass("max-extensions", cluster_num); + funnels[read_num].pass("cluster-score", cluster_num, cluster.score); + funnels[read_num].pass("paired-clusters", cluster_num); + } + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Cluster " << cluster_num << endl; + } + } + + // Extend seed hits in the cluster into one or more gapless extensions + cluster_extensions.emplace_back(std::move(this->extend_cluster( + cluster, + cluster_num, + minimizers, + seeds, + aln.sequence(), + minimizer_kept_cluster_count_by_read[read_num], + funnels[read_num])), 
cluster.fragment); + + kept_cluster_count ++; + + return true; + } else { + //We were looking for clusters in a paired fragment cluster but this one doesn't have any on the other end + if (track_provenance) { + funnels[read_num].pass("cluster-coverage", cluster_num, cluster.coverage); + funnels[read_num].pass("max-extensions", cluster_num); + funnels[read_num].pass("cluster-score", cluster_num, cluster.score); + funnels[read_num].fail("paired-clusters", cluster_num); + } + return false; + } + + }, [&](size_t cluster_num) -> void { + // There are too many sufficiently good clusters + if (track_provenance) { + funnels[read_num].pass("cluster-coverage", cluster_num, clusters[cluster_num].coverage); + funnels[read_num].fail("max-extensions", cluster_num); + } + }, [&](size_t cluster_num) -> void { + // This cluster is not sufficiently good. + // TODO: I don't think it should ever get here unless we limit the scores of the fragment clusters we look at + }); + + // We now estimate the best possible alignment score for each cluster. + std::vector cluster_alignment_score_estimates = this->score_extensions(cluster_extensions, aln, funnels[read_num]); + + if (track_provenance) { + funnels[read_num].stage("align"); + } + + // Now start the alignment step. Everything has to become an alignment. + + // Clear any old refpos annotation and path + aln.clear_refpos(); + aln.clear_path(); + aln.set_score(0); + aln.set_identity(0); + aln.set_mapping_quality(0); + + //Since we will lose the order in which we pass alignments to the funnel, use this to keep track + size_t curr_funnel_index = 0; + + // Go through the processed clusters in estimated-score order. + process_until_threshold_b(cluster_alignment_score_estimates, + extension_set_score_threshold, 2, max_alignments, rng, [&](size_t processed_num) { + // This processed cluster is good enough. + // Called in descending score order. + + if (track_provenance) { + funnels[read_num].pass("extension-set", processed_num, cluster_alignment_score_estimates[processed_num]); + funnels[read_num].pass("max-alignments", processed_num); + funnels[read_num].processing_input(processed_num); + } + + auto& extensions = cluster_extensions[processed_num].first; + + // Collect the top alignments. Make sure we have at least one always, starting with unaligned. + vector best_alignments(1, aln); + + if (GaplessExtender::full_length_extensions(extensions)) { + // We got full-length extensions, so directly convert to an Alignment. + + if (track_provenance) { + funnels[read_num].substage("direct"); + } + + //Fill in the best alignments from the extension. We know the top one is always full length and exists. + this->extension_to_alignment(extensions.front(), best_alignments.front()); + + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Produced alignment directly from full length gapless extension " << processed_num << endl; + } + } + + for (auto next_ext_it = extensions.begin() + 1; next_ext_it != extensions.end() && next_ext_it->full(); ++next_ext_it) { + // For all subsequent full length extensions, make them into alignments too. + // We want them all to go on to the pairing stage so we don't miss a possible pairing in a tandem repeat. 
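+                    // (In a tandem repeat one read can have several equally good placements,
+                    // and only one of them may be consistent with where its mate maps.)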
+ best_alignments.emplace_back(aln); + this->extension_to_alignment(*next_ext_it, best_alignments.back()); + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Produced additional alignment directly from full length gapless extension " << (next_ext_it - extensions.begin()) << endl; + } + } + + } + + if (track_provenance) { + // Stop the current substage + funnels[read_num].substage_stop(); + } + } else if (do_dp) { + // We need to do base-level alignment. + + if (track_provenance) { + funnels[read_num].substage("align"); + } + + // Do the DP and compute up to 2 alignments + best_alignments.emplace_back(aln); + find_optimal_tail_alignments(aln, extensions, rng, best_alignments[0], best_alignments[1]); + + + if (track_provenance) { + // We're done base-level alignment. Next alignment may not go through this substage. + funnels[read_num].substage_stop(); + } + } else { + // We would do base-level alignment but it is disabled. + // Leave best_alignments unaligned + } + + + size_t fragment_num = cluster_extensions[processed_num].second; + + // Have a function to process the best alignments we obtained + auto observe_alignment = [&](Alignment& aln) { + auto& best_score = best_alignment_scores[read_num]; + best_score = max(best_score, aln.score()); + + auto& alignment_list = alignments[fragment_num][read_num]; + alignment_list.emplace_back(std::move(aln)); + + auto& indices_list = alignment_indices[fragment_num][read_num]; + indices_list.emplace_back(curr_funnel_index); + curr_funnel_index++; + + if (track_provenance) { + funnels[read_num].project(processed_num); + funnels[read_num].score(funnels[read_num].latest(), alignment_list.back().score()); + } + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Produced fragment option " << fragment_num << " read " << (read_num + 1) + << " alignment with score " << alignment_list.back().score() << ": " << log_alignment(alignment_list.back()) << endl; + } + } + }; + + for(auto aln_it = best_alignments.begin() ; aln_it != best_alignments.end() && aln_it->score() != 0 && aln_it->score() >= best_alignments[0].score() * 0.8; ++aln_it) { + //For each additional extension with score at least 0.8 of the best score + observe_alignment(*aln_it); + } + + if (track_provenance) { + // We're done with this input item + funnels[read_num].processed_input(); + } + + for (size_t i = 0 ; i < minimizer_kept_cluster_count_by_read[read_num][processed_num].size() ; i++) { + if (minimizer_kept_cluster_count_by_read[read_num][processed_num][i] > 0) { + // This minimizer is in a cluster that gave rise + // to at least one alignment, so it is explored. + minimizer_explored_by_read[read_num].insert(i); + minimizer_aligned_count_by_read[read_num][i] += minimizer_kept_cluster_count_by_read[read_num][processed_num][i]; + } + } + + + return true; + }, [&](size_t processed_num) { + // There are too many sufficiently good processed clusters + if (track_provenance) { + funnels[read_num].pass("extension-set", processed_num, cluster_alignment_score_estimates[processed_num]); + funnels[read_num].fail("max-alignments", processed_num); + } + }, [&](size_t processed_num) { + // This processed cluster is not good enough. 
+ if (track_provenance) { + funnels[read_num].fail("extension-set", processed_num, cluster_alignment_score_estimates[processed_num]); + } + }); + + } + + + //Now that we have alignments, figure out how to pair them up + + if (track_provenance) { + // Now say we are finding the pairs + for (auto r : {0, 1}) { + funnels[r].stage("pairing"); + } + } + + // Fill this in with the indexes of pairs of alignments we will output + // each alignment is stored as into alignments + // fragment_index should be the same for both ends, unless one was rescued + vector> paired_alignments; + paired_alignments.reserve(alignments.size()); + + +#ifdef print_minimizer_table + vector> alignment_was_rescued; +#endif + + //For each alignment in alignments, which paired_alignment includes it. Follows structure of alignments + vector>, 2>> alignment_groups(alignments.size()); + + // Grab all the scores in order for MAPQ computation. + vector paired_scores; + paired_scores.reserve(alignments.size()); + vector fragment_distances; + fragment_distances.reserve(alignments.size()); + + //for each alignment pair, what type of pair is it + enum PairType {paired, unpaired, rescued_from_first, rescued_from_second}; + vector pair_types; + + //For each pair of alignments in paired_alignments, how many equivalent or better fragment clusters + //did we find + vector better_cluster_count_by_pairs; + + + //Keep track of alignments with no pairs in the same fragment cluster + bool found_pair = false; + + //Alignments that don't have a mate + vector unpaired_alignments; + std::array unpaired_count {0, 0}; + + for (size_t fragment_num = 0 ; fragment_num < alignments.size() ; fragment_num ++ ) { + //Get pairs of plausible alignments + for (auto r : {0, 1}) { + alignment_groups[fragment_num][r].resize(alignments[fragment_num][r].size()); + } + + std::array, 2>& fragment_alignments = alignments[fragment_num]; + if (!fragment_alignments[0].empty() && ! fragment_alignments[1].empty()) { + //Only keep pairs of alignments that were in the same fragment cluster + found_pair = true; + std::array aln_index; + std::array alignment; + std::array funnel_index; + + // TODO: replace the nested all-combinations loop with something like a cross product? + for (aln_index[0] = 0 ; aln_index[0] < fragment_alignments[0].size() ; aln_index[0]++) { + alignment[0] = &fragment_alignments[0][aln_index[0]]; + funnel_index[0] = alignment_indices[fragment_num][0][aln_index[0]]; + for (aln_index[1] = 0 ; aln_index[1] < fragment_alignments[1].size() ; aln_index[1]++) { + alignment[1] = &fragment_alignments[1][aln_index[1]]; + funnel_index[1] = alignment_indices[fragment_num][1][aln_index[1]]; + + //Get the likelihood of the fragment distance + int64_t fragment_distance = distance_between(*alignment[0], *alignment[1]); + double score = score_alignment_pair(*alignment[0], *alignment[1], fragment_distance); + + for (auto r : {0, 1}) { + alignment_groups[fragment_num][r][aln_index[r]].emplace_back(paired_alignments.size()); + } + paired_alignments.emplace_back(); + for (auto r : {0, 1}) { + paired_alignments.back()[r] = read_alignment_index_t {fragment_num, aln_index[r]}; + } +#ifdef debug_validate_index_references + for (auto r : {0, 1}) { + // Make sure we refer to things that exist. 
+ paired_alignments.back().at(r).check_for_read_in(r, alignments); + } +#endif + + paired_scores.emplace_back(score); + fragment_distances.emplace_back(fragment_distance); + better_cluster_count_by_pairs.emplace_back(better_cluster_count[fragment_num]); + pair_types.emplace_back(paired); +#ifdef print_minimizer_table + alignment_was_rescued.emplace_back(false, false); +#endif + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Found pair of alignments from fragment " << fragment_num << " with scores " + << alignment[0]->score() << " " << alignment[1]->score() << " at distance " << fragment_distance + << " gets pair score " << score << endl; + cerr << log_name() << "Alignment 1: " << log_alignment(*alignment[0]) << endl << "Alignment 2: " << log_alignment(*alignment[1]) << endl; + } + } + + if (track_provenance) { + for (auto r : {0, 1}) { + funnels[r].processing_input(funnel_index[r]); + funnels[r].substage("pair-clusters"); + funnels[r].pass("max-rescue-attempts", funnel_index[r]); + funnels[r].project(funnel_index[r]); + funnels[r].score(funnels[r].latest(), score); + funnels[r].substage_stop(); + funnels[r].processed_input(); + } + } + } + } + } else { + // At most one of these is set + for (auto r : {0, 1}) { + if (!fragment_alignments[r].empty()) { + //If this fragment cluster has only alignments from this + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Found unpaired alignments from fragment " << fragment_num << " for read " << (r + 1) << endl; + } + } + for (size_t i = 0 ; i < fragment_alignments[r].size() ; i++) { + unpaired_alignments.push_back({fragment_num, i, (bool)r}); + unpaired_count[r]++; + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "\t" << log_alignment(fragment_alignments[r][i]) << endl; + } + } + } + } + } + } + } + std::array rescued_count {0, 0}; + std::array, 2> unpaired_scores; + for (auto r : {0, 1}) { + unpaired_scores[r].reserve(unpaired_alignments.size()); + } + + + if (!unpaired_alignments.empty()) { + //If we found some clusters that had no pair in a fragment cluster + if (!found_pair) { + //If we didn't find any pairs find the best alignment for each end + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Found no pairs and we aren't doing rescue: return best alignment for each read" << endl; + } + } + std::array best_index {NO_INDEX, NO_INDEX}; + std::array best_score {0, 0}; + + for (auto& index : unpaired_alignments ) { + const Alignment& alignment = index.lookup_in(alignments); + unpaired_scores[index.read].emplace_back(alignment.score()); + if (deterministic_beats(alignment.score(), best_score[index.read], rng)) { + best_index[index.read] = index; + best_score[index.read] = alignment.score(); + } + } + if (max_rescue_attempts == 0 ) { + // If we aren't attempting rescue, just return the best alignment from each end. + // By default, use argument alignments for scratch. 
+ std::array best_aln {alns[0], alns[1]}; + + for (auto r : {0, 1}) { + if (best_index[r] != NO_INDEX) { + //If there was a best alignment, use it + best_aln[r] = &best_index[r].lookup_in(alignments); + } else { + //Otherwise return an empty alignment + best_aln[r]->clear_refpos(); + best_aln[r]->clear_path(); + best_aln[r]->set_score(0); + best_aln[r]->set_identity(0); + best_aln[r]->set_mapping_quality(0); // TODO: this gets clobbered + } + set_annotation(*best_aln[r], "unpaired", true); + } + + std::array, 2> paired_mappings; + for (auto r : {0, 1}) { + paired_mappings[r].emplace_back(std::move(*best_aln[r])); + } + // Now we may have moved from our arguments so we can't use them anymore. + + // Flip second read back to input orientation + reverse_complement_alignment_in_place(&paired_mappings[1].back(), [&](vg::id_t node_id) { + return gbwt_graph.get_length(gbwt_graph.get_handle(node_id)); + }); + + for (auto r : {0, 1}) { + paired_mappings[r].back().set_mapping_quality(1); + + // Stop this alignment + funnels[r].stop(); + + // Annotate with whatever's in the funnel + funnels[r].annotate_mapped_alignment(paired_mappings[r].back(), track_correctness); + } + + return {std::move(paired_mappings[0]), std::move(paired_mappings[1])}; + } else if (best_score[0] != 0 && best_score[1] != 0) { + //We are attempting rescue, but we still want to keep the best alignments as a potential (unpaired) pair + + std::array funnel_index; + if (track_provenance) { + // Work out what the paired-up alignments are numbered in the funnel. + // TODO: can we flatten these lookup paths or change tuples + // to structs to be more understandable? + for (auto r : {0, 1}) { + funnel_index[r] = best_index[r].lookup_in(alignment_indices); + funnels[r].processing_input(funnel_index[r]); + } + } + + + std::array winners; + paired_alignments.emplace_back(); + for (auto r : {0, 1}) { + paired_alignments.back()[r] = best_index[r].without_read(); + winners[r] = &best_index[r].lookup_in(alignments); + } +#ifdef debug_validate_index_references + for (auto r : {0, 1}) { + // Make sure we refer to things that exist. 
+ paired_alignments.back().at(r).check_for_read_in(r, alignments); + } +#endif + + //Assume the distance between them is infinite + double pair_score = score_alignment_pair(*winners[0], *winners[1], std::numeric_limits::max()); + paired_scores.emplace_back(pair_score); + fragment_distances.emplace_back(std::numeric_limits::max()); + better_cluster_count_by_pairs.emplace_back(0); + pair_types.emplace_back(unpaired); + + if (track_provenance) { + for (auto r : {0, 1}) { + funnels[r].substage("pair-clusters"); + funnels[r].project(funnel_index[0]); + funnels[r].score(funnels[r].latest(), pair_score); + funnels[r].substage_stop(); + funnels[r].processed_input(); + } + } + } + } + + if (max_rescue_attempts != 0) { + //Attempt rescue on unpaired alignments if either we didn't find any pairs or if the unpaired alignments are very good + + process_until_threshold_a(unpaired_alignments.size(), (std::function) [&](size_t i) -> double{ + return (double) unpaired_alignments.at(i).lookup_in(alignments).score(); + }, 0, 1, max_rescue_attempts, rng, [&](size_t i) { + auto& index = unpaired_alignments.at(i); + size_t j = index.lookup_in(alignment_indices); + if (track_provenance) { + funnels[index.read].processing_input(j); + funnels[index.read].substage("rescue"); + } + Alignment& mapped_aln = index.lookup_in(alignments); + Alignment rescued_aln = *alns[1 - index.read]; + rescued_aln.clear_path(); + + if (found_pair && (double) mapped_aln.score() < (double) best_alignment_scores[index.read] * paired_rescue_score_limit) { + //If we have already found paired clusters and this unpaired alignment is not good enough, do nothing + return true; + } + + //Rescue the alignment + attempt_rescue(mapped_aln, rescued_aln, minimizers_by_read[1 - index.read], index.read == 0); + + if (rescued_aln.path().mapping_size() != 0) { + //If we actually found an alignment + + int64_t fragment_dist = index.read == 0 ? distance_between(mapped_aln, rescued_aln) + : distance_between(rescued_aln, mapped_aln); + + double score = score_alignment_pair(mapped_aln, rescued_aln, fragment_dist); + + set_annotation(mapped_aln, "rescuer", true); + set_annotation(rescued_aln, "rescued", true); + set_annotation(mapped_aln, "fragment_length", distance_to_annotation(fragment_dist)); + set_annotation(rescued_aln, "fragment_length", distance_to_annotation(fragment_dist)); + bool properly_paired = fragment_dist == std::numeric_limits::max() ? false : + (std::abs(fragment_dist-fragment_length_distr.mean()) <= 6.0*fragment_length_distr.std_dev()) ; + set_annotation(mapped_aln, "proper_pair", properly_paired); + set_annotation(rescued_aln, "proper_pair", properly_paired); + + //Since we're still accumulating a list of indexes of pairs of alignments, + //add the new alignment to the list of alignments + //(in a separate "fragment cluster" vector for rescued alignments) and keep track of its index + // + read_alignment_index_t mapped_index = index.without_read(); + read_alignment_index_t rescued_index {alignments.size() - 1, alignments.back()[1 - index.read].size()}; + alignments.back()[1 - index.read].emplace_back(std::move(rescued_aln)); + rescued_count[index.read]++; + + alignment_groups.back()[1 - index.read].emplace_back(); + std::array index_pair; + index_pair[index.read] = mapped_index; + index_pair[1 - index.read] = rescued_index; + + paired_alignments.emplace_back(std::move(index_pair)); +#ifdef debug_validate_index_references + for (auto r : {0, 1}) { + // Make sure we refer to things that exist. 
+ paired_alignments.back().at(r).check_for_read_in(r, alignments); + } +#endif + + paired_scores.emplace_back(score); + fragment_distances.emplace_back(fragment_dist); + pair_types.push_back(index.read == 0 ? rescued_from_first : rescued_from_second); + better_cluster_count_by_pairs.emplace_back(better_cluster_count[mapped_index.fragment]); + +#ifdef print_minimizer_table + alignment_was_rescued.emplace_back(index.read == 1, index.read == 0); +#endif + if (track_provenance) { + funnels[index.read].pass("max-rescue-attempts", j); + funnels[index.read].project(j); + funnels[1 - index.read].introduce(); + for (auto r : {0, 1}) { + funnels[r].score(funnels[r].latest(), score); + } + } + } + if (track_provenance) { + funnels[index.read].processed_input(); + funnels[index.read].substage_stop(); + } + return true; + }, [&](size_t i) { + //This alignment is good enough but we already rescued enough + if (track_provenance) { + auto& index = unpaired_alignments.at(i); + size_t j = index.lookup_in(alignment_indices); + funnels[index.read].fail("max-rescue-attempts", j); + } + return; + }, [&] (size_t i) { + //This alignment is insufficiently good + if (track_provenance) { + //TODO: Fail something here + auto& index = unpaired_alignments.at(i); + size_t j = index.lookup_in(alignment_indices); + } + return; + }); + } + } + + + + if (track_provenance) { + // Now say we are finding the winner(s) + for (auto r : {0, 1}) { + funnels[r].stage("winner"); + } + } + + // Fill this in with the alignments we will output. + std::array, 2> mappings; + // Grab all the scores in order for MAPQ computation. + vector scores; + std::array, 2> scores_group; + vector distances; + vector types; + for (auto r : {0, 1}) { + mappings[r].reserve(paired_alignments.size()); + } + scores.reserve(paired_scores.size()); + distances.reserve(fragment_distances.size()); + types.reserve(pair_types.size()); + + //For each pair of alignments in mappings, how many equivalent or better fragment clusters were there + vector better_cluster_count_by_mappings; + +#ifdef print_minimizer_table + vector> mapping_was_rescued; + vector> pair_indices; +#endif + + process_until_threshold_a(paired_alignments.size(), (std::function) [&](size_t i) -> double { + return paired_scores[i]; + }, 0, 1, max_multimaps, rng, [&](size_t alignment_num) { + // This alignment makes it + // Called in score order + + const std::array& index_pair = paired_alignments[alignment_num]; + + + // Remember the score at its rank + scores.emplace_back(paired_scores[alignment_num]); + distances.emplace_back(fragment_distances[alignment_num]); + types.emplace_back(pair_types[alignment_num]); + better_cluster_count_by_mappings.emplace_back(better_cluster_count_by_pairs[alignment_num]); + // Remember the output alignment + for (auto r : {0, 1}) { + mappings[r].emplace_back(index_pair[r].lookup_for_read_in(r, alignments)); + } + + if (mappings[0].size() == 1 && found_pair) { + //If this is the best pair of alignments that we're going to return and we didn't attempt rescue, + //get the group scores for mapq + + //Get the scores of this pair + for (auto r : {0, 1}) { + scores_group[r].push_back(paired_scores[alignment_num]); + } + + //The indices (into paired_alignments) of pairs with the same first/second read as this + std::array*, 2> alignment_group; + for (auto r : {0, 1}) { + alignment_group[r] = &index_pair[r].lookup_for_read_in(r, alignment_groups); + } + + for (auto r : {0, 1}) { + for (size_t other_alignment_num : *alignment_group[r]) { + if (other_alignment_num != 
alignment_num) { + scores_group[r].push_back(paired_scores[other_alignment_num]); + } + } + } + } + + // Flip second alignment back to input orientation + reverse_complement_alignment_in_place(&mappings[1].back(), [&](vg::id_t node_id) { + return gbwt_graph.get_length(gbwt_graph.get_handle(node_id)); + }); + + if (mappings[0].size() > 1) { + // Mark pair as secondary alignments + for (auto r : {0, 1}) { + mappings[r].back().set_is_secondary(true); + } + } + +#ifdef print_minimizer_table + mapping_was_rescued.emplace_back(alignment_was_rescued[alignment_num]); + pair_indices.push_back(index_pair); +#endif + + if (track_provenance) { + // Tell the funnel + for (auto r : {0, 1}) { + funnels[r].pass("max-multimaps", alignment_num); + funnels[r].project(alignment_num); + funnels[r].score(funnels[r].latest(), scores.back()); + } + } + + return true; + }, [&](size_t alignment_num) { + // We already have enough alignments, although this one has a good score + + // Remember the score at its rank anyway + scores.emplace_back(paired_scores[alignment_num]); + distances.emplace_back(fragment_distances[alignment_num]); + types.emplace_back(pair_types[alignment_num]); + better_cluster_count_by_mappings.emplace_back(better_cluster_count_by_pairs[alignment_num]); + + +#ifdef print_minimizer_table + std::array index_pair = paired_alignments[alignment_num]; + pair_indices.push_back(index_pair); +#endif + if (track_provenance) { + for (auto r : {0, 1}) { + funnels[r].fail("max-multimaps", alignment_num); + } + } + }, [&](size_t alignment_num) { + // This alignment does not have a sufficiently good score + // Score threshold is 0; this should never happen + crash_unless(false); + }); + + if (track_provenance) { + for (auto r : {0, 1}) { + funnels[r].substage("mapq"); + } + } + + + // Compute raw explored caps (with 2.0 scaling, like for single-end) and raw group caps. + // Non-capping caps stay at infinity. + std::array mapq_explored_caps { + std::numeric_limits::infinity(), + std::numeric_limits::infinity() + }; + std::array mapq_score_groups { + std::numeric_limits::infinity(), + std::numeric_limits::infinity() + }; + // We also have one fragment_cluster_cap across both ends. 
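+    // It is set below from the number of equivalent-or-better fragment clusters; as an
+    // illustration, with 2 such clusters the cap becomes prob_to_phred(1 - 1/2), about 3
+    // on the Phred scale, and it shrinks toward 0 as more equivalent clusters are found.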
+ double fragment_cluster_cap = std::numeric_limits::infinity(); + // And one base uncapped MAPQ + double uncapped_mapq = 0; + double new_cluster_cap = numeric_limits::infinity(); + + // Store multiplicities, if we fill them in + vector paired_multiplicities; + + if (mappings[0].empty()) { + //If we didn't get an alignment, return empty alignments + for (auto r : {0, 1}) { + mappings[r].emplace_back(*alns[r]); + } + + // Flip second read back to input orientation + reverse_complement_alignment_in_place(&mappings[1].back(), [&](vg::id_t node_id) { + return gbwt_graph.get_length(gbwt_graph.get_handle(node_id)); + }); + + for (auto r : {0, 1}) { + mappings[r].back().clear_refpos(); + mappings[r].back().clear_path(); + mappings[r].back().set_score(0); + mappings[r].back().set_identity(0); + mappings[r].back().set_mapping_quality(0); + } + +#ifdef print_minimizer_table + mapping_was_rescued.emplace_back(false, false); + pair_indices.emplace_back(NO_READ_INDEX, NO_READ_INDEX); +#endif + + } else { + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "For scores"; + for (auto& score : scores) cerr << " " << score << ":" << endl; + } + } + + //Get the multiplicities for mapq calculation + //We're only using multiplicities if the alignments were rescued + std::array estimated_multiplicity_from; + for (auto r : {0, 1}) { + estimated_multiplicity_from[r] = unpaired_count[r] > 0 ? (double) unpaired_count[r] / min(rescued_count[r], max_rescue_attempts) : 1.0; + } + bool all_rescued = true; + // TODO: This switch used to fall-through on all cases. + for (PairType type : types) { + switch (type){ + case paired: + paired_multiplicities.push_back(1.0); + all_rescued=false; + break; + case unpaired: + paired_multiplicities.push_back(1.0); + break; + case rescued_from_first: + paired_multiplicities.push_back(estimated_multiplicity_from[0]); + break; + case rescued_from_second: + paired_multiplicities.push_back(estimated_multiplicity_from[1]); + break; + } + } + const vector* multiplicities = all_rescued ? &paired_multiplicities : nullptr; + // Compute base MAPQ if not unmapped. Otherwise use 0 instead of the 50% this would give us. + // If all of the alignment pairs were found with rescue, use the multiplicities to determine mapq + // Use exact mapping quality + uncapped_mapq = scores[0] == 0 ? 0 : + get_regular_aligner()->compute_max_mapping_quality(scores, false, multiplicities); + + //Cap mapq at 1 - 1 / # equivalent or better fragment clusters, including self + if (better_cluster_count_by_mappings.front() > 1) { + // TODO: why is this a sensible cap? + fragment_cluster_cap = prob_to_phred(1.0 - (1.0 / (double) better_cluster_count_by_mappings.front())); + // Leave zeros in here and don't round. + } + + for (auto r : {0, 1}) { + // For each fragment + + //If one alignment was duplicated in other pairs, cap the mapq for that alignment at the mapq + //of the group of duplicated alignments. Always compute this even if not quite sensible. + mapq_score_groups[r] = get_regular_aligner()->compute_max_mapping_quality(scores_group[r], false); + + vector explored_minimizers; + for (size_t i = 0; i < minimizers_by_read[r].size(); i++) { + if (minimizer_explored_by_read[r].contains(i)) { + explored_minimizers.push_back(i); + } + } + // Compute exploration cap on MAPQ. TODO: avoid needing to pass as much stuff along. 
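+        // faster_cap() estimates, from the base qualities, the Phred-scaled probability that
+        // sequencing errors could have disrupted every minimizer we explored, which bounds
+        // how confident we can be in a placement found through those minimizers.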
+ double mapq_explored_cap = faster_cap(minimizers_by_read[r], explored_minimizers, alns[r]->sequence(), alns[r]->quality()); + + mapq_explored_caps[r] = mapq_explored_cap; + + // Remember the caps + set_annotation(mappings[r].front(), "mapq_explored_cap", mapq_explored_cap); + set_annotation(mappings[r].front(), "mapq_score_group", mapq_score_groups[r]); + } + + // Have a function to transform interesting cap values to uncapped. + auto preprocess_cap = [](double cap) { + return (cap != -numeric_limits::infinity()) ? cap : numeric_limits::infinity(); + }; + + for (auto r : {0, 1}) { + // For each fragment + + // Compute the overall cap for just this read, now that the individual cap components are filled in for both reads. + double escape_bonus = uncapped_mapq < std::numeric_limits::max() ? 1.0 : 2.0; + double mapq_cap = std::min(fragment_cluster_cap, ((mapq_explored_caps[0] + mapq_explored_caps[1])*escape_bonus) ); + + //TODO: How to cap mapq when the reads were unpaired + if (types.front() == unpaired) { + //If this pair came from two different fragment cluster, then cap mapq at the mapq + //from only unpaired alignments of this read + mapq_cap = std::min(mapq_cap, (double)get_regular_aligner()->compute_max_mapping_quality(unpaired_scores[r], false)); + } + + // Find the MAPQ to cap + double read_mapq = uncapped_mapq; + + // Remember the uncapped MAPQ + set_annotation(mappings[r].front(), "mapq_uncapped", read_mapq); + // And the cap we actually applied (possibly from the pair partner) + set_annotation(mappings[r].front(), "mapq_applied_cap", mapq_cap); + + // Apply the cap, and limit to 0-60 + double capped_mapq = min(mapq_cap, read_mapq); + if (distances.front() == std::numeric_limits::max()) { + //If the two reads are not reachable, lower cap + capped_mapq = capped_mapq / 2.0; + } + read_mapq = max(min(capped_mapq, 120.0) / 2.0, 0.0); + + // Save the MAPQ + mappings[r].front().set_mapping_quality(read_mapq); + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "MAPQ for read " << (r + 1) << " is " << read_mapq << ", was " << uncapped_mapq + << " capped by fragment cluster cap " << fragment_cluster_cap + << ", score group cap " << (mapq_score_groups[r] / 2.0) + << ", combined explored cap " << ((mapq_explored_caps[0] + mapq_explored_caps[1]) / 2.0) << endl; + } + } + } + + //Annotate top pair with its fragment distance, properly-paired-ness, fragment length distrubution, and secondary scores + bool properly_paired = distances.front() == std::numeric_limits::max() ? false : + (std::abs(distances.front()-fragment_length_distr.mean()) <= 6.0*fragment_length_distr.std_dev()) ; + string distribution = "-I " + to_string(fragment_length_distr.mean()) + " -D " + to_string(fragment_length_distr.std_dev()); + for (auto r : {0, 1}) { + set_annotation(mappings[r].front(), "fragment_length", distance_to_annotation(distances.front())); + set_annotation(mappings[r].front(), "proper_pair", properly_paired); + set_annotation(mappings[r].front(),"fragment_length_distribution", distribution); + set_annotation(mappings[r].front(),"secondary_scores", scores); + } + } + + // Make sure pair partners reference each other + pair_all(mappings); + + + + for (auto r : {0, 1}) { + if (track_provenance) { + funnels[r].substage_stop(); + } + // Stop this alignment + funnels[r].stop(); + } + + for (auto r : {0, 1}) { + // Annotate with whatever's in the funnel. 
+ funnels[r].annotate_mapped_alignment(mappings[r].front(), track_correctness); + + if (track_provenance) { + if (track_correctness) { + annotate_with_minimizer_statistics(mappings[r].front(), minimizers_by_read[r], seeds_by_read[r], seeds_by_read[r].size(), 0, funnels[r]); + } + // Annotate with parameters used for the filters. + set_annotation(mappings[r].front(), "param_hit-cap", (double) hit_cap); + set_annotation(mappings[r].front(), "param_hard-hit-cap", (double) hard_hit_cap); + set_annotation(mappings[r].front(), "param_score-fraction", (double) minimizer_score_fraction); + set_annotation(mappings[r].front(), "param_max-extensions", (double) max_extensions); + set_annotation(mappings[r].front(), "param_max-alignments", (double) max_alignments); + set_annotation(mappings[r].front(), "param_cluster-score", (double) cluster_score_threshold); + set_annotation(mappings[r].front(), "param_cluster-coverage", (double) cluster_coverage_threshold); + set_annotation(mappings[r].front(), "param_extension-set", (double) extension_set_score_threshold); + set_annotation(mappings[r].front(), "param_max-multimaps", (double) max_multimaps); + set_annotation(mappings[r].front(), "param_max-rescue-attempts", (double) max_rescue_attempts); + } + } + +#ifdef print_minimizer_table + + if (distances.size() == 0) { + distances.emplace_back(0); + } + for (auto r : {0, 1}) { + cerr << alns[r]->sequence() << "\t"; + for (char c : alns[r]->quality()) { + cerr << (char)(c+33); + } + cerr << "\t" << max_fragment_num << "\t" << mapping_was_rescued[0].first << "\t" << mapping_was_rescued[0].second + << "\t" << distances.front(); + for (size_t i = 0 ; i < minimizers_by_read[r].size() ; i++) { + auto& minimizer = minimizers_by_read[r][i]; + cerr << "\t" + << minimizer.value.key.decode(minimizer.length) << "\t" + << minimizer.forward_offset() << "\t" + << minimizer.agglomeration_start << "\t" + << minimizer.agglomeration_length << "\t" + << minimizer.hits << "\t" + << minimizer_explored_by_read[r].contains(i); + if (minimizer_explored_by_read[r].contains(i)) { + crash_unless(minimizer.hits<=hard_hit_cap) ; + } + } + cerr << "\t" << uncapped_mapq << "\t" << fragment_cluster_cap << "\t" << mapq_score_groups[0] << "\t" + << mapq_explored_caps[0] << "\t" << new_cluster_cap << "\t" << mappings[r].front().mapping_quality() << "\t"; + for (size_t i = 0 ; i < scores.size() ; i++) { + std::array& indices = pair_indices[i]; + std::array paired_alignments; + for (auto r2 : {0, 1}) { + paired_alignments[r2] = &indices[r2].lookup_for_read_in(r2, alignments); + } + + int64_t dist = distances[i]; + crash_unless(dist == distance_between(paired_alignments[0], paired_alignments[1])); + + crash_unless(scores[i] == score_alignment_pair(paired_alignments[0], paired_alignments[1], dist)); + + double multiplicity = paired_multiplicities.size() == scores.size() ? paired_multiplicities[i] : 1.0; + + cerr << paired_alignments[0].score() << "," + << paired_alignments[1].score() << "," + << multiplicity << "," + << scores[i] << ";"; + } + + + if (track_correctness) { + cerr << "\t" << funnels[r].last_correct_stage() << endl; + } else { + cerr << "\t?" << endl; + } + } +#endif + + if (track_provenance && show_work) { + // Dump the funnel info graph. 
+ #pragma omp critical (cerr) + { + for (auto r : {0, 1}) { + funnels[r].to_dot(cerr); + } + } + } + + // Ship out all the aligned alignments + return {std::move(mappings[0]), std::move(mappings[1])}; +} + +//----------------------------------------------------------------------------- + +double MinimizerMapper::faster_cap(const VectorView& minimizers, vector& minimizers_explored, + const string& sequence, const string& quality_bytes) { + + // TODO: Maybe we should to something smarter if we do not have base qualities. + if (quality_bytes.empty()) { + return numeric_limits::infinity(); + } + + // Sort minimizer subset so we go through minimizers in increasing order of + // agglomeration end position, and then increasing order of agglomeration + // start position + std::sort(minimizers_explored.begin(), minimizers_explored.end(), [&](size_t a, size_t b) { + // Return true if a must come before b, and false otherwise + size_t a_end = minimizers[a].agglomeration_start + minimizers[a].agglomeration_length; + size_t b_end = minimizers[b].agglomeration_start + minimizers[b].agglomeration_length; + return a_end < b_end || (a_end == b_end && minimizers[a].agglomeration_start < minimizers[b].agglomeration_start); + }); +#ifdef debug + cerr << "Sorted " << minimizers_explored.size() << " minimizers" << endl; +#endif + +#ifdef debug + cerr << "Explored minimizers:" << endl; + dump_debug_minimizers(minimizers, sequence, &minimizers_explored); +#endif + + for (auto it = minimizers_explored.begin(); it != minimizers_explored.end(); ++it) { + if (minimizers[*it].length == 0) { + #pragma omp critical (cerr) + { + std::cerr << "error[MinimizerMapper::faster_cap]: Minimizer with no sequence found in read with sequence " << sequence << std::endl; + dump_debug_minimizers(minimizers, sequence, &minimizers_explored); + for (size_t i = 0 ; i < minimizers_explored.size() ; i++) { + auto& m = minimizers[minimizers_explored[i]]; + std::cerr << "Mininizer " << minimizers_explored[i] << " agg start " << m.agglomeration_start << " length " << m.agglomeration_length + << " core start " << m.value.offset << " length " << m.length << std::endl; + } + std::cerr << "Read sequence: " << sequence << std::endl; + std::cerr << "Read quality: "; + for (char q : quality_bytes) { + std::cerr << (char) (33 + (int)q); + } + std::cerr << std::endl; + exit(1); + } + } + } + + // Make a DP table holding the log10 probability of having an error disrupt each minimizer. + // Entry i+1 is log prob of mutating minimizers 0, 1, 2, ..., i. + // Make sure to have an extra field at the end to support this. + // Initialize with -inf for unfilled. 
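+    // Each agglomeration overlap interval then relaxes the table: entries c[bottom+1..top]
+    // are raised to c[bottom] plus the log10 probability of a disruption somewhere in that
+    // interval whenever that is larger, keeping the most probable way to break all
+    // minimizers up to each rank.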
+ vector c(minimizers_explored.size() + 1, -numeric_limits::infinity()); + c[0] = 0.0; + + for_each_agglomeration_interval(minimizers, sequence, quality_bytes, minimizers_explored, [&](size_t left, size_t right, size_t bottom, size_t top) { + // For each overlap range in the agglomerations + +#ifdef debug + cerr << "Consider overlap range " << left << " to " << right << " in minimizer ranks " << bottom << " to " << top << endl; + cerr << "log10prob for bottom: " << c[bottom] << endl; +#endif + + // Calculate the probability of a disruption here + double p_here = get_log10_prob_of_disruption_in_interval(minimizers, sequence, quality_bytes, + minimizers_explored.begin() + bottom, minimizers_explored.begin() + top, left, right); + +#ifdef debug + cerr << "log10prob for here: " << p_here << endl; +#endif + + if (isinf(p_here)) { + #pragma omp critical (cerr) + { + std::cerr << "error[MinimizerMapper::faster_cap]: Minimizers seem impossible to disrupt in region " << left << " " << right << " " << bottom << " " << top << std::endl; + dump_debug_minimizers(minimizers, sequence, &minimizers_explored); + for (size_t i = 0 ; i < minimizers_explored.size() ; i++) { + auto& m = minimizers[minimizers_explored[i]]; + std::cerr << "Mininizer " << minimizers_explored[i] << " agg start " << m.agglomeration_start << " length " << m.agglomeration_length + << " core start " << m.value.offset << " length " << m.length << std::endl; + } + std::cerr << "Read sequence: " << sequence << std::endl; + std::cerr << "Read quality: "; + for (char q : quality_bytes) { + std::cerr << (char) (33 + (int)q); + } + std::cerr << std::endl; + } + exit(1); + } + + // Calculate prob of all intervals up to top being disrupted + double p = c[bottom] + p_here; + +#ifdef debug + cerr << "log10prob overall: " << p << endl; +#endif + + for (size_t i = bottom + 1; i < top + 1; i++) { + // Replace min-prob for minimizers in the interval + if (c[i] < p) { +#ifdef debug + cerr << "\tBeats " << c[i] << " at rank " << i-1 << endl; +#endif + c[i] = p; + } else { +#ifdef debug + cerr << "\tBeaten by " << c[i] << " at rank " << i-1 << endl; +#endif + } + } + }); + +#ifdef debug + cerr << "log10prob after all minimizers is " << c.back() << endl; +#endif + + if (isinf(c.back())) { + #pragma omp critical (cerr) + { + std::cerr << "error[MinimizerMapper::faster_cap]: Minimizers seem impossible to disrupt!" << std::endl; + dump_debug_minimizers(minimizers, sequence, &minimizers_explored); + for (size_t i = 0 ; i < minimizers_explored.size() ; i++) { + auto& m = minimizers[minimizers_explored[i]]; + std::cerr << "Mininizer " << minimizers_explored[i] << " agg start " << m.agglomeration_start << " length " << m.agglomeration_length + << " core start " << m.value.offset << " length " << m.length << std::endl; + } + std::cerr << "Read sequence: " << sequence << std::endl; + std::cerr << "Read quality: "; + for (char q : quality_bytes) { + std::cerr << (char) (33 + (int)q); + } + std::cerr << std::endl; + } + exit(1); + } + + // Conver to Phred. 
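+    // c.back() is log10 of the probability that every explored minimizer was disrupted, so
+    // multiplying by -10 gives that probability on the Phred scale (for example, a log10
+    // probability of -3 becomes a cap of 30).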
+ double result = -c.back() * 10; + return result; +} + +void MinimizerMapper::for_each_agglomeration_interval(const VectorView& minimizers, + const string& sequence, const string& quality_bytes, + const vector& minimizer_indices, + const function& iteratee) { + + if (minimizer_indices.empty()) { + // Handle no item case + return; + } + + // Items currently being iterated over + list stack = {&minimizers[minimizer_indices.front()]}; + // The left end of an item interval + size_t left = stack.front()->agglomeration_start; + // The index of the first item in the interval in the sequence of selected items + size_t bottom = 0; + + // Emit all intervals that precede a given point "right" + auto emit_preceding_intervals = [&](size_t right) { + while (left < right) { + // Work out the end position of the top thing on the stack + size_t stack_top_end = stack.front()->agglomeration_start + stack.front()->agglomeration_length; + if (stack_top_end <= right) { + // Case where the left-most item ends before the start of the new item + + if (stack_top_end < left) { + // Something is wrong with the order we are visiting these in. + #pragma omp critical (cerr) + { + std::cerr << "error[MinimizerMapper::faster_cap]: Minimizers not sorted properly for read with sequence " << sequence << "! Agglomeration on stack ends at " << stack_top_end << " but we are at " << left << " from a previous agglomeration" << std::endl; + exit(1); + } + } + + iteratee(left, stack_top_end, bottom, bottom + stack.size()); + + // If the stack contains only one item there is a gap between the item + // and the new item, otherwise just shift to the end of the leftmost item + left = stack.size() == 1 ? right : stack_top_end; + + bottom += 1; + stack.pop_front(); + } else { + // Case where the left-most item ends at or after the beginning of the new new item + iteratee(left, right, bottom, bottom + stack.size()); + left = right; + } + } + }; + + for (auto it = minimizer_indices.begin() + 1; it != minimizer_indices.end(); ++it) { + // For each item in turn + auto& item = minimizers[*it]; + + if (stack.size() == 0) { + // Something is wrong with our stacking algorithm + #pragma omp critical (cerr) + { + std::cerr << "error[MinimizerMapper::faster_cap]: Minimizers not stacked up properly for read with sequence " << sequence << "!" << std::endl; + exit(1); + } + } + + // For each new item we return all intervals that + // precede its start + emit_preceding_intervals(item.agglomeration_start); + + // Add the new item for the next loop + stack.push_back(&item); + } + + // Intervals of the remaining intervals on the stack + emit_preceding_intervals(sequence.size()); +} + +double MinimizerMapper::get_log10_prob_of_disruption_in_interval(const VectorView& minimizers, + const string& sequence, const string& quality_bytes, + const vector::iterator& disrupt_begin, const vector::iterator& disrupt_end, + size_t left, size_t right) { + +#ifdef debug + cerr << "Compute log10 probability in interval " << left << "-" << right << endl; +#endif + + if (left == right) { + // 0-length intervals need no disruption. + return 0; + } + + // We want an OR over all the columns, but some of the probabilities are tiny. + // So instead of NOT(AND(NOT())), which also would assume independence the + // way we calculate AND by multiplication, we just assume independence and + // compute OR as (p1 + p2 - (p1 * p2)). + // Start with the first column. 
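+    // (For example, ORing column disruption probabilities 0.1 and 0.2 gives
+    // 0.1 + 0.2 - 0.1 * 0.2 = 0.28, rather than the 0.3 a plain sum would give.)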
+ double p = get_prob_of_disruption_in_column(minimizers, sequence, quality_bytes, disrupt_begin, disrupt_end, left); +#ifdef debug + cerr << "\tProbability disrupted at column " << left << ": " << p << endl; +#endif + for(size_t i = left + 1 ; i < right; i++) { + // OR up probability of all the other columns + double col_p = get_prob_of_disruption_in_column(minimizers, sequence, quality_bytes, disrupt_begin, disrupt_end, i); +#ifdef debug + cerr << "\tProbability disrupted at column " << i << ": " << col_p << endl; +#endif + p = (p + col_p - (p * col_p)); +#ifdef debug + cerr << "\tRunning OR of disrupted anywhere: " << p << endl; +#endif + } + + // Convert to log10prob. + return log10(p); + +} + +double MinimizerMapper::get_prob_of_disruption_in_column(const VectorView& minimizers, + const string& sequence, const string& quality_bytes, + const vector::iterator& disrupt_begin, const vector::iterator& disrupt_end, + size_t index) { + +#ifdef debug + cerr << "\tCompute probability at column " << index << endl; +#endif + + // Base cost is quality. Make sure to compute a non-integral answer. + double p = phred_to_prob((uint8_t)quality_bytes[index]); +#ifdef debug + cerr << "\t\tBase probability from quality: " << p << endl; +#endif + for (auto it = disrupt_begin; it != disrupt_end; ++it) { + // For each minimizer to disrupt + auto& m = minimizers[*it]; + +#ifdef debug + cerr << "\t\tRelative rank " << (it - disrupt_begin) << " is minimizer " << m.value.key.decode(m.length) << endl; +#endif + + if (!(m.forward_offset() <= index && index < m.forward_offset() + m.length)) { + // Index is out of range of the minimizer itself. We're in the flank. +#ifdef debug + cerr << "\t\t\tColumn " << index << " is in flank." << endl; +#endif + // How many new possible minimizers would an error here create in this agglomeration, + // to compete with its minimizer? + // No more than one per position in a minimizer sequence. + // No more than 1 per base from the start of the agglomeration to here, inclusive. + // No more than 1 per base from here to the last base of the agglomeration, inclusive. + size_t possible_minimizers = min((size_t) m.length, + min(index - m.agglomeration_start + 1, + (m.agglomeration_start + m.agglomeration_length) - index)); + +#ifdef debug + cerr << "\t\t\tBeat hash " << m.value.hash << " at least 1 time in " << possible_minimizers << endl; +#endif + + // Account for at least one of them beating the minimizer. + double any_beat_prob = prob_for_at_least_one(m.value.hash, possible_minimizers); + +#ifdef debug + cerr << "\t\t\t\tGives probability: " << any_beat_prob << endl; +#endif + + p *= any_beat_prob; + + // TODO: handle N somehow??? It can occur outside the minimizer itself, here in the flank. + } +#ifdef debug + cerr << "\t\t\tRunning AND prob: " << p << endl; +#endif + } + + return p; +} + +//----------------------------------------------------------------------------- + +void MinimizerMapper::attempt_rescue(const Alignment& aligned_read, Alignment& rescued_alignment, const VectorView& minimizers, bool rescue_forward ) { + + // Get rid of the old path. + rescued_alignment.clear_path(); + + if (this->rescue_algorithm == rescue_none) { return; } + + // We are traversing the same small subgraph repeatedly, so it's better to use a cache. 
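+ // CachedGBWTGraph wraps the GBWTGraph with a local cache of decoded node records,
+ // so the repeated lookups below (subgraph extraction, seed finding, and alignment)
+ // don't keep re-decoding the same nodes.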
+ gbwtgraph::CachedGBWTGraph cached_graph(this->gbwt_graph); + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Attempt rescue from: " << log_alignment(aligned_read) << endl; + } + } + + // Find all nodes within a reasonable range from aligned_read. + std::unordered_set rescue_nodes; + int64_t min_distance = max(0.0, fragment_length_distr.mean() - rescued_alignment.sequence().size() - rescue_subgraph_stdevs * fragment_length_distr.std_dev()); + int64_t max_distance = fragment_length_distr.mean() + rescue_subgraph_stdevs * fragment_length_distr.std_dev(); + + subgraph_in_distance_range(*distance_index, aligned_read.path(), &cached_graph, min_distance, max_distance, rescue_nodes, rescue_forward); + + if (rescue_nodes.size() == 0) { + //If the rescue subgraph is empty + return; + } + + // Remove node ids that do not exist in the GBWTGraph from the subgraph. + // We may be using the distance index of the original graph, and nodes + // not visited by any thread are missing from the GBWTGraph. + for (auto iter = rescue_nodes.begin(); iter != rescue_nodes.end(); ) { + if (!cached_graph.has_node(*iter)) { + iter = rescue_nodes.erase(iter); + } else { + ++iter; + } + } + + // Find all seeds in the subgraph and try to get a full-length extension. + GaplessExtender::cluster_type seeds = this->seeds_in_subgraph(minimizers, rescue_nodes); + if (seeds.size() > this->rescue_seed_limit) { + return; + } + std::vector extensions = this->extender->extend(seeds, rescued_alignment.sequence(), &cached_graph); + + // If we have a full-length extension, use it as the rescued alignment. + if (GaplessExtender::full_length_extensions(extensions)) { + this->extension_to_alignment(extensions.front(), rescued_alignment); + return; + } + + // Determine the best extension. + size_t best = extensions.size(); + for (size_t i = 0; i < extensions.size(); i++) { + if (best >= extensions.size() || extensions[i].score > extensions[best].score) { + best = i; + } + } + + // Use the best extension as a seed for dozeu. + // Also ensure that the entire extension is in the subgraph. + std::vector dozeu_seed; + if (best < extensions.size()) { + const GaplessExtension& extension = extensions[best]; + for (handle_t handle : extension.path) { + rescue_nodes.insert(cached_graph.get_id(handle)); + } + dozeu_seed.emplace_back(); + dozeu_seed.back().begin = rescued_alignment.sequence().begin() + extension.read_interval.first; + dozeu_seed.back().end = rescued_alignment.sequence().begin() + extension.read_interval.second; + nid_t id = cached_graph.get_id(extension.path.front()); + bool is_reverse = cached_graph.get_is_reverse(extension.path.front()); + gcsa::node_type node = gcsa::Node::encode(id, extension.offset, is_reverse); + dozeu_seed.back().nodes.push_back(node); + } + + // GSSW and dozeu assume that the graph is a DAG. 
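+ // First try to topologically order the rescue subgraph directly. If it is already a
+ // DAG this succeeds and we can align immediately; if it contains cycles the order
+ // comes back empty and we fall back to the strand-splitting and dagification below.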
+ std::vector topological_order = gbwtgraph::topological_order(cached_graph, rescue_nodes); + if (!topological_order.empty()) { + + size_t rescue_subgraph_bases = 0; + for (auto& h : topological_order) { + rescue_subgraph_bases += cached_graph.get_length(h); + } + if (rescue_subgraph_bases * rescued_alignment.sequence().size() > max_dozeu_cells) { + if (!warned_about_rescue_size.test_and_set()) { + cerr << "warning[vg::giraffe]: Refusing to perform too-large rescue alignment of " + << rescued_alignment.sequence().size() << " bp against " + << rescue_subgraph_bases << " bp ordered subgraph for read " << rescued_alignment.name() + << " which would use more than " << max_dozeu_cells + << " cells and might exhaust Dozeu's allocator; suppressing further warnings." << endl; + } + return; + } + + if (rescue_algorithm == rescue_dozeu) { + size_t gap_limit = this->get_regular_aligner()->longest_detectable_gap(rescued_alignment); + get_regular_aligner()->align_xdrop(rescued_alignment, cached_graph, topological_order, + dozeu_seed, false, gap_limit); + this->fix_dozeu_score(rescued_alignment, cached_graph, topological_order); + } else { + get_regular_aligner()->align(rescued_alignment, cached_graph, topological_order); + } + return; + } + + // Build a subgraph overlay. + SubHandleGraph sub_graph(&cached_graph); + for (id_t id : rescue_nodes) { + sub_graph.add_handle(cached_graph.get_handle(id)); + } + + // Create an overlay where each strand is a separate node. + StrandSplitGraph split_graph(&sub_graph); + + // Dagify the subgraph. + bdsg::HashGraph dagified; + std::unordered_map dagify_trans = + handlealgs::dagify(&split_graph, &dagified, rescued_alignment.sequence().size()); + + size_t rescue_subgraph_bases = dagified.get_total_length(); + if (rescue_subgraph_bases * rescued_alignment.sequence().size() > max_dozeu_cells) { + if (!warned_about_rescue_size.test_and_set()) { + cerr << "warning[vg::giraffe]: Refusing to perform too-large rescue alignment of " + << rescued_alignment.sequence().size() << " bp against " + << rescue_subgraph_bases << " bp dagified subgraph for read " << rescued_alignment.name() + << " which would use more than " << max_dozeu_cells + << " cells and might exhaust Dozeu's allocator; suppressing further warnings." << endl; + } + return; + } + + // Align to the subgraph. + // TODO: Map the seed to the dagified subgraph. + if (this->rescue_algorithm == rescue_dozeu) { + size_t gap_limit = this->get_regular_aligner()->longest_detectable_gap(rescued_alignment); + get_regular_aligner()->align_xdrop(rescued_alignment, dagified, std::vector(), false, gap_limit); + this->fix_dozeu_score(rescued_alignment, dagified, std::vector()); + } else if (this->rescue_algorithm == rescue_gssw) { + get_regular_aligner()->align(rescued_alignment, dagified, true); + } + + // Map the alignment back to the original graph. 
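+ // The node IDs in the path still refer to the dagified copy: translate each one back
+ // through dagify_trans to the strand-split overlay, and then through
+ // get_underlying_handle to the original node and orientation.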
+ Path& path = *(rescued_alignment.mutable_path()); + for (size_t i = 0; i < path.mapping_size(); i++) { + Position& pos = *(path.mutable_mapping(i)->mutable_position()); + id_t id = dagify_trans[pos.node_id()]; + handle_t handle = split_graph.get_underlying_handle(split_graph.get_handle(id)); + pos.set_node_id(sub_graph.get_id(handle)); + pos.set_is_reverse(sub_graph.get_is_reverse(handle)); + } + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Rescue result: " << log_alignment(rescued_alignment) << endl; + } + } +} + +GaplessExtender::cluster_type MinimizerMapper::seeds_in_subgraph(const VectorView& minimizers, + const std::unordered_set& subgraph) const { + std::vector sorted_ids(subgraph.begin(), subgraph.end()); + std::sort(sorted_ids.begin(), sorted_ids.end()); + GaplessExtender::cluster_type result; + for (const Minimizer& minimizer : minimizers) { + gbwtgraph::hits_in_subgraph(minimizer.hits, minimizer.occs, sorted_ids, [&](pos_t pos, gbwtgraph::Payload) { + if (minimizer.value.is_reverse) { + size_t node_length = this->gbwt_graph.get_length(this->gbwt_graph.get_handle(id(pos))); + pos = reverse_base_pos(pos, node_length); + } + result.insert(GaplessExtender::to_seed(pos, minimizer.value.offset)); + }); + } + return result; +} + +void MinimizerMapper::fix_dozeu_score(Alignment& rescued_alignment, const HandleGraph& rescue_graph, + const std::vector& topological_order) const { + + const Aligner* aligner = this->get_regular_aligner(); + int32_t score = aligner->score_contiguous_alignment(rescued_alignment); + if (score > 0) { + rescued_alignment.set_score(score); + } else { + rescued_alignment.clear_path(); + if (topological_order.empty()) { + aligner->align(rescued_alignment, rescue_graph, true); + } else { + aligner->align(rescued_alignment, rescue_graph, topological_order); + } + } +} + +//----------------------------------------------------------------------------- + + +int64_t MinimizerMapper::distance_between(const pos_t& pos1, const pos_t& pos2) { + int64_t min_dist= distance_index->minimum_distance(id(pos1), is_rev(pos1), offset(pos1), + id(pos2), is_rev(pos2), offset(pos2), + false, &gbwt_graph); + return min_dist; +} + +int64_t MinimizerMapper::unoriented_distance_between(const pos_t& pos1, const pos_t& pos2) const { + + int64_t min_dist = distance_index->minimum_distance(id(pos1), is_rev(pos1), offset(pos1), + id(pos2), is_rev(pos2), offset(pos2), + true, &gbwt_graph); + return min_dist; +} + +int64_t MinimizerMapper::distance_between(const Alignment& aln1, const Alignment& aln2) { + crash_unless(aln1.path().mapping_size() != 0); + crash_unless(aln2.path().mapping_size() != 0); + + pos_t pos1 = initial_position(aln1.path()); + pos_t pos2 = final_position(aln2.path()); + + return distance_between(pos1, pos2); +} + +void MinimizerMapper::extension_to_alignment(const GaplessExtension& extension, Alignment& alignment) const { + *(alignment.mutable_path()) = extension.to_path(this->gbwt_graph, alignment.sequence()); + alignment.set_score(extension.score); + double identity = 0.0; + if (!alignment.sequence().empty()) { + size_t len = alignment.sequence().length(); + identity = (len - extension.mismatches()) / static_cast(len); + } + alignment.set_identity(identity); +} + +//----------------------------------------------------------------------------- + +std::vector MinimizerMapper::find_minimizers(const std::string& sequence, Funnel& funnel) const { + + if (this->track_provenance) { + // Start the minimizer finding stage + funnel.stage("minimizer"); 
+ }
+
+ std::vector<Minimizer> result;
+ double base_score = 1.0 + std::log(this->hard_hit_cap);
+ // Get minimizers and their window agglomeration starts and lengths
+ // Starts and lengths are all 0 if we are using syncmers.
+ vector> minimizers =
+ this->minimizer_index.minimizer_regions(sequence);
+ for (auto& m : minimizers) {
+ double score = 0.0;
+ auto hits = this->minimizer_index.find(get<0>(m));
+ if (hits.second > 0) {
+ if (hits.second <= this->hard_hit_cap) {
+ score = base_score - std::log(hits.second);
+ } else {
+ score = 1.0;
+ }
+ }
+
+ // Length of the match from this minimizer or syncmer
+ int32_t match_length = (int32_t) minimizer_index.k();
+ // Number of candidate kmers that this minimizer is minimal of
+ int32_t candidate_count = this->minimizer_index.uses_syncmers() ? 1 : (int32_t) minimizer_index.w();
+
+ auto& value = std::get<0>(m);
+ size_t agglomeration_start = std::get<1>(m);
+ size_t agglomeration_length = std::get<2>(m);
+ if (this->minimizer_index.uses_syncmers()) {
+ // The index says the start and length are 0. Really they should be where the k-mer is.
+ // So start where the k-mer is on the forward strand
+ agglomeration_start = value.is_reverse ? (value.offset - (match_length - 1)) : value.offset;
+ // And run for the k-mer length
+ agglomeration_length = match_length;
+ }
+
+ result.push_back({ value, agglomeration_start, agglomeration_length, hits.second, hits.first,
+ match_length, candidate_count, score });
+ }
+
+ if (this->track_provenance) {
+ // Record how many we found, as new lines.
+ funnel.introduce(result.size());
+ }
+
+ return result;
+}
+
+std::vector<size_t> MinimizerMapper::sort_minimizers_by_score(const std::vector<Minimizer>& minimizers) const {
+ // We defined operator< so the minimizers always sort descending by score by default.
+ return sort_permutation(minimizers.begin(), minimizers.end());
+}
+
+std::vector<MinimizerMapper::Seed> MinimizerMapper::find_seeds(const VectorView<Minimizer>& minimizers, const Alignment& aln, Funnel& funnel) const {
+
+ if (this->track_provenance) {
+ // Start the minimizer locating stage
+ funnel.stage("seed");
+ }
+
+ // One of the filters accepts minimizers until selected_score reaches target_score.
+ double base_target_score = 0.0;
+ for (const Minimizer& minimizer : minimizers) {
+ base_target_score += minimizer.score;
+ }
+ double target_score = (base_target_score * this->minimizer_score_fraction) + 0.000001;
+ double selected_score = 0.0;
+
+ // We group all occurrences of the same minimizer in the read together
+ // and either take all of them (if the total number of hits is low enough)
+ // or skip all of them. Such minimizers are expensive to process, because
+ // they tend to have many hits and each hit in the graph is matched with
+ // each occurrence in the read.
+ size_t start = 0, limit = 0;
+ size_t run_hits = 0;
+ bool taking_run = false;
+
+ if (show_work) {
+ #pragma omp critical (cerr)
+ {
+ std::cerr << log_name() << "All minimizers:" << std::endl;
+ dump_debug_minimizers(minimizers, aln.sequence());
+ }
+ }
+
+ // Bit vector over the read length, to check for overlapping minimizers
+ size_t num_minimizers = 0;
+ size_t read_len = aln.sequence().size();
+ size_t num_min_by_read_len = read_len / this->num_bp_per_min;
+ std::vector<bool> read_bit_vector (read_len, false);
+
+ // Select the minimizers we use for seeds.
+ size_t rejected_count = 0;
+ std::vector<Seed> seeds;
+
+ // Define the filters for minimizers.
+ //
+ // Each has a name, a function to check if a minimizer passes, a function
+ // to check the minimizer's statistic, and a pass and a fail function to
+ // update state.
+ //
+ // Filters that keep state and don't want to treat multiple duplicate
+ // minimizers differently should check taking_run to see if something in
+ // the run already passed all filters (and therefore this filter).
+ //
+ // Functions are only allowed to be called after run_hits is set and the
+ // run logic has determined run membership for the minimizer.
+ using filter_t = std::tuple<std::string, std::function<bool(const Minimizer&)>, std::function<double(const Minimizer&)>, std::function<void(const Minimizer&)>, std::function<void(const Minimizer&)>>;
+ std::vector<filter_t> minimizer_filters;
+ minimizer_filters.reserve(5);
+ minimizer_filters.emplace_back(
+ "any-hits",
+ [&](const Minimizer& m) { return m.hits > 0; },
+ [](const Minimizer& m) { return nan(""); },
+ [](const Minimizer& m) {},
+ [](const Minimizer& m) {}
+ );
+ minimizer_filters.emplace_back(
+ "hard-hit-cap",
+ [&](const Minimizer& m) { return run_hits <= this->hard_hit_cap; },
+ [&](const Minimizer& m) { return (double)run_hits; },
+ [](const Minimizer& m) {},
+ [](const Minimizer& m) {}
+ );
+ if (this->exclude_overlapping_min) {
+ minimizer_filters.emplace_back(
+ "exclude-overlapping-min",
+ [&](const Minimizer& m) {
+ // TODO: Is this matching abutting minimizers?
+ // If at both ends the flag isn't set yet, we pass.
+ return !read_bit_vector[m.forward_offset()] &&
+ !read_bit_vector[m.forward_offset() + m.length];
+ },
+ [](const Minimizer& m) { return nan(""); },
+ [&](const Minimizer& m) {
+ for (size_t i = m.forward_offset(); i < m.forward_offset() + m.length; i++) {
+ read_bit_vector[i] = true;
+ }
+ },
+ [](const Minimizer& m) {}
+ );
+ }
+ minimizer_filters.emplace_back(
+ "max-unique-min||num-bp-per-min",
+ [&](const Minimizer& m) {
+ return num_minimizers < std::max(this->max_unique_min, num_min_by_read_len);
+ },
+ [](const Minimizer& m) { return nan(""); },
+ [](const Minimizer& m) {},
+ [](const Minimizer& m) {}
+ );
+ minimizer_filters.emplace_back(
+ "hit-cap||score-fraction",
+ [&](const Minimizer& m) {
+ return (m.hits <= this->hit_cap) || // We pass if we are under the soft hit cap
+ (run_hits <= this->hard_hit_cap && selected_score + m.score <= target_score) || // Or the run as a whole is under the hard hit cap and we need the score
+ (taking_run); // Or we already took one duplicate and we want to finish out the run
+ },
+ [&](const Minimizer& m) {
+ return (selected_score + m.score) / base_target_score;
+ },
+ [&](const Minimizer& m) {
+ // Remember that we took this minimizer for evaluating later ones
+ selected_score += m.score;
+ },
+ [&](const Minimizer& m) {
+ // Stop looking for more minimizers once we fail the score fraction
+ target_score = selected_score;
+ }
+ );
+
+
+ // Flag whether each minimizer in the read was located or not, for MAPQ capping.
+ // We ignore minimizers with no hits (count them as not located), because
+ // they would have to be created in the read no matter where we say it came
+ // from, and because adding more of them should lower the MAPQ cap, whereas
+ // locating more of the minimizers that are present and letting them pass
+ // to the next stage should raise the cap.
+ for (size_t i = 0; i < minimizers.size(); i++) {
+ if (this->track_provenance) {
+ // Say we're working on it
+ funnel.processing_input(i);
+ }
+
+ // Find the next run of identical minimizers.
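+ // For example, if the same minimizer key occurs three times in the read with 10, 20,
+ // and 40 hits, the run covers all three occurrences and run_hits = 70, so the
+ // hit-cap filters see the run's total rather than each occurrence alone.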
+ if (i >= limit) { + // We are starting a new run + start = i; limit = i + 1; + run_hits = minimizers[i].hits; + for (size_t j = i + 1; j < minimizers.size() && minimizers[j].value.key == minimizers[i].value.key; j++) { + limit++; + run_hits += minimizers[j].hits; + } + // We haven't taken the first thing in the run yet. + taking_run = false; + } + + // Select the minimizer if it is informative enough or if the total score + // of the selected minimizers is not high enough. + const Minimizer& minimizer = minimizers[i]; + + // Evaluate the filters + bool passing = true; + for (auto& filter : minimizer_filters) { + auto& filter_name = std::get<0>(filter); + auto& filter_function = std::get<1>(filter); + auto& filter_stat_function = std::get<2>(filter); + auto& filter_pass_function = std::get<3>(filter); + auto& filter_fail_function = std::get<4>(filter); + + passing = filter_function(minimizer); + if (passing) { + // Pass this filter + if (this->track_provenance) { + funnel.pass(filter_name, i, filter_stat_function(minimizer)); + } + filter_pass_function(minimizer); + } else { + // Fail this filter. + if (this->track_provenance) { + funnel.fail(filter_name, i, filter_stat_function(minimizer)); + } + filter_fail_function(minimizer); + // Don't do later filters + break; + } + } + + if (passing) { + // We passed all filters. + // So we are taking this item and ought to take the others in the same run in most cases. + taking_run = true; + // Track number of minimizers selected. + num_minimizers++; + + // We should keep this minimizer instance because it is + // sufficiently rare, or we want it to make target_score, or it is + // the same sequence as a previous minimizer in this run of identical + // minimizers which we also took. + + // Locate the hits. + for (size_t j = 0; j < minimizer.hits; j++) { + pos_t hit = minimizer.occs[j].position.decode(); + // Reverse the hits for a reverse minimizer + if (minimizer.value.is_reverse) { + size_t node_length = this->gbwt_graph.get_length(this->gbwt_graph.get_handle(id(hit))); + hit = reverse_base_pos(hit, node_length); + } + // Extract component id and offset in the root chain, if we have them for this seed. + // TODO: Get all the seed values here + // TODO: Don't use the seed payload anymore + gbwtgraph::Payload chain_info = no_chain_info(); + if (minimizer.occs[j].payload != MIPayload::NO_CODE) { + chain_info = minimizer.occs[j].payload; + } + seeds.push_back(chain_info_to_seed(hit, i, chain_info)); + } + + if (this->track_provenance) { + // Record in the funnel that this minimizer gave rise to these seeds. + funnel.expand(i, minimizer.hits); + } + + } else { + // We failed a filter. + // Track number of minimizers rejected. 
+ rejected_count++; + } + + if (this->track_provenance) { + // Say we're done with this input item + funnel.processed_input(); + } + } + + if (this->track_provenance) { + if (this->track_correctness) { + // Tag seeds with correctness + funnel.substage("correct"); + } else { + // We're just tagging them with read positions + funnel.substage("placed"); + } + this->tag_seeds(aln, seeds.cbegin(), seeds.cend(), minimizers, 0, funnel); + } + + if (show_work) { + #pragma omp critical (cerr) + { + std::cerr << log_name() << "Found " << seeds.size() << " seeds from " + << (minimizers.size() - rejected_count) << " minimizers, rejected " + << rejected_count << std::endl; + } + } + + return seeds; +} + +void MinimizerMapper::tag_seeds(const Alignment& aln, const std::vector::const_iterator& begin, const std::vector::const_iterator& end, const VectorView& minimizers, size_t funnel_offset, Funnel& funnel) const { + if (this->track_correctness && this->path_graph == nullptr) { + cerr << "error[vg::MinimizerMapper] Cannot use track_correctness with no XG index" << endl; + exit(1); + } + + // Track the index of each seed in the funnel + size_t funnel_index = funnel_offset; + for (std::vector::const_iterator it = begin; it != end; ++it) { + + // We know the seed is placed somewhere. + Funnel::State tag = Funnel::State::PLACED; + if (this->track_correctness && aln.refpos_size() != 0) { + // It might also be correct + // Find every seed's reference positions. This maps from path name to pairs of offset and orientation. + auto offsets = algorithms::nearest_offsets_in_paths(this->path_graph, it->pos, 100); + + for (auto& true_pos : aln.refpos()) { + // For every annotated true position + for (auto& hit_pos : offsets[this->path_graph->get_path_handle(true_pos.name())]) { + // Look at all the hit positions on the path the read's true position is on. + if (abs((int64_t)hit_pos.first - (int64_t) true_pos.offset()) < 200) { + // We're close enough to be correct + tag = Funnel::State::CORRECT; + break; + } + } + if (tag == Funnel::State::CORRECT) { + break; + } + } + } + + // Tag this seed as making some of the read space placed or even correct. + funnel.tag(funnel_index, tag, minimizers[it->source].forward_offset(), minimizers[it->source].length); + + // Look at the next seed + funnel_index++; + } +} + +void MinimizerMapper::annotate_with_minimizer_statistics(Alignment& target, const VectorView& minimizers, const std::vector& seeds, size_t old_seed_count, size_t new_seed_offset, const Funnel& funnel) const { + // Annotate with fraction covered by correct (and necessarily located) seed hits. + + // First make the set of minimizers that got correct seeds + std::unordered_set seeded; + for (size_t i = 0; i < seeds.size(); i++) { + // We need to get correctness out of the funnel, since we don't tag the seed or minimizer. + // Correctness is assessed per seed, not per minimizer. + + if (i < old_seed_count) { + // This seed was created at the seed stage. + // We know seed finding was always stage 1. + if (funnel.was_correct(1, "seed", i)) { + seeded.insert(seeds[i].source); + } + } else { + // This seed must have been created at the reseed stage in the + // align_from_chains codepath. We happen to know the magic stager + // number for it. + // Make sure to translate seed number to funnel number in the reseed stage. + // TODO: This is a tightly coupled hack, do we even need this annotation anymore? And does it really need to include the reseeded seeds? 
+ if (funnel.was_correct(3, "reseed", i - old_seed_count + new_seed_offset)) { + seeded.insert(seeds[i].source); + } + } + } + + // Then we make a table of all the ranges covered by correct minimizers + std::vector> bounds; + bounds.reserve(seeded.size()); + for(auto& minimizer_number : seeded) { + // For each minimizer with correct seeds + auto& minimizer = minimizers[minimizer_number]; + // Cover the minimizer k-mer itself. + size_t start = minimizer.forward_offset(); + bounds.emplace_back(start, start + minimizer.length); + } + // Then we count the positions covered + size_t covered_count = algorithms::count_covered(bounds); + // And turn it into a fraction + double covered_fraction = (double) covered_count / target.sequence().size(); + // And add the annotation + set_annotation(target, "correct-minimizer-coverage", covered_fraction); +} + +//----------------------------------------------------------------------------- + +void MinimizerMapper::score_cluster(Cluster& cluster, size_t i, const VectorView& minimizers, const std::vector& seeds, size_t seq_length, Funnel& funnel) const { + + if (this->track_provenance) { + // Say we're making it + funnel.producing_output(i); + } + + // Initialize the values. + cluster.score = 0.0; + cluster.coverage = 0.0; + cluster.present = SmallBitset(minimizers.size()); + + // Determine the minimizers that are present in the cluster. + for (auto hit_index : cluster.seeds) { + cluster.present.insert(seeds[hit_index].source); + } + if (show_work) { + #pragma omp critical (cerr) + dump_debug_clustering(cluster, i, minimizers, seeds); + } + + // Compute the score and cluster coverage. + sdsl::bit_vector covered(seq_length, 0); + for (size_t j = 0; j < minimizers.size(); j++) { + if (cluster.present.contains(j)) { + const Minimizer& minimizer = minimizers[j]; + cluster.score += minimizer.score; + + // The offset of a reverse minimizer is the endpoint of the kmer + size_t start_offset = minimizer.forward_offset(); + size_t k = minimizer.length; + + // Set the k bits starting at start_offset. + covered.set_int(start_offset, sdsl::bits::lo_set[k], k); + } + } + // Count up the covered positions and turn it into a fraction. + cluster.coverage = sdsl::util::cnt_one_bits(covered) / static_cast(seq_length); + + if (this->track_provenance) { + // Record the cluster in the funnel as a group of the size of the number of items. + funnel.merge_group(cluster.seeds.begin(), cluster.seeds.end()); + funnel.score(funnel.latest(), cluster.score); + + // Say we made it. + funnel.produced_output(); + } +} + +//----------------------------------------------------------------------------- + +vector MinimizerMapper::extend_cluster(const Cluster& cluster, + size_t cluster_num, + const VectorView& minimizers, + const std::vector& seeds, + const string& sequence, + vector>& minimizer_kept_cluster_count, + Funnel& funnel) const { + + if (track_provenance) { + // Say we're working on this cluster + funnel.processing_input(cluster_num); + } + + // Count how many of each minimizer is in each cluster that we kept + minimizer_kept_cluster_count.emplace_back(minimizers.size(), 0); + // Pack the seeds for GaplessExtender. + GaplessExtender::cluster_type seed_matchings; + for (auto seed_index : cluster.seeds) { + // Insert the (graph position, read offset) pair. 
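+ // Each GaplessExtender seed pairs the seed's graph position with the read offset of
+ // its source minimizer, so the extender knows which read base anchors where.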
+ auto& seed = seeds[seed_index]; + seed_matchings.insert(GaplessExtender::to_seed(seed.pos, minimizers[seed.source].value.offset)); + minimizer_kept_cluster_count.back()[seed.source]++; + + if (show_work) { + #pragma omp critical (cerr) + { + dump_debug_seeds(minimizers, seeds, cluster.seeds); + } + } + } + + vector cluster_extension = extender->extend(seed_matchings, sequence); + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Extensions:" << endl; + for (auto& e : cluster_extension) { + cerr << log_name() << "\tRead " << e.read_interval.first + << "-" << e.read_interval.second << " with " + << e.mismatch_positions.size() << " mismatches:"; + for (auto& pos : e.mismatch_positions) { + cerr << " " << pos; + } + cerr << endl; + } + } + } + + if (track_provenance) { + // Record with the funnel that the previous group became a group of this size. + // Don't bother recording the seed to extension matching... + funnel.project_group(cluster_num, cluster_extension.size()); + // Say we finished with this cluster, for now. + funnel.processed_input(); + } + + return cluster_extension; +} + +//----------------------------------------------------------------------------- + +int MinimizerMapper::score_extension_group(const Alignment& aln, const vector& extended_seeds, + int gap_open_penalty, int gap_extend_penalty) { + + if (extended_seeds.empty()) { + // TODO: We should never see an empty group of extensions + return 0; + } else if (GaplessExtender::full_length_extensions(extended_seeds)) { + // These are full-length matches. We already have the score. + return extended_seeds.front().score; + } else { + // This is a collection of one or more non-full-length extended seeds. + + if (aln.sequence().size() == 0) { + // No score here + return 0; + } + + // We use a sweep line algorithm to find relevant points along the read: extension starts or ends. + // This records the last base to be covered by the current sweep line. + int64_t sweep_line = 0; + // This records the first base not covered by the last sweep line. + int64_t last_sweep_line = 0; + + // And we track the next unentered gapless extension + size_t unentered = 0; + + // Extensions we are in are in this min-heap of past-end position and gapless extension number. + vector> end_heap; + // The heap uses this comparator + auto min_heap_on_first = [](const pair& a, const pair& b) { + // Return true if a must come later in the heap than b + return a.first > b.first; + }; + + // We track the best score for a chain reaching the position before this one and ending in a gap. + // We never let it go below 0. + // Will be 0 when there's no gap that can be open + int best_gap_score = 0; + + // We track the score for the best chain ending with each gapless extension + vector best_chain_score(extended_seeds.size(), 0); + + // And we're after the best score overall that we can reach when an extension ends + int best_past_ending_score_ever = 0; + + // Overlaps are more complicated. + // We need a heap of all the extensions for which we have seen the + // start and that we can thus overlap. + // We filter things at the top of the heap if their past-end positions + // have occurred. + // So we store pairs of score we get backtracking to the current + // position, and past-end position for the thing we are backtracking + // from. 
+ vector> overlap_heap; + // We can just use the standard max-heap comparator + + // We encode the score relative to a counter that we increase by the + // gap extend every base we go through, so we don't need to update and + // re-sort the heap. + int overlap_score_offset = 0; + + while(last_sweep_line <= aln.sequence().size()) { + // We are processed through the position before last_sweep_line. + + // Find a place for sweep_line to go + + // Find the next seed start + int64_t next_seed_start = numeric_limits::max(); + if (unentered < extended_seeds.size()) { + next_seed_start = extended_seeds[unentered].read_interval.first; + } + + // Find the next seed end + int64_t next_seed_end = numeric_limits::max(); + if (!end_heap.empty()) { + next_seed_end = end_heap.front().first; + } + + // Whichever is closer between those points and the end, do that. + sweep_line = min(min(next_seed_end, next_seed_start), (int64_t) aln.sequence().size()); + + // So now we're only interested in things that happen at sweep_line. + + // Compute the distance from the previous sweep line position + // Make sure to account for last_sweep_line's semantics as the next unswept base. + int sweep_distance = sweep_line - last_sweep_line + 1; + + // We need to track the score of the best thing that past-ended here + int best_past_ending_score_here = 0; + + while(!end_heap.empty() && end_heap.front().first == sweep_line) { + // Find anything that past-ends here + size_t past_ending = end_heap.front().second; + + // Mix it into the score + best_past_ending_score_here = std::max(best_past_ending_score_here, best_chain_score[past_ending]); + + // Remove it from the end-tracking heap + std::pop_heap(end_heap.begin(), end_heap.end(), min_heap_on_first); + end_heap.pop_back(); + } + + + // Mix that into the best score overall + best_past_ending_score_ever = std::max(best_past_ending_score_ever, best_past_ending_score_here); + + if (sweep_line == aln.sequence().size()) { + // We don't need to think about gaps or backtracking anymore since everything has ended + break; + } + + // Update the overlap score offset by removing some gap extends from it. + overlap_score_offset += sweep_distance * gap_extend_penalty; + + // The best way to backtrack to here is whatever is on top of the heap, if anything, that doesn't past-end here. + int best_overlap_score = 0; + while (!overlap_heap.empty()) { + // While there is stuff on the heap + if (overlap_heap.front().second <= sweep_line) { + // We are already past this thing, so drop it + std::pop_heap(overlap_heap.begin(), overlap_heap.end()); + overlap_heap.pop_back(); + } else { + // This is at the top of the heap and we aren't past it + // Decode and use its score offset if we only backtrack to here. + best_overlap_score = overlap_heap.front().first + overlap_score_offset; + // Stop looking in the heap + break; + } + } + + // The best way to end 1 before here in a gap is either: + + if (best_gap_score != 0) { + // Best way to end 1 before our last sweep line position with a gap, plus distance times gap extend penalty + best_gap_score -= sweep_distance * gap_extend_penalty; + } + + // Best way to end 1 before here with an actual extension, plus the gap open part of the gap open penalty. 
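+ // (Illustrative numbers: with gap_open = 6 and gap_extend = 1, an extension scoring 50
+ // that ends 3 bp before the next one starts chains through the gap for 50 - 6 - 2*1 = 42.
+ // The gap_open - gap_extend part is charged once just below when the gap is opened, and
+ // the remaining per-base extends are charged by the sweep_distance updates above.)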
+ // (Will never be taken over an actual adjacency) + best_gap_score = std::max(0, std::max(best_gap_score, best_past_ending_score_here - (gap_open_penalty - gap_extend_penalty))); + + while (unentered < extended_seeds.size() && extended_seeds[unentered].read_interval.first == sweep_line) { + // For each thing that starts here + + // Compute its chain score + best_chain_score[unentered] = std::max(best_overlap_score, + std::max(best_gap_score, best_past_ending_score_here)) + extended_seeds[unentered].score; + + // Compute its backtrack-to-here score and add it to the backtracking heap + // We want how far we would have had to have backtracked to be + // able to preceed the base we are at now, where this thing + // starts. + size_t extension_length = extended_seeds[unentered].read_interval.second - extended_seeds[unentered].read_interval.first; + int raw_overlap_score = best_chain_score[unentered] - gap_open_penalty - gap_extend_penalty * extension_length; + int encoded_overlap_score = raw_overlap_score - overlap_score_offset; + + // Stick it in the heap + overlap_heap.emplace_back(encoded_overlap_score, extended_seeds[unentered].read_interval.second); + std::push_heap(overlap_heap.begin(), overlap_heap.end()); + + // Add it to the end finding heap + end_heap.emplace_back(extended_seeds[unentered].read_interval.second, unentered); + std::push_heap(end_heap.begin(), end_heap.end(), min_heap_on_first); + + // Advance and check the next thing to start + unentered++; + } + + // Move last_sweep_line to sweep_line. + // We need to add 1 since last_sweep_line is the next *un*included base + last_sweep_line = sweep_line + 1; + } + + + // When we get here, we've seen the end of every extension and so we + // have the best score at the end of any of them. + return best_past_ending_score_ever; + } + + +} + +// TODO: Combine the two score_extensions overloads into one template when we get constexpr if. + +std::vector MinimizerMapper::score_extensions(const std::vector>& extensions, const Alignment& aln, Funnel& funnel) const { + + // Extension scoring substage. + if (this->track_provenance) { + funnel.substage("score"); + } + + // We now estimate the best possible alignment score for each cluster. + std::vector result(extensions.size(), 0); + for (size_t i = 0; i < extensions.size(); i++) { + + if (this->track_provenance) { + funnel.producing_output(i); + } + + result[i] = score_extension_group(aln, extensions[i], get_regular_aligner()->gap_open, get_regular_aligner()->gap_extension); + + // Record the score with the funnel. + if (this->track_provenance) { + funnel.score(i, result[i]); + funnel.produced_output(); + } + } + + return result; +} + +std::vector MinimizerMapper::score_extensions(const std::vector, size_t>>& extensions, const Alignment& aln, Funnel& funnel) const { + + // Extension scoring substage. + if (this->track_provenance) { + funnel.substage("score"); + } + + // We now estimate the best possible alignment score for each cluster. + std::vector result(extensions.size(), 0); + for (size_t i = 0; i < extensions.size(); i++) { + + if (this->track_provenance) { + funnel.producing_output(i); + } + + result[i] = score_extension_group(aln, extensions[i].first, get_regular_aligner()->gap_open, get_regular_aligner()->gap_extension); + + // Record the score with the funnel. 
+ if (this->track_provenance) { + funnel.score(i, result[i]); + funnel.produced_output(); + } + } + + return result; +} + +//----------------------------------------------------------------------------- + +// (value, cost) +typedef std::pair pareto_point; + +static void find_pareto_frontier(std::vector& v) { + if(v.empty()) { + return; + } + std::sort(v.begin(), v.end(), [](pareto_point a, pareto_point b) { + return (a.second < b.second || (a.second == b.second && a.first > b.first)); + }); + size_t tail = 1; + for (size_t i = 1; i < v.size(); i++) { + if (v[i].first <= v[tail - 1].first) { + continue; + } + v[tail] = v[i]; + tail++; + } + v.resize(tail); + std::sort(v.begin(), v.end()); +} + +// Positive gap penalty if there is a gap. +static int32_t gap_penalty(size_t length, const Aligner* aligner) { + return (length == 0 ? 0 : aligner->gap_open + (length - 1) * aligner->gap_extension); +} + +// Positive penalty for a number of mismatches. +static int32_t mismatch_penalty(size_t n, const Aligner* aligner) { + return n * (aligner->match + aligner->mismatch); +} + +// Positive gap penalty, assuming that there is always a gap. +static int32_t gap_penalty(size_t start, size_t limit, const Aligner* aligner) { + return (start >= limit ? aligner->gap_open : aligner->gap_open + (limit - start - 1) * aligner->gap_extension); +} + +// Positive flank penalty based on taking a gap to the end or to the Pareto frontier. +static int32_t flank_penalty(size_t length, const std::vector& frontier, const Aligner* aligner) { + int32_t result = gap_penalty(length, aligner); + for (size_t i = 0; i < frontier.size(); i++) { + int32_t candidate = frontier[i].second + gap_penalty(frontier[i].first, length, aligner); + result = std::min(result, candidate); + if (frontier[i].first >= length) { + break; + } + } + return result; +} + +/// A helper function that cna merge softclips in properly when joining up +/// paths, but doesn't need expensive full passes over the paths later. +static inline void add_to_path(Path* target, Path* to_append) { + for (auto& mapping : *to_append->mutable_mapping()) { + // For each mapping to append + if (target->mapping_size() > 0) { + // There is an existing mapping on the path. + // Find that previous mapping. + auto* prev_mapping = target->mutable_mapping(target->mapping_size() - 1); + + if (mapping.position().node_id() == prev_mapping->position().node_id() && + (mapping.position().offset() != 0 || mapping_is_total_insertion(*prev_mapping) || mapping_is_total_insertion(mapping))) { + // Previous mapping is to the same node, and either the new + // mapping doesn't start at 0, or one mapping takes up no + // space on the node (i.e. is a pure insert). + // + // So we want to combine the mappings. + for (auto& edit : *mapping.mutable_edit()) { + // Move over all the edits in this mapping onto the end of that one. + *prev_mapping->add_edit() = std::move(edit); + } + + continue; + } + } + // If we don't combine the mappings, we need to just move the whole mapping + *target->add_mapping() = std::move(mapping); + } +}; + +void MinimizerMapper::find_optimal_tail_alignments(const Alignment& aln, const vector& extended_seeds, LazyRNG& rng, Alignment& best, Alignment& second_best) const { + + // This assumes that full-length extensions have the highest scores. + // We want to align at least two extensions and at least one + // partial extension. 
However, we do not want to align more than one + // partial extension, unless the score is very close to the best + // extension or the extension looks very promising. + size_t min_tails = 1; + for (const GaplessExtension& extension : extended_seeds) { + if (extension.full()) { + min_tails++; + } + } + if (min_tails < 2) { + min_tails = 2; + } + + /* + (length, penalty) pairs sorted by length. Pareto frontiers for the + number of bp we can align at each end and the corresponding alignment + score penalty. We use three types of points: + 1. A gap from the start/end of the read to the start/end of the + extension followed by the entire extension. + 2. A gap from the start/end of the read to the start/end of the + extension followed by the extension until the first mismatch. + 3. An all-windows-length - 1 bp exact match at the start/end of the read. + */ + const Aligner* aligner = this->get_regular_aligner(); + std::vector left_frontier, right_frontier; + { + size_t seq_len = aln.sequence().length(); + for (const GaplessExtension& extension : extended_seeds) { + if (extension.full()) { + continue; + } + int32_t left_penalty = gap_penalty(extension.read_interval.first, aligner); + int32_t mid_penalty = mismatch_penalty(extension.mismatches(), aligner); + int32_t right_penalty = gap_penalty(seq_len - extension.read_interval.second, aligner); + left_frontier.push_back(pareto_point(extension.read_interval.second, mid_penalty + left_penalty)); + right_frontier.push_back(pareto_point(seq_len - extension.read_interval.first, mid_penalty + right_penalty)); + if (extension.mismatches() > 0) { + left_frontier.push_back(pareto_point(extension.mismatch_positions.front(), left_penalty)); + right_frontier.push_back(pareto_point(seq_len - extension.mismatch_positions.back() - 1, right_penalty)); + } + } + size_t window_length = this->minimizer_index.uses_syncmers() ? this->minimizer_index.k() : (this->minimizer_index.k() + this->minimizer_index.w() - 1); + left_frontier.push_back(pareto_point(window_length - 1, 0)); + right_frontier.push_back(pareto_point(window_length - 1, 0)); + } + find_pareto_frontier(left_frontier); + find_pareto_frontier(right_frontier); + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Trying to find " << min_tails << " tail alignments for " + << extended_seeds.size() << " extended seeds" << endl; + } + } + + // We will keep the winning alignment here, in pieces + Path winning_left; + Path winning_middle; + Path winning_right; + int32_t winning_score = 0; + + Path second_left; + Path second_middle; + Path second_right; + int32_t second_score = 0; + + // Handle each extension in the set + bool partial_extension_aligned = false; + int32_t threshold = -1; + process_until_threshold_a(extended_seeds.size(), + [&](size_t extended_seed_num) -> double { + return static_cast(extended_seeds[extended_seed_num].score); + }, extension_score_threshold, min_tails, max_local_extensions, rng, [&](size_t extended_seed_num) -> bool { + + // This extended seed looks good enough. + const GaplessExtension& extension = extended_seeds[extended_seed_num]; + + // Extensions with score at most this will not be aligned, + // unless we do not have enough alignments. + if (threshold < 0) { + threshold = extension.score - extension_score_threshold; + } + + // Identify the special case: We already have aligned a partial + // extension and the current score is too far below the best + // extension. We do not want to align further partial extensions, + // unless they look very promising. 
+ // The estimate is based on taking a gap to read end or to another + // extension on the Pareto frontier, for both ends. + if (!extension.full()) { + if (partial_extension_aligned && extension.score <= threshold) { + int32_t score_estimate = aln.sequence().length() * aligner->match + 2 * aligner->full_length_bonus - + mismatch_penalty(extension.mismatches(), aligner); + if (!extension.left_full) { + score_estimate -= flank_penalty(extension.read_interval.first, left_frontier, aligner); + } + if (!extension.right_full) { + score_estimate -= flank_penalty(aln.sequence().length() - extension.read_interval.second, + right_frontier, aligner); + } + if (score_estimate <= winning_score) { + return true; + } + } + partial_extension_aligned = true; + } + + // TODO: We don't track this filter with the funnel because it + // operates within a single "item" (i.e. cluster/extension set). + // We track provenance at the item level, so throwing out wrong + // local alignments in a correct cluster would look like throwing + // out correct things. + // TODO: Revise how we track correctness and provenance to follow + // sub-cluster things. + + // We start with the path in extension_paths[extended_seed_num], + // scored in extension_path_scores[extended_seed_num] + + // We also have a left tail path and score + pair left_tail_result {{}, 0}; + // And a right tail path and score + pair right_tail_result {{}, 0}; + + if (!extension.left_full) { + // There is a left tail + + // Have scratch for the longest detectable gap + size_t longest_detectable_gap; + + // Get the forest of all left tail placements + auto forest = get_tail_forest(extension, aln.sequence().size(), true, &longest_detectable_gap); + + // Grab the part of the read sequence that comes before the extension + string before_sequence = aln.sequence().substr(0, extension.read_interval.first); + + // Do right-pinned alignment + left_tail_result = std::move(get_best_alignment_against_any_tree(forest, before_sequence, + extension.starting_position(gbwt_graph), false, longest_detectable_gap, rng)); + } + + if (!extension.right_full) { + // There is a right tail + + // Have scratch for the longest detectable gap + size_t longest_detectable_gap; + + // Get the forest of all right tail placements + auto forest = get_tail_forest(extension, aln.sequence().size(), false, &longest_detectable_gap); + + // Find the sequence + string trailing_sequence = aln.sequence().substr(extension.read_interval.second); + + // Do left-pinned alignment + right_tail_result = std::move(get_best_alignment_against_any_tree(forest, trailing_sequence, + extension.tail_position(gbwt_graph), true, longest_detectable_gap, rng)); + } + + // Compute total score + int32_t total_score = extension.score + left_tail_result.second + right_tail_result.second; + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Extended seed " << extended_seed_num << " has left tail of " + << extension.read_interval.first << "bp and right tail of " + << (aln.sequence().size() - extension.read_interval.second) + << "bp for total score " << total_score << endl; + } + } + + // Get the node ids of the beginning and end of each alignment + id_t winning_start = winning_score == 0 ? 0 : (winning_left.mapping_size() == 0 + ? winning_middle.mapping(0).position().node_id() + : winning_left.mapping(0).position().node_id()); + id_t current_start = left_tail_result.first.mapping_size() == 0 + ? 
gbwt_graph.get_id(extension.path.front()) + : left_tail_result.first.mapping(0).position().node_id(); + id_t winning_end = winning_score == 0 ? 0 : (winning_right.mapping_size() == 0 + ? winning_middle.mapping(winning_middle.mapping_size() - 1).position().node_id() + : winning_right.mapping(winning_right.mapping_size()-1).position().node_id()); + id_t current_end = right_tail_result.first.mapping_size() == 0 + ? gbwt_graph.get_id(extension.path.back()) + : right_tail_result.first.mapping(right_tail_result.first.mapping_size()-1).position().node_id(); + + // Is this left tail different from the currently winning left tail? + bool different_left = winning_start != current_start; + bool different_right = winning_end != current_end; + + if (total_score > winning_score || winning_score == 0) { + // This is the new best alignment seen so far. + + if (winning_score != 0 && different_left && different_right) { + //The previous best scoring alignment replaces the second best + second_score = winning_score; + second_left = std::move(winning_left); + second_middle = std::move(winning_middle); + second_right = std::move(winning_right); + } + + // Save the score + winning_score = total_score; + // And the path parts + winning_left = std::move(left_tail_result.first); + winning_middle = extension.to_path(gbwt_graph, aln.sequence()); + winning_right = std::move(right_tail_result.first); + + } else if ((total_score > second_score || second_score == 0) && different_left && different_right) { + // This is the new second best alignment seen so far and it is + // different from the best alignment. + + // Save the score + second_score = total_score; + // And the path parts + second_left = std::move(left_tail_result.first); + second_middle = extension.to_path(gbwt_graph, aln.sequence()); + second_right = std::move(right_tail_result.first); + } + + return true; + }, [&](size_t extended_seed_num) -> void { + // This extended seed is good enough by its own score, but we have too many. + // Do nothing + }, [&](size_t extended_seed_num) -> void { + // This extended seed isn't good enough by its own score. + // Do nothing + }); + + // Now we know the winning path and score. Move them over to out + best.set_score(winning_score); + second_best.set_score(second_score); + + // Concatenate the paths. + + // We know there must be at least an edit boundary + // between each part, because the maximal extension doesn't end in a + // mismatch or indel and eats all matches. + // We also don't need to worry about jumps that skip intervening sequence. + *best.mutable_path() = std::move(winning_left); + add_to_path(best.mutable_path(), &winning_middle); + add_to_path(best.mutable_path(), &winning_right); + // Compute the identity from the path. 
+ best.set_identity(identity(best.path())); + + //Do the same for the second best + *second_best.mutable_path() = std::move(second_left); + add_to_path(second_best.mutable_path(), &second_middle); + add_to_path(second_best.mutable_path(), &second_right); + second_best.set_identity(identity(second_best.path())); +} + +//----------------------------------------------------------------------------- + +pair MinimizerMapper::get_best_alignment_against_any_tree(const vector& trees, + const string& sequence, const Position& default_position, bool pin_left, size_t longest_detectable_gap, LazyRNG& rng) const { + + // We want the best alignment, to the base graph, done against any target path + Path best_path; + // And its score + int32_t best_score = 0; + + if (!sequence.empty()) { + // We start out with the best alignment being a pure softclip. + // If we don't have any trees, or all trees are empty, or there's nothing beter, this is what we return. + Mapping* m = best_path.add_mapping(); + Edit* e = m->add_edit(); + e->set_from_length(0); + e->set_to_length(sequence.size()); + e->set_sequence(sequence); + // Since the softclip consumes no graph, we place it on the node we are going to. + *m->mutable_position() = default_position; + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "First best alignment: " << log_alignment(best_path) << " score " << best_score << endl; + } + } + } + + // We can align it once per target tree + for (auto& subgraph : trees) { + // For each tree we can map against, map pinning the correct edge of the sequence to the root. + + if (subgraph.get_node_count() != 0) { + // This path has bases in it and could potentially be better than + // the default full-length softclip + + // Do alignment to the subgraph with GSSWAligner. + Alignment current_alignment; + // If pinning right, we need to reverse the sequence, since we are + // always pinning left to the left edge of the tree subgraph. + current_alignment.set_sequence(pin_left ? sequence : reverse_complement(sequence)); + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Align " << log_alignment(current_alignment) << " pinned left" << endl; + } + } + +#ifdef debug_dump_graph + cerr << "Vs graph:" << endl; + subgraph.for_each_handle([&](const handle_t& here) { + cerr << subgraph.get_id(here) << " (" << subgraph.get_sequence(here) << "): " << endl; + subgraph.follow_edges(here, true, [&](const handle_t& there) { + cerr << "\t" << subgraph.get_id(there) << " (" << subgraph.get_sequence(there) << ") ->" << endl; + }); + subgraph.follow_edges(here, false, [&](const handle_t& there) { + cerr << "\t-> " << subgraph.get_id(there) << " (" << subgraph.get_sequence(there) << ")" << endl; + }); + }); +#endif + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Limit gap length to " << longest_detectable_gap << " bp" << endl; + } + } + + size_t tail_subgraph_bases = subgraph.get_total_length(); + if (tail_subgraph_bases * sequence.size() > max_dozeu_cells) { + if (!warned_about_tail_size.test_and_set()) { + cerr << "warning[vg::giraffe]: Refusing to perform too-large tail alignment of " + << sequence.size() << " bp against " + << tail_subgraph_bases << " bp tree which would use more than " << max_dozeu_cells + << " cells and might exhaust Dozeu's allocator; suppressing further warnings." << endl; + } + } else { + // X-drop align, accounting for full length bonus. + // We *always* do left-pinned alignment internally, since that's the shape of trees we get. 
+ // Make sure to pass through the gap length limit so we don't just get the default. + get_regular_aligner()->align_pinned(current_alignment, subgraph, true, true, longest_detectable_gap); + } + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "\tScore: " << current_alignment.score() << endl; + } + } + + if (current_alignment.path().mapping_size() > 0 && deterministic_beats(current_alignment.score(), best_score, rng)) { + // This is a new best alignment, and it is nonempty. + best_path = current_alignment.path(); + + if (!pin_left) { + // Un-reverse it if we were pinning right + best_path = reverse_complement_path(best_path, [&](id_t node) { + return subgraph.get_length(subgraph.get_handle(node, false)); + }); + } + + // Translate from subgraph into base graph and keep it. + best_path = subgraph.translate_down(best_path); + best_score = current_alignment.score(); + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "New best alignment is " + << log_alignment(best_path) << " score " << best_score << endl; + } + } + } + } + } + + return make_pair(best_path, best_score); +} + +vector MinimizerMapper::get_tail_forest(const GaplessExtension& extended_seed, + size_t read_length, bool left_tails, size_t* longest_detectable_gap) const { + + // We will fill this in with all the trees we return + vector to_return; + + // Now for this extension, walk the GBWT in the appropriate direction + +#ifdef debug + cerr << "Look for " << (left_tails ? "left" : "right") << " tails from extension" << endl; +#endif + + // TODO: Come up with a better way to do this with more accessors on the extension and less get_handle + // Get the Position reading out of the extension on the appropriate tail + Position from; + // And the length of that tail + size_t tail_length; + // And the GBWT search state we want to start with + const gbwt::SearchState* base_state = nullptr; + if (left_tails) { + // Look right from start + from = extended_seed.starting_position(gbwt_graph); + // And then flip to look the other way at the prev base + from = reverse(from, gbwt_graph.get_length(gbwt_graph.get_handle(from.node_id(), false))); + + // Use the search state going backward + base_state = &extended_seed.state.backward; + + tail_length = extended_seed.read_interval.first; + } else { + // Look right from end + from = extended_seed.tail_position(gbwt_graph); + + // Use the search state going forward + base_state = &extended_seed.state.forward; + + tail_length = read_length - extended_seed.read_interval.second; + } + + if (tail_length == 0) { + // Don't go looking for places to put no tail. + return to_return; + } + + // This is one tree that we are filling in + vector> tree; + + // This is a stack of indexes at which we put parents in the tree + list parent_stack; + + // Get the handle we are starting from + // TODO: is it cheaper to get this out of base_state? + handle_t start_handle = gbwt_graph.get_handle(from.node_id(), from.is_reverse()); + + // Decide if the start node will end up included in the tree, or if we cut it all off with the offset. 
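+ // For example, starting at offset 5 on a 32 bp node leaves bases 5..31 usable, so the
+ // node stays as the root of the tree; if the offset equals the node length, nothing of
+ // it remains and the root node is dropped entirely.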
+ bool start_included = (from.offset() < gbwt_graph.get_length(start_handle)); + + // Make sure we have a place to store the longest detectable gap + size_t gap_limit; + if (!longest_detectable_gap) { + longest_detectable_gap = &gap_limit; + } + + // Work it out because we need it for the limit of our search distance + *longest_detectable_gap = get_regular_aligner()->longest_detectable_gap(read_length, tail_length); + +#ifdef debug + cerr << "Tail length: " << tail_length << " Read length: " << read_length << " Longest detectable gap: " << *longest_detectable_gap << endl; +#endif + + // How long should we search? It should be the longest detectable gap plus the remaining sequence. + size_t search_limit = *longest_detectable_gap + tail_length; + +#ifdef debug + cerr << "Search limit: now " << search_limit << endl; +#endif + + // Do a DFS over the haplotypes in the GBWT out to that distance. + dfs_gbwt(*base_state, from.offset(), search_limit, [&](const handle_t& entered) { + // Enter a new handle. + + if (parent_stack.empty()) { + // This is the root of a new tree in the forrest + + if (!tree.empty()) { + // Save the old tree and start a new one. + // We need to cut off from.offset() from the root, unless we would cut off the whole root. + // In that case, the GBWT DFS will have skipped the empty root entirely, so we cut off nothing. + to_return.emplace_back(&gbwt_graph, std::move(tree), start_included ? from.offset() : 0); + tree.clear(); + } + + // Add this to the tree with no parent + tree.emplace_back(-1, entered); + } else { + // Just say this is visitable from our parent. + tree.emplace_back(parent_stack.back(), entered); + } + + // Record the parent index + parent_stack.push_back(tree.size() - 1); + }, [&]() { + // Exit the last visited handle. Pop off the stack. + parent_stack.pop_back(); + }); + + if (!tree.empty()) { + // Now save the last tree + to_return.emplace_back(&gbwt_graph, std::move(tree), start_included ? from.offset() : 0); + tree.clear(); + } + +#ifdef debug + cerr << "Found " << to_return.size() << " trees" << endl; +#endif + + // Now we have all the trees! + return to_return; +} + +size_t MinimizerMapper::immutable_path_from_length(const ImmutablePath& path) { + size_t to_return = 0; + for (auto& m : path) { + // Sum up the from lengths of all the component Mappings + to_return += mapping_from_length(m); + } + return to_return; +} + +Path MinimizerMapper::to_path(const ImmutablePath& path) { + Path to_return; + for (auto& m : path) { + // Copy all the Mappings into the Path. + *to_return.add_mapping() = m; + } + + // Flip the order around to actual path order. + std::reverse(to_return.mutable_mapping()->begin(), to_return.mutable_mapping()->end()); + + // Return the completed path + return to_return; +} + +void MinimizerMapper::dfs_gbwt(const Position& from, size_t walk_distance, + const function& enter_handle, const function exit_handle) const { + + // Get a handle to the node the from position is on, in the position's forward orientation + handle_t start_handle = gbwt_graph.get_handle(from.node_id(), from.is_reverse()); + + // Delegate to the handle-based version + dfs_gbwt(start_handle, from.offset(), walk_distance, enter_handle, exit_handle); + +} + +void MinimizerMapper::dfs_gbwt(handle_t from_handle, size_t from_offset, size_t walk_distance, + const function& enter_handle, const function exit_handle) const { + + // Turn from_handle into a SearchState for everything on it. 
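The tree representation being filled in here is a flat vector of (parent index, handle) pairs, with a stack of parent indices driving the enter/exit callbacks. The following self-contained sketch shows that encoding with plain `int` payloads instead of `handle_t`; flushing a finished tree into the forest when a new root appears is omitted, and all names are illustrative.

```cpp
#include <cstdint>
#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

// A tree stored as (parent index, payload) pairs, in the same spirit as the
// vector<pair<int64_t, handle_t>> trees built by get_tail_forest().
struct ForestBuilder {
    std::vector<std::pair<std::int64_t, int>> tree;
    std::vector<std::size_t> parent_stack;

    // Called when a DFS enters a node.
    void enter(int payload) {
        if (parent_stack.empty()) {
            tree.emplace_back(-1, payload);                  // a root
        } else {
            tree.emplace_back(parent_stack.back(), payload); // child of current parent
        }
        parent_stack.push_back(tree.size() - 1);
    }

    // Called when the DFS leaves the most recently entered node.
    void exit() { parent_stack.pop_back(); }
};

int main() {
    // Simulate the enter/exit sequence of a small DFS: 1 -> (2, 3 -> (4)).
    ForestBuilder b;
    b.enter(1); b.enter(2); b.exit(); b.enter(3); b.enter(4); b.exit(); b.exit(); b.exit();
    for (std::size_t i = 0; i < b.tree.size(); i++) {
        std::cout << "node " << i << " payload " << b.tree[i].second
                  << " parent " << b.tree[i].first << "\n";
    }
    // Prints parents -1, 0, 0, 2: the same parent-index encoding used above.
    return 0;
}
```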
+ gbwt::SearchState start_state = gbwt_graph.get_state(from_handle); + + // Delegate to the state-based version + dfs_gbwt(start_state, from_offset, walk_distance, enter_handle, exit_handle); +} + +void MinimizerMapper::dfs_gbwt(const gbwt::SearchState& start_state, size_t from_offset, size_t walk_distance, + const function& enter_handle, const function exit_handle) const { + + if (start_state.empty()) { + // No haplotypes even visit the first node. Stop. + return; + } + + // Get the handle we are starting on + handle_t from_handle = gbwt_graph.node_to_handle(start_state.node); + + // The search state represents searching through the end of the node, so we have to consume that much search limit. + + // Tack on how much search limit distance we consume by going to the end of + // the node. Our start position is a cut *between* bases, and we take everything after it. + // If the cut is at the offset of the whole length of the node, we take 0 bases. + // If it is at 0, we take all the bases in the node. + size_t remaining_root = gbwt_graph.get_length(from_handle) - from_offset; + +#ifdef debug + cerr << "DFS starting at offset " << from_offset << " on node " + << gbwt_graph.get_id(from_handle) << " " << gbwt_graph.get_is_reverse(from_handle) << " of length " + << gbwt_graph.get_length(from_handle) << " leaving " << remaining_root << " bp" << endl; +#endif + + // We would do a recursive formulation, but if attempted on long reads that can run out of stack. + // See . + // So we go the iterative route. Which means we need a stack frame for our simulated stack. + struct Frame { + gbwt::SearchState here_state; + size_t used_distance; + bool visit; + }; + std::stack stack; + + // Start the DFS with our stating node, consuming the distance from our + // offset to its end. Don't show the root state to the user if we don't + // actually visit any bases on that node. + // Make sure we don't count the length of the root node inside the DFS, + // since we are already feeding it in. + stack.push({start_state, 0, false}); + + while (!stack.empty()) { + auto& frame = stack.top(); + + handle_t here_handle = gbwt_graph.node_to_handle(frame.here_state.node); + + // If we're at the root node we don't measure it the same way + bool is_root = (stack.size() == 1); + // If we're at the root node but there's no bases on it left, we don't show it. + bool is_hidden_root = (is_root && remaining_root == 0); + + if (frame.visit == false) { + // First visit + frame.visit = true; + + if (!is_hidden_root) { + // Enter this handle if there are any bases on it to visit +#ifdef debug + cerr << "Enter handle " << gbwt_graph.get_id(here_handle) << " " << gbwt_graph.get_is_reverse(here_handle) << endl; +#endif + enter_handle(here_handle); + } + + // Use the length of the node, or the remaining root if we're the root. + size_t node_length = is_root ? remaining_root : gbwt_graph.get_length(here_handle); + frame.used_distance += node_length; +#ifdef debug + cerr << "Node was " << node_length << " bp; Used " << frame.used_distance << "/" << walk_distance << " bp distance" << endl; +#endif + if (frame.used_distance < walk_distance) { + // If we haven't used up all our distance yet + + // Stack up all the paths onto the stack to be processed. + gbwt_graph.follow_paths(frame.here_state, [&](const gbwt::SearchState& there_state) -> bool { + // For each next state + + // Do it with the new distance value. + stack.push({there_state, frame.used_distance, 0}); + + return true; + }); + + // Jump to handling the new top of the stack. 
When we come back + // to us, we'll take the second visit branch. + continue; + } + } + + // Second visit, or first visit didn't expand the stack. + + if (!is_hidden_root) { + // Exit this handle if we entered it + +#ifdef debug + cerr << "Exit handle " << gbwt_graph.get_id(here_handle) << " " << gbwt_graph.get_is_reverse(here_handle) << endl; +#endif + + exit_handle(); + } + + // Remove ourselves from the stack. + stack.pop(); + } +} + +//----------------------------------------------------------------------------- + +double MinimizerMapper::score_alignment_pair(Alignment& aln1, Alignment& aln2, int64_t fragment_distance) { + //Score a pair of alignments + + double dev = fragment_distance - fragment_length_distr.mean(); + double fragment_length_log_likelihood = (-dev * dev / (2.0 * fragment_length_distr.std_dev() * fragment_length_distr.std_dev()))/ get_regular_aligner()->log_base; + double score = aln1.score() + aln2.score() +fragment_length_log_likelihood ; + + //Don't let the fragment length log likelihood bring score down below the score of the best alignment + double worse_score = std::min(aln1.score(), aln2.score()); + + return std::max(score, worse_score);; +} + +double MinimizerMapper::distance_to_annotation(int64_t distance) const { + // We use numeric_limits::max() to represent no distance. But that + // can't convert to double (which rounds up) and then safely back to int64. + // We also aren't allowed inf or nan in a Protobuf double Value. So change + // the sentinel value to 0 which is probably not a fragment length. + if (distance == numeric_limits::max()) { + distance = 0; + } + + // Make sure we can't generate any >64 bit integers in the double cast by + // clamping to the doubles that are also integers. + static_assert(DBL_MANT_DIG <= 64, "We assume doubles have <64 bits of mantissa"); + double max_int_double = (double)((int64_t)1 << DBL_MANT_DIG); + return max(min((double) distance, max_int_double), -max_int_double); +} + +} + + diff --git a/src/minimizer_mapper.hpp b/src/minimizer_mapper.hpp new file mode 100644 index 00000000000..ab653ea80a6 --- /dev/null +++ b/src/minimizer_mapper.hpp @@ -0,0 +1,1136 @@ +#ifndef VG_MINIMIZER_MAPPER_HPP_INCLUDED +#define VG_MINIMIZER_MAPPER_HPP_INCLUDED + +/** + * \file minimizer_mapper.hpp + * Defines a mapper that uses the minimizer index and GBWT-based extension. + */ + +#include "algorithms/chain_items.hpp" +#include "algorithms/nearest_offsets_in_paths.hpp" +#include "aligner.hpp" +#include "vg/io/alignment_emitter.hpp" +#include "gbwt_extender.hpp" +#include "snarl_seed_clusterer.hpp" +#include "mapper.hpp" +#include "snarls.hpp" +#include "tree_subgraph.hpp" +#include "funnel.hpp" + +#include +#include + +#include + +namespace vg { + +//#define debug_chaining + +using namespace std; +using namespace vg::io; + +class MinimizerMapper : public AlignerClient { +public: + + /** + * Construct a new MinimizerMapper using the given indexes. The PathPositionhandleGraph can be nullptr, + * as we only use it for correctness tracking. + */ + + MinimizerMapper(const gbwtgraph::GBWTGraph& graph, + const gbwtgraph::DefaultMinimizerIndex& minimizer_index, + SnarlDistanceIndex* distance_index, + const PathPositionHandleGraph* path_graph = nullptr); + + using AlignerClient::set_alignment_scores; + virtual void set_alignment_scores(const int8_t* score_matrix, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus); + + /** + * Map the given read, and send output to the given AlignmentEmitter. May be run from any thread. 
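The pair-scoring rule in `score_alignment_pair` is simple enough to spell out separately: add a Gaussian fragment-length log-likelihood (scaled by the aligner's log base) to the two alignment scores, then floor the result at the worse single-end score so a bad fragment length cannot sink an otherwise good pair. A sketch with illustrative values standing in for the fragment length distribution and aligner parameters:

```cpp
#include <algorithm>
#include <iostream>

// Sketch of the formula in score_alignment_pair(); mean, std_dev and log_base
// are stand-ins for values taken from fragment_length_distr and the aligner.
double score_pair(double score1, double score2, double fragment_distance,
                  double mean, double std_dev, double log_base) {
    double dev = fragment_distance - mean;
    double frag_log_likelihood = (-dev * dev / (2.0 * std_dev * std_dev)) / log_base;
    double score = score1 + score2 + frag_log_likelihood;
    // Never drop below the worse of the two single-end scores.
    return std::max(score, std::min(score1, score2));
}

int main() {
    // A near-mean fragment barely changes the summed score (~189.99)...
    std::cout << score_pair(100, 90, 510, 500, 50, 1.4) << "\n";
    // ...while a wildly off fragment length is floored at the worse end's score (90).
    std::cout << score_pair(100, 90, 5000, 500, 50, 1.4) << "\n";
    return 0;
}
```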
+ * TODO: Can't be const because the clusterer's cluster_seeds isn't const. + */ + void map(Alignment& aln, AlignmentEmitter& alignment_emitter); + + /** + * Map the given read. Return a vector of alignments that it maps to, winner first. + */ + vector map(Alignment& aln); + + /** + * Map the given read using chaining of seeds. Return a vector of alignments that it maps to, winner first. + */ + vector map_from_chains(Alignment& aln); + + /** + * Map the given read using gapless extensions. Return a vector of alignments that it maps to, winner first. + */ + vector map_from_extensions(Alignment& aln); + + // The idea here is that the subcommand feeds all the reads to the version + // of map_paired that takes a buffer, and then empties the buffer by + // iterating over it in parallel with the version that doesn't. + // TODO: how will we warn about not having a pair distribution yet then? + + /** + * Map the given pair of reads, where aln1 is upstream of aln2 and they are + * oriented towards each other in the graph. + * + * If the reads are ambiguous and there's no fragment length distribution + * fixed yet, they will be dropped into ambiguous_pair_buffer. + * + * Otherwise, at least one result will be returned for them (although it + * may be the unmapped alignment). + */ + pair, vector> map_paired(Alignment& aln1, Alignment& aln2, + vector>& ambiguous_pair_buffer); + + /** + * Map the given pair of reads, where aln1 is upstream of aln2 and they are + * oriented towards each other in the graph. + * + * If the fragment length distribution is not yet fixed, reads will be + * mapped independently. Otherwise, they will be mapped according to the + * fragment length distribution. + */ + pair, vector> map_paired(Alignment& aln1, Alignment& aln2); + + + + + // Mapping settings. + // TODO: document each + + /// Use all minimizers with at most hit_cap hits + static constexpr size_t default_hit_cap = 10; + size_t hit_cap = default_hit_cap; + + /// Ignore all minimizers with more than hard_hit_cap hits + static constexpr size_t default_hard_hit_cap = 500; + size_t hard_hit_cap = default_hard_hit_cap; + + /// Take minimizers between hit_cap and hard_hit_cap hits until this fraction + /// of total score + static constexpr double default_minimizer_score_fraction = 0.9; + double minimizer_score_fraction = default_minimizer_score_fraction; + + /// Maximum number of distinct minimizers to take + static constexpr size_t default_max_unique_min = 500; + size_t max_unique_min = default_max_unique_min; + + /// Number of minimzers to select based on read_len/num_min_per_bp + static constexpr size_t default_num_bp_per_min = 1000; + size_t num_bp_per_min = default_num_bp_per_min; + + /// If set, exclude overlapping minimizers + static constexpr bool default_exclude_overlapping_min = false; + bool exclude_overlapping_min = default_exclude_overlapping_min; + + ////////////// + // Alignment-from-gapless-extension/short read Giraffe specific parameters: + ////////////// + + ///Accept at least this many clusters for gapless extension + static constexpr size_t default_min_extensions = 2; + size_t min_extensions = default_min_extensions; + + /// How many clusters should we produce gapless extensions for, max? 
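How `hit_cap`, `hard_hit_cap` and `minimizer_score_fraction` interact is described only in the parameter comments above, so here is a rough standalone sketch of that interaction. It deliberately ignores `max_unique_min` and the read-length-based cap, and it is not the real selection code in `find_seeds`; all names and values are illustrative.

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// A toy minimizer: how many index hits it has and its selection score.
struct ToyMinimizer {
    std::size_t hits;
    double score;
};

// Rough sketch: minimizers over hard_hit_cap are dropped, those at or under
// hit_cap are always kept, and the in-between ones are taken in score order
// until the kept score reaches minimizer_score_fraction of the total.
std::vector<std::size_t> select_minimizers(const std::vector<ToyMinimizer>& mins,
                                           std::size_t hit_cap, std::size_t hard_hit_cap,
                                           double score_fraction) {
    std::vector<std::size_t> order(mins.size());
    for (std::size_t i = 0; i < order.size(); i++) order[i] = i;
    std::sort(order.begin(), order.end(),
              [&](std::size_t a, std::size_t b) { return mins[a].score > mins[b].score; });

    double total_score = 0.0;
    for (const auto& m : mins) total_score += m.score;

    std::vector<std::size_t> selected;
    double selected_score = 0.0;
    for (std::size_t i : order) {
        if (mins[i].hits > hard_hit_cap) continue;  // always ignored
        if (mins[i].hits <= hit_cap || selected_score < score_fraction * total_score) {
            selected.push_back(i);
            selected_score += mins[i].score;
        }
    }
    return selected;
}

int main() {
    std::vector<ToyMinimizer> mins{{5, 3.0}, {50, 2.5}, {800, 4.0}, {200, 1.0}};
    for (std::size_t i : select_minimizers(mins, 10, 500, 0.9)) std::cout << i << " ";
    std::cout << "\n";  // prints "0 1 3": minimizer 2 exceeds the hard hit cap and is dropped
    return 0;
}
```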
+ static constexpr size_t default_max_extensions = 800; + size_t max_extensions = default_max_extensions; + + //If an extension set's score is smaller than the best + //extension's score by more than this much, don't align it + static constexpr double default_extension_set_score_threshold = 20; + double extension_set_score_threshold = default_extension_set_score_threshold; + + //If an extension's score is smaller than the best extension's score by + //more than this much, don't align it + static constexpr int default_extension_score_threshold = 1; + int extension_score_threshold = default_extension_score_threshold; + + /// Disregard the extension set score thresholds when they would give us + /// fewer than this many extension sets. + static constexpr int default_min_extension_sets = 2; + int min_extension_sets = default_min_extension_sets; + + /// Even if we would have fewer than min_extension_sets results, don't + /// process anything with a score smaller than this. + static constexpr int default_extension_set_min_score = 20; + int extension_set_min_score = default_extension_set_min_score; + + ///////////////// + // More shared parameters: + ///////////////// + + /// How many extended clusters should we align, max? + static constexpr size_t default_max_alignments = 8; + size_t max_alignments = default_max_alignments; + + /// How many extensions should we try as seeds within a mapping location? + static constexpr size_t default_max_local_extensions = numeric_limits::max(); + size_t max_local_extensions = default_max_local_extensions; + + /// If a cluster's score is smaller than the best score of any cluster by more than + /// this much, then don't extend it + static constexpr double default_cluster_score_threshold = 50; + double cluster_score_threshold = default_cluster_score_threshold; + + /// If the second best cluster's score is no more than this many points below + /// the cutoff set by cluster_score_threshold, snap that cutoff down to the + /// second best cluster's score, to avoid throwing away promising + /// secondaries. + static constexpr double default_pad_cluster_score_threshold = 20; + double pad_cluster_score_threshold = default_pad_cluster_score_threshold; + + /// If the read coverage of a cluster is less than the best coverage of any cluster + /// by more than this much, don't extend it + static constexpr double default_cluster_coverage_threshold = 0.3; + double cluster_coverage_threshold = default_cluster_coverage_threshold; + + ////////////////// + // Alignment-from-chains/long read Giraffe specific parameters: + ////////////////// + + /// If true, produce alignments from extension sets by chaining gapless + /// extensions up and aligning the sequences between them. If false, + /// produce alignments by aligning the tails off of individual gapless + /// extensions. + static constexpr bool default_align_from_chains = false; + bool align_from_chains = default_align_from_chains; + + /// What read-length-independent distance threshold do we want to use for clustering? 
+ static constexpr size_t default_chaining_cluster_distance = 100;
+ size_t chaining_cluster_distance = default_chaining_cluster_distance;
+
+ /// If the read coverage of a precluster connection is less than the best of any
+ /// by more than this much, don't extend it
+ static constexpr double default_precluster_connection_coverage_threshold = 0.3;
+ double precluster_connection_coverage_threshold = default_precluster_connection_coverage_threshold;
+
+ /// How many connections between preclusters should we reseed over, minimum?
+ static constexpr size_t default_min_precluster_connections = 10;
+ size_t min_precluster_connections = default_min_precluster_connections;
+
+ /// How many connections between preclusters should we reseed over, maximum?
+ static constexpr size_t default_max_precluster_connections = 50;
+ size_t max_precluster_connections = default_max_precluster_connections;
+
+ /// When connecting subclusters for reseeding, how far should we search?
+ static constexpr size_t default_reseed_search_distance = 10000;
+ size_t reseed_search_distance = default_reseed_search_distance;
+
+ // TODO: These will go away with cluster-merging chaining
+ /// Accept at least this many clusters for chain generation
+ static constexpr size_t default_min_clusters_to_chain = 2;
+ size_t min_clusters_to_chain = default_min_clusters_to_chain;
+ /// How many clusters should we produce chains for, max?
+ static constexpr size_t default_max_clusters_to_chain = 20;
+ size_t max_clusters_to_chain = default_max_clusters_to_chain;
+
+ /// When converting chains to alignments, what's the longest gap between
+ /// items we will actually try to align? Passing strings longer than ~100bp
+ /// can cause WFAAligner to run for a pathologically long amount of time.
+ /// May not be 0.
+ static constexpr size_t default_max_chain_connection = 100;
+ size_t max_chain_connection = default_max_chain_connection;
+ /// Similarly, what is the maximum tail length we will try to align?
+ static constexpr size_t default_max_tail_length = 100;
+ size_t max_tail_length = default_max_tail_length;
+
+ /// How many bases should we look back when chaining? Needs to be about the
+ /// same as the clustering distance or we will be able to cluster but not
+ /// chain.
+ static constexpr size_t default_max_lookback_bases = 100;
+ size_t max_lookback_bases = default_max_lookback_bases;
+ /// How many chaining sources should we make sure to consider regardless of distance?
+ static constexpr size_t default_min_lookback_items = 1;
+ size_t min_lookback_items = default_min_lookback_items;
+ /// How many chaining sources should we allow ourselves to consider ever?
+ static constexpr size_t default_lookback_item_hard_cap = 15;
+ size_t lookback_item_hard_cap = default_lookback_item_hard_cap;
+ /// How many bases should we try to look back initially when chaining?
+ static constexpr size_t default_initial_lookback_threshold = 10;
+ size_t initial_lookback_threshold = default_initial_lookback_threshold;
+ /// How much should we increase lookback when we can't find anything good?
+ static constexpr double default_lookback_scale_factor = 2.0;
+ double lookback_scale_factor = default_lookback_scale_factor;
+ /// How bad can a transition be per base before lookback accepts it?
+ static constexpr double default_min_good_transition_score_per_base = -0.1;
+ double min_good_transition_score_per_base = default_min_good_transition_score_per_base;
+ /// How much of a bonus should we give to each item in chaining?
+ static constexpr int default_item_bonus = 0; + int item_bonus = default_item_bonus; + /// How many bases of indel should we allow in chaining? + static constexpr size_t default_max_indel_bases = 50; + size_t max_indel_bases = default_max_indel_bases; + + /// If a chain's score is smaller than the best + /// chain's score by more than this much, don't align it + static constexpr double default_chain_score_threshold = 100; + double chain_score_threshold = default_chain_score_threshold; + + /// Disregard the chain score thresholds when they would give us + /// fewer than this many chains. + static constexpr int default_min_chains = 1; + int min_chains = default_min_chains; + + /// Even if we would have fewer than min_chains results, don't + /// process anything with a score smaller than this. + static constexpr int default_chain_min_score = 100; + int chain_min_score = default_chain_min_score; + + /// How long of a DP can we do before GSSW crashes due to 16-bit score + /// overflow? + static constexpr int MAX_DP_LENGTH = 30000; + + /// How many DP cells should we be willing to do in GSSW for an end-pinned + /// alignment? If we want to do more than this, just leave tail unaligned. + static constexpr size_t default_max_dp_cells = 16UL * 1024UL * 1024UL; + size_t max_dp_cells = default_max_dp_cells; + + ///////////////// + // More shared parameters: + ///////////////// + + static constexpr size_t default_max_multimaps = 1; + size_t max_multimaps = default_max_multimaps; + static constexpr size_t default_distance_limit = 200; + size_t distance_limit = default_distance_limit; + + /// If false, skip computing base-level alignments. + static constexpr bool default_do_dp = true; + bool do_dp = default_do_dp; + + /// Track which internal work items came from which others during each + /// stage of the mapping algorithm. + static constexpr bool default_track_provenance = false; + bool track_provenance = default_track_provenance; + + /// Guess which seed hits are correct by location in the linear reference + /// and track if/when their descendants make it through stages of the + /// algorithm. Only works if track_provenance is true. + static constexpr bool default_track_correctness = false; + bool track_correctness = default_track_correctness; + + /// If set, log what the mapper is thinking in its mapping of each read. + static constexpr bool default_show_work = false; + bool show_work = default_show_work; + + ////How many stdevs from fragment length distr mean do we cluster together? + static constexpr double default_paired_distance_stdevs = 2.0; + double paired_distance_stdevs = default_paired_distance_stdevs; + + ///How close does an alignment have to be to the best alignment for us to rescue on it + static constexpr double default_paired_rescue_score_limit = 0.9; + double paired_rescue_score_limit = default_paired_rescue_score_limit; + + ///How many stdevs from the mean do we extract a subgraph from? + static constexpr double default_rescue_subgraph_stdevs = 4.0; + double rescue_subgraph_stdevs = default_rescue_subgraph_stdevs; + + /// Do not attempt rescue if there are more seeds in the rescue subgraph. + static constexpr size_t default_rescue_seed_limit = 100; + size_t rescue_seed_limit = default_rescue_seed_limit; + + /// For paired end mapping, how many times should we attempt rescue (per read)? 
+ static constexpr size_t default_max_rescue_attempts = 15; + size_t max_rescue_attempts = default_max_rescue_attempts; + + /// How big of an alignment in POA cells should we ever try to do with Dozeu? + /// TODO: Lift this when Dozeu's allocator is able to work with >4 MB of memory. + /// Each cell is 16 bits in Dozeu, and we leave some room for the query and + /// padding to full SSE registers. Note that a very chopped graph might + /// still break this! + static constexpr size_t default_max_dozeu_cells = (size_t)(1.5 * 1024 * 1024); + size_t max_dozeu_cells = default_max_dozeu_cells; + + ///What is the maximum fragment length that we accept as valid for paired-end reads? + static constexpr size_t default_max_fragment_length = 2000; + size_t max_fragment_length = default_max_fragment_length; + + /// Implemented rescue algorithms: no rescue, dozeu, GSSW. + enum RescueAlgorithm { rescue_none, rescue_dozeu, rescue_gssw }; + + /// The algorithm used for rescue. + RescueAlgorithm rescue_algorithm = rescue_dozeu; + + /// Apply this sample name + string sample_name; + /// Apply this read group name + string read_group; + + /// Have we complained about hitting the size limit for rescue? + atomic_flag warned_about_rescue_size = ATOMIC_FLAG_INIT; + + /// Have we complained about hitting the size limit for tails? + mutable atomic_flag warned_about_tail_size = ATOMIC_FLAG_INIT; + + bool fragment_distr_is_finalized () {return fragment_length_distr.is_finalized();} + void finalize_fragment_length_distr() { + if (!fragment_length_distr.is_finalized()) { + fragment_length_distr.force_parameters(fragment_length_distr.mean(), fragment_length_distr.std_dev()); + } + } + void force_fragment_length_distr(double mean, double stdev) { + fragment_length_distr.force_parameters(mean, stdev); + } + double get_fragment_length_mean() const { return fragment_length_distr.mean(); } + double get_fragment_length_stdev() const {return fragment_length_distr.std_dev(); } + size_t get_fragment_length_sample_size() const { return fragment_length_distr.curr_sample_size(); } + + /** + * Get the distance limit for the given read length + */ + size_t get_distance_limit(size_t read_length) const { + return max(distance_limit, read_length + 50); + } + + /// The information we store for each seed. + typedef SnarlDistanceIndexClusterer::Seed Seed; + + /** + * We define our own type for minimizers, to use during mapping and to pass around between our internal functions. + * Also used to represent syncmers, in which case the only window, the "minimizer", and the agglomeration are all the same region. + */ + struct Minimizer { + typename gbwtgraph::DefaultMinimizerIndex::minimizer_type value; + size_t agglomeration_start; // What is the start base of the first window this minimizer instance is minimal in? + size_t agglomeration_length; // What is the length in bp of the region of consecutive windows this minimizer instance is minimal in? + size_t hits; // How many hits does the minimizer have? + const typename gbwtgraph::DefaultMinimizerIndex::value_type* occs; + int32_t length; // How long is the minimizer (index's k) + int32_t candidates_per_window; // How many minimizers compete to be the best (index's w), or 1 for syncmers. + double score; // Scores as 1 + ln(hard_hit_cap) - ln(hits). + + // Sort the minimizers in descending order by score and group identical minimizers together. 
+ inline bool operator< (const Minimizer& another) const { + return (this->score > another.score || (this->score == another.score && this->value.key < another.value.key)); + } + + /// Get the starting position of the given minimizer on the forward strand. + /// Use this instead of value.offset which can really be the last base for reverse strand minimizers. + inline size_t forward_offset() const { + if (this->value.is_reverse) { + // We have the position of the last base and we need the position of the first base. + return this->value.offset - (this->length - 1); + } else { + // We already have the position of the first base. + return this->value.offset; + } + } + + /// How many bases are in a window for which a minimizer is chosen? + inline size_t window_size() const { + return length + candidates_per_window - 1; + } + + /// How many different windows are in this minimizer's agglomeration? + inline size_t agglomeration_window_count() const { + // Work out the length of a whole window, and then from that and the window count get the overall length. + return agglomeration_length - window_size() + 1; + } + + /// What is the minimizer sequence, in read orientation? + inline string forward_sequence() const { + string sequence = value.key.decode(length); + return value.is_reverse ? reverse_complement(sequence) : sequence; + } + }; + +protected: + + /// Convert an integer distance, with limits standing for no distance, to a + /// double annotation that can safely be parsed back from JSON into an + /// integer if it is integral. + double distance_to_annotation(int64_t distance) const; + + /// How should we initialize chain info when it's not stored in the minimizer index? + inline static gbwtgraph::Payload no_chain_info() { + return MIPayload::NO_CODE; + } + + /// How do we convert chain info to an actual seed of the type we are using? + /// Also needs to know the hit position, and the minimizer number. + inline static Seed chain_info_to_seed(const pos_t& hit, size_t minimizer, const gbwtgraph::Payload& chain_info) { + return { hit, minimizer, chain_info }; + } + + /// Convert a collection of seeds to a collection of chaining anchors. + std::vector to_anchors(const Alignment& aln, const VectorView& minimizers, const std::vector& seeds) const; + + /// Convert a single seed to a single chaining anchor. + algorithms::Anchor to_anchor(const Alignment& aln, const VectorView& minimizers, const Seed& seed) const; + + /// Convert an Anchor to a WFAAlignment + WFAAlignment to_wfa_alignment(const algorithms::Anchor& anchor) const; + + /// The information we store for each cluster. + typedef SnarlDistanceIndexClusterer::Cluster Cluster; + + // These are our indexes + const PathPositionHandleGraph* path_graph; // Can be nullptr; only needed for correctness tracking. + const gbwtgraph::DefaultMinimizerIndex& minimizer_index; + SnarlDistanceIndex* distance_index; + /// This is our primary graph. + const gbwtgraph::GBWTGraph& gbwt_graph; + + /// We have a gapless extender to extend seed hits in haplotype space. + /// Because this needs a reference to an Aligner, and because changing the + /// scoring parameters deletes all the alignmers, we need to keep this + /// somewhere we can clear out. + std::unique_ptr extender; + + /// We have a clusterer + SnarlDistanceIndexClusterer clusterer; + + /// We have a distribution for read fragment lengths that takes care of + /// knowing when we've observed enough good ones to learn a good + /// distribution. 
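The offset and window arithmetic documented in the Minimizer struct is worth seeing with concrete numbers: a reverse-strand minimizer stores the offset of its last base, a window spans k + w - 1 read bases, and an agglomeration of length L contains L - (k + w - 1) + 1 windows. A toy struct with illustrative values (not the real index types):

```cpp
#include <cassert>
#include <cstddef>

// Toy version of the offset/window arithmetic from the Minimizer struct above.
struct ToyMinimizer {
    std::size_t offset;                 // first base (forward) or last base (reverse)
    bool is_reverse;
    std::size_t length;                 // the index's k
    std::size_t candidates_per_window;  // the index's w (1 for syncmers)
    std::size_t agglomeration_length;   // bp spanned by the consecutive windows it wins

    // Start of the minimizer on the forward strand of the read.
    std::size_t forward_offset() const {
        return is_reverse ? offset - (length - 1) : offset;
    }
    // Number of read bases in one window that elects a minimizer.
    std::size_t window_size() const { return length + candidates_per_window - 1; }
    // Number of distinct windows in this minimizer's agglomeration.
    std::size_t agglomeration_window_count() const {
        return agglomeration_length - window_size() + 1;
    }
};

int main() {
    // k = 29, w = 11: a window covers 39 bases.
    ToyMinimizer m{120, true, 29, 11, 45};
    assert(m.forward_offset() == 92);            // 120 - (29 - 1)
    assert(m.window_size() == 39);               // 29 + 11 - 1
    assert(m.agglomeration_window_count() == 7); // 45 - 39 + 1
    return 0;
}
```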
+ FragmentLengthDistribution fragment_length_distr; + /// We may need to complain exactly once that the distribution is bad. + atomic_flag warned_about_bad_distribution = ATOMIC_FLAG_INIT; + +//----------------------------------------------------------------------------- + + // Stages of mapping. + + /** + * Find the minimizers in the sequence using the minimizer index, and + * return them sorted in read order. + */ + std::vector find_minimizers(const std::string& sequence, Funnel& funnel) const; + + /** + * Return the indices of all the minimizers, sorted in descending order by theit minimizers' scores. + */ + std::vector sort_minimizers_by_score(const std::vector& minimizers) const; + + /** + * Find seeds for all minimizers passing the filters. + */ + std::vector find_seeds(const VectorView& minimizers, const Alignment& aln, Funnel& funnel) const; + + /** + * If tracking correctness, mark seeds that are correctly mapped as correct + * in the funnel, based on proximity along paths to the input read's + * refpos. Otherwise, tag just as placed, with the seed's read interval. + * Assumes we are tracking provenance. + */ + void tag_seeds(const Alignment& aln, const std::vector::const_iterator& begin, const std::vector::const_iterator& end, const VectorView& minimizers, size_t funnel_offset, Funnel& funnel) const; + + /** + * Determine cluster score, read coverage, and a vector of flags for the + * minimizers present in the cluster. Score is the sum of the scores of + * distinct minimizers in the cluster, while read coverage is the fraction + * of the read covered by seeds in the cluster. + * + * Puts the cluster in the funnel as coming from its seeds. + */ + void score_cluster(Cluster& cluster, size_t i, const VectorView& minimizers, const std::vector& seeds, size_t seq_length, Funnel& funnel) const; + + /** + * Determine cluster score, read coverage, and a vector of flags for the + * minimizers present in the cluster. Score is the sum of the scores of + * distinct minimizers in the cluster, while read coverage is the fraction + * of the read covered by seeds in the cluster. + * + * Thinks of the cluster as being made out of some previous clusters and + * some new seeds from the tail end of seeds, which are already in the + * funnel, clusters first. seed_to_precluster maps from seed to the old + * cluster it is part of, or std::numeric_limits::max() if it isn't + * from an old cluster. + * + * Puts the cluster in the funnel. + */ + void score_merged_cluster(Cluster& cluster, size_t i, const VectorView& minimizers, const std::vector& seeds, size_t first_new_seed, const std::vector& seed_to_precluster, const std::vector& preclusters, size_t seq_length, Funnel& funnel) const; + + /** + * Reseed between the given graph and read positions. Produces new seeds by asking the given callback for minimizers' occurrence positions. + * Up to one end of the graph region can be a read end, with a pos_t matching is_empty(). + * The read region always needs to be fully defined. + */ + std::vector reseed_between( + size_t read_region_start, + size_t read_region_end, + pos_t left_graph_pos, + pos_t right_graph_pos, + const HandleGraph& graph, + const VectorView& minimizers, + const std::function&, const std::function&)>& for_each_pos_for_source_in_subgraph) const; + + /** + * Extends the seeds in a cluster into a collection of GaplessExtension objects. 
+ */ + vector extend_cluster( + const Cluster& cluster, + size_t cluster_num, + const VectorView& minimizers, + const std::vector& seeds, + const string& sequence, + vector>& minimizer_kept_cluster_count, + Funnel& funnel) const; + + /** + * Score the given group of gapless extensions. Determines the best score + * that can be obtained by chaining extensions together, using the given + * gap open and gap extend penalties to charge for either overlaps or gaps + * in coverage of the read. + * + * Enforces that overlaps cannot result in containment. + * + * Input extended seeds must be sorted by start position. + */ + static int score_extension_group(const Alignment& aln, const vector& extended_seeds, + int gap_open_penalty, int gap_extend_penalty); + + /** + * Score the set of extensions for each cluster using score_extension_group(). + * Return the scores in the same order as the extension groups. + */ + std::vector score_extensions(const std::vector>& extensions, const Alignment& aln, Funnel& funnel) const; + /** + * Score the set of extensions for each cluster using score_extension_group(). + * Return the scores in the same order as the extensions. + * + * This version allows the collections of extensions to be scored to come + * with annotating read numbers, which are ignored. + */ + std::vector score_extensions(const std::vector, size_t>>& extensions, const Alignment& aln, Funnel& funnel) const; + + /** + * Turn a chain into an Alignment. + * + * Operating on the given input alignment, align the tails and intervening + * sequences along the given chain of perfect-match seeds, and return an + * optimal Alignment. + */ + Alignment find_chain_alignment(const Alignment& aln, const VectorView& to_chain, const std::vector& chain) const; + + /** + * Operating on the given input alignment, align the tails dangling off the + * given extended perfect-match seeds and produce an optimal alignment into + * the given output Alignment object, best, and the second best alignment + * into second_best. + * + * Uses the given RNG to break ties. + */ + void find_optimal_tail_alignments(const Alignment& aln, const vector& extended_seeds, LazyRNG& rng, Alignment& best, Alignment& second_best) const; + +//----------------------------------------------------------------------------- + + // Rescue. + + /** + * Given an aligned read, extract a subgraph of the graph within a distance range + * based on the fragment length distribution and attempt to align the unaligned + * read to it. + * Rescue_forward is true if the aligned read is the first and false otherwise. + * Assumes that both reads are facing the same direction. + * TODO: This should be const, but some of the function calls are not. + */ + void attempt_rescue(const Alignment& aligned_read, Alignment& rescued_alignment, const VectorView& minimizers, bool rescue_forward); + + /** + * Return the all non-redundant seeds in the subgraph, including those from + * minimizers not used for mapping. + */ + GaplessExtender::cluster_type seeds_in_subgraph(const VectorView& minimizers, const std::unordered_set& subgraph) const; + + /** + * When we use dozeu for rescue, the reported alignment score is incorrect. + * 1) Dozeu only gives the full-length bonus once. + * 2) There is no penalty for a softclip at the edge of the subgraph. + * This function calculates the score correctly. If the score is <= 0, + * we realign the read using GSSW. + * TODO: This should be unnecessary. 
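The idea behind `score_extension_group` (chain extensions along the read, charging affine penalties for gaps or overlaps in read coverage) can be illustrated with a much simpler dynamic program. The sketch below is not the real implementation: it only skips containment and charges gap_open plus gap_extend per additional base for any gap or overlap; all types and values are illustrative.

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// A gapless extension reduced to its read interval [start, end) and its score.
struct ToyExtension {
    std::size_t start;
    std::size_t end;
    int score;
};

// Simplified chain-scoring DP: dp[i] is the best chain score ending at extension i.
int score_extension_group_sketch(std::vector<ToyExtension> exts, int gap_open, int gap_extend) {
    std::sort(exts.begin(), exts.end(),
              [](const ToyExtension& a, const ToyExtension& b) { return a.start < b.start; });
    int best_overall = 0;
    std::vector<int> dp(exts.size());
    for (std::size_t i = 0; i < exts.size(); i++) {
        dp[i] = exts[i].score;  // start a new chain here
        for (std::size_t j = 0; j < i; j++) {
            if (exts[j].end >= exts[i].end) continue;  // would be containment
            // Bases of gap (positive diff) or overlap (negative diff) between j and i.
            long diff = static_cast<long>(exts[i].start) - static_cast<long>(exts[j].end);
            long d = diff < 0 ? -diff : diff;
            int penalty = d == 0 ? 0 : gap_open + static_cast<int>(d - 1) * gap_extend;
            dp[i] = std::max(dp[i], dp[j] + exts[i].score - penalty);
        }
        best_overall = std::max(best_overall, dp[i]);
    }
    return best_overall;
}

int main() {
    // Two abutting extensions chain for free; the third pays for a 5 bp gap.
    std::vector<ToyExtension> exts{{0, 30, 30}, {30, 60, 30}, {65, 100, 35}};
    std::cout << score_extension_group_sketch(exts, 6, 1) << "\n";  // 30 + 30 + 35 - (6 + 4) = 85
    return 0;
}
```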
+ */ + void fix_dozeu_score(Alignment& rescued_alignment, const HandleGraph& rescue_graph, + const std::vector& topological_order) const; + +//----------------------------------------------------------------------------- + + // Helper functions. + + /** + * Get the distance between a pair of positions, or std::numeric_limits::max() if unreachable. + */ + int64_t distance_between(const pos_t& pos1, const pos_t& pos2); + + /** + * Get the distance between a pair of read alignments, or std::numeric_limits::max() if unreachable. + */ + int64_t distance_between(const Alignment& aln1, const Alignment& aln2); + + /** + * Get the unoriented distance between a pair of positions + */ + int64_t unoriented_distance_between(const pos_t& pos1, const pos_t& pos2) const; + + /** + * Convert the GaplessExtension into an alignment. This assumes that the + * extension is a full-length alignment and that the sequence field of the + * alignment has been set. + */ + void extension_to_alignment(const GaplessExtension& extension, Alignment& alignment) const; + + /** + * Convert a WFAAlignment into a vg Alignment. This assumes that the + * WFAAlignment is a full-length alignment and that the sequence field of + * the vg Alignment has been set. + */ + void wfa_alignment_to_alignment(const WFAAlignment& wfa_alignment, Alignment& alignment) const; + + /** + * Clip out the part of the graph between the given positions, and dagify + * it from the perspective of the anchors. If a left anchor is set, all + * heads should correspond to the left anchor, and if a right anchor is + * set, all tails should correspond to the right anchor. At least one + * anchor must be set. + * + * Calls the callback with an extracted, strand-split, dagified graph, and + * a function that translates from handle in the dagified graph to node ID + * and orientation in the base graph. + */ + static void with_dagified_local_graph(const pos_t& left_anchor, const pos_t& right_anchor, size_t max_path_length, const HandleGraph& graph, const std::function(const handle_t&)>&)>& callback); + + /** + * Clip out the part of the graph between the given positions and + * global-align the sequence of the given Alignment to it. Populate the + * Alignment's path and score. + * + * Finds an alignment against a graph path if it is <= max_path_length, and uses <= max_dp_cells GSSW cells. + * + * If one of the anchor positions is empty, does pinned alighnment against + * the other position. + */ + static void align_sequence_between(const pos_t& left_anchor, const pos_t& right_anchor, size_t max_path_length, const HandleGraph* graph, const GSSWAligner* aligner, Alignment& alignment, size_t max_dp_cells = std::numeric_limits::max()); + + /** + * Set pair partner references for paired mapping results. + */ + void pair_all(std::array, 2>& mappings) const; + + /** + * Add annotations to an Alignment with statistics about the minimizers. + * + * old_seed_count is the number of seeds in the seed vector actually + * created at the "seed" stage of the alignment process. new_seed_offset is + * where the first of thos eseeds appears in the funnel at the reseed stage. + */ + void annotate_with_minimizer_statistics(Alignment& target, const VectorView& minimizers, const std::vector& seeds, size_t old_seed_count, size_t new_seed_offset, const Funnel& funnel) const; + +//----------------------------------------------------------------------------- + + /** + * Compute MAPQ caps based on all minimizers that are explored, for some definition of explored. 
+ * + * Needs access to the input alignment for sequence and quality + * information. + * + * Returns only an "extended" cap at the moment. + */ + double compute_mapq_caps(const Alignment& aln, const VectorView& minimizers, + const SmallBitset& explored); + + /** + * Compute a bound on the Phred score probability of having created the + * agglomerations of the specified minimizers by base errors from the given + * sequence, which was sequenced with the given qualities. + * + * No limit is imposed if broken is empty. + * + * Takes the collection of all minimizers found, and a vector of the + * indices of minimizers we are interested in the agglomerations of. May + * modify the order of that index vector. + * + * Also takes the sequence of the read (to avoid Ns) and the quality string + * (interpreted as a byte array). + * + * Currently computes a lower-score-bound, upper-probability-bound, + * suitable for use as a mapping quality cap, by assuming the + * easiest-to-disrupt possible layout of the windows, and the lowest + * possible qualities for the disrupting bases. + */ + static double window_breaking_quality(const VectorView& minimizers, vector& broken, + const string& sequence, const string& quality_bytes); + + /** + * Compute a bound on the Phred score probability of a mapping beign wrong + * due to base errors and unlocated minimizer hits prevented us from + * finding the true alignment. + * + * Algorithm uses a "sweep line" dynamic programming approach. + * For a read with minimizers aligned to it: + * + * 000000000011111111112222222222 + * 012345678901234567890123456789 + * Read: ****************************** + * Minimizer 1: ***** + * Minimizer 2: ***** + * Minimizer 3: ***** + * Minimizer 4: ***** + * + * For each distinct read interval of overlapping minimizers, e.g. in the + * example the intervals 3,4,5; 6,7; 8,9,10; 18,19,20; 21,22; and 23,24,25 + * we consider base errors that would result in the minimizers in the + * interval being incorrect + * + * We use dynamic programming sweeping left-to-right over the intervals to + * compute the probability of the minimum number of base errors needed to + * disrupt all the minimizers. + * + * Will sort minimizers_explored (which is indices into minimizers) by + * minimizer start position. + */ + static double faster_cap(const VectorView& minimizers, vector& minimizers_explored, const string& sequence, const string& quality_bytes); + + /** + * Given a collection of minimizers, and a list of the minimizers we + * actually care about (as indices into the collection), iterate over + * common intervals of overlapping minimizer agglomerations. + * + * Calls the given callback with (left, right, bottom, top), where left is + * the first base of the agglomeration interval (inclusive), right is the + * last base of the agglomeration interval (exclusive), bottom is the index + * of the first minimizer with an agglomeration in the interval and top is + * the index of the last minimizer with an agglomeration in the interval + * (exclusive). + * + * minimizer_indices must be sorted by agglomeration end, and then by + * agglomeration start, so they can be decomposed into nice rectangles. + * + * Note that bottom and top are offsets into minimizer_indices, **NOT** + * minimizers itself. Only contiguous ranges in minimizer_indices actually + * make sense. 
+ */ + static void for_each_agglomeration_interval(const VectorView& minimizers, + const string& sequence, const string& quality_bytes, + const vector& minimizer_indices, + const function& iteratee); + + /** + * Gives the log10 prob of a base error in the given interval of the read, + * accounting for the disruption of specified minimizers. + * + * minimizers is the collection of all minimizers + * + * disrupt_begin and disrupt_end are iterators defining a sequence of + * **indices** of minimizers in minimizers that are disrupted. + * + * left and right are the inclusive and exclusive bounds of the interval + * of the read where the disruption occurs. + */ + static double get_log10_prob_of_disruption_in_interval(const VectorView& minimizers, + const string& sequence, const string& quality_bytes, + const vector::iterator& disrupt_begin, const vector::iterator& disrupt_end, + size_t left, size_t right); + + /** + * Gives the raw probability of a base error in the given column of the + * read, accounting for the disruption of specified minimizers. + * + * minimizers is the collection of all minimizers + * + * disrupt_begin and disrupt_end are iterators defining a sequence of + * **indices** of minimizers in minimizers that are disrupted. + * + * index is the position in the read where the disruption occurs. + */ + static double get_prob_of_disruption_in_column(const VectorView& minimizers, + const string& sequence, const string& quality_bytes, + const vector::iterator& disrupt_begin, const vector::iterator& disrupt_end, + size_t index); + + /** + * Get all the trees defining tails off the specified side of the specified + * gapless extension. Should only be called if a tail on that side exists, + * or this is a waste of time. + * + * If the gapless extension starts or ends at a node boundary, there may be + * multiple trees produced, each with a distinct root. + * + * If the gapless extension abuts the edge of the read, an empty forest + * will be produced. + * + * Each tree is represented as a TreeSubgraph over our gbwt_graph. + * + * If left_tails is true, the trees read out of the left sides of the + * gapless extension. Otherwise they read out of the right side. + * + * As a side effect, saves the length of the longest detectable gap in an + * alignment of a tail to the forest into the provided location, if set. + */ + vector get_tail_forest(const GaplessExtension& extended_seed, + size_t read_length, bool left_tails, size_t* longest_detectable_gap = nullptr) const; + + /** + * Find the best alignment of the given sequence against any of the trees + * provided in trees, where each tree is a TreeSubgraph over the GBWT + * graph. Each tree subgraph is rooted at the left in its own local + * coordinate space, even if we are pinning on the right. + * + * If no mapping is possible (for example, because there are no trees), + * produce a pure insert at default_position. + * + * Alignment is always pinned. + * + * If pin_left is true, pin the alignment on the left to the root of each + * tree. Otherwise pin it on the right to the root of each tree. + * + * Limits the length of the longest gap to longest_detectable_gap. + * + * Returns alignments in gbwt_graph space. + */ + pair get_best_alignment_against_any_tree(const vector& trees, const string& sequence, + const Position& default_position, bool pin_left, size_t longest_detectable_gap, LazyRNG& rng) const; + + /// We define a type for shared-tail lists of Mappings, to avoid constantly + /// copying Path objects. 
+ using ImmutablePath = structures::ImmutableList; + + /** + * Get the from length of an ImmutabelPath. + * + * Can't be called path_from_length or it will shadow the one for Paths + * instead of overloading. + */ + static size_t immutable_path_from_length(const ImmutablePath& path); + + /** + * Convert an ImmutablePath to a Path. + */ + static Path to_path(const ImmutablePath& path); + + /** + * Run a DFS on valid haplotypes in the GBWT starting from the given + * Position, and continuing up to the given number of bases. + * + * Calls enter_handle when the DFS enters a haplotype visit to a particular + * handle, and exit_handle when it exits a visit. These let the caller + * maintain a stack and track the traversals. + * + * The starting node is only entered if its offset isn't equal to its + * length (i.e. bases remain to be visited). + * + * Stopping early is not permitted. + */ + void dfs_gbwt(const Position& from, size_t walk_distance, + const function& enter_handle, const function exit_handle) const; + + /** + * The same as dfs_gbwt on a Position, but takes a handle in the + * backing gbwt_graph and an offset from the start of the handle instead. + */ + void dfs_gbwt(handle_t from_handle, size_t from_offset, size_t walk_distance, + const function& enter_handle, const function exit_handle) const; + + /** + * The same as dfs_gbwt on a handle and an offset, but takes a + * gbwt::SearchState that defines only some haplotypes on a handle to start + * with. + */ + void dfs_gbwt(const gbwt::SearchState& start_state, size_t from_offset, size_t walk_distance, + const function& enter_handle, const function exit_handle) const; + + /** + * Score a pair of alignments given the distance between them + */ + double score_alignment_pair(Alignment& aln1, Alignment& aln2, int64_t fragment_distance); + + /** + * Given a count of items, a function to get the score of each, a + * score-difference-from-the-best cutoff, a min and max processed item + * count, and a function to get a sort-shuffling seed for breaking ties, + * process items in descending score order by calling process_item with the + * item's number, until min_count items are processed and either max_count + * items are processed or the score difference threshold is hit (or we run + * out of items). + * + * If process_item returns false, the item is skipped and does not count + * against min_count or max_count. + * + * Call discard_item_by_count with the item's number for all remaining + * items that would pass the score threshold. + * + * Call discard_item_by_score with the item's number for all remaining + * items that would fail the score threshold. + */ + template + void process_until_threshold_a(size_t items, const function& get_score, + double threshold, size_t min_count, size_t max_count, + LazyRNG& rng, + const function& process_item, + const function& discard_item_by_count, + const function& discard_item_by_score) const; + + /** + * Same as the other process_until_threshold functions, except using a vector to supply scores. + */ + template + void process_until_threshold_b(const vector& scores, + double threshold, size_t min_count, size_t max_count, + LazyRNG& rng, + const function& process_item, + const function& discard_item_by_count, + const function& discard_item_by_score) const; + + /** + * Same as the other process_until_threshold functions, except user supplies + * comparator to sort the items (must still be sorted by score). 
+ */ + template + void process_until_threshold_c(size_t items, const function& get_score, + const function& comparator, + double threshold, size_t min_count, size_t max_count, + LazyRNG& get_seed, + const function& process_item, + const function& discard_item_by_count, + const function& discard_item_by_score) const; + + // Internal debugging functions + + /// Get the thread identifier prefix for logging + static string log_name(); + + /// Turn an Alignment into a conveniently-sized string for logging + static string log_alignment(const Alignment& aln); + + /// Turn an Path from an alignment into a conveniently-sized string for logging + static string log_alignment(const Path& path, bool force_condensed = false); + + /// Turn a list of bit flags into a compact representation. + static string log_bits(const std::vector& bits); + + /// Dump a whole chaining problem + static void dump_chaining_problem(const std::vector& anchors, const std::vector& cluster_seeds_sorted, const HandleGraph& graph); + + /// Dump all the given minimizers, with optional subset restriction + static void dump_debug_minimizers(const VectorView& minimizers, const string& sequence, + const vector* to_include = nullptr, size_t start_offset = 0, size_t length_limit = std::numeric_limits::max()); + + /// Dump all the extansions in an extension set + static void dump_debug_extension_set(const HandleGraph& graph, const Alignment& aln, const vector& extended_seeds); + + /// Print a sequence with base numbering + static void dump_debug_sequence(ostream& out, const string& sequence, size_t start_offset = 0, size_t length_limit = std::numeric_limits::max()); + + /// Print the seed content of a cluster. + static void dump_debug_clustering(const Cluster& cluster, size_t cluster_number, const VectorView& minimizers, const std::vector& seeds); + + /// Do a brute check of the clusters. Print errors to stderr + bool validate_clusters(const std::vector>& clusters, const std::vector>& seeds, size_t read_limit, size_t fragment_limit) const; + + /// Print information about a selected set of seeds. + static void dump_debug_seeds(const VectorView& minimizers, const std::vector& seeds, const std::vector& selected_seeds); + + /// Print information about a read to be aligned + static void dump_debug_query(const Alignment& aln); + + /// Print information about a read pair to be aligned + static void dump_debug_query(const Alignment& aln1, const Alignment& aln2); + + /// Length at which we cut over to long-alignment logging. + const static size_t LONG_LIMIT = 256; + + /// Count at which we cut over to summary logging. 
+ const static size_t MANY_LIMIT = 20; + + + friend class TestMinimizerMapper; +}; + +template +void MinimizerMapper::process_until_threshold_a(size_t items, const function& get_score, + double threshold, size_t min_count, size_t max_count, + LazyRNG& rng, + const function& process_item, + const function& discard_item_by_count, + const function& discard_item_by_score) const { + + process_until_threshold_c(items, get_score, [&](size_t a, size_t b) -> bool { + return (get_score(a) > get_score(b)); + },threshold, min_count, max_count, rng, process_item, discard_item_by_count, discard_item_by_score); +} + +template +void MinimizerMapper::process_until_threshold_b(const vector& scores, + double threshold, size_t min_count, size_t max_count, + LazyRNG& rng, + const function& process_item, + const function& discard_item_by_count, + const function& discard_item_by_score) const { + + process_until_threshold_c(scores.size(), [&](size_t i) -> Score { + return scores[i]; + }, [&](size_t a, size_t b) -> bool { + return (scores[a] > scores[b]); + },threshold, min_count, max_count, rng, process_item, discard_item_by_count, discard_item_by_score); +} + +template +void MinimizerMapper::process_until_threshold_c(size_t items, const function& get_score, + const function& comparator, + double threshold, size_t min_count, size_t max_count, + LazyRNG& rng, + const function& process_item, + const function& discard_item_by_count, + const function& discard_item_by_score) const { + + // Sort item indexes by item score + vector indexes_in_order; + indexes_in_order.reserve(items); + for (size_t i = 0; i < items; i++) { + indexes_in_order.push_back(i); + } + + // Put the highest scores first, but shuffle top ties so reads spray evenly + // across equally good mappings + sort_shuffling_ties(indexes_in_order.begin(), indexes_in_order.end(), comparator, rng); + + // Retain items only if their score is at least as good as this + double cutoff = items == 0 ? 0 : get_score(indexes_in_order[0]) - threshold; + + // Count up non-skipped items for min_count and max_count + size_t unskipped = 0; + + // Go through the items in descending score order. + for (size_t i = 0; i < indexes_in_order.size(); i++) { + // Find the item we are talking about + size_t& item_num = indexes_in_order[i]; + + if (threshold != 0 && get_score(item_num) <= cutoff) { + // Item would fail the score threshold + + if (unskipped < min_count) { + // But we need it to make up the minimum number. + + // Go do it. + // If it is not skipped by the user, add it to the total number + // of unskipped items, for min/max number accounting. + unskipped += (size_t) process_item(item_num); + } else { + // We will reject it for score + discard_item_by_score(item_num); + } + } else { + // The item has a good enough score + + if (unskipped < max_count) { + // We have room for it, so accept it. + + // Go do it. + // If it is not skipped by the user, add it to the total number + // of unskipped items, for min/max number accounting. + unskipped += (size_t) process_item(item_num); + } else { + // We are out of room! Reject for count. + discard_item_by_count(item_num); + } + } + } +} + +} + + + +#endif diff --git a/src/minimizer_mapper_from_chains.cpp b/src/minimizer_mapper_from_chains.cpp new file mode 100644 index 00000000000..c14c651a0f3 --- /dev/null +++ b/src/minimizer_mapper_from_chains.cpp @@ -0,0 +1,2085 @@ +/** + * \file minimizer_mapper_from_chains.cpp + * Defines the code for the long-read code path for the + * minimizer-and-GBWT-based mapper (long read Giraffe). 
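The selection policy that `process_until_threshold_c` implements (take items in descending score order, keep everything within the score threshold of the best up to max_count, and keep weaker items only while fewer than min_count have been processed) can be tried out with a small standalone function. This is a sketch of the policy only: the LazyRNG tie-shuffling and the separate discard callbacks are omitted, and all names are illustrative.

```cpp
#include <algorithm>
#include <cstddef>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

// Standalone sketch of the process_until_threshold_c() selection policy.
void process_until_threshold_sketch(const std::vector<double>& scores,
                                    double threshold, std::size_t min_count, std::size_t max_count,
                                    const std::function<bool(std::size_t)>& process_item) {
    std::vector<std::size_t> order(scores.size());
    std::iota(order.begin(), order.end(), 0);
    std::stable_sort(order.begin(), order.end(),
                     [&](std::size_t a, std::size_t b) { return scores[a] > scores[b]; });

    double cutoff = order.empty() ? 0 : scores[order.front()] - threshold;
    std::size_t unskipped = 0;
    for (std::size_t item : order) {
        bool passes_score = (threshold == 0 || scores[item] > cutoff);
        if ((passes_score && unskipped < max_count) || (!passes_score && unskipped < min_count)) {
            // Only items the callback accepts count toward min/max accounting.
            unskipped += process_item(item) ? 1 : 0;
        }
        // Otherwise the item is discarded (by count or by score, respectively).
    }
}

int main() {
    std::vector<double> cluster_scores{50, 48, 30, 10};
    process_until_threshold_sketch(cluster_scores, 5.0, 1, 3, [&](std::size_t i) {
        std::cout << "processing cluster " << i << " with score " << cluster_scores[i] << "\n";
        return true;  // pretend we always accept it
    });
    // Only clusters 0 and 1 are within 5 points of the best, so only they are processed.
    return 0;
}
```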
+ */ + +#include "minimizer_mapper.hpp" + +#include "annotation.hpp" +#include "path_subgraph.hpp" +#include "multipath_alignment.hpp" +#include "split_strand_graph.hpp" +#include "subgraph.hpp" +#include "statistics.hpp" +#include "algorithms/count_covered.hpp" +#include "algorithms/intersect_path_offsets.hpp" +#include "algorithms/extract_containing_graph.hpp" +#include "algorithms/extract_connecting_graph.hpp" +#include "algorithms/extract_extending_graph.hpp" +#include "algorithms/chain_items.hpp" + +#include +#include +#include + +#include +#include +#include +#include + +// Turn on debugging prints +//#define debug +// Turn on printing of minimizer fact tables +//#define print_minimizer_table +// Dump local graphs that we align against +//#define debug_dump_graph +// Dump fragment length distribution information +//#define debug_fragment_distr +//Do a brute force check that clusters are correct +//#define debug_validate_clusters + +namespace vg { + +using namespace std; + +void MinimizerMapper::score_merged_cluster(Cluster& cluster, + size_t i, + const VectorView& minimizers, + const std::vector& seeds, + size_t first_new_seed, + const std::vector& seed_to_precluster, + const std::vector& preclusters, + size_t seq_length, + Funnel& funnel) const { + + + if (this->track_provenance) { + // Say we're making it + funnel.producing_output(i); + } + + // Initialize the values. + cluster.score = 0.0; + cluster.coverage = 0.0; + cluster.present = SmallBitset(minimizers.size()); // TODO: This is probably usually too big to really be "small" now. + + // Collect the old clusters and new seeds we are coming from + // TODO: Skip if not tracking provenance? + std::vector to_combine; + // Deduplicate old clusters with a bit set + SmallBitset preclusters_seen(preclusters.size()); + + + // Determine the minimizers that are present in the cluster. + for (auto hit_index : cluster.seeds) { + // We have this seed's minimizer + cluster.present.insert(seeds[hit_index].source); + + if (hit_index < first_new_seed) { + // An old seed. + // We can also pick up an old cluster. + size_t old_cluster = seed_to_precluster.at(hit_index); + if (old_cluster != std::numeric_limits::max()) { + // This seed came form an old cluster, so we must have eaten it + if (!preclusters_seen.contains(old_cluster)) { + // Remember we used this old cluster + to_combine.push_back(old_cluster); + preclusters_seen.insert(old_cluster); + } + } + } else { + // Make sure we tell the funnel we took in this new seed. + // Translate from a space that is old seeds and then new seeds to a + // space that is old *clusters* and then new seeds + to_combine.push_back(hit_index - first_new_seed + preclusters.size()); + } + } + if (show_work) { + #pragma omp critical (cerr) + dump_debug_clustering(cluster, i, minimizers, seeds); + } + + // Compute the score and cluster coverage. + sdsl::bit_vector covered(seq_length, 0); + for (size_t j = 0; j < minimizers.size(); j++) { + if (cluster.present.contains(j)) { + const Minimizer& minimizer = minimizers[j]; + cluster.score += minimizer.score; + + // The offset of a reverse minimizer is the endpoint of the kmer + size_t start_offset = minimizer.forward_offset(); + size_t k = minimizer.length; + + // Set the k bits starting at start_offset. + covered.set_int(start_offset, sdsl::bits::lo_set[k], k); + } + } + // Count up the covered positions and turn it into a fraction. 
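The loop above marks, for each minimizer present in the cluster, the `k` read bases it covers, and the cluster coverage is the fraction of read bases marked at least once. A simplified sketch of the same computation with a plain `std::vector<bool>` in place of the `sdsl::bit_vector` (illustrative only; `(start, length)` pairs stand in for the Minimizer objects):

```cpp
// Sketch: fraction of a read covered by a set of (start, length) hits.
#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

double coverage_fraction(size_t read_length,
                         const std::vector<std::pair<size_t, size_t>>& hits) {
    std::vector<bool> covered(read_length, false);
    for (const auto& hit : hits) {
        size_t end = std::min(read_length, hit.first + hit.second);
        for (size_t i = hit.first; i < end; i++) {
            covered[i] = true;  // Mark each base this hit covers.
        }
    }
    auto ones = std::count(covered.begin(), covered.end(), true);
    return read_length == 0 ? 0.0 : static_cast<double>(ones) / read_length;
}
```

The statement below does the equivalent by dividing the popcount of the bit vector by the sequence length.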
+ cluster.coverage = sdsl::util::cnt_one_bits(covered) / static_cast(seq_length); + + if (this->track_provenance) { + // Record the cluster in the funnel as a group combining the previous groups. + funnel.merge_groups(to_combine.begin(), to_combine.end()); + funnel.score(funnel.latest(), cluster.score); + + // Say we made it. + funnel.produced_output(); + } + +} + +/// Get the forward-relative-to-the-read version of a seed's position. Will +/// have the correct orientation, but won't necessarily be to any particular +/// (i.e. first or last) base of the seed. +static pos_t forward_pos(const MinimizerMapper::Seed& seed, const VectorView& minimizers, const HandleGraph& graph) { + pos_t position = seed.pos; + if (minimizers[seed.source].value.is_reverse) { + // Need to flip the position, for which we need to fetch the node length. + position = reverse_base_pos(position, graph.get_length(graph.get_handle(id(position), is_rev(position)))); + } + return position; +} + +std::vector MinimizerMapper::reseed_between( + size_t read_region_start, + size_t read_region_end, + pos_t left_graph_pos, + pos_t right_graph_pos, + const HandleGraph& graph, + const VectorView& minimizers, + const std::function&, const std::function&)>& for_each_pos_for_source_in_subgraph +) const { + + // We are going to make up some seeds + std::vector forged_items; + + + std::vector seed_positions; + seed_positions.reserve(2); + std::vector position_forward_max_dist; + position_forward_max_dist.reserve(2); + std::vector position_backward_max_dist; + position_backward_max_dist.reserve(2); + + if (!is_empty(left_graph_pos)) { + // We have a left endpoint + seed_positions.emplace_back(left_graph_pos); + position_forward_max_dist.emplace_back(this->reseed_search_distance); + position_backward_max_dist.emplace_back(0); + } + + if (!is_empty(right_graph_pos)) { + // We have a left endpoint + seed_positions.emplace_back(right_graph_pos); + position_forward_max_dist.emplace_back(0); + position_backward_max_dist.emplace_back(this->reseed_search_distance); + } + + std::vector sorted_ids; + { + bdsg::HashGraph subgraph; + // TODO: can we use connecting graph again? + // TODO: Should we be using more seeds from the cluster? + algorithms::extract_containing_graph(&graph, &subgraph, seed_positions, this->reseed_search_distance); + sorted_ids.reserve(subgraph.get_node_count()); + subgraph.for_each_handle([&](const handle_t& h) { + sorted_ids.push_back(subgraph.get_id(h)); + }); + } + std::sort(sorted_ids.begin(), sorted_ids.end()); + + if (this->show_work) { + #pragma omp critical (cerr) + { + std::cerr << log_name() << "Reseeding against nodes "; + // Dump the nodes as consecutive ranges + nid_t prev_node; + nid_t printed_node; + for (size_t i = 0; i < sorted_ids.size(); i++) { + if (i == 0 || prev_node + 1 != sorted_ids[i]) { + if (i > 0) { + std::cerr << "-" << prev_node << ", "; + } + std::cerr << sorted_ids[i]; + printed_node = sorted_ids[i]; + } + prev_node = sorted_ids[i]; + } + if (!sorted_ids.empty() && printed_node != sorted_ids.back()) { + std::cerr << "-" << sorted_ids.back(); + } + std::cerr << endl; + } + } + + for (size_t i = 0; i < minimizers.size(); i++) { + auto& m = minimizers[i]; + + if (m.forward_offset() < read_region_start || m.forward_offset() + m.length > read_region_end) { + // Minimizer is not in the range we care about. + // TODO: Find a faster way to find the relevant minimizers that doesn't require a scan! Sort them by start position or something. 
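The TODO above suggests replacing this linear scan with a lookup over minimizers ordered by start position. A possible sketch of that idea, under the assumptions that the forward start offsets are kept in a sorted vector and that all minimizers share one length `k` (the helper name and signature are hypothetical, not vg code):

```cpp
// Sketch: indexes of minimizers fully inside [region_start, region_end),
// assuming `starts` is sorted ascending and every minimizer has length k.
#include <algorithm>
#include <cstddef>
#include <vector>

std::vector<size_t> minimizers_in_range(const std::vector<size_t>& starts, size_t k,
                                        size_t region_start, size_t region_end) {
    std::vector<size_t> in_range;
    // First start at or after region_start.
    auto first = std::lower_bound(starts.begin(), starts.end(), region_start);
    for (auto it = first; it != starts.end() && *it + k <= region_end; ++it) {
        in_range.push_back(static_cast<size_t>(it - starts.begin()));
    }
    return in_range;
}
```

As written, the original loop simply `continue`s past each out-of-range minimizer, as in the statement below, and only queries the in-range ones against the extracted subgraph.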
+ continue; + } + + if (this->show_work) { + #pragma omp critical (cerr) + { + std::cerr << log_name() << "Query minimizer #" << i << " at " << m.forward_offset() << " which overall has " << m.hits << " hits" << std::endl; + } + } + + // We may see duplicates, so we want to do our own deduplication. + unordered_set seen; + + size_t hit_count = 0; + + // Find all its hits in the part of the graph between the bounds + for_each_pos_for_source_in_subgraph(m, sorted_ids, [&](const pos_t& pos) { + // So now we know pos corresponds to read base + // m.value.offset, in the read's forward orientation. + + // Forge an item. + forged_items.emplace_back(); + forged_items.back().pos = pos; + forged_items.back().source = i; + + // Record the hit + hit_count++; + }); + + if (this->show_work) { + #pragma omp critical (cerr) + { + std::cerr << log_name() << "\tFound " << hit_count << "/" << m.hits << " hits" << std::endl; + } + } + } + + // TODO: sort and deduplicate the new seeds + + return forged_items; + +} + +vector MinimizerMapper::map_from_chains(Alignment& aln) { + + if (show_work) { + #pragma omp critical (cerr) + dump_debug_query(aln); + } + + // Make a new funnel instrumenter to watch us map this read. + Funnel funnel; + funnel.start(aln.name()); + + // Prepare the RNG for shuffling ties, if needed + LazyRNG rng([&]() { + return aln.sequence(); + }); + + + // Minimizers sorted by position + std::vector minimizers_in_read = this->find_minimizers(aln.sequence(), funnel); + // Indexes of minimizers, sorted into score order, best score first + std::vector minimizer_score_order = sort_minimizers_by_score(minimizers_in_read); + // Minimizers sorted by best score first + VectorView minimizers{minimizers_in_read, minimizer_score_order}; + // We may or may not need to invert this view, but if we do we will want to + // keep the result. So have a place to lazily keep an inverse. + std::unique_ptr minimizer_score_sort_inverse; + + // Find the seeds and mark the minimizers that were located. + vector seeds = this->find_seeds(minimizers, aln, funnel); + + // Pre-cluster just the seeds we have. Get sets of input seed indexes that go together. + if (track_provenance) { + funnel.stage("precluster"); + funnel.substage("compute-preclusters"); + } + + // Find the clusters up to a flat distance limit + std::vector preclusters = clusterer.cluster_seeds(seeds, chaining_cluster_distance); + + if (track_provenance) { + funnel.substage("score-preclusters"); + } + for (size_t i = 0; i < preclusters.size(); i++) { + Cluster& precluster = preclusters[i]; + this->score_cluster(precluster, i, minimizers, seeds, aln.sequence().length(), funnel); + } + + // Find pairs of "adjacent" preclusters + if (track_provenance) { + funnel.substage("pair-preclusters"); + } + + // To do that, we need start end end positions for each precluster, in the read + std::vector> precluster_read_ranges(preclusters.size(), {std::numeric_limits::max(), 0}); + // And the lowest-numbered seeds in the precluster from those minimizers. 
+ std::vector> precluster_bounding_seeds(preclusters.size(), {std::numeric_limits::max(), std::numeric_limits::max()}); + for (size_t i = 0; i < preclusters.size(); i++) { + // For each precluster + auto& precluster = preclusters[i]; + // We will fill in the range it ocvcupies in the read + auto& read_range = precluster_read_ranges[i]; + auto& graph_seeds = precluster_bounding_seeds[i]; + for (auto& seed_index : precluster.seeds) { + // Which means we look at the minimizer for each seed + auto& minimizer = minimizers[seeds[seed_index].source]; + + if (minimizer.forward_offset() < read_range.first) { + // Min all their starts to get the precluster start + read_range.first = minimizer.forward_offset(); + if (seed_index < graph_seeds.first) { + // And keep a seed hit + graph_seeds.first = seed_index; + } + } + + if (minimizer.forward_offset() + minimizer.length > read_range.second) { + // Max all their past-ends to get the precluster past-end + read_range.second = minimizer.forward_offset() + minimizer.length; + if (seed_index < graph_seeds.second) { + // And keep a seed hit + graph_seeds.second = seed_index; + } + } + } + } + + // Now we want to find, for each interval, the next interval that starts after it ends + // So we put all the intervals in an ordered map by start position. + std::map preclusters_by_start; + // We're also going to need to know which seeds went into which preclusters. + // TODO: We could get away with one seed per precluster here probably. + // TODO: Can we skip building this if not tracking provenance? + std::vector seed_to_precluster(seeds.size(), std::numeric_limits::max()); + for (size_t i = 0; i < preclusters.size(); i++) { + auto found = preclusters_by_start.find(precluster_read_ranges[i].first); + if (found == preclusters_by_start.end()) { + // First thing we've found starting here + preclusters_by_start.emplace_hint(found, precluster_read_ranges[i].first, i); + } else { + // When multiple preclusters start at a position, we always pick the one with the most seeds. + // TODO: score the preclusters and use the scores? + if (preclusters[found->second].seeds.size() < preclusters[i].seeds.size()) { + // If the one in the map has fewer seeds, replace it. + found->second = i; + } + } + for (auto& seed : preclusters[i].seeds) { + // Record which precluster this seed went into. + seed_to_precluster.at(seed) = i; + } + } + // And we need to know the unconnected-to preclusters with nothing to their + // left, which also won the contest for most seeds at their start position + // (and so could have been connected to) + std::unordered_set unconnected_preclusters; + for (auto& kv : preclusters_by_start) { + unconnected_preclusters.insert(kv.second); + } + // And then we do bound lookups for each cluster to find the next one + // And we put those pairs here. + using precluster_connection_t = std::pair; + std::vector precluster_connections; + for (size_t i = 0; i < preclusters.size(); i++) { + size_t past_end = precluster_read_ranges[i].second; + // Find the cluster with the most seeds that starts the soonest after the last base in this cluster. + auto found = preclusters_by_start.lower_bound(past_end); + if (found != preclusters_by_start.end()) { + // We found one. Can we connect them? + precluster_connections.emplace_back(i, found->second); + // Something might connect to them + unconnected_preclusters.erase(found->second); + } else { + // There's nothing after us, so connect to nowhere. 
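The pairing logic above keeps one representative precluster per read start position in an ordered map, then connects each precluster to the first precluster starting at or after its past-end. A simplified sketch of that pattern over plain `(start, past-end)` intervals (illustrative only; the real code keeps the precluster with the most seeds at each start, where this sketch keeps the first one seen):

```cpp
// Sketch: connect each interval to the first interval starting at or after
// its past-end; SIZE_MAX marks "no partner on the right".
#include <cstddef>
#include <cstdint>
#include <map>
#include <utility>
#include <vector>

std::vector<std::pair<size_t, size_t>> connect_intervals(
        const std::vector<std::pair<size_t, size_t>>& intervals /* (start, past-end) */) {
    std::map<size_t, size_t> by_start;  // start position -> interval index
    for (size_t i = 0; i < intervals.size(); i++) {
        by_start.emplace(intervals[i].first, i);  // first interval seen wins here
    }
    std::vector<std::pair<size_t, size_t>> connections;
    for (size_t i = 0; i < intervals.size(); i++) {
        auto found = by_start.lower_bound(intervals[i].second);
        connections.emplace_back(i, found == by_start.end() ? SIZE_MAX : found->second);
    }
    return connections;
}
```

As below, a precluster with nothing to its right is recorded against a max-value sentinel, and preclusters that nothing connected to are later paired the other way around, so both ends of the read still get a one-sided reseed.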
+ precluster_connections.emplace_back(i, std::numeric_limits::max()); + if (show_work) { + #pragma omp critical (cerr) + std::cerr << log_name() << "Precluster at {R:" << precluster_read_ranges[i].first << "-" << precluster_read_ranges[i].second << "} has nowhere to reseed to" << std::endl; + } + } + } + for (auto& unconnected : unconnected_preclusters) { + // These preclusters could have been connected to but weren't, so look left off of them. + precluster_connections.emplace_back(std::numeric_limits::max(), unconnected); + } + + if (track_provenance) { + funnel.stage("reseed"); + } + + if (track_provenance) { + // We project all preclusters into the funnel + for (size_t i = 0; i < preclusters.size(); i++) { + funnel.project_group(i, preclusters[i].seeds.size()); + } + } + + // Remember how many seeds we had before reseeding + size_t old_seed_count = seeds.size(); + + // We are going to need a widget for finding minimizer hit + // positions in a subgraph, in the right orientation. + auto find_minimizer_hit_positions = [&](const Minimizer& m, const vector& sorted_ids, const std::function& iteratee) -> void { + gbwtgraph::hits_in_subgraph(m.hits, m.occs, sorted_ids, [&](pos_t pos, gbwtgraph::Payload) { + if (m.value.is_reverse) { + // Convert to face along forward strand of read. + size_t node_length = this->gbwt_graph.get_length(this->gbwt_graph.get_handle(id(pos))); + pos = reverse_base_pos(pos, node_length); + } + // Show the properly stranded position to the iteratee. + iteratee(pos); + }); + }; + + // We are going to need our existing seeds in the form of something we can deduplicate. + // TODO: Also remove overlap? + std::unordered_set> seen_seeds; + for (auto& seed : seeds) { + seen_seeds.emplace(minimizers[seed.source].forward_offset(), seed.pos); + } + + // Connections don't appear in the funnel so we track them ourselves. + size_t precluster_connection_explored_count = 0; + + process_until_threshold_a(precluster_connections.size(), (std::function) [&](size_t i) -> double { + // Best pairs to connect are those with the highest average coverage + if (precluster_connections[i].first == std::numeric_limits::max()) { + return preclusters[precluster_connections[i].second].coverage; + } else if (precluster_connections[i].second == std::numeric_limits::max()) { + return preclusters[precluster_connections[i].first].coverage; + } else { + return (preclusters[precluster_connections[i].first].coverage + preclusters[precluster_connections[i].second].coverage) / 2; + } + }, + precluster_connection_coverage_threshold, + min_precluster_connections, + max_precluster_connections, + rng, + [&](size_t connection_num) -> bool { + // This connection is good enough + + // TODO: Add provenance tracking/stage for connections? + + // Reseed between each pair of preclusters and dump into seeds + auto& connected = precluster_connections[connection_num]; + + // Where should we start in the read + size_t left_read; + // And in the graph + pos_t left_pos; + if (connected.first == std::numeric_limits::max()) { + // Nothing is on the left side of this connection + left_read = 0; + left_pos = empty_pos_t(); + } else { + // Get the information from the precluster on the left side of this connection. + left_read = precluster_read_ranges[connected.first].second; + // Make sure graph position points forward along the read. 
+ left_pos = forward_pos(seeds.at(precluster_bounding_seeds[connected.first].second), minimizers, this->gbwt_graph); + } + + // Where should we end in the read + size_t right_read; + // And in the graph + pos_t right_pos; + if (connected.second == std::numeric_limits::max()) { + // Nothing is on the right side of this connection + right_read = aln.sequence().size(); + right_pos = empty_pos_t(); + } else { + // Get the information from the precluster on the right side of this connection. + right_read = precluster_read_ranges[connected.second].first; + // Make sure graph position points forward along the read. + right_pos = forward_pos(seeds.at(precluster_bounding_seeds[connected.second].first), minimizers, this->gbwt_graph); + } + + if (show_work) { + if (connected.first == std::numeric_limits::max()) { + #pragma omp critical (cerr) + { + std::cerr << log_name() << "Reseeding before precluster " << connected.second << " at {R:" << right_read << "-" << precluster_read_ranges[connected.second].second << " = G:" << right_pos + << "}" << std::endl; + } + } else if (connected.second == std::numeric_limits::max()) { + #pragma omp critical (cerr) + { + std::cerr << log_name() << "Reseeding after precluster " << connected.first << " at {R:" << precluster_read_ranges[connected.first].first << "-" << left_read << " = G:" << left_pos + << "}" << std::endl; + } + } else { + #pragma omp critical (cerr) + { + std::cerr << log_name() << "Reseeding between preclusters " << connected.first << " at {R:" << precluster_read_ranges[connected.first].first << "-" << left_read << " = G:" << left_pos + << "} and " << connected.second << " at {R:" << right_read << "-" << precluster_read_ranges[connected.second].second << " = G:" << right_pos + << "}" << std::endl; + } + } + + // Dump the minimizers in the region + this->dump_debug_minimizers(minimizers, aln.sequence(), nullptr, left_read, right_read - left_read); + } + + // Do the reseed + std::vector new_seeds = reseed_between(left_read, right_read, left_pos, right_pos, this->gbwt_graph, minimizers, find_minimizer_hit_positions); + + // Concatenate and deduplicate with existing seeds + size_t seeds_before = seeds.size(); + seeds.reserve(seeds_before + new_seeds.size()); + for (auto& seed : new_seeds) { + // Check if we have seen it before + std::pair key {minimizers[seed.source].forward_offset(), seed.pos}; + auto found = seen_seeds.find(key); + if (found == seen_seeds.end()) { + // Keep this new seed + seeds.emplace_back(std::move(seed)); + seen_seeds.emplace_hint(found, std::move(key)); + + if (this->track_provenance) { + funnel.introduce(); + // Tell the funnel we came from these preclusters together + if (connected.first != std::numeric_limits::max()) { + funnel.also_relevant(1, connected.first); + } + if (connected.second != std::numeric_limits::max()) { + funnel.also_relevant(1, connected.second); + } + // TODO: Tie these back to the minimizers, several stages ago. 
+ } + } + } + + if (show_work) { + #pragma omp critical (cerr) + { + std::cerr << log_name() << "Found " << new_seeds.size() << " seeds, of which " << (seeds.size() - seeds_before) << " are new" << std::endl; + std::vector new_seeds; + for (size_t i = seeds_before; i < seeds.size(); i++) { + new_seeds.push_back(i); + } + this->dump_debug_seeds(minimizers, seeds, new_seeds); + } + } + + precluster_connection_explored_count++; + + return true; + }, [&](size_t connection_num) -> void { + // There are too many sufficiently good connections + // TODO: Add provenance tracking + }, [&](size_t connection_num) -> void { + // This connection is not sufficiently good. + // TODO: Add provenance tracking + }); + + if (this->track_provenance) { + // Make items in the funnel for all the new seeds, basically as one-seed preclusters. + if (this->track_correctness) { + // Tag newly introduced seed items with correctness + funnel.substage("correct"); + } else { + // We're just tagging them with read positions + funnel.substage("placed"); + } + this->tag_seeds(aln, seeds.cbegin() + old_seed_count, seeds.cend(), minimizers, preclusters.size(), funnel); + } + + // Make the main clusters that include the recovered seeds + if (track_provenance) { + funnel.stage("cluster"); + } + + std::vector clusters = clusterer.cluster_seeds(seeds, chaining_cluster_distance); + + // Determine the scores and read coverages for each cluster. + // Also find the best and second-best cluster scores. + if (this->track_provenance) { + funnel.substage("score"); + } + double best_cluster_score = 0.0, second_best_cluster_score = 0.0; + for (size_t i = 0; i < clusters.size(); i++) { + Cluster& cluster = clusters[i]; + this->score_merged_cluster(cluster, + i, + minimizers, + seeds, + old_seed_count, + seed_to_precluster, + preclusters, + aln.sequence().length(), + funnel); + if (cluster.score > best_cluster_score) { + second_best_cluster_score = best_cluster_score; + best_cluster_score = cluster.score; + } else if (cluster.score > second_best_cluster_score) { + second_best_cluster_score = cluster.score; + } + } + + // Throw out some scratch + seed_to_precluster.clear(); + seen_seeds.clear(); + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Found " << clusters.size() << " clusters" << endl; + } + } + + // We will set a score cutoff based on the best, but move it down to the + // second best if it does not include the second best and the second best + // is within pad_cluster_score_threshold of where the cutoff would + // otherwise be. This ensures that we won't throw away all but one cluster + // based on score alone, unless it is really bad. + double cluster_score_cutoff = best_cluster_score - cluster_score_threshold; + if (cluster_score_cutoff - pad_cluster_score_threshold < second_best_cluster_score) { + cluster_score_cutoff = std::min(cluster_score_cutoff, second_best_cluster_score); + } + + if (track_provenance) { + // Now we go from clusters to chains + funnel.stage("chain"); + } + + // Convert the seeds into chainable anchors in the same order + vector seed_anchors = this->to_anchors(aln, minimizers, seeds); + + // These are the chains for all the clusters, as score and sequence of visited seeds. + vector>> cluster_chains; + cluster_chains.reserve(clusters.size()); + + // To compute the windows for explored minimizers, we need to get + // all the minimizers that are explored. + SmallBitset minimizer_explored(minimizers.size()); + //How many hits of each minimizer ended up in each cluster we kept? 
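Earlier in this function the cluster score cutoff was padded toward the second-best score, so a score-only filter cannot keep just a single cluster unless the runner-up really is bad. That rule, restated as a tiny standalone function (illustrative only, not vg code):

```cpp
// Sketch: cutoff starts at (best - threshold) but drops to the second-best
// score when the second-best cluster is within `pad` of that cutoff.
#include <algorithm>

double padded_score_cutoff(double best, double second_best, double threshold, double pad) {
    double cutoff = best - threshold;
    if (cutoff - pad < second_best) {
        cutoff = std::min(cutoff, second_best);
    }
    return cutoff;
}
```

The per-cluster minimizer hit counts declared just below are accumulated after alignment into `minimizer_kept_count` and feed the explored-minimizer MAPQ cap at the end of the function.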
+ vector> minimizer_kept_cluster_count; + + size_t kept_cluster_count = 0; + + // What cluster seeds define the space for clusters' chosen chains? + vector> cluster_chain_seeds; + + //Process clusters sorted by both score and read coverage + process_until_threshold_c(clusters.size(), [&](size_t i) -> double { + return clusters[i].coverage; + }, [&](size_t a, size_t b) -> bool { + return ((clusters[a].coverage > clusters[b].coverage) || + (clusters[a].coverage == clusters[b].coverage && clusters[a].score > clusters[b].score)); + }, cluster_coverage_threshold, min_clusters_to_chain, max_clusters_to_chain, rng, [&](size_t cluster_num) -> bool { + // Handle sufficiently good clusters in descending coverage order + + Cluster& cluster = clusters[cluster_num]; + if (track_provenance) { + funnel.pass("cluster-coverage", cluster_num, cluster.coverage); + funnel.pass("max-clusters-to-chain", cluster_num); + } + + // Collect some cluster statistics in the graph + size_t cluster_node_count = 0; + nid_t cluster_min_node = std::numeric_limits::max(); + nid_t cluster_max_node = 0; + { + // Count the distinct node IDs in the cluster (as seed starts) + // to get an idea of its size in the reference + std::unordered_set id_set; + for (auto seed_index : cluster.seeds) { + auto& seed = seeds[seed_index]; + nid_t node_id = id(seed.pos); + cluster_min_node = std::min(cluster_min_node, node_id); + cluster_max_node = std::max(cluster_max_node, node_id); + id_set.insert(node_id); + } + cluster_node_count = id_set.size(); + } + + // First check against the additional score filter + if (cluster_score_threshold != 0 && cluster.score < cluster_score_cutoff + && kept_cluster_count >= min_clusters_to_chain) { + //If the score isn't good enough and we already kept at least min_clusters_to_chain clusters, + //ignore this cluster + if (track_provenance) { + funnel.fail("cluster-score", cluster_num, cluster.score); + } + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Cluster " << cluster_num << " fails cluster score cutoff" << endl; + cerr << log_name() << "Covers " << clusters[cluster_num].coverage << "/best-" << cluster_coverage_threshold << " of read" << endl; + cerr << log_name() << "Involves " << cluster_node_count << " nodes in " << cluster_min_node << "-" << cluster_max_node << endl; + cerr << log_name() << "Scores " << clusters[cluster_num].score << "/" << cluster_score_cutoff << endl; + } + } + return false; + } + + if (track_provenance) { + funnel.pass("cluster-score", cluster_num, cluster.score); + } + + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Cluster " << cluster_num << endl; + cerr << log_name() << "Covers " << cluster.coverage << "/best-" << cluster_coverage_threshold << " of read" << endl; + cerr << log_name() << "Involves " << cluster_node_count << " nodes in " << cluster_min_node << "-" << cluster_max_node << endl; + cerr << log_name() << "Scores " << cluster.score << "/" << cluster_score_cutoff << endl; + } + } + + if (track_provenance) { + // Say we're working on this cluster + funnel.processing_input(cluster_num); + } + + // Count how many of each minimizer is in each cluster that we kept. 
+ // TODO: deduplicate with extend_cluster + minimizer_kept_cluster_count.emplace_back(minimizers.size(), 0); + for (auto seed_index : cluster.seeds) { + auto& seed = seeds[seed_index]; + minimizer_kept_cluster_count.back()[seed.source]++; + } + ++kept_cluster_count; + + if (show_work) { + dump_debug_seeds(minimizers, seeds, cluster.seeds); + } + + // Sort all the seeds used in the cluster by start position, so we can chain them. + std::vector cluster_seeds_sorted = cluster.seeds; + + // Sort seeds by read start of seeded region, and remove indexes for seeds that are redundant + algorithms::sort_and_shadow(seed_anchors, cluster_seeds_sorted); + + if (track_provenance) { + funnel.substage("find_chain"); + } + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Computing chain over " << cluster_seeds_sorted.size() << " seeds" << endl; + } + } + + if (show_work) { + // Log the chaining problem so we can try it again elsewhere. + this->dump_chaining_problem(seed_anchors, cluster_seeds_sorted, gbwt_graph); + } + + // Compute the best chain + cluster_chains.emplace_back(); + cluster_chains.back().first = std::numeric_limits::min(); + cluster_chain_seeds.emplace_back(); + + // Find a chain from this cluster + VectorView cluster_view {seed_anchors, cluster_seeds_sorted}; + auto candidate_chain = algorithms::find_best_chain(cluster_view, + *distance_index, + gbwt_graph, + get_regular_aligner()->gap_open, + get_regular_aligner()->gap_extension, + max_lookback_bases, + min_lookback_items, + lookback_item_hard_cap, + initial_lookback_threshold, + lookback_scale_factor, + min_good_transition_score_per_base, + item_bonus, + max_indel_bases); + if (show_work && !candidate_chain.second.empty()) { + #pragma omp critical (cerr) + { + + cerr << log_name() << "Cluster " << cluster_num << " running " << seed_anchors[cluster_seeds_sorted.front()] << " to " << seed_anchors[cluster_seeds_sorted.back()] + << " has chain with score " << candidate_chain.first + << " and length " << candidate_chain.second.size() + << " running R" << cluster_view[candidate_chain.second.front()].read_start() + << " to R" << cluster_view[candidate_chain.second.back()].read_end() << std::endl; + } + } + if (candidate_chain.first > cluster_chains.back().first) { + // Keep it if it is better + cluster_chains.back() = std::move(candidate_chain); + cluster_chain_seeds.back() = cluster_seeds_sorted; + } + + if (track_provenance) { + funnel.substage_stop(); + } + + if (track_provenance) { + // Record with the funnel that there is now a chain that comes + // from all the seeds that participate in the chain. + funnel.introduce(); + funnel.score(funnel.latest(), cluster_chains.back().first); + // Accumulate the old and new seed funnel numbers to connect to. + // TODO: should we just call into the funnel every time instead of allocating? + std::vector old_seed_ancestors; + std::vector new_seed_ancestors; + for (auto& sorted_seed_number : cluster_chains.back().second) { + // Map each seed back to its canonical seed order + size_t seed_number = cluster_chain_seeds.back().at(sorted_seed_number); + if (seed_number < old_seed_count) { + // Seed is original, from "seed" stage 4 stages ago + old_seed_ancestors.push_back(seed_number); + } else { + // Seed is new, from "reseed" stage 2 stages ago. Came + // after all the preclusters which also live in the reseed stage. 
+ new_seed_ancestors.push_back(seed_number - old_seed_count + preclusters.size()); + } + } + // We came from all the original seeds, 4 stages ago + funnel.also_merge_group(4, old_seed_ancestors.begin(), old_seed_ancestors.end()); + // We came from all the new seeds, 2 stages ago + funnel.also_merge_group(2, new_seed_ancestors.begin(), new_seed_ancestors.end()); + // We're also related to the source cluster from the + // immediately preceeding stage. + funnel.also_relevant(1, cluster_num); + + // Say we finished with this cluster, for now. + funnel.processed_input(); + } + + return true; + + }, [&](size_t cluster_num) -> void { + // There are too many sufficiently good clusters + Cluster& cluster = clusters[cluster_num]; + if (track_provenance) { + funnel.pass("cluster-coverage", cluster_num, cluster.coverage); + funnel.fail("max-clusters-to-chain", cluster_num); + } + + if (show_work) { + #pragma omp critical (cerr) + { + + cerr << log_name() << "Cluster " << cluster_num << " passes cluster cutoffs but we have too many" << endl; + cerr << log_name() << "Covers " << cluster.coverage << "/best-" << cluster_coverage_threshold << " of read" << endl; + cerr << log_name() << "Scores " << cluster.score << "/" << cluster_score_cutoff << endl; + } + } + + }, [&](size_t cluster_num) -> void { + // This cluster is not sufficiently good. + if (track_provenance) { + funnel.fail("cluster-coverage", cluster_num, clusters[cluster_num].coverage); + } + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Cluster " << cluster_num << " fails cluster coverage cutoffs" << endl; + cerr << log_name() << "Covers " << clusters[cluster_num].coverage << "/best-" << cluster_coverage_threshold << " of read" << endl; + cerr << log_name() << "Scores " << clusters[cluster_num].score << "/" << cluster_score_cutoff << endl; + } + } + }); + + // We now estimate the best possible alignment score for each cluster. + std::vector cluster_alignment_score_estimates; + // Copy cluster chain scores over + cluster_alignment_score_estimates.resize(cluster_chains.size()); + for (size_t i = 0; i < cluster_chains.size(); i++) { + cluster_alignment_score_estimates[i] = cluster_chains[i].first; + } + + if (track_provenance) { + funnel.stage("align"); + } + + //How many of each minimizer ends up in a cluster that actually gets turned into an alignment? + vector minimizer_kept_count(minimizers.size(), 0); + + // Now start the alignment step. Everything has to become an alignment. + + // We will fill this with all computed alignments in estimated score order. + vector alignments; + alignments.reserve(cluster_alignment_score_estimates.size()); + // This maps from alignment index back to chain index, for + // tracing back to minimizers for MAPQ. Can hold + // numeric_limits::max() for an unaligned alignment. + vector alignments_to_source; + alignments_to_source.reserve(cluster_alignment_score_estimates.size()); + + // Create a new alignment object to get rid of old annotations. + { + Alignment temp; + temp.set_sequence(aln.sequence()); + temp.set_name(aln.name()); + temp.set_quality(aln.quality()); + aln = std::move(temp); + } + + // Annotate the read with metadata + if (!sample_name.empty()) { + aln.set_sample_name(sample_name); + } + if (!read_group.empty()) { + aln.set_read_group(read_group); + } + + // We need to be able to discard a processed cluster because its score isn't good enough. + // We have more components to the score filter than process_until_threshold_b supports. 
+ auto discard_processed_cluster_by_score = [&](size_t processed_num) -> void { + // This chain is not good enough. + if (track_provenance) { + funnel.fail("chain-score", processed_num, cluster_alignment_score_estimates[processed_num]); + } + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "processed cluster " << processed_num << " failed because its score was not good enough (score=" << cluster_alignment_score_estimates[processed_num] << ")" << endl; + if (track_correctness && funnel.was_correct(processed_num)) { + cerr << log_name() << "\tCORRECT!" << endl; + } + } + } + }; + + // Go through the processed clusters in estimated-score order. + process_until_threshold_b(cluster_alignment_score_estimates, + chain_score_threshold, min_chains, max_alignments, rng, [&](size_t processed_num) -> bool { + // This processed cluster is good enough. + // Called in descending score order. + + if (cluster_alignment_score_estimates[processed_num] < chain_min_score) { + // Actually discard by score + discard_processed_cluster_by_score(processed_num); + return false; + } + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "processed cluster " << processed_num << " is good enough (score=" << cluster_alignment_score_estimates[processed_num] << ")" << endl; + if (track_correctness && funnel.was_correct(processed_num)) { + cerr << log_name() << "\tCORRECT!" << endl; + } + } + } + if (track_provenance) { + funnel.pass("chain-score", processed_num, cluster_alignment_score_estimates[processed_num]); + funnel.pass("max-alignments", processed_num); + funnel.processing_input(processed_num); + } + + // Collect the top alignments. Make sure we have at least one always, starting with unaligned. + vector best_alignments(1, aln); + + // Align from the chained-up seeds + if (do_dp) { + // We need to do base-level alignment. + + if (track_provenance) { + funnel.substage("align"); + } + + // We currently just have the one best score and chain per cluster + auto& eligible_seeds = cluster_chain_seeds[processed_num]; + auto& score_and_chain = cluster_chains[processed_num]; + vector& chain = score_and_chain.second; + + // Do the DP between the items in the cluster as specified by the chain we got for it. + best_alignments[0] = find_chain_alignment(aln, {seed_anchors, eligible_seeds}, chain); + + // TODO: Come up with a good secondary for the cluster somehow. + } else { + // We would do base-level alignment but it is disabled. 
+ // Leave best_alignment unaligned + } + + // Have a function to process the best alignments we obtained + auto observe_alignment = [&](Alignment& aln) { + alignments.emplace_back(std::move(aln)); + alignments_to_source.push_back(processed_num); + + if (track_provenance) { + + funnel.project(processed_num); + funnel.score(alignments.size() - 1, alignments.back().score()); + } + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Produced alignment from processed cluster " << processed_num + << " with score " << alignments.back().score() << ": " << log_alignment(alignments.back()) << endl; + } + } + }; + + for(auto aln_it = best_alignments.begin() ; aln_it != best_alignments.end() && aln_it->score() != 0 && aln_it->score() >= best_alignments[0].score() * 0.8; ++aln_it) { + //For each additional alignment with score at least 0.8 of the best score + observe_alignment(*aln_it); + } + + + if (track_provenance) { + // We're done with this input item + funnel.processed_input(); + } + + for (size_t i = 0 ; i < minimizer_kept_cluster_count[processed_num].size() ; i++) { + minimizer_kept_count[i] += minimizer_kept_cluster_count[processed_num][i]; + if (minimizer_kept_cluster_count[processed_num][i] > 0) { + // This minimizer is in a cluster that gave rise + // to at least one alignment, so it is explored. + minimizer_explored.insert(i); + } + } + + return true; + }, [&](size_t processed_num) -> void { + // There are too many sufficiently good processed clusters + if (track_provenance) { + funnel.pass("chain-score", processed_num, cluster_alignment_score_estimates[processed_num]); + funnel.fail("max-alignments", processed_num); + } + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "processed cluster " << processed_num << " failed because there were too many good processed clusters (score=" << cluster_alignment_score_estimates[processed_num] << ")" << endl; + if (track_correctness && funnel.was_correct(processed_num)) { + cerr << log_name() << "\tCORRECT!" << endl; + } + } + } + }, discard_processed_cluster_by_score); + + if (alignments.size() == 0) { + // Produce an unaligned Alignment + alignments.emplace_back(aln); + alignments_to_source.push_back(numeric_limits::max()); + + if (track_provenance) { + // Say it came from nowhere + funnel.introduce(); + } + } + + if (track_provenance) { + // Now say we are finding the winner(s) + funnel.stage("winner"); + } + + // Fill this in with the alignments we will output as mappings + vector mappings; + mappings.reserve(min(alignments.size(), max_multimaps)); + + // Grab all the scores in order for MAPQ computation. 
+ vector scores; + scores.reserve(alignments.size()); + + process_until_threshold_a(alignments.size(), (std::function) [&](size_t i) -> double { + return alignments.at(i).score(); + }, 0, 1, max_multimaps, rng, [&](size_t alignment_num) { + // This alignment makes it + // Called in score order + + // Remember the score at its rank + scores.emplace_back(alignments[alignment_num].score()); + + // Remember the output alignment + mappings.emplace_back(std::move(alignments[alignment_num])); + + if (track_provenance) { + // Tell the funnel + funnel.pass("max-multimaps", alignment_num); + funnel.project(alignment_num); + funnel.score(funnel.latest(), scores.back()); + } + + return true; + }, [&](size_t alignment_num) { + // We already have enough alignments, although this one has a good score + + // Remember the score at its rank anyway + scores.emplace_back(alignments[alignment_num].score()); + + if (track_provenance) { + funnel.fail("max-multimaps", alignment_num); + } + }, [&](size_t alignment_num) { + // This alignment does not have a sufficiently good score + // Score threshold is 0; this should never happen + assert(false); + }); + + if (track_provenance) { + funnel.substage("mapq"); + } + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Picked best alignment " << log_alignment(mappings[0]) << endl; + cerr << log_name() << "For scores"; + for (auto& score : scores) cerr << " " << score << ":" << endl; + } + } + + assert(!mappings.empty()); + // Compute MAPQ if not unmapped. Otherwise use 0 instead of the 50% this would give us. + // Use exact mapping quality + double mapq = (mappings.front().path().mapping_size() == 0) ? 0 : + get_regular_aligner()->compute_max_mapping_quality(scores, false) ; + +#ifdef print_minimizer_table + double uncapped_mapq = mapq; +#endif + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "uncapped MAPQ is " << mapq << endl; + } + } + + // TODO: give SmallBitset iterators so we can use it instead of an index vector. + vector explored_minimizers; + for (size_t i = 0; i < minimizers.size(); i++) { + if (minimizer_explored.contains(i)) { + explored_minimizers.push_back(i); + } + } + // Compute caps on MAPQ. TODO: avoid needing to pass as much stuff along. + double escape_bonus = mapq < std::numeric_limits::max() ? 1.0 : 2.0; + double mapq_explored_cap = escape_bonus * faster_cap(minimizers, explored_minimizers, aln.sequence(), aln.quality()); + + // Remember the uncapped MAPQ and the caps + set_annotation(mappings.front(),"secondary_scores", scores); + set_annotation(mappings.front(), "mapq_uncapped", mapq); + set_annotation(mappings.front(), "mapq_explored_cap", mapq_explored_cap); + + // Apply the caps and transformations + mapq = round(min(mapq_explored_cap, min(mapq, 60.0))); + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Explored cap is " << mapq_explored_cap << endl; + cerr << log_name() << "MAPQ is " << mapq << endl; + } + } + + // Make sure to clamp 0-60. 
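The uncapped quality computed from the score vector is limited by the explored-minimizer cap and by 60, then clamped to the 0-60 range. A compact restatement of that capping (illustrative only, not vg code):

```cpp
// Sketch: apply the explored-minimizer cap and the 0-60 clamp to an
// uncapped mapping quality.
#include <algorithm>
#include <cmath>

double cap_mapq(double uncapped, double explored_cap) {
    double mapq = std::round(std::min(explored_cap, std::min(uncapped, 60.0)));
    return std::max(std::min(mapq, 60.0), 0.0);
}
```

The statement below applies that final clamp when setting the mapping quality on the primary mapping.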
+ mappings.front().set_mapping_quality(max(min(mapq, 60.0), 0.0)); + + + if (track_provenance) { + funnel.substage_stop(); + } + + for (size_t i = 0; i < mappings.size(); i++) { + // For each output alignment in score order + auto& out = mappings[i]; + + // Assign primary and secondary status + out.set_is_secondary(i > 0); + } + + // Stop this alignment + funnel.stop(); + + // Annotate with whatever's in the funnel + funnel.annotate_mapped_alignment(mappings[0], track_correctness); + + if (track_provenance) { + if (track_correctness) { + annotate_with_minimizer_statistics(mappings[0], minimizers, seeds, old_seed_count, preclusters.size(), funnel); + } + // Annotate with parameters used for the filters and algorithms. + + set_annotation(mappings[0], "param_hit-cap", (double) hit_cap); + set_annotation(mappings[0], "param_hard-hit-cap", (double) hard_hit_cap); + set_annotation(mappings[0], "param_score-fraction", (double) minimizer_score_fraction); + set_annotation(mappings[0], "param_max-unique-min", (double) max_unique_min); + set_annotation(mappings[0], "param_num-bp-per-min", (double) num_bp_per_min); + set_annotation(mappings[0], "param_exclude-overlapping-min", exclude_overlapping_min); + set_annotation(mappings[0], "param_align-from-chains", align_from_chains); + set_annotation(mappings[0], "param_chaining-cluster-distance", (double) chaining_cluster_distance); + set_annotation(mappings[0], "param_precluster-connection-coverage-threshold", precluster_connection_coverage_threshold); + set_annotation(mappings[0], "param_min-precluster-connections", (double) min_precluster_connections); + set_annotation(mappings[0], "param_max-precluster-connections", (double) max_precluster_connections); + set_annotation(mappings[0], "param_min-clusters-to-chain", (double) min_clusters_to_chain); + set_annotation(mappings[0], "param_max-clusters-to-chain", (double) max_clusters_to_chain); + set_annotation(mappings[0], "param_reseed-search-distance", (double) reseed_search_distance); + + // Chaining algorithm parameters + set_annotation(mappings[0], "param_max-lookback-bases", (double) max_lookback_bases); + set_annotation(mappings[0], "param_initial-lookback-threshold", (double) initial_lookback_threshold); + set_annotation(mappings[0], "param_lookback-scale-factor", lookback_scale_factor); + set_annotation(mappings[0], "param_min-good-transition-score-per-base", min_good_transition_score_per_base); + set_annotation(mappings[0], "param_item-bonus", (double) item_bonus); + set_annotation(mappings[0], "param_max-indel-bases", (double) max_indel_bases); + + set_annotation(mappings[0], "param_max-chain-connection", (double) max_chain_connection); + set_annotation(mappings[0], "param_max-tail-length", (double) max_tail_length); + set_annotation(mappings[0], "param_max-alignments", (double) max_alignments); + set_annotation(mappings[0], "param_cluster-score", (double) cluster_score_threshold); + set_annotation(mappings[0], "param_cluster-coverage", (double) cluster_coverage_threshold); + set_annotation(mappings[0], "param_cluster-score", (double) cluster_score_threshold); + set_annotation(mappings[0], "param_chain-score", (double) chain_score_threshold); + set_annotation(mappings[0], "param_chain-min-score", (double) chain_min_score); + set_annotation(mappings[0], "param_min-chains", (double) min_chains); + + set_annotation(mappings[0], "precluster_connections_explored", (double)precluster_connection_explored_count); + set_annotation(mappings[0], "precluster_connections_total", 
(double)precluster_connections.size()); + } + +#ifdef print_minimizer_table + cerr << aln.sequence() << "\t"; + for (char c : aln.quality()) { + cerr << (char)(c+33); + } + cerr << "\t" << clusters.size(); + for (size_t i = 0 ; i < minimizers.size() ; i++) { + auto& minimizer = minimizers[i]; + cerr << "\t" + << minimizer.value.key.decode(minimizer.length) << "\t" + << minimizer.forward_offset() << "\t" + << minimizer.agglomeration_start << "\t" + << minimizer.agglomeration_length << "\t" + << minimizer.hits << "\t" + << minimizer_kept_count[i]; + if (minimizer_kept_count[i]>0) { + assert(minimizer.hits<=hard_hit_cap) ; + } + } + cerr << "\t" << uncapped_mapq << "\t" << mapq_explored_cap << "\t" << mappings.front().mapping_quality() << "\t"; + cerr << "\t"; + for (auto& score : scores) { + cerr << score << ","; + } + if (track_correctness) { + cerr << "\t" << funnel.last_correct_stage() << endl; + } else { + cerr << "\t" << "?" << endl; + } +#endif + + if (track_provenance) { + if (show_work && aln.sequence().size() < LONG_LIMIT) { + // Dump the funnel info graph to standard error + #pragma omp critical (cerr) + { + funnel.to_dot(cerr); + } + } + + // Otherwise/also, if we are dumping explanations, dump it to a file + DotDumpExplainer explainer(funnel); + } + + return mappings; +} + +Alignment MinimizerMapper::find_chain_alignment( + const Alignment& aln, + const VectorView& to_chain, + const std::vector& chain) const { + + if (chain.empty()) { + throw std::logic_error("Cannot find an alignment for an empty chain!"); + } + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Align chain of"; + if (chain.size() < MANY_LIMIT) { + cerr << ": "; + for (auto item_number : chain) { + cerr << " " << item_number; + } + } else { + cerr << " " << chain.size() << " items"; + } + cerr << " in " << to_chain.size() << " items" << endl; + } + } + + // We need an Aligner for scoring. + const Aligner& aligner = *get_regular_aligner(); + + // We need a WFAExtender to do tail and intervening alignments. + // Note that the extender expects anchoring matches!!! + WFAExtender extender(gbwt_graph, aligner); + + // Keep a couple cursors in the chain: extension before and after the linking up we need to do. + auto here_it = chain.begin(); + auto next_it = here_it; + ++next_it; + + const algorithms::Anchor* here = &to_chain[*here_it]; + +#ifdef debug_chaining + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "First item " << *here_it + << " with overall index " << to_chain.backing_index(*here_it) + << " aligns source " << here->source + << " at " << (*here).read_start() << "-" << (*here).read_end() + << " with " << (*here).graph_start() << "-" << (*here).graph_end() + << endl; + } + } +#endif + + // We compose into a Path, since sometimes we may have to drop back to + // aligners that aren't the WFAAligner and don't make WFAAlignments. + Path composed_path; + // We also track the total score of all the pieces. + int composed_score = 0; + + // Do the left tail, if any. + size_t left_tail_length = (*here).read_start(); + if (left_tail_length > 0) { + // We need to do a left tail. + // Anchor position will not be covered. + string left_tail = aln.sequence().substr(0, left_tail_length); + WFAAlignment left_alignment; + pos_t right_anchor = (*here).graph_start(); + if (left_tail.size() <= max_tail_length) { + // Tail is short so keep to the GBWT. + // We align the left tail with prefix(), which creates a prefix of the alignment. 
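Which strategy handles a tail depends on its length: short tails go through the WFAExtender within the GBWT, very long tails are refused and soft-clipped, and anything in between falls back to alignment against an extracted piece of the graph. A sketch of that decision (illustrative only; `max_tail_length` and `MAX_DP_LENGTH` are the parameters used in this function, and in the real code a failed WFA attempt on a short tail drops into the same fallbacks):

```cpp
// Sketch: how a left or right tail of a given length is handled.
#include <cstddef>

enum class TailPlan {
    WFA_IN_GBWT,    // short tail: WFAExtender prefix()/suffix() against the GBWT
    FALLBACK_DP,    // medium tail: align against an extracted graph region
    SOFTCLIP_ONLY   // tail too long to DP safely: emit a softclip instead
};

TailPlan choose_tail_plan(size_t tail_length, size_t max_tail_length, size_t max_dp_length) {
    if (tail_length <= max_tail_length) {
        return TailPlan::WFA_IN_GBWT;
    } else if (tail_length > max_dp_length) {
        return TailPlan::SOFTCLIP_ONLY;
    } else {
        return TailPlan::FALLBACK_DP;
    }
}
```

The call below attempts the in-GBWT prefix alignment for the left tail.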
+ left_alignment = extender.prefix(left_tail, right_anchor); + if (left_alignment && left_alignment.seq_offset != 0) { + // We didn't get all the way to the left end of the read without + // running out of score. + // Prepend a softclip. + // TODO: Can we let the aligner know it can softclip for free? + WFAAlignment prepend = WFAAlignment::make_unlocalized_insertion(0, left_alignment.seq_offset, 0); + prepend.join(left_alignment); + left_alignment = std::move(prepend); + } + if (left_alignment.length != (*here).read_start()) { + // We didn't get the alignment we expected. + stringstream ss; + ss << "Aligning left tail " << left_tail << " from " << (*here).graph_start() << " produced wrong-length alignment "; + left_alignment.print(ss); + throw std::runtime_error(ss.str()); + } + } + if (left_alignment) { + // We got an alignment, so make it a path + left_alignment.check_lengths(gbwt_graph); + +#ifdef debug_chaining + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Start with left tail of " << left_alignment.length << " with score of " << left_alignment.score << endl; + } + } +#endif + + composed_path = left_alignment.to_path(this->gbwt_graph, aln.sequence()); + composed_score = left_alignment.score; + } else { + // We need to fall back on alignment against the graph + + if (left_tail_length > MAX_DP_LENGTH) { + // Left tail is too long to align. + + #pragma omp critical (cerr) + { + cerr << "warning[MinimizerMapper::find_chain_alignment]: Refusing to align " << left_tail_length << " bp left tail against " << right_anchor << " in " << aln.name() << " to avoid overflow" << endl; + } + + // Make a softclip for it. + left_alignment = WFAAlignment::make_unlocalized_insertion(0, left_tail.size(), 0); + composed_path = left_alignment.to_path(this->gbwt_graph, aln.sequence()); + composed_score = left_alignment.score; + } else { + +#ifdef debug_chaining + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Start with long left tail fallback alignment" << endl; + } + } +#endif + + #pragma omp critical (cerr) + { + cerr << "warning[MinimizerMapper::find_chain_alignment]: Falling back to non-GBWT alignment of " << left_tail_length << " bp left tail against " << right_anchor << " in " << aln.name() << endl; + } + + Alignment tail_aln; + tail_aln.set_sequence(left_tail); + if (!aln.quality().empty()) { + tail_aln.set_quality(aln.quality().substr(0, left_tail_length)); + } + + // Work out how far the tail can see + size_t graph_horizon = left_tail_length + this->get_regular_aligner()->longest_detectable_gap(aln, aln.sequence().begin()); + // Align the left tail, anchoring the right end. + align_sequence_between(empty_pos_t(), right_anchor, graph_horizon, &this->gbwt_graph, this->get_regular_aligner(), tail_aln, this->max_dp_cells); + // Since it's the left tail we can just clobber the path + composed_path = tail_aln.path(); + composed_score = tail_aln.score(); + } + } + } + + size_t longest_attempted_connection = 0; + while(next_it != chain.end()) { + // Do each region between successive gapless extensions + + // We have to find the next item we can actually connect to + const algorithms::Anchor* next; + // And the actual connecting alignment to it + WFAAlignment link_alignment; + + while (next_it != chain.end()) { + next = &to_chain[*next_it]; + // Try and find a next thing to connect to + + if (algorithms::get_read_distance(*here, *next) == std::numeric_limits::max()) { + // There's overlap between these items. Keep here and skip next. 
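Linking is only attempted between chain items that do not overlap in the read; when `get_read_distance()` reports an overlap, the current item is kept and the next one is skipped. A simplified sketch of that skipping, with `(read_start, read_end)` pairs standing in for anchors and read-coordinate overlap standing in for the distance check (illustrative only, not vg code):

```cpp
// Sketch: keep only chain anchors that do not overlap the previously kept one.
#include <cstddef>
#include <utility>
#include <vector>

std::vector<size_t> linkable_anchors(const std::vector<std::pair<size_t, size_t>>& chain) {
    std::vector<size_t> kept;
    size_t last_end = 0;
    for (size_t i = 0; i < chain.size(); i++) {
        if (kept.empty() || chain[i].first >= last_end) {
            kept.push_back(i);
            last_end = chain[i].second;
        }
        // Otherwise this anchor overlaps the previously kept one and is skipped.
    }
    return kept;
}
```

The debug block below just logs the skipped pair before advancing to the next candidate.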
+#ifdef debug_chaining + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Don't try and connect " << *here_it << " to " << *next_it << " because they overlap" << endl; + } + } +#endif + + ++next_it; + } else { + // No overlap, so try it. + break; + } + } + + if (next_it == chain.end()) { + // We couldn't find anything to connect to + break; + } + +#ifdef debug_chaining + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Add current item " << *here_it << " of length " << (*here).length() << " with score of " << (*here).score() << endl; + } + } +#endif + + // Make an alignment for the bases used in this item, and + // concatenate it in. + WFAAlignment here_alignment = this->to_wfa_alignment(*here); + append_path(composed_path, here_alignment.to_path(this->gbwt_graph, aln.sequence())); + composed_score += here_alignment.score; + +#ifdef debug_chaining + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Next connectable item " << *next_it + << " with overall index " << to_chain.backing_index(*next_it) + << " aligns source " << next->source + << " at " << (*next).read_start() << "-" << (*next).read_end() + << " with " << (*next).graph_start() << "-" << (*next).graph_end() + << endl; + } + } +#endif + + // Pull out the intervening string to the next, if any. + size_t link_start = (*here).read_end(); + size_t link_length = (*next).read_start() - link_start; + string linking_bases = aln.sequence().substr(link_start, link_length); + size_t graph_length = algorithms::get_graph_distance(*here, *next, *distance_index, gbwt_graph); + +#ifdef debug_chaining + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Need to align graph from " << (*here).graph_end() << " to " << (*next).graph_start() + << " separated by " << graph_length << " bp and sequence \"" << linking_bases << "\"" << endl; + } + } +#endif + + if (link_length == 0 && graph_length == 0) { + // These items abut in the read and the graph, so we assume we can just connect them. + // WFAExtender::connect() can't handle an empty read sequence, and + // our fallback method to align just against the graph can't handle + // an empty graph region. + // TODO: We can be leaving the GBWT's space here! + +#ifdef debug_chaining + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Treat as empty link" << endl; + } + } +#endif + + link_alignment = WFAAlignment::make_empty(); + } else if (link_length > 0 && link_length <= max_chain_connection) { + // If it's not empty and is a reasonable size, align it. + // Make sure to walk back the left anchor so it is outside of the region to be aligned. + pos_t left_anchor = (*here).graph_end(); + get_offset(left_anchor)--; + + link_alignment = extender.connect(linking_bases, left_anchor, (*next).graph_start()); + + longest_attempted_connection = std::max(longest_attempted_connection, linking_bases.size()); + + if (!link_alignment) { + // We couldn't align. + if (graph_length == 0) { + // We had read sequence but no graph sequence. + // Try falling back to a pure insertion. + // TODO: We can be leaving the GBWT's space here! + // TODO: What if this is forcing an insertion that could also be in the graph already? 
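When `connect()` fails over a region with read sequence but no graph sequence, the fallback is a pure insertion whose score comes from `aligner.score_gap()` over the unaligned link bases. The sketch below spells out the usual affine gap penalty such a call represents; the exact formula used by vg's Aligner is an assumption here, not taken from this file:

```cpp
// Sketch: an affine gap penalty over gap_length unaligned bases
// (gap_open for the first base, gap_extension for each additional base).
#include <cstddef>
#include <cstdint>

int32_t affine_gap_score(size_t gap_length, int32_t gap_open, int32_t gap_extension) {
    if (gap_length == 0) {
        return 0;
    }
    return -gap_open - static_cast<int32_t>(gap_length - 1) * gap_extension;
}
```

The debug block below notes the failed `connect()` before the insertion fallback is built.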
+#ifdef debug_chaining + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "connect() failed; treat as insertion" << endl; + } + } +#endif + link_alignment = WFAAlignment::make_unlocalized_insertion((*here).read_end(), link_length, aligner.score_gap(link_length)); + } + } else if (link_alignment.length != linking_bases.size()) { + // We could align, but we didn't get the alignment we expected. This shouldn't happen for a middle piece that can't softclip. + stringstream ss; + ss << "Aligning anchored link " << linking_bases << " (" << linking_bases.size() << " bp) from " << left_anchor << " - " << (*next).graph_start() << " against graph distance " << graph_length << " produced wrong-length alignment "; + link_alignment.print(ss); + throw std::runtime_error(ss.str()); + } else { + // We got the right alignment. + // Put the alignment back into full read space + link_alignment.seq_offset += (*here).read_end(); + } + } + + if (link_alignment) { + // We found a link alignment + +#ifdef debug_chaining + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Add link of length " << link_alignment.length << " with score of " << link_alignment.score << endl; + } + } +#endif + + link_alignment.check_lengths(gbwt_graph); + + // Then the link (possibly empty) + append_path(composed_path, link_alignment.to_path(this->gbwt_graph, aln.sequence())); + composed_score += link_alignment.score; + } else { + // The sequence to the next thing is too long, or we couldn't reach it doing connect(). + // Fall back to another alignment method + + if (linking_bases.size() > MAX_DP_LENGTH) { + // This would be too long for GSSW to handle and might overflow 16-bit scores in its matrix. + #pragma omp critical (cerr) + { + cerr << "warning[MinimizerMapper::find_chain_alignment]: Refusing to align " << link_length << " bp connection between chain items " << graph_length << " apart at " << (*here).graph_end() << " and " << (*next).graph_start() << " in " << aln.name() << " to avoid overflow" << endl; + } + // Just jump to right tail + break; + } + + // We can't actually do this alignment, we'd have to align too + // long of a sequence to find a connecting path. + #pragma omp critical (cerr) + { + cerr << "warning[MinimizerMapper::find_chain_alignment]: Falling back to non-GBWT alignment of " << link_length << " bp connection between chain items " << graph_length << " apart at " << (*here).graph_end() << " and " << (*next).graph_start() << " in " << aln.name() << endl; + } + + Alignment link_aln; + link_aln.set_sequence(linking_bases); + if (!aln.quality().empty()) { + link_aln.set_quality(aln.quality().substr(link_start, link_length)); + } + assert(graph_length != 0); // TODO: Can't handle abutting graph positions yet + // Guess how long of a graph path we ought to allow in the alignment. 
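The horizon for this fallback alignment is the larger of the read gap and the graph distance, plus the longest gap the scoring scheme could still detect, so a plausible indel is not cut off by the search limit; the tail fallbacks in this function size their horizons with the same read-length-plus-detectable-gap idea. A tiny restatement (illustrative only, not vg code):

```cpp
// Sketch: how far into the graph the fallback aligner is allowed to look.
#include <algorithm>
#include <cstddef>

size_t fallback_path_horizon(size_t graph_length, size_t read_length,
                             size_t longest_detectable_gap) {
    return std::max(graph_length, read_length) + longest_detectable_gap;
}
```

The statement below computes this horizon before calling align_sequence_between().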
+ size_t path_length = std::max(graph_length, link_length) + this->get_regular_aligner()->longest_detectable_gap(aln, aln.sequence().begin() + link_start); + MinimizerMapper::align_sequence_between((*here).graph_end(), (*next).graph_start(), path_length, &this->gbwt_graph, this->get_regular_aligner(), link_aln, this->max_dp_cells); + +#ifdef debug_chaining + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Add link of length " << path_to_length(link_aln.path()) << " with score of " << link_aln.score() << endl; + } + } +#endif + + // Then tack that path and score on + append_path(composed_path, link_aln.path()); + composed_score += link_aln.score(); + } + + // Advance here to next and start considering the next after it + here_it = next_it; + ++next_it; + here = next; + } + +#ifdef debug_chaining + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Add last extension " << *here_it << " of length " << (*here).length() << " with score of " << (*here).score() << endl; + } + } +#endif + + WFAAlignment here_alignment = this->to_wfa_alignment(*here); + + here_alignment.check_lengths(gbwt_graph); + + // Do the final GaplessExtension itself (may be the first) + append_path(composed_path, here_alignment.to_path(this->gbwt_graph, aln.sequence())); + composed_score += here_alignment.score; + + // Do the right tail, if any. Do as much of it as we can afford to do. + size_t right_tail_length = aln.sequence().size() - (*here).read_end(); + if (right_tail_length > 0) { + // We need to do a right tail + string right_tail = aln.sequence().substr((*here).read_end(), right_tail_length); + WFAAlignment right_alignment; + pos_t left_anchor = (*here).graph_end(); + get_offset(left_anchor)--; + if (right_tail_length <= max_tail_length) { + // We align the right tail with suffix(), which creates a suffix of the alignment. + // Make sure to walk back the anchor so it is outside of the region to be aligned. + right_alignment = extender.suffix(right_tail, left_anchor); + } + + if (right_alignment) { + // Right tail did align. Put the alignment back into full read space. + right_alignment.seq_offset += (*here).read_end(); + if (right_alignment.seq_offset + right_alignment.length != aln.sequence().size()) { + // We didn't get all the way to the right end of the read without + // running out of score. + // Append a softclip. + // TODO: Can we let the aligner know it can softclip for free? + size_t right_end = right_alignment.seq_offset + right_alignment.length; + size_t remaining = aln.sequence().size() - right_end; + right_alignment.join(WFAAlignment::make_unlocalized_insertion(right_end, remaining, 0)); + } + if (right_alignment.length != right_tail_length) { + // We didn't get the alignment we expected. 
+ stringstream ss; + ss << "Aligning right tail " << right_tail << " from " << left_anchor << " produced wrong-length alignment "; + right_alignment.print(ss); + throw std::runtime_error(ss.str()); + } +#ifdef debug_chaining + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Add right tail of " << right_tail.size() << " with score of " << right_alignment.score << endl; + } + } +#endif + + right_alignment.check_lengths(gbwt_graph); + + append_path(composed_path, right_alignment.to_path(this->gbwt_graph, aln.sequence())); + composed_score += right_alignment.score; + } else { + // We need to fall back on alignment against the graph + +#ifdef debug_chaining + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "End with long right tail fallback alignment" << endl; + } + } +#endif + + if (right_tail.size() > MAX_DP_LENGTH) { + // Right tail is too long to align. + + #pragma omp critical (cerr) + { + cerr << "warning[MinimizerMapper::find_chain_alignment]: Refusing to align " << right_tail.size() << " bp right tail against " << left_anchor << " in " << aln.name() << " to avoid overflow" << endl; + } + + // Make a softclip for it. + right_alignment = WFAAlignment::make_unlocalized_insertion((*here).read_end(), aln.sequence().size() - (*here).read_end(), 0); + append_path(composed_path, right_alignment.to_path(this->gbwt_graph, aln.sequence())); + composed_score += right_alignment.score; + } else { + + #pragma omp critical (cerr) + { + cerr << "warning[MinimizerMapper::find_chain_alignment]: Falling back to non-GBWT alignment of " << right_tail_length << " bp right tail against " << left_anchor << " in " << aln.name() << endl; + } + + Alignment tail_aln; + tail_aln.set_sequence(right_tail); + if (!aln.quality().empty()) { + tail_aln.set_quality(aln.quality().substr((*here).read_end(), right_tail_length)); + } + + // Work out how far the tail can see + size_t graph_horizon = right_tail_length + this->get_regular_aligner()->longest_detectable_gap(aln, aln.sequence().begin() + (*here).read_end()); + // Align the right tail, anchoring the left end. + align_sequence_between(left_anchor, empty_pos_t(), graph_horizon, &this->gbwt_graph, this->get_regular_aligner(), tail_aln, this->max_dp_cells); + // Since it's the right tail we have to add it on + append_path(composed_path, tail_aln.path()); + composed_score += tail_aln.score(); + } + } + } + + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Composed alignment is length " << path_to_length(composed_path) << " with score of " << composed_score << endl; + if (composed_path.mapping_size() > 0) { + cerr << log_name() << "Composed alignment starts with: " << pb2json(composed_path.mapping(0)) << endl; + cerr << log_name() << "Composed alignment ends with: " << pb2json(composed_path.mapping(composed_path.mapping_size() - 1)) << endl; + } + } + } + + // Convert to a vg Alignment. 
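A similar sketch (again not part of the patch) for the right-tail handling above; decide_right_tail and TailPlan are hypothetical names, while max_tail_length and MAX_DP_LENGTH correspond to the thresholds used in find_chain_alignment().

#include <cstddef>

enum class TailPlan { WfaSuffix, SoftclipOnly, GraphFallback };

TailPlan decide_right_tail(std::size_t tail_length, bool wfa_succeeded,
                           std::size_t max_tail_length, std::size_t max_dp_length) {
    if (tail_length <= max_tail_length && wfa_succeeded) {
        // WFAExtender::suffix() anchored at the last chain item worked; any
        // unreached read bases are appended as a zero-score softclip.
        return TailPlan::WfaSuffix;
    }
    if (tail_length > max_dp_length) {
        // Too long to DP safely: emit the whole tail as an unscored softclip.
        return TailPlan::SoftclipOnly;
    }
    // Otherwise extract a local graph and do pinned DP via align_sequence_between().
    return TailPlan::GraphFallback;
}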
+ Alignment result(aln); + *result.mutable_path() = std::move(simplify(composed_path)); + result.set_score(composed_score); + if (!result.sequence().empty()) { + result.set_identity(identity(result.path())); + } + + set_annotation(result, "left_tail_length", (double) left_tail_length); + set_annotation(result, "longest_attempted_connection", (double) longest_attempted_connection); + set_annotation(result, "right_tail_length", (double) right_tail_length); + + return result; +} + +void MinimizerMapper::wfa_alignment_to_alignment(const WFAAlignment& wfa_alignment, Alignment& alignment) const { + *(alignment.mutable_path()) = wfa_alignment.to_path(this->gbwt_graph, alignment.sequence()); + alignment.set_score(wfa_alignment.score); + if (!alignment.sequence().empty()) { + alignment.set_identity(identity(alignment.path())); + } +} + +void MinimizerMapper::with_dagified_local_graph(const pos_t& left_anchor, const pos_t& right_anchor, size_t max_path_length, const HandleGraph& graph, const std::function(const handle_t&)>&)>& callback) { + + if (is_empty(left_anchor) && is_empty(right_anchor)) { + throw std::runtime_error("Cannot align sequence between two unset positions"); + } + + // We need to get the graph to align to. + bdsg::HashGraph local_graph; + unordered_map local_to_base; + if (!is_empty(left_anchor) && !is_empty(right_anchor)) { + // We want a graph actually between two positions + local_to_base = algorithms::extract_connecting_graph( + &graph, + &local_graph, + max_path_length, + left_anchor, right_anchor + ); + } else if (!is_empty(left_anchor)) { + // We only have the left anchor + local_to_base = algorithms::extract_extending_graph( + &graph, + &local_graph, + max_path_length, + left_anchor, + false, + false + ); + } else { + // We only have the right anchor + local_to_base = algorithms::extract_extending_graph( + &graph, + &local_graph, + max_path_length, + right_anchor, + true, + false + ); + } + + // To find the anchoring nodes in the extracted graph, we need to scan local_to_base. + nid_t local_left_anchor_id = 0; + nid_t local_right_anchor_id = 0; + for (auto& kv : local_to_base) { + if (kv.second == id(left_anchor)) { + local_left_anchor_id = kv.first; + } + if (kv.second == id(right_anchor)) { + local_right_anchor_id = kv.first; + } + // TODO: Stop early when we found them all. 
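One possible shape for the early exit mentioned in the TODO above (a sketch only, not applied in this patch): the scan can stop once every anchor that is actually in use has been located.

    if ((is_empty(left_anchor) || local_left_anchor_id != 0) &&
        (is_empty(right_anchor) || local_right_anchor_id != 0)) {
        break;
    }

With a single anchor the other local ID legitimately stays 0, so the guard has to check which anchors are in play rather than simply testing both IDs.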
+ } + + // And split by strand since we can only align to one strand + StrandSplitGraph split_graph(&local_graph); + + // And make sure it's a DAG of the stuff reachable from our anchors + bdsg::HashGraph dagified_graph; + // For which we need the handles that anchor the graph, facing inwards + std::vector bounding_handles; + if (!is_empty(left_anchor)) { + // Dagify from the forward version of the left anchor + + // Grab the left anchor in the local graph + assert(local_graph.has_node(local_left_anchor_id)); + handle_t local_handle = local_graph.get_handle(local_left_anchor_id, is_rev(left_anchor)); + + // And get the node that that orientation of it is in the strand-split graph + handle_t overlay_handle = split_graph.get_overlay_handle(local_handle); + + // And use that + bounding_handles.push_back(overlay_handle); + } + if (!is_empty(right_anchor)) { + // Dagify from the reverse version of the node for the forward version of the right anchor + + // Grab the right anchor from the local graph + assert(local_graph.has_node(local_right_anchor_id)); + handle_t local_handle = local_graph.get_handle(local_right_anchor_id, is_rev(right_anchor)); + + // And get the node that that orientation of it is in the strand-split graph + // But flip it because we want to dagify going inwards from the right + handle_t overlay_handle = split_graph.flip(split_graph.get_overlay_handle(local_handle)); + + // And use that + bounding_handles.push_back(overlay_handle); + } + + auto dagified_to_split = handlegraph::algorithms::dagify_from(&split_graph, bounding_handles, &dagified_graph, max_path_length); + +#ifdef debug + std::cerr << "Dagified from " << bounding_handles.size() << " bounding handles in " << split_graph.get_node_count() << " node strand-split graph to " << dagified_graph.get_node_count() << " node DAG" << std::endl; +#endif + + // Make an accessor for getting back to the base graph space + auto dagified_handle_to_base = [&](const handle_t& h) -> pair { + nid_t dagified_id = dagified_graph.get_id(h); + bool dagified_is_reverse = dagified_graph.get_is_reverse(h); + auto found_in_split = dagified_to_split.find(dagified_id); + if (found_in_split == dagified_to_split.end()) { + throw std::runtime_error("ID " + std::to_string(dagified_id) + " from dagified graph not found in strand-split graph"); + } + nid_t split_id = found_in_split->second; + handle_t split_handle = split_graph.get_handle(split_id, dagified_is_reverse); + // We rely on get_underlying_handle understanding reversed handles in the split graph + handle_t local_handle = split_graph.get_underlying_handle(split_handle); + nid_t local_id = local_graph.get_id(local_handle); + bool local_is_reverse = local_graph.get_is_reverse(local_handle); + auto found_in_base = local_to_base.find(local_id); + if (found_in_base == local_to_base.end()) { + throw std::runtime_error("ID " + std::to_string(local_id) + " from local graph not found in full base graph"); + } + nid_t base_id = found_in_base->second; + return std::make_pair(base_id, local_is_reverse); + }; + + // Show the graph we made and the translation function + callback(dagified_graph, dagified_handle_to_base); +} + +void MinimizerMapper::align_sequence_between(const pos_t& left_anchor, const pos_t& right_anchor, size_t max_path_length, const HandleGraph* graph, const GSSWAligner* aligner, Alignment& alignment, size_t max_dp_cells) { + + // Get the dagified local graph, and the back translation + MinimizerMapper::with_dagified_local_graph(left_anchor, right_anchor, max_path_length, *graph, + 
[&](DeletableHandleGraph& dagified_graph, const std::function(const handle_t&)>& dagified_handle_to_base) { + + // Then trim off the tips that are either in the wrong orientation relative + // to whether we want them to be a source or a sink, or extraneous + + std::vector tip_handles = handlegraph::algorithms::find_tips(&dagified_graph); + bool trimmed; + size_t trim_count = 0; + do { + trimmed = false; + // We need to make sure to remove only one orientation of each handle + // we remove. + std::unordered_set to_remove_ids; + std::vector to_remove_handles; + for (auto& h : tip_handles) { + auto base_coords = dagified_handle_to_base(h); + if (!dagified_graph.get_is_reverse(h) && (is_empty(left_anchor) || base_coords.first == id(left_anchor))) { + // Tip is inward forward, so it's a source. + // This is a head in the subgraph, and either matches a left + // anchoring node or we don't have any, so keep it. +#ifdef debug + std::cerr << "Dagified graph node " << dagified_graph.get_id(h) << " " << dagified_graph.get_is_reverse(h) << " is an acceptable source (" << base_coords.first << " " << base_coords.second << ")" << std::endl; +#endif + } else if (dagified_graph.get_is_reverse(h) && (is_empty(right_anchor) || base_coords.first == id(right_anchor))) { + // Tip is inward reverse, so it's a sink. + // This is a tail in the subgraph, and either matches a right + // anchoring node or we don't have any, so keep it. +#ifdef debug + std::cerr << "Dagified graph node " << dagified_graph.get_id(h) << " " << dagified_graph.get_is_reverse(h) << " is an acceptable sink (" << base_coords.first << " " << base_coords.second << ")" << std::endl; +#endif + } else { + // This is a wrong orientation of an anchoring node, or some other tip. + // We don't want to keep this handle +#ifdef debug + std::cerr << "Dagified graph node " << dagified_graph.get_id(h) << " " << dagified_graph.get_is_reverse(h) << " is an unacceptable tip (" << base_coords.first << " " << base_coords.second << ")" << std::endl; +#endif + nid_t dagified_id = dagified_graph.get_id(h); + if (!to_remove_ids.count(dagified_id)) { + to_remove_ids.insert(dagified_id); + to_remove_handles.push_back(h); + } + } + } + for (auto& h : to_remove_handles) { + dagified_graph.destroy_handle(h); + trimmed = true; + } + if (trimmed) { + // TODO: This is going to be O(slow) if we actually have to + // prune back a dangling run. We should look at what is + // connected to the tip and the tip only, and make that the new + // tip. Or keep some kind of online tip info. Or use an + // algorithm function that we make actually good. + tip_handles = handlegraph::algorithms::find_tips(&dagified_graph); + trim_count++; + } + } while (trimmed); + if (trim_count > 0) { + #pragma omp critical (cerr) + std::cerr << "warning[MinimizerMapper::align_sequence_between]: Trimmed back tips " << trim_count << " times on graph between " << left_anchor << " and " << right_anchor << " leaving " << dagified_graph.get_node_count() << " nodes and " << tip_handles.size() << " tips" << std::endl; + } + + if (!is_empty(left_anchor) && !is_empty(right_anchor)) { + // Then align the linking bases, with global alignment so they have + // to go from a source to a sink. Banded alignment means we can safely do big problems. + aligner->align_global_banded(alignment, dagified_graph); + } else { + // Do pinned alignment off the anchor we actually have. 
+ // Don't use X-Drop because Dozeu is known to just overwrite the + // stack with garbage whenever alignments are "too big", and these + // alignments are probably often too big. + // But if we don't use Dozeu this uses GSSW and that can *also* be too big. + // So work out how big it will be + size_t cell_count = dagified_graph.get_total_length() * alignment.sequence().size(); + if (cell_count > max_dp_cells) { + #pragma omp critical (cerr) + std::cerr << "warning[MinimizerMapper::align_sequence_between]: Refusing to fill " << cell_count << " DP cells in tail with GSSW" << std::endl; + // Fake a softclip right in input graph space + alignment.clear_path(); + Mapping* m = alignment.mutable_path()->add_mapping(); + // TODO: Is this fake position OK regardless of anchoring side? + m->mutable_position()->set_node_id(is_empty(left_anchor) ? id(right_anchor) : id(left_anchor)); + m->mutable_position()->set_is_reverse(is_empty(left_anchor) ? is_rev(right_anchor) : is_rev(left_anchor)); + m->mutable_position()->set_offset(is_empty(left_anchor) ? offset(right_anchor) : offset(left_anchor)); + Edit* e = m->add_edit(); + e->set_to_length(alignment.sequence().size()); + e->set_sequence(alignment.sequence()); + return; + } else { +#ifdef debug_chaining + #pragma omp critical (cerr) + std::cerr << "debug[MinimizerMapper::align_sequence_between]: Fill " << cell_count << " DP cells in tail with GSSW" << std::endl; +#endif + aligner->align_pinned(alignment, dagified_graph, !is_empty(left_anchor), false); + } + } + + // And translate back into original graph space + for (size_t i = 0; i < alignment.path().mapping_size(); i++) { + // Translate each mapping's ID and orientation down to the base graph + Mapping* m = alignment.mutable_path()->mutable_mapping(i); + + handle_t dagified_handle = dagified_graph.get_handle(m->position().node_id(), m->position().is_reverse()); + auto base_coords = dagified_handle_to_base(dagified_handle); + + m->mutable_position()->set_node_id(base_coords.first); + m->mutable_position()->set_is_reverse(base_coords.second); + } + if (!is_empty(left_anchor) && alignment.path().mapping_size() > 0 && offset(left_anchor) != 0) { + // Get the positions of the leftmost mapping + Position* left_pos = alignment.mutable_path()->mutable_mapping(0)->mutable_position(); + // Add on the offset for the missing piece of the left anchor node + left_pos->set_offset(left_pos->offset() + offset(left_anchor)); + } + + // Now the alignment is filled in! + }); +} + +std::vector MinimizerMapper::to_anchors(const Alignment& aln, const VectorView& minimizers, const std::vector& seeds) const { + std::vector to_return; + to_return.reserve(seeds.size()); + for (auto& seed : seeds) { + to_return.push_back(this->to_anchor(aln, minimizers, seed)); + } + return to_return; +} + +algorithms::Anchor MinimizerMapper::to_anchor(const Alignment& aln, const VectorView& minimizers, const Seed& seed) const { + // Turn each seed into the part of its match on the node where the + // anchoring end (start for forward-strand minimizers, ane for + // reverse-strand minimizers) falls. + auto& source = minimizers[seed.source]; + size_t length; + pos_t graph_start; + size_t read_start; + if (source.value.is_reverse) { + // Seed stores the final base of the match in the graph. + // So get the past-end position. + pos_t graph_end = make_pos_t(id(seed.pos), is_rev(seed.pos), offset(seed.pos) + 1); + + // Work out how much of the node it could use before there. 
+ length = std::min((size_t) source.length, offset(graph_end)); + // And derive the graph start + graph_start = make_pos_t(id(graph_end), is_rev(graph_end), offset(graph_end) - length); + // And the read start + read_start = source.value.offset + 1 - length; + } else { + // Seed stores the first base of the match in the graph + graph_start = seed.pos; + + // Get the handle to the node it's on. + handle_t start_handle = gbwt_graph.get_handle(id(graph_start), is_rev(graph_start)); + // Work out how much of the node it could use before there. + length = std::min((size_t) source.length, gbwt_graph.get_length(start_handle) - offset(graph_start)); + + // And we store the read start position already in the item + read_start = source.value.offset; + } + // Work out how many points the anchor is + // TODO: Always make sequence and quality available for scoring! + int score = get_regular_aligner()->score_exact_match(aln, read_start, length); + return algorithms::Anchor(read_start, graph_start, length, score); +} + +WFAAlignment MinimizerMapper::to_wfa_alignment(const algorithms::Anchor& anchor) const { + return { + {gbwt_graph.get_handle(id(anchor.graph_start()), is_rev(anchor.graph_start()))}, + {{WFAAlignment::match, (uint32_t)anchor.length()}}, + (uint32_t)offset(anchor.graph_start()), + (uint32_t)anchor.read_start(), + (uint32_t)anchor.length(), + anchor.score(), + true + }; +} + +} diff --git a/src/msa_converter.cpp b/src/msa_converter.cpp index c99516cb17c..cd78a27bdf4 100644 --- a/src/msa_converter.cpp +++ b/src/msa_converter.cpp @@ -36,38 +36,7 @@ using namespace std; return tokens; }; - if (format == "maf") { - - auto get_next_sequence_line = [&](istream& in) { - string next; - bool got_data = true; - while (got_data && (next.empty() ? true : next[0] != 's')) { - // are we starting a new alignment block? - if (next.empty() ? 
false : next[0] == 'a') { - alignments.emplace_back(); - } - got_data = getline(in, next).good(); - } - return next; - }; - - for (string line = get_next_sequence_line(in); !line.empty(); line = get_next_sequence_line(in)) { - vector tokens = tokenize(line); - - if (tokens.size() != 7) { - cerr << "error:[MSAConverter] malformed MAF file, expecting 7 tokens on each sequence ('s') line" << endl; - exit(1); - } - - auto& alignment = alignments.back(); - if (alignment.count(tokens[1])) { - cerr << "error:[MSAConverter] repeated sequence name '" << tokens[1] << "' within an alignment, names must be unique" << endl; - exit(1); - } - alignment[tokens[1]] = tokens[6]; - } - } - else if (format == "clustal") { + if (format == "clustal") { unordered_set conservation_chars{'.', ':', '*'}; @@ -388,9 +357,11 @@ using namespace std; edit->set_from_length(node_length); edit->set_to_length(node_length); } + graph.paths.extend(*path); } graph.destroy_node(dummy_node); + graph.sync_paths(); } destroy_progress(); diff --git a/src/msa_converter.hpp b/src/msa_converter.hpp index 6b35a46bdfc..f03efb50c42 100644 --- a/src/msa_converter.hpp +++ b/src/msa_converter.hpp @@ -12,7 +12,7 @@ #include #include -#include "vg.pb.h" +#include #include "vg.hpp" diff --git a/src/multipath_alignment.cpp b/src/multipath_alignment.cpp index 6838166fde5..3f5c0d4d51f 100644 --- a/src/multipath_alignment.cpp +++ b/src/multipath_alignment.cpp @@ -3,37 +3,180 @@ // #include "multipath_alignment.hpp" +#include "haplotypes.hpp" #include #include #include //#define debug_multiple_tracebacks +//#define debug_search +//#define debug_trace //#define debug_verbose_validation +//#define debug_cigar +//#define debug_find_match +//#define debug_remove_empty using namespace std; using namespace structures; +using namespace vg::io; namespace vg { + + multipath_alignment_t::multipath_alignment_t() : _mapping_quality(0) { + // i don't understand why this is necessary, but somehow mapq is getting defaulted to 160? 
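A note on the comment above: when a user-provided default constructor omits a built-in member such as _mapping_quality from its initializer list, that member is left indeterminate rather than zero-initialized, so it holds whatever bytes happen to be in memory. The stray 160 was almost certainly uninitialized memory rather than a real default, which is why the explicit _mapping_quality(0) is needed.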
+ } + + multipath_alignment_t::multipath_alignment_t(const multipath_alignment_t& other) { + *this = other; + } + + multipath_alignment_t::multipath_alignment_t(multipath_alignment_t&& other) { + *this = move(other); + } + + multipath_alignment_t& multipath_alignment_t::operator=(const multipath_alignment_t& other) { + _sequence = other._sequence; + _quality = other._quality; + _subpath = other._subpath; + _mapping_quality = other._mapping_quality; + _start = other._start; + other.for_each_annotation([&](const string& name, anno_type_t type, const void* annotation) { + switch (type) { + case Null: + set_annotation(name); + break; + case Double: + set_annotation(name, *((const double*) annotation)); + break; + case Bool: + set_annotation(name, *((const bool*) annotation)); + break; + case String: + set_annotation(name, *((const string*) annotation)); + break; + default: + cerr << "error: unrecognized annotation type" << endl; + exit(1); + break; + } + }); + return *this; + } + + multipath_alignment_t& multipath_alignment_t::operator=(multipath_alignment_t&& other) { + if (this != &other) { + _sequence = move(other._sequence); + _quality = move(other._quality); + _subpath = move(other._subpath); + _mapping_quality = move(other._mapping_quality); + _start = move(other._start); + _annotation = move(other._annotation); + other._annotation.clear(); + } + return *this; + } + + multipath_alignment_t::~multipath_alignment_t() { + while (!_annotation.empty()) { + clear_annotation(_annotation.begin()->first); + } + } + + void multipath_alignment_t::clear_annotation(const string& annotation_name) { + auto iter = _annotation.find(annotation_name); + if (iter != _annotation.end()) { + switch (iter->second.first) { + case Null: + break; + case Double: + free((double*) iter->second.second); + break; + case Bool: + free((bool*) iter->second.second); + break; + case String: + delete ((string*) iter->second.second); + break; + default: + cerr << "error: unrecognized annotation type" << endl; + exit(1); + break; + } + _annotation.erase(iter); + } + } + + bool multipath_alignment_t::has_annotation(const string& annotation_name) const { + return _annotation.count(annotation_name); + } + + void multipath_alignment_t::set_annotation(const string& annotation_name) { + clear_annotation(annotation_name); + _annotation[annotation_name] = make_pair(Null, (void*) nullptr); + } + + void multipath_alignment_t::set_annotation(const string& annotation_name, double value) { + clear_annotation(annotation_name); + auto ptr = (double*) malloc(sizeof(double)); + *ptr = value; + _annotation[annotation_name] = make_pair(Double, (void*) ptr); + } + + void multipath_alignment_t::set_annotation(const string& annotation_name, bool value) { + clear_annotation(annotation_name); + auto ptr = (bool*) malloc(sizeof(bool)); + *ptr = value; + _annotation[annotation_name] = make_pair(Bool, (void*) ptr); + } + + void multipath_alignment_t::set_annotation(const string& annotation_name, const string& value) { + clear_annotation(annotation_name); + auto ptr = new string(); + *ptr = value; + _annotation[annotation_name] = make_pair(String, (void*) ptr); + } + + pair + multipath_alignment_t::get_annotation(const string& annotation_name) const { + auto iter = _annotation.find(annotation_name); + if (iter != _annotation.end()) { + return iter->second; + } + else { + return pair(Null, nullptr); + } + } + + void multipath_alignment_t::for_each_annotation(function lambda) const { + for (const auto& annotation : _annotation) { + lambda(annotation.first, 
annotation.second.first, annotation.second.second); + } + } - void topologically_order_subpaths(MultipathAlignment& multipath_aln) { + /// Return either the vector of topological order by index or the vector of indexes within the topological order + vector subpath_topological_order(const multipath_alignment_t& multipath_aln, + bool do_index) { // Kahn's algorithm - vector index(multipath_aln.subpath_size(), 0); - size_t order_idx = 0; + vector return_val(multipath_aln.subpath_size(), 0); + size_t idx = 0; vector stack; vector in_degree(multipath_aln.subpath_size(), 0); - for (size_t i = 0; i < multipath_aln.subpath_size(); i++) { - const Subpath& subpath = multipath_aln.subpath(i); - for (size_t j = 0; j < subpath.next_size(); j++) { - in_degree[subpath.next(j)]++; + for (size_t i = 0; i < multipath_aln.subpath_size(); ++i) { + const subpath_t& subpath = multipath_aln.subpath(i); + for (size_t j = 0; j < subpath.next_size(); ++j) { + ++in_degree[subpath.next(j)]; + } + for (const auto& connection : subpath.connection()) { + ++in_degree[connection.next()]; } } // identify the source nodes and add them to the stack - for (size_t i = 0; i < multipath_aln.subpath_size(); i++) { + for (size_t i = 0; i < multipath_aln.subpath_size(); ++i) { if (!in_degree[i]) { stack.push_back(i); } @@ -44,27 +187,49 @@ namespace vg { size_t here = stack.back(); stack.pop_back(); - index[here] = order_idx; - order_idx++; + if (do_index) { + return_val[here] = idx; + } + else { + return_val[idx] = here; + } + ++idx; // remove the node's edges - const Subpath& subpath = multipath_aln.subpath(here); - for (size_t i = 0; i < subpath.next_size(); i++) { + const subpath_t& subpath = multipath_aln.subpath(here); + for (size_t i = 0; i < subpath.next_size(); ++i) { size_t next = subpath.next(i); - in_degree[next]--; + --in_degree[next]; // if a node is now a source, stack it up if (!in_degree[next]) { stack.push_back(next); } } + // repeat for connections + for (const auto& connection : subpath.connection()) { + --in_degree[connection.next()]; + if (!in_degree[connection.next()]) { + stack.push_back(connection.next()); + } + } } + return return_val; + } + + void topologically_order_subpaths(multipath_alignment_t& multipath_aln) { + + vector index = subpath_topological_order(multipath_aln, true); // translate the edges to the new indices for (size_t i = 0; i < multipath_aln.subpath_size(); i++) { - Subpath* subpath = multipath_aln.mutable_subpath(i); + subpath_t* subpath = multipath_aln.mutable_subpath(i); for (size_t j = 0; j < subpath->next_size(); j++) { subpath->set_next(j, index[subpath->next(j)]); } + for (size_t j = 0; j < subpath->connection_size(); ++j) { + connection_t* connection = subpath->mutable_connection(j); + connection->set_next(index[connection->next()]); + } } // translate the start nodes @@ -80,8 +245,256 @@ namespace vg { } } } + + void remove_empty_alignment_sections(multipath_alignment_t& multipath_aln) { + + vector is_empty(multipath_aln.subpath_size(), false); + bool any_empty_subpaths = false; + + // find subpaths that don't have any aligned bases + for (size_t i = 0; i < multipath_aln.subpath_size(); ++i) { + auto path = multipath_aln.mutable_subpath(i)->mutable_path(); + size_t mappings_removed = 0; + for (size_t j = 0; j < path->mapping_size(); ++j) { + auto mapping = path->mutable_mapping(j); + size_t edits_removed = 0; + for (size_t k = 0; k < mapping->edit_size(); ++k) { + auto edit = mapping->mutable_edit(k); + if (edit->from_length() == 0 && edit->to_length() == 0) { + ++edits_removed; 
+ } + else if (edits_removed != 0) { + *mapping->mutable_edit(k - edits_removed) = move(*edit); + } + } + mapping->mutable_edit()->resize(mapping->edit_size() - edits_removed); + if (mapping->edit().empty()) { + ++mappings_removed; + } + else if (mappings_removed != 0) { + *path->mutable_mapping(j - mappings_removed) = move(*mapping); + } + } + path->mutable_mapping()->resize(path->mapping_size() - mappings_removed); + is_empty[i] = path->mapping().empty(); + any_empty_subpaths = any_empty_subpaths || path->mapping().empty(); + } + +#ifdef debug_remove_empty + cerr << "subpaths empty:" << endl; + for (size_t i = 0; i < multipath_aln.subpath_size(); ++i) { + cerr << "\t" << i << " " << is_empty[i] << endl; + } +#endif + + if (any_empty_subpaths) { + // there's at least one empty subpath + + // compute the transitive edges through empty subpaths + for (size_t i = 0; i < multipath_aln.subpath_size(); ++i) { + if (is_empty[i]) { + continue; + } + // check if any of this subpaths nexts are empty + subpath_t& subpath = *multipath_aln.mutable_subpath(i); + + set transitive_nexts; + map transitive_connections; + + unordered_set dequeued; + // records of (index, no connections, score) + priority_queue> queue; + for (auto n : subpath.next()) { + if (is_empty[n]) { + queue.emplace(0, true, n); + } + } + for (const auto& connection : subpath.connection()) { + if (is_empty[connection.next()]) { + queue.emplace(connection.score(), false, connection.next()); + } + } + // dijkstra works to compute max here because it's a DAG + while (!queue.empty()) { + + int32_t score; + bool not_connection; + size_t idx; + tie(score, not_connection, idx) = queue.top(); + queue.pop(); + + if (dequeued.count(idx)) { + continue; + } + dequeued.insert(idx); + + const auto& subpath_here = multipath_aln.subpath(idx); + for (auto next : subpath_here.next()) { + if (is_empty[next]) { + queue.emplace(score, not_connection, next); + } + else if (not_connection) { + transitive_nexts.insert(next); + } + else { + auto it = transitive_connections.find(next); + if (it == transitive_connections.end()) { + transitive_connections[next] = score; + } + else { + it->second = max(score, it->second); + } + } + } + for (const auto& connection : subpath_here.connection()) { + int32_t score_thru = connection.score() + score; + if (is_empty[connection.next()]) { + queue.emplace(score_thru, false, connection.next()); + } + else { + auto it = transitive_connections.find(connection.next()); + if (it == transitive_connections.end()) { + transitive_connections[connection.next()] = score_thru; + } + else { + it->second = max(score_thru, it->second); + } + } + } + } +#ifdef debug_remove_empty + cerr << "transitive nexts for subpath " << i << endl; + for (size_t j : transitive_nexts) { + cerr << "\t" << j << endl; + } + cerr << "transitive connections for subpath " << i << endl; + for (auto c : transitive_connections) { + cerr << "\t" << c.first << " " << c.second << endl; + } +#endif + + // add edges for the nexts reachable through empty subpaths + for (size_t j : transitive_nexts) { + bool found = false; + for (size_t k = 0; k < subpath.next_size() && !found; ++k) { + found = (subpath.next(k) == j); + } + if (!found) { + subpath.add_next(j); + } + } + + for (const pair& cnxn : transitive_connections) { + bool found = false; + size_t k = 0; + for (; k < subpath.connection_size() && !found; ++k) { + found = (subpath.connection(k).next() == cnxn.first); + } + if (!found) { + auto connection = subpath.add_connection(); + connection->set_next(cnxn.first); + 
connection->set_score(cnxn.second); + } + else { + subpath.mutable_connection(k - 1)->set_score(max(subpath.connection(k - 1).score(), cnxn.second)); + } + } + } + +#ifdef debug_remove_empty + cerr << "before removing subpaths" << endl; + cerr << debug_string(multipath_aln) << endl; +#endif + + // relocate the subpaths we're keeping at the front of the vector + vector removed_so_far(multipath_aln.subpath_size() + 1, 0); + for (size_t i = 0; i < multipath_aln.subpath_size(); ++i) { + if (is_empty[i]) { + removed_so_far[i + 1] = removed_so_far[i] + 1; + continue; + } + else { + removed_so_far[i + 1] = removed_so_far[i]; + } + + if (removed_so_far[i] > 0) { + *multipath_aln.mutable_subpath(i - removed_so_far[i]) = move(*multipath_aln.mutable_subpath(i)); + } + } + + // delete the end of the subpaths + multipath_aln.mutable_subpath()->resize(multipath_aln.subpath_size() - removed_so_far.back()); + +#ifdef debug_remove_empty + cerr << "before updating edges" << endl; + cerr << debug_string(multipath_aln) << endl; +#endif + + // reassign the next and connection indexes + for (size_t i = 0; i < multipath_aln.subpath_size(); ++i) { + + subpath_t& subpath = *multipath_aln.mutable_subpath(i); + + size_t nexts_removed_so_far = 0; + unordered_set nexts_seen; + for (size_t j = 0; j < subpath.next_size(); ++j) { + int64_t updated_next = subpath.next(j) - removed_so_far[subpath.next(j)]; + if (is_empty[subpath.next(j)] || nexts_seen.count(updated_next)) { + ++nexts_removed_so_far; + } + else { + subpath.set_next(j - nexts_removed_so_far, updated_next); + nexts_seen.insert(updated_next); + } + } + if (nexts_removed_so_far) { + subpath.mutable_next()->resize(subpath.next_size() - nexts_removed_so_far); + } + + size_t connections_removed_so_far = 0; + unordered_set> connections_seen; + for (size_t j = 0; j < subpath.connection_size(); ++j) { + auto connection = subpath.mutable_connection(j); + auto updated_connection = pair(connection->next() - removed_so_far[connection->next()], + connection->score()); + + if (is_empty[connection->next()] || connections_seen.count(updated_connection)) { + ++connections_removed_so_far; + } + else { + connection->set_next(updated_connection.first); + if (connections_removed_so_far) { + *subpath.mutable_connection(j - connections_removed_so_far) = *connection; + } + connections_seen.insert(updated_connection); + } + } + if (connections_removed_so_far) { + subpath.mutable_connection()->resize(subpath.connection_size() - connections_removed_so_far); + } + } + +#ifdef debug_remove_empty + cerr << "before updating starts" << endl; + cerr << debug_string(multipath_aln) << endl; +#endif + + // update the starts + bool found_deleted_start = false; + for (size_t i = 0; !found_deleted_start && i < multipath_aln.start_size(); ++i) { + found_deleted_start = is_empty[multipath_aln.start(i)]; + multipath_aln.set_start(i, multipath_aln.start(i) - removed_so_far[multipath_aln.start(i)]); + } + + if (found_deleted_start) { + // recompute the edges from scratch (could be done faster, but + // this is easy); + identify_start_subpaths(multipath_aln); + } + } + } - void identify_start_subpaths(MultipathAlignment& multipath_aln) { + void identify_start_subpaths(multipath_alignment_t& multipath_aln) { // remove start nodes if there are any (avoids doubling them if this function is used liberally) multipath_aln.clear_start(); @@ -89,10 +502,13 @@ namespace vg { // label nodes with incoming edges vector has_incoming_edge(multipath_aln.subpath_size(), false); for (size_t i = 0; i < 
multipath_aln.subpath_size(); i++) { - const Subpath& subpath = multipath_aln.subpath(i); + const subpath_t& subpath = multipath_aln.subpath(i); for (size_t j = 0; j < subpath.next_size(); j++) { has_incoming_edge[subpath.next(j)] = true; } + for (const connection_t& connection : subpath.connection()) { + has_incoming_edge[connection.next()] = true; + } } // construct list of starts @@ -102,66 +518,149 @@ namespace vg { } } } + + void clear_alignment(multipath_alignment_t& multipath_aln) { + multipath_aln.set_mapping_quality(0); + multipath_aln.clear_subpath(); + multipath_aln.clear_start(); + } /// We define this struct for holding the dynamic programming problem for a /// multipath alignment, which we use for finding the optimal alignment, /// scoring the optimal alignment, and enumerating the top alignments. struct MultipathProblem { - // Score of the optimal alignment ending immediately before this + // If forward, score of the optimal alignment ending immediately before this // subpath. To get the score of the optimal alignment ending with the // subpath, add the subpath's score. + // If backward, the score of the optimal alignment beginning and including + // this subpath. vector prefix_score; // previous subpath for traceback (we refer to subpaths by their index) vector prev_subpath; - // the length of read sequence preceding this subpath + // the length of read sequence preceding this subpath if forward + // else the length of read sequence including and following this subpath vector prefix_length; - /// Make a new MultipathProblem over the given number of subpaths - MultipathProblem(size_t subpath_size) : prefix_score(subpath_size, 0), - prev_subpath(subpath_size, -1), prefix_length(subpath_size, 0) { + /// Make a new MultipathProblem over the given number of subpaths with scores + /// initialized according to whether we're doing a local or global traceback + MultipathProblem(const multipath_alignment_t& multipath_aln, bool subpath_global, bool forward) + : prefix_score(multipath_aln.subpath_size(), subpath_global ? numeric_limits::min() / 2 : 0), + prev_subpath(multipath_aln.subpath_size(), -1), prefix_length(multipath_aln.subpath_size(), 0) { - // Nothing to do! + if (subpath_global) { + // set the starting score at sources/sinks to 0 so that alignments can only start there + if (forward) { + for (const auto& i : multipath_aln.start()) { + prefix_score[i] = 0; + } + } + else { + for (size_t i = 0; i < multipath_aln.subpath_size(); ++i) { + if (multipath_aln.subpath(i).next_size() == 0 && multipath_aln.subpath(i).connection_size() == 0) { + prefix_score[i] = 0; + } + } + } + } } }; /// Internal helper function for running the dynamic programming problem /// represented by a multipath alignment. Returns the filled DP problem, /// the optimal ending subpath, or -1 if no subpath is optimal, and the - /// optimal score, or 0 if no score is optimal. - tuple run_multipath_dp(const MultipathAlignment& multipath_aln) { - - // Create and unpack the return value (including setting up the DP table) - tuple to_return(multipath_aln.subpath_size(), -1, 0); + /// optimal score, or 0 if no score is optimal. An option toggles whether + /// the traceback should be global (a source to a sink in the multipath DAG) + /// or local (starting and ending at any subpath) + tuple run_multipath_dp(const multipath_alignment_t& multipath_aln, + bool subpath_global = false, + bool forward = true) { + + // Create and unpack the return value (including setting up the DP table). 
Initialise score according + // to whether the alignment is local or global + tuple to_return(MultipathProblem(multipath_aln, subpath_global, forward), + -1, subpath_global ? numeric_limits::min() : 0); auto& problem = get<0>(to_return); auto& opt_subpath = get<1>(to_return); auto& opt_score = get<2>(to_return); - for (size_t i = 0; i < multipath_aln.subpath_size(); i++) { - const Subpath& subpath = multipath_aln.subpath(i); - int32_t extended_score = problem.prefix_score[i] + subpath.score(); - - // carry DP forward - if (subpath.next_size() > 0) { - int64_t thru_length = path_to_length(subpath.path()) + problem.prefix_length[i]; - for (size_t j = 0; j < subpath.next_size(); j++) { - int64_t next = subpath.next(j); - problem.prefix_length[next] = thru_length; - - // can we improve prefix score on following subpath through this one? - if (extended_score >= problem.prefix_score[next]) { - problem.prev_subpath[next] = i; - problem.prefix_score[next] = extended_score; + // TODO: i'm sure there's a way to generalize this so i don't switch off for the whole + // thing, but the iteration schemes are just different enough to be pretty annoying + + if (forward) { + for (size_t i = 0; i < multipath_aln.subpath_size(); ++i) { + const subpath_t& subpath = multipath_aln.subpath(i); + int32_t extended_score = problem.prefix_score[i] + subpath.score(); + // carry DP forward + if (subpath.next_size() != 0 || subpath.connection_size() != 0) { + int64_t thru_length = path_to_length(subpath.path()) + problem.prefix_length[i]; + for (size_t j = 0; j < subpath.next_size(); ++j) { + int64_t next = subpath.next(j); + problem.prefix_length[next] = thru_length; + + // can we improve prefix score on following subpath through this one? + if (extended_score >= problem.prefix_score[next]) { + problem.prev_subpath[next] = i; + problem.prefix_score[next] = extended_score; + } + } + // repeat DP across scored connections + for (size_t j = 0; j < subpath.connection_size(); ++j) { + const connection_t& connection = subpath.connection(j); + + problem.prefix_length[connection.next()] = thru_length; + if (extended_score + connection.score() >= problem.prefix_score[connection.next()]) { + problem.prev_subpath[connection.next()] = i; + problem.prefix_score[connection.next()] = extended_score + connection.score(); + } } } + // check if an alignment is allowed to end here according to global/local rules and + // if so whether it's optimal + if (extended_score >= opt_score && (!subpath_global || (subpath.next_size() == 0 && + subpath.connection_size() == 0))) { + // We have a better optimal subpath + opt_score = extended_score; + opt_subpath = i; + } + } + } + else { + // TODO: maybe this should be done with an unordered_set? 
+ vector is_start(multipath_aln.subpath_size(), false); + for (auto i : multipath_aln.start()) { + is_start[i] = true; } - // check if optimal alignment ends here - if (extended_score >= opt_score) { - // We have a better optimal subpath - opt_score = extended_score; - opt_subpath = i; + for (int64_t i = multipath_aln.subpath_size() - 1; i >= 0; --i) { + const subpath_t& subpath = multipath_aln.subpath(i); + // find the length and best score from subsequent subpaths + for (size_t j = 0; j < subpath.next_size(); ++j) { + auto n = subpath.next(j); + problem.prefix_length[i] = problem.prefix_length[n]; + if (problem.prefix_score[n] > problem.prefix_score[i]) { + problem.prefix_score[i] = problem.prefix_score[n]; + problem.prev_subpath[i] = n; + } + } + for (const auto& connection : subpath.connection()) { + problem.prefix_length[i] = problem.prefix_length[connection.next()]; + if (problem.prefix_score[connection.next()] + connection.score() > problem.prefix_score[i]) { + problem.prefix_score[i] = problem.prefix_score[connection.next()] + connection.score(); + problem.prev_subpath[i] = connection.next(); + } + } + // add score and length of this subpath + problem.prefix_length[i] += path_to_length(subpath.path()); + problem.prefix_score[i] += subpath.score(); + + if (problem.prefix_score[i] >= opt_score && (!subpath_global || is_start[i])) { + // We have a better optimal subpath + opt_score = problem.prefix_score[i]; + opt_subpath = i; + } } } + return to_return; } @@ -171,7 +670,7 @@ namespace vg { /// and past-the-end of the traceback (in some kind of list of int64_t /// subpath indexes) to define it. template - void populate_path_from_traceback(const MultipathAlignment& multipath_aln, const MultipathProblem& problem, + void populate_path_from_traceback(const multipath_alignment_t& multipath_aln, const MultipathProblem& problem, TracebackIterator traceback_start, TracebackIterator traceback_end, Path* output) { static_assert(is_convertible::value, "traceback must contain int64_t items"); @@ -194,7 +693,11 @@ namespace vg { edit->set_to_length(problem.prefix_length[*current_subpath]); edit->set_sequence(multipath_aln.sequence().substr(0, problem.prefix_length[*current_subpath])); - *soft_clip_mapping->mutable_position() = multipath_aln.subpath(*current_subpath).path().mapping(0).position(); + Position* position = soft_clip_mapping->mutable_position(); + const position_t& pos_from = multipath_aln.subpath(*current_subpath).path().mapping(0).position(); + position->set_node_id(pos_from.node_id()); + position->set_is_reverse(pos_from.is_reverse()); + position->set_offset(pos_from.offset()); } // merge the subpaths into one optimal path in the Alignment object @@ -206,7 +709,7 @@ namespace vg { if (output->mapping_size() == 0) { // There's nothing in the output yet, so just copy all the mappings from this subpath - *output = multipath_aln.subpath(*current_subpath).path(); + to_proto_path(multipath_aln.subpath(*current_subpath).path(), *output); for(size_t i = 0; i < output->mapping_size(); i++) { // Set all the ranks @@ -217,16 +720,17 @@ namespace vg { Mapping* curr_end_mapping = output->mutable_mapping(output->mapping_size() - 1); // get the first mapping of the next path - const Path& next_path = multipath_aln.subpath(*current_subpath).path(); - const Mapping& next_start_mapping = next_path.mapping(0); + const path_t& next_path = multipath_aln.subpath(*current_subpath).path(); + const path_mapping_t& next_start_mapping = next_path.mapping(0); size_t mapping_start_idx = 0; // merge mappings if 
they occur on the same node and same strand if (curr_end_mapping->position().node_id() == next_start_mapping.position().node_id() - && curr_end_mapping->position().is_reverse() == next_start_mapping.position().is_reverse()) { + && curr_end_mapping->position().is_reverse() == next_start_mapping.position().is_reverse() + && curr_end_mapping->position().offset() + mapping_from_length(*curr_end_mapping) == next_start_mapping.position().offset()) { Edit* last_edit = curr_end_mapping->mutable_edit(curr_end_mapping->edit_size() - 1); - const Edit& first_edit = next_start_mapping.edit(0); + const edit_t& first_edit = next_start_mapping.edit(0); // merge the first edit if it is the same type size_t edit_start_idx = 0; @@ -243,7 +747,7 @@ namespace vg { // append the rest of the edits for (size_t j = edit_start_idx; j < next_start_mapping.edit_size(); j++) { - *curr_end_mapping->add_edit() = next_start_mapping.edit(j); + to_proto_edit(next_start_mapping.edit(j), *curr_end_mapping->add_edit()); } mapping_start_idx++; @@ -252,7 +756,7 @@ namespace vg { // append the rest of the mappings for (size_t j = mapping_start_idx; j < next_path.mapping_size(); j++) { Mapping* next_mapping = output->add_mapping(); - *next_mapping = next_path.mapping(j); + to_proto_mapping(next_path.mapping(j), *next_mapping); next_mapping->set_rank(output->mapping_size()); } } @@ -277,10 +781,11 @@ namespace vg { } } - int32_t optimal_alignment_internal(const MultipathAlignment& multipath_aln, Alignment* aln_out) { + int32_t optimal_alignment_internal(const multipath_alignment_t& multipath_aln, Alignment* aln_out, + bool subpath_global) { // Run the dynamic programming - auto dp_result = run_multipath_dp(multipath_aln); + auto dp_result = run_multipath_dp(multipath_aln, subpath_global); // C++17 finally gets http://en.cppreference.com/w/cpp/language/structured_binding // Until then we have to unpack tuples like this. 
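To make the recurrence in run_multipath_dp() concrete, here is a simplified standalone sketch (not part of the patch) of the forward, local-alignment case on a toy three-subpath DAG; it ignores connections, the prefix_length bookkeeping, the traceback pointers, and the subpath_global mode.

#include <algorithm>
#include <vector>

int toy_forward_dp() {
    std::vector<int> score = {4, -1, 3};                      // subpath scores
    std::vector<std::vector<int>> next = {{1, 2}, {2}, {}};   // edges 0->1, 0->2, 1->2
    std::vector<int> prefix(3, 0);                            // local DP: any subpath may start at 0
    int opt = 0;
    for (int i = 0; i < 3; ++i) {
        int extended = prefix[i] + score[i];                  // best alignment ending with subpath i
        for (int j : next[i]) {
            prefix[j] = std::max(prefix[j], extended);        // carry the DP forward along edges
        }
        opt = std::max(opt, extended);                        // local alignments may end anywhere
    }
    // prefix ends up {0, 4, 4}; the ending scores are {4, 3, 7}, so the optimum
    // is 7 via subpaths 0 -> 2, matching what the traceback would reconstruct.
    return opt;
}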
@@ -296,7 +801,7 @@ namespace vg { if (aln_out && opt_subpath >= 0) { // traceback the optimal subpaths until hitting sentinel (-1) - list opt_traceback; + deque opt_traceback; int64_t curr = opt_subpath; while (curr >= 0) { opt_traceback.push_front(curr); @@ -317,24 +822,138 @@ namespace vg { return opt_score; } - void optimal_alignment(const MultipathAlignment& multipath_aln, Alignment& aln_out) { - + void optimal_alignment(const multipath_alignment_t& multipath_aln, Alignment& aln_out, bool subpath_global) { + // transfer read information over to alignment transfer_read_metadata(multipath_aln, aln_out); - aln_out.set_mapping_quality(multipath_aln.mapping_quality()); // do dynamic programming and traceback the optimal alignment - int32_t score = optimal_alignment_internal(multipath_aln, &aln_out); + int32_t score = optimal_alignment_internal(multipath_aln, &aln_out, subpath_global); aln_out.set_score(score); } - int32_t optimal_alignment_score(const MultipathAlignment& multipath_aln){ + int32_t optimal_alignment_score(const multipath_alignment_t& multipath_aln, bool subpath_global){ // do dynamic programming without traceback - return optimal_alignment_internal(multipath_aln, nullptr); + return optimal_alignment_internal(multipath_aln, nullptr, subpath_global); + } + + int32_t worst_alignment_score(const multipath_alignment_t& multipath_aln) { + + if (multipath_aln.subpath().empty()) { + return 0; + } + + // initialize a DP table + vector dp(multipath_aln.subpath_size(), numeric_limits::max()); + + // initial conditions, allow alignments to begin at the starts + for (auto i : multipath_aln.start()) { + dp[i] = 0; + } + + int32_t opt = numeric_limits::max(); + for (size_t i = 0; i < multipath_aln.subpath_size(); ++i) { + const auto& subpath = multipath_aln.subpath(i); + int32_t score_thru = dp[i] + subpath.score(); + if (subpath.next().empty()) { + // this is a sink, check for optimality + opt = min(opt, score_thru); + } + else { + // carry the DP through to the next subpaths + for (auto j : subpath.next()) { + dp[j] = min(dp[j], score_thru); + } + } + } + return max(opt, 0); + } + + void remove_low_scoring_sections(multipath_alignment_t& multipath_aln, int32_t max_score_diff) { + + // do forward-backward dynamic programming so that we can efficiently + // compute maximum score over each subpath and edge + auto fwd_dp = run_multipath_dp(multipath_aln, true, true); + auto bwd_dp = run_multipath_dp(multipath_aln, true, false); + + auto& fwd_scores = get<0>(fwd_dp).prefix_score; + auto& bwd_scores = get<0>(bwd_dp).prefix_score; + + // we'll require that the maximum score be at least this much + int32_t min_score = get<2>(fwd_dp) - max_score_diff; + + // remove entire subpaths or individual edges + vector removed_so_far(multipath_aln.subpath_size() + 1, 0); + for (size_t i = 0; i < multipath_aln.subpath_size(); ++i) { + auto subpath = multipath_aln.mutable_subpath(i); + if (fwd_scores[i] + bwd_scores[i] < min_score) { + removed_so_far[i + 1] = removed_so_far[i] + 1; + } + else { + int32_t score_thru = fwd_scores[i] + subpath->score(); + for (size_t j = 0; j < subpath->next_size(); ) { + // this condition should also filter out edges to deleted subpaths + if (score_thru + bwd_scores[subpath->next(j)] < min_score) { + subpath->set_next(j, subpath->next().back()); + subpath->mutable_next()->pop_back(); + } + else { + ++j; + } + } + for (size_t j = 0; j < subpath->connection_size(); ) { + auto connection = subpath->mutable_connection(j); + // this condition should also filter out connections to 
deleted subpaths + if (score_thru + bwd_scores[connection->next()] + connection->score() < min_score) { + *connection = subpath->connection().back(); + subpath->mutable_connection()->pop_back(); + } + else { + ++j; + } + } + + if (removed_so_far[i]) { + // move it up in the vector through all the deleted subpaths + *multipath_aln.mutable_subpath(i - removed_so_far[i]) = move(*subpath); + } + + removed_so_far[i + 1] = removed_so_far[i]; + } + } + if (removed_so_far.back()) { + + // get rid of the now unused suffix of the vector + multipath_aln.mutable_subpath()->resize(multipath_aln.subpath_size() - removed_so_far.back()); + + // update the indexes of edges and connections + for (size_t i = 0; i < multipath_aln.subpath_size(); ++i) { + auto subpath = multipath_aln.mutable_subpath(i); + for (size_t j = 0; j < subpath->next_size(); ++j) { + subpath->set_next(j, subpath->next(j) - removed_so_far[subpath->next(j)]); + } + for (size_t j = 0; j < subpath->connection_size(); ++j) { + auto connection = subpath->mutable_connection(j); + connection->set_next(connection->next() - removed_so_far[connection->next()]); + } + } + + // and also fix up the starts + for (size_t i = 0; i < multipath_aln.start_size(); ) { + if (removed_so_far[multipath_aln.start(i)] == removed_so_far[multipath_aln.start(i) + 1]) { + multipath_aln.set_start(i, multipath_aln.start(i) - removed_so_far[multipath_aln.start(i)]); + ++i; + } + else { + multipath_aln.set_start(i, multipath_aln.mutable_start()->back()); + multipath_aln.mutable_start()->pop_back(); + } + } + } } - vector optimal_alignments(const MultipathAlignment& multipath_aln, size_t count) { + vector optimal_alignments(const multipath_alignment_t& multipath_aln, size_t count) { #ifdef debug_multiple_tracebacks cerr << "Computing top " << count << " alignments" << endl; @@ -386,8 +1005,8 @@ namespace vg { // Also, subpaths only keep track of their nexts, so we need to invert // that so we can get all valid prev subpaths. - vector> prev_subpaths; - + vector>> prev_subpaths(multipath_aln.subpath_size()); + // We want to be able to start the traceback only from places where we // won't get shorter versions of same- or higher-scoring alignments. // This means that we want exactly the subpaths that have no successors @@ -402,8 +1021,7 @@ namespace vg { // the optimal score overall and the score we would get for the optimal // alignment ending at each. - prev_subpaths.resize(multipath_aln.subpath_size()); - for (int64_t i = 0; i < multipath_aln.subpath_size(); i++) { + for (int64_t i = 0; i < multipath_aln.subpath_size(); ++i) { // For each subpath // If it has no successors, we can start a traceback here @@ -413,7 +1031,7 @@ namespace vg { // For each next subpath it lists // Register this subpath as a predecessor of the next - prev_subpaths[next_subpath].push_back(i); + prev_subpaths[next_subpath].emplace_back(i, 0); if (multipath_aln.subpath(next_subpath).score() >= 0) { // This successor has a nonnegative score, so taking it @@ -424,6 +1042,17 @@ namespace vg { } } + for (const auto& connection : multipath_aln.subpath(i).connection()) { + + // register the connection + prev_subpaths[connection.next()].emplace_back(i, connection.score()); + + if (multipath_aln.subpath(connection.next()).score() + connection.score() >= 0) { + // Taking the connection would lead to a longer or better alignment + valid_traceback_start = false; + } + } + if (valid_traceback_start) { // We can start a traceback here. 
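Since subpaths record only their successors, the traceback machinery above first inverts the edges. Below is a minimal sketch (not part of the patch, with hypothetical ToySubpath/invert_edges names) of that inversion, keeping the connection score alongside each predecessor exactly as prev_subpaths does:

#include <cstdint>
#include <utility>
#include <vector>

struct ToySubpath {
    std::vector<int64_t> next;                                // ordinary edges
    std::vector<std::pair<int64_t, int32_t>> connection;      // (successor, connection score)
};

std::vector<std::vector<std::pair<int64_t, int32_t>>>
invert_edges(const std::vector<ToySubpath>& subpaths) {
    std::vector<std::vector<std::pair<int64_t, int32_t>>> prev(subpaths.size());
    for (int64_t i = 0; i < (int64_t) subpaths.size(); ++i) {
        for (int64_t n : subpaths[i].next) {
            prev[n].emplace_back(i, 0);                       // plain edge: no extra score
        }
        for (const auto& c : subpaths[i].connection) {
            prev[c.first].emplace_back(i, c.second);          // connection: keep its score
        }
    }
    return prev;
}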
@@ -472,7 +1101,6 @@ namespace vg { // Set up read info and MAPQ // TODO: MAPQ on secondaries? transfer_read_metadata(multipath_aln, aln_out); - aln_out.set_mapping_quality(multipath_aln.mapping_quality()); // Populate path populate_path_from_traceback(multipath_aln, problem, basis.begin(), basis.end(), aln_out.mutable_path()); @@ -484,7 +1112,8 @@ namespace vg { cerr << "Traceback reaches start; emit with score " << aln_out.score() << endl; #endif - } else { + } + else { // The path does not lead all the way to a source // Find out all the places we can come from, and the score @@ -503,12 +1132,12 @@ namespace vg { for (auto& prev : prev_subpaths[here]) { // For each, compute the score of the optimal alignment ending at that predecessor - auto prev_opt_score = problem.prefix_score[prev] + multipath_aln.subpath(prev).score(); + auto prev_opt_score = problem.prefix_score[prev.first] + multipath_aln.subpath(prev.first).score() + prev.second; // What's the difference we would take if we went with this predecessor? auto additional_penalty = best_prefix_score - prev_opt_score; - destinations.emplace_back(prev, additional_penalty); + destinations.emplace_back(prev.first, additional_penalty); } // TODO: unify loops! @@ -541,7 +1170,7 @@ namespace vg { } - vector optimal_alignments_with_disjoint_subpaths(const MultipathAlignment& multipath_aln, size_t count) { + vector optimal_alignments_with_disjoint_subpaths(const multipath_alignment_t& multipath_aln, size_t count) { #ifdef debug_multiple_tracebacks cerr << "Computing top " << count << " alignments with disjoint subpaths" << endl; @@ -567,9 +1196,8 @@ namespace vg { // Also, subpaths only keep track of their nexts, so we need to invert // that so we can get all valid prev subpaths. - vector> prev_subpaths; + vector>> prev_subpaths(multipath_aln.subpath_size()); - prev_subpaths.resize(multipath_aln.subpath_size()); for (int64_t i = 0; i < multipath_aln.subpath_size(); i++) { // For each subpath @@ -580,7 +1208,7 @@ namespace vg { // For each next subpath it lists // Register this subpath as a predecessor of the next - prev_subpaths[next_subpath].push_back(i); + prev_subpaths[next_subpath].emplace_back(i, 0); if (multipath_aln.subpath(next_subpath).score() >= 0) { // This successor has a nonnegative score, so taking it @@ -591,6 +1219,17 @@ namespace vg { } } + for (const auto& connection : multipath_aln.subpath(i).connection()) { + + // register the connection + prev_subpaths[connection.next()].emplace_back(i, connection.score()); + + if (multipath_aln.subpath(connection.next()).score() + connection.score() >= 0) { + // Taking the connection would lead to a longer or better alignment + valid_traceback_start = false; + } + } + if (valid_traceback_start) { // We can start a traceback here. @@ -702,7 +1341,6 @@ namespace vg { // Set up read info and MAPQ // TODO: MAPQ on secondaries? transfer_read_metadata(multipath_aln, aln_out); - aln_out.set_mapping_quality(multipath_aln.mapping_quality()); // Populate path populate_path_from_traceback(multipath_aln, problem, basis.begin(), basis.end(), aln_out.mutable_path()); @@ -734,7 +1372,7 @@ namespace vg { for (auto& prev : prev_subpaths[here]) { // For each candidate previous subpath - if (subpath_is_used[prev]) { + if (subpath_is_used[prev.first]) { // This subpath has already been used in an emitted alignment, so we can't use it. 
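Worked numbers (illustrative only) for the prev_opt_score / additional_penalty bookkeeping these tracebacks use: if best_prefix_score is 20 and a candidate predecessor p has prefix_score[p] = 12, a subpath score of 5, and is reached over a connection scoring -2, then prev_opt_score = 12 + 5 - 2 = 15, additional_penalty = 20 - 15 = 5, and total_penalty = basis_score_difference + 5. An extension is only queued when that total improves on min_penalty_for_subpath[p], so a predecessor is revisited only via strictly cheaper routes.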
#ifdef debug_multiple_tracebacks @@ -745,7 +1383,7 @@ namespace vg { } // For each, compute the score of the optimal alignment ending at that predecessor - auto prev_opt_score = problem.prefix_score[prev] + multipath_aln.subpath(prev).score(); + auto prev_opt_score = problem.prefix_score[prev.first] + multipath_aln.subpath(prev.first).score() + prev.second; // What's the difference we would take if we went with this predecessor? auto additional_penalty = best_prefix_score - prev_opt_score; @@ -753,25 +1391,25 @@ namespace vg { // Calculate the score differences from optimal auto total_penalty = basis_score_difference + additional_penalty; - if (total_penalty >= min_penalty_for_subpath[prev]) { + if (total_penalty >= min_penalty_for_subpath[prev.first]) { // This previous subpath is already reachable with a penalty as good or better. // Don't bother with it again #ifdef debug_multiple_tracebacks - cerr << "\tSkip " << prev << " with penalty " << total_penalty << " >= " << min_penalty_for_subpath[prev] << endl; + cerr << "\tSkip " << prev.first << " with penalty " << total_penalty << " >= " << min_penalty_for_subpath[prev.first] << endl; #endif continue; } // Record that this is the cheapest we managed to get here - min_penalty_for_subpath[prev] = total_penalty; + min_penalty_for_subpath[prev.first] = total_penalty; // Make an extended path - auto extended_path = basis.push_front(prev); + auto extended_path = basis.push_front(prev.first); #ifdef debug_multiple_tracebacks - cerr << "\tAugment with " << prev << " to penalty " << total_penalty << endl; + cerr << "\tAugment with " << prev.first << " to penalty " << total_penalty << endl; #endif // Put them in the priority queue @@ -786,28 +1424,528 @@ namespace vg { return to_return; } - - /// Stores the reverse complement of a Subpath in another Subpath - /// - /// note: this is not included in the header because reversing a subpath without going through - /// the multipath alignment can break invariants related to the edge lists - /// + + vector haplotype_consistent_alignments(const multipath_alignment_t& multipath_aln, const haplo::ScoreProvider& score_provider, + size_t soft_count, size_t hard_count, bool optimal_first) { + +#ifdef debug_multiple_tracebacks + cerr << "Computing haplotype consistent alignments" << endl; +#endif + + // We can only work with a score provider that supports incremental search. + assert(score_provider.has_incremental_search()); + + // Keep a list of what we're going to emit. + vector to_return; + + // Fill out the dynamic programming problem + // TODO: are we duplicating work if we also get the top alignment? + auto dp_result = run_multipath_dp(multipath_aln); + // Get the filled DP problem + MultipathProblem& problem = get<0>(dp_result); + // And the optimal final subpath + int64_t& opt_subpath = get<1>(dp_result); + // And the optimal score + int32_t& opt_score = get<2>(dp_result); + + if (optimal_first) { + // Compute the optimal alignment and put it first. + // TODO: It will also appear later if it is haplotype-consistent. + // But we are allowed to produce duplicates so that's OK. 
+ to_return.emplace_back(); + Alignment& opt_aln = to_return.back(); + + opt_aln.set_score(opt_score); + + // traceback the optimal subpaths until hitting sentinel (-1) + list opt_traceback; + int64_t curr = opt_subpath; + while (curr >= 0) { + opt_traceback.push_front(curr); + curr = problem.prev_subpath[curr]; + } + + Path* opt_path = opt_aln.mutable_path(); + + // Fill in the path in the alignment with the alignment represented + // by this traceback in this DP problem for this multipath + // alignment. + populate_path_from_traceback(multipath_aln, problem, opt_traceback.begin(), opt_traceback.end(), opt_path); + +#ifdef debug_multiple_tracebacks + cerr << "Produced optimal alignment with score " << opt_aln.score() << endl; +#endif + } + + // Keep lists of traceback steps as multipath subpath numbers + using step_list_t = ImmutableList; + + // We define our own search state, which includes a haplotype search + // state and a flag for whether all the edges crossed so far have been + // present in the GBWT. This flag can remain true, and we can keep + // searching, after our haplotype search state becomes empty. + struct SearchState { + haplo::IncrementalSearchState haplo_state; + bool all_edges_exist = true; + bool started = false; + + inline bool empty() const { + return haplo_state.empty(); + } + + inline size_t size() const { + return haplo_state.size(); + } + + // More consistent things should be smaller, for good queueing + inline bool operator<(const SearchState& other) const { + return haplo_state.size() > other.haplo_state.size() || + (haplo_state.size() == other.haplo_state.size() && all_edges_exist && !other.all_edges_exist); + } + }; + + // This function advances an incremental haplotype search state with the edges along a subpath, if any. + // A non-started input SearchState means to start the search. + // Note that we interpret the path IN REVERSE, because we're doing a traceback. + auto extend_with_subpath = [&](const SearchState& initial, int64_t subpath) { + // Get the Path from the subpath. + const path_t& path = multipath_aln.subpath(subpath).path(); + + // No empty paths are allowed. + assert(path.mapping_size() > 0); + + // Set up a search state scratch + SearchState state = initial; + if (!state.started) { + // If our input state says we need to start a new search, start with the node from the last mapping. + auto& pos = path.mapping(path.mapping_size() - 1).position(); + // We require everything to have mappings to actual places, even pure inserts. + assert(pos.node_id() != 0); + // Make sure to search in the orientation we are actually going + state.haplo_state = score_provider.incremental_find(make_position(pos.node_id(), !pos.is_reverse(), 0)); + state.started = true; + +#ifdef debug_multiple_tracebacks + cerr << "New haplotype search for " << pb2json(pos) << " finds " << state.size() << " matching haplotypes" << endl; +#endif + } + + // Otherwise we have already selected the last Mapping when we crossed the edge back into here. + for (size_t i = path.mapping_size() - 1; i != 0; i--) { + // For each transition between Mappings, we assume we are going between distinct node visits because of our precondition. + // So find the next position looking left + auto& pos = path.mapping(i - 1).position(); + + if (!state.empty()) { + // Search the transition to it in the reverse orientation.
+ state.haplo_state = score_provider.incremental_extend(state.haplo_state, + make_position(pos.node_id(), !pos.is_reverse(), 0)); +#ifdef debug_multiple_tracebacks + cerr << "Extend within subpath " << subpath + << " by going to node " << pos.node_id() + << " matches " << state.size() << " haplotypes" << endl; +#endif + } else { +#ifdef debug_multiple_tracebacks + cerr << "Extend within subpath " << subpath + << " by going to node " << pos.node_id() + << " but haplotype match set is already empty" << endl; +#endif + } + + if (state.empty() && state.all_edges_exist) { + // We have run out of haplotypes. It may be because we have traversed an edge not in the haplotype index. + // Check for that. + + // Look up where we came from + auto& prev_pos = path.mapping(i).position(); + auto scratch = score_provider.incremental_find(make_position(prev_pos.node_id(), !prev_pos.is_reverse(), 0)); + + // Search where we go to + scratch = score_provider.incremental_extend(scratch, make_position(pos.node_id(), !pos.is_reverse(), 0)); + + if (scratch.empty()) { + // If no haplotypes go there, mark the state as having crossed an unused edge + state.all_edges_exist = false; + +#ifdef debug_multiple_tracebacks + cerr << "Extend within subpath " << subpath + << " by going node " << prev_pos.node_id() << " -> " << pos.node_id() + << " has no haplotypes crossing that edge" << endl; +#endif + } + } + + if (state.empty() && !state.all_edges_exist) { + // If we enter this state we can never improve, so return + return state; + } + + // Otherwise loop until we run out of transitions + } + + return state; + }; + + // This function advances an incremental haplotype search state with the edge between two subpaths, if any. + // An empty input subpath means to start the search. + // Note that we interpret the path IN REVERSE, because we're doing a traceback. + // Even though old_subpath comes before new_subpath, since we're going backward, new_subpath comes in on the left. + auto extend_between_subpaths = [&](const SearchState& initial, int64_t old_subpath, int64_t new_subpath) { + // We can't have an un-started input state + assert(initial.started); + + // See if the transition from the previous subpath to the next subpath is just two mappings abutting on the same node + const path_t& old_path = multipath_aln.subpath(old_subpath).path(); + const path_t& new_path = multipath_aln.subpath(new_subpath).path(); + assert(old_path.mapping_size() > 0); + assert(new_path.mapping_size() > 0); + // We're going left from the first mapping on the old path + const path_mapping_t& old_mapping = old_path.mapping(0); + // And into the last mapping on the new path. + const path_mapping_t& new_mapping = new_path.mapping(new_path.mapping_size() - 1); + + // Look up the positions + auto& new_pos = new_mapping.position(); + auto& old_pos = old_mapping.position(); + + if (new_pos.node_id() == old_pos.node_id() && + new_pos.is_reverse() == old_pos.is_reverse() && + new_pos.offset() + mapping_from_length(new_mapping) == old_pos.offset()) { + // We actually are transitioning just within a node. No more state updates to do. + +#ifdef debug_multiple_tracebacks + cerr << "Extend between subpaths " << old_subpath << " and " << new_subpath + << " is just abutment on node " << new_pos.node_id() << endl; +#endif + return initial; + } + + // Otherwise there's an edge we have to look for.
+ SearchState result = initial; + + if (!result.empty()) { + // Try searching on the contained interval + // Make sure to flip the orientation because we're searching left. + result.haplo_state = score_provider.incremental_extend(initial.haplo_state, + make_position(new_mapping.position().node_id(), !new_mapping.position().is_reverse(), 0)); + +#ifdef debug_multiple_tracebacks + cerr << "Extend between subpaths " << old_subpath << " and " << new_subpath + << " by going node " << old_pos.node_id() << " (" << initial.size() << ") -> " + << new_pos.node_id() << " (" << result.size() << ")" << endl; +#endif + } else { +#ifdef debug_multiple_tracebacks + cerr << "Extend between subpaths " << old_subpath << " and " << new_subpath + << " by going node " << old_pos.node_id() << " -> " << new_pos.node_id() + << " starts from empty search" << endl; +#endif + } + + if (result.empty() && result.all_edges_exist) { + // We need to see if we not only ran out of agreeing haplotypes but also took an unused edge + // Look up where we came from + auto scratch = score_provider.incremental_find(make_position(old_pos.node_id(), !old_pos.is_reverse(), 0)); + + // Search where we go to + scratch = score_provider.incremental_extend(scratch, make_position(new_pos.node_id(), !new_pos.is_reverse(), 0)); + + if (scratch.empty()) { + // If no haplotypes go there, mark the state as having crossed an unused edge + result.all_edges_exist = false; + +#ifdef debug_multiple_tracebacks + cerr << "Extend between subpaths " << old_subpath << " and " << new_subpath + << " by going node " << old_pos.node_id() << " -> " << new_pos.node_id() + << " has no haplotypes crossing that edge" << endl; +#endif + } + + } + + return result; + }; + + // Our search is kind of complicated, because we want to enumerate all + // haplotype-consistent linearizations, but pad out to n merely + // scorable linearizations, in alignment score order. + + // So we keep our search queue in a min-max heap, where lower is a + // better thing to extend. + // + // We sort by search state, which we define an order on where more + // consistent haplotypes come before fewer, and then unscorable things + // come last. + // + // And then after that we search by score penalty from optimal. + + // Put them in a size-limited priority queue by search state, and then + // score difference (positive) from optimal + MinMaxHeap> queue; + + // We define a function to put stuff in the queue and limit its size to + // the (count - to_return.size()) items with lowest penalty. + auto try_enqueue = [&](const tuple& item) { + // Work out how many things can be in the queue to compete to pad out the remaining return slots. + // Make sure it doesn't try to go negative. + size_t soft_max_size = soft_count - std::min(soft_count, to_return.size()); + size_t hard_max_size = hard_count ? 
hard_count - to_return.size() : numeric_limits::max(); + +#ifdef debug_multiple_tracebacks + if (queue.size() >= hard_max_size) { + cerr << "We've reached the hard cap on queue size -- even haplotype consistent are rejected" << endl; + } else if (!get<0>(item).empty()) { + cerr << "Item is haplotype-consistent and must be queued" << endl; + } else if (queue.size() < soft_max_size) { + cerr << "Item fits in queue" << endl; + } else if (!queue.empty() && item < queue.max()) { + cerr << "Item beats worst thing in queue" << endl; + } +#endif + + + if ((!get<0>(item).empty() && queue.size() < hard_max_size) + || (queue.size() < soft_max_size && queue.size() < hard_max_size) + || (!queue.empty() && item < queue.max())) { + // The item belongs in the queue because it fits or it beats + // the current worst thing if present, or it's not eligible for removal. + queue.push(item); +#ifdef debug_multiple_tracebacks + cerr << "Allow item into queue (" << queue.size() << "/" << soft_max_size << "," << hard_max_size << ")" << endl; +#endif + + } + + while (!queue.empty() + && (queue.size() > hard_max_size || queue.size() > soft_max_size) + && (get<0>(queue.max()).empty() || queue.size() > hard_max_size)) { + // We have more possibilities than we need to consider, and + // some are eligible for removal. Get rid of the worst one. + queue.pop_max(); +#ifdef debug_multiple_tracebacks + cerr << "Remove worst from queue (" << queue.size() << "/" << soft_max_size << "," << hard_max_size << ")" << endl; +#endif + } + }; + + // Also, subpaths only keep track of their nexts, so we need to invert + // that so we can get all valid prev subpaths. + // TODO: This code is also duplicated + vector>> prev_subpaths; + + prev_subpaths.resize(multipath_aln.subpath_size()); + for (int64_t i = 0; i < multipath_aln.subpath_size(); i++) { + // For each subpath + + // If it has no successors, we can start a traceback here + bool valid_traceback_start = true; + + for (auto& next_subpath : multipath_aln.subpath(i).next()) { + // For each next subpath it lists + + // Register this subpath as a predecessor of the next + prev_subpaths[next_subpath].emplace_back(i, 0); + + if (multipath_aln.subpath(next_subpath).score() >= 0) { + // This successor has a nonnegative score, so taking it + // after us would generate a longer, same- or + // higher-scoring alignment. So we shouldn't start a + // traceback from subpath i. + valid_traceback_start = false; + } + } + + for (const auto& connection : multipath_aln.subpath(i).connection()) { + + // register the connection + prev_subpaths[connection.next()].emplace_back(i, connection.score()); + + if (multipath_aln.subpath(connection.next()).score() + connection.score() >= 0) { + // Taking the connection would lead to a longer or better alignment + valid_traceback_start = false; + } + } + + if (valid_traceback_start) { + // We can start a traceback here. 
+ + // The path is just to be here + step_list_t starting_path{i}; + + // The search state is what we get just starting with this subpath + SearchState state = extend_with_subpath(SearchState(), i); + + // The score penalty for starting here is the optimal score minus the optimal score starting here + auto penalty = opt_score - (problem.prefix_score[i] + multipath_aln.subpath(i).score()); + + +#ifdef debug_multiple_tracebacks + cerr << "Could end at subpath " << i << " with " << state.size() + << " matches and scorability " << state.all_edges_exist << endl; +#endif + + try_enqueue(make_tuple(state, penalty, starting_path)); + + } + } + + while(!queue.empty()) { + // Grab a traceback to try extending + auto frame = queue.min(); + queue.pop_min(); + + // Unpack the stack frame + auto& state = get<0>(frame); + auto& base_penalty = get<1>(frame); + auto& basis = get<2>(frame); + + if (problem.prev_subpath[basis.front()] == -1) { + // If it leads all the way to a subpath that is optimal as a start + + // Make an Alignment to emit it in + to_return.emplace_back(); + Alignment& aln_out = to_return.back(); + + // Set up read info and MAPQ + // TODO: MAPQ on secondaries? + transfer_read_metadata(multipath_aln, aln_out); + + // Populate path + populate_path_from_traceback(multipath_aln, problem, basis.begin(), basis.end(), aln_out.mutable_path()); + + // Compute the score from the penalty + aln_out.set_score(opt_score - base_penalty); + +#ifdef debug_multiple_tracebacks + cerr << "Traceback reaches start at " << basis.front() << " with " << state.size() + << " consistent haplotypes and scorability " << state.all_edges_exist << "; emit linearization " + << (to_return.size() - 1) << " with score " << aln_out.score() << endl; + + for (auto& m : aln_out.path().mapping()) { + cerr << m.position().node_id() << " "; + } + cerr << endl; +#endif + } else { + // We can't optimally stop the traceback here. We have to come from somewhere. + + auto& here = basis.front(); + + // To compute the additional score difference, we need to know what our optimal prefix score was. + auto& best_prefix_score = problem.prefix_score[here]; + + for (auto& prev : prev_subpaths[here]) { + // For each possible previous location + + // Compute the score of the optimal alignment ending at that predecessor + auto prev_opt_score = problem.prefix_score[prev.first] + multipath_aln.subpath(prev.first).score() + prev.second; + + // What's the difference we would take if we went with this predecessor? 
+ auto additional_penalty = best_prefix_score - prev_opt_score; + + // Try extending into the subpath + auto extended_state = extend_between_subpaths(state, here, prev.first); + // And then with all of it + extended_state = extend_with_subpath(extended_state, prev.first); + +#ifdef debug_multiple_tracebacks + cerr << "Extending traceback from subpath " << here << " to and through " << prev.first << " keeps " + << extended_state.size() << " / " << state.size() << " haplotype matches, with scorability " + << extended_state.all_edges_exist << " and penalty " << base_penalty + additional_penalty << endl; +#endif + + // Save the result + try_enqueue(make_tuple(extended_state, base_penalty + additional_penalty, basis.push_front(prev.first))); + } + } + + + + } + + return to_return; + } + + + pair aligned_interval(const multipath_alignment_t& multipath_aln) { + + // check whether there are any aligned bases + bool empty = true; + for (size_t i = 0; i < multipath_aln.subpath_size() && empty; ++i) { + const auto& path = multipath_aln.subpath(i).path(); + for (size_t j = 0; j < path.mapping_size() && empty; ++j) { + const auto& mapping = path.mapping(j); + for (size_t k = 0; k < mapping.edit_size() && empty; ++k) { + const auto& edit = mapping.edit(k); + empty = (edit.to_length() == 0 && edit.from_length() == 0); + } + } + } + if (empty) { + return pair(0, 0); + } + + int64_t min_softclip_left = numeric_limits::max(); + int64_t min_softclip_right = numeric_limits::max(); + + for (auto i : multipath_aln.start()) { + const auto& edit = multipath_aln.subpath(i).path().mapping(0).edit(0); + if (edit.from_length() == 0 && edit.to_length() != 0) { + min_softclip_left = min(min_softclip_left, edit.to_length()); + } + else { + min_softclip_left = 0; + } + } + + vector is_sink(multipath_aln.subpath_size(), false); + for (const auto& subpath : multipath_aln.subpath()) { + if (subpath.next_size() == 0 && subpath.connection_size() == 0) { + + const auto& path = subpath.path(); + const auto& mapping = path.mapping(path.mapping_size() - 1); + const auto& edit = mapping.edit(mapping.edit_size() - 1); + if (edit.from_length() == 0 && edit.to_length() != 0) { + min_softclip_right = min(min_softclip_right, edit.to_length()); + } + else { + min_softclip_right = 0; + } + } + } + + if (min_softclip_left == numeric_limits::max()) { + min_softclip_left = 0; + } + if (min_softclip_right == numeric_limits::max()) { + min_softclip_right = 0; + } + return pair(min_softclip_left, + multipath_aln.sequence().size() - min_softclip_right); + } + + /// Stores the reverse complement of a Subpath in another Subpath + /// + /// note: this is not included in the header because reversing a subpath without going through + /// the multipath alignment can break invariants related to the edge lists + /// /// Args: /// subpath subpath to reverse complement /// node_length a function that returns the length of a node sequence from its node ID /// rev_comp_out empty subpath to store reverse complement in (data will be overwritten /// if not empty) /// - inline void rev_comp_subpath(const Subpath& subpath, const function& node_length, - Subpath& rev_comp_out) { + inline void rev_comp_subpath(const subpath_t& subpath, const function& node_length, + subpath_t& rev_comp_out) { *(rev_comp_out.mutable_path()) = reverse_complement_path(subpath.path(), node_length); rev_comp_out.set_score(subpath.score()); // leave reversing the edges to the multipath alignment } - void rev_comp_multipath_alignment(const MultipathAlignment& multipath_aln, const 
function& node_length, - MultipathAlignment& rev_comp_out) { + void rev_comp_multipath_alignment(const multipath_alignment_t& multipath_aln, const function& node_length, + multipath_alignment_t& rev_comp_out) { // reverse complement sequence rev_comp_out.set_sequence(reverse_complement(multipath_aln.sequence())); @@ -815,12 +1953,10 @@ namespace vg { rev_comp_out.set_quality(string(multipath_aln.quality().rbegin(), multipath_aln.quality().rend())); // transfer the rest of the metadata directly - rev_comp_out.set_read_group(multipath_aln.read_group()); - rev_comp_out.set_name(multipath_aln.name()); - rev_comp_out.set_sample_name(multipath_aln.sample_name()); - rev_comp_out.set_paired_read_name(multipath_aln.paired_read_name()); + rev_comp_out.set_mapping_quality(multipath_aln.mapping_quality()); - vector< vector > reverse_edge_lists(multipath_aln.subpath_size()); + vector> reverse_edge_lists(multipath_aln.subpath_size()); + vector>> reverse_connection_lists(multipath_aln.subpath_size()); vector reverse_starts; // remove subpaths to avoid duplicating @@ -828,410 +1964,1892 @@ namespace vg { // add subpaths in reverse order to maintain topological ordering for (int64_t i = multipath_aln.subpath_size() - 1; i >= 0; i--) { - const Subpath& subpath = multipath_aln.subpath(i); - Subpath* rc_subpath = rev_comp_out.add_subpath(); + const subpath_t& subpath = multipath_aln.subpath(i); + subpath_t* rc_subpath = rev_comp_out.add_subpath(); rev_comp_subpath(subpath, node_length, *rc_subpath); - if (subpath.next_size() > 0) { + if (subpath.next_size() > 0 || subpath.connection_size() > 0) { // collect edges by their target (for reversing) for (size_t j = 0; j < subpath.next_size(); j++) { reverse_edge_lists[subpath.next(j)].push_back(i); } + for (const connection_t& connection : subpath.connection()) { + reverse_connection_lists[connection.next()].emplace_back(i, connection.score()); + } + } + else { + // sink subpaths become sources in reverse + reverse_starts.push_back(i); + } + } + + // add reversed edges + for (size_t i = 0; i < multipath_aln.subpath_size(); i++) { + subpath_t* rc_subpath = rev_comp_out.mutable_subpath(i); + vector& reverse_edge_list = reverse_edge_lists[multipath_aln.subpath_size() - i - 1]; + for (size_t j = 0; j < reverse_edge_list.size(); j++) { + rc_subpath->add_next(multipath_aln.subpath_size() - reverse_edge_list[j] - 1); + } + vector>& reverse_connection_list = reverse_connection_lists[multipath_aln.subpath_size() - i - 1]; + for (size_t j = 0; j < reverse_connection_list.size(); ++j) { + connection_t* connection = rc_subpath->add_connection(); + connection->set_next(multipath_aln.subpath_size() - reverse_connection_list[j].first - 1); + connection->set_score(reverse_connection_list[j].second); + } + } + + // remove start nodes that are invalid in reverse + rev_comp_out.clear_start(); + + // assume that if the original multipath alignment had its starts labeled they want them + // labeled in the reverse complement too + if (multipath_aln.start_size() > 0) { + for (size_t i = 0; i < reverse_starts.size(); i++) { + rev_comp_out.add_start(multipath_aln.subpath_size() - reverse_starts[i] - 1); + } + } + } + + void rev_comp_multipath_alignment_in_place(multipath_alignment_t* multipath_aln, + const function& node_length) { + + // reverse complement sequence + reverse_complement_in_place(*multipath_aln->mutable_sequence()); + // reverse base qualities + string* quality = multipath_aln->mutable_quality(); + std::reverse(quality->begin(), quality->end()); + + // current reverse edges 
+ vector> reverse_edge_lists(multipath_aln->subpath_size()); + vector> reverse_connection_lists(multipath_aln->subpath_size()); + // current sink nodes (will be starts) + vector reverse_starts; + + uint32_t subpath_swap_size = multipath_aln->subpath_size() / 2; + uint32_t last = multipath_aln->subpath_size() - 1; + for (uint32_t i = 0, j = last; i < subpath_swap_size; i++, j--) { + subpath_t* subpath_1 = multipath_aln->mutable_subpath(i); + subpath_t* subpath_2 = multipath_aln->mutable_subpath(j); + + // add reverse edges for first subpath + if (subpath_1->next_size() > 0 || subpath_1->connection_size() > 0) { + for (uint32_t k = 0; k < subpath_1->next_size(); k++) { + reverse_edge_lists[subpath_1->next(k)].push_back(last - i); + } + for (uint32_t k = 0; k < subpath_1->connection_size(); k++) { + const connection_t& connection = subpath_1->connection(k); + reverse_connection_lists[connection.next()].emplace_back(); + connection_t& new_connection = reverse_connection_lists[connection.next()].back(); + new_connection.set_next(last - i); + new_connection.set_score(connection.score()); + } + } + else { + reverse_starts.push_back(last - i); + } + + // add reverse edges for second subpath + if (subpath_2->next_size() > 0 || subpath_2->connection_size() > 0) { + for (uint32_t k = 0; k < subpath_2->next_size(); k++) { + reverse_edge_lists[subpath_2->next(k)].push_back(last - j); + } + for (uint32_t k = 0; k < subpath_2->connection_size(); k++) { + const connection_t& connection = subpath_2->connection(k); + reverse_connection_lists[connection.next()].emplace_back(); + connection_t& new_connection = reverse_connection_lists[connection.next()].back(); + new_connection.set_next(last - j); + new_connection.set_score(connection.score()); + } + } + else { + reverse_starts.push_back(last - j); + } + + // clear current edges + subpath_1->clear_next(); + subpath_2->clear_next(); + subpath_1->clear_connection(); + subpath_2->clear_connection(); + + // reverse complement the paths + reverse_complement_path_in_place(subpath_1->mutable_path(), node_length); + reverse_complement_path_in_place(subpath_2->mutable_path(), node_length); + + // swap their positions (to maintain topological ordering) + std::swap(*subpath_1, *subpath_2); + } + + // repeat process for the middle subpath if there is an odd number + if (multipath_aln->subpath_size() % 2) { + subpath_t* subpath = multipath_aln->mutable_subpath(subpath_swap_size); + if (subpath->next_size() > 0 || subpath->connection_size() > 0) { + for (uint32_t k = 0; k < subpath->next_size(); k++) { + reverse_edge_lists[subpath->next(k)].push_back(subpath_swap_size); + } + for (uint32_t k = 0; k < subpath->connection_size(); k++) { + const connection_t& connection = subpath->connection(k); + reverse_connection_lists[connection.next()].emplace_back(); + connection_t& new_connection = reverse_connection_lists[connection.next()].back(); + new_connection.set_next(subpath_swap_size); + new_connection.set_score(connection.score()); + } + } + else { + reverse_starts.push_back(subpath_swap_size); + } + + subpath->clear_next(); + subpath->clear_connection(); + + reverse_complement_path_in_place(subpath->mutable_path(), node_length); + } + + // add reversed edges + for (uint32_t i = 0, j = last; i < multipath_aln->subpath_size(); i++, j--) { + subpath_t* subpath = multipath_aln->mutable_subpath(i); + *subpath->mutable_next() = move(reverse_edge_lists[j]); + *subpath->mutable_connection() = move(reverse_connection_lists[j]); + } + + // if we had starts labeled before, label them 
again + if (multipath_aln->start_size() > 0) { + *multipath_aln->mutable_start() = move(reverse_starts); + } + } + + void convert_multipath_alignment_char(multipath_alignment_t& multipath_aln, char from, char to) { + auto& seq = *multipath_aln.mutable_sequence(); + for (size_t i = 0; i < seq.size(); ++i) { + if (seq[i] == from) { + seq[i] = to; + } + } + for (subpath_t& subpath : *multipath_aln.mutable_subpath()) { + for (path_mapping_t& mapping : *subpath.mutable_path()->mutable_mapping()) { + for (edit_t& edit : *mapping.mutable_edit()) { + if (!edit.sequence().empty()) { + auto& eseq = *edit.mutable_sequence(); + for (size_t i = 0; i < eseq.size(); ++i) { + if (eseq[i] == from) { + eseq[i] = to; + } + } + } + } + } + } + } + + void convert_Us_to_Ts(multipath_alignment_t& multipath_aln) { + convert_multipath_alignment_char(multipath_aln, 'U', 'T'); + } + + void convert_Ts_to_Us(multipath_alignment_t& multipath_aln) { + convert_multipath_alignment_char(multipath_aln, 'T', 'U'); + } + + template + void transfer_from_proto_annotation(const ProtoAlignment& from, multipath_alignment_t& to) { + for_each_basic_annotation(from, + [&](const string& anno_name) { to.set_annotation(anno_name); }, + [&](const string& anno_name, double value) { to.set_annotation(anno_name, value); }, + [&](const string& anno_name, bool value) { to.set_annotation(anno_name, value); }, + [&](const string& anno_name, const string& value) { to.set_annotation(anno_name, value); }); + } + + template + void transfer_to_proto_annotation(const multipath_alignment_t& from, ProtoAlignment& to) { + from.for_each_annotation([&](const string& anno_name, multipath_alignment_t::anno_type_t type, const void* value) { + switch (type) { + case multipath_alignment_t::Null: + break; + case multipath_alignment_t::Double: + set_annotation(to, anno_name, *((const double*) value)); + break; + case multipath_alignment_t::Bool: + set_annotation(to, anno_name, *((const bool*) value)); + break; + case multipath_alignment_t::String: + set_annotation(to, anno_name, *((const string*) value)); + break; + default: + cerr << "error: unrecognized annotation type" << endl; + exit(1); + break; + } + }); + } + + // TODO: our proto annotation system actually doesn't seem to allow null annotations... 
+ template + void transfer_between_proto_annotation(const ProtoAlignment1& from, ProtoAlignment2& to) { + for_each_basic_annotation(from, + [&to](const string& name) { return; }, + [&to](const string& name, double value) { set_annotation(to, name, value); }, + [&to](const string& name, bool value) { set_annotation(to, name, value); }, + [&to](const string& name, const string& value) { set_annotation(to, name, value); }); + } + + // transfers the metadata that is shared across all formats + template + void transfer_uniform_metadata(const Alignment1& from, Alignment2& to) { + to.set_sequence(from.sequence()); + to.set_quality(from.quality()); + to.set_mapping_quality(from.mapping_quality()); + } + + void to_proto_multipath_alignment(const multipath_alignment_t& multipath_aln, + MultipathAlignment& proto_multipath_aln_out) { + proto_multipath_aln_out.clear_subpath(); + proto_multipath_aln_out.clear_start(); + transfer_read_metadata(multipath_aln, proto_multipath_aln_out); + for (const auto& subpath : multipath_aln.subpath()) { + auto subpath_copy = proto_multipath_aln_out.add_subpath(); + subpath_copy->set_score(subpath.score()); + for (auto next : subpath.next()) { + subpath_copy->add_next(next); + } + for (const auto& connection : subpath.connection()) { + auto connection_copy = subpath_copy->add_connection(); + connection_copy->set_next(connection.next()); + connection_copy->set_score(connection.score()); + } + if (subpath.has_path()) { + const auto& path = subpath.path(); + auto path_copy = subpath_copy->mutable_path(); + to_proto_path(path, *path_copy); + } + } + for (auto start : multipath_aln.start()) { + proto_multipath_aln_out.add_start(start); + } + } + + void from_proto_multipath_alignment(const MultipathAlignment& proto_multipath_aln, + multipath_alignment_t& multipath_aln_out) { + multipath_aln_out.clear_subpath(); + multipath_aln_out.clear_start(); + transfer_read_metadata(proto_multipath_aln, multipath_aln_out); + for (auto subpath : proto_multipath_aln.subpath()) { + auto subpath_copy = multipath_aln_out.add_subpath(); + subpath_copy->set_score(subpath.score()); + for (auto next : subpath.next()) { + subpath_copy->add_next(next); + } + for (const auto& connection : subpath.connection()) { + auto connection_copy = subpath_copy->add_connection(); + connection_copy->set_next(connection.next()); + connection_copy->set_score(connection.score()); + } + if (subpath.has_path()) { + auto path = subpath.path(); + auto path_copy = subpath_copy->mutable_path(); + from_proto_path(path, *path_copy); + } + } + + for (auto start : proto_multipath_aln.start()) { + multipath_aln_out.add_start(start); + } + } + + void to_multipath_alignment(const Alignment& aln, multipath_alignment_t& multipath_aln_out) { + + // clear repeated fields + multipath_aln_out.clear_subpath(); + multipath_aln_out.clear_start(); + + // transfer read and alignment metadata + transfer_read_metadata(aln, multipath_aln_out); + + // transfer alignment and score + if (aln.has_path() || aln.score()) { + subpath_t* subpath = multipath_aln_out.add_subpath(); + subpath->set_score(aln.score()); + from_proto_path(aln.path(), *subpath->mutable_path()); + } + identify_start_subpaths(multipath_aln_out); + } + + void transfer_read_metadata(const MultipathAlignment& from, multipath_alignment_t& to) { + transfer_uniform_metadata(from, to); + transfer_from_proto_annotation(from, to); + } + + void transfer_read_metadata(const multipath_alignment_t& from, MultipathAlignment& to) { + transfer_uniform_metadata(from, to); + 
transfer_to_proto_annotation(from, to); + } + + void transfer_read_metadata(const multipath_alignment_t& from, multipath_alignment_t& to) { + transfer_uniform_metadata(from, to); + from.for_each_annotation([&](const string& anno_name, multipath_alignment_t::anno_type_t type, const void* value) { + switch (type) { + case multipath_alignment_t::Null: + to.set_annotation(anno_name); + break; + case multipath_alignment_t::Double: + to.set_annotation(anno_name, *((const double*) value)); + break; + case multipath_alignment_t::Bool: + to.set_annotation(anno_name, *((const bool*) value)); + break; + case multipath_alignment_t::String: + to.set_annotation(anno_name, *((const string*) value)); + break; + default: + cerr << "error: unrecognized annotation type" << endl; + exit(1); + break; + } + }); + } + + void transfer_read_metadata(const Alignment& from, multipath_alignment_t& to) { + transfer_uniform_metadata(from, to); + transfer_from_proto_annotation(from, to); + if (from.is_secondary()) { + to.set_annotation("secondary", true); + } + } + + void transfer_read_metadata(const multipath_alignment_t& from, Alignment& to) { + transfer_uniform_metadata(from, to); + transfer_to_proto_annotation(from, to); + if (from.has_annotation("secondary")) { + auto annotation = from.get_annotation("secondary"); + assert(annotation.first == multipath_alignment_t::Bool); + to.set_is_secondary(*((bool*) annotation.second)); + } + } + + void transfer_read_metadata(const Alignment& from, Alignment& to) { + transfer_uniform_metadata(from, to); + + to.set_read_group(from.read_group()); + to.set_name(from.name()); + to.set_sample_name(from.sample_name()); + to.set_is_secondary(from.is_secondary()); + + transfer_between_proto_annotation(from, to); + + if (from.has_fragment_prev()) { + *to.mutable_fragment_prev() = from.fragment_prev(); + } + if (from.has_fragment_next()) { + *to.mutable_fragment_next() = from.fragment_next(); + } + if (from.has_annotation()) { + *to.mutable_annotation() = from.annotation(); + } + } + + void transfer_proto_metadata(const Alignment& from, MultipathAlignment& to) { + // transfer over the fields that are included only in the protobuf object + to.set_name(from.name()); + to.set_read_group(from.read_group()); + to.set_sample_name(from.sample_name()); + if (from.has_fragment_prev()) { + to.set_paired_read_name(from.fragment_prev().name()); + } + else if (from.has_fragment_next()) { + to.set_paired_read_name(from.fragment_next().name()); + } + } + + void transfer_proto_metadata(const MultipathAlignment& from, Alignment& to) { + // transfer over the fields that are included only in the protobuf object + to.set_name(from.name()); + to.set_read_group(from.read_group()); + to.set_sample_name(from.sample_name()); + + // not doing paired name because need extra logic to decide if it's prev or next + } + + void merge_non_branching_subpaths(multipath_alignment_t& multipath_aln, + const unordered_set* prohibited_merges) { + + vector in_degree(multipath_aln.subpath_size(), 0); + vector has_inward_connection(multipath_aln.subpath_size()); + for (const subpath_t& subpath : multipath_aln.subpath()) { + for (auto next : subpath.next()) { + in_degree[next]++; + } + for (const auto& connection : subpath.connection()) { + has_inward_connection[connection.next()] = true; + } + } + + auto get_mergeable_next = [&](size_t idx) -> int64_t { + const subpath_t& subpath = multipath_aln.subpath(idx); + bool prohibited = false; + if (prohibited_merges) { + prohibited = prohibited_merges->count(idx); + } + if (!prohibited 
&& subpath.next_size() == 1 && subpath.connection_size() == 0 + && in_degree[subpath.next(0)] == 1 && !has_inward_connection[subpath.next(0)]) { + return subpath.next(0); + } + return -1; + }; + + vector removed(multipath_aln.subpath_size(), false); + + for (auto i : subpath_topological_order(multipath_aln, false)) { + + // this one has been marked for removal, + if (removed[i]) { + continue; + } + + // the subpath we might merge into + subpath_t* subpath = multipath_aln.mutable_subpath(i); + + int64_t last = -1; + // iterate through non-branching subpaths + for (int64_t j = get_mergeable_next(i); j >= 0; j = get_mergeable_next(j)) { + + // mark the next one for removal + removed[j] = true; + + subpath_t* merge_subpath = multipath_aln.mutable_subpath(j); + + subpath->set_score(subpath->score() + merge_subpath->score()); + + path_t* merge_path = merge_subpath->mutable_path(); + if (merge_path->mapping_size() == 0) { + continue; + } + + path_t* path = subpath->mutable_path(); + path_mapping_t* final_mapping = path->mutable_mapping(path->mapping_size() - 1); + const position_t& final_position = final_mapping->position(); + + path_mapping_t* first_mapping = merge_path->mutable_mapping(0); + const position_t& first_position = first_mapping->position(); + + int64_t mapping_idx = 0; + + // do we need to merge the abutting mappings? + if (first_position.node_id() == final_position.node_id() && + first_position.is_reverse() == final_position.is_reverse() && + first_position.offset() == final_position.offset() + mapping_from_length(*final_mapping)) { + // do we need to merge the abutting edits? + int64_t edit_idx = 0; + if (final_mapping->edit_size() && first_mapping->edit_size()) { + edit_t* final_edit = final_mapping->mutable_edit(final_mapping->edit_size() - 1); + const edit_t& first_edit = first_mapping->edit(0); + if ((first_edit.from_length() > 0) == (final_edit->from_length() > 0) && + (first_edit.to_length() > 0) == (final_edit->to_length() > 0) && + first_edit.sequence().empty() == final_edit->sequence().empty()) { + final_edit->set_from_length(final_edit->from_length() + first_edit.from_length()); + final_edit->set_to_length(final_edit->to_length() + first_edit.to_length()); + final_edit->set_sequence(final_edit->sequence() + first_edit.sequence()); + + edit_idx++; + } + } + + // append rest of the edits + for (; edit_idx < first_mapping->edit_size(); edit_idx++) { + *final_mapping->add_edit() = move(*first_mapping->mutable_edit(edit_idx)); + } + + mapping_idx++; + } + + for (; mapping_idx < merge_path->mapping_size(); mapping_idx++) { + *path->add_mapping() = move(*merge_path->mutable_mapping(mapping_idx)); + } + + last = j; + } + + // move the adjacencies over from the last one we merged in + if (last >= 0) { + subpath->clear_next(); + subpath->clear_connection(); + for (int64_t next : multipath_aln.subpath(last).next()) { + subpath->add_next(next); + } + for (const auto& connection : multipath_aln.subpath(last).connection()) { + *subpath->add_connection() = connection; + } + } + } + + // go back and do the removals + vector removed_so_far(multipath_aln.subpath_size(), 0); + for (size_t i = 0; i < multipath_aln.subpath_size(); i++) { + if (i > 0) { + removed_so_far[i] = removed_so_far[i - 1]; + } + + if (removed[i]) { + // this one has been marked for removal + removed_so_far[i]++; + continue; + } + + if (removed_so_far[i]) { + // move it up in the vector past the removed subpaths + *multipath_aln.mutable_subpath(i - removed_so_far[i]) = move(*multipath_aln.mutable_subpath(i)); + } + } + 
+ // did we merge and remove any subpaths? + if (!removed_so_far.empty() && removed_so_far.back()) { + // trim the vector of subpaths + multipath_aln.mutable_subpath()->resize(multipath_aln.subpath_size() - removed_so_far.back()); + + // update the indexes of the adjacencies + for (size_t i = 0; i < multipath_aln.subpath_size(); i++) { + subpath_t* subpath = multipath_aln.mutable_subpath(i); + for (size_t j = 0; j < subpath->next_size(); j++) { + subpath->set_next(j, subpath->next(j) - removed_so_far[subpath->next(j)]); + } + + for (size_t j = 0; j < subpath->connection_size(); j++) { + auto connection = subpath->mutable_connection(j); + connection->set_next(connection->next() - removed_so_far[connection->next()]); + } + } + + // update the indexes of the starts + for (size_t i = 0; i < multipath_aln.start_size(); ++i) { + multipath_aln.set_start(i, multipath_aln.start(i) - removed_so_far[multipath_aln.start(i)]); + } + } + } + + void connected_comps_do(const multipath_alignment_t& multipath_aln, + function& on_new_component, + function& on_new_node) { + + vector> reverse_edge_lists(multipath_aln.subpath_size()); + + for (int64_t i = 0; i < multipath_aln.subpath_size(); i++) { + const subpath_t& subpath = multipath_aln.subpath(i); + // collect edges and connections by their target + for (size_t j = 0; j < subpath.next_size(); j++) { + reverse_edge_lists[subpath.next(j)].push_back(i); + } + for (size_t j = 0; j < subpath.connection_size(); j++) { + reverse_edge_lists[subpath.connection(j).next()].push_back(i); + } + } + + vector collected(multipath_aln.subpath_size(), false); + + for (int64_t i = 0; i < multipath_aln.subpath_size(); i++) { + if (collected[i]) { + continue; + } + + // start traversing a new component + on_new_component(); + + vector stack{i}; + collected[i] = true; + while (!stack.empty()) { + int64_t at = stack.back(); + stack.pop_back(); + + // traverse a new node in the component + on_new_node(at); + + const subpath_t& subpath = multipath_aln.subpath(at); + for (int64_t j = 0; j < subpath.next_size(); j++) { + int64_t idx = subpath.next(j); + if (!collected[idx]) { + collected[idx] = true; + stack.push_back(idx); + } + } + for (int64_t j = 0; j < subpath.connection_size(); j++) { + int64_t idx = subpath.connection(j).next(); + if (!collected[idx]) { + collected[idx] = true; + stack.push_back(idx); + } + } + for (int64_t idx : reverse_edge_lists[at]) { + if (!collected[idx]) { + collected[idx] = true; + stack.push_back(idx); + } + } + } + } + } + + size_t num_connected_components(const multipath_alignment_t& multipath_aln) { + size_t num_comps = 0; + function on_new_component = [&](void) { + ++num_comps; + }; + function on_new_node = [](size_t i) { + // nothing to do + }; + connected_comps_do(multipath_aln, on_new_component, on_new_node); + return num_comps; + } + + vector> connected_components(const multipath_alignment_t& multipath_aln) { + + vector> components; + function on_new_component = [&](void) { + components.emplace_back(); + }; + function on_new_node = [&](size_t i) { + components.back().push_back(i); + }; + connected_comps_do(multipath_aln, on_new_component, on_new_node); + return components; + } + + void extract_sub_multipath_alignment(const multipath_alignment_t& multipath_aln, + const vector& subpath_indexes, + multipath_alignment_t& sub_multipath_aln) { + + transfer_read_metadata(multipath_aln, sub_multipath_aln); + + // create subpaths for each of the ones we're retaining and record the translation + unordered_map new_index; + for (int64_t i = 0; i < 
subpath_indexes.size(); i++) { + int64_t old_idx = subpath_indexes[i]; + const subpath_t& old_subpath = multipath_aln.subpath(old_idx); + + subpath_t* subpath = sub_multipath_aln.add_subpath(); + *subpath->mutable_path() = old_subpath.path(); + subpath->set_score(old_subpath.score()); + + new_index[old_idx] = i; + } + + // add edges according to the translation + for (int64_t i = 0; i < subpath_indexes.size(); i++) { + const subpath_t& old_subpath = multipath_aln.subpath(subpath_indexes[i]); + subpath_t* new_subpath = sub_multipath_aln.mutable_subpath(i); + for (int64_t j = 0; j < old_subpath.next_size(); j++) { + if (new_index.count(old_subpath.next(j))) { + new_subpath->add_next(new_index[old_subpath.next(j)]); + } + } + for (int64_t j = 0; j < old_subpath.connection_size(); j++) { + const connection_t& old_connection = old_subpath.connection(j); + if (new_index.count(old_connection.next())) { + connection_t* new_connection = new_subpath->add_connection(); + new_connection->set_next(new_index[old_connection.next()]); + new_connection->set_score(old_connection.score()); + } + } + } + + // assume that if we had starts labeled before, we want them again + if (multipath_aln.start_size() > 0) { + identify_start_subpaths(sub_multipath_aln); + } + } + + void append_multipath_alignment(multipath_alignment_t& multipath_aln, + const multipath_alignment_t& to_append) { + + size_t original_size = multipath_aln.subpath().size(); + for (const subpath_t& appending_subpath : to_append.subpath()) { + subpath_t* new_subpath = multipath_aln.add_subpath(); + new_subpath->set_score(appending_subpath.score()); + *new_subpath->mutable_path() = appending_subpath.path(); + new_subpath->mutable_next()->reserve(appending_subpath.next_size()); + for (auto n : appending_subpath.next()) { + new_subpath->add_next(n + original_size); + } + for (const auto& c : appending_subpath.connection()) { + auto new_connection = new_subpath->add_connection(); + new_connection->set_next(c.next() + original_size); + new_connection->set_score(c.score()); + } + } + if (multipath_aln.start_size() != 0 && to_append.start_size() != 0) { + for (auto s : to_append.start()) { + multipath_aln.add_start(s + original_size); + } + } + else if (multipath_aln.start_size() != 0) { + identify_start_subpaths(multipath_aln); + } + } + + bool contains_connection(const multipath_alignment_t& multipath_aln) { + bool no_connection = true; + for (size_t i = 0; i < multipath_aln.subpath_size() && no_connection; ++i) { + no_connection = multipath_aln.subpath(i).connection().empty(); + } + return !no_connection; + } + + vector> + search_multipath_alignment(const multipath_alignment_t& multipath_aln, + const pos_t& graph_pos, int64_t seq_pos) { + +#ifdef debug_search + cerr << "starting search for " << graph_pos << " at seq pos " << seq_pos << endl; +#endif + + vector> return_val; + + vector subpath_seq_pos(multipath_aln.subpath_size(), 0); + + for (int64_t i = 0; i < multipath_aln.subpath_size(); ++i) { +#ifdef debug_search + cerr << "subpath " << i << endl; +#endif + const subpath_t& subpath = multipath_aln.subpath(i); + const path_t& path = subpath.path(); + int64_t to_length_here = subpath_seq_pos[i]; + int64_t to_length_thru = to_length_here + path_to_length(subpath.path()); + // tell the next subpaths where they start + for (auto j : subpath.next()) { + subpath_seq_pos[j] = to_length_thru; + } + for (const auto& connection : subpath.connection()) { + subpath_seq_pos[connection.next()] = to_length_thru; + } + + if (to_length_here <= seq_pos && 
to_length_thru >= seq_pos) { + // this is where we might expect to find the sequence position + +#ifdef debug_search + cerr << "interval " << to_length_here << " " << to_length_thru << " covers seq pos " << seq_pos << endl; +#endif + + for (int64_t j = 0; j < path.mapping_size(); ++j) { + + const auto& mapping = path.mapping(j); + const auto& pos = mapping.position(); + +#ifdef debug_search + cerr << "mapping " << j << " at graph pos " << debug_string(pos) << endl; +#endif + + if (pos.node_id() == id(graph_pos) && pos.is_reverse() == is_rev(graph_pos)) { + int64_t offset_here = pos.offset(); + +#ifdef debug_search + cerr << "position " << debug_string(pos) << " consistent with graph pos " << graph_pos << endl; +#endif + + // this mapping is on the right node to be a match + + for (int64_t k = 0; k < mapping.edit_size(); ++k) { + + const auto& edit = mapping.edit(k); + int64_t to_length_thru_edit = to_length_here + edit.to_length(); + int64_t offset_thru_edit = offset_here + edit.from_length(); + +#ifdef debug_search + cerr << "edit " << k << ", to length interval " << to_length_here << " " << to_length_thru_edit << ", offset interval " << offset_here << " " << offset_thru_edit << endl; +#endif + + // does this edit contain both the sequence and graph positions (allowing + // for a past-the-last position on the final edit)? + if (to_length_here <= seq_pos && + (to_length_thru_edit > seq_pos || (to_length_thru_edit == seq_pos && + (k + 1 == mapping.edit_size() || + to_length_here == to_length_thru_edit))) && + offset_here <= offset(graph_pos) && + (offset_thru_edit > offset(graph_pos) || (offset_thru_edit == offset(graph_pos) && + (k + 1 == mapping.edit_size() || + offset_here == offset_thru_edit)))) { + + // are the offsets within the edit consistent with each other? + int64_t graph_l = offset(graph_pos) - offset_here; + int64_t seq_l = seq_pos - to_length_here; + bool consistent = (graph_l == seq_l || + (graph_l == 0 && edit.from_length() == 0) || + (seq_l == 0 && edit.to_length() == 0)); + +#ifdef debug_search + cerr << "read interval " << to_length_here << " " << to_length_thru_edit << " covers seq pos " << seq_pos << ", offset interval " << offset_here << " " << offset_thru_edit << " covers offset " << offset(graph_pos) << ", consistent? " << consistent << endl; +#endif + + // handle some special cases of the past-the-last position to make canonical results + // TODO: ugly + bool must_place_here = true; + if (consistent && k + 1 == mapping.edit_size() && to_length_thru_edit == seq_pos + && offset_thru_edit == offset(graph_pos)) { + // we're looking at locating this position at the past-the-last position on + // a mapping, but it might also exist at the first position of the next mapping. + // if so, we will canonicalize it to go there instead. 
+ +#ifdef debug_search + cerr << "checking if must place past-the-last" << endl; +#endif + if (j + 1 < path.mapping_size()) { + // the next mapping is still on this subpath + const auto& next_pos = path.mapping(j + 1).position(); + must_place_here &= (next_pos.node_id() != pos.node_id() + || next_pos.is_reverse() != pos.is_reverse() + || next_pos.offset() != offset_thru_edit); + } + else { + // we have to check the next subpaths + for (auto n : subpath.next()) { + const auto& next_pos = multipath_aln.subpath(n).path().mapping(0).position(); + must_place_here &= (next_pos.node_id() != pos.node_id() + || next_pos.is_reverse() != pos.is_reverse() + || next_pos.offset() != offset_thru_edit); + } + } + } + + if (consistent && must_place_here) { + // winner winner chicken dinner, record the match + int64_t l = max(seq_l, graph_l); +#ifdef debug_search + cerr << "recording match " << i << " " << j << " " << k << " " << l << endl; +#endif + + return_val.emplace_back(i, j, k, l); + } + + } + offset_here = offset_thru_edit; + to_length_here = to_length_thru_edit; + } + } + else { + to_length_here += mapping_to_length(mapping); + } + } + } + } + return return_val; + } + + pair, vector>> + trace_path(const multipath_alignment_t& multipath_aln, const Path& path, + int64_t subpath_idx, int64_t mapping_idx, int64_t edit_idx, int64_t base_idx, + bool search_left, int64_t search_limit) { + +#ifdef debug_trace + cerr << "entering trace path algorithm, searching left? " << search_left << endl; + cerr << "tracing path " << pb2json(path) << endl; + cerr << "start coordinate: " << subpath_idx << ", " << mapping_idx << ", " << edit_idx << ", " << base_idx << endl; + cerr << "search limit " << search_limit << endl; +#endif + pair, vector>> return_val; + auto& pfarthest = return_val.first; + auto& mfarthest = return_val.second; + + if (search_left) { + // we like to index the base as if it's from the left even though it's from the right + // to simplify some conditions later, so we have to reverse it now + const auto& start_path = multipath_aln.subpath(subpath_idx).path(); + if (mapping_idx < start_path.mapping_size()) { + const auto& start_mapping = start_path.mapping(mapping_idx); + if (edit_idx < start_mapping.edit_size()) { + const auto& start_edit = start_mapping.edit(edit_idx); + base_idx = max(start_edit.from_length(), start_edit.to_length()) - base_idx; +#ifdef debug_trace + cerr << "flip leftward base index on edit " << debug_string(start_edit) << " to " << base_idx << endl; +#endif + } + } + } + + // the farthest match along the path + pfarthest = search_left ? tuple(path.mapping_size(), 0, 0) + : tuple(-1, 0, 0); + + // the position on the mp aln that corresponds to this match + + mfarthest.emplace_back(subpath_idx, mapping_idx, edit_idx, base_idx); + + // DFS stack + vector stacked(multipath_aln.subpath_size(), false); + vector> stack; + + int64_t incr = search_left ? -1 : 1; + + // which end of the path are we starting on? 
+ int64_t p_start_mapping_idx = 0, p_start_edit_idx = 0; + if (search_left) { + p_start_mapping_idx = path.mapping_size() - 1; + if (p_start_mapping_idx >= 0) { + const auto& start_mapping = path.mapping(p_start_mapping_idx); + p_start_edit_idx = start_mapping.edit_size() - 1; + } + } + + // we may need reverse adjacencies if we're searching leftwards + vector> reverse_adjacencies; + if (search_left) { + reverse_adjacencies.resize(multipath_aln.subpath_size()); + for (int64_t i = 0; i < multipath_aln.subpath_size(); ++i) { + const auto& subpath = multipath_aln.subpath(i); + for (auto n : subpath.next()) { + reverse_adjacencies[n].push_back(i); + } + for (const auto& c : subpath.connection()) { + reverse_adjacencies[c.next()].push_back(i); + } + } + } + + // start at the indicated location on the mp aln and the beginning of the path + // note: the logic is simpler if we treat base indexes from 0 regardless of which + // direction w + stack.emplace_back(subpath_idx, mapping_idx, edit_idx, base_idx, + p_start_mapping_idx, p_start_edit_idx, 0); + stacked[subpath_idx] = true; + bool first_iter = true; + while (!stack.empty()) { + // load up the indexes of where we're going to look for a match + int64_t i, j, k, l, pj, pk, pl; + tie(i, j, k, l, pj, pk, pl) = stack.back(); + stack.pop_back(); + + + // the indexes of the last non-empty match for each index + int64_t ni, nj, nk, nl, npj, npk, npl; + tie(ni, nj, nk, nl, npj, npk, npl) = tie(i, j, k, l, pj, pk, pl); + bool any_new_matches = first_iter; + first_iter = false; + +#ifdef debug_trace + cerr << "destack (" << i << " " << j << " " << k << " " << l << ") (" << pj << " " << pk << " " << pl << ")" << endl; +#endif + + const subpath_t& subpath = multipath_aln.subpath(i); + const path_t& mpath = subpath.path(); + bool reached_mismatch = false; + while (j < mpath.mapping_size() && pj < path.mapping_size() && j >= 0 && pj >= 0 && + (search_left || pj < search_limit) && (!search_left || pj >= search_limit) && !reached_mismatch) { + +#ifdef debug_trace + cerr << "mp mapping " << j << ", p mapping " << pj << endl; +#endif + + const auto& mmapping = mpath.mapping(j); + const auto& pmapping = path.mapping(pj); + + // TODO: these mappings can actually have positions that are inconsistent + // but checking for consistency is tricky at mapping boundaries that + // also correspond to node boundaries + + // skip over the subpath mapping if it's empty + bool mnonempty = false; + for (int64_t m = k, n = l; m < mmapping.edit_size() && m >= 0 && !mnonempty; m += incr) { + const auto& edit = mmapping.edit(m); + int64_t rem = max(edit.from_length(), edit.to_length()) - n; + if (rem) { + mnonempty = true; + } + n = 0; + } + if (!mnonempty) { +#ifdef debug_trace + cerr << "mp mapping is empty" << endl; +#endif + l = 0; + j += incr; + if (search_left && j >= 0) { + k = mpath.mapping(j).edit_size() - 1; + } + else { + k = 0; + } + continue; + } + // skip over the path mapping if it's empty + bool pnonempty = false; + for (int64_t m = pk, n = pl; m < pmapping.edit_size() && m >= 0 && !pnonempty; m += incr) { + const auto& edit = pmapping.edit(m); + int64_t rem = max(edit.from_length(), edit.to_length()) - n; + if (rem) { + pnonempty = true; + } + n = 0; + } + if (!pnonempty) { +#ifdef debug_trace + cerr << "p mapping is empty" << endl; +#endif + pl = 0; + pj += incr; + if (search_left && pj >= 0) { + pk = path.mapping(pj).edit_size() - 1; + } + else { + pk = 0; + } + continue; + } + + // now that we know we're looking at non-empty mappings, the positions need to match 
+ + // TODO: it would be nice if we didn't need to iterate over the entire mapping + + // find the graph position on the subpath + const auto& mpos = mmapping.position(); + int64_t moffset = mpos.offset(); + for (int64_t m = 0; m < k; ++m) { + moffset += mmapping.edit(m).from_length(); + } + if (search_left) { + moffset += mmapping.edit(k).from_length(); + } + if (l > 0 && mmapping.edit(k).from_length() > 0) { + moffset += l * incr; + } + + // find the graph position on the path + const auto& ppos = pmapping.position(); + int64_t poffset = ppos.offset(); + for (int64_t m = 0; m < pk; ++m) { + poffset += pmapping.edit(m).from_length(); + } + if (search_left) { + poffset += pmapping.edit(pk).from_length(); + } + if (pl > 0 && pmapping.edit(pk).from_length() > 0) { + poffset += pl * incr; + } + +#ifdef debug_trace + cerr << "mp pos " << mpos.node_id() << " " << mpos.is_reverse() << " " << moffset << ", p pos " << ppos.node_id() << " " << ppos.is_reverse() << " " << poffset << endl; +#endif + + if (mpos.node_id() != ppos.node_id() || mpos.is_reverse() != ppos.is_reverse() + || moffset != poffset) { + // these positions don't match + reached_mismatch = true; + } + + + // try to match edits + while (k < mmapping.edit_size() && k >= 0 && pk < pmapping.edit_size() && pk >= 0 && !reached_mismatch) { +#ifdef debug_trace + cerr << "mp edit " << k << " " << l << ", p edit " << pk << " " << pl << endl; +#endif + const auto& medit = mmapping.edit(k); + const auto& pedit = pmapping.edit(pk); + if (medit.from_length() == 0 && medit.to_length() == 0) { + // skip over an empty edit + l = 0; + k += incr; +#ifdef debug_trace + cerr << "mp edit empty" << endl; +#endif + } + else if (pedit.from_length() == 0 && pedit.to_length() == 0) { + // skip over an empty edit + pl = 0; + pk += incr; +#ifdef debug_trace + cerr << "p edit empty" << endl; +#endif + } + else if ((medit.from_length() == 0) == (pedit.from_length() == 0) && + (medit.to_length() == 0) == (pedit.to_length() == 0) && + medit.sequence().empty() == pedit.sequence().empty()) { + + // the type of edit matches + + if ((medit.from_length() == 0 || medit.from_length() - l == pedit.from_length() - pl) && + (medit.to_length() == 0 || medit.to_length() - l == pedit.to_length() - pl)) { + // the size of edit matches + l = 0; + k += incr; + pl = 0; + pk += incr; + +#ifdef debug_trace + cerr << "edits match" << endl; +#endif + } + else if ((medit.from_length() == 0 || medit.from_length() - l < pedit.from_length() - pl) && + (medit.to_length() == 0 || medit.to_length() - l < pedit.to_length() - pl)) { + + // subpath edit is a prefix of path edit + pl += max(medit.from_length(), medit.to_length()) - l; + if (pl == max(pedit.from_length(), pedit.to_length())) { + // TODO: won't this never happen because of the earlier condition? + pl = 0; + pk += incr; + } + l = 0; + k += incr; +#ifdef debug_trace + cerr << "mp edit is prefix" << endl; +#endif + } + else { + // path edit is a prefix of subpath edit + l += max(pedit.from_length(), pedit.to_length()) - pl; + if (l == max(medit.from_length(), medit.to_length())) { + // TODO: won't this never happen because of the earlier condition? 
+ l = 0; + k += incr; + } + pl = 0; + pk += incr; +#ifdef debug_trace + cerr << "p edit is prefix" << endl; +#endif + } + // we made a non-empty match, update the non-empty index trackers + tie(ni, nj, nk, nl, npj, npk, npl) = tie(i, j, k, l, pj, pk, pl); + any_new_matches = true; + } + else { + // the edits do not match + reached_mismatch = true; +#ifdef debug_trace + cerr << "edits mismatch" << endl; +#endif + } + } + + // did we finish off either mapping? + if (k == mmapping.edit_size() || k < 0) { +#ifdef debug_trace + cerr << "finished mp mapping" << endl; +#endif + j += incr; + k = 0; + if (search_left && j >= 0) { + k = mpath.mapping(j).edit_size() - 1; + } + l = 0; + } + if (pk == pmapping.edit_size() || pk < 0) { +#ifdef debug_trace + cerr << "finished p mapping" << endl; +#endif + pj += incr; + pk = 0; + if (search_left && pj >= 0) { + pk = path.mapping(pj).edit_size() - 1; + } + pl = 0; + } } - else { - // sink subpaths become sources in reverse - reverse_starts.push_back(i); + // how far did we get along the path by walking this subpath (looking at non-empty matches only)? + if (any_new_matches) { + if ((search_left && (npj < get<0>(pfarthest) || + (npj == get<0>(pfarthest) && npk < get<1>(pfarthest)) || + (npj == get<0>(pfarthest) && npk == get<1>(pfarthest) && npl < get<2>(pfarthest)))) || + (!search_left && (npj > get<0>(pfarthest) || + (npj == get<0>(pfarthest) && npk > get<1>(pfarthest)) || + (npj == get<0>(pfarthest) && npk == get<1>(pfarthest) && npl > get<2>(pfarthest))))) { + // we've traversed more of the path than on any previous subpath +#ifdef debug_trace + cerr << "new farthest at subpath index " << ni << ", " << nj << ", " << nk << ", " << nl << " and path index " << npj << ", " << npk << ", " << npl << endl; +#endif + pfarthest = make_tuple(npj, npk, npl); + mfarthest.clear(); + mfarthest.emplace_back(ni, nj, nk, nl); + } + else if (npj == get<0>(pfarthest) && npk == get<1>(pfarthest) && npl == get<2>(pfarthest)) { + // we've tied the farthest we've gone along the path previously +#ifdef debug_trace + cerr << "tied existing farthest at subpath index " << ni << ", " << nj << ", " << nk << ", " << nl << endl; +#endif + mfarthest.emplace_back(ni, nj, nk, nl); + } } - } - - // add reversed edges - for (size_t i = 0; i < multipath_aln.subpath_size(); i++) { - Subpath* rc_subpath = rev_comp_out.mutable_subpath(i); - vector& reverse_edge_list = reverse_edge_lists[multipath_aln.subpath_size() - i - 1]; - for (size_t j = 0; j < reverse_edge_list.size(); j++) { - rc_subpath->add_next(multipath_aln.subpath_size() - reverse_edge_list[j] - 1); + + if (pj == path.mapping_size() || pj < 0) { + // we've found the farthest point possible + break; } - } - - // remove start nodes that are invalid in reverse - rev_comp_out.clear_start(); - - // assume that if the original multipath alignment had its starts labeled they want them - // labeled in the reverse complement too - if (multipath_aln.start_size() > 0) { - for (size_t i = 0; i < reverse_starts.size(); i++) { - rev_comp_out.add_start(multipath_aln.subpath_size() - reverse_starts[i] - 1); + if ((j == mpath.mapping_size() || j < 0) && !reached_mismatch) { + // we got to the end of the subpath without exhausting our match + if (search_left) { + for (auto n : reverse_adjacencies[i]) { + if (!stacked[n]) { + stacked[n] = true; + const auto& next_path = multipath_aln.subpath(n).path(); + int64_t next_mapping_idx = next_path.mapping_size() - 1; + int64_t next_edit_idx = 0; + if (next_mapping_idx >= 0) { + const auto& next_mapping = 
next_path.mapping(next_mapping_idx); + next_edit_idx = next_mapping.edit_size() - 1; + } + stack.emplace_back(n, next_mapping_idx, next_edit_idx, 0, pj, pk, pl); +#ifdef debug_trace + cerr << "stack up (" << n << " " << next_mapping_idx << " " << next_edit_idx << " " << 0 << ") (" << pj << " " << pk << " " << pl << ")" << endl; +#endif + } + } + } + else { + for (auto n : subpath.next()) { + if (!stacked[n]) { + stacked[n] = true; + stack.emplace_back(n, 0, 0, 0, pj, pk, pl); +#ifdef debug_trace + cerr << "stack up (" << n << " " << 0 << " " << 0 << " " << 0 << ") (" << pj << " " << pk << " " << pl << ")" << endl; +#endif + } + } + for (const auto& c : subpath.connection()) { + if (!stacked[c.next()]) { + stacked[c.next()] = true; + stack.emplace_back(c.next(), 0, 0, 0, pj, pk, pl); +#ifdef debug_trace + cerr << "stack up (" << c.next() << " " << 0 << " " << 0 << " " << 0 << ") (" << pj << " " << pk << " " << pl << ")" << endl; +#endif + } + } + } } } - } - - void rev_comp_multipath_alignment_in_place(MultipathAlignment* multipath_aln, - const function& node_length) { - - // reverse complement sequence - reverse_complement_in_place(*multipath_aln->mutable_sequence()); - // reverse base qualities - string* quality = multipath_aln->mutable_quality(); - std::reverse(quality->begin(), quality->end()); - - // current reverse edges - vector< vector > reverse_edge_lists(multipath_aln->subpath_size()); - // current sink nodes (will be starts) - vector reverse_starts; - int64_t subpath_swap_size = multipath_aln->subpath_size() / 2; - int64_t last = multipath_aln->subpath_size() - 1; - for (int64_t i = 0, j = last; i < subpath_swap_size; i++, j--) { - Subpath* subpath_1 = multipath_aln->mutable_subpath(i); - Subpath* subpath_2 = multipath_aln->mutable_subpath(j); + if (search_left) { + // we're set up to find past-the-first coordinates, but if we're going leftward + // what we want is actually the final coordinates - // add reverse edges for first subpath - if (subpath_1->next_size() > 0) { - for (int64_t k = 0; k < subpath_1->next_size(); k++) { - reverse_edge_lists[subpath_1->next(k)].push_back(i); - } +#ifdef debug_trace + cerr << "converting past-the-first coordinates to at-the-first" << endl; + cerr << "p farthest (" << get<0>(pfarthest) << " " << get<1>(pfarthest) << " " << get<2>(pfarthest) << ") -> "; +#endif + if (get<0>(pfarthest) < 0) { + get<0>(pfarthest) = 0; + } + else if (get<1>(pfarthest) < 0) { + get<1>(pfarthest) = 0; + // even though we interpreted the base index of 0 differently before, it's already what we want here } else { - reverse_starts.push_back(i); + // switch index of the base to from-the-end + const auto& final_edit = path.mapping(get<0>(pfarthest)).edit(get<1>(pfarthest)); + get<2>(pfarthest) = max(final_edit.from_length(), final_edit.to_length()) - get<2>(pfarthest); } +#ifdef debug_trace + cerr << "(" << get<0>(pfarthest) << " " << get<1>(pfarthest) << " " << get<2>(pfarthest) << ")" << endl; +#endif - // add reverse edges for second subpath - if (subpath_2->next_size() > 0) { - for (int64_t k = 0; k < subpath_2->next_size(); k++) { - reverse_edge_lists[subpath_2->next(k)].push_back(j); + for (auto& coord : mfarthest) { +#ifdef debug_trace + cerr << "m farthest (" << get<0>(coord) << " " << get<1>(coord) << " " << get<2>(coord) << " " << get<3>(coord) << ") -> "; +#endif + if (get<1>(coord) < 0) { + get<1>(coord) = 0; } + else if (get<2>(coord) < 0) { + get<2>(coord) = 0; + // even though we interpreted the base index of 0 differently before, it's already what we 
want here + } + else { + // switch index of the base to from-the-end + const auto& final_edit = multipath_aln.subpath(get<0>(coord)).path().mapping(get<1>(coord)).edit(get<2>(coord)); + get<3>(coord) = max(final_edit.from_length(), final_edit.to_length()) - get<3>(coord); + } +#ifdef debug_trace + cerr << "(" << get<0>(coord) << " " << get<1>(coord) << " " << get<2>(coord) << " " << get<3>(coord) << ")" << endl; +#endif } - else { - reverse_starts.push_back(j); - } + } + + return return_val; + } + + bool contains_match(const multipath_alignment_t& multipath_aln, const pos_t& pos, + int64_t read_pos, int64_t match_length) { + +#ifdef debug_find_match + cerr << "starting search for match at graph pos " << pos << ", read pos " << read_pos << ", length " << match_length << endl; + cerr << debug_string(multipath_aln) << endl; +#endif + + // to keep track of the read interval corresponding to each subpath + vector to_length(multipath_aln.subpath_size(), 0); + for (size_t i = 0; i < multipath_aln.subpath_size(); ++i) { + const subpath_t& subpath = multipath_aln.subpath(i); + const path_t& path = subpath.path(); - // clear current edges - subpath_1->clear_next(); - subpath_2->clear_next(); + int64_t to_length_here = to_length[i]; - // reverse complement the paths - reverse_complement_path_in_place(subpath_1->mutable_path(), node_length); - reverse_complement_path_in_place(subpath_2->mutable_path(), node_length); +#ifdef debug_find_match + cerr << "at subpath " << i << " at read pos " << to_length_here << endl; +#endif - // swap their positions (to maintain topological ordering) - std::swap(*subpath_1, *subpath_2); - } - - // repeat process for the middle subpath if there is an odd number - if (multipath_aln->subpath_size() % 2) { - Subpath* subpath = multipath_aln->mutable_subpath(subpath_swap_size); - if (subpath->next_size() > 0) { - for (int64_t k = 0; k < subpath->next_size(); k++) { - reverse_edge_lists[subpath->next(k)].push_back(subpath_swap_size); + for (size_t j = 0; j < path.mapping_size(); ++j) { + + const path_mapping_t& mapping = path.mapping(j); + const position_t& position = mapping.position(); + +#ifdef debug_find_match + cerr << "at mapping " << j << ", graph pos " << make_pos_t(position) << ", read pos " << to_length_here << endl; +#endif + + if (id(pos) == position.node_id() && is_rev(pos) == position.is_reverse()) { + // we're on the node with the position we're looking for + int64_t offset_here = position.offset(); + for (size_t k = 0; k < mapping.edit_size(); ++k) { + const edit_t& edit = mapping.edit(k); +#ifdef debug_find_match + cerr << "at edit " << k << ", offset " << offset_here << ", read pos " << to_length_here << endl; +#endif + if (offset_here <= offset(pos) && offset_here + edit.from_length() > offset(pos) && + to_length_here <= read_pos && to_length_here + edit.to_length() > read_pos + && (offset(pos) - offset_here) == (read_pos - to_length_here) + && edit.sequence().empty()) { + // we're going to pass a place where we might find a match + + +#ifdef debug_find_match + cerr << "starting a DFS for the match on edit " << k << ", offset " << offset_here << ", read pos " << to_length_here << ", initial walked " << offset(pos) - offset_here << endl; +#endif + + // do DFS to find the match + // records of (remaining, subpath idx, mapping idx, edit idx, which next) + vector> stack; + stack.emplace_back(offset(pos) - offset_here, i, j, k); + while (!stack.empty()) { + + int64_t walked; + size_t di, dj, dk; + tie(walked, di, dj, dk) = stack.back(); + stack.pop_back(); + + + 
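+ // look up the edit at this DFS location; only an edit with nonzero from- and
+ // to-length and an empty substituted sequence (i.e. an exact match) can extend the match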
const subpath_t& subpath_here = multipath_aln.subpath(di); + const path_t& path_here = subpath_here.path(); + const path_mapping_t& mapping_here = path_here.mapping(dj); + const edit_t& edit_here = mapping_here.edit(dk); + +#ifdef debug_find_match + cerr << "DFS location " << di << ", " << dj << ", " << dk << ", walked " << walked << ", edit " << debug_string(edit_here) << endl; +#endif + + if (edit_here.to_length() && edit_here.from_length() && edit_here.sequence().empty()) { + // this edit can continue the match + walked += edit_here.to_length(); + if (walked >= match_length) { + // we found the match + return true; + } + // advance by one edit + ++dk; + if (dk == mapping_here.edit_size()) { + // we're to the next mapping + dk = 0; + ++dj; + } + if (dj == path_here.mapping_size()) { + // we're at the boundary to the next subpaths + for (auto n : subpath_here.next()) { + stack.emplace_back(walked, n, 0, 0); +#ifdef debug_find_match + cerr << "queue up next subpath " << n << ", walked " << walked << endl; +#endif + } + } + else { + stack.emplace_back(walked, di, dj, dk); +#ifdef debug_find_match + cerr << "queue up next edit " << di << ", " << dj << ", " << dk << ", walked " << walked << endl; +#endif + } + } + } + + } + offset_here += edit.from_length(); + to_length_here += edit.to_length(); + } + } + else { + // we're not on the node we want, just record the read interval of this mapping + to_length_here += mapping_to_length(mapping); } - } - else { - reverse_starts.push_back(subpath_swap_size); } - subpath->clear_next(); - reverse_complement_path_in_place(subpath->mutable_path(), node_length); - } - - // add reversed edges - for (int64_t i = 0, j = last; i < multipath_aln->subpath_size(); i++, j--) { - vector edges = reverse_edge_lists[j]; - Subpath* subpath = multipath_aln->mutable_subpath(i); - for (int64_t k : edges) { - subpath->add_next(last - k); + // record the read interval of the successors + for (auto n : subpath.next()) { + to_length[n] = to_length_here; } - } - - // if we had starts labeled before, label them again - if (multipath_aln->start_size() > 0) { - multipath_aln->clear_start(); - for (int64_t i : reverse_starts) { - multipath_aln->add_start(last - i); + for (const auto& connection : subpath.connection()) { + to_length[connection.next()] = to_length_here; } } - } - - void to_multipath_alignment(const Alignment& aln, MultipathAlignment& multipath_aln_out) { - - // clear repeated fields - multipath_aln_out.clear_subpath(); - multipath_aln_out.clear_start(); - - // transfer read and alignment metadata - transfer_read_metadata(aln, multipath_aln_out); - multipath_aln_out.set_mapping_quality(aln.mapping_quality()); - - // transfer alignment and score - if (aln.has_path() || aln.score()) { - Subpath* subpath = multipath_aln_out.add_subpath(); - subpath->set_score(aln.score()); - *(subpath->mutable_path()) = aln.path(); - } + // we never found the match + return false; } - - void transfer_read_metadata(const MultipathAlignment& from, MultipathAlignment& to) { - to.set_sequence(from.sequence()); - to.set_quality(from.quality()); - to.set_read_group(from.read_group()); - to.set_name(from.name()); - to.set_sample_name(from.sample_name()); - to.set_paired_read_name(from.paired_read_name()); - } - - void transfer_read_metadata(const Alignment& from, MultipathAlignment& to) { - to.set_sequence(from.sequence()); - to.set_quality(from.quality()); - to.set_read_group(from.read_group()); - to.set_name(from.name()); - to.set_sample_name(from.sample_name()); + + // TODO: does it really 
make sense to split this algorithm into a separate surject + // and CIGAR conversion? it seems like i'm replicating a lot of the same work + vector> cigar_against_path(const multipath_alignment_t& multipath_aln, const string& path_name, + bool rev, int64_t path_pos, const PathPositionHandleGraph& graph, + int64_t min_splice_length) { - // no difference in these fields for MultipathAlignments - if (from.has_fragment_prev()) { - to.set_paired_read_name(from.fragment_prev().name()); - } - else if (from.has_fragment_next()) { - to.set_paired_read_name(from.fragment_next().name()); +#ifdef debug_cigar + cerr << "converting mp aln to CIGAR on path " << path_name << ", rev? " << rev << ", pos " << path_pos << endl; + cerr << debug_string(multipath_aln) << endl; +#endif + vector> cigar; + if (path_pos < 0) { + // read is unmapped + return cigar; } - } - - void transfer_read_metadata(const MultipathAlignment& from, Alignment& to) { - to.set_sequence(from.sequence()); - to.set_quality(from.quality()); - to.set_read_group(from.read_group()); - to.set_name(from.name()); - to.set_sample_name(from.sample_name()); - // note: not transferring paired_read_name because it is unclear whether - // it should go into fragment_prev or fragment_next - } - - void merge_non_branching_subpaths(MultipathAlignment& multipath_aln) { - - vector in_degree(multipath_aln.subpath_size(), 0); - for (const Subpath& subpath : multipath_aln.subpath()) { - for (int64_t next : subpath.next()) { - in_degree[next]++; - } - } + // a graph of runs of alignments to the path + // records of (subpath index, mapping index, num_mappings, final step, from connection, adj list of (index, distance)) + vector>>> run_graph; - vector removed(multipath_aln.subpath_size(), false); - vector removed_so_far(multipath_aln.subpath_size(), 0); + path_handle_t path_handle = graph.get_path_handle(path_name); - auto get_mergeable_next = [&](const Subpath& subpath) { - if (subpath.next_size() == 1) { - if (in_degree[subpath.next(0)] == 1) { - return int64_t(subpath.next(0)); - } - } - return int64_t(-1); - }; + // the runs from the previous node that could be extended in the current iteration + // second value in the pair is the number of bases remaining until the end of node + unordered_map> curr_runs; - for (size_t i = 0; i < multipath_aln.subpath_size(); i++) { - - if (i > 0) { - removed_so_far[i] = removed_so_far[i - 1]; - } - - // this one has been marked for removal, - if (removed[i]) { - removed_so_far[i]++; - continue; - } + size_t num_mappings = 0; + for (size_t i = 0; i < multipath_aln.subpath_size(); ++i) { - // the subpath we might merge into - Subpath* subpath = multipath_aln.mutable_subpath(i); + const auto& subpath = multipath_aln.subpath(i); + const auto& path = subpath.path(); - // move it up in the vector if we've removed earlier subpaths - if (removed_so_far[i] > 0) { - *multipath_aln.mutable_subpath(i - removed_so_far[i]) = move(*subpath); - subpath = multipath_aln.mutable_subpath(i - removed_so_far[i]); + if (subpath.next_size() + subpath.connection_size() > 1) { + cerr << "error: cannot convert a multipath alignment to a CIGAR unless is consists of a single non-branching path" << endl; + exit(1); } - int64_t last = -1; - // iterate through non-branching subpaths - for (int64_t j = get_mergeable_next(*subpath); j >= 0; j = get_mergeable_next(multipath_aln.subpath(j))) { - - // mark the next one for removal - removed[j] = true; + for (size_t j = 0; j < path.mapping_size(); ++j, ++num_mappings) { - const Subpath& merge_subpath = 
multipath_aln.subpath(j); + const auto& mapping = path.mapping(j); + const auto& pos = mapping.position(); - subpath->set_score(subpath->score() + merge_subpath.score()); +#ifdef debug_cigar + cerr << "look for runs on mapping " << i << " " << j << ": " << debug_string(mapping) << endl; +#endif - const Path& merge_path = merge_subpath.path(); - if (merge_path.mapping_size() == 0) { + if (i == 0 && j == 0 && !rev) { + // base case if the position is given for the beginning of the surjected alignment + auto step = graph.get_step_at_position(path_handle, path_pos); +#ifdef debug_cigar + cerr << "got step " << graph.get_id(graph.get_handle_of_step(step)) << " " << graph.get_is_reverse(graph.get_handle_of_step(step)) << " as path pos " << graph.get_position_of_step(step) << endl; +#endif + if (graph.get_id(graph.get_handle_of_step(step)) != pos.node_id() + || graph.get_is_reverse(graph.get_handle_of_step(step)) != pos.is_reverse() + || path_pos - graph.get_position_of_step(step) != pos.offset()) { + // the step doesn't match our starting position, but this can sometimes happen when + // a position occurs right at a node boundary + auto prev_step = graph.get_previous_step(step); + +#ifdef debug_cigar + cerr << "didn't match, walk back to " << graph.get_id(graph.get_handle_of_step(prev_step)) << " " << graph.get_is_reverse(graph.get_handle_of_step(prev_step)) << " as path pos " << graph.get_position_of_step(prev_step) << endl; +#endif + if (prev_step != graph.path_front_end(path_handle) + && graph.get_id(graph.get_handle_of_step(prev_step)) == pos.node_id() + && graph.get_is_reverse(graph.get_handle_of_step(prev_step)) == pos.is_reverse() + && graph.get_length(graph.get_handle_of_step(prev_step)) == pos.offset()) { + step = prev_step; + } + else { + cerr << "error: couldn't find a matching subpath on " << path_name << " for read " << multipath_aln.sequence() << endl; + exit(1); + } + } + run_graph.emplace_back(0, 0, 1, step, false, vector>()); + curr_runs[step] = pair(0, graph.get_length(graph.get_handle_of_step(step)) + - mapping_from_length(mapping) - pos.offset()); + +#ifdef debug_cigar + cerr << "initializing on forward strand with step on " << graph.get_id(graph.get_handle_of_step(step)) << " " << graph.get_is_reverse(graph.get_handle_of_step(step)) << " at pos " << graph.get_position_of_step(step) << endl; +#endif continue; } - Path* path = subpath->mutable_path(); - Mapping* final_mapping = path->mutable_mapping(path->mapping_size() - 1); - const Position& final_position = final_mapping->position(); + auto from_length = mapping_from_length(mapping); - const Mapping& first_mapping = merge_path.mapping(0); - const Position& first_position = first_mapping.position(); - - int64_t mapping_idx = 0; + // get the next mapping's position, if there is one + const position_t* next_pos = nullptr; + if (j + 1 < path.mapping_size()) { + next_pos = &path.mapping(j + 1).position(); + } + else if (i + 1 < multipath_aln.subpath_size()) { + next_pos = &multipath_aln.subpath(i + 1).path().mapping().front().position(); + } - // do we need to merge the abutting mappings? 
- if (first_position.node_id() == final_position.node_id() && - first_position.is_reverse() == final_position.is_reverse() && - first_position.offset() == final_position.offset() + mapping_from_length(*final_mapping)) { + if (next_pos && next_pos->node_id() == pos.node_id() && next_pos->is_reverse() == pos.is_reverse() + && next_pos->offset() == pos.offset() + from_length && !curr_runs.empty()) { + // we only care about transitions that are between nodes, this one is within a node - // do we need to merge the abutting edits? - int64_t edit_idx = 0; - if (final_mapping->edit_size() && first_mapping.edit_size()) { - Edit* final_edit = final_mapping->mutable_edit(0); - const Edit& first_edit = first_mapping.edit(0); - if ((first_edit.from_length() > 0) == (final_edit->from_length() > 0) && - (first_edit.to_length() > 0) == (final_edit->to_length() > 0) && - first_edit.sequence().empty() == final_edit->sequence().empty()) { - - final_edit->set_from_length(final_edit->from_length() + first_edit.from_length()); - final_edit->set_to_length(final_edit->to_length() + first_edit.to_length()); - final_edit->set_sequence(final_edit->sequence() + first_edit.sequence()); - - edit_idx++; +#ifdef debug_cigar + cerr << "transition for " << i << " " << j << " does not exit node, stalling current runs" << endl; +#endif + + // but keep track of the number of mapping in each run + for (pair>& curr_run : curr_runs) { + ++get<2>(run_graph[curr_run.second.first]); + curr_run.second.second -= from_length; + } + continue; + } + + // remember where previously created run nodes stop + size_t num_runs_before_extend = run_graph.size(); + + // is this step across a splice connection? + bool across_connection = i > 0 ? j == 0 && !multipath_aln.subpath(i - 1).connection().empty() : false; + + // check whether steps on this handle extend previous runs or not + unordered_map> next_runs; + next_runs.reserve(curr_runs.size()); + handle_t handle = graph.get_handle(pos.node_id(), pos.is_reverse()); +#ifdef debug_cigar + cerr << "iterating over steps on " << graph.get_id(handle) << " " << graph.get_is_reverse(handle) << endl; + cerr << "curr runs to extend:" << endl; + for (const auto& run : curr_runs) { + auto h = graph.get_handle_of_step(run.first); + cerr << "\trun " << run.second.first << ", node " << graph.get_id(h) << " " << graph.get_is_reverse(h) << ", rem " << run.second.second << ", pos " << graph.get_position_of_step(run.first) << endl; + } + cerr << "crossing a connection? " << across_connection << endl; +#endif + + // check that we can extend all of the current runs, but not when going over + // a connection or not at the start of a node + // note: we only extend any run if all of the runs can extend, or else we might miss + // adjacencies into the middle of a run + bool all_extendable = false; + if (!across_connection && pos.offset() == 0) { + all_extendable = graph.for_each_step_on_handle(handle, [&](const step_handle_t& step) { + if (graph.get_path_handle_of_step(step) != path_handle || + (graph.get_handle_of_step(step) != handle) != rev) { + // we're only concerned about one strand of this one path + return true; } + step_handle_t prev = rev ? graph.get_next_step(step) : graph.get_previous_step(step); + auto it = curr_runs.find(prev); + return it == curr_runs.end() ? 
false : it->second.second == 0; + }); + } + + graph.for_each_step_on_handle(handle, [&](const step_handle_t& step) { + if (graph.get_path_handle_of_step(step) != path_handle || + (graph.get_handle_of_step(step) != handle) != rev) { + // we're only concerned about one strand of this one path + return; } - // append rest of the edits - for (; edit_idx < first_mapping.edit_size(); edit_idx++) { - *final_mapping->add_edit() = first_mapping.edit(edit_idx); + size_t remaining = (graph.get_length(graph.get_handle_of_step(step)) + - from_length - pos.offset()); + + if (all_extendable) { + // this is the next step we would expect along a previous run, and it's + // not across a connection + step_handle_t prev = rev ? graph.get_next_step(step) : graph.get_previous_step(step); + auto it = curr_runs.find(prev); + next_runs[step] = make_pair(it->second.first, remaining); + auto& run_node = run_graph[it->second.first]; +#ifdef debug_cigar + cerr << "extending run " << it->second.first << " from step at " << graph.get_position_of_step(get<3>(run_node)) << " to step at " << graph.get_position_of_step(step) << endl; +#endif + + ++get<2>(run_node); + get<3>(run_node) = step; } - - mapping_idx++; - } + else { + // we're at the start of a new run, or we just crossed a connection, start + // a new run + next_runs[step] = make_pair(run_graph.size(), remaining); + run_graph.emplace_back(i, j, 1, step, across_connection, vector>()); +#ifdef debug_cigar + cerr << "new run " << run_graph.size() - 1 << " for step at " << graph.get_position_of_step(step) << " and subpath indexes " << i << " " << j << endl; +#endif + } + }); - // append rest of the mappings - for (; mapping_idx < merge_path.mapping_size(); mapping_idx++) { - *path->add_mapping() = merge_path.mapping(mapping_idx); + // TODO: are there situations where i would need to split a run into multiple chunks + // in order to find the full length? + if (!all_extendable) { + // check if any of the unextended runs can make a long-distance adjacency + // to the new runs + + for (const auto& curr_run : curr_runs) { + // an unextended run from the previous iteration + for (size_t run_idx = num_runs_before_extend; run_idx < run_graph.size(); ++run_idx) { + // fresh new run that we just found + auto& run_node = run_graph[run_idx]; + + int64_t dist; + if (rev) { + dist = (graph.get_position_of_step(curr_run.first) + - graph.get_position_of_step(get<3>(run_node)) + + mapping.position().offset() + curr_run.second.second + - graph.get_length(graph.get_handle_of_step(get<3>(run_node)))); + } + else { + dist = (graph.get_position_of_step(get<3>(run_node)) + - graph.get_position_of_step(curr_run.first) + + mapping.position().offset() + curr_run.second.second + - graph.get_length(graph.get_handle_of_step(curr_run.first))); + } + if (dist >= 0) { + // they are in increasing order (relative to the strand) + + // add an edge + get<5>(run_graph[curr_run.second.first]).emplace_back(run_idx, dist); +#ifdef debug_cigar + cerr << "adjacency of length " << dist << " from " << curr_run.second.first << " to " << run_idx << endl; +#endif + } + } + } } - last = j; - } - - // move the adjacencies over from the last one we merged in - if (last >= 0) { - subpath->clear_next(); - for (int64_t next : multipath_aln.subpath(last).next()) { - subpath->add_next(next); - } + curr_runs = move(next_runs); } } - // did we merge and remove any subpaths? 
- if (removed_so_far.back()) { - // trim the vector of subpaths - multipath_aln.mutable_subpath()->DeleteSubrange(multipath_aln.subpath_size() - removed_so_far.back(), - removed_so_far.back()); + // okay, now we did that whole business, it's time to find the path through the run graph + // that corresponds to the surjected alignment + +#ifdef debug_cigar + cerr << "doing mapping length DP with a total number of mappings " << num_mappings << endl; +#endif + + // find the longest path, measured by number of mappings (should consume the whole alignment) + vector mapping_dp(run_graph.size(), 0); + vector full_length(mapping_dp.size(), false); + for (size_t i = 0; i < mapping_dp.size(); ++i) { - // update the indexes of the adjacencies - for (size_t i = 0; i < multipath_aln.subpath_size(); i++) { - Subpath* subpath = multipath_aln.mutable_subpath(i); - for (size_t j = 0; j < subpath->next_size(); j++) { - subpath->set_next(j, subpath->next(j) - removed_so_far[subpath->next(j)]); - } + const auto& run_node = run_graph[i]; + mapping_dp[i] += get<2>(run_node); + for (const auto& edge : get<5>(run_node)) { + mapping_dp[edge.first] = max(mapping_dp[i], mapping_dp[edge.first]); } +#ifdef debug_cigar + cerr << "\t" << i << ": " << mapping_dp[i] << endl; +#endif + + // does it complete a full traversal of the alignment? + full_length[i] = (mapping_dp[i] == num_mappings); } - } - - vector> connected_components(const MultipathAlignment& multipath_aln) { - int64_t comps = 0; - - vector> reverse_edge_lists(multipath_aln.subpath_size()); +#ifdef debug_cigar + cerr << "identifying full length run combinations" << endl; +#endif - for (int64_t i = 0; i < multipath_aln.subpath_size(); i++) { - const Subpath& subpath = multipath_aln.subpath(i); - // collect edges by their target - for (size_t j = 0; j < subpath.next_size(); j++) { - reverse_edge_lists[subpath.next(j)].push_back(i); + // identify the run nodes that could be part of a full length alignment + for (int64_t i = full_length.size() - 1; i >= 0; --i) { + for (const auto& edge : get<5>(run_graph[i])) { + full_length[i] = full_length[i] || full_length[edge.first]; } +#ifdef debug_cigar + cerr << "\t" << i << ": " << full_length[i] << endl; +#endif } - vector collected(multipath_aln.subpath_size(), false); - - vector> components; +#ifdef debug_cigar + cerr << "doing path distance DP" << endl; +#endif - for (int64_t i = 0; i < multipath_aln.subpath_size(); i++) { - if (collected[i]) { + // identify the shortest path through the run graph based on total path length + vector path_dist_dp(run_graph.size(), numeric_limits::max()); + vector> backpointer(path_dist_dp.size(), make_pair(-1, -1)); + int64_t best = -1; + for (int64_t i = 0; i < path_dist_dp.size(); ++i) { + if (!full_length[i]) { + // this node can't be part of a full length alignment so we don't want + // to find paths through it continue; } - - components.emplace_back(); - - vector stack{i}; - collected[i] = true; - while (!stack.empty()) { - int64_t at = stack.back(); - stack.pop_back(); - - components.back().push_back(at); - - const Subpath& subpath = multipath_aln.subpath(at); - for (int64_t j = 0; j < subpath.next_size(); j++) { - int64_t idx = subpath.next(j); - if (!collected[idx]) { - collected[idx] = true; - stack.push_back(idx); - } + const auto& run_node = run_graph[i]; + if (get<0>(run_node) == 0 && get<1>(run_node) == 0) { + // base case + path_dist_dp[i] = 0; + } + for (const auto& edge : get<5>(run_node)) { + size_t dist_thru = path_dist_dp[i] + edge.second; + if (dist_thru < 
path_dist_dp[edge.first]) { + backpointer[edge.first] = pair(i, edge.second); + path_dist_dp[edge.first] = dist_thru; } - for (int64_t idx : reverse_edge_lists[at]) { - if (!collected[idx]) { - collected[idx] = true; - stack.push_back(idx); + } + if (mapping_dp[i] == num_mappings) { +#ifdef debug_cigar + cerr << "\tpossible end " << i << ": is full length? " << full_length[i] << ", path dist " << path_dist_dp[i] << endl; +#endif + if (rev) { + const auto& final_mapping = multipath_aln.subpath().back().path().mapping().back(); + int64_t path_pos_here = (graph.get_position_of_step(get<3>(run_node)) + + graph.get_length(graph.get_handle_of_step(get<3>(run_node))) + - final_mapping.position().offset() + - mapping_from_length(final_mapping)); + if (path_pos_here != path_pos) { + // this run doesn't end where it should based on our path position, it can't + // be part of the CIGAR +#ifdef debug_cigar + cerr << "path pos doesn't match expected " << path_pos << ", instead got " << path_pos_here << " from step pos " << graph.get_position_of_step(get<3>(run_node)) << ", node length " << graph.get_length(graph.get_handle_of_step(get<3>(run_node))) << " mapping offset " << final_mapping.position().offset() << " and mapping len " << mapping_from_length(final_mapping) << endl; +#endif + continue; } } + + if (best == -1 || path_dist_dp[i] < path_dist_dp[best]) { + // this is the shortest full length run sequence we've seen + best = i; + } } } - return std::move(components); - } - - void extract_sub_multipath_alignment(const MultipathAlignment& multipath_aln, - const vector& subpath_indexes, - MultipathAlignment& sub_multipath_aln) { - sub_multipath_aln.Clear(); - transfer_read_metadata(multipath_aln, sub_multipath_aln); + if (best == -1) { + cerr << "error: couldn't find a matching subpath on " << path_name << " for read " << multipath_aln.sequence() << endl; + exit(1); + } - // create subpaths for each of the ones we're retaining and record the translation - unordered_map new_index; - for (int64_t i = 0; i < subpath_indexes.size(); i++) { - int64_t old_idx = subpath_indexes[i]; - const Subpath& old_subpath = multipath_aln.subpath(old_idx); +#ifdef debug_cigar + cerr << "backtracing" << endl; +#endif + + // compute the traceback + vector traceback(1, best); + while (backpointer[traceback.back()].first != -1) { + traceback.push_back(backpointer[traceback.back()].first); + } + +#ifdef debug_cigar + cerr << "forming CIGAR string" << endl; +#endif + + // now finally make the CIGAR + for (int64_t i = traceback.size() - 1; i >= 0; --i) { - Subpath* subpath = sub_multipath_aln.add_subpath(); - *subpath->mutable_path() = old_subpath.path(); - subpath->set_score(old_subpath.score()); +#ifdef debug_cigar + cerr << "forward trace to " << traceback[i] << endl; +#endif - new_index[old_idx] = i; + auto& run_node = run_graph[traceback[i]]; + + // handle the edge between these runs + if (i != traceback.size() - 1) { + int64_t dist = backpointer[traceback[i]].second; +#ifdef debug_cigar + cerr << "handling adjacency of length " << dist << endl; +#endif + if (dist >= min_splice_length || get<4>(run_node)) { + // let this be a splice either because it's long or because it's across a connection + cigar.emplace_back(dist, 'N'); + } + else if (!cigar.empty() && cigar.back().second == 'D') { + cigar.back().first += dist; + } + else { + cigar.emplace_back(dist, 'D'); + } + } + + // determine the bounds of the iteration over the mp aln that corresponds + // to this run + size_t j = get<0>(run_node); + size_t k = 
get<1>(run_node); + size_t j_end, k_end; + if (i > 0) { + auto& next_run_node = run_graph[traceback[i - 1]]; + j_end = get<0>(next_run_node); + k_end = get<1>(next_run_node); + } + else { + j_end = multipath_aln.subpath_size(); + k_end = 0; + } + +#ifdef debug_cigar + cerr << "iteration bounds are " << j << " " << k << " to " << j_end << " " << k_end << endl; +#endif + + // convert this segment of the mp aln into CIGAR + while (j != j_end || k != k_end) { + const auto& path = multipath_aln.subpath(j).path(); + const auto& mapping = path.mapping(k); +#ifdef debug_cigar + cerr << "on mapping " << debug_string(mapping) << endl; +#endif + for (const auto& edit : mapping.edit()) { + char cigar_code; + int length; + if (edit.from_length() == edit.to_length()) { + cigar_code = 'M'; + length = edit.from_length(); + } + else if (edit.from_length() > 0 && edit.to_length() == 0) { + cigar_code = 'D'; + length = edit.from_length(); + } + else if (edit.to_length() > 0 && edit.from_length() == 0) { + cigar_code = 'I'; + length = edit.to_length(); + } + else { + throw std::runtime_error("Spliced CIGAR construction can only convert simple edits"); + } + + if (!cigar.empty() && cigar_code == cigar.back().second) { + cigar.back().first += length; + } + else { + cigar.emplace_back(length, cigar_code); + } + } + k++; + if (k == path.mapping_size()) { + ++j; + k = 0; + } + } } - // add edges according to the translation - for (int64_t i = 0; i < subpath_indexes.size(); i++) { - const Subpath& old_subpath = multipath_aln.subpath(subpath_indexes[i]); - Subpath* new_subpath = sub_multipath_aln.mutable_subpath(i); - for (int64_t j = 0; j < old_subpath.next_size(); j++) { - if (new_index.count(old_subpath.next(j))) { - new_subpath->add_next(new_index[old_subpath.next(j)]); - } + // change start/end insertions into softclips + if (!cigar.empty()) { + if (cigar.front().second == 'I') { + cigar.front().second = 'S'; + } + if (cigar.back().second == 'I') { + cigar.back().second = 'S'; } } - // assume that if we had starts labeled before, we want them again - if (multipath_aln.start_size() > 0) { - identify_start_subpaths(sub_multipath_aln); + if (rev) { + // return the cigar relative to the forward strand + for (size_t i = 0, end = cigar.size() / 2; i < end; ++i) { + swap(cigar[i], cigar[cigar.size() - i - 1]); + } + } + +#ifdef debug_cigar + cerr << "got cigar: "; + for (auto& cigar_record : cigar) { + cerr << cigar_record.first << cigar_record.second; + } + cerr << endl; + cerr << "coalescing runs of I/D..." << endl; +#endif + + simplify_cigar(cigar); + +#ifdef debug_cigar + cerr << "final cigar: "; + for (auto& cigar_record : cigar) { + cerr << cigar_record.first << cigar_record.second; } + cerr << endl; +#endif + + return cigar; } - - bool validate_multipath_alignment(const MultipathAlignment& multipath_aln, const HandleGraph& handle_graph) { - + bool validate_multipath_alignment(const multipath_alignment_t& multipath_aln, const HandleGraph& handle_graph) { + // are the subpaths in topological order? 
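+ // i.e. every stored adjacency must point from a lower subpath index to a strictly higher one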
for (size_t i = 0; i < multipath_aln.subpath_size(); i++) { - const Subpath& subpath = multipath_aln.subpath(i); + const subpath_t& subpath = multipath_aln.subpath(i); for (size_t j = 0; j < subpath.next_size(); j++) { if (subpath.next(j) <= i) { #ifdef debug_verbose_validation @@ -1247,10 +3865,13 @@ namespace vg { if (multipath_aln.start_size()) { vector is_source(multipath_aln.subpath_size(), true); for (size_t i = 0; i < multipath_aln.subpath_size(); i++) { - const Subpath& subpath = multipath_aln.subpath(i); + const subpath_t& subpath = multipath_aln.subpath(i); for (size_t j = 0; j < subpath.next_size(); j++) { is_source[subpath.next(j)] = false; } + for (const auto& connection : subpath.connection()) { + is_source[connection.next()] = false; + } } size_t num_starts = 0; @@ -1260,7 +3881,7 @@ namespace vg { if (num_starts != multipath_aln.start_size()) { #ifdef debug_verbose_validation - cerr << "validation failure on correct number of starts" << endl; + cerr << "validation failure on correct number of starts, says " << multipath_aln.start_size() << " but actually " << num_starts << endl; for (size_t i = 0; i < multipath_aln.subpath_size(); i++) { if (is_source[i]) { cerr << i << " "; @@ -1296,21 +3917,23 @@ namespace vg { subpath_read_interval[i].first = 0; } - const Subpath& subpath = multipath_aln.subpath(i); + const subpath_t& subpath = multipath_aln.subpath(i); int64_t subsequence_length = path_to_length(subpath.path()); subpath_read_interval[i].second = subpath_read_interval[i].first + subsequence_length; - if (!subpath.next_size()) { + if (subpath.next_size() == 0 && subpath.connection_size() == 0) { if (subpath_read_interval[i].second != multipath_aln.sequence().size()) { #ifdef debug_verbose_validation cerr << "validation failure on using complete read" << endl; cerr << "subpath " << i << " ends on sequence index " << subpath_read_interval[i].second << " of " << multipath_aln.sequence().size() << endl; - cerr << pb2json(subpath) << endl; for (size_t j = 0; j < multipath_aln.subpath_size(); j++) { cerr << j << " (" << subpath_read_interval[j].first << ", " << subpath_read_interval[j].second << "): "; for (size_t k = 0; k < multipath_aln.subpath(j).next_size(); k++) { cerr << multipath_aln.subpath(j).next(k) << " "; } + for (auto& connection : subpath.connection()) { + cerr << connection.next() << " "; + } cerr << endl; } #endif @@ -1322,7 +3945,7 @@ namespace vg { if (subpath_read_interval[subpath.next(j)].first >= 0) { if (subpath_read_interval[subpath.next(j)].first != subpath_read_interval[i].second) { #ifdef debug_verbose_validation - cerr << "validation failure on read contiguity" << endl; + cerr << "validation failure on read contiguity from subpath " << i << " with read interval " << subpath_read_interval[i].first << ":" << subpath_read_interval[i].second << " to next " << subpath.next(j) << " with read interval " << subpath_read_interval[subpath.next(j)].first << ":" << subpath_read_interval[subpath.next(j)].second << endl; #endif return false; } @@ -1331,6 +3954,19 @@ namespace vg { subpath_read_interval[subpath.next(j)].first = subpath_read_interval[i].second; } } + for (const auto& connection : subpath.connection()) { + if (subpath_read_interval[connection.next()].first >= 0) { + if (subpath_read_interval[connection.next()].first != subpath_read_interval[i].second) { +#ifdef debug_verbose_validation + cerr << "validation failure on read contiguity from subpath " << i << " with read interval " << subpath_read_interval[i].first << ":" << 
subpath_read_interval[i].second << " to connection " << connection.next() << " with read interval " << subpath_read_interval[connection.next()].first << ":" << subpath_read_interval[connection.next()].second << endl; +#endif + return false; + } + } + else { + subpath_read_interval[connection.next()].first = subpath_read_interval[i].second; + } + } } } @@ -1340,7 +3976,7 @@ namespace vg { if (multipath_aln.subpath(i).path().mapping_size() == 0) { #ifdef debug_verbose_validation cerr << "validation failure on containing only nonempty paths" << endl; - cerr << "subpath " << i << ": " << pb2json(multipath_aln.subpath(i)) << endl; + cerr << "subpath " << i << ": " << debug_string(multipath_aln.subpath(i)) << endl; #endif return false; } @@ -1348,7 +3984,7 @@ namespace vg { if (multipath_aln.subpath(i).path().mapping(j).edit_size() == 0) { #ifdef debug_verbose_validation cerr << "validation failure on containing only nonempty mappings" << endl; - cerr << "subpath " << i << ": " << pb2json(multipath_aln.subpath(i)) << endl; + cerr << "subpath " << i << ": " << debug_string(multipath_aln.subpath(i)) << endl; #endif return false; } @@ -1358,128 +3994,167 @@ namespace vg { // are the subpaths contiguous within the graph? - auto validate_adjacent_mappings = [&](const Mapping& mapping_from, const Mapping& mapping_to) { + auto validate_adjacent_mappings = [&](const path_mapping_t& mapping_from, const path_mapping_t& mapping_to) { size_t mapping_from_end_offset = mapping_from.position().offset() + mapping_from_length(mapping_from); - if (mapping_from.position().node_id() == mapping_to.position().node_id() && - mapping_from.position().is_reverse() == mapping_to.position().is_reverse()) { - if (mapping_to.position().offset() != mapping_from_end_offset) { + + handle_t handle_from = handle_graph.get_handle(mapping_from.position().node_id(), mapping_from.position().is_reverse()); + handle_t handle_to = handle_graph.get_handle(mapping_to.position().node_id(), mapping_to.position().is_reverse()); + + + + if (handle_from == handle_to) { + if (!(mapping_to.position().offset() == 0 && mapping_from_end_offset == handle_graph.get_length(handle_from))) { + // We aren't going from the end of the handle back to its start (over an edge) + + if (mapping_to.position().offset() != mapping_from_end_offset) { + // So then the mappings need to abut and they don't. #ifdef debug_verbose_validation - cerr << "validation failure on within-node adjacency" << endl; - cerr << pb2json(mapping_from) << "->" << pb2json(mapping_to) << endl; + cerr << "validation failure on within-node adjacency" << endl; + cerr << debug_string(mapping_from) << "->" << debug_string(mapping_to) << endl; #endif - return false; + return false; + } else { + // No edge involved. We can succeed early. + return true; + } } } - else { - if (mapping_from_end_offset != handle_graph.get_length(handle_graph.get_handle(mapping_from.position().node_id()))) { + + // If we get here, we must be crossing an edge. 
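+ // crossing an edge requires that the previous mapping end exactly at the end of its node
+ // and that the graph actually contain an edge from handle_from to handle_to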
+ + if (mapping_from_end_offset != handle_graph.get_length(handle_graph.get_handle(mapping_from.position().node_id()))) { #ifdef debug_verbose_validation - cerr << "validation failure on using edge at middle of node" << endl; - cerr << pb2json(mapping_from) << "->" << pb2json(mapping_to) << endl; + cerr << "validation failure on using edge at middle of node" << endl; + cerr << debug_string(mapping_from) << "->" << debug_string(mapping_to) << endl; #endif - return false; - } - - handle_t handle_from = handle_graph.get_handle(mapping_from.position().node_id(), mapping_from.position().is_reverse()); - handle_t handle_to = handle_graph.get_handle(mapping_to.position().node_id(), mapping_to.position().is_reverse()); - - bool found_edge = false; - function check_for_edge = [&](const handle_t& next_handle) { - found_edge = (next_handle == handle_to); - return !found_edge; - }; - handle_graph.follow_edges(handle_from, false, check_for_edge); - - if (!found_edge) { + return false; + } + + + + bool found_edge = false; + function check_for_edge = [&](const handle_t& next_handle) { + found_edge = (next_handle == handle_to); + return !found_edge; + }; + handle_graph.follow_edges(handle_from, false, check_for_edge); + + if (!found_edge) { #ifdef debug_verbose_validation - cerr << "validation failure on nodes not connected by an edge" << endl; - cerr << pb2json(mapping_from) << "->" << pb2json(mapping_to) << endl; + cerr << "validation failure on nodes not connected by an edge" << endl; + cerr << debug_string(mapping_from) << "->" << debug_string(mapping_to) << endl; #endif - return false; - } + return false; } return true; }; for (size_t i = 0; i < multipath_aln.subpath_size(); i++) { - const Subpath& subpath = multipath_aln.subpath(i); - const Path& path = subpath.path(); + const subpath_t& subpath = multipath_aln.subpath(i); + const path_t& path = subpath.path(); for (size_t j = 1; j < path.mapping_size(); j++) { if (!validate_adjacent_mappings(path.mapping(j - 1), path.mapping(j))) { return false; } } - const Mapping& final_mapping = path.mapping(path.mapping_size() - 1); + const path_mapping_t& final_mapping = path.mapping(path.mapping_size() - 1); for (size_t j = 0; j < subpath.next_size(); j++) { if (!validate_adjacent_mappings(final_mapping, multipath_aln.subpath(subpath.next(j)).path().mapping(0))) { return false; } } + // connections are not required to be contiguous, ignore them here } // do the paths represent valid alignments of the associated read string and graph path? - auto validate_mapping_edits = [&](const Mapping& mapping, const string& subseq) { - string node_seq = handle_graph.get_sequence(handle_graph.get_handle(mapping.position().node_id())); - string rev_node_seq = reverse_complement(node_seq); + auto validate_mapping_edits = [&](const path_mapping_t& mapping, const string& subseq, size_t s, size_t m) { + handle_t handle = handle_graph.get_handle(mapping.position().node_id(), mapping.position().is_reverse()); size_t node_idx = mapping.position().offset(); size_t seq_idx = 0; for (size_t i = 0; i < mapping.edit_size(); i++) { - const Edit& edit = mapping.edit(i); - if (edit_is_match(edit)) { + const edit_t& edit = mapping.edit(i); + if (edit.to_length() == edit.from_length() && edit.sequence().empty()) { for (size_t j = 0; j < edit.from_length(); j++, node_idx++, seq_idx++) { - if ((mapping.position().is_reverse() ? 
rev_node_seq[node_idx] : node_seq[node_idx]) != subseq[seq_idx]) { + if (handle_graph.get_base(handle, node_idx) != subseq[seq_idx]) { #ifdef debug_verbose_validation - cerr << "validation failure on match that does not match for read " << multipath_aln.name() << endl; - cerr << pb2json(mapping) << ", " << subseq << endl; + cerr << "validation failure on match that does not match on node " << handle_graph.get_id(handle) << (handle_graph.get_is_reverse(handle) ? "-" : "+") << " on subpath " << s << ", mapping " << m << endl; + cerr << "node sequence: " << handle_graph.get_sequence(handle) << ", offset: " << node_idx << endl; + cerr << "read subsequence: " << subseq << ", offset: " << seq_idx << endl; + + cerr << debug_string(mapping) << ", " << subseq << endl; #endif return false; } } } - else if (edit_is_sub(edit)) { + else if (edit.to_length() == edit.from_length() && !edit.sequence().empty()) { + if (edit.sequence().size() != edit.to_length()) { +#ifdef debug_verbose_validation + cerr << "validation failure on mismatched sequence length and to length: " << debug_string(edit) << " in mapping " << debug_string(mapping) << endl; +#endif + return false; + } + + bool is_Ns = find_if(edit.sequence().begin(), edit.sequence().end(), [](char c) {return c != 'N';}) == edit.sequence().end(); for (size_t j = 0; j < edit.from_length(); j++, node_idx++, seq_idx++) { - if ((mapping.position().is_reverse() ? rev_node_seq[node_idx] : node_seq[node_idx]) == subseq[seq_idx]) { + // we will also let N's be marked as mismatches even if the node sequence is also Ns + if (handle_graph.get_base(handle, node_idx) == subseq[seq_idx] && !is_Ns) { #ifdef debug_verbose_validation cerr << "validation failure on mismatch that matches" << endl; - cerr << pb2json(mapping) << ", " << subseq << endl; + cerr << debug_string(mapping) << ", " << subseq << endl; #endif return false; } if (edit.sequence()[j] != subseq[seq_idx]) { #ifdef debug_verbose_validation cerr << "validation failure on substitution sequence that does not match read" << endl; - cerr << pb2json(mapping) << ", " << subseq << endl; + cerr << debug_string(mapping) << ", " << subseq << endl; #endif return false; } } } - else if (edit_is_insertion(edit)) { + else if (edit.to_length() > 0 && edit.from_length() == 0) { + if (edit.sequence().size() != edit.to_length()) { +#ifdef debug_verbose_validation + cerr << "validation failure on mismatched sequence length and to length: " << debug_string(edit) << " on mapping " << debug_string(mapping) << endl; +#endif + return false; + } + for (size_t j = 0; j < edit.to_length(); j++, seq_idx++) { + if (edit.sequence()[j] != subseq[seq_idx]) { #ifdef debug_verbose_validation cerr << "validation failure on insertion sequence that does not match read" << endl; - cerr << pb2json(mapping) << ", " << subseq << endl; + cerr << debug_string(mapping) << ", " << subseq << endl; #endif return false; } } } - else if (edit_is_deletion(edit)) { + else if (edit.from_length() > 0 && edit.to_length() == 0) { node_idx += edit.from_length(); } + else { +#ifdef debug_verbose_validation + cerr << "validation failure on non-simple edit " << debug_string(edit) << endl; +#endif + return false; + } } return true; }; for (size_t i = 0; i < multipath_aln.subpath_size(); i++) { - const Subpath& subpath = multipath_aln.subpath(i); - const Path& path = subpath.path(); + const subpath_t& subpath = multipath_aln.subpath(i); + const path_t& path = subpath.path(); size_t read_start = subpath_read_interval[i].first; for (size_t j = 0; j < 
path.mapping_size(); j++) { size_t read_mapping_len = mapping_to_length(path.mapping(j)); - if (!validate_mapping_edits(path.mapping(j), multipath_aln.sequence().substr(read_start, read_mapping_len))) { + if (!validate_mapping_edits(path.mapping(j), multipath_aln.sequence().substr(read_start, read_mapping_len), i, j)) { return false; } read_start += read_mapping_len; @@ -1492,7 +4167,7 @@ namespace vg { // the node sequence to score mismatches but the node sequence is not stored in the Alignment object // for (size_t i = 0; i < multipath_aln.subpath_size(); i++) { - // const Subpath& subpath = multipath_aln.subpath(i); + // const subpath_t& subpath = multipath_aln.subpath(i); // Alignment& alignment; // *alignment.mutable_sequence() = multipath_aln.sequence().substr(subpath_read_interval[i].first, // subpath_read_interval[i].second - subpath_read_interval[i].first); @@ -1505,14 +4180,14 @@ namespace vg { return true; } - void view_multipath_alignment(ostream& out, const MultipathAlignment& multipath_aln, const HandleGraph& handle_graph) { + void view_multipath_alignment(ostream& out, const multipath_alignment_t& multipath_aln, const HandleGraph& handle_graph) { size_t max_line_length = 128; vector> subpath_read_interval(multipath_aln.subpath_size(), pair(0, 0)); for (size_t i = 0; i < multipath_aln.subpath_size(); i++) { - const Subpath& subpath = multipath_aln.subpath(i); + const subpath_t& subpath = multipath_aln.subpath(i); subpath_read_interval[i].second = subpath_read_interval[i].first + path_to_length(subpath.path()); for (int64_t j : subpath.next()) { @@ -1520,14 +4195,14 @@ namespace vg { } } - auto format_position = [](const Position& pos) { + auto format_position = [](const position_t& pos) { stringstream strm; strm << pos.node_id() << (pos.is_reverse() ? "-" : "+") << (pos.offset() ? 
(":" + to_string(pos.offset())) : ""); return strm.str(); }; for (int64_t i = 0; i < multipath_aln.subpath_size(); i++) { - const Subpath& subpath = multipath_aln.subpath(i); + const subpath_t& subpath = multipath_aln.subpath(i); stringstream read_strm; stringstream node_strm; @@ -1546,7 +4221,7 @@ namespace vg { bool first_mapping = true; for (size_t j = 0; j < subpath.path().mapping_size(); j++) { - const Mapping& mapping = subpath.path().mapping(j); + const path_mapping_t& mapping = subpath.path().mapping(j); string pos_string = format_position(mapping.position()); @@ -1559,7 +4234,7 @@ namespace vg { mapping.position().is_reverse())); int64_t node_at = mapping.position().offset(); - for (const Edit& edit : mapping.edit()) { + for (const edit_t& edit : mapping.edit()) { if (edit.from_length() > 0 && edit.to_length() > 0) { mapping_read_strm << multipath_aln.sequence().substr(read_at, edit.to_length()); mapping_node_strm << node_seq.substr(node_at, edit.from_length()); @@ -1609,6 +4284,171 @@ namespace vg { } } } + + void view_multipath_alignment_as_dot(ostream& out, const multipath_alignment_t& multipath_aln, bool show_graph) { + out << "digraph graphname {" << endl; + out << "rankdir=\"LR\";" << endl; + + // Track graph nodes so we get one node for each + unordered_set mentioned_nodes; + // Similarly for graph edges + unordered_set> mentioned_edges; + + // Do the start node + out << "start[label=\"Start\" shape=circle];" << endl; + for (size_t start_subpath : multipath_aln.start()) { + // Hook it up to each start subpath + out << "start -> s" << start_subpath << ";" << endl; + } + + for (size_t i = 0; i < multipath_aln.subpath_size(); i++) { + // For each subpath, say it with its score + out << "s" << i << " [label=\"" << i << "\" shape=circle tooltip=\"" << multipath_aln.subpath(i).score() << "\"];" << endl; + + for (size_t next_subpath : multipath_aln.subpath(i).next()) { + // For each edge from it, say where it goes + out << "s" << i << " -> s" << next_subpath << ";" << endl; + } + + if (show_graph) { + auto& path = multipath_aln.subpath(i).path(); + for (size_t j = 0; j < path.mapping_size(); j++) { + // For each mapping in the path, show the vg node in the graph too + auto node_id = path.mapping(j).position().node_id(); + + if (!mentioned_nodes.count(node_id)) { + // This graph node eneds to be made + mentioned_nodes.insert(node_id); + out << "g" << node_id << " [label=\"" << node_id << "\" shape=box];" << endl; + } + + // Attach the subpath to each involved graph node. + out << "s" << i << " -> g" << node_id << " [dir=none color=blue];" << endl; + + if (j != 0) { + // We have a previous node in this segment of path. What is it? 
+ auto prev_id = path.mapping(j-1).position().node_id(); + pair edge_pair{prev_id, node_id}; + + if (!mentioned_edges.count(edge_pair)) { + // This graph edge needs to be made + mentioned_edges.insert(edge_pair); + + out << "g" << prev_id << " -> g" << node_id << ";" << endl; + } + } + } + } + } + + out << "}" << endl; + } + + string debug_string(const connection_t& connection) { + string to_return = "{next: " + to_string(connection.next()) + ", score: " + to_string(connection.score()) + "}"; + return to_return; + } + + string debug_string(const subpath_t& subpath) { + string to_return = "{path: " + debug_string(subpath.path()); + if (!subpath.next().empty()) { + to_return += ", next: ["; + for (size_t i = 0; i < subpath.next_size(); ++i) { + if (i > 0) { + to_return += ", "; + } + to_return += to_string(subpath.next(i)); + } + to_return += "]"; + } + if (!subpath.connection().empty()) { + to_return += ", connection: ["; + for (size_t i = 0; i < subpath.connection_size(); ++i) { + if (i > 0) { + to_return += ", "; + } + to_return += debug_string(subpath.connection(i)); + } + to_return += "]"; + } + to_return += ", score: " + to_string(subpath.score()); + to_return += "}"; + return to_return; + } + + string debug_string(const multipath_alignment_t& multipath_aln) { + string to_return = "{seq: " + multipath_aln.sequence(); + if (!multipath_aln.quality().empty()) { + to_return += ", qual: " + string_quality_short_to_char(multipath_aln.quality()); + } + if (!multipath_aln.subpath().empty()) { + to_return += ", subpath: ["; + for (size_t i = 0; i < multipath_aln.subpath_size(); ++i) { + if (i > 0) { + to_return += ", "; + } + to_return += debug_string(multipath_aln.subpath(i)); + } + to_return += "]"; + } + to_return += ", mapq: " + to_string(multipath_aln.mapping_quality()); + if (!multipath_aln.start().empty()) { + to_return += ", start: ["; + for (size_t i = 0; i < multipath_aln.start_size(); ++i) { + if (i > 0) { + to_return += ", "; + } + to_return += to_string(multipath_aln.start(i)); + } + to_return += "]"; + } + int anno_num = 0; + multipath_aln.for_each_annotation([&](const string& name, + multipath_alignment_t::anno_type_t type, + const void* annotation) { + if (anno_num == 0) { + to_return += ", annotations: {"; + } + else { + to_return += ", "; + } + switch (type) { + case multipath_alignment_t::Null: + to_return += name; + break; + case multipath_alignment_t::Double: + { + to_return += name + ": "; + // handle the annoying lack of an integer annotation + double val = *((const double*) annotation); + if (trunc(val) == val) { + to_return += to_string((int64_t) val); + } + else { + to_return += to_string(val); + } + break; + } + case multipath_alignment_t::Bool: + to_return += name + ": " + (*((const bool*) annotation) ? 
"true" : "false"); + break; + case multipath_alignment_t::String: + to_return += name + ": \"" + *((const string*) annotation) + "\""; + break; + default: + cerr << "error: unrecognized annotation type" << endl; + exit(1); + break; + } + ++anno_num; + }); + if (anno_num != 0) { + to_return += "}"; + } + to_return += "}"; + return to_return; + } + } diff --git a/src/multipath_alignment.hpp b/src/multipath_alignment.hpp index 363f9f1c617..2efa7f27796 100644 --- a/src/multipath_alignment.hpp +++ b/src/multipath_alignment.hpp @@ -1,8 +1,7 @@ -// -// multipath_alignment.hpp -// -// utility functions for the MultipathAlignment protobuf object -// +/// \file multipath_alignment.hpp +/// +/// utility functions for the multipath_alignment_t object +/// #ifndef multipath_alignment_hpp #define multipath_alignment_hpp @@ -12,22 +11,146 @@ #include #include #include -#include "vg.pb.h" + +#include #include "path.hpp" #include "alignment.hpp" #include "utility.hpp" #include "handle.hpp" +#include "annotation.hpp" + +// Declare the haplo::ScoreProvider we use for haplotype-aware traceback generation. +namespace haplo { + class ScoreProvider; +} namespace vg { + + class connection_t { + public: + connection_t() = default; + connection_t(const connection_t&) = default; + connection_t(connection_t&&) = default; + ~connection_t() = default; + connection_t& operator=(const connection_t&) = default; + connection_t& operator=(connection_t&&) = default; + inline int32_t next() const; + inline void set_next(int32_t n); + inline int32_t score() const; + inline void set_score(int32_t s); + private: + uint32_t _next; + int32_t _score; + }; + + /* + * STL implementations of the protobuf object for use in in-memory operations + */ + class subpath_t { + public: + subpath_t() = default; + subpath_t(const subpath_t&) = default; + subpath_t(subpath_t&&) = default; + ~subpath_t() = default; + subpath_t& operator=(const subpath_t&) = default; + subpath_t& operator=(subpath_t&&) = default; + inline const path_t& path() const; + inline path_t* mutable_path(); + inline bool has_path() const; + inline const vector& next() const; + inline uint32_t next(size_t i) const; + inline vector* mutable_next(); + inline void set_next(size_t i, uint32_t n); + inline void add_next(uint32_t n); + inline void clear_next(); + inline size_t next_size() const; + inline bool has_next() const; + inline int32_t score() const; + inline void set_score(int32_t s); + inline const vector& connection() const; + inline const connection_t& connection(size_t i) const; + inline vector* mutable_connection(); + inline connection_t* mutable_connection(size_t i); + inline void set_connection(size_t i, const connection_t& c); + inline connection_t* add_connection(); + inline void clear_connection(); + inline size_t connection_size() const; + inline bool has_connection() const; + private: + path_t _path; + vector _next; + int32_t _score; + vector _connection; + }; + + // TODO: the metadata could be removed and only added to the protobuf at serialization time + class multipath_alignment_t { + public: + multipath_alignment_t(); + multipath_alignment_t(const multipath_alignment_t& other); + multipath_alignment_t(multipath_alignment_t&& other); + ~multipath_alignment_t(); + multipath_alignment_t& operator=(const multipath_alignment_t& other); + multipath_alignment_t& operator=(multipath_alignment_t&& other); + inline const string& sequence() const; + inline string* mutable_sequence(); + inline void set_sequence(const string& s); + inline const string& quality() const; 
+ inline string* mutable_quality(); + inline void set_quality(const string& q); + inline const vector& subpath() const; + inline const subpath_t& subpath(size_t i) const; + inline vector* mutable_subpath(); + inline subpath_t* mutable_subpath(size_t i); + inline subpath_t* add_subpath(); + inline void clear_subpath(); + inline size_t subpath_size() const; + inline int32_t mapping_quality() const; + inline void set_mapping_quality(int32_t q); + inline const vector& start() const; + inline uint32_t start(size_t i) const; + inline vector* mutable_start(); + inline void set_start(size_t i, uint32_t s); + inline void add_start(uint32_t s); + inline void clear_start(); + inline size_t start_size() const; + inline bool has_start() const; + + // annotation interface + // TODO: add List and Struct from https://github.com/protocolbuffers/protobuf/blob/master/src/google/protobuf/struct.proto + enum anno_type_t {Null = 0, Double = 2, Bool = 3, String = 4}; + void set_annotation(const string& annotation_name); + void set_annotation(const string& annotation_name, double value); + void set_annotation(const string& annotation_name, bool value); + void set_annotation(const string& annotation_name, const string& value); + void clear_annotation(const string& annotation_name); + bool has_annotation(const string& annotation_name) const; + pair get_annotation(const string& annotation_name) const; + void for_each_annotation(function lambda) const; + private: + string _sequence; + string _quality; + vector _subpath; + int32_t _mapping_quality; + vector _start; + map> _annotation; + }; + + string debug_string(const connection_t& connection); + string debug_string(const subpath_t& subpath); + string debug_string(const multipath_alignment_t& multipath_aln); /// Put subpaths in topological order (assumed to be true for other algorithms) - void topologically_order_subpaths(MultipathAlignment& multipath_aln); + void topologically_order_subpaths(multipath_alignment_t& multipath_aln); /// Finds the start subpaths (i.e. 
the source nodes of the multipath DAG) and stores - /// them in the 'start' field of the MultipathAlignment - void identify_start_subpaths(MultipathAlignment& multipath_aln); + /// them in the 'start' field of the multipath_alignment_t + void identify_start_subpaths(multipath_alignment_t& multipath_aln); + + /// Clear all of the fields associated with the alignment + void clear_alignment(multipath_alignment_t& multipath_aln); - /// Stores the highest scoring alignment contained in the MultipathAlignment in an Alignment + /// Stores the highest scoring alignment contained in the multipath_alignment_t in an Alignment /// /// Note: Assumes that each subpath's Path object uses one Mapping per node and that /// start subpaths have been identified @@ -36,20 +159,30 @@ namespace vg { /// multipath_aln multipath alignment to find optimal path through /// aln_out empty alignment to store optimal alignment in (data will be /// overwritten if not empty) + /// subpath_global if true, only allows alignments that run from a source subpath to a sink subpath + /// in the multipath DAG, else allows any start and end subpath /// - void optimal_alignment(const MultipathAlignment& multipath_aln, Alignment& aln_out); + void optimal_alignment(const multipath_alignment_t& multipath_aln, Alignment& aln_out, + bool subpath_global = false); - /// Returns the score of the highest scoring alignment contained in the MultipathAlignment + /// Returns the score of the highest scoring alignment contained in the multipath_alignment_t /// /// Note: Assumes that each subpath's Path object uses one Mapping per node and that /// start subpaths have been identified /// /// Args: /// multipath_aln multipath alignment to find optimal score in + /// subpath_global if true, only allows alignments that run from a source subpath to a sink subpath + /// in the multipath DAG, else allows any start and end subpath /// - int32_t optimal_alignment_score(const MultipathAlignment& multipath_aln); + int32_t optimal_alignment_score(const multipath_alignment_t& multipath_aln, + bool subpath_global = false); + + /// Returns the score of the lowest-scoring source-to-sink alignment in the multipath_alignment_t. + /// Assumes that subpaths are topologically ordered and starts have been identified. + int32_t worst_alignment_score(const multipath_alignment_t& multipath_aln); - /// Returns the top k highest-scoring alignments contained in the MultipathAlignment. + /// Returns the top k highest-scoring alignments contained in the multipath_alignment_t. /// Note that some or all of these may be duplicate Alignments, which were spelled out /// by tracebacks through different sequences of subpaths that shared alignment material. /// @@ -62,7 +195,7 @@ namespace vg { /// multipath_aln multipath alignment to find optimal paths through /// count maximum number of top alignments to return /// - vector optimal_alignments(const MultipathAlignment& multipath_aln, size_t count); + vector optimal_alignments(const multipath_alignment_t& multipath_aln, size_t count); /// Finds k or fewer top-scoring alignments using only distinct subpaths. 
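A usage sketch for the score queries above, illustrative only and not part of the patch; it assumes `mp_aln` already has topologically ordered subpaths with starts identified, as the doc comments require.

```
// Sketch: summarize the score range of a multipath_alignment_t.
#include <cstdint>
#include <ostream>
#include "multipath_alignment.hpp"

void summarize_scores(const vg::multipath_alignment_t& mp_aln, std::ostream& out) {
    // Best single-path Alignment through any start/end subpath
    vg::Alignment best;
    vg::optimal_alignment(mp_aln, best);

    // Score bounds when restricted to source-to-sink traversals of the DAG
    int32_t best_global = vg::optimal_alignment_score(mp_aln, true);
    int32_t worst_global = vg::worst_alignment_score(mp_aln);

    out << "best (any endpoints): " << vg::optimal_alignment_score(mp_aln)
        << ", best (source-to-sink): " << best_global
        << ", worst (source-to-sink): " << worst_global << std::endl;
}
```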
/// Asymmetrical: the optimal alignment for each end subpath is found, greedily, subject to the constraint, @@ -78,9 +211,36 @@ namespace vg { /// multipath_aln multipath alignment to find optimal paths through /// count maximum number of top alignments to return /// - vector optimal_alignments_with_disjoint_subpaths(const MultipathAlignment& multipath_aln, size_t count); + vector optimal_alignments_with_disjoint_subpaths(const multipath_alignment_t& multipath_aln, size_t count); - /// Stores the reverse complement of a MultipathAlignment in another MultipathAlignment + + /// Finds all alignments consistent with haplotypes available by incremental search with the given haplotype + /// score provider. Pads to a certain count with haplotype-inconsistent alignments that are population-scorable + /// (i.e. use only edges used by some haplotype in the index), and then with unscorable alignments if scorable + /// ones are unavailable. This may result in an empty vector. + /// + /// Output Alignments may not be unique. The input multipath_alignment_t may have exponentially many ways to + /// spell the same Alignment, and we will look at all of them. We also may have duplicates of the optimal + /// alignment if we are asked to produce it unconsitionally. + /// + /// Note: Assumes that each subpath's Path object uses one Mapping per node and that + /// start subpaths have been identified + /// + /// + /// Args: + /// multipath_aln multipath alignment to find optimal paths through + /// score_provider a haplo::ScoreProvider that supports incremental search over its haplotype database (such as a GBWTScoreProvider) + /// soft_count maximum number of haplotype-inconsistent alignments to pad to + /// hard_count maximum number of alignments, including haplotype-consistent (0 if no limit) + /// optimal_first always compute and return first the optimal alignment, even if not haplotype-consistent + /// + vector haplotype_consistent_alignments(const multipath_alignment_t& multipath_aln, const haplo::ScoreProvider& score_provider, + size_t soft_count, size_t hard_count, bool optimal_first = false); + + /// The indexes on the read sequence of the portion of the read that is aligned outside of soft clips + pair aligned_interval(const multipath_alignment_t& multipath_aln); + + /// Stores the reverse complement of a multipath_alignment_t in another multipath_alignment_t /// /// Args: /// multipath_aln multipath alignment to reverse complement @@ -88,76 +248,306 @@ namespace vg { /// rev_comp_out empty multipath alignment to store reverse complement in (some data may /// be overwritten if not empty) /// - void rev_comp_multipath_alignment(const MultipathAlignment& multipath_aln, + void rev_comp_multipath_alignment(const multipath_alignment_t& multipath_aln, const function& node_length, - MultipathAlignment& rev_comp_out); + multipath_alignment_t& rev_comp_out); - /// Stores the reverse complement of a MultipathAlignment in another MultipathAlignment + /// Stores the reverse complement of a multipath_alignment_t in another multipath_alignment_t /// /// Args: /// multipath_aln multipath alignment to reverse complement in place /// node_length a function that returns the length of a node sequence from its node ID /// - void rev_comp_multipath_alignment_in_place(MultipathAlignment* multipath_aln, + void rev_comp_multipath_alignment_in_place(multipath_alignment_t* multipath_aln, const function& node_length); + + /// Replaces all U's in the sequence and the aligned Paths with T's + void 
convert_Us_to_Ts(multipath_alignment_t& multipath_aln); + + /// Replaces all T's in the sequence and the aligned Paths with U's + void convert_Ts_to_Us(multipath_alignment_t& multipath_aln); - /// Converts a Alignment into a Multipath alignment with one Subpath and stores it in an object + /// Convert an STL-based multipath_alignment_t to a protobuf MultipathAlignment + void to_proto_multipath_alignment(const multipath_alignment_t& multipath_aln, + MultipathAlignment& proto_multipath_aln_out); + + /// Convert a protobuf MultipathAlignment to an STL-based multipath_alignment_t + void from_proto_multipath_alignment(const MultipathAlignment& proto_multipath_aln, + multipath_alignment_t& multipath_aln_out); + + /// Converts a Alignment into a multipath_alignment_t with one subpath and stores it in an object /// /// Args: /// aln alignment to convert /// multipath_aln empty multipath alignment to store converted alignment in (data may be /// be overwritten if not empty) /// - void to_multipath_alignment(const Alignment& aln, MultipathAlignment& multipath_aln_out); - - /// Copies metadata from an Alignment object and transfers it to a MultipathAlignment - /// - /// Args: - /// from copy metadata from this - /// to into this - /// - void transfer_read_metadata(const Alignment& from, MultipathAlignment& to); - - /// Copies metadata from an MultipathAlignment object and transfers it to a Alignment - /// - /// Args: - /// from copy metadata from this - /// to into this - /// - void transfer_read_metadata(const MultipathAlignment& from, Alignment& to); - - /// Copies metadata from an MultipathAlignment object and transfers it to another MultipathAlignment - /// - /// Args: - /// from copy metadata from this - /// to into this - /// - void transfer_read_metadata(const MultipathAlignment& from, MultipathAlignment& to); + void to_multipath_alignment(const Alignment& aln, multipath_alignment_t& multipath_aln_out); + // TODO: these metadata functions should also transfer annotations + + /// All functions of this form transfer: + /// - sequence + /// - base quality + /// - mapping quality + /// - read annotations (including multiple encodings of secondary) + void transfer_read_metadata(const Alignment& from, multipath_alignment_t& to); + void transfer_read_metadata(const multipath_alignment_t& from, Alignment& to); + void transfer_read_metadata(const multipath_alignment_t& from, multipath_alignment_t& to); + void transfer_read_metadata(const Alignment& from, Alignment& to); + void transfer_read_metadata(const MultipathAlignment& from, multipath_alignment_t& to); + void transfer_read_metadata(const multipath_alignment_t& from, MultipathAlignment& to); + + /// Transfer the annotations that are carried with the Protobuf formats but not + /// the internal multipath_alignment_t (and which therefore get lost when using + /// it as an intermediate format). 
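Because `multipath_alignment_t` intentionally drops these Protobuf-only fields, a round trip through it has to restore them explicitly. A hedged sketch follows, not part of the patch; `proto_in` is a hypothetical, already-populated message.

```
// Sketch: edit a Protobuf MultipathAlignment via the in-memory type.
#include "multipath_alignment.hpp"

vg::MultipathAlignment round_trip(const vg::MultipathAlignment& proto_in) {
    vg::multipath_alignment_t mp;
    vg::from_proto_multipath_alignment(proto_in, mp);

    // ... modify mp with the STL-based interface here ...

    vg::MultipathAlignment proto_out;
    vg::to_proto_multipath_alignment(mp, proto_out);
    // The read name lives only in the Protobuf object, so carry it (and any
    // other Protobuf-only metadata) over from the original message.
    proto_out.set_name(proto_in.name());
    return proto_out;
}
```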
+ void transfer_proto_metadata(const Alignment& from, MultipathAlignment& to); + void transfer_proto_metadata(const MultipathAlignment& from, Alignment& to); + /// Merges non-branching paths in a multipath alignment in place - void merge_non_branching_subpaths(MultipathAlignment& multipath_aln); + /// Does not assume topological order among subpaths + void merge_non_branching_subpaths(multipath_alignment_t& multipath_aln, + const unordered_set* prohibited_merges = nullptr); + + /// Removes all edits, mappings, and subpaths that have no aligned bases, and introduces transitive edges + /// to preserve connectivity through any completely removed subpaths + void remove_empty_alignment_sections(multipath_alignment_t& multipath_aln); + + /// Removes all subpaths and edges whose optimal full length alignment is less than the given difference + /// from the highest-scoring full length alignment + void remove_low_scoring_sections(multipath_alignment_t& multipath_aln, int32_t max_score_diff); + + /// Returns the number of connected components in the multipath alignment. + size_t num_connected_components(const multipath_alignment_t& multipath_aln); - /// Returns a vector whose elements are vectors with the indexes of the Subpaths in - /// each connected component - vector> connected_components(const MultipathAlignment& multipath_aln); + /// Returns a vector whose elements are vectors with the indexes of the subpath_t's in + /// each connected component. An unmapped multipath_alignment_t with no subpaths produces an empty vector. + vector> connected_components(const multipath_alignment_t& multipath_aln); - /// Extract the MultipathAlignment consisting of the Subpaths with the given indexes - /// into a new MultipathAlignment object - void extract_sub_multipath_alignment(const MultipathAlignment& multipath_aln, + /// Extract the multipath_alignment_t consisting of the Subpaths with the given indexes + /// into a new multipath_alignment_t object + void extract_sub_multipath_alignment(const multipath_alignment_t& multipath_aln, const vector& subpath_indexes, - MultipathAlignment& sub_multipath_aln); - + multipath_alignment_t& sub_multipath_aln); + + /// Add the subpaths of one multipath alignment onto another + void append_multipath_alignment(multipath_alignment_t& multipath_aln, + const multipath_alignment_t& to_append); + + /// Returns true if any subpath has a connection adjacency + bool contains_connection(const multipath_alignment_t& multipath_aln); + + /// Returns all of the positions where a given sequence index occurs at a given graph + /// position (if any), where positions are represented as tuples of + /// (subpath index, mapping index, edit index, index within edit) + vector> + search_multipath_alignment(const multipath_alignment_t& multipath_aln, + const pos_t& graph_pos, int64_t seq_pos); + + /// Returns a pair of (mapping, edit, base) and possibly multiple (subpath, mapping, edit, base), of the furthest position + /// that can be traced through the multipath alignment along the path, starting from the indicated position in the multipath + /// alignment. The path can be traced rightward starting at the beginning, or leftward starting at the end. + /// Search is limited to not passing a given mapping on the path. 
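For instance, `connected_components` and `extract_sub_multipath_alignment` above combine naturally to split an alignment into per-component pieces. This is a sketch only, not part of the patch, and it assumes the component index vectors returned by `connected_components` match the index type that the extractor expects.

```
// Sketch: break a multipath_alignment_t into one alignment per connected component.
#include <vector>
#include "multipath_alignment.hpp"

std::vector<vg::multipath_alignment_t> split_components(const vg::multipath_alignment_t& mp_aln) {
    std::vector<vg::multipath_alignment_t> split;
    for (const auto& component : vg::connected_components(mp_aln)) {
        split.emplace_back();
        vg::extract_sub_multipath_alignment(mp_aln, component, split.back());
    }
    return split;
}
```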
+ pair, vector>> + trace_path(const multipath_alignment_t& multipath_aln, const Path& path, + int64_t subpath_idx, int64_t mapping_idx, int64_t edit_idx, int64_t base_idx, bool search_left, + int64_t search_limit); + + /// Returns true if the multipath alignment contains a match of a given length starting at the graph and + /// read position + bool contains_match(const multipath_alignment_t& multipath_aln, const pos_t& pos, + int64_t read_pos, int64_t match_length); + + /// Convert a surjected multipath alignment into a CIGAR sequence against a path. Splicing will be allowed + /// at connections and at any silent deletions of path sequence. Surjected multipath alignment graph must + /// consist of a single non-branching path + vector> cigar_against_path(const multipath_alignment_t& multipath_aln, const string& path_name, bool rev, + int64_t path_pos, const PathPositionHandleGraph& graph, + int64_t min_splice_length = numeric_limits::max()); + /// Debugging function to check that multipath alignment meets the formalism's basic /// invariants. Returns true if multipath alignment is valid, else false. Does not /// validate alignment score. - bool validate_multipath_alignment(const MultipathAlignment& multipath_aln, const HandleGraph& handle_graph); + bool validate_multipath_alignment(const multipath_alignment_t& multipath_aln, const HandleGraph& handle_graph); - /// Send a formatted string representation of the MultipathAlignment into the ostream - void view_multipath_alignment(ostream& out, const MultipathAlignment& multipath_aln, const HandleGraph& handle_graph); + /// Send a formatted string representation of the multipath_alignment_t into the ostream + void view_multipath_alignment(ostream& out, const multipath_alignment_t& multipath_aln, const HandleGraph& handle_graph); + + /// Converts a multipath_alignment_t to a GraphViz Dot representation, output to the given ostream. 
+ void view_multipath_alignment_as_dot(ostream& out, const multipath_alignment_t& multipath_aln, bool show_graph = false); // TODO: function for adding a graph augmentation to an existing multipath alignment -} + /* + * Implementations of inline methods + */ + + /* + * connection_t + */ + inline int32_t connection_t::next() const { + return _next; + } + inline void connection_t::set_next(int32_t n) { + _next = n; + } + inline int32_t connection_t::score() const { + return _score; + } + inline void connection_t::set_score(int32_t s) { + _score = s; + } + + /* + * subpath_t + */ + inline const path_t& subpath_t::path() const { + return _path; + } + inline path_t* subpath_t::mutable_path() { + return &_path; + } + inline bool subpath_t::has_path() const { + return _path.mapping_size(); + } + inline const vector& subpath_t::next() const { + return _next; + } + inline uint32_t subpath_t::next(size_t i) const { + return _next[i]; + } + inline vector* subpath_t::mutable_next() { + return &_next; + } + inline void subpath_t::set_next(size_t i, uint32_t n) { + _next[i] = n; + } + inline void subpath_t::add_next(uint32_t n) { + _next.emplace_back(n); + } + inline void subpath_t::clear_next() { + _next.clear(); + } + inline size_t subpath_t::next_size() const { + return _next.size(); + } + inline bool subpath_t::has_next() const { + return !_next.empty(); + } + inline int32_t subpath_t::score() const { + return _score; + } + inline void subpath_t::set_score(int32_t s) { + _score = s; + } + inline const vector& subpath_t::connection() const { + return _connection; + } + inline const connection_t& subpath_t::connection(size_t i) const { + return _connection[i]; + } + inline vector* subpath_t::mutable_connection() { + return &_connection; + } + inline connection_t* subpath_t::mutable_connection(size_t i) { + return &_connection[i]; + } + inline void subpath_t::set_connection(size_t i, const connection_t& c) { + _connection[i] = c; + } + inline connection_t* subpath_t::add_connection() { + _connection.emplace_back(); + return &_connection.back(); + } + inline void subpath_t::clear_connection() { + _connection.clear(); + } + inline size_t subpath_t::connection_size() const { + return _connection.size(); + } + inline bool subpath_t::has_connection() const { + return !_connection.empty(); + } + + /* + * multipath_alignment_t + */ + inline const string& multipath_alignment_t::sequence() const { + return _sequence; + } + inline string* multipath_alignment_t::mutable_sequence() { + return &_sequence; + } + inline void multipath_alignment_t::set_sequence(const string& s) { + _sequence = s; + } + inline const string& multipath_alignment_t::quality() const { + return _quality; + } + inline string* multipath_alignment_t::mutable_quality() { + return &_quality; + } + inline void multipath_alignment_t::set_quality(const string& q) { + _quality = q; + } + inline const vector& multipath_alignment_t::subpath() const { + return _subpath; + } + inline const subpath_t& multipath_alignment_t::subpath(size_t i) const { + return _subpath[i]; + } + inline vector* multipath_alignment_t::mutable_subpath() { + return &_subpath; + } + inline subpath_t* multipath_alignment_t::mutable_subpath(size_t i) { + return &_subpath[i]; + } + inline subpath_t* multipath_alignment_t::add_subpath() { + _subpath.emplace_back(); + return &_subpath.back(); + } + inline void multipath_alignment_t::clear_subpath() { + _subpath.clear(); + } + inline size_t multipath_alignment_t::subpath_size() const { + return _subpath.size(); + } + inline int32_t 
multipath_alignment_t::mapping_quality() const { + return _mapping_quality; + } + inline void multipath_alignment_t::set_mapping_quality(int32_t q) { + _mapping_quality = q; + } + inline const vector& multipath_alignment_t::start() const { + return _start; + } + inline uint32_t multipath_alignment_t::start(size_t i) const { + return _start[i]; + } + inline vector* multipath_alignment_t::mutable_start() { + return &_start; + } + inline void multipath_alignment_t::set_start(size_t i, uint32_t s) { + _start[i] = s; + } + inline void multipath_alignment_t::add_start(uint32_t s) { + _start.emplace_back(s); + } + inline void multipath_alignment_t::clear_start() { + _start.clear(); + } + inline size_t multipath_alignment_t::start_size() const { + return _start.size(); + } + inline bool multipath_alignment_t::has_start() const { + return !_start.empty(); + } + + /// Define seed generation for shuffling multipath alignments + inline string make_shuffle_seed(const multipath_alignment_t& aln) { + return aln.sequence(); + } +} #endif /* multipath_alignment_hpp */ diff --git a/src/multipath_alignment_emitter.cpp b/src/multipath_alignment_emitter.cpp new file mode 100644 index 00000000000..705910b1dff --- /dev/null +++ b/src/multipath_alignment_emitter.cpp @@ -0,0 +1,447 @@ +/** + * \file multipath_alignment_emitter.cpp + * + * Implements a system for emitting multipath alignments and groups of multipath alignments in multiple formats. + */ + +#include "multipath_alignment_emitter.hpp" +#include "vg/io/json2pb.h" + +using namespace vg::io; + +namespace vg { +using namespace std; + +MultipathAlignmentEmitter::MultipathAlignmentEmitter(const string& filename, size_t num_threads, const string out_format, + const PathPositionHandleGraph* graph, const vector>* path_order_and_length) : + HTSWriter(filename, + out_format == "SAM" || out_format == "BAM" || out_format == "CRAM" ? out_format : "SAM", // just so the assert passes + path_order_and_length ? *path_order_and_length : vector>(), + {}, + num_threads), + graph(graph) +{ + + // init the emitters for the correct output type + if (out_format == "GAM" ) { + format = GAM; + aln_emitters.reserve(num_threads); + for (int i = 0; i < num_threads; ++i) { + aln_emitters.emplace_back(new vg::io::ProtobufEmitter(multiplexer.get_thread_stream(i))); + } + } + else if (out_format == "GAMP") { + format = GAMP; + mp_aln_emitters.reserve(num_threads); + for (int i = 0; i < num_threads; ++i) { + mp_aln_emitters.emplace_back(new vg::io::ProtobufEmitter(multiplexer.get_thread_stream(i))); + } + } + else if (out_format == "GAF") { + format = GAF; + if (graph == nullptr) { + cerr << "error:[MultipathAlignmentEmitter] GAF format output requires a graph" << endl; + exit(1); + } + } + else if (out_format == "SAM" || out_format == "BAM" || out_format == "CRAM") { + if (out_format == "SAM") { + format = SAM; + } + else if (out_format == "BAM") { + format = BAM; + } + else { + format = CRAM; + } + // TODO: check for graph, in case of spliced alignments? 
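To show the format dispatch above from the caller's side, here is a minimal caller sketch, illustrative only and not part of the patch; the file name, thread count, read group, sample, read name, and the populated `mp_alns` vector are all made up.

```
// Sketch: write a batch of multipath alignments for one read as GAMP.
#include <utility>
#include <vector>
#include "multipath_alignment_emitter.hpp"

void write_gamp(std::vector<vg::multipath_alignment_t>&& mp_alns) {
    vg::MultipathAlignmentEmitter emitter("out.gamp", 4, "GAMP");
    emitter.set_read_group("rg1");
    emitter.set_sample_name("sample1");
    // Every alignment in one emit_singles() call is emitted under the same read name.
    emitter.emit_singles("read1", std::move(mp_alns));
}
```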
+ } + else { + cerr << "error:[MultipathAlignmentEmitter] unrecognized output format " << out_format << endl; + exit(1); + } +} + +MultipathAlignmentEmitter::~MultipathAlignmentEmitter() { + for (auto& emitter : aln_emitters) { + // Flush each ProtobufEmitter + emitter->flush(); + // Make it go away before the stream + emitter.reset(); + } + for (auto& emitter : mp_aln_emitters) { + // Flush each ProtobufEmitter + emitter->flush(); + // Make it go away before the stream + emitter.reset(); + } +} + +void MultipathAlignmentEmitter::set_read_group(const string& read_group) { + this->read_group = read_group; +} + +void MultipathAlignmentEmitter::set_sample_name(const string& sample_name) { + this->sample_name = sample_name; +} + +void MultipathAlignmentEmitter::set_min_splice_length(int64_t min_splice_length) { + this->min_splice_length = min_splice_length; +} + +void MultipathAlignmentEmitter::emit_pairs(const string& name_1, const string& name_2, + vector>&& mp_aln_pairs, + vector, tuple>>* path_positions, + vector* tlen_limits) { + + int thread_number = omp_get_thread_num(); + switch (format) { + case GAMP: + { + vector mp_alns_out(2 * mp_aln_pairs.size()); + for (size_t i = 0; i < mp_aln_pairs.size(); ++i) { + MultipathAlignment& mp_aln_out_1 = mp_alns_out[2 * i]; + MultipathAlignment& mp_aln_out_2 = mp_alns_out[2 * i + 1]; + to_proto_multipath_alignment(mp_aln_pairs[i].first, mp_aln_out_1); + to_proto_multipath_alignment(mp_aln_pairs[i].second, mp_aln_out_2); + mp_aln_out_1.set_name(name_1); + mp_aln_out_2.set_name(name_2); + mp_aln_out_1.set_paired_read_name(name_2); + mp_aln_out_2.set_paired_read_name(name_1); + if (!sample_name.empty()) { + mp_aln_out_1.set_sample_name(sample_name); + mp_aln_out_2.set_sample_name(sample_name); + } + if (!read_group.empty()) { + mp_aln_out_1.set_read_group(read_group); + mp_aln_out_2.set_read_group(read_group); + } + } + + mp_aln_emitters[thread_number]->write_many(std::move(mp_alns_out)); + + if (multiplexer.want_breakpoint(thread_number)) { + // The multiplexer wants our data. + // Flush and create a breakpoint. + mp_aln_emitters[thread_number]->flush(); + multiplexer.register_breakpoint(thread_number); + } + break; + } + case GAM: + case GAF: + { + vector alns_out(2 * mp_aln_pairs.size()); + for (size_t i = 0; i < mp_aln_pairs.size(); ++i) { + Alignment& aln_out_1 = alns_out[2 * i]; + Alignment& aln_out_2 = alns_out[2 * i + 1]; + convert_to_alignment(mp_aln_pairs[i].first, aln_out_1, + nullptr, + &name_2); + convert_to_alignment(mp_aln_pairs[i].second, aln_out_2, + &name_1, + nullptr); + aln_out_1.set_name(name_1); + aln_out_2.set_name(name_2); + if (!sample_name.empty()) { + aln_out_1.set_sample_name(sample_name); + aln_out_2.set_sample_name(sample_name); + } + if (!read_group.empty()) { + aln_out_1.set_read_group(read_group); + aln_out_2.set_read_group(read_group); + } + } + + if (format == GAM) { + aln_emitters[thread_number]->write_many(std::move(alns_out)); + + if (multiplexer.want_breakpoint(thread_number)) { + // The multiplexer wants our data. + // Flush and create a breakpoint. 
+ aln_emitters[thread_number]->flush(); + } + } + else { + for (auto& aln : alns_out) { + multiplexer.get_thread_stream(thread_number) << alignment_to_gaf(*graph, aln) << endl; + } + } + multiplexer.register_breakpoint(thread_number); + break; + } + case SAM: + case BAM: + case CRAM: + { + size_t thread_number = omp_get_thread_num(); + bam_hdr_t* header = ensure_header(read_group, sample_name, thread_number); + vector records; + records.reserve(2 * mp_aln_pairs.size()); + + for (size_t i = 0; i < mp_aln_pairs.size(); ++i) { + string ref_name_1, ref_name_2; + bool ref_rev_1, ref_rev_2; + int64_t ref_pos_1, ref_pos_2; + tie(ref_name_1, ref_rev_1, ref_pos_1) = path_positions->at(i).first; + tie(ref_name_2, ref_rev_2, ref_pos_2) = path_positions->at(i).second; + int64_t tlen_limit = 0; + if (tlen_limits) { + tlen_limit = tlen_limits->at(i); + } + convert_to_hts_paired(name_1, name_2, mp_aln_pairs[i].first, mp_aln_pairs[i].second, + ref_name_1, ref_rev_1, ref_pos_1, ref_name_2, ref_rev_2, ref_pos_2, + tlen_limit, header, records); + } + + save_records(header, records, thread_number); + break; + } + + default: + cerr << "error:[MultipathAlignmentEmitter] unrecognized output format" << endl; + break; + } +} + +void MultipathAlignmentEmitter::emit_singles(const string& name, vector&& mp_alns, + vector>* path_positions) { + + int thread_number = omp_get_thread_num(); + + switch (format) { + case GAMP: + { + vector mp_alns_out(mp_alns.size()); + for (size_t i = 0; i < mp_alns.size(); ++i) { + MultipathAlignment& mp_aln_out = mp_alns_out[i]; + to_proto_multipath_alignment(mp_alns[i], mp_aln_out); + mp_aln_out.set_name(name); + if (!sample_name.empty()) { + mp_aln_out.set_sample_name(sample_name); + } + if (!read_group.empty()) { + mp_aln_out.set_read_group(read_group); + } + } + + mp_aln_emitters[thread_number]->write_many(std::move(mp_alns_out)); + + if (multiplexer.want_breakpoint(thread_number)) { + // The multiplexer wants our data. + // Flush and create a breakpoint. + mp_aln_emitters[thread_number]->flush(); + multiplexer.register_breakpoint(thread_number); + } + break; + } + case GAM: + case GAF: + { + vector alns_out(mp_alns.size()); + for (size_t i = 0; i < mp_alns.size(); ++i) { + Alignment& aln_out = alns_out[i]; + convert_to_alignment(mp_alns[i], aln_out); + aln_out.set_name(name); + if (!sample_name.empty()) { + aln_out.set_sample_name(sample_name); + } + if (!read_group.empty()) { + aln_out.set_read_group(read_group); + } + } + + if (format == GAM) { + aln_emitters[thread_number]->write_many(std::move(alns_out)); + + if (multiplexer.want_breakpoint(thread_number)) { + // The multiplexer wants our data. + // Flush and create a breakpoint. 
+ aln_emitters[thread_number]->flush(); + } + } + else { + for (auto& aln : alns_out) { + multiplexer.get_thread_stream(thread_number) << alignment_to_gaf(*graph, aln) << endl; + } + } + break; + } + case SAM: + case BAM: + case CRAM: + { + size_t thread_number = omp_get_thread_num(); + bam_hdr_t* header = ensure_header(read_group, sample_name, thread_number); + vector records; + records.reserve(mp_alns.size()); + + for (size_t i = 0; i < mp_alns.size(); ++i) { + string ref_name; + bool ref_rev; + int64_t ref_pos; + tie(ref_name, ref_rev, ref_pos) = path_positions->at(i); + convert_to_hts_unpaired(name, mp_alns[i], ref_name, ref_rev, ref_pos, header, records); + } + + save_records(header, records, thread_number); + break; + } + + default: + cerr << "error:[MultipathAlignmentEmitter] unrecognized output format" << endl; + break; + } +} + +void MultipathAlignmentEmitter::convert_to_alignment(const multipath_alignment_t& mp_aln, Alignment& aln, + const string* prev_name, + const string* next_name) const { + optimal_alignment(mp_aln, aln); + if (prev_name) { + aln.mutable_fragment_prev()->set_name(*prev_name); + aln.set_read_paired(true); + } + if (next_name) { + aln.mutable_fragment_next()->set_name(*next_name); + aln.set_read_paired(true); + } + // at one point vg call needed these, maybe it doesn't anymore though + aln.set_identity(identity(aln.path())); +} + +void MultipathAlignmentEmitter::create_alignment_shim(const string& name, const multipath_alignment_t& mp_aln, + Alignment& shim, const string* prev_name, const string* next_name) const { + + shim.set_sequence(mp_aln.sequence()); + shim.set_quality(mp_aln.quality()); + shim.set_name(name); + if (prev_name) { + shim.mutable_fragment_prev()->set_name(*prev_name); + } + if (next_name) { + shim.mutable_fragment_next()->set_name(*next_name); + } + if (!read_group.empty()) { + shim.set_read_group(read_group); + } + if (!sample_name.empty()) { + shim.set_sample_name(sample_name); + } + shim.set_mapping_quality(mp_aln.mapping_quality()); + // do we have at least 1 mapping? 
+ bool mapped = false; + for (size_t i = 0; i < mp_aln.subpath_size() && !mapped; ++i) { + const auto& path = mp_aln.subpath(i).path(); + for (size_t j = 0; j < path.mapping_size() && !mapped; ++j) { + mapped = true; + } + } + if (mapped) { + // hacky way to inform the conversion code that the read is mapped + shim.mutable_path()->add_mapping(); + // and we'll also communicate the alignment score + shim.set_score(optimal_alignment_score(mp_aln, true)); + } + + // this tag comes from surject and is used in both + if (mp_aln.has_annotation("all_scores")) { + auto anno = mp_aln.get_annotation("all_scores"); + assert(anno.first == multipath_alignment_t::String); + set_annotation(&shim, "all_scores", *((const string*) anno.second)); + } +} + +void MultipathAlignmentEmitter::convert_to_hts_unpaired(const string& name, const multipath_alignment_t& mp_aln, + const string& ref_name, bool ref_rev, int64_t ref_pos, + bam_hdr_t* header, vector& dest) const { + + auto cigar = cigar_against_path(mp_aln, ref_name, ref_rev, ref_pos, *graph, min_splice_length); + Alignment shim; + create_alignment_shim(name, mp_aln, shim); + auto bam = alignment_to_bam(header, shim, ref_name, ref_pos, ref_rev, cigar); + add_annotations(mp_aln, bam); + dest.push_back(bam); +} + +void MultipathAlignmentEmitter::convert_to_hts_paired(const string& name_1, const string& name_2, + const multipath_alignment_t& mp_aln_1, + const multipath_alignment_t& mp_aln_2, + const string& ref_name_1, bool ref_rev_1, int64_t ref_pos_1, + const string& ref_name_2, bool ref_rev_2, int64_t ref_pos_2, + int64_t tlen_limit, bam_hdr_t* header, vector& dest) const { + + auto cigar_1 = cigar_against_path(mp_aln_1, ref_name_1, ref_rev_1, ref_pos_1, *graph, min_splice_length); + auto cigar_2 = cigar_against_path(mp_aln_2, ref_name_2, ref_rev_2, ref_pos_2, *graph, min_splice_length); + + Alignment shim_1, shim_2; + create_alignment_shim(name_1, mp_aln_1, shim_1, nullptr, &name_2); + create_alignment_shim(name_2, mp_aln_2, shim_2, &name_1, nullptr); + + auto tlens = compute_template_lengths(ref_pos_1, cigar_1, ref_pos_2, cigar_2); + + auto bam_1 = alignment_to_bam(header, shim_1, ref_name_1, ref_pos_1, ref_rev_1, cigar_1, + ref_name_2, ref_pos_2, ref_rev_2, tlens.first, tlen_limit); + auto bam_2 = alignment_to_bam(header, shim_2, ref_name_2, ref_pos_2, ref_rev_2, cigar_2, + ref_name_1, ref_pos_1, ref_rev_1, tlens.second, tlen_limit); + + + // set mate unmapped flags + // FIXME: the BAM conversion code looks like it doesn't do this correctly... 
+ if (bam_1->core.flag & BAM_FUNMAP) { + bam_2->core.flag |= BAM_FMUNMAP; + } + else { + bam_2->core.flag &= ~BAM_FMUNMAP; + } + if (bam_2->core.flag & BAM_FUNMAP) { + bam_1->core.flag |= BAM_FMUNMAP; + } + else { + bam_1->core.flag &= ~BAM_FMUNMAP; + } + + add_annotations(mp_aln_1, bam_1); + add_annotations(mp_aln_2, bam_2); + + dest.push_back(bam_1); + dest.push_back(bam_2); +} + +void MultipathAlignmentEmitter::add_annotations(const multipath_alignment_t& mp_aln, bam1_t* bam) const { + if (mp_aln.has_annotation("allelic_mapq")) { + auto anno = mp_aln.get_annotation("allelic_mapq"); + assert(anno.first == multipath_alignment_t::Double); + int64_t allelic_mapq = *((double*) anno.second); + bam_aux_update_int(bam, "AQ", allelic_mapq); + } + if (mp_aln.has_annotation("group_mapq")) { + auto anno = mp_aln.get_annotation("group_mapq"); + assert(anno.first == multipath_alignment_t::Double); + int64_t group_mapq = *((double*) anno.second); + bam_aux_update_int(bam, "GM", group_mapq); + } + if (mp_aln.has_annotation("secondary")) { + auto anno = mp_aln.get_annotation("secondary"); + assert(anno.first == multipath_alignment_t::Bool); + bool secondary = *((bool*) anno.second); + if (secondary) { + bam->core.flag |= BAM_FSECONDARY; + } + } + if (mp_aln.has_annotation("proper_pair")) { + // we've annotated proper pairing, let this override the tlen limit + auto anno = mp_aln.get_annotation("proper_pair"); + assert(anno.first == multipath_alignment_t::Bool); + bool proper_pair = *((bool*) anno.second); + // we assume proper pairing applies to both reads + if (proper_pair) { + bam->core.flag |= BAM_FPROPER_PAIR; + } + else { + bam->core.flag &= ~BAM_FPROPER_PAIR; + } + } +} + +} diff --git a/src/multipath_alignment_emitter.hpp b/src/multipath_alignment_emitter.hpp new file mode 100644 index 00000000000..10f9f518df8 --- /dev/null +++ b/src/multipath_alignment_emitter.hpp @@ -0,0 +1,117 @@ +#ifndef VG_MULTIPATH_ALIGNMENT_EMITTER_HPP_INCLUDED +#define VG_MULTIPATH_ALIGNMENT_EMITTER_HPP_INCLUDED + +/** + * \file multipath_alignment_emitter.hpp + * + * Defines a system for emitting multipath alignments and groups of multipath alignments in multiple formats. + */ + +#include +#include +#include + +#include +#include +#include +#include "multipath_alignment.hpp" +#include "hts_alignment_emitter.hpp" +#include "alignment.hpp" + +namespace vg { +using namespace std; + +/* + * Class that handles multithreaded output for multipath alignments + */ +class MultipathAlignmentEmitter : public HTSWriter { +public: + + /// Initialize with the intended output stream and the maximum number of threads that + /// will be outputting. + /// Allowed formats: + /// - "GAMP" + /// - "GAM", involves conversion to single path + /// - "GAF", involves conversion to single path, requires a graph + /// - "SAM", "BAM", "CRAM:" requires path length map, and all input alignments must + /// already be surjected. 
If alignments have connections, requires a graph + MultipathAlignmentEmitter(const string& filename, size_t num_threads, const string out_format = "GAMP", + const PathPositionHandleGraph* graph = nullptr, + const vector>* path_order_and_length = nullptr); + ~MultipathAlignmentEmitter(); + + /// Choose a read group to apply to all emitted alignments + void set_read_group(const string& read_group); + + /// Choose a sample name to apply to all emitted alignments + void set_sample_name(const string& sample_name); + + /// Set the length deletion (at a node boundary) that will be considered an unaligned splicing event + /// in HTSLib output + void set_min_splice_length(int64_t min_splice_length); + + /// Emit paired read mappings as interleaved protobuf messages + void emit_pairs(const string& name_1, const string& name_2, + vector>&& mp_aln_pairs, + vector, tuple>>* path_positions = nullptr, + vector* tlen_limits = nullptr); + + /// Emit read mappings as protobuf messages + void emit_singles(const string& name, vector&& mp_alns, + vector>* path_positions = nullptr); + +private: + + /// what format are we outputting in + enum output_format_t {GAMP, GAM, GAF, BAM, SAM, CRAM}; + output_format_t format; + + /// make a GAM alignment from a multipath alignment + void convert_to_alignment(const multipath_alignment_t& mp_aln, Alignment& aln, + const string* prev_name = nullptr, + const string* next_name = nullptr) const; + + /// store the data in an Algnment that is used in the conversion to bam1_t + void create_alignment_shim(const string& name, const multipath_alignment_t& mp_aln, + Alignment& shim, const string* prev_name = nullptr, + const string* next_name = nullptr) const; + + /// store a bam1_t object with the indicated data in the dest vector + void convert_to_hts_unpaired(const string& name, const multipath_alignment_t& mp_aln, + const string& ref_name, bool ref_rev, int64_t ref_pos, + bam_hdr_t* header, vector& dest) const; + + /// store two paired bam1_t objects with the indicated data in the dest vector + void convert_to_hts_paired(const string& name_1, const string& name_2, + const multipath_alignment_t& mp_aln_1, + const multipath_alignment_t& mp_aln_2, + const string& ref_name_1, bool ref_rev_1, int64_t ref_pos_1, + const string& ref_name_2, bool ref_rev_2, int64_t ref_pos_2, + int64_t tlen_limit, bam_hdr_t* header, vector& dest) const; + + /// transfer the allelic mapq, group mapq, and secondary annotations to a BAM record + void add_annotations(const multipath_alignment_t& mp_aln, bam1_t* bam) const; + + const PathPositionHandleGraph* graph; + + /// an Alignment emitter for each thread + vector>> aln_emitters; + + /// a MultipathAlignment emitter for each thread + vector>> mp_aln_emitters; + + /// read group applied to alignments + string read_group; + + /// sample name applied to alignments + string sample_name; + + /// the shortest deletion that we will interpret as a splice in the CIGAR string of HTS output + int64_t min_splice_length = numeric_limits::max(); + +}; + +} + + +#endif diff --git a/src/multipath_alignment_graph.cpp b/src/multipath_alignment_graph.cpp index 2edc2005ec7..a4d1c3e0cd2 100644 --- a/src/multipath_alignment_graph.cpp +++ b/src/multipath_alignment_graph.cpp @@ -3,69 +3,126 @@ // #include "multipath_alignment_graph.hpp" +#include "sequence_complexity.hpp" + +#include "structures/rank_pairing_heap.hpp" + +#include "algorithms/extract_connecting_graph.hpp" +#include "algorithms/extract_extending_graph.hpp" //#define debug_multipath_alignment +//#define 
debug_decompose_algorithm +//#define debug_shift_pruning using namespace std; namespace vg { unordered_multimap> MultipathAlignmentGraph::create_injection_trans(const unordered_map>& projection_trans) { // create the injection translator, which maps a node in the original graph to every one of its occurrences - // in the dagified graph + // in the dagified graphfs unordered_multimap > injection_trans; for (const auto& trans_record : projection_trans) { #ifdef debug_multipath_alignment - cerr << trans_record.second.first << "->" << trans_record.first << (trans_record.second.second ? "-" : "+") << endl; + cerr << trans_record.second.first << " -> " << trans_record.first << (trans_record.second.second ? "-" : "+") << endl; #endif injection_trans.emplace(trans_record.second.first, make_pair(trans_record.first, trans_record.second.second)); } return injection_trans; } + + function(id_t)> MultipathAlignmentGraph::create_projector(const unordered_map>& projection_trans) { + return [&](id_t node_id) { return projection_trans.at(node_id); }; + } + + unordered_multimap> MultipathAlignmentGraph::create_injection_trans(const HandleGraph& graph, + const function(id_t)>& project) { + unordered_multimap> injection_trans; + graph.for_each_handle([&](const handle_t& handle) { + id_t node_id = graph.get_id(handle); + auto proj = project(node_id); + injection_trans.emplace(proj.first, make_pair(node_id, proj.second)); + }); + return injection_trans; + } - MultipathAlignmentGraph::MultipathAlignmentGraph(VG& vg, + unordered_map> MultipathAlignmentGraph::create_identity_projection_trans(const HandleGraph& graph) { + unordered_map> to_return; + + graph.for_each_handle([&](const handle_t& handle) { + // Each node just projects from itself forward. + to_return[graph.get_id(handle)] = make_pair(graph.get_id(handle), false); + }); + + return to_return; + } + + MultipathAlignmentGraph::MultipathAlignmentGraph(const HandleGraph& graph, const vector, Path>>& path_chunks, - const Alignment& alignment, const unordered_map>& projection_trans, - const unordered_multimap>& injection_trans) { + const Alignment& alignment, const function(id_t)>& project, + const unordered_multimap>& injection_trans, bool realign_Ns, + bool preserve_tail_anchors, vector* path_node_provenance) { // Set up the initial multipath graph from the given path chunks. 
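The projection/injection bookkeeping above amounts to inverting a map: the dagified graph projects many-to-one onto the original graph, and `create_injection_trans` records every dagified occurrence of each original node. A toy illustration follows, not vg code; plain `int64_t` stands in for the node ID type, and any IDs a caller would use are made up.

```
// Toy illustration: invert a projection map so each original node lists all of
// its occurrences in the dagified graph, mirroring create_injection_trans.
#include <cstdint>
#include <unordered_map>
#include <utility>

std::unordered_multimap<int64_t, std::pair<int64_t, bool>>
invert_projection(const std::unordered_map<int64_t, std::pair<int64_t, bool>>& projection_trans) {
    std::unordered_multimap<int64_t, std::pair<int64_t, bool>> injection_trans;
    for (const auto& rec : projection_trans) {
        // key: original node ID; value: (dagified node ID, relative orientation)
        injection_trans.emplace(rec.second.first, std::make_pair(rec.first, rec.second.second));
    }
    return injection_trans;
}
```

Calling `equal_range` on the result for an original node ID then yields every copy of that node in the dagified graph.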
- create_path_chunk_nodes(vg, path_chunks, alignment, projection_trans, injection_trans); + create_path_chunk_nodes(graph, path_chunks, alignment, project, injection_trans, path_node_provenance); // trim indels off of nodes to make the score dynamic programmable across nodes - trim_hanging_indels(alignment); + trim_hanging_indels(alignment, realign_Ns, preserve_tail_anchors); // compute reachability and add edges - add_reachability_edges(vg, projection_trans, injection_trans); + add_reachability_edges(graph, project, injection_trans); } - MultipathAlignmentGraph::MultipathAlignmentGraph(VG& vg, + MultipathAlignmentGraph::MultipathAlignmentGraph(const HandleGraph& graph, const vector, Path>>& path_chunks, - const Alignment& alignment, const unordered_map>& projection_trans) : - MultipathAlignmentGraph(vg, path_chunks, alignment, projection_trans, - create_injection_trans(projection_trans)) { + const Alignment& alignment, const function(id_t)>& project, bool realign_Ns, + bool preserve_tail_anchors, vector* path_node_provenance) : + MultipathAlignmentGraph(graph, path_chunks, alignment, project, + create_injection_trans(graph, project), realign_Ns, preserve_tail_anchors, + path_node_provenance) { + // Nothing to do + + } + + MultipathAlignmentGraph::MultipathAlignmentGraph(const HandleGraph& graph, + const vector, Path>>& path_chunks, + const Alignment& alignment, const unordered_map>& projection_trans, bool realign_Ns, + bool preserve_tail_anchors, vector* path_node_provenance) : + MultipathAlignmentGraph(graph, path_chunks, alignment, create_projector(projection_trans), + create_injection_trans(projection_trans), realign_Ns, preserve_tail_anchors, + path_node_provenance) { // Nothing to do } - MultipathAlignmentGraph::MultipathAlignmentGraph(VG& vg, const MultipathMapper::memcluster_t& hits, - const unordered_map>& projection_trans, + MultipathAlignmentGraph::MultipathAlignmentGraph(const HandleGraph& graph, MultipathMapper::memcluster_t& hits, + const function(id_t)>& project, const unordered_multimap>& injection_trans, - gcsa::GCSA* gcsa) { + vector& path_node_provenance, + size_t max_branch_trim_length, gcsa::GCSA* gcsa, + const MultipathMapper::match_fanouts_t* fanout_breaks) { // initialize the match nodes - create_match_nodes(vg, hits, projection_trans, injection_trans); + create_match_nodes(graph, hits, project, injection_trans, path_node_provenance, max_branch_trim_length, fanout_breaks); if (gcsa) { // we indicated that these MEMs came from a GCSA, so there might be order-length MEMs that we can combine - collapse_order_length_runs(vg, gcsa); + // TODO: this can lose some provenance information + collapse_order_length_runs(graph, gcsa, path_node_provenance); + } + + if (max_branch_trim_length) { + // we indicated that we'd like to trim the path nodes to avoid ends that cause us to get locked into one + // branch after a branch point + trim_to_branch_points(&graph, max_branch_trim_length); } #ifdef debug_multipath_alignment - cerr << "nodes after adding and collapsing:" << endl; + cerr << "nodes after adding, jittering, trimming, and collapsing:" << endl; for (size_t i = 0; i < path_nodes.size(); i++) { - PathNode& path_node = path_nodes[i]; - cerr << i << " " << pb2json(path_node.path) << " "; + PathNode& path_node = path_nodes.at(i); + cerr << i << " (hit " << path_node_provenance[i] << ") " << debug_string(path_node.path) << " "; for (auto iter = path_node.begin; iter != path_node.end; iter++) { cerr << *iter; } @@ -74,52 +131,84 @@ namespace vg { #endif // compute reachability 
and add edges - add_reachability_edges(vg, projection_trans, injection_trans); + add_reachability_edges(graph, project, injection_trans, &path_node_provenance); } - MultipathAlignmentGraph::MultipathAlignmentGraph(VG& vg, const MultipathMapper::memcluster_t& hits, + MultipathAlignmentGraph::MultipathAlignmentGraph(const HandleGraph& graph, MultipathMapper::memcluster_t& hits, + const function(id_t)>& project, + vector& path_node_provenance, + size_t max_branch_trim_length, gcsa::GCSA* gcsa, + const MultipathMapper::match_fanouts_t* fanout_breaks) : + MultipathAlignmentGraph(graph, hits, project, + create_injection_trans(graph, project), + path_node_provenance, max_branch_trim_length, + gcsa, fanout_breaks) { + // Nothing to do + + } + + MultipathAlignmentGraph::MultipathAlignmentGraph(const HandleGraph& graph, MultipathMapper::memcluster_t& hits, const unordered_map>& projection_trans, - gcsa::GCSA* gcsa) : - MultipathAlignmentGraph(vg, hits, projection_trans, - create_injection_trans(projection_trans), gcsa) { + vector& path_node_provenance, + size_t max_branch_trim_length, gcsa::GCSA* gcsa, + const MultipathMapper::match_fanouts_t* fanout_breaks) : + MultipathAlignmentGraph(graph, hits, create_projector(projection_trans), + create_injection_trans(projection_trans), + path_node_provenance, max_branch_trim_length, + gcsa, fanout_breaks) { // Nothing to do } - MultipathAlignmentGraph::MultipathAlignmentGraph(VG& vg, const Alignment& alignment, SnarlManager& snarl_manager, size_t max_snarl_cut_size, - const unordered_map>& projection_trans, + MultipathAlignmentGraph::MultipathAlignmentGraph(const HandleGraph& graph, const Alignment& alignment, SnarlManager* snarl_manager, + SnarlDistanceIndex* dist_index, size_t max_snarl_cut_size, + const function(id_t)>& project, const unordered_multimap>& injection_trans) { // this can only be done on aligned sequences - if (!alignment.has_path()) { + if (!alignment.has_path() || alignment.path().mapping_size() == 0) { + has_reachability_edges = true; return; } // shim the aligned path into the path chunks constructor to make a node for it vector, Path>> path_holder; path_holder.emplace_back(make_pair(alignment.sequence().begin(), alignment.sequence().end()), alignment.path()); - create_path_chunk_nodes(vg, path_holder, alignment, projection_trans, injection_trans); + create_path_chunk_nodes(graph, path_holder, alignment, project, injection_trans); // cut the snarls out of the aligned path so we can realign through them - resect_snarls_from_paths(&snarl_manager, projection_trans, max_snarl_cut_size); + if (max_snarl_cut_size) { + resect_snarls_from_paths(snarl_manager, dist_index, project, max_snarl_cut_size); + } // the snarls algorithm adds edges where necessary has_reachability_edges = true; // trim indels from the end of path nodes so that scores will be dynamic programmable across subpaths - trim_hanging_indels(alignment); + trim_hanging_indels(alignment, true); } - - MultipathAlignmentGraph::MultipathAlignmentGraph(VG& vg, const Alignment& alignment, SnarlManager& snarl_manager, size_t max_snarl_cut_size, + + MultipathAlignmentGraph::MultipathAlignmentGraph(const HandleGraph& graph, const Alignment& alignment, SnarlManager* snarl_manager, + SnarlDistanceIndex* dist_index, size_t max_snarl_cut_size, const unordered_map>& projection_trans) : - MultipathAlignmentGraph(vg, alignment, snarl_manager, max_snarl_cut_size, projection_trans, + MultipathAlignmentGraph(graph, alignment, snarl_manager, dist_index, max_snarl_cut_size, + 
create_projector(projection_trans), create_injection_trans(projection_trans)) { // Nothing to do } + + MultipathAlignmentGraph::MultipathAlignmentGraph(const HandleGraph& graph, const Alignment& alignment, SnarlManager* snarl_manager, + SnarlDistanceIndex* dist_index, size_t max_snarl_cut_size, + const function(id_t)>& project) : + MultipathAlignmentGraph(graph, alignment, snarl_manager, dist_index, max_snarl_cut_size, + project, create_injection_trans(graph, project)) { + // Nothing to do + } - void MultipathAlignmentGraph::create_path_chunk_nodes(VG& vg, const vector, Path>>& path_chunks, - const Alignment& alignment, const unordered_map>& projection_trans, - const unordered_multimap>& injection_trans) { + void MultipathAlignmentGraph::create_path_chunk_nodes(const HandleGraph& graph, const vector, Path>>& path_chunks, + const Alignment& alignment, const function(id_t)>& project, + const unordered_multimap>& injection_trans, + vector* path_node_provenance) { for (const auto& path_chunk : path_chunks) { @@ -139,8 +228,8 @@ namespace vg { } // stack for DFS, each record contains records of (next trav index, next traversals) - vector>> stack; - stack.emplace_back(0, vector{NodeTraversal(vg.get_node(injected_id))}); + vector>> stack; + stack.emplace_back(0, vector{graph.get_handle(injected_id)}); while (!stack.empty()) { auto& back = stack.back(); @@ -151,36 +240,34 @@ namespace vg { stack.pop_back(); continue; } - NodeTraversal trav = back.second[back.first]; + handle_t trav = back.second[back.first]; back.first++; #ifdef debug_multipath_alignment - cerr << "checking node " << trav.node->id() << endl; + cerr << "checking node " << graph.get_id(trav) << endl; #endif + pair projected_trav = project(graph.get_id(trav)); - auto f = projection_trans.find(trav.node->id()); - if (f != projection_trans.end()) { - pair projected_trav = f->second; + const Position& pos = path.mapping(stack.size() - 1).position(); + if (projected_trav.first == pos.node_id() && + projected_trav.second == (projected_trav.second != graph.get_is_reverse(trav))) { + + // position matched the path - const Position& pos = path.mapping(stack.size() - 1).position(); - if (projected_trav.first == pos.node_id() && - projected_trav.second == (projected_trav.second != trav.backward)) { - - // position matched the path - #ifdef debug_multipath_alignment - cerr << "chunk position " << pb2json(pos) << " matches traversal " << projected_trav.first << (projected_trav.second ? "-" : "+") << endl; + cerr << "chunk position " << pb2json(pos) << " matches traversal " << projected_trav.first << (projected_trav.second ? 
"-" : "+") << ", walked " << stack.size() << " of " << path.mapping_size() << " mappings" << endl; #endif - - if (stack.size() == path.mapping_size()) { + + if (stack.size() == path.mapping_size()) { #ifdef debug_multipath_alignment - cerr << "finished walking path" << endl; + cerr << "finished walking path" << endl; #endif - break; - } - stack.emplace_back(0, vector()); - vg.nodes_next(trav, stack.back().second); + break; } + stack.emplace_back(0, vector()); + graph.follow_edges(trav, false, [&](const handle_t& next) { + stack.back().second.emplace_back(next); + }); } } @@ -193,6 +280,9 @@ namespace vg { } // now we can make a node in the subpath graph + if (path_node_provenance) { + path_node_provenance->push_back(path_nodes.size()); + } path_nodes.emplace_back(); PathNode& path_node = path_nodes.back(); @@ -205,146 +295,255 @@ namespace vg { const Position& position = mapping.position(); auto& stack_record = stack[i]; - NodeTraversal& trav = stack_record.second[stack_record.first - 1]; - - Mapping* new_mapping = path_node.path.add_mapping(); - Position* new_position = new_mapping->mutable_position(); - - new_mapping->set_rank(path_node.path.mapping_size()); + handle_t& trav = stack_record.second[stack_record.first - 1]; + path_mapping_t* new_mapping = path_node.path.add_mapping(); + position_t* new_position = new_mapping->mutable_position(); + // use the node space that we walked out in - new_position->set_node_id(trav.node->id()); - new_position->set_is_reverse(trav.backward); + new_position->set_node_id(graph.get_id(trav)); + new_position->set_is_reverse(graph.get_is_reverse(trav)); new_position->set_offset(position.offset()); for (int64_t j = 0; j < mapping.edit_size(); j++) { - *new_mapping->add_edit() = mapping.edit(j); + from_proto_edit(mapping.edit(j), *new_mapping->add_edit()); } } #ifdef debug_multipath_alignment - cerr << "walked path: " << pb2json(path_node.path) << endl; + cerr << "walked path: " << debug_string(path_node.path) << endl; + cerr << "sequence: " << string(path_node.begin, path_node.end) << endl; #endif } } +#ifdef debug_multipath_alignment + cerr << "final path chunks:" << endl; + for (size_t i = 0; i < path_nodes.size(); ++i) { + cerr << i << ": " << string(path_nodes[i].begin, path_nodes[i].end) << " " << debug_string(path_nodes[i].path) << endl; + } +#endif } + - void MultipathAlignmentGraph::trim_hanging_indels(const Alignment& alignment) { - - // if the path begins or ends with any gaps we have to remove them to make the score - // dynamic programmable across Subpaths + bool MultipathAlignmentGraph::trim_and_check_for_empty(const Alignment& alignment, bool trim_Ns, PathNode& path_node, + bool preserve_tail_anchors, int64_t* removed_start_from_length, + int64_t* removed_end_from_length) { - unordered_set to_remove; +#ifdef debug_multipath_alignment + cerr << "trimming path node " << string(path_node.begin, path_node.end) << " " << debug_string(path_node.path) << endl; +#endif - for (size_t i = 0; i < path_nodes.size(); i++) { - - PathNode& path_node = path_nodes[i]; - Path& path = path_node.path; - - int64_t mapping_start_idx = 0; - int64_t mapping_last_idx = path.mapping_size() - 1; - - int64_t edit_start_idx = 0; - int64_t edit_last_idx = path.mapping(mapping_last_idx).edit_size() - 1; - - // don't cut off softclips (we assume the entire softclip is in one edit and the next edit is aligned bases) - bool softclip_start = (path.mapping(0).edit(0).from_length() == 0 && - path.mapping(0).edit(0).to_length() > 0 && - path_node.begin == 
alignment.sequence().begin()); - - bool softclip_end = (path.mapping(mapping_last_idx).edit(edit_last_idx).from_length() == 0 && - path.mapping(mapping_last_idx).edit(edit_last_idx).to_length() > 0 && - path_node.end == alignment.sequence().end()); - - int64_t removed_start_to_length = 0; - int64_t removed_end_to_length = 0; - - int64_t removed_start_mapping_from_length = 0; + // Trim down the given PathNode of everything except softclips. + // Return true if it all gets trimmed away and should be removed. + path_t& path = path_node.path; - // find the first aligned, non-N bases from the start of the path - if (!softclip_start) { - bool found_start = false; - for (; mapping_start_idx < path.mapping_size(); mapping_start_idx++) { - const Mapping& mapping = path.mapping(mapping_start_idx); - removed_start_mapping_from_length = 0; - for (edit_start_idx = 0; edit_start_idx < mapping.edit_size(); edit_start_idx++) { - const Edit& edit = mapping.edit(edit_start_idx); - - if (edit.from_length() > 0 && edit.to_length() > 0 && - (edit.sequence().empty() || any_of(edit.sequence().begin(), edit.sequence().end(), [](char c) {return c != 'N';}))) { - found_start = true; - break; - } - removed_start_to_length += edit.to_length(); - removed_start_mapping_from_length += edit.from_length(); - } - if (found_start) { + int64_t mapping_start_idx = 0; + int64_t mapping_last_idx = path.mapping_size() - 1; + + int64_t edit_start_idx = 0; + int64_t edit_last_idx = path.mapping(mapping_last_idx).edit_size() - 1; + + // don't cut off softclips (we assume the entire softclip is in one edit and the next edit is aligned bases) + bool softclip_start = (path.mapping(0).edit(0).from_length() == 0 && + path.mapping(0).edit(0).to_length() > 0 && + path_node.begin == alignment.sequence().begin()); + bool softclip_end = (path.mapping(mapping_last_idx).edit(edit_last_idx).from_length() == 0 && + path.mapping(mapping_last_idx).edit(edit_last_idx).to_length() > 0 && + path_node.end == alignment.sequence().end()); + + // if indicated, we may want to preserve the location of tail anchors in spite of deletions + bool ignore_deletion_start = false; + bool ignore_deletion_end = false; + bool ignore_insertion_start = false; + bool ignore_insertion_end = false; + if (preserve_tail_anchors) { + if (path_node.begin == alignment.sequence().begin()) { + const auto& first_edit = path.mapping(0).edit(0); + ignore_deletion_start = (first_edit.from_length() != 0 && first_edit.to_length() == 0); + ignore_insertion_start = (first_edit.from_length() == 0 && first_edit.to_length() != 0); + } + if (path_node.end == alignment.sequence().end()) { + const auto& last_edit = path.mapping(mapping_last_idx).edit(edit_last_idx); + ignore_deletion_end = (last_edit.from_length() != 0 && last_edit.to_length() == 0); + ignore_insertion_end = (last_edit.from_length() == 0 && last_edit.to_length() != 0); + } + } + + +#ifdef debug_multipath_alignment + cerr << "preserving softclips: begin? " << softclip_start << ", end? " << softclip_end << endl; + cerr << "preserving deletion anchors: begin? " << ignore_deletion_start << ", end? " << ignore_deletion_end << endl; + cerr << "preserving insertion anchors: begin? " << ignore_insertion_start << ", end? 
" << ignore_insertion_end << endl; +#endif + + // Track how much we trim off each end + int64_t removed_start_to_length = 0; + int64_t removed_end_to_length = 0; + if (removed_start_from_length != nullptr) { + *removed_start_from_length = 0; + } + if (removed_end_from_length != nullptr) { + *removed_end_from_length = 0; + } + + int64_t removed_start_mapping_from_length = 0; + + // find the first aligned, non-N bases from the start of the path + if (!softclip_start && !ignore_deletion_start && !ignore_insertion_start) { + bool found_start = false; + for (; mapping_start_idx < path.mapping_size(); mapping_start_idx++) { + const path_mapping_t& mapping = path.mapping(mapping_start_idx); + removed_start_mapping_from_length = 0; + for (edit_start_idx = 0; edit_start_idx < mapping.edit_size(); edit_start_idx++) { + const edit_t& edit = mapping.edit(edit_start_idx); + + if (edit.from_length() > 0 && edit.to_length() > 0 && + (edit.sequence().empty() || !trim_Ns || any_of(edit.sequence().begin(), edit.sequence().end(), [](char c) {return c != 'N';}))) { + found_start = true; break; } + removed_start_to_length += edit.to_length(); + removed_start_mapping_from_length += edit.from_length(); + } + + if (removed_start_from_length != nullptr) { + // Record how much we trimmed + *removed_start_from_length += removed_start_mapping_from_length; + } + + if (found_start) { + break; } } - - // find the first aligned bases from the end of the path - if (!softclip_end) { - bool found_last = false; - for (; mapping_last_idx >= 0; mapping_last_idx--) { - const Mapping& mapping = path.mapping(mapping_last_idx); - for (edit_last_idx = mapping.edit_size() - 1; edit_last_idx >= 0; edit_last_idx--) { - const Edit& edit = mapping.edit(edit_last_idx); - if (edit.from_length() > 0 && edit.to_length() > 0 && - (edit.sequence().empty() || any_of(edit.sequence().begin(), edit.sequence().end(), [](char c) {return c != 'N';}))) { - found_last = true; - break; - } - removed_end_to_length += edit.to_length(); - } - if (found_last) { + } + + // find the first aligned bases from the end of the path + if (!softclip_end && !ignore_deletion_end && !ignore_insertion_end) { + bool found_last = false; + for (; mapping_last_idx >= 0; mapping_last_idx--) { + const path_mapping_t& mapping = path.mapping(mapping_last_idx); + for (edit_last_idx = mapping.edit_size() - 1; edit_last_idx >= 0; edit_last_idx--) { + const edit_t& edit = mapping.edit(edit_last_idx); + if (edit.from_length() > 0 && edit.to_length() > 0 && + (edit.sequence().empty() || !trim_Ns || any_of(edit.sequence().begin(), edit.sequence().end(), [](char c) {return c != 'N';}))) { + found_last = true; break; } + removed_end_to_length += edit.to_length(); + if (removed_end_from_length != nullptr) { + *removed_end_from_length += edit.to_length(); + } + } + if (found_last) { + break; } } + } #ifdef debug_multipath_alignment - cerr << "after removing non-softclip flanking indels and N matches, path goes from (" << mapping_start_idx << ", " << edit_start_idx << ") to (" << mapping_last_idx << ", " << edit_last_idx << ")" << endl; + cerr << "after removing non-softclip flanking indels and N matches, path goes from (" << mapping_start_idx << ", " << edit_start_idx << ") to (" << mapping_last_idx << ", " << edit_last_idx << ")" << endl; #endif + + // did we find any indels? + if (mapping_start_idx != 0 || mapping_last_idx + 1 != path.mapping_size() || + edit_start_idx != 0 || edit_last_idx + 1 != path.mapping(mapping_last_idx).edit_size()) { - // did we find any indels? 
- if (mapping_start_idx != 0 || mapping_last_idx + 1 != path.mapping_size() || - edit_start_idx != 0 || edit_last_idx + 1 != path.mapping(mapping_last_idx).edit_size()) { - - // would we need to trim the whole node? - if (mapping_start_idx < mapping_last_idx || - (mapping_start_idx == mapping_last_idx && edit_start_idx <= edit_last_idx)) { + // would we need to trim the whole node? + if (mapping_start_idx < mapping_last_idx || + (mapping_start_idx == mapping_last_idx && edit_start_idx <= edit_last_idx)) { + + // update the read interval + path_node.begin += removed_start_to_length; + path_node.end -= removed_end_to_length; + + // make a new path with the indels trimmed + path_t trimmed_path; + for (int64_t j = mapping_start_idx; j <= mapping_last_idx; j++) { + const path_mapping_t& mapping = path.mapping(j); + const position_t& position = mapping.position(); - // update the read interval - path_node.begin += removed_start_to_length; - path_node.end -= removed_end_to_length; + path_mapping_t* new_mapping = trimmed_path.add_mapping(); + position_t* new_position = new_mapping->mutable_position(); - // make a new path with the indels trimmed - Path trimmed_path; - for (int64_t j = mapping_start_idx; j <= mapping_last_idx; j++) { - const Mapping& mapping = path.mapping(j); - const Position& position = mapping.position(); - - Mapping* new_mapping = trimmed_path.add_mapping(); - Position* new_position = new_mapping->mutable_position(); - - new_position->set_node_id(position.node_id()); - new_position->set_is_reverse(position.is_reverse()); - new_position->set_offset(position.offset() + (j == mapping_start_idx ? removed_start_mapping_from_length : 0)); - - size_t k_start = (j == mapping_start_idx ? edit_start_idx : 0); - size_t k_end = (j == mapping_last_idx ? edit_last_idx + 1 : mapping.edit_size()); - for (size_t k = k_start; k < k_end; k++) { - *new_mapping->add_edit() = mapping.edit(k); - } - } + new_position->set_node_id(position.node_id()); + new_position->set_is_reverse(position.is_reverse()); + new_position->set_offset(position.offset() + (j == mapping_start_idx ? removed_start_mapping_from_length : 0)); - path_node.path = move(trimmed_path); - } - else { - to_remove.insert(i); + size_t k_start = (j == mapping_start_idx ? edit_start_idx : 0); + size_t k_end = (j == mapping_last_idx ? 
edit_last_idx + 1 : mapping.edit_size()); + for (size_t k = k_start; k < k_end; k++) { + *new_mapping->add_edit() = mapping.edit(k); + } } + + path_node.path = move(trimmed_path); + } + else if (ignore_deletion_start) { + // we would need to remove the whole node, except we indicated that we want + // to preserve the tail anchors to the start + + path_node.end = path_node.begin; + + pos_t start_pos = initial_position(path_node.path); + path.clear_mapping(); + path_mapping_t* mapping = path.add_mapping(); + position_t* pos = mapping->mutable_position(); + pos->set_node_id(id(start_pos)); + pos->set_offset(offset(start_pos)); + pos->set_is_reverse(is_rev(start_pos)); + mapping->add_edit(); + +#ifdef debug_multipath_alignment + cerr << "preserving start deletion path read[" << (path_node.begin - alignment.sequence().begin()) << "] " << debug_string(path_node.path) << endl; +#endif + } + else if (ignore_deletion_end) { + // we would need to remove the whole node, except we indicated that we want + // to preserve the tail anchors to the end + + path_node.begin = path_node.end; + + pos_t end_pos = final_position(path_node.path); + path.clear_mapping(); + path_mapping_t* mapping = path.add_mapping(); + position_t* pos = mapping->mutable_position(); + pos->set_node_id(id(end_pos)); + pos->set_offset(offset(end_pos)); + pos->set_is_reverse(is_rev(end_pos)); + mapping->add_edit(); + +#ifdef debug_multipath_alignment + cerr << "preserving end deletion path read[" << (path_node.begin - alignment.sequence().begin()) << "] " << debug_string(path_node.path) << endl; +#endif + } + else if (!ignore_insertion_start && !ignore_insertion_end) { +#ifdef debug_multipath_alignment + cerr << "entire path node is trimmed" << endl; +#endif + // We do need to remove the whole node; now it is empty. 
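+ // A true return value tells the caller to discard this PathNode entirely: in trim_hanging_indels()
+ // below, these indices are collected into a to_remove set, the emptied nodes are dropped, and their
+ // edges are traversed through so that connectivity between the surviving PathNodes is preserved.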
+ return true; + } + } + + // The node itself can stay + return false; + } + + void MultipathAlignmentGraph::trim_hanging_indels(const Alignment& alignment, bool trim_Ns, + bool preserve_tail_anchors) { + + // if the path begins or ends with any gaps we have to remove them to make the score + // dynamic programmable across Subpaths + + unordered_set to_remove; + + for (size_t i = 0; i < path_nodes.size(); i++) { + + PathNode& path_node = path_nodes.at(i); + + if (trim_and_check_for_empty(alignment, trim_Ns, path_node, preserve_tail_anchors)) { + // We trimmed it and it all trimmed away + to_remove.insert(i); } } @@ -365,7 +564,7 @@ namespace vg { // the indexes of edges we've already added unordered_set added_edges; - for (pair& edge : path_nodes[i].edges) { + for (pair& edge : path_nodes.at(i).edges) { edge_queue.emplace(edge.second, edge.first); } @@ -375,7 +574,7 @@ namespace vg { if (to_remove.count(traversed_edge.second)) { // we're removing this path node, so traverse it and add connections along its edges - PathNode& removing_node = path_nodes[traversed_edge.second]; + PathNode& removing_node = path_nodes.at(traversed_edge.second); size_t through_length = traversed_edge.first + path_from_length(removing_node.path); for (pair& edge : removing_node.edges) { edge_queue.emplace(through_length + edge.second, edge.first); @@ -391,7 +590,7 @@ namespace vg { } // replace the old edges with the new ones - path_nodes[i].edges = new_edges; + path_nodes.at(i).edges = new_edges; } // move the nodes we're going to keep into the prefix of the vector @@ -405,7 +604,7 @@ namespace vg { removed_so_far[i]++;; } else if (removed_so_far[i]) { - path_nodes[i - removed_so_far[i]] = move(path_nodes[i]); + path_nodes.at(i - removed_so_far[i]) = move(path_nodes.at(i)); } } @@ -414,7 +613,7 @@ namespace vg { // update the indexes of the edges for (size_t i = 0; i < path_nodes.size(); i++) { - for (pair& edge : path_nodes[i].edges) { + for (pair& edge : path_nodes.at(i).edges) { edge.first -= removed_so_far[edge.first]; } } @@ -429,9 +628,12 @@ namespace vg { #endif } - void MultipathAlignmentGraph::create_match_nodes(VG& vg, const MultipathMapper::memcluster_t& hits, - const unordered_map>& projection_trans, - const unordered_multimap>& injection_trans) { + void MultipathAlignmentGraph::create_match_nodes(const HandleGraph& graph, MultipathMapper::memcluster_t& hits, + const function(id_t)>& project, + const unordered_multimap>& injection_trans, + vector& path_node_provenance, + int64_t max_branch_trim_length, + const MultipathMapper::match_fanouts_t* fanout_breaks) { #ifdef debug_multipath_alignment cerr << "walking out MEMs in graph" << endl; @@ -440,149 +642,235 @@ namespace vg { // map of node ids in the dagified graph to the indices in the matches that contain them unordered_map> node_matches; - // walk the matches and filter out redundant sub-MEMs - for (int64_t i = 0; i < hits.size(); i++) { - - const pair& hit = hits[i]; - - // the part of the read we're going to match - string::const_iterator begin = hit.first->begin; - string::const_iterator end = hit.first->end; - int64_t mem_length = end - begin; - // the start of the hit in the original graph - const pos_t& hit_pos = hit.second; - + // records an existing path node in this map + auto record_node_matches = [&](const size_t i) { + for (const auto& m : path_nodes[i].path.mapping()) { + // record that each node occurs in this match so we can filter out sub-MEMs + node_matches[m.position().node_id()].push_back(i); #ifdef debug_multipath_alignment - 
cerr << "walking MEM hit " << hit_pos << " " << hit.first->sequence() << endl; + cerr << "associating node " << m.position().node_id() << " with a match at idx " << i << endl; #endif + } + }; + + // performs a check to see if a hit is a redundant sub-MEM + auto is_redundant = [&](string::const_iterator begin, string::const_iterator end, + const pos_t& hit_pos, id_t injected_id) { - auto hit_range = injection_trans.equal_range(id(hit_pos)); - for (auto iter = hit_range.first; iter != hit_range.second; iter++) { - // this graph is unrolled/dagified, so all orientations should match - if ((*iter).second.second != is_rev(hit_pos)) { - continue; - } - - // an id that corresponds to the original node - id_t injected_id = (*iter).second.first; - + // check all MEMs that traversed this node to see if this is a redundant sub-MEM + if (node_matches.count(injected_id)) { #ifdef debug_multipath_alignment - cerr << "hit node exists in graph as " << injected_id << endl; + cerr << "we need to check if this is a redundant sub MEM, there are previous paths that visited this hit" << endl; #endif - // check all MEMs that traversed this node to see if this is a redundant sub-MEM - bool is_partial_mem = false; - if (node_matches.count(injected_id)) { -#ifdef debug_multipath_alignment - cerr << "we need to check if this is a redundant sub MEM, there are previous that visited this hit" << endl; -#endif + for (int64_t j : node_matches[injected_id]) { + PathNode& match_node = path_nodes[j]; - for (int64_t j : node_matches[injected_id]) { - PathNode& match_node = path_nodes[j]; - - if (begin < match_node.begin || end > match_node.end) { + if (begin < match_node.begin || end > match_node.end) { #ifdef debug_multipath_alignment - if (begin < match_node.begin) { - cerr << "this MEM is earlier in the read than the other, so this is not redundant" << endl; - } - else if (end > match_node.end) { - cerr << "this MEM is later in the read than the other, so this is not redundant" << endl; - } -#endif - // the hit does not fall on the same section of the read as the other match, so - // it cannot be contained in it - continue; + if (begin < match_node.begin) { + cerr << "this MEM is earlier in the read than path node " << j << " by " << (match_node.begin - begin) << ", so this is not redundant" << endl; } - - int64_t relative_offset = begin - match_node.begin; + else if (end > match_node.end) { + cerr << "this MEM is later in the read than path node " << j << " by " << (end - match_node.end) << ", so this is not redundant" << endl; + } +#endif + // the hit does not fall on the same section of the read as the other match, so + // it cannot be contained in it + continue; + } + + int64_t relative_offset = begin - match_node.begin; #ifdef debug_multipath_alignment - cerr << "the match on node " << j << " has an relative offset of " << relative_offset << " to the this MEM in the read" << endl; + cerr << "the match on node " << j << " has an relative offset of " << relative_offset << " to the this MEM in the read" << endl; #endif - - Path& path = match_node.path; - - // if this is a partial MEM, we should be able to predict its hit location by traversing the path - // of the parent MEM by a distance equal to the relative offset - + + path_t& path = match_node.path; + + // if this is a partial MEM, we should be able to predict its hit location by traversing the path + // of the parent MEM by a distance equal to the relative offset + #ifdef debug_multipath_alignment - cerr << "traversing putative parent MEM with path " << 
pb2json(path) << endl; + cerr << "traversing putative parent MEM with path " << debug_string(path) << endl; #endif - - int64_t prefix_length = 0; - for (size_t k = 0; k < path.mapping_size(); k++) { - if (prefix_length > relative_offset) { + + int64_t prefix_length = 0; + for (size_t k = 0; k < path.mapping_size(); k++) { + if (prefix_length > relative_offset) { #ifdef debug_multipath_alignment - cerr << "we have passed where the location would be, breaking out of loop" << endl; + cerr << "we have passed where the location would be, breaking out of loop" << endl; #endif - break; - } - const Mapping& mapping = path.mapping(k); - // the length through this mapping - int64_t prefix_through_length = prefix_length + mapping_from_length(mapping); + break; + } + const path_mapping_t& mapping = path.mapping(k); + // the length through this mapping + int64_t prefix_through_length = prefix_length + mapping_to_length(mapping); #ifdef debug_multipath_alignment - cerr << "after traversing the " << k << "-th step, we have covered a distance of " << prefix_through_length << endl; + cerr << "after traversing the " << k << "-th step, we have covered a distance of " << prefix_through_length << endl; #endif - if (prefix_through_length > relative_offset) { - // we cross the relative offset on this node, so check if the path is in the predicted - // position for a redundant sub-MEM - id_t node_id_here = mapping.position().node_id(); - is_partial_mem = is_partial_mem || (injected_id == node_id_here - && offset(hit_pos) == mapping.position().offset() + relative_offset - prefix_length - && projection_trans.at(node_id_here).second == is_rev(hit_pos)); + if (prefix_through_length > relative_offset) { + // we cross the relative offset on this node, so check if the path is in the predicted + // position for a redundant sub-MEM + id_t node_id_here = mapping.position().node_id(); + #ifdef debug_multipath_alignment - cerr << "this mapping crosses where we would expect a child to be: " << node_id_here << (projection_trans.at(node_id_here).second ? "-" : "+") << ":" << mapping.position().offset() + relative_offset - prefix_length << endl; - cerr << "this MEM is actually at: " << injected_id << (is_rev(hit_pos) ? "-" : "+") << ":" << offset(hit_pos) << endl; + cerr << "this mapping crosses where we would expect a child to be: " << node_id_here << (project(node_id_here).second ? "-" : "+") << ":" << mapping.position().offset() + relative_offset - prefix_length << endl; + cerr << "this MEM is actually at: " << injected_id << (is_rev(hit_pos) ? "-" : "+") << ":" << offset(hit_pos) << endl; #endif + + // TODO: shouldn't everything be on the forward strand? 
i think i could remove the + // reverse checking so that only the offset of hit_pos need be communicated to this + // function + if (injected_id == node_id_here + && offset(hit_pos) == mapping.position().offset() + relative_offset - prefix_length + && project(node_id_here).second == is_rev(hit_pos)) { + // this MEM is redundant with the earlier + return true; } - prefix_length = prefix_through_length; - } - if (is_partial_mem) { - break; + + + } + prefix_length = prefix_through_length; } } - - // don't walk the match of false partial hits - if (is_partial_mem) { + } + return false; + }; + + + + // we can't filter sub-MEMs on the fly when there are fan-out breaks in this cluster + // because it's too hard to make sure match nodes get created in descending size order + bool filter_sub_mems_on_fly; + if (fanout_breaks != nullptr) { + if (fanout_breaks->empty()) { + filter_sub_mems_on_fly = true; + } + else { + bool found_any_breaks = false; + for (size_t i = 0; i < hits.first.size() && !found_any_breaks; ++i) { + found_any_breaks = fanout_breaks->count(hits.first[i].first); + } + filter_sub_mems_on_fly = !found_any_breaks; + } + } + else { + filter_sub_mems_on_fly = true; + } + + size_t num_failed_walks = 0; + + // walk the matches and filter out redundant sub-MEMs + for (int64_t i = 0; i < hits.first.size(); i++) { + + pair& hit = hits.first[i]; + + // the part of the read we're going to match + string::const_iterator begin = hit.first->begin; + string::const_iterator end = hit.first->end; + int64_t mem_length = end - begin; + // the start of the hit in the original graph + const pos_t& hit_pos = hit.second; + #ifdef debug_multipath_alignment - cerr << "this MEM is identified as a redundant sub-MEM, so we skip it" << endl; + cerr << "walking MEM hit " << i << ": " << hit_pos << " " << hit.first->sequence() << endl; + if (fanout_breaks && fanout_breaks->count(hit.first)) { + cerr << "fan-out breaks:" << endl; + for (auto fanout : fanout_breaks->at(hit.first)) { + cerr << "\t" << (fanout.first - hit.first->begin) << ": " << *fanout.first << " -> " << fanout.second << endl; + } + } #endif + bool walked_out_hit = false; + auto hit_range = injection_trans.equal_range(id(hit_pos)); + for (auto iter = hit_range.first; iter != hit_range.second; iter++) { + // this graph is unrolled/dagified, so all orientations should match + if (iter->second.second != is_rev(hit_pos)) { continue; } + // an id that corresponds to the original node + id_t injected_id = iter->second.first; + +#ifdef debug_multipath_alignment + cerr << "hit node exists in graph as " << injected_id << endl; +#endif + if (filter_sub_mems_on_fly) { + // don't walk the match of redundant partial hits + if (is_redundant(begin, end, hit_pos, injected_id)) { +#ifdef debug_multipath_alignment + cerr << "this MEM is identified as a redundant sub-MEM, so we skip it" << endl; +#endif + continue; + } + } + #ifdef debug_multipath_alignment cerr << "performing DFS to walk out match" << endl; #endif - // stack for DFS, each record contains tuples of (read begin, node offset, next node index, next node ids) - vector>> stack; - stack.emplace_back(begin, offset(hit_pos), 0, - vector{NodeTraversal(vg.get_node(injected_id))}); + // TODO: magic constant + size_t matches_found = 0; + size_t max_matches_walked = 32; - while (!stack.empty()) { + // stack for DFS, each record contains tuples of + // (read begin, node offset, next node index, next node handles, fan-out index) + vector, size_t>> stack; + stack.emplace_back(begin, offset(hit_pos), 0, + 
vector{graph.get_handle(injected_id)}, 0); + size_t fanout_size = 0; + if (fanout_breaks && fanout_breaks->count(hit.first)) { + fanout_size = fanout_breaks->at(hit.first).size(); + } + while (!stack.empty() && matches_found < max_matches_walked) { auto& back = stack.back(); if (get<2>(back) == get<3>(back).size()) { #ifdef debug_multipath_alignment - cerr << "traversed all edges out of current traversal" << endl; + cerr << "traversed all edges out of traversals coming from "; + if (stack.size() > 1) { + cerr << graph.get_id(get<3>(stack[stack.size() - 2])[get<2>(stack[stack.size() - 2]) - 1]); + } + else { + cerr << "start"; + } + cerr << endl; #endif stack.pop_back(); continue; } - NodeTraversal trav = get<3>(back)[get<2>(back)]; + + handle_t trav = get<3>(back)[get<2>(back)]; get<2>(back)++; + size_t fanout_idx = get<4>(back); #ifdef debug_multipath_alignment - cerr << "checking node " << trav.node->id() << endl; + cerr << "checking node " << graph.get_id(trav) << " at fanout idx " << get<4>(back) << " of " << fanout_size << endl; #endif - const string& node_seq = trav.node->sequence(); + string node_seq = graph.get_sequence(trav); size_t node_idx = get<1>(back); string::const_iterator read_iter = get<0>(back); // look for a match along the entire node sequence for (; node_idx < node_seq.size() && read_iter != end; node_idx++, read_iter++) { - if (node_seq[node_idx] != *read_iter) { + char read_char; + if (fanout_idx < fanout_size + && fanout_breaks->at(hit.first)[fanout_idx].first == read_iter) { + // we're at the next place where we substituted the read character + // for a different one + read_char = fanout_breaks->at(hit.first)[fanout_idx].second; +#ifdef debug_multipath_alignment + cerr << "\tapplying fanout break to " << read_char << " instead of " << *read_iter << " at index " << (read_iter - begin) << " of MEM" << endl; +#endif + ++fanout_idx; + } + else { + // we just want to match the read + read_char = *read_iter; + } + if (node_seq[node_idx] != read_char) { #ifdef debug_multipath_alignment cerr << "node sequence does not match read" << endl; #endif @@ -593,76 +881,748 @@ namespace vg { if (read_iter == end) { // finished walking match #ifdef debug_multipath_alignment - cerr << "reached end of read sequence, finished walking match" << endl; + cerr << "reached end of read sequence, converting into path node(s) starting at idx " << path_nodes.size() << endl; #endif - break; - } - else if (node_idx == node_seq.size()) { - // matched entire node - stack.emplace_back(read_iter, 0, 0, vector()); - vg.nodes_next(trav, get<3>(stack.back())); - } - } + assert(fanout_idx == fanout_size); + ++matches_found; + walked_out_hit = true; + + path_t path; + int64_t path_length = end - begin; + int64_t length_remaining = path_length; + size_t fanout_idx = 0; + int64_t length_until_fanout; + if (fanout_size) { + length_until_fanout = fanout_breaks->at(hit.first).front().first - begin; + } + else { + length_until_fanout = length_remaining; + } + auto curr_node_begin = begin; + + // walk out the match, breaking it at fan-out positions as necessary + int64_t offset = get<1>(stack.front()); + for (size_t j = 0; curr_node_begin < end; ) { + auto& search_record = stack[j]; + handle_t handle = get<3>(search_record)[get<2>(search_record) - 1]; + int64_t length_on_node = min(int64_t(graph.get_length(handle)) - offset, length_remaining); + + path_mapping_t* mapping = path.add_mapping(); + + edit_t* edit = mapping->add_edit(); + + // note: the graph is dagified and unrolled, so all hits should be on the 
forward strand + position_t* position = mapping->mutable_position(); + position->set_node_id(graph.get_id(handle)); + position->set_offset(offset); + + if (length_on_node >= length_until_fanout) { + // we're either at a fan-out position, or at the end of the path, so + // now we want to emit a path node with the path we've just walked out + + edit->set_from_length(length_until_fanout); + edit->set_to_length(length_until_fanout); + + auto node_end = end - length_remaining + length_until_fanout; + + if (curr_node_begin < node_end) { + // the node is non-empty + +#ifdef debug_multipath_alignment + cerr << "adding path node for walked match of sequence " << string(curr_node_begin, node_end) << endl; + cerr << debug_string(path) << endl; + cerr << "provenance of path traces to hit " << i << endl; +#endif + + // create a path node + path_nodes.emplace_back(); + PathNode& match_node = path_nodes.back(); + match_node.path = move(path); + match_node.begin = curr_node_begin; + match_node.end = node_end; + + path_node_provenance.emplace_back(i - num_failed_walks); + + if (filter_sub_mems_on_fly) { + record_node_matches(path_nodes.size() - 1); + } + } +#ifdef debug_multipath_alignment + else { + cerr << "skipping a walked path that has no sequence" << endl; + } +#endif + + // set up the next path walk + path = path_t(); + + // we need to advance past the fan-out character + curr_node_begin = node_end + 1; + int64_t length_to_advance = length_until_fanout + 1; + + // walk the path until finding the corresponding position + length_remaining -= length_to_advance; + while (j < stack.size() && + offset + length_to_advance >= graph.get_length(get<3>(stack[j])[get<2>(stack[j]) - 1])) { + length_to_advance -= graph.get_length(get<3>(stack[j])[get<2>(stack[j]) - 1]) - offset; + ++j; + offset = 0; + } + // manipulate the offset so that we start on the correct position + offset = length_to_advance; + + + // move the marker for the next fan-out ahead + ++fanout_idx; + if (fanout_idx < fanout_size) { + length_until_fanout = fanout_breaks->at(hit.first)[fanout_idx].first - curr_node_begin; + } + else { + length_until_fanout = length_remaining; + } + } + else { + edit->set_from_length(length_on_node); + edit->set_to_length(length_on_node); + + // advance to the next stack record + ++j; + offset = 0; + + // tick down the length trackers + length_remaining -= length_on_node; + length_until_fanout -= length_on_node; + + } + } + } + else if (node_idx == node_seq.size()) { + // matched entire node, move to next node(s) + stack.emplace_back(read_iter, 0, 0, vector(), fanout_idx); + graph.follow_edges(trav, false, [&](const handle_t& next) { + get<3>(stack.back()).push_back(next); + }); + } + } + } + + // filter out failed walks so they are marked as unclustered + if (!walked_out_hit) { + ++num_failed_walks; + } + else if (num_failed_walks) { + hits.first[i - num_failed_walks] = hit; + } + } + + // clear out the space that we allocated for walks that failed (if any) + hits.first.resize(hits.first.size() - num_failed_walks); + + if (!filter_sub_mems_on_fly) { + // we weren't removing redundant sub-MEMs as we made them, but now we can do it + // by sorting the path nodes descending by length + + vector order(path_nodes.size(), 0); + for (size_t i = 1; i < path_nodes.size(); ++i) { + order[i] = i; + } + + stable_sort(order.begin(), order.end(), [&](size_t i, size_t j) { + const PathNode& a = path_nodes[i]; + const PathNode& b = path_nodes[j]; + return a.end - a.begin > b.end - b.begin; + }); + vector index(order.size()); + 
for (size_t i = 0; i < order.size(); ++i) { + index[order[i]] = i; + } + for (size_t i = 0; i < index.size(); ++i) { + std::swap(path_nodes[index[i]], path_nodes[i]); + std::swap(path_node_provenance[index[i]], path_node_provenance[i]); + std::swap(index[index[i]], index[i]); + } + + size_t removed_so_far = 0; + for (size_t i = 0; i < path_nodes.size(); ++i) { + const position_t& pos = path_nodes[i].path.mapping(0).position(); + // TODO: this seems kinda like overkill, why do we need anything more than the offset in hit_pos? + auto proj = project(pos.node_id()); + pos_t hit_pos(proj.first, proj.second != pos.is_reverse(), pos.offset()); + + if (is_redundant(path_nodes[i].begin, path_nodes[i].end, hit_pos, pos.node_id())) { + ++removed_so_far; + } + else { + if (removed_so_far > 0) { + path_nodes[i - removed_so_far] = move(path_nodes[i]); + path_node_provenance[i - removed_so_far] = path_node_provenance[i]; + } + record_node_matches(i - removed_so_far); + } + } + if (removed_so_far) { + path_nodes.resize(path_nodes.size() - removed_so_far); + path_node_provenance.resize(path_nodes.size()); + } + } + + // merge the identical portion of any nodes that overlap exactly + merge_partially_redundant_match_nodes(node_matches, path_node_provenance); + + // if the end of a node is a homopolymer, see if it can be jittered at all + // and still make a reasonable alignment + jitter_homopolymer_ends(graph, path_node_provenance, hits, max_branch_trim_length); + } + + void MultipathAlignmentGraph::merge_partially_redundant_match_nodes(const unordered_map>& node_matches, + vector& path_node_provenance) { + +#ifdef debug_multipath_alignment + cerr << "looking for MEMs with partial redundancies to merge" << endl; +#endif + + if (path_nodes.size() <= 1) { + return; + } + + // find the groups that share at least one node + structures::UnionFind union_find(path_nodes.size(), false); + for (const auto& match_record : node_matches) { + for (int64_t i = 1; i < match_record.second.size(); ++i) { + union_find.union_groups(match_record.second.front(), match_record.second[i]); + } + } + + // records of (vector of (path node idx, mapping start idx), length) + vector>, size_t>> identical_segments; + + // big loop to identify the identical segments + for (const auto& overlapping_group : union_find.all_groups()) { + + if (overlapping_group.size() == 1) { + // doesn't overlap with anything + continue; + } + +#ifdef debug_multipath_alignment + cerr << "looking for merges in overlapping group:" << endl; + for (auto i : overlapping_group) { + cerr << "\t" << i << endl; + } +#endif + + // the amount of sequence we've walked for each node in the overlap group + vector to_length(overlapping_group.size(), 0); + + // some helper functions that keep things a bit more succinct later + + // go from index within overlapping group to current sequence position + auto seq_pos = [&](size_t i) { + return path_nodes[overlapping_group[i]].begin + to_length[i]; + }; + // go from heap record to mapping + auto next_mapping = [&](const pair& a) -> const path_mapping_t& { + return path_nodes[overlapping_group[a.first]].path.mapping(a.second); + }; + // go from index within overlapping group to path length in mappings + auto path_length = [&](size_t i) { + return path_nodes[overlapping_group[i]].path.mapping_size(); + }; + // reverse ordering + auto heap_cmp = [&](const pair& a, const pair& b) { + return seq_pos(a.first) > seq_pos(b.first); + }; + + // heap of (idx in group, mapping idx) + vector> heap(overlapping_group.size(), pair(0, 0)); + for 
(size_t i = 1; i < heap.size(); ++i) { + heap[i].first = i; + } + make_heap(heap.begin(), heap.end(), heap_cmp); + + while (heap.size() > 1) { + // move all of the path nodes whose next mapping occurs + // at the next sequence position into the unheaped back + auto heaped_end = heap.end(); + pop_heap(heap.begin(), heaped_end--, heap_cmp); + while (heap.begin() != heaped_end && + seq_pos(heap.front().first) == seq_pos(heap.back().first)) { + pop_heap(heap.begin(), heaped_end--, heap_cmp); + } - // if we left a trace in the stack we found a complete match, but sometimes MEMs that overhang - // the edge of the subraph find their way in (only when they are not part of the alignment - // represented by the cluster used to query the subgraph) in which case we just skip this MEM - if (stack.empty()) { #ifdef debug_multipath_alignment - cerr << "this MEM overhangs the end of the graph" << endl; + cerr << "next mappings are at uncentered sequence index " << seq_pos(heap.back().first) - path_nodes.front().begin << ":" << endl; + for (auto it = heaped_end; it != heap.end(); ++it) { + cerr << "\t" << overlapping_group[it->first] << ": " << it->second << " " << debug_string(next_mapping(*it)) << endl; + } + cerr << "the other heaped mappings:" << endl; + for (auto it = heap.begin(); it != heaped_end; ++it) { + cerr << "\t" << overlapping_group[it->first] << " (seq index " << seq_pos(it->first) - path_nodes.front().begin << "): " << it->second << " " << debug_string(next_mapping(*it)) << endl; + } #endif - continue; + + // split into groups that have identical next mappings + vector> groups; + for (size_t i = 0, n = (heap.end() - heaped_end); i < n; ++i) { + bool found_match = false; + for (auto& group : groups) { + if (next_mapping(*(heaped_end + i)) == next_mapping(*(heaped_end + group.front()))) { + group.push_back(i); + found_match = true; + break; + } + } + if (!found_match) { + // no matches, becomes its own group + groups.emplace_back(1, i); + } } #ifdef debug_multipath_alignment - cerr << "converting into a Path at idx " << path_nodes.size() << endl; + cerr << "partitioned into match groups (by index within unheaped records):" << endl; + for (auto& group : groups) { + cerr << "\t"; + for (auto i : group) { + cerr << " " << i; + } + cerr << endl; + } #endif - int64_t match_node_idx = path_nodes.size(); - path_nodes.emplace_back(); - PathNode& match_node = path_nodes.back(); - Path& path = match_node.path; - match_node.begin = begin; - match_node.end = end; - int64_t length_remaining = end - begin; + for (auto& group : groups) { + if (group.size() == 1) { +#ifdef debug_multipath_alignment + cerr << "skipping group of size 1 containing " << group.front() << endl; +#endif + + // no identical mapping to this one + auto& heap_rec = *(heaped_end + group.front()); + to_length[heap_rec.first] += mapping_to_length(next_mapping(heap_rec)); + ++heap_rec.second; + } + else { +#ifdef debug_multipath_alignment + cerr << "walking match for group with"; + for (auto i : group) { + cerr << " " << i; + } + cerr << endl; +#endif + + // at least two mappings are identical + identical_segments.emplace_back(); + auto& segment_rec = identical_segments.back(); + segment_rec.second = 1; + for (auto i : group) { + auto& heap_rec = *(heaped_end + i); + to_length[heap_rec.first] += mapping_to_length(next_mapping(heap_rec)); + segment_rec.first.emplace_back(overlapping_group[heap_rec.first], heap_rec.second++); + } + while (true) { + // only continue if all of the segments of this group match + bool all_match = true; + for 
(auto i : group) { + auto& heap_rec = *(heaped_end + i); + if (heap_rec.second == path_length(heap_rec.first) + || next_mapping(heap_rec) != next_mapping(*(heaped_end + group.front()))) { + // we hit the end of a path or found one that doesn't match + all_match = false; + break; + } + } + if (!all_match) { + break; + } + // extend the identical group by 1 + ++identical_segments.back().second; + for (auto i : group) { + auto& heap_rec = *(heaped_end + i); + to_length[heap_rec.first] += mapping_to_length(next_mapping(heap_rec)); + ++heap_rec.second; + } + } +#ifdef debug_multipath_alignment + cerr << "walked match of length " << identical_segments.back().second << endl; +#endif + } + } - // walk out the match - int32_t rank = 1; - for (auto search_record : stack) { - int64_t offset = get<1>(search_record); - Node* node = get<3>(search_record)[get<2>(search_record) - 1].node; - int64_t length = std::min((int64_t) node->sequence().size() - offset, length_remaining); - - Mapping* mapping = path.add_mapping(); - mapping->set_rank(rank); - - Edit* edit = mapping->add_edit(); - edit->set_from_length(length); - edit->set_to_length(length); - - // note: the graph is dagified and unrolled, so all hits should be on the forward strand - Position* position = mapping->mutable_position(); - position->set_node_id(node->id()); - position->set_offset(offset); - - // record that each node occurs in this match so we can filter out sub-MEMs - node_matches[node->id()].push_back(match_node_idx); + // now remove any heap records that have reached the end of their path + for (auto it = heaped_end; it != heap.end();) { + if (it->second == path_length(it->first)) { #ifdef debug_multipath_alignment - cerr << "associating node " << node->id() << " with a match at idx " << match_node_idx << endl; + cerr << "reached end of path node " << overlapping_group[it->first] << endl; #endif - - rank++; - length_remaining -= length; + *it = heap.back(); + heap.pop_back(); + } + else { +#ifdef debug_multipath_alignment + cerr << "have not yet exhausted (path node index) " << overlapping_group[it->first] << ", keeping on heap with a new uncentered seq index of " << seq_pos(it->first) - path_nodes.front().begin << endl; +#endif + ++it; + } + } +#ifdef debug_multipath_alignment + cerr << "restoring heap" << endl; +#endif + + // and restore the heap + while (heaped_end != heap.end()) { + push_heap(heap.begin(), ++heaped_end, heap_cmp); + } + } + } + + if (!identical_segments.empty()) { + +#ifdef debug_multipath_alignment + cerr << "found identical segments:" << endl; + for (auto& segment : identical_segments) { + for (auto& rec : segment.first) { + cerr << "(" << rec.first << ", " << rec.second << ") "; + } + cerr << segment.second << endl; + } +#endif + + + vector merged_path_nodes; + vector merged_provenances; + merged_path_nodes.reserve(path_nodes.size() + identical_segments.size()); + merged_provenances.reserve(path_nodes.size() + identical_segments.size()); + + // the index of the last mapping copied over for each path node + vector last_copied(path_nodes.size(), 0); + + vector copied_to_length(path_nodes.size(), 0); + + // function to add a path node for a segment, updating the tracking variables as necessary + auto add_segment_path_node = [&](size_t orig_idx, size_t seg_begin, size_t length) { + + PathNode& orig_path_node = path_nodes[orig_idx]; + int64_t to_length_added = 0; + if (seg_begin == 0 && length == orig_path_node.path.mapping_size()) { + to_length_added = orig_path_node.end - orig_path_node.begin; + 
merged_path_nodes.emplace_back(move(orig_path_node)); + } + else { + merged_path_nodes.emplace_back(); + PathNode& new_path_node = merged_path_nodes.back(); + new_path_node.begin = orig_path_node.begin + copied_to_length[orig_idx]; + new_path_node.path.mutable_mapping()->reserve(length); + for (size_t i = seg_begin, n = seg_begin + length; i < n; ++i) { + auto& mapping = *orig_path_node.path.mutable_mapping(i); + to_length_added += mapping_to_length(mapping); + *new_path_node.path.add_mapping() = move(mapping); + } + new_path_node.end = new_path_node.begin + to_length_added; + } + merged_provenances.push_back(path_node_provenance[orig_idx]); +#ifdef debug_multipath_alignment + cerr << "made merged node from original node " << orig_idx << ", start " << seg_begin << ", len " << length << endl; + cerr << string(merged_path_nodes.back().begin, merged_path_nodes.back().end) << endl; + cerr << debug_string(merged_path_nodes.back().path) << endl; +#endif + return to_length_added; + }; + + // by construction, the identical segments are ordered by the read position + // of their start, so we can iterate in order safely + for (auto& segment : identical_segments) { + // copy over any intervening segments + for (auto& path_node_start : segment.first) { + if (last_copied[path_node_start.first] != path_node_start.second) { + auto to_length_added = add_segment_path_node(path_node_start.first, + last_copied[path_node_start.first], + path_node_start.second - last_copied[path_node_start.first]); + last_copied[path_node_start.first] = path_node_start.second; + copied_to_length[path_node_start.first] += to_length_added; + } + } + // copy over the merge segments + auto to_length_added = add_segment_path_node(segment.first.front().first, + last_copied[segment.first.front().first], + segment.second); + for (auto& path_node_start : segment.first) { + last_copied[path_node_start.first] = path_node_start.second + segment.second; + copied_to_length[path_node_start.first] += to_length_added; + } + } + + // copy any remaining segments + for (size_t i = 0; i < path_nodes.size(); ++i) { + size_t path_length = path_nodes[i].path.mapping_size(); + if (last_copied[i] != path_length) { + add_segment_path_node(i, last_copied[i], path_length - last_copied[i]); + } + } + + path_nodes = move(merged_path_nodes); + path_node_provenance = move(merged_provenances); + } + +#ifdef debug_multipath_alignment + cerr << "nodes after merging partially redundant segments:" << endl; + for (size_t i = 0; i < path_nodes.size(); ++i) { + cerr << i << " (hit " << path_node_provenance[i] << "): " << debug_string(path_nodes[i].path) << " " << string(path_nodes[i].begin, path_nodes[i].end) << endl; + } +#endif + } + + void MultipathAlignmentGraph::jitter_homopolymer_ends(const HandleGraph& graph, + vector& path_node_provenance, + const MultipathMapper::memcluster_t& hits, + int64_t max_branch_trim_length) { + +#ifdef debug_multipath_alignment + cerr << "checking for opportunities to jitter homopolymer anchors" << endl; +#endif + + // TODO: magic constants + static const int64_t min_homopolymer_length = 6; + // a homopolymer jitter will be accepted if it meets either of these criteria: + static const int64_t max_jitter_diff = 2; + static const int64_t min_jitter_length = 5; + + size_t num_original_path_nodes = path_nodes.size(); + for (size_t i = 0; i < num_original_path_nodes; ++i) { + if (path_nodes[i].end - path_nodes[i].begin != hits.first[path_node_provenance[i]].first->length() + || path_nodes[i].end - path_nodes[i].begin <= 
min_homopolymer_length) { + // this node has already been merged with some other node, which gives an indication + // that alternate exact matches have already handled the local alignment uncertainty + // or alternatively this node is too short to jitter + continue; + } + + // TODO: put bookkeeping in place to remove this restriction + // only try to jitter of one side of a path node at most (both is complicated because + // we have to sever the source node) + bool did_jitter = false; + for (bool left_side : {true, false}) { + if (did_jitter) { + break; + } + + int64_t j_begin, incr; + if (left_side) { + j_begin = 0; + incr = 1; + } + else { + j_begin = path_nodes[i].end - path_nodes[i].begin - 1; + incr = -1; } + // find the length of homopolymer there is at the end of this match + // TODO: technically these are homodimers now, but whatever + int64_t homopolymer_length = 0; + for (int64_t j = j_begin, n = path_nodes[i].end - path_nodes[i].begin; j >= 0 && j < n; j += incr) { + if (*(path_nodes[i].begin + j) == *(path_nodes[i].begin + j_begin + (abs(j - j_begin) % 2) * incr)) { + ++homopolymer_length; + } + else { + break; + } + } + + if (homopolymer_length >= min_homopolymer_length) { + // this is a long enough homopolymer that we'll consider some jitter + +#ifdef debug_multipath_alignment + cerr << "found homopolymer of length " << homopolymer_length << " on path node " << i << " on left side? " << left_side << endl; +#endif + + // walk until the furthest mapping that can be reached by peeling off + // the homopolymer + int64_t k = left_side ? 0 : path_nodes[i].path.mapping_size() - 1; + int64_t length_before = 0; + for (; k + incr >= 0 && k + incr < path_nodes[i].path.mapping_size(); k += incr) { + int64_t length_thru = length_before + mapping_to_length(path_nodes[i].path.mapping(k)); + if (length_thru > homopolymer_length) { + break; + } + length_before = length_thru; + } + + // walk backwards looking for jittered matches + handle_t adj_handle = graph.get_handle(path_nodes[i].path.mapping(k).position().node_id(), + path_nodes[i].path.mapping(k).position().is_reverse()); + for (; k - incr >= 0 && k - incr < path_nodes[i].path.mapping_size() && !did_jitter; k -= incr) { + if (length_before <= max_branch_trim_length) { + // we won't bother trying to jitter over regions that will + // be caught branch point trimming + break; + } +#ifdef debug_multipath_alignment + cerr << "at mapping " << k << " (" << debug_string(path_nodes[i].path.mapping(k).position()) << ") having already walked " << length_before << endl; +#endif + + // TODO: contains some redundant code with create_match_nodes + + handle_t handle = adj_handle; + adj_handle = graph.get_handle(path_nodes[i].path.mapping(k - incr).position().node_id(), + path_nodes[i].path.mapping(k - incr).position().is_reverse()); + graph.follow_edges(handle, left_side, [&](const handle_t& next) { + if (next != adj_handle){ + // this is an adjacency that the current path doesn't take, so we can + // try to jitter down it + #ifdef debug_multipath_alignment - cerr << pb2json(path) << endl; + cerr << "homopolymer can branch from " << graph.get_id(handle) << " " << graph.get_is_reverse(handle) << " to " << graph.get_id(next) << " " << graph.get_is_reverse(next) << " with " << length_before << " bases to jitter" << endl; #endif + + // stack for DFS, each record contains tuples of + // (read begin, next node index, next node handles, + vector>> stack; + auto riter = left_side ? 
path_nodes[i].begin + length_before - 1 : path_nodes[i].end - length_before; + stack.emplace_back(riter, 0, vector(1, next)); + while (!stack.empty() && !did_jitter) { + auto& back = stack.back(); + if (get<1>(back) == get<2>(back).size()) { + stack.pop_back(); + continue; + } + + handle_t trav = get<2>(back)[get<1>(back)]; + get<1>(back)++; + +#ifdef debug_multipath_alignment + cerr << "checking for matches node " << graph.get_id(trav) << endl; +#endif + string node_seq = graph.get_sequence(trav); + int64_t node_idx = left_side ? node_seq.size() - 1 : 0; + string::const_iterator read_iter = get<0>(back); + + + // look for a match along the entire node sequence + for (; node_idx >= 0 && node_idx < node_seq.size() + && read_iter >= path_nodes[i].begin && read_iter < path_nodes[i].end; node_idx -= incr, read_iter -= incr) { +#ifdef debug_multipath_alignment + cerr << "comparing MEM index " << (read_iter - path_nodes[i].begin) << " " << *read_iter << " and node index " << node_idx << " " << node_seq[node_idx] << endl; +#endif + if (node_seq[node_idx] != *read_iter) { +#ifdef debug_multipath_alignment + cerr << "found mismatch" << endl; +#endif + + break; + } + } + + if ((node_idx < 0 || node_idx == node_seq.size()) + && read_iter >= path_nodes[i].begin && read_iter < path_nodes[i].end) { + // we went off the end of the node without exhausting the MEM + stack.emplace_back(read_iter, 0, vector()); + graph.follow_edges(trav, left_side, [&](const handle_t& next) { + get<2>(stack.back()).emplace_back(next); + }); + } + else { + int64_t length_diff = left_side ? read_iter - path_nodes[i].begin + 1 : path_nodes[i].end - read_iter; + int64_t jitter_length = length_before - length_diff; + if (length_diff <= max_jitter_diff || jitter_length >= min_jitter_length) { + // we found a jittered anchor with nearly the same length, let's add + // the jittered portion as an alternate anchor + did_jitter = true; + path_nodes.emplace_back(); + path_nodes.emplace_back(); + auto& split_node = path_nodes[path_nodes.size() - 2]; + auto& jitter_node = path_nodes.back(); + path_node_provenance.emplace_back(path_node_provenance[i]); + path_node_provenance.emplace_back(path_node_provenance[i]); + if (left_side) { + jitter_node.begin = read_iter + 1; + jitter_node.end = path_nodes[i].begin + length_before; + if (node_idx < (int64_t) node_seq.size() - 1) { + auto mapping = jitter_node.path.add_mapping(); + auto pos = mapping->mutable_position(); + pos->set_node_id(graph.get_id(trav)); + pos->set_is_reverse(graph.get_is_reverse(trav)); + pos->set_offset(node_idx + 1); + auto edit = mapping->add_edit(); + edit->set_from_length(node_seq.size() - node_idx - 1); + edit->set_to_length(edit->from_length()); + } + for (int64_t l = stack.size() - 2; l >= 0; --l) { + handle_t h = get<2>(stack[l])[get<1>(stack[l]) - 1]; + auto mapping = jitter_node.path.add_mapping(); + auto pos = mapping->mutable_position(); + pos->set_node_id(graph.get_id(h)); + pos->set_is_reverse(graph.get_is_reverse(h)); + pos->set_offset(0); + auto edit = mapping->add_edit(); + edit->set_from_length(graph.get_length(h)); + edit->set_to_length(edit->from_length()); + } + // split up the node that we jittered so that it finds the edge to the jitter + split_node.begin = path_nodes[i].begin + length_before; + split_node.end = path_nodes[i].end; + for (int64_t l = k; l < path_nodes[i].path.mapping_size(); ++l) { + *split_node.path.add_mapping() = move(*path_nodes[i].path.mutable_mapping(l)); + } + path_nodes[i].end = split_node.begin; + 
path_nodes[i].path.mutable_mapping()->resize(k); + } + else { + jitter_node.begin = path_nodes[i].end - length_before; + jitter_node.end = read_iter; + for (int64_t l = 0; l + 1 < stack.size(); ++l) { + handle_t h = get<2>(stack[l])[get<1>(stack[l]) - 1]; + auto mapping = jitter_node.path.add_mapping(); + auto pos = mapping->mutable_position(); + pos->set_node_id(graph.get_id(h)); + pos->set_is_reverse(graph.get_is_reverse(h)); + pos->set_offset(0); + auto edit = mapping->add_edit(); + edit->set_from_length(graph.get_length(h)); + edit->set_to_length(edit->from_length()); + } + if (node_idx > 0) { + auto mapping = jitter_node.path.add_mapping(); + auto pos = mapping->mutable_position(); + pos->set_node_id(graph.get_id(trav)); + pos->set_is_reverse(graph.get_is_reverse(trav)); + pos->set_offset(0); + auto edit = mapping->add_edit(); + edit->set_from_length(node_idx); + edit->set_to_length(edit->from_length()); + } + // split up the node that we jittered so that it finds the edge to the jitter + split_node.begin = path_nodes[i].end - length_before; + split_node.end = path_nodes[i].end; + for (int64_t l = k + 1; l < path_nodes[i].path.mapping_size(); ++l) { + *split_node.path.add_mapping() = move(*path_nodes[i].path.mutable_mapping(l)); + } + path_nodes[i].end = split_node.begin; + path_nodes[i].path.mutable_mapping()->resize(k + 1); + } + +#ifdef debug_multipath_alignment + cerr << "jitter difference of " << length_diff << " was small enough or length of " << jitter_length << " was large enough to make a new jitter anchor: " << endl; + cerr << string(jitter_node.begin, jitter_node.end) << " (" << (jitter_node.begin - path_nodes[i].begin) << ":" << (jitter_node.end - path_nodes[i].begin) << ") " << debug_string(jitter_node.path) << endl; + cerr << "new split nodes in path node " << i << endl; + cerr << string(path_nodes[i].begin, path_nodes[i].end) << " (" << (path_nodes[i].begin - path_nodes[i].begin) << ":" << (path_nodes[i].end - path_nodes[i].begin) << ") " << debug_string(path_nodes[i].path) << endl; + cerr << string(split_node.begin, split_node.end) << " (" << (split_node.begin - path_nodes[i].begin) << ":" << (split_node.end - path_nodes[i].begin) << ") " << debug_string(split_node.path) << endl; +#endif + } + } + } + } + return !did_jitter; + }); + + if (!did_jitter) { + length_before -= mapping_to_length(path_nodes[i].path.mapping(k - incr)); + } + } + } } } } - void MultipathAlignmentGraph::collapse_order_length_runs(VG& vg, gcsa::GCSA* gcsa) { + void MultipathAlignmentGraph::collapse_order_length_runs(const HandleGraph& graph, gcsa::GCSA* gcsa, + vector& path_node_provenance) { #ifdef debug_multipath_alignment cerr << "looking for runs of order length MEMs to collapse with gcsa order " << gcsa->order() << endl; @@ -673,7 +1633,7 @@ namespace vg { size_t num_order_length_mems = 0; for (size_t i = 0; i < path_nodes.size(); i++) { - PathNode& match_node = path_nodes[i]; + PathNode& match_node = path_nodes.at(i); if (match_node.end - match_node.begin < gcsa->order()) { // we have passed all of the order length MEMs, bail out of loop @@ -692,8 +1652,8 @@ namespace vg { order[i] = i; } sort(order.begin(), order.end(), [&](size_t i, size_t j) { - return path_nodes[i].begin < path_nodes[j].begin || (path_nodes[i].begin == path_nodes[j].begin && - path_nodes[i].end < path_nodes[j].end); + return path_nodes.at(i).begin < path_nodes.at(j).begin || (path_nodes.at(i).begin == path_nodes.at(j).begin && + path_nodes.at(i).end < path_nodes.at(j).end); }); for (size_t i : order) { @@ -701,13 
+1661,8 @@ namespace vg { PathNode& match_node = path_nodes[i]; #ifdef debug_multipath_alignment - cerr << "## checking if MEM " << i << " can be an extension: " << endl; - cerr << "\t"; - for (auto iter = match_node.begin; iter != match_node.end; iter++) { - cerr << *iter; - } - cerr << endl; - cerr << "\t" << pb2json(match_node.path) << endl; + cerr << "checking if MEM " << i << " can be an extension: " << endl; + cerr << "\t" << string(match_node.begin, match_node.end) << "\t" << debug_string(match_node.path) << endl; #endif // try to find any run of MEMs that could be merged with this MEM @@ -727,7 +1682,7 @@ namespace vg { cerr << *iter; } cerr << endl; - cerr << "\t" << pb2json(last_run_node.path) << endl; + cerr << "\t" << debug_string(last_run_node.path) << endl; #endif // do they overhang an amount on the read that indicates they overlap and could be merged? @@ -751,7 +1706,7 @@ namespace vg { if (remaining - mapping_length < overhang) { // we will cross the position that should line up with the initial position on this mapping - const Position& overhang_position = last_run_node.path.mapping(k).position(); + const position_t& overhang_position = last_run_node.path.mapping(k).position(); get_id(last_run_node_internal_pos) = overhang_position.node_id(); get_is_rev(last_run_node_internal_pos) = overhang_position.is_reverse(); @@ -764,8 +1719,8 @@ namespace vg { } // get the final position of the node further to the left - const Mapping& final_mapping = last_run_node.path.mapping(last_run_node.path.mapping_size() - 1); - const Position& final_mapping_position = final_mapping.position(); + const path_mapping_t& final_mapping = last_run_node.path.mapping(last_run_node.path.mapping_size() - 1); + const position_t& final_mapping_position = final_mapping.position(); pos_t last_run_node_final_pos = make_pos_t(final_mapping_position.node_id(), final_mapping_position.is_reverse(), final_mapping_position.offset() + mapping_from_length(final_mapping)); @@ -781,7 +1736,7 @@ namespace vg { if (remaining < overhang) { // we will cross the position that should line up with the initial position on this mapping - const Position& overhang_position = match_node.path.mapping(k).position(); + const position_t& overhang_position = match_node.path.mapping(k).position(); get_id(match_node_internal_pos) = overhang_position.node_id(); get_is_rev(match_node_internal_pos) = overhang_position.is_reverse(); @@ -812,10 +1767,9 @@ namespace vg { // it could still be that these are two end-to-end matches that got assigned to the beginning // and end of two nodes connected by an edge - if (offset(last_run_node_final_pos) == vg.get_node(final_mapping_position.node_id())->sequence().size() - && vg.has_edge(NodeSide(id(last_run_node_final_pos), !is_rev(last_run_node_final_pos)), - NodeSide(id(match_node_initial_pos), is_rev(match_node_initial_pos)))) { - + if (offset(last_run_node_final_pos) == graph.get_length(graph.get_handle(final_mapping_position.node_id()))) { + if (graph.has_edge(graph.get_handle(id(last_run_node_final_pos), is_rev(last_run_node_final_pos)), + graph.get_handle(id(match_node_initial_pos), is_rev(match_node_initial_pos)))) { #ifdef debug_multipath_alignment cerr << "found end to end connection over an edge" << endl; #endif @@ -826,6 +1780,7 @@ namespace vg { break; } + } } } } @@ -862,11 +1817,11 @@ namespace vg { // mark the node we're merging from for removal to_remove.insert(merge_group[i]); - PathNode& merge_from_node = path_nodes[merge_group[i]]; + PathNode& merge_from_node = 
path_nodes.at(merge_group[i]); #ifdef debug_multipath_alignment - cerr << "merging into node " << merge_group[0] << " path " << pb2json(merge_into_node.path) << endl; - cerr << "from node " << merge_group[i] << " path " << pb2json(merge_from_node.path) << endl; + cerr << "merging into node " << merge_group[0] << " path " << debug_string(merge_into_node.path) << endl; + cerr << "from node " << merge_group[i] << " path " << debug_string(merge_from_node.path) << endl; #endif // walk backwards until we find the first mapping to add @@ -886,41 +1841,39 @@ namespace vg { #endif // handle the first mapping we add as a special case - const Mapping& first_mapping_to_add = merge_from_node.path.mapping(first_mapping_to_add_idx); - Mapping* final_merging_mapping = merge_into_node.path.mutable_mapping(merge_into_node.path.mapping_size() - 1); + const path_mapping_t& first_mapping_to_add = merge_from_node.path.mapping(first_mapping_to_add_idx); + path_mapping_t* final_merging_mapping = merge_into_node.path.mutable_mapping(merge_into_node.path.mapping_size() - 1); if (final_merging_mapping->position().node_id() == first_mapping_to_add.position().node_id() && final_merging_mapping->position().is_reverse() == first_mapping_to_add.position().is_reverse() && first_mapping_to_add.position().offset() - final_merging_mapping->position().offset() - remaining == mapping_from_length(*final_merging_mapping)) { // the mappings are on the same node, so they can be combined int64_t mapping_to_add_length = mapping_from_length(first_mapping_to_add) + remaining; - Edit* final_edit = final_merging_mapping->mutable_edit(final_merging_mapping->edit_size() - 1); + edit_t* final_edit = final_merging_mapping->mutable_edit(final_merging_mapping->edit_size() - 1); final_edit->set_from_length(final_edit->from_length() + mapping_to_add_length); final_edit->set_to_length(final_edit->to_length() + mapping_to_add_length); #ifdef debug_multipath_alignment - cerr << "merged mapping is " << pb2json(*final_merging_mapping) << endl; + cerr << "merged mapping is " << debug_string(*final_merging_mapping) << endl; #endif } else { // we need to add this as a new mapping - Mapping* new_mapping = merge_into_node.path.add_mapping(); + path_mapping_t* new_mapping = merge_into_node.path.add_mapping(); *new_mapping = first_mapping_to_add; - new_mapping->set_rank(final_merging_mapping->rank() + 1); #ifdef debug_multipath_alignment - cerr << "new adjacent mapping is " << pb2json(*new_mapping) << endl; + cerr << "new adjacent mapping is " << debug_string(*new_mapping) << endl; #endif } // add the remaining mappings as new mappings for (size_t j = first_mapping_to_add_idx + 1; j < merge_from_node.path.mapping_size(); j++) { - Mapping* new_mapping = merge_into_node.path.add_mapping(); + path_mapping_t* new_mapping = merge_into_node.path.add_mapping(); *new_mapping = merge_from_node.path.mapping(j); - new_mapping->set_rank(merge_into_node.path.mapping(merge_into_node.path.mapping_size() - 2).rank() + 1); #ifdef debug_multipath_alignment - cerr << "new transfer mapping is " << pb2json(*new_mapping) << endl; + cerr << "new transfer mapping is " << debug_string(*new_mapping) << endl; #endif } @@ -928,7 +1881,7 @@ namespace vg { merge_into_node.end = merge_from_node.end; #ifdef debug_multipath_alignment - cerr << "merged path is " << pb2json(merge_into_node.path) << endl; + cerr << "merged path is " << debug_string(merge_into_node.path) << endl; cerr << "merged substring is "; for (auto iter = merge_into_node.begin; iter != merge_into_node.end; iter++) { 
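The merge step above boils down to splicing one run of mappings onto the end of another: when the junction lands on the same node the final edit is simply extended, otherwise the incoming mappings are appended as new ones. A reduced sketch on plain (node id, matched length) pairs (hypothetical types, standard library only):

#include <cstdint>
#include <utility>
#include <vector>

using Segment = std::pair<int64_t, int64_t>;   // (node id, matched length)

// Splice 'from' onto the end of 'into', combining the junction segment when
// both runs touch the same node.
static void merge_runs(std::vector<Segment>& into, const std::vector<Segment>& from) {
    size_t first = 0;
    if (!into.empty() && !from.empty() && into.back().first == from.front().first) {
        into.back().second += from.front().second;   // same node: extend the last segment
        first = 1;
    }
    for (size_t i = first; i < from.size(); ++i) {
        into.push_back(from[i]);                      // otherwise append as a new segment
    }
}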
cerr << *iter; @@ -946,6 +1899,7 @@ namespace vg { } else if (removed_so_far > 0) { path_nodes[i - removed_so_far] = move(path_nodes[i]); + path_node_provenance[i - removed_so_far] = path_node_provenance[i]; #ifdef debug_multipath_alignment cerr << "moving path node " << i << " into index " << i - removed_so_far << endl; #endif @@ -958,110 +1912,233 @@ namespace vg { } path_nodes.resize(path_nodes.size() - to_remove.size()); + path_node_provenance.resize(path_nodes.size()); } - - } - - void MultipathAlignmentGraph::resect_snarls_from_paths(SnarlManager* cutting_snarls, - const unordered_map>& projection_trans, - int64_t max_snarl_cut_size) { #ifdef debug_multipath_alignment - cerr << "cutting with snarls" << endl; + cerr << "done merging MEMs" << endl; #endif + } + + void MultipathAlignmentGraph::trim_to_branch_points(const HandleGraph* graph, size_t max_trim_length) { - size_t num_original_path_nodes = path_nodes.size(); - - // we'll need to keep track of which nodes we trim the front off of to update edge lengths later - vector trimmed_prefix_length(path_nodes.size()); - bool trimmed_any_prefix = false; + assert(!has_reachability_edges); - for (size_t i = 0; i < num_original_path_nodes; i++) { - - // first compute the segments we want to cut out - - PathNode* path_node = &path_nodes[i]; - Path* path = &path_node->path; + for (PathNode& path_node : path_nodes) { #ifdef debug_multipath_alignment - cerr << "cutting node at index " << i << " with path " << pb2json(*path) << endl; + cerr << "trimming to branch points within " << max_trim_length << " of ends in path " << debug_string(path_node.path) << endl; #endif - // this list holds the beginning of the current segment at each depth in the snarl hierarchy - // as we traverse the exact match, the beginning is recorded in both sequence distance and node index - list> level_segment_begin; - level_segment_begin.emplace_back(0, 0); - - // we record which segments we are going to cut out of the match here - vector> cut_segments; + // find the mapping where we are first pass the maximum trim length coming inward + // from the left + int64_t from_length = 0; + int64_t to_length = 0; + int64_t prefix_idx = 0; + for (; prefix_idx < path_node.path.mapping_size() + && from_length <= max_trim_length + && to_length <= max_trim_length; prefix_idx++) { + + from_length += mapping_from_length(path_node.path.mapping(prefix_idx)); + to_length += mapping_to_length(path_node.path.mapping(prefix_idx)); + } - auto curr_level = level_segment_begin.begin(); - size_t prefix_length = 0; - for (size_t j = 0, last = path->mapping_size() - 1; j <= last; j++) { - const Position& position = path->mapping(j).position(); - const auto& projection = projection_trans.at(position.node_id()); - id_t projected_id = projection.first; - bool projected_rev = (projection.second != position.is_reverse()); - - if (j > 0) { - // we have entered this node on this iteration - if (cutting_snarls->into_which_snarl(projected_id, !projected_rev)) { - // as we enter this node, we are leaving the snarl we were in - - // since we're going up a level, we need to check whether we need to cut out the segment we've traversed - if (prefix_length - curr_level->first <= max_snarl_cut_size || !max_snarl_cut_size) { - cut_segments.emplace_back(curr_level->second, j); - } - - curr_level++; - if (curr_level == level_segment_begin.end()) { - // we were already at the highest level seen so far, so we need to add a new one - // the entire previous part of the match is contained in this level, so we start - // 
the segment from 0 - curr_level = level_segment_begin.insert(level_segment_begin.end(), make_pair(0, 0)); - } - } - } - - // cross to the other side of the node - prefix_length += mapping_from_length(path->mapping(j)); - - if (j < last) { - // we are going to leave this node next iteration - if (cutting_snarls->into_which_snarl(projected_id, projected_rev)) { - // as we leave this node, we are entering a new deeper snarl - - // the segment in the new level will begin at the end of the current node - if (curr_level == level_segment_begin.begin()) { - // we are already at the lowest level seen so far, so we need to add a new one - level_segment_begin.emplace_front(prefix_length, j + 1); - curr_level--; - } - else { - // the lower level is in the record already, so we update its segment start - curr_level--; - *curr_level = make_pair(prefix_length, j + 1); - } - } + // walk backwards to see if we passed any leftward branch points + for (prefix_idx--; prefix_idx > 0; prefix_idx--) { + const position_t& pos = path_node.path.mapping(prefix_idx).position(); + if (graph->get_degree(graph->get_handle(pos.node_id(), pos.is_reverse()), true) > 1) { + // this is the inward most branch point within the trim length + +#ifdef debug_multipath_alignment + cerr << "found leftward branch point at " << prefix_idx << endl; +#endif + + break; } } - // check the final segment for a cut unless we're at the highest level in the match - auto last = level_segment_begin.end(); - last--; - if ((prefix_length - curr_level->first <= max_snarl_cut_size || !max_snarl_cut_size) && curr_level != last) { - cut_segments.emplace_back(curr_level->second, path->mapping_size()); + // find the mapping where we are first pass the maximum trim length coming inward + // from the right + from_length = 0; + to_length = 0; + int64_t suffix_idx = path_node.path.mapping_size() - 1; + for (; suffix_idx >= 0 + && from_length <= max_trim_length + && to_length <= max_trim_length; suffix_idx--) { + + from_length += mapping_from_length(path_node.path.mapping(suffix_idx)); + to_length += mapping_to_length(path_node.path.mapping(suffix_idx)); } + // walk forward to see if we passed any rightwards branch points + for (suffix_idx++; suffix_idx + 1 < path_node.path.mapping_size(); suffix_idx++) { + const position_t& pos = path_node.path.mapping(suffix_idx).position(); + if (graph->get_degree(graph->get_handle(pos.node_id(), pos.is_reverse()), false) > 1) { + // this is the inward most branch point within the trim length + #ifdef debug_multipath_alignment - cerr << "found " << cut_segments.size() << " cut segments:" << endl; - for (auto seg : cut_segments) { - cerr << "\t" << seg.first << ":" << seg.second << endl; - } + cerr << "found right branch point at " << suffix_idx << endl; #endif + break; + } + } - // did we cut out any segments? - if (!cut_segments.empty()) { - + // the prefix/suffix idx now indicate the place after which we can trim + + if (prefix_idx > suffix_idx) { + // this is a weird case, we seem to want to trim the whole anchor, which suggests + // that maybe the trim length was chosen to be too long. there are other explanations + // but we're just going to ignore the trimming for now + // TODO: is this the right approach? 
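The trimming above walks inward from each end of the anchor until the trim budget is spent and then backs off to the innermost mapping whose start is a branch point, so anchors are only shortened when that leaves them on an unambiguous part of the graph. A minimal sketch of the prefix side, assuming a toy per-step representation of (length consumed, degree at the step's start); names are hypothetical:

#include <cstdint>
#include <utility>
#include <vector>

// Returns the index of the first step to keep after trimming the prefix.
static int64_t prefix_trim_index(const std::vector<std::pair<int64_t, int64_t>>& steps,
                                 int64_t max_trim_length) {
    int64_t consumed = 0;
    int64_t idx = 0;
    for (; idx < (int64_t) steps.size() && consumed <= max_trim_length; ++idx) {
        consumed += steps[idx].first;                 // spend the trim budget
    }
    for (--idx; idx > 0; --idx) {
        if (steps[idx].second > 1) {                  // innermost branch point within the budget
            break;
        }
    }
    return idx < 0 ? 0 : idx;
}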
+#ifdef debug_multipath_alignment + cerr << "seem to want to trim entire path, skipping" << endl; +#endif + continue; + } + + if (prefix_idx > 0 || suffix_idx + 1 < path_node.path.mapping_size()) { + + // compute the amount of read we're trimming off from each end + int64_t trimmed_prefix_to_length = 0; + int64_t trimmed_suffix_to_length = 0; + for (int64_t i = 0; i < prefix_idx; i++) { + trimmed_prefix_to_length += mapping_to_length(path_node.path.mapping(i)); + } + for (int64_t i = suffix_idx + 1; i < path_node.path.mapping_size(); i++) { + trimmed_suffix_to_length += mapping_to_length(path_node.path.mapping(i)); + } + + // replace the path with the portion that we didn't trim + path_t new_path; + for (int64_t i = prefix_idx; i <= suffix_idx; i++) { + path_mapping_t* mapping = new_path.add_mapping(); + *mapping = path_node.path.mapping(i); + } + path_node.path = move(new_path); +#ifdef debug_multipath_alignment + cerr << "trimmed path: " << debug_string(path_node.path) << endl; +#endif + // update the read interval + path_node.begin += trimmed_prefix_to_length; + path_node.end -= trimmed_suffix_to_length; + } + } + } + + vector> MultipathAlignmentGraph::get_cut_segments(path_t& path, + SnarlManager* cutting_snarls, + SnarlDistanceIndex* dist_index, + const function(id_t)>& project, + int64_t max_snarl_cut_size) const { + + // this list holds the beginning of the current segment at each depth in the snarl hierarchy + // as we traverse the exact match, the beginning is recorded in both sequence distance and node index + list> level_segment_begin; + level_segment_begin.emplace_back(0, 0); + + // we record which segments we are going to cut out of the match here + vector> cut_segments; + + auto curr_level = level_segment_begin.begin(); + size_t prefix_length = 0; + for (size_t j = 0, last = path.mapping_size() - 1; j <= last; j++) { + const position_t& position = path.mapping(j).position(); + auto projection = project(position.node_id()); + id_t projected_id = projection.first; + bool projected_rev = (projection.second != position.is_reverse()); + + if (j > 0) { + // we have entered this node on this iteration + if (into_cutting_snarl(projected_id, !projected_rev, cutting_snarls, dist_index)) { + // as we enter this node, we are leaving the snarl we were in + + // since we're going up a level, we need to check whether we need to cut out the segment we've traversed + if (prefix_length - curr_level->first <= max_snarl_cut_size) { + cut_segments.emplace_back(curr_level->second, j); + } + + curr_level++; + if (curr_level == level_segment_begin.end()) { + // we were already at the highest level seen so far, so we need to add a new one + // the entire previous part of the match is contained in this level, so we start + // the segment from 0 + curr_level = level_segment_begin.insert(level_segment_begin.end(), pair(0, 0)); + } + } + } + + // cross to the other side of the node + prefix_length += mapping_from_length(path.mapping(j)); + + if (j < last) { + // we are going to leave this node next iteration + if (into_cutting_snarl(projected_id, projected_rev, cutting_snarls, dist_index)) { + // as we leave this node, we are entering a new deeper snarl + + // the segment in the new level will begin at the end of the current node + if (curr_level == level_segment_begin.begin()) { + // we are already at the lowest level seen so far, so we need to add a new one + level_segment_begin.emplace_front(prefix_length, j + 1); + curr_level--; + } + else { + // the lower level is in the record already, so we update 
its segment start + curr_level--; + *curr_level = make_pair(prefix_length, j + 1); + } + } + } + } + + // check the final segment for a cut unless we're at the highest level in the match + auto last = level_segment_begin.end(); + last--; + if ((prefix_length - curr_level->first <= max_snarl_cut_size) && curr_level != last) { + cut_segments.emplace_back(curr_level->second, path.mapping_size()); + } + +#ifdef debug_multipath_alignment + cerr << "found " << cut_segments.size() << " cut segments:" << endl; + for (auto seg : cut_segments) { + cerr << "\t" << seg.first << ":" << seg.second << endl; + } +#endif + + + return cut_segments; + } + + void MultipathAlignmentGraph::resect_snarls_from_paths(SnarlManager* cutting_snarls, + SnarlDistanceIndex* dist_index, + const function(id_t)>& project, + int64_t max_snarl_cut_size) { +#ifdef debug_multipath_alignment + cerr << "cutting with snarls" << endl; +#endif + + size_t num_original_path_nodes = path_nodes.size(); + + // we'll need to keep track of which nodes we trim the front off of to update edge lengths later + vector trimmed_prefix_length(path_nodes.size()); + bool trimmed_any_prefix = false; + + for (size_t i = 0; i < num_original_path_nodes; i++) { + + // first compute the segments we want to cut out + + PathNode* path_node = &path_nodes.at(i); + path_t* path = &path_node->path; + +#ifdef debug_multipath_alignment + cerr << "cutting node at index " << i << " with path " << debug_string(*path) << endl; +#endif + + auto cut_segments = get_cut_segments(*path, cutting_snarls, dist_index, project, max_snarl_cut_size); + + // did we cut out any segments? + if (!cut_segments.empty()) { + + vector> keep_segments; + // we may have decided to cut the segments of both a parent and child snarl, so now we // collapse the list of intervals, which is sorted on the end index by construction // @@ -1069,7 +2146,6 @@ namespace vg { // cut segments that are not nested, so we don't need to deal with the case where the // segments are partially overlapping (i.e. 
it's a bit easier than the general interval // intersection problem) - vector> keep_segments; size_t curr_keep_seg_end = path->mapping_size(); auto riter = cut_segments.rbegin(); if (riter->second == curr_keep_seg_end) { @@ -1093,7 +2169,7 @@ namespace vg { reverse(keep_segments.begin(), keep_segments.end()); // record the data stored on the original path node - Path original_path = *path; + path_t original_path = move(*path); string::const_iterator original_begin = path_node->begin; string::const_iterator original_end = path_node->end; vector> forward_edges = move(path_node->edges); @@ -1122,9 +2198,8 @@ namespace vg { // place the first keep segment into the original node path_node->begin = original_begin + prefix_to_length; for (int32_t rank = 1; prefix_idx < keep_segments.front().second; prefix_idx++, rank++) { - Mapping* mapping = path->add_mapping(); + path_mapping_t* mapping = path->add_mapping(); *mapping = original_path.mapping(prefix_idx); - mapping->set_rank(rank); prefix_from_length += mapping_from_length(*mapping); prefix_to_length += mapping_to_length(*mapping); } @@ -1132,7 +2207,7 @@ namespace vg { #ifdef debug_multipath_alignment - cerr << "new cut path: " << pb2json(path_node->path) << endl; + cerr << "new cut path: " << debug_string(path_node->path) << endl; #endif // keep track of the index in the node vector of the previous segment @@ -1159,36 +2234,35 @@ namespace vg { // create a new node for this keep segment path_nodes.emplace_back(); PathNode& cut_node = path_nodes.back(); - Path& cut_path = cut_node.path; + path_t& cut_path = cut_node.path; // add a connecting edge from the last keep segment - path_nodes[prev_segment_idx].edges.emplace_back(path_nodes.size() - 1, prefix_from_length - intersegment_start); + path_nodes.at(prev_segment_idx).edges.emplace_back(path_nodes.size() - 1, prefix_from_length - intersegment_start); // transfer over the path and the read interval cut_node.begin = original_begin + prefix_to_length; for (int32_t rank = 1; prefix_idx < keep_segment.second; prefix_idx++, rank++) { - Mapping* mapping = cut_path.add_mapping(); + path_mapping_t* mapping = cut_path.add_mapping(); *mapping = original_path.mapping(prefix_idx); - mapping->set_rank(rank); prefix_from_length += mapping_from_length(*mapping); prefix_to_length += mapping_to_length(*mapping); } cut_node.end = original_begin + prefix_to_length; #ifdef debug_multipath_alignment - cerr << "new cut path: " << pb2json(cut_path) << endl; + cerr << "new cut path: " << debug_string(cut_path) << endl; #endif prev_segment_idx = path_nodes.size() - 1; } // move the edges from the original node onto the last keep segment - path_nodes[prev_segment_idx].edges = move(forward_edges); + path_nodes.at(prev_segment_idx).edges = move(forward_edges); // add the length of the trimmed portion of the path to the edge length size_t trimmed_suffix_length = path_from_length(original_path) - prefix_from_length; if (trimmed_suffix_length) { - for (pair& edge : path_nodes[prev_segment_idx].edges) { + for (pair& edge : path_nodes.at(prev_segment_idx).edges) { edge.second += trimmed_suffix_length; } } @@ -1206,10 +2280,202 @@ namespace vg { } } } + + void MultipathAlignmentGraph::synthesize_tail_anchors(const Alignment& alignment, const HandleGraph& align_graph, const GSSWAligner* aligner, + size_t min_anchor_length, size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, + double pessimistic_tail_gap_multiplier) { - void MultipathAlignmentGraph::add_reachability_edges(VG& vg, - const unordered_map>& projection_trans, 
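Because the cut intervals produced for nested snarls are nested or disjoint and already sorted by end index, the kept portions can be recovered with a single reverse sweep: take the gap after each outermost cut and skip any cut that falls inside one already taken. A small sketch of that complement step over [0, n), using hypothetical names and standard containers only:

#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

static std::vector<std::pair<size_t, size_t>>
keep_segments_from_cuts(const std::vector<std::pair<size_t, size_t>>& cuts, size_t n) {
    std::vector<std::pair<size_t, size_t>> keep;
    size_t keep_end = n;
    for (auto it = cuts.rbegin(); it != cuts.rend(); ++it) {
        if (it->second > keep_end) {
            continue;                                    // nested inside a cut we already took
        }
        if (it->second < keep_end) {
            keep.emplace_back(it->second, keep_end);     // the gap between cuts is kept
        }
        keep_end = it->first;
    }
    if (keep_end > 0) {
        keep.emplace_back(0, keep_end);
    }
    std::reverse(keep.begin(), keep.end());              // restore left-to-right order
    return keep;
}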
- const unordered_multimap>& injection_trans) { + // Align the tails, not collecting a set of source subpaths. + // TODO: factor of 1/2 is arbitray, but i do think it should be fewer than the max + auto tail_alignments = align_tails(alignment, align_graph, aligner, max(1, max_alt_alns / 2), + dynamic_alt_alns, max_gap, pessimistic_tail_gap_multiplier, max_alt_alns, nullptr); + + + for (bool handling_right_tail : {false, true}) { + // For each tail we are processing + + for (auto kv : tail_alignments[handling_right_tail]) { + // For each node that has alignments off in that direction + + // Grab the PathNode we are attached to on the right or left side + auto& attached_path_node_index = kv.first; + // And all the alignments off of there + auto& alns = kv.second; + + // only make anchors from alignments that have score equal to the optimal + for (size_t i = 1; i < alns.size(); ++i) { + if (alns[i].score() < alns.front().score()) { + alns.resize(i); + break; + } + } + +#ifdef debug_multipath_alignment + cerr << "Handling " << (handling_right_tail ? "right" : "left") << " tail off of PathNode " + << attached_path_node_index << " with path " << debug_string(path_nodes.at(attached_path_node_index).path) << endl; +#endif + + for (auto& aln : alns) { + +#ifdef debug_multipath_alignment + cerr << "Tail alignment: " << pb2json(aln) << endl; +#endif + + auto seq_begin = alignment.sequence().begin() + (handling_right_tail ? (alignment.sequence().size() - aln.sequence().size()) : 0); + + // how far have we traveled in the graph since the start of the alignment + size_t cumul_from_length = 0; + // how far have we traveled along the read since the start of the alignment + size_t cumul_to_length = 0; + + // we will keep track of where we are on the current node + size_t offset_on_curr_node = numeric_limits::max(); + + // when we find a match, we will keep track of + size_t curr_match_length = 0; + size_t match_start_mapping_idx = numeric_limits::max(); + size_t match_start_edit_idx = numeric_limits::max(); + size_t curr_match_start_offset = numeric_limits::max(); + + // if it's the right tail, we know the previous index, otherwise there is no previous index + size_t prev_anchor_path_node = handling_right_tail ? 
attached_path_node_index : numeric_limits::max(); + size_t prev_anchor_final_from_length = 0; + + const Path& path = aln.path(); + + // a function that will create a new match node based on these trackers + auto create_synthetic_anchor_node = [&](const size_t& i, const size_t& j) { + + path_nodes.emplace_back(); + PathNode& synth_path_node = path_nodes.back(); + + // copy the first mapping, paying attention to the initial position + path_mapping_t* new_mapping = synth_path_node.path.add_mapping(); + position_t* new_position = new_mapping->mutable_position(); + new_position->set_node_id(path.mapping(match_start_mapping_idx).position().node_id()); + new_position->set_is_reverse(path.mapping(match_start_mapping_idx).position().is_reverse()); + new_position->set_offset(curr_match_start_offset); + + // we should only be able to copy one edit over from this mapping, either because the next one + // is a mismatch or because it's on the next node's mapping + from_proto_edit(path.mapping(match_start_mapping_idx).edit(match_start_edit_idx), + *new_mapping->add_edit()); + + // copy any whole mappings from the middle of the anchor path + for (size_t copy_i = match_start_mapping_idx + 1; copy_i < i; copy_i++) { + assert(path.mapping(copy_i).edit_size() == 1); + from_proto_mapping(path.mapping(copy_i), *synth_path_node.path.add_mapping()); + } + + // on the final mapping we don't need to pay special attention to the initial position, but + // we can't copy over the whole mapping since there might be more edits after the match + if (i > match_start_mapping_idx && j > 0) { + // This condition is broken because N matches get split into separate edits + assert(j == 1); + new_mapping = synth_path_node.path.add_mapping(); + position_t* pos = new_mapping->mutable_position(); + const Position& pos_from = path.mapping(i).position(); + pos->set_node_id(pos_from.node_id()); + pos->set_offset(pos_from.offset()); + pos->set_is_reverse(pos_from.is_reverse()); + from_proto_edit(path.mapping(i).edit(0), *new_mapping->add_edit()); + } + + synth_path_node.end = seq_begin + cumul_to_length; + synth_path_node.begin = synth_path_node.end - curr_match_length; + +#ifdef debug_multipath_alignment + cerr << "\tyielded anchor with path " << debug_string(synth_path_node.path) << " and seq "; + for (auto it = synth_path_node.begin; it != synth_path_node.end; ++it) { + cerr << *it; + } + cerr << endl; + +#endif + // make an edge from the previous synthetic anchor (if it exists) + if (prev_anchor_path_node != numeric_limits::max()) { + path_nodes[prev_anchor_path_node].edges.emplace_back(path_nodes.size() - 1, + cumul_from_length - curr_match_length - prev_anchor_final_from_length); +#ifdef debug_multipath_alignment + cerr << "\talso making edge from path node at " << prev_anchor_path_node << " with length " << cumul_from_length - curr_match_length - prev_anchor_final_from_length << endl; + +#endif + } + + // mark this anchor as the new anchor + prev_anchor_path_node = path_nodes.size() - 1; + prev_anchor_final_from_length = cumul_from_length; + }; + + // iterate over the path, updating the tracking variables as we go + for (size_t i = 0; i < path.mapping_size(); i++) { + const Mapping& mapping = path.mapping(i); + + // new node, new offset + offset_on_curr_node = mapping.position().offset(); + + for (size_t j = 0; j < mapping.edit_size(); j++) { + + const Edit& edit = mapping.edit(j); + if (edit.from_length() != edit.to_length() || !edit.sequence().empty()) { + // we've found a non-match edit + + if (curr_match_length >= 
min_anchor_length) { + // we are coming out of a match that is long enough for us to make a new anchor + create_synthetic_anchor_node(i, j); + } + + // mark the trackers that indicate that we are not in a match + curr_match_start_offset = numeric_limits::max(); + curr_match_length = 0; + } + else { + // we've found a match + + if (curr_match_start_offset == numeric_limits::max()) { + // we're starting a new match, update the + curr_match_start_offset = offset_on_curr_node; + match_start_mapping_idx = i; + match_start_edit_idx = j; + } + + // update the length of the match + curr_match_length += edit.from_length(); + } + + // update our positional trackers + offset_on_curr_node += edit.from_length(); + cumul_from_length += edit.from_length(); + cumul_to_length += edit.to_length(); + } + } + + if (curr_match_length > 0) { + // we were still in a match when we finished up, so we want to finish off the anchor + create_synthetic_anchor_node(path.mapping_size() - 1, path.mapping(path.mapping_size() - 1).edit_size()); + } + + if (!handling_right_tail && prev_anchor_path_node != numeric_limits::max()) { + // we need to make an edge from the final new anchor to the anchor we pinned to + path_nodes[prev_anchor_path_node].edges.emplace_back(attached_path_node_index, + cumul_from_length - prev_anchor_final_from_length); +#ifdef debug_multipath_alignment + cerr << "adding final edge to " << attached_path_node_index << " with length " << cumul_from_length - prev_anchor_final_from_length << endl; + +#endif + } + } + } + } + + // Now we've created new PathNodes for all the perfect matches in the tail alignments. + // They can be resected out of snarls just like the original ones. + } + + void MultipathAlignmentGraph::add_reachability_edges(const HandleGraph& graph, + const function(id_t)>& project, + const unordered_multimap>& injection_trans, + vector* path_node_provenance) { // We're going to make "reachability" edges, which connect MEMs (which @@ -1233,6 +2499,13 @@ namespace vg { // there. +#ifdef debug_multipath_alignment + cerr << "computing reachability" << endl; +#endif + + // Don't let people do this twice. + assert(!has_reachability_edges); + // optimization: we never add edges unless there are multiple nodes, and frequently there is only one // so we can skip traversing over the entire graph if (path_nodes.size() <= 1) { @@ -1243,14 +2516,6 @@ namespace vg { return; } - -#ifdef debug_multipath_alignment - cerr << "computing reachability" << endl; -#endif - - // Don't let people do this twice. - assert(!has_reachability_edges); - // now we calculate reachability between the walked paths so we know which ones // to connect with intervening alignments @@ -1263,8 +2528,8 @@ namespace vg { /// Get the offset in the first visited graph node at which the given MEM ends (i.e. the past-the-end offset). /// Does not account for orientation. auto end_offset = [&](size_t idx) { - Path& path = path_nodes[idx].path; - const Mapping& mapping = path.mapping(path.mapping_size() - 1); + path_t& path = path_nodes[idx].path; + const path_mapping_t& mapping = path.mapping(path.mapping_size() - 1); return mapping.position().offset() + mapping_from_length(mapping); }; @@ -1277,7 +2542,7 @@ namespace vg { /// Get the ID of the last node visited in the graph along the path for a MEM. /// Does not account for orientation. 
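The synthesized tail anchors above come from scanning an alignment's edits for maximal runs of exact matches and keeping only runs long enough to serve as anchors. Reduced to a flat list of (is_match, length) edits, the scan looks roughly like this sketch (hypothetical names; indel lengths are treated uniformly here for brevity):

#include <cstddef>
#include <utility>
#include <vector>

// Report (read offset, length) for every run of matches of at least min_len.
static std::vector<std::pair<size_t, size_t>>
long_match_runs(const std::vector<std::pair<bool, size_t>>& edits, size_t min_len) {
    std::vector<std::pair<size_t, size_t>> anchors;
    size_t read_pos = 0;
    size_t run_start = 0;
    size_t run_len = 0;
    for (const auto& edit : edits) {
        if (edit.first) {
            if (run_len == 0) {
                run_start = read_pos;          // a new run of matches begins here
            }
            run_len += edit.second;
        } else {
            if (run_len >= min_len) {
                anchors.emplace_back(run_start, run_len);
            }
            run_len = 0;                       // a mismatch or indel breaks the run
        }
        read_pos += edit.second;
    }
    if (run_len >= min_len) {
        anchors.emplace_back(run_start, run_len);   // close out a run at the very end
    }
    return anchors;
}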
auto end_node_id = [&](size_t idx) { - Path& path = path_nodes[idx].path; + path_t& path = path_nodes[idx].path; return path.mapping(path.mapping_size() - 1).position().node_id(); }; @@ -1286,18 +2551,29 @@ namespace vg { return end ? end_offset(idx) : start_offset(idx); }; - /// Get the node ID in the VG graph of either the start or end position of the given MEM, according to the end flag. + /// Get the node ID in the graph of either the start or end position of the given MEM, according to the end flag. auto endpoint_node_id = [&](size_t idx, bool end) { return end ? end_node_id(idx) : start_node_id(idx); }; + auto non_empty_from_length = [&](size_t idx) { + for (const auto& mapping : path_nodes[idx].path.mapping()) { + for (const auto& edit : mapping.edit()) { + if (edit.from_length()) { + return true; + } + } + } + return false; + }; + // record the start and end node ids of every path // Maps from node ID to the list of MEM numbers that start on that node. unordered_map> path_starts; // Maps from node ID to the list of MEM numbers that end on that node. unordered_map> path_ends; for (size_t i = 0; i < path_nodes.size(); i++) { - Path& path = path_nodes[i].path; + path_t& path = path_nodes[i].path; path_starts[path.mapping(0).position().node_id()].push_back(i); path_ends[path.mapping(path.mapping_size() - 1).position().node_id()].push_back(i); } @@ -1328,14 +2604,14 @@ namespace vg { for (pair>& node_starts : path_starts) { std::sort(node_starts.second.begin(), node_starts.second.end(), [&](const size_t idx_1, const size_t idx_2) { - return start_offset(idx_1) < start_offset(idx_2); - }); + return start_offset(idx_1) < start_offset(idx_2); + }); } for (pair>& node_ends : path_ends) { std::sort(node_ends.second.begin(), node_ends.second.end(), [&](const size_t idx_1, const size_t idx_2) { - return end_offset(idx_1) < end_offset(idx_2); - }); + return end_offset(idx_1) < end_offset(idx_2); + }); } // The "ranges" that are used below (range_start, range_end, etc.) @@ -1372,26 +2648,26 @@ namespace vg { unordered_map>> reachable_ends_from_end; unordered_map>> reachable_starts_from_end; - // note: graph has been sorted into topological order - - Graph& graph = vg.graph; - for (int64_t i = 0; i < graph.node_size(); i++) { - Node* node = graph.mutable_node(i); - id_t node_id = node->id(); + // get a topological order over the nodes in the graph to iterate over + vector topological_order = handlealgs::lazier_topological_order(&graph); + for (int64_t i = 0; i < topological_order.size(); i++) { + id_t node_id = graph.get_id(topological_order[i]); #ifdef debug_multipath_alignment cerr << "DP step for graph node " << node_id << endl; #endif - size_t node_length = node->sequence().size(); + size_t node_length = graph.get_length(topological_order[i]); // do any MEMs start or end on this node? bool contains_starts = path_starts.count(node_id); bool contains_ends = path_ends.count(node_id); // we will use DP to carry reachability information forward onto the next nodes - vector nexts; - vg.nodes_next(NodeTraversal(node), nexts); + vector nexts; + graph.follow_edges(topological_order[i], false, [&](const handle_t& next) { + nexts.push_back(next); + }); if (contains_starts && contains_ends) { // since there are both starts and ends on this node, we have to traverse both lists simultaneously @@ -1416,10 +2692,10 @@ namespace vg { size_t curr_end_offset = end_offset(ends[end_range_begin]); size_t prev_offset = 0; - while (end_range_end == ends.size() ? 
false : end_offset(ends[end_range_end]) == curr_end_offset) { + while (end_range_end != ends.size() && end_offset(ends[end_range_end]) == curr_end_offset) { end_range_end++; } - while (start_range_end == starts.size() ? false : start_offset(starts[start_range_end]) == curr_start_offset) { + while (start_range_end != starts.size() && start_offset(starts[start_range_end]) == curr_start_offset) { start_range_end++; } @@ -1493,7 +2769,7 @@ namespace vg { prev_offset = *curr_offset; if (*range_begin != endpoints->size()) { *curr_offset = endpoint_offset(endpoints->at(*range_begin), at_end); - while (*range_end == endpoints->size() ? false : endpoint_offset(endpoints->at(*range_end), at_end) == *curr_offset) { + while (*range_end != endpoints->size() && endpoint_offset(endpoints->at(*range_end), at_end) == *curr_offset) { (*range_end)++; } } @@ -1569,7 +2845,7 @@ namespace vg { prev_offset = *curr_offset; if (*range_begin != endpoints->size()) { *curr_offset = endpoint_offset(endpoints->at(*range_begin), at_end); - while (*range_end == endpoints->size() ? false : endpoint_offset(endpoints->at(*range_end), at_end) == *curr_offset) { + while (*range_end != endpoints->size() && endpoint_offset(endpoints->at(*range_end), at_end) == *curr_offset) { (*range_end)++; } } @@ -1653,7 +2929,7 @@ namespace vg { if (*range_begin != endpoints->size()) { *curr_offset = endpoint_offset(endpoints->at(*range_begin), at_end); - while (*range_end == endpoints->size() ? false : endpoint_offset(endpoints->at(*range_end), at_end) == *curr_offset) { + while (*range_end != endpoints->size() && endpoint_offset(endpoints->at(*range_end), at_end) == *curr_offset) { (*range_end)++; } } @@ -1670,8 +2946,8 @@ namespace vg { cerr << "\tcarrying forward reachability onto next nodes at distance " << dist_thru << endl; #endif - for (NodeTraversal next : nexts) { - unordered_map& reachable_endpoints_next = (*reachable_endpoints)[next.node->id()]; + for (const handle_t& next : nexts) { + unordered_map& reachable_endpoints_next = (*reachable_endpoints)[graph.get_id(next)]; for (size_t j = *prev_range_begin; j < endpoints->size(); j++) { if (reachable_endpoints_next.count(endpoints->at(j))) { reachable_endpoints_next[endpoints->at(j)] = std::min(reachable_endpoints_next[endpoints->at(j)], dist_thru); @@ -1681,7 +2957,7 @@ namespace vg { } #ifdef debug_multipath_alignment - cerr << "\t\t" << "endpoint of M" << endpoints->at(j) << " at dist " << reachable_endpoints_next[endpoints->at(j)] << " to node " << next.node->id() << endl; + cerr << "\t\t" << "endpoint of M" << endpoints->at(j) << " at dist " << reachable_endpoints_next[endpoints->at(j)] << " to node " << graph.get_id(next) << endl; #endif } @@ -1723,7 +2999,7 @@ namespace vg { size_t curr_offset = endpoint_offset(endpoints->at(range_begin), contains_ends); size_t prev_offset = curr_offset; // find the range of endpoints that are at the first offset - while (range_end == endpoints->size() ? false : endpoint_offset(endpoints->at(range_end), contains_ends) == curr_offset) { + while (range_end < endpoints->size() && endpoint_offset(endpoints->at(range_end), contains_ends) == curr_offset) { range_end++; } @@ -1754,7 +3030,7 @@ namespace vg { // find the range of endpoints at this offset prev_offset = curr_offset; curr_offset = endpoint_offset(endpoints->at(range_begin), contains_ends); - while (range_end == endpoints->size() ? 
false : endpoint_offset(endpoints->at(range_end), contains_ends) == curr_offset) { + while (range_end < endpoints->size() && endpoint_offset(endpoints->at(range_end), contains_ends) == curr_offset) { range_end++; } @@ -1786,9 +3062,9 @@ namespace vg { cerr << "\tcarrying forward reachability onto next nodes at distance " << dist_thru << endl; #endif - for (NodeTraversal next : nexts) { + for (const handle_t& next : nexts) { - unordered_map& reachable_endpoints_next = (*reachable_endpoints)[next.node->id()]; + unordered_map& reachable_endpoints_next = (*reachable_endpoints)[graph.get_id(next)]; for (size_t j = prev_range_begin; j < endpoints->size(); j++) { if (reachable_endpoints_next.count(endpoints->at(j))) { reachable_endpoints_next[endpoints->at(j)] = std::min(reachable_endpoints_next[endpoints->at(j)], dist_thru); @@ -1798,7 +3074,7 @@ namespace vg { } #ifdef debug_multipath_alignment - cerr << "\t\t" << (contains_ends ? "end" : "start") << " of M" << endpoints->at(j) << " at dist " << reachable_endpoints_next[endpoints->at(j)] << " to node " << next.node->id() << endl; + cerr << "\t\t" << (contains_ends ? "end" : "start") << " of M" << endpoints->at(j) << " at dist " << reachable_endpoints_next[endpoints->at(j)] << " to node " << graph.get_id(next) << endl; #endif } } @@ -1811,12 +3087,12 @@ namespace vg { cerr << "\tnode " << node_id << " does not contain starts or ends of MEMs, carrying forward reachability" << endl; #endif - for (NodeTraversal next : nexts) { - unordered_map& reachable_ends_next = reachable_ends[next.node->id()]; + for (const handle_t& next : nexts) { + unordered_map& reachable_ends_next = reachable_ends[graph.get_id(next)]; for (const pair& reachable_end : reachable_ends[node_id]) { size_t dist_thru = reachable_end.second + node_length; #ifdef debug_multipath_alignment - cerr << "\t\tend of M" << reachable_end.first << " at dist " << dist_thru << " to node " << next.node->id() << endl; + cerr << "\t\tend of M" << reachable_end.first << " at dist " << dist_thru << " to node " << graph.get_id(next) << endl; #endif if (reachable_ends_next.count(reachable_end.first)) { reachable_ends_next[reachable_end.first] = std::min(reachable_ends_next[reachable_end.first], @@ -1827,11 +3103,11 @@ namespace vg { } } - unordered_map& reachable_starts_next = reachable_starts[next.node->id()]; + unordered_map& reachable_starts_next = reachable_starts[graph.get_id(next)]; for (const pair& reachable_start : reachable_starts[node_id]) { size_t dist_thru = reachable_start.second + node_length; #ifdef debug_multipath_alignment - cerr << "\t\tstart of M" << reachable_start.first << " at dist " << dist_thru << " to node " << next.node->id() << endl; + cerr << "\t\tstart of M" << reachable_start.first << " at dist " << dist_thru << " to node " << graph.get_id(next) << endl; #endif if (reachable_starts_next.count(reachable_start.first)) { reachable_starts_next[reachable_start.first] = std::min(reachable_starts_next[reachable_start.first], @@ -1884,19 +3160,21 @@ namespace vg { vector> noncolinear_shells(path_nodes.size()); - // tuples of (overlap size, index onto, index from, dist) - vector> confirmed_overlaps; + // map from index_from to maps of index_onto to (overlap to length, overlap from length, dist) + unordered_map>> confirmed_overlaps; + // map from path index to set of indexes whose start occurs on the path + unordered_map> path_starts_on_path; - for (size_t i = 0; i < graph.node_size(); i++) { - id_t node_id = graph.node(i).id(); + for (size_t i = 0; i < topological_order.size(); 
i++) { + id_t node_id = graph.get_id(topological_order[i]); #ifdef debug_multipath_alignment cerr << "looking for edges for starts on node " << node_id << endl; #endif - if (!path_starts.count(node_id)) { + if (!path_starts.count(node_id) && !path_ends.count(node_id)) { #ifdef debug_multipath_alignment - cerr << "there are no starts on this node" << endl; + cerr << "there are no starts or ends on this node" << endl; #endif continue; } @@ -1907,372 +3185,458 @@ namespace vg { vector& starts = path_starts[node_id]; vector& ends = path_ends[node_id]; - // index of the next end that is past the start we are on - size_t next_end_idx = 0; - // sentinel that will never be equal to the first offset - size_t curr_start_offset = numeric_limits::max(); - - for (size_t start_idx = 0; start_idx < starts.size(); start_idx++) { - // traverse all of the reachable starts to find the adjacent ends that might be colinear - - size_t start = starts[start_idx]; -#ifdef debug_multipath_alignment - cerr << "searching backward from start " << start << endl; -#endif - - PathNode& start_node = path_nodes[start]; - unordered_map& noncolinear_shell = noncolinear_shells[start]; + size_t start_idx = 0, end_idx = 0; + size_t curr_start_offset = numeric_limits::max(), curr_end_offset = numeric_limits::max(); + if (!starts.empty()) { + curr_start_offset = start_offset(starts[0]); + } + if (!ends.empty()) { + curr_end_offset = end_offset(ends[0]); + } + while (start_idx < starts.size() || end_idx < ends.size()) { - // pairs of (dist, index) - priority_queue, vector>, std::greater>> start_queue; - priority_queue, vector>, std::greater>> end_queue; - start_queue.emplace(0, start); + // TODO: would it be better to combine these into one queue? - unordered_set traversed_start; + // initialize queues for the next start and next end, prioritized by shortest distance + structures::RankPairingHeap> start_queue, end_queue; - while (!start_queue.empty()) { - pair start_here = start_queue.top(); - start_queue.pop(); - if (traversed_start.count(start_here.second)) { - continue; - } - traversed_start.insert(start_here.second); -#ifdef debug_multipath_alignment - cerr << "traversing initial start " << start_here.second << " at distance " << start_here.first << endl; -#endif + if (curr_start_offset >= curr_end_offset) { + + // the next endpoint is an end, the point of searching backwards from these is to + // fill out the non-colinear shell of the current end with any path whose end is between + // this path's start and end but is not overlap colinear + + size_t end = ends[end_idx]; - // the minimum distance to each of the starts or ends this can reach is the sum of the min distance - // between them and the distance already traversed - for (const pair& end : reachable_ends_from_start[start_here.second]) { - end_queue.emplace(start_here.first + end.second, end.first); #ifdef debug_multipath_alignment - cerr << "found reachable end " << end.first << " at distance " << start_here.first + end.second << endl; + cerr << "searching backward from end " << end << endl; #endif - } + PathNode& end_node = path_nodes[end]; + unordered_map& noncolinear_shell = noncolinear_shells[end]; - for (const pair& start_next : reachable_starts_from_start[start_here.second]) { - start_queue.emplace(start_here.first + start_next.second, start_next.first); + for (const pair& next_end : reachable_ends_from_end[end]) { + end_queue.push_or_reprioritize(next_end.first, next_end.second); } - } - - // now we've traversed all of the starts, we have the set of ends that 
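The reachability pass above is a forward dynamic program over a topological order: each node keeps the minimum distance to every MEM endpoint seen so far and pushes those distances, plus its own length, onto its successors. A compact standalone sketch over a DAG whose nodes are already indexed in topological order (all names hypothetical, standard library only):

#include <cstddef>
#include <unordered_map>
#include <vector>

// next[v] lists the successors of v; label_at_node marks nodes carrying an endpoint.
static std::vector<std::unordered_map<size_t, size_t>>
min_dist_to_labels(const std::vector<std::vector<size_t>>& next,
                   const std::vector<size_t>& node_length,
                   const std::unordered_map<size_t, size_t>& label_at_node) {
    std::vector<std::unordered_map<size_t, size_t>> reachable(next.size());
    for (size_t v = 0; v < next.size(); ++v) {
        auto it = label_at_node.find(v);
        if (it != label_at_node.end()) {
            reachable[v][it->second] = 0;              // an endpoint sits on this node
        }
        for (size_t w : next[v]) {
            for (const auto& rec : reachable[v]) {
                size_t dist_thru = rec.second + node_length[v];   // carry forward through v
                auto found = reachable[w].find(rec.first);
                if (found == reachable[w].end() || dist_thru < found->second) {
                    reachable[w][rec.first] = dist_thru;
                }
            }
        }
    }
    return reachable;
}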
can be reached - // without passing another end - - unordered_set traversed_end; - - while (!end_queue.empty()) { - size_t candidate_end = end_queue.top().second; - size_t candidate_dist = end_queue.top().first; - end_queue.pop(); - if (traversed_end.count(candidate_end)) { - continue; + for (const pair& start_next : reachable_starts_from_end[end]) { + start_queue.push_or_reprioritize(start_next.first, start_next.second); } -#ifdef debug_multipath_alignment - cerr << "considering end " << candidate_end << " as candidate for edge of dist " << candidate_dist << endl; -#endif - traversed_end.insert(candidate_end); - PathNode& candidate_end_node = path_nodes[candidate_end]; - - if (candidate_end_node.end <= start_node.begin) { - // these MEMs are read colinear and graph reachable, so connect them - candidate_end_node.edges.emplace_back(start, candidate_dist); - -#ifdef debug_multipath_alignment - cerr << "connection is read colinear, adding edge on " << candidate_end << " for total of " << candidate_end_node.edges.size() << " edges so far" << endl; - for (auto& edge : candidate_end_node.edges) { - cerr << "\t-> " << edge.first << " dist " << edge.second << endl; + while (!start_queue.empty() || !end_queue.empty()) { + // is the next item on the queues a start or an end? + if (!start_queue.empty() && (end_queue.empty() || start_queue.top().second < end_queue.top().second)) { + + // the next closest endpoint is a start, traverse through it to find ends (which is what we really want) + + pair start_here = start_queue.top(); + start_queue.pop(); + + // don't keep looking backward earlier than the start of the current path + if (start_here.first == end) { + continue; + } + + // the minimum distance to each of the starts or ends this can reach is (at most) the sum of the min distance + // between them and the distance already traversed + for (const pair& next_end : reachable_ends_from_start[start_here.first]) { + end_queue.push_or_reprioritize(next_end.first, start_here.second + next_end.second); + } + + for (const pair& start_next : reachable_starts_from_start[start_here.first]) { + start_queue.push_or_reprioritize(start_next.first, start_here.second + start_next.second); + } } -#endif - - // skip to the predecessor's noncolinear shell, whose connections might not be blocked by - // this connection - for (const pair& shell_pred : noncolinear_shells[candidate_end]) { -#ifdef debug_multipath_alignment - cerr << "enqueueing " << shell_pred.first << " at dist " << shell_pred.second + candidate_dist << " from noncolinear shell" << endl; -#endif - end_queue.emplace(candidate_dist + shell_pred.second, shell_pred.first); + else { + + // the next closest endpoint is an end, so we'll check whether we can + + pair end_here = end_queue.top(); + end_queue.pop(); + + PathNode& next_end_node = path_nodes[end_here.first]; + + // these are non-colinear, so add it to the non-colinear shell + if (next_end_node.begin >= end_node.begin || next_end_node.end >= end_node.end) { + if (noncolinear_shell.count(end_here.first)) { + noncolinear_shell[end_here.first] = std::min(end_here.second, noncolinear_shell[end_here.first]); + } + else { + noncolinear_shell[end_here.first] = end_here.second; + } + continue; + } + + // if we get this far, the two paths are colinear or overlap-colinear, so we won't add it to the + // non-colinear shell. now we need to decide whether to keep searching backward. 
we'll check a + // few conditions that will guarantee that the rest of the search is redundant + + // TODO: this actually isn't a full set of criteria, we don't just want to know if there is an + // edge, we want to know if it is reachable along any series of edges... + // at least this will only cause a few false positive edges that we can remove later with + // the transitive reduction + + // see if this node has an edge forward + bool has_edge_forward = false; + for (auto& edge : next_end_node.edges) { + if (edge.first == end) { + has_edge_forward = true; + break; + } + } + + if (has_edge_forward) { // already has an edge, can stop + continue; + } + + auto overlap_iter = confirmed_overlaps.find(end_here.first); + if (overlap_iter != confirmed_overlaps.end()) { + if (overlap_iter->second.count(end)) { + has_edge_forward = true; + break; + } + } + + if (has_edge_forward) { // already has an overlap, can stop + continue; + } + + // we can't easily guarantee that this is non-colinear or colinear, so we're just going to treat it + // as non-colinear and accept some risk of this creating redundant edges to later nodes + if (noncolinear_shell.count(end_here.first)) { + noncolinear_shell[end_here.first] = std::min(end_here.second, noncolinear_shell[end_here.first]); + } + else { + noncolinear_shell[end_here.first] = end_here.second; + } } } - else if (start_node.end > candidate_end_node.end && start_node.begin > candidate_end_node.begin) { - // the MEM can be made colinear by removing an overlap, which will not threaten reachability - size_t overlap = candidate_end_node.end - start_node.begin; - confirmed_overlaps.emplace_back(overlap, start, candidate_end, candidate_dist + overlap); - -#ifdef debug_multipath_alignment - cerr << "connection is overlap colinear, recording to add edge later" << endl; -#endif - - // the end of this node might not actually block connections since it's going to intersect the middle of the node - // so we need to find predecessors to this end too - - // add any ends directly reachable from the end - for (const pair& exposed_end : reachable_ends_from_end[candidate_end]) { - end_queue.emplace(candidate_dist + exposed_end.second, exposed_end.first); -#ifdef debug_multipath_alignment - cerr << "found reachable exposed end " << exposed_end.first << " at distance " << candidate_dist + exposed_end.second << endl; -#endif - } - - // traverse through any exposed starts to see if we can find other exposed ends - priority_queue, vector>, std::greater>> exposed_start_queue; - unordered_set traversed_exposed_start; - - // inialize the queue with the directly reachable exposed starts - for (const pair& exposed_start : reachable_starts_from_end[candidate_end]) { + + end_idx++; + curr_end_offset = (end_idx == ends.size() ? 
numeric_limits::max() : end_offset(ends[end_idx])); + } + else { + + size_t start = starts[start_idx]; + #ifdef debug_multipath_alignment - cerr << "initializing exposed start traversal with " << exposed_start.first << " at distance " << candidate_dist + exposed_start.second << endl; + cerr << "searching backward from start " << start << " at index " << start_idx << endl; #endif - exposed_start_queue.emplace(candidate_dist + exposed_start.second, exposed_start.first); - } - - while (!exposed_start_queue.empty()) { - pair start_here = exposed_start_queue.top(); - exposed_start_queue.pop(); - if (traversed_exposed_start.count(start_here.second)) { - continue; - } - traversed_exposed_start.insert(start_here.second); + + PathNode& start_node = path_nodes[start]; + unordered_map& noncolinear_shell = noncolinear_shells[start]; + // TODO: kinda ugly + // init this to 0, we'll actually compute it if we need it ever + size_t start_node_from_length = 0; + + // we begin at the start we're searching backward from + start_queue.push_or_reprioritize(start, 0); + + while (!start_queue.empty() || !end_queue.empty()) { + // is the next item on the queues a start or an end? + if (!start_queue.empty() && (end_queue.empty() || start_queue.top().second < end_queue.top().second)) { + + // the next closest endpoint is a start, traverse through it to find ends (which is what we really want) + + pair start_here = start_queue.top(); + start_queue.pop(); + #ifdef debug_multipath_alignment - cerr << "traversing exposed start " << start_here.second << " at distance " << start_here.first << endl; + cerr << "traversing start " << start_here.first << " at distance " << start_here.second << endl; #endif // the minimum distance to each of the starts or ends this can reach is the sum of the min distance // between them and the distance already traversed - for (const pair& end : reachable_ends_from_start[start_here.second]) { - end_queue.emplace(start_here.first + end.second, end.first); + for (const pair& end : reachable_ends_from_start[start_here.first]) { + end_queue.push_or_reprioritize(end.first, start_here.second + end.second); #ifdef debug_multipath_alignment - cerr << "found reachable exposed end " << end.first << " at distance " << start_here.first + end.second << endl; + cerr << "found reachable end " << end.first << " at distance " << start_here.second + end.second << endl; #endif } - for (const pair& start_next : reachable_starts_from_start[start_here.second]) { - exposed_start_queue.emplace(start_here.first + start_next.second, start_next.first); + for (const pair& start_next : reachable_starts_from_start[start_here.first]) { + start_queue.push_or_reprioritize(start_next.first, start_here.second + start_next.second); } } - - // also skip to the predecessor's noncolinear shell, whose connections might not be blocked by - // this connection - for (const pair& shell_pred : noncolinear_shells[candidate_end]) { -#ifdef debug_multipath_alignment - cerr << "enqueueing " << shell_pred.first << " at dist " << candidate_dist + shell_pred.second << " from noncolinear shell" << endl; -#endif - end_queue.emplace(candidate_dist + shell_pred.second, shell_pred.first); - } - } - else { - // these MEMs are noncolinear, so add this predecessor to the noncolinear shell - if (noncolinear_shell.count(candidate_end)) { - noncolinear_shell[candidate_end] = std::min(candidate_dist + (start_node.end - start_node.begin), - noncolinear_shell[candidate_end]); - } else { - noncolinear_shell[candidate_end] = candidate_dist + (start_node.end 
- start_node.begin); - } - + + // the next closest endpoint is an end, so we check if we can make a connection to the start + // that we're searching backward from + + size_t candidate_end, candidate_dist; + tie(candidate_end, candidate_dist) = end_queue.top(); + end_queue.pop(); + #ifdef debug_multipath_alignment - cerr << "connection is noncolinear, add to shell at dist " << candidate_dist + (start_node.end - start_node.begin) << " and continue to search backwards" << endl; + cerr << "considering end " << candidate_end << " as candidate for edge of dist " << candidate_dist << endl; #endif - - // there is no connection to block further connections back, so any of this MEMs - // predecessors could still be colinear - - // find the ends that can reach it directly - for (const pair& pred_end : reachable_ends_from_end[candidate_end]) { - end_queue.emplace(candidate_dist + pred_end.second, pred_end.first); + + PathNode& candidate_end_node = path_nodes[candidate_end]; + + if (candidate_end_node.end <= start_node.begin) { + // these MEMs are read colinear and graph reachable + if (candidate_dist != 0 + || candidate_end_node.end != candidate_end_node.begin + || start_node.end != start_node.begin + || non_empty_from_length(start) + || non_empty_from_length(candidate_end)) { + // and they are not empty nodes at the exact same position, so add an edge + // (this is almost always the case, but some code paths will add empty read sequences + // to anchor to specific locations, which slightly confuses the reachability logic) + candidate_end_node.edges.emplace_back(start, candidate_dist); + +#ifdef debug_multipath_alignment + cerr << "connection is read colinear, adding edge on " << candidate_end << " for total of " << candidate_end_node.edges.size() << " edges so far" << endl; + for (auto& edge : candidate_end_node.edges) { + cerr << "\t-> " << edge.first << " dist " << edge.second << endl; + } +#endif + } + + // skip to the predecessor's noncolinear shell, whose connections might not be blocked by + // this connection + for (const pair& shell_pred : noncolinear_shells[candidate_end]) { #ifdef debug_multipath_alignment - cerr << "found reachable end " << pred_end.first << " at distance " << candidate_dist + pred_end.second << endl; + cerr << "enqueueing " << shell_pred.first << " at dist " << shell_pred.second + candidate_dist << " from noncolinear shell" << endl; #endif - } - - // traverse backward through any starts to find more ends that can reach this MEM - priority_queue, vector>, std::greater>> pred_start_queue; - - // initialize the queue with the immediate start neighbors - for (const pair& pred_start : reachable_starts_from_end[candidate_end]) { - pred_start_queue.emplace(candidate_dist + pred_start.second, pred_start.first); - } - - unordered_set pred_traversed; - - // traverse backwards through starts, stopping at any ends - while (!pred_start_queue.empty()) { - size_t start_here = pred_start_queue.top().second; - size_t start_dist = pred_start_queue.top().first; - pred_start_queue.pop(); - if (pred_traversed.count(start_here)) { - continue; + end_queue.push_or_reprioritize(shell_pred.first, candidate_dist + shell_pred.second); + } } - pred_traversed.insert(start_here); - + else if (candidate_end_node.begin < start_node.begin && candidate_end_node.end < start_node.end) { + // the MEM can be made colinear by removing an overlap, which will not threaten reachability + size_t read_overlap = candidate_end_node.end - start_node.begin; + size_t graph_overlap = 
corresponding_from_length(start_node.path, read_overlap, false); + confirmed_overlaps[start][candidate_end] = make_tuple(read_overlap, graph_overlap, + candidate_dist + graph_overlap); + #ifdef debug_multipath_alignment - cerr << "traversing predecessor start " << start_here << " at distance " << start_dist << endl; + cerr << "connection is overlap colinear, recording to add edge later" << endl; #endif - - for (const pair& pred_end : reachable_ends_from_start[start_here]) { - end_queue.emplace(start_dist + pred_end.second, pred_end.first); + + // the end of this node might not actually block connections since it's going to intersect the middle of the node + // so we need to find predecessors to this end too + + // add any ends directly reachable from the end + for (const pair& exposed_end : reachable_ends_from_end[candidate_end]) { + end_queue.push_or_reprioritize(exposed_end.first, candidate_dist + exposed_end.second); +#ifdef debug_multipath_alignment + cerr << "found reachable exposed end " << exposed_end.first << " at distance " << candidate_dist + exposed_end.second << endl; +#endif + } + + // add the directly reachable exposed starts to the queue + for (const pair& exposed_start : reachable_starts_from_end[candidate_end]) { +#ifdef debug_multipath_alignment + cerr << "adding exposed start traversal with " << exposed_start.first << " at distance " << candidate_dist + exposed_start.second << endl; +#endif + start_queue.push_or_reprioritize(exposed_start.first, candidate_dist + exposed_start.second); + } + + // also skip to the predecessor's noncolinear shell, whose connections might not be blocked by + // this connection + for (const pair& shell_pred : noncolinear_shells[candidate_end]) { #ifdef debug_multipath_alignment - cerr << "found reachable end " << pred_end.first << " at distance " << candidate_dist + pred_end.second << endl; + cerr << "enqueueing " << shell_pred.first << " at dist " << candidate_dist + shell_pred.second << " from noncolinear shell" << endl; #endif + end_queue.push_or_reprioritize(shell_pred.first, candidate_dist + shell_pred.second); + } } - for (const pair& start_next : reachable_starts_from_start[start_here]) { - pred_start_queue.emplace(start_dist + start_next.second, start_next.first); + else { + // these MEMs are noncolinear, so add this predecessor to the noncolinear shell + if (start_node_from_length == 0) { + start_node_from_length = path_from_length(start_node.path); + } + if (noncolinear_shell.count(candidate_end)) { + noncolinear_shell[candidate_end] = std::min(candidate_dist + start_node_from_length, + noncolinear_shell[candidate_end]); + } + else { + noncolinear_shell[candidate_end] = candidate_dist + start_node_from_length; + } + +#ifdef debug_multipath_alignment + cerr << "connection is noncolinear, add to shell at dist " << candidate_dist + start_node_from_length << " and continue to search backwards" << endl; +#endif + + // there is no connection to block further connections back, so any of this MEMs + // predecessors could still be colinear + + // find the ends that can reach it directly + for (const pair& pred_end : reachable_ends_from_end[candidate_end]) { + end_queue.push_or_reprioritize(pred_end.first, candidate_dist + pred_end.second); #ifdef debug_multipath_alignment - cerr << "found intermediate start " << start_next.first << " at distance " << start_dist + start_next.second << endl; + cerr << "found reachable end " << pred_end.first << " at distance " << candidate_dist + pred_end.second << endl; #endif + } + + // set the start queue 
up with the immediate start neighbors + for (const pair& pred_start : reachable_starts_from_end[candidate_end]) { + start_queue.push_or_reprioritize(pred_start.first, candidate_dist + pred_start.second); + } } } } - } - -#ifdef debug_multipath_alignment - cerr << "walking path to look for overlaps" << endl; -#endif - - size_t prev_start_offset = curr_start_offset; - curr_start_offset = start_offset(start); - // update the list of starts at this offset earlier in the starts vector - if (curr_start_offset != prev_start_offset) { - colocated_starts.clear(); - } - colocated_starts.push_back(start); - - // move the next end pointer to the one immediately following this start on the node - while (next_end_idx >= ends.size() ? false : end_offset(ends[next_end_idx]) <= curr_start_offset) { - next_end_idx++; - } - - Path& path = path_nodes[start].path; - // the starts that are on this path - unordered_set path_starts_on_path(colocated_starts.begin(), colocated_starts.end()); - // records of (node_idx, overlap length) - vector> overlap_candidates; - - if (path.mapping_size() == 1) { - // TODO: this edge case is a little duplicative, probably could merge #ifdef debug_multipath_alignment - cerr << "path is one mapping long" << endl; + cerr << "walking path to look for overlaps" << endl; #endif - size_t final_offset = end_offset(start); - // record which starts are on the path on this node - for (size_t path_start_idx = start_idx + 1; - path_start_idx >= starts.size() ? false : start_offset(starts[path_start_idx]) < final_offset; - path_start_idx++) { + path_t& path = start_node.path; + + // update the path starts index for the paths that start at the same position + for (size_t colocated_start : colocated_starts) { + path_starts_on_path[colocated_start].insert(start); + path_starts_on_path[start].insert(colocated_start); + } + // records of (node_idx, graph overlap length) + vector> overlap_candidates; + + if (path.mapping_size() == 1) { + // TODO: this edge case is a little duplicative, probably could merge - path_starts_on_path.insert(starts[path_start_idx]); +#ifdef debug_multipath_alignment + cerr << "path is one mapping long" << endl; +#endif - } - // record which ends are on the path on this node - for (size_t path_end_idx = next_end_idx; path_end_idx < ends.size(); path_end_idx++) { - size_t end_offset_here = end_offset(ends[path_end_idx]); - if (end_offset_here < final_offset) { - overlap_candidates.emplace_back(ends[path_end_idx], end_offset_here - curr_start_offset); + size_t final_offset = end_offset(start); + // record which starts are on the path on this node + for (size_t path_start_idx = start_idx + 1; + path_start_idx < starts.size() && start_offset(starts[path_start_idx]) < final_offset; + path_start_idx++) { + + path_starts_on_path[start].insert(starts[path_start_idx]); + } - else { - break; + // record which ends are on the path on this node + for (size_t path_end_idx = end_idx; path_end_idx < ends.size(); path_end_idx++) { + size_t end_offset_here = end_offset(ends[path_end_idx]); + if (end_offset_here < final_offset) { + overlap_candidates.emplace_back(ends[path_end_idx], end_offset_here - curr_start_offset); + } + else { + break; + } } } - } - else { + else { #ifdef debug_multipath_alignment - cerr << "path is multiple mappings long" << endl; + cerr << "path is multiple mappings long" << endl; #endif - - // record which starts are on the path on the first node - for (size_t path_start_idx = start_idx + 1; path_start_idx < starts.size(); path_start_idx++) { - 
path_starts_on_path.insert(starts[path_start_idx]); - } - // record which ends are on the path on the first node - for (size_t path_end_idx = next_end_idx; path_end_idx < ends.size(); path_end_idx++) { - overlap_candidates.emplace_back(ends[path_end_idx], end_offset(ends[path_end_idx]) - curr_start_offset); - } - size_t traversed_length = mapping_from_length(path.mapping(0)); - - for (size_t j = 1; j + 1 < path.mapping_size(); j++) { - id_t path_node_id = path.mapping(j).position().node_id(); - // record which starts are on the path on this node - for (size_t path_start : path_starts[path_node_id]) { - path_starts_on_path.insert(path_start); + + // record which starts are on the path on the first node + for (size_t path_start_idx = start_idx + 1; path_start_idx < starts.size(); path_start_idx++) { + path_starts_on_path[start].insert(starts[path_start_idx]); } - // record which ends are on the path on this node - for (size_t path_end : path_ends[path_node_id]) { - overlap_candidates.emplace_back(path_end, end_offset(path_end) + traversed_length); + // record which ends are on the path on the first node + for (size_t path_end_idx = end_idx; path_end_idx < ends.size(); path_end_idx++) { + overlap_candidates.emplace_back(ends[path_end_idx], end_offset(ends[path_end_idx]) - curr_start_offset); } + size_t traversed_length = mapping_from_length(path.mapping(0)); - traversed_length += mapping_from_length(path.mapping(j)); - } - - id_t final_node_id = path.mapping(path.mapping_size() - 1).position().node_id(); - vector& final_starts = path_starts[final_node_id]; - vector& final_ends = path_ends[final_node_id]; - - size_t final_offset = end_offset(start); - // record which starts are on the path on the last node - for (size_t path_start_idx = 0; - path_start_idx >= final_starts.size() ? 
false : start_offset(final_starts[path_start_idx]) < final_offset; - path_start_idx++) { + for (size_t j = 1; j + 1 < path.mapping_size(); j++) { + id_t path_node_id = path.mapping(j).position().node_id(); + // record which starts are on the path on this node + for (size_t path_start : path_starts[path_node_id]) { + path_starts_on_path[start].insert(path_start); + } + // record which ends are on the path on this node + for (size_t path_end : path_ends[path_node_id]) { + overlap_candidates.emplace_back(path_end, end_offset(path_end) + traversed_length); + } + + traversed_length += mapping_from_length(path.mapping(j)); + } - path_starts_on_path.insert(final_starts[path_start_idx]); + id_t final_node_id = path.mapping(path.mapping_size() - 1).position().node_id(); + vector& final_starts = path_starts[final_node_id]; + vector& final_ends = path_ends[final_node_id]; - } - // record which ends are on the path on the last node - for (size_t path_end_idx = 0; path_end_idx < final_ends.size(); path_end_idx++) { - size_t end_offset_here = end_offset(final_ends[path_end_idx]); - if (end_offset_here < final_offset) { - overlap_candidates.emplace_back(final_ends[path_end_idx], end_offset_here + traversed_length); + size_t final_offset = end_offset(start); + // record which starts are on the path on the last node + for (size_t path_start_idx = 0; + path_start_idx < final_starts.size() && start_offset(final_starts[path_start_idx]) < final_offset; + path_start_idx++) { + + path_starts_on_path[start].insert(final_starts[path_start_idx]); + } - else { - break; + // record which ends are on the path on the last node + for (size_t path_end_idx = 0; path_end_idx < final_ends.size(); path_end_idx++) { + size_t end_offset_here = end_offset(final_ends[path_end_idx]); + if (end_offset_here < final_offset) { + overlap_candidates.emplace_back(final_ends[path_end_idx], end_offset_here + traversed_length); + } + else { + break; + } } } - } - - for (const pair& overlap_candidate : overlap_candidates) { + + for (const pair& overlap_candidate : overlap_candidates) { #ifdef debug_multipath_alignment - cerr << "considering candidate overlap from " << overlap_candidate.first << " at dist " << overlap_candidate.second << endl; + cerr << "considering candidate overlap from " << overlap_candidate.first << " at dist " << overlap_candidate.second << endl; #endif - - if (path_starts_on_path.count(overlap_candidate.first)) { - // the start of this MEM is also on the path, so this can't be an overhanging overlap - continue; - } - - PathNode& overlap_node = path_nodes[overlap_candidate.first]; - - // how much do the paths overlap? - size_t overlap = overlap_candidate.second; - - // are the paths read colinear after removing the overlap? - if (start_node.begin + overlap >= overlap_node.end) { + + if (path_starts_on_path[start].count(overlap_candidate.first)) { + // the start of this MEM is also on the path, so this can't be an overhanging overlap + continue; + } + + if (!path_starts_on_path[overlap_candidate.first].count(start)) { + // the path we are walking doesn't start on the other path, so this can't be a full overlap + continue; + } + + PathNode& overlap_node = path_nodes[overlap_candidate.first]; + + // how much do the paths overlap? + size_t graph_overlap = overlap_candidate.second; + size_t read_overlap = corresponding_to_length(path, graph_overlap, false); + + // are the paths read colinear after removing the overlap? 
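+ // (equivalently, the overlapping node must end at or before start_node.begin + read_overlap, the read position where this node would be cut)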
+ if (start_node.begin + read_overlap >= overlap_node.end) { #ifdef debug_multipath_alignment - cerr << "confirmed overlap colinear with overlap of " << overlap << endl; + cerr << "confirmed overlap colinear with read overlap of " << read_overlap << ", graph overlap " << graph_overlap << endl; #endif - confirmed_overlaps.emplace_back(overlap, start, overlap_candidate.first, 0); - } - else if (overlap_node.begin < start_node.begin && overlap_node.end < start_node.end) { + confirmed_overlaps[start][overlap_candidate.first] = tuple(read_overlap, graph_overlap, 0); + } + else if (overlap_node.begin < start_node.begin && overlap_node.end < start_node.end) { #ifdef debug_multipath_alignment - cerr << "confirmed overlap colinear with longer read overlap of " << overlap_node.end - start_node.begin << endl; + cerr << "confirmed overlap colinear with longer read overlap of " << overlap_node.end - start_node.begin << endl; #endif - // there is still an even longer read overlap we need to remove - size_t read_overlap = overlap_node.end - start_node.begin; - confirmed_overlaps.emplace_back(read_overlap, start, overlap_candidate.first, read_overlap - overlap); - } - else { + // there is still an even longer read overlap we need to remove + size_t extended_read_overlap = overlap_node.end - start_node.begin; + size_t extended_graph_overlap = corresponding_from_length(path, extended_read_overlap, false); + confirmed_overlaps[start][overlap_candidate.first] = tuple(extended_read_overlap, + extended_graph_overlap, + extended_graph_overlap - graph_overlap); + } + else { #ifdef debug_multipath_alignment - cerr << "not colinear even with overlap, adding to non-colinear shell at distance " << overlap_candidate.second << endl; + cerr << "not colinear even with overlap, adding to non-colinear shell at distance " << overlap_candidate.second << endl; #endif - // the overlapping node is still not reachable so it is in the noncolinear shell of this node - noncolinear_shell[overlap_candidate.first] = (start_node.end - start_node.begin) - overlap_candidate.second; + // the overlapping node is still not reachable so it is in the noncolinear shell of this node + if (start_node_from_length == 0) { + start_node_from_length = path_from_length(start_node.path); + } + noncolinear_shell[overlap_candidate.first] = start_node_from_length - overlap_candidate.second; + } + } + + start_idx++; + size_t new_start_offset = (start_idx == starts.size() ? 
numeric_limits::max() : start_offset(starts[start_idx])); + if (new_start_offset != curr_start_offset) { + colocated_starts.clear(); + } + if (start_idx < starts.size()) { + colocated_starts.emplace_back(starts[start_idx]); } + curr_start_offset = new_start_offset; } } } #ifdef debug_multipath_alignment - cerr << "breaking nodes at overlap edges (" << confirmed_overlaps.size() << " times)" << endl; + cerr << "breaking nodes at overlap edges" << endl; #endif // now we've found all overlap edges, so we can add them into the graph in an order such that they don't @@ -2280,30 +3644,55 @@ namespace vg { // about overlaps coming in from both directions) // sort in descending order of overlap length and group by the node that is being cut among overlaps of same length - std::sort(confirmed_overlaps.begin(), confirmed_overlaps.end(), - std::greater>()); + // tuples of (read overlap, graph overlap, index onto, index from, distance) + vector> ordered_overlaps; + for (const auto& path_overlaps : confirmed_overlaps) { + for (const auto& overlap_record : path_overlaps.second) { + ordered_overlaps.emplace_back(get<0>(overlap_record.second), + get<1>(overlap_record.second), + path_overlaps.first, + overlap_record.first, + get<2>(overlap_record.second)); + } + } + // because both from and to lengths are monotonic, we should never get into a situations where the + // pair (read overlap, graph overlap) is incomparable, so this partial order is actually a total order + // barring equal pairs + sort(ordered_overlaps.begin(), ordered_overlaps.end(), greater>()); // keep track of whether another node is holding the suffix of one of the original nodes because of a split unordered_map node_with_suffix; // split up each node with an overlap edge onto it - auto iter = confirmed_overlaps.begin(); - while (iter != confirmed_overlaps.end()) { + auto iter = ordered_overlaps.begin(); + while (iter != ordered_overlaps.end()) { // find the range of overlaps that want to cut this node at the same place auto iter_range_end = iter; - while (get<0>(*iter_range_end) == get<0>(*iter) && get<1>(*iter_range_end) == get<1>(*iter)) { + while (get<0>(*iter_range_end) == get<0>(*iter) && get<1>(*iter_range_end) == get<1>(*iter) + && get<2>(*iter_range_end) == get<2>(*iter)) { iter_range_end++; - if (iter_range_end == confirmed_overlaps.end()) { + if (iter_range_end == ordered_overlaps.end()) { break; } } #ifdef debug_multipath_alignment - cerr << "performing an overlap split onto " << get<1>(*iter) << " of length " << get<0>(*iter) << endl; + cerr << "performing an overlap split onto " << get<2>(*iter) << " from " << get<3>(*iter); + auto it = iter; + ++it; + for (; it != iter_range_end; ++it) { + cerr << ", " << get<3>(*it); + } + cerr << " of read length " << get<0>(*iter) << " and graph length " << get<1>(*iter); + if (path_node_provenance) { + cerr << ", provenances " << path_node_provenance->at(get<2>(*iter)) << " and " << path_node_provenance->at(get<3>(*iter)); + } + cerr << endl; + #endif - PathNode* onto_node = &path_nodes[get<1>(*iter)]; + PathNode* onto_node = &path_nodes[get<2>(*iter)]; #ifdef debug_multipath_alignment cerr << "before splitting:" << endl; @@ -2311,45 +3700,47 @@ namespace vg { for (auto node_iter = onto_node->begin; node_iter != onto_node->end; node_iter++) { cerr << *node_iter; } - cerr << endl << "\t" << pb2json(onto_node->path) << endl; + cerr << endl << "\t" << debug_string(onto_node->path) << endl; #endif // TODO: there should be a way to do this in a single pass over mappings and edits // rather 
than traversing the whole mapping twice // store the full path and remove it from the node - Path full_path = std::move(onto_node->path); - onto_node->path.Clear(); + path_t full_path = std::move(onto_node->path); + onto_node->path.clear_mapping(); // keep track of how the read sequence should get split up size_t prefix_to_length = 0; // add mappings from the path until reaching the overlap point - int64_t remaining = get<0>(*iter); + int64_t to_remaining = get<0>(*iter); + int64_t from_remaining = get<1>(*iter); int64_t mapping_idx = 0; - int64_t mapping_len = mapping_from_length(full_path.mapping(mapping_idx)); - while (remaining >= mapping_len) { + int64_t mapping_from_len = mapping_from_length(full_path.mapping(mapping_idx)); + int64_t mapping_to_len = mapping_to_length(full_path.mapping(mapping_idx)); + while (to_remaining >= mapping_to_len && from_remaining >= mapping_from_len) { *onto_node->path.add_mapping() = full_path.mapping(mapping_idx); - prefix_to_length += mapping_to_length(full_path.mapping(mapping_idx)); - remaining -= mapping_len; - + prefix_to_length += mapping_to_len; + to_remaining -= mapping_to_len; + from_remaining -= mapping_from_len; mapping_idx++; if (mapping_idx == full_path.mapping_size()) { break; } - mapping_len = mapping_from_length(full_path.mapping(mapping_idx)); + mapping_from_len = mapping_from_length(full_path.mapping(mapping_idx)); + mapping_to_len = mapping_to_length(full_path.mapping(mapping_idx)); } - if (mapping_idx == full_path.mapping_size() && !remaining) { + if (mapping_idx == full_path.mapping_size() && !to_remaining && !from_remaining) { // TODO: isn't this case covered by taking the entire range of splits at the same place? // the overlap covered the path, so connect it to the onto node's successors // rather than splitting it into two nodes - while (iter != iter_range_end) { for (const pair edge : onto_node->edges) { - path_nodes[get<2>(*iter)].edges.emplace_back(edge.first, edge.second + get<3>(*iter)); + path_nodes.at(get<3>(*iter)).edges.emplace_back(edge.first, edge.second + get<4>(*iter)); } iter++; } @@ -2361,10 +3752,13 @@ namespace vg { // make a new node to hold the suffix of the path size_t suffix_idx = path_nodes.size(); path_nodes.emplace_back(); + if (path_node_provenance) { + path_node_provenance->emplace_back((*path_node_provenance)[get<2>(*iter)]); + } PathNode& suffix_node = path_nodes.back(); // get the pointer from the onto node back in case the vector reallocated - onto_node = &path_nodes[get<1>(*iter)]; + onto_node = &path_nodes.at(get<2>(*iter)); // transfer the outgoing edges onto the new node suffix_node.edges = std::move(onto_node->edges); @@ -2374,57 +3768,59 @@ namespace vg { onto_node->edges.emplace_back(suffix_idx, 0); // keep track of the relationship of suffix nodes to original nodes - if (!node_with_suffix.count(get<1>(*iter))) { + if (!node_with_suffix.count(get<2>(*iter))) { // since we take longest overlaps first, only the first split onto a node will change // which node contains the suffix of the original node - node_with_suffix[get<1>(*iter)] = suffix_idx; + node_with_suffix[get<2>(*iter)] = suffix_idx; } - if (remaining) { + if (to_remaining || from_remaining) { // the overlap point is in the middle of a node, need to split a mapping - const Mapping& split_mapping = full_path.mapping(mapping_idx); + const path_mapping_t& split_mapping = full_path.mapping(mapping_idx); // add the prefix of the mapping to the original node - Mapping* prefix_split = onto_node->path.add_mapping(); + path_mapping_t* 
prefix_split = onto_node->path.add_mapping(); prefix_split->mutable_position()->set_node_id(split_mapping.position().node_id()); prefix_split->mutable_position()->set_offset(split_mapping.position().offset()); // add the suffix of the mapping to the new node - Mapping* suffix_split = suffix_node.path.add_mapping(); + path_mapping_t* suffix_split = suffix_node.path.add_mapping(); suffix_split->mutable_position()->set_node_id(split_mapping.position().node_id()); - suffix_split->mutable_position()->set_offset(split_mapping.position().offset() + remaining); - + suffix_split->mutable_position()->set_offset(split_mapping.position().offset() + from_remaining); + // add the edits up to the point where the split occurs size_t edit_idx = 0; - int64_t mapping_remaining = remaining; - for (; mapping_remaining >= split_mapping.edit(edit_idx).from_length() && edit_idx < split_mapping.edit_size(); edit_idx++) { - mapping_remaining -= split_mapping.edit(edit_idx).from_length(); - prefix_to_length += split_mapping.edit(edit_idx).to_length(); + int64_t mapping_to_remaining = to_remaining; + int64_t mapping_from_remaining = from_remaining; + for (; edit_idx < split_mapping.edit_size() + && mapping_from_remaining >= split_mapping.edit(edit_idx).from_length() + && mapping_to_remaining >= split_mapping.edit(edit_idx).to_length(); edit_idx++) { + const edit_t& split_edit = split_mapping.edit(edit_idx); + mapping_from_remaining -= split_edit.from_length(); + mapping_to_remaining -= split_edit.to_length(); + prefix_to_length += split_edit.to_length(); + *prefix_split->add_edit() = split_edit; } // do we need to split in the middle of an edit? - if (mapping_remaining) { - - const Edit& split_edit = split_mapping.edit(edit_idx); + if (mapping_from_remaining || mapping_to_remaining) { + const edit_t& split_edit = split_mapping.edit(edit_idx); // add an edit for either side of the split - Edit* prefix_split_edit = prefix_split->add_edit(); - prefix_split_edit->set_from_length(mapping_remaining); + edit_t* prefix_split_edit = prefix_split->add_edit(); + prefix_split_edit->set_from_length(mapping_from_remaining); + prefix_split_edit->set_to_length(mapping_to_remaining); - Edit* suffix_split_edit = suffix_split->add_edit(); - suffix_split_edit->set_from_length(split_edit.from_length() - mapping_remaining); + prefix_to_length += prefix_split_edit->to_length(); - if (split_edit.to_length()) { - prefix_split_edit->set_to_length(mapping_remaining); - suffix_split_edit->set_to_length(split_edit.to_length() - mapping_remaining); - prefix_to_length += prefix_split_edit->to_length(); - - if (!split_edit.sequence().empty()) { - suffix_split_edit->set_sequence(split_edit.sequence().substr(0, mapping_remaining)); - prefix_split_edit->set_sequence(split_edit.sequence().substr(mapping_remaining, - split_edit.sequence().size() -mapping_remaining)); - } + edit_t* suffix_split_edit = suffix_split->add_edit(); + suffix_split_edit->set_from_length(split_edit.from_length() - mapping_from_remaining); + suffix_split_edit->set_to_length(split_edit.to_length() - mapping_to_remaining); + + if (!split_edit.sequence().empty()) { + prefix_split_edit->set_sequence(split_edit.sequence().substr(0, mapping_to_remaining)); + suffix_split_edit->set_sequence(split_edit.sequence().substr(mapping_to_remaining, string::npos)); } edit_idx++; @@ -2454,29 +3850,29 @@ namespace vg { for (auto node_iter = onto_node->begin; node_iter != onto_node->end; node_iter++) { cerr << *node_iter; } - cerr << endl << "\t" << pb2json(onto_node->path) << endl; + cerr << endl 
<< "\t" << debug_string(onto_node->path) << endl; cerr << "suffix node:" << endl << "\t"; for (auto node_iter = suffix_node.begin; node_iter != suffix_node.end; node_iter++) { cerr << *node_iter; } - cerr << endl << "\t" << pb2json(suffix_node.path) << endl; + cerr << endl << "\t" << debug_string(suffix_node.path) << endl; #endif while (iter != iter_range_end) { // index of the node that contains the end of the original node we recorded the overlap from - size_t splitting_idx = node_with_suffix.count(get<2>(*iter)) ? node_with_suffix[get<2>(*iter)] : get<2>(*iter); + size_t splitting_idx = node_with_suffix.count(get<3>(*iter)) ? node_with_suffix[get<3>(*iter)] : get<3>(*iter); #ifdef debug_multipath_alignment - cerr << "adding an overlap edge from node " << splitting_idx << " at distance " << get<3>(*iter) << endl; + cerr << "adding an overlap edge from node " << splitting_idx << " at distance " << get<4>(*iter) << endl; cerr << "\t"; - for (auto node_iter = path_nodes[splitting_idx].begin; node_iter != path_nodes[splitting_idx].end; node_iter++) { + for (auto node_iter = path_nodes.at(splitting_idx).begin; node_iter != path_nodes.at(splitting_idx).end; node_iter++) { cerr << *node_iter; } cerr << endl; #endif // get the next node that overlaps onto the other node at this index and add the overlap edge - path_nodes[splitting_idx].edges.emplace_back(suffix_idx, get<3>(*iter)); + path_nodes.at(splitting_idx).edges.emplace_back(suffix_idx, get<4>(*iter)); iter++; } @@ -2489,19 +3885,20 @@ namespace vg { #ifdef debug_multipath_alignment cerr << "final graph after adding reachability edges:" << endl; for (size_t i = 0; i < path_nodes.size(); i++) { - PathNode& path_node = path_nodes[i]; - cerr << i << " " << pb2json(path_node.path) << " "; - for (auto iter = path_node.begin; iter != path_node.end; iter++) { - cerr << *iter; + PathNode& path_node = path_nodes.at(i); + cerr << i; + if (path_node_provenance) { + cerr << " (hit " << path_node_provenance->at(i) << ")"; } - cerr << endl; + cerr << " " << debug_string(path_node.path) << " " << string(path_node.begin, path_node.end) << endl; cerr << "\t"; for (auto edge : path_node.edges) { - cerr << "(to:" << edge.first << ", graph dist:" << edge.second << ", read dist: " << (path_nodes[edge.first].begin - path_node.end) << ") "; + cerr << "(to:" << edge.first << ", graph dist:" << edge.second << ", read dist: " << (path_nodes.at(edge.first).begin - path_node.end) << ") "; } cerr << endl; } #endif + } void MultipathAlignmentGraph::clear_reachability_edges() { @@ -2520,6 +3917,17 @@ namespace vg { } + size_t MultipathAlignmentGraph::count_reachability_edges() const { + if (!has_reachability_edges) { + return 0; + } + size_t count = 0; + for (auto& node : path_nodes) { + count += node.edges.size(); + } + return count; + } + // Kahn's algorithm void MultipathAlignmentGraph::topological_sort(vector& order_out) { // Can only sort if edges are present. 
@@ -2547,7 +3955,7 @@ namespace vg { size_t src = source_queue.front(); source_queue.pop_front(); - for (const pair& edge : path_nodes[src].edges) { + for (const pair& edge : path_nodes.at(src).edges) { in_degree[edge.first]--; if (in_degree[edge.first] == 0) { source_queue.push_back(edge.first); @@ -2562,14 +3970,11 @@ namespace vg { void MultipathAlignmentGraph::reorder_adjacency_lists(const vector& order) { vector>> reverse_graph(path_nodes.size()); for (size_t i = 0; i < path_nodes.size(); i++) { - for (const pair& edge : path_nodes[i].edges) { + auto& edges = path_nodes[i].edges; + for (const pair& edge : edges) { reverse_graph[edge.first].emplace_back(i, edge.second); } - } - for (PathNode& path_node : path_nodes) { - size_t out_degree = path_node.edges.size(); - path_node.edges.clear(); - path_node.edges.reserve(out_degree); + edges.clear(); } for (size_t i : order) { for (const pair& edge : reverse_graph[i]) { @@ -2582,9 +3987,10 @@ namespace vg { // We can only remove edges when the edges are present assert(has_reachability_edges); - // algorithm assumes edges are also sorted in topological order, which guarantees that we will - // traverse a path that reveals an edge as transitive before actually traversing the transitive edge - reorder_adjacency_lists(topological_order); + // records of (incoming index, length of edge) indicating the index of the nearest node + // that has an edge of exactly the expected length to this node, which is a strong sign + // that the edge is a correct connection that we want to keep + vector> shortest_exact_src; for (size_t i : topological_order) { vector>& edges = path_nodes[i].edges; @@ -2595,14 +4001,49 @@ namespace vg { continue; } + // we don't do these linear pre-compute steps unless there is actual ambiguity in the MEMs + // in order to save compute in the typical case that there is none + // TODO: not very readable + if (shortest_exact_src.empty()) { + + // algorithm assumes edges are also sorted in topological order, which guarantees that we will + // traverse a path that reveals an edge as transitive before actually traversing the transitive edge + reorder_adjacency_lists(topological_order); + + // compute the shortest incoming edge that achieves exactly the expected distance for each node + shortest_exact_src.resize(path_nodes.size(), make_pair(numeric_limits::max(), + numeric_limits::max())); + + for (size_t i = 0; i < path_nodes.size(); ++i) { + auto& path_node = path_nodes[i]; + for (auto& edge : path_node.edges) { + if (edge.second == (path_nodes[edge.first].begin - path_node.end)) { + auto& rec = shortest_exact_src[edge.first]; + if (edge.second < rec.second) { + // this is the shortest exact edge we've seen to this node + rec = make_pair(i, edge.second); + } + } + } + } + } + vector keep(edges.size(), true); unordered_set traversed; for (size_t j = 0; j < edges.size(); j++) { const pair& edge = edges[j]; - if (traversed.count(edge.first)) { + if (traversed.count(edge.first) && edge.second != 0 && + path_nodes[i].end != path_nodes[edge.first].begin) { // we can reach the target of this edge by another path, so it is transitive - keep[j] = false; + // and the path nodes don't abut on either the read or graph + + // we also spare the shortest edge with exact distance from being removed, since + // it is probably correct even if it is transitive (which sometimes happens across + // incorrect splice junctions or deletions) + if (i != shortest_exact_src[edge.first].first) { + keep[j] = false; + } continue; } @@ -2612,7 +4053,7 @@ namespace vg {
while (!stack.empty()) { size_t idx = stack.back(); stack.pop_back(); - for (const pair& edge_from : path_nodes[idx].edges) { + for (const pair& edge_from : path_nodes.at(idx).edges) { if (!traversed.count(edge_from.first)) { stack.push_back(edge_from.first); traversed.insert(edge_from.first); @@ -2639,12 +4080,12 @@ namespace vg { #ifdef debug_multipath_alignment cerr << "removed transitive edges, topology is:" << endl; for (size_t i = 0; i < path_nodes.size(); i++) { - cerr << "node " << i << ", " << pb2json(path_nodes[i].path.mapping(0).position()) << " "; - for (auto iter = path_nodes[i].begin; iter != path_nodes[i].end; iter++) { + cerr << "node " << i << ", " << debug_string(path_nodes.at(i).path.mapping(0).position()) << " "; + for (auto iter = path_nodes.at(i).begin; iter != path_nodes.at(i).end; iter++) { cerr << *iter; } cerr << endl; - for (pair edge : path_nodes[i].edges) { + for (pair edge : path_nodes.at(i).edges) { cerr << "\tto " << edge.first << ", dist " << edge.second << endl; } } @@ -2655,8 +4096,9 @@ namespace vg { } - void MultipathAlignmentGraph::prune_to_high_scoring_paths(const Alignment& alignment, const BaseAligner* aligner, - double max_suboptimal_score_ratio, const vector& topological_order) { + void MultipathAlignmentGraph::prune_to_high_scoring_paths(const Alignment& alignment, const GSSWAligner* aligner, + double max_suboptimal_score_ratio, const vector& topological_order, + vector& path_node_provenance) { // Can only prune when edges exist. assert(has_reachability_edges); @@ -2668,18 +4110,17 @@ namespace vg { unordered_map, int32_t> edge_weights; vector node_weights(path_nodes.size()); - - // TODO: is the lower bound too strict? - + // compute the weight of edges and node matches for (size_t i = 0; i < path_nodes.size(); i++) { - PathNode& from_node = path_nodes[i]; - node_weights[i] = aligner->match * (from_node.end - from_node.begin) - + aligner->full_length_bonus * ((from_node.begin == alignment.sequence().begin()) - + (from_node.end == alignment.sequence().end())); - + PathNode& from_node = path_nodes.at(i); + node_weights[i] = (aligner->score_exact_match(from_node.begin, from_node.end, + alignment.quality().begin() + (from_node.begin - alignment.sequence().begin())) + + (from_node.begin == alignment.sequence().begin() ? aligner->score_full_length_bonus(true, alignment) : 0) + + (from_node.end == alignment.sequence().end() ? 
aligner->score_full_length_bonus(false, alignment) : 0)); + for (const pair& edge : from_node.edges) { - PathNode& to_node = path_nodes[edge.first]; + PathNode& to_node = path_nodes.at(edge.first); int64_t graph_dist = edge.second; int64_t read_dist = to_node.begin - from_node.end; @@ -2688,15 +4129,15 @@ namespace vg { // the read length in between the MEMs is longer than the distance, suggesting a read insert // and potentially another mismatch on the other end int64_t gap_length = read_dist - graph_dist; - edge_weights[make_pair(i, edge.first)] = -(gap_length - 1) * aligner->gap_extension - aligner->gap_open - - (graph_dist > 0) * aligner->mismatch; + edge_weights[make_pair(i, edge.first)] = (-(gap_length - 1) * aligner->gap_extension - aligner->gap_open + - (graph_dist > 0) * aligner->mismatch); } else if (read_dist < graph_dist) { // the read length in between the MEMs is shorter than the distance, suggesting a read deletion // and potentially another mismatch on the other end int64_t gap_length = graph_dist - read_dist; - edge_weights[make_pair(i, edge.first)] = -(gap_length - 1) * aligner->gap_extension - aligner->gap_open - - (read_dist > 0) * aligner->mismatch; + edge_weights[make_pair(i, edge.first)] = (-(gap_length - 1) * aligner->gap_extension - aligner->gap_open + - (read_dist > 0) * aligner->mismatch); } else { // the read length in between the MEMs is the same as the distance, suggesting a pure mismatch @@ -2712,7 +4153,7 @@ namespace vg { for (int64_t i = 0; i < topological_order.size(); i++) { size_t idx = topological_order[i]; int32_t from_score = forward_scores[idx]; - for (const pair& edge : path_nodes[idx].edges) { + for (const pair& edge : path_nodes.at(idx).edges) { forward_scores[edge.first] = std::max(forward_scores[edge.first], node_weights[edge.first] + from_score + edge_weights[make_pair(idx, edge.first)]); } @@ -2722,7 +4163,7 @@ namespace vg { for (int64_t i = topological_order.size() - 1; i >= 0; i--) { size_t idx = topological_order[i]; int32_t score_here = node_weights[idx]; - for (const pair& edge : path_nodes[idx].edges) { + for (const pair& edge : path_nodes.at(idx).edges) { backward_scores[idx] = std::max(backward_scores[idx], score_here + backward_scores[edge.first] + edge_weights[make_pair(idx, edge.first)]); } @@ -2731,223 +4172,1218 @@ namespace vg { // compute the minimum score we will require of a node or edge int32_t min_path_score = *std::max_element(forward_scores.begin(), forward_scores.end()) / max_suboptimal_score_ratio; - // use forward-backward to find nodes/edges on some path with a score above the minimum - unordered_set keep_nodes; - unordered_set> keep_edges; + // use forward-backward to find nodes on some path with a score above the minimum vector removed_in_prefix(path_nodes.size() + 1, 0); for (size_t i = 0; i < path_nodes.size(); i++) { - if (forward_scores[i] + backward_scores[i] - node_weights[i] >= min_path_score) { - keep_nodes.insert(i); - for (const pair& edge : path_nodes[i].edges) { - if (forward_scores[i] + backward_scores[edge.first] + edge_weights[make_pair(i, edge.first)] >= min_path_score) { - keep_edges.emplace(i, edge.first); - } - } - removed_in_prefix[i + 1] = removed_in_prefix[i]; - } - else { - removed_in_prefix[i + 1] = removed_in_prefix[i] + 1; - } + removed_in_prefix[i + 1] = (removed_in_prefix[i] + + int(forward_scores[i] + backward_scores[i] - node_weights[i] < min_path_score)); } - // prune down to these nodes and edges - size_t next = 0; - for (size_t i = 0; i < path_nodes.size(); i++) { - if 
(keep_nodes.count(i)) { - if (i != next) { - path_nodes[next] = std::move(path_nodes[i]); - } - vector>& edges = path_nodes[next].edges; - - size_t new_end = edges.size(); - for (size_t j = 0; j < new_end;) { - pair& edge = edges[j]; - if (!keep_edges.count(make_pair(i, edge.first))) { - new_end--; - edge = edges[new_end]; + // remove any nodes that failed to meet the threshold + for (size_t i = 0; i < path_nodes.size(); ++i) { + if (removed_in_prefix[i] == removed_in_prefix[i + 1]) { + // we're keeping this node + auto& path_node = path_nodes[i]; + + // remove its edges that aren't on a sufficiently high-scoring path too + size_t edges_removed = 0; + for (size_t j = 0; j < path_node.edges.size(); ++j) { + auto& edge = path_node.edges[j]; + if (forward_scores[i] + backward_scores[edge.first] + edge_weights[make_pair(i, edge.first)] < min_path_score) { + ++edges_removed; } else { - edge.first -= removed_in_prefix[edge.first]; - j++; + path_node.edges[j - edges_removed] = make_pair(edge.first - removed_in_prefix[edge.first], edge.second); } } - edges.resize(new_end); - - next++; + path_node.edges.resize(path_node.edges.size() - edges_removed); + if (removed_in_prefix[i]) { + path_nodes[i - removed_in_prefix[i]] = move(path_node); + path_node_provenance[i - removed_in_prefix[i]] = path_node_provenance[i]; + } } } - path_nodes.resize(next); + + path_nodes.resize(path_nodes.size() - removed_in_prefix.back()); + path_node_provenance.resize(path_nodes.size()); #ifdef debug_multipath_alignment cerr << "pruned to high scoring paths, topology is:" << endl; for (size_t i = 0; i < path_nodes.size(); i++) { - cerr << "node " << i << ", " << pb2json(path_nodes[i].path.mapping(0).position()) << " "; - for (auto iter = path_nodes[i].begin; iter != path_nodes[i].end; iter++) { + cerr << "node " << i << " (hit " << path_node_provenance[i] << "), " << debug_string(path_nodes.at(i).path.mapping(0).position()) << " "; + for (auto iter = path_nodes.at(i).begin; iter != path_nodes.at(i).end; iter++) { cerr << *iter; } cerr << endl; - for (pair edge : path_nodes[i].edges) { + for (pair edge : path_nodes.at(i).edges) { cerr << "\tto " << edge.first << ", dist " << edge.second << endl; } } #endif } - void MultipathAlignmentGraph::align(const Alignment& alignment, VG& align_graph, BaseAligner* aligner, bool score_anchors_as_matches, - size_t max_alt_alns, bool dynamic_alt_alns, size_t band_padding, MultipathAlignment& multipath_aln_out) { +void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGraph& align_graph, const GSSWAligner* aligner, + bool score_anchors_as_matches, size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, + double pessimistic_tail_gap_multiplier, bool simplify_topologies, size_t unmergeable_len, + size_t band_padding, multipath_alignment_t& multipath_aln_out, SnarlManager* cutting_snarls, + SnarlDistanceIndex* dist_index, const function(id_t)>* project, + bool allow_negative_scores) { // don't dynamically choose band padding, shim constant value into a function type function constant_padding = [&](const Alignment& seq, const HandleGraph& graph) { return band_padding; }; - align(alignment, align_graph, aligner, score_anchors_as_matches, max_alt_alns, dynamic_alt_alns, constant_padding, multipath_aln_out); + align(alignment, + align_graph, + aligner, + score_anchors_as_matches, + max_alt_alns, + dynamic_alt_alns, + max_gap, + pessimistic_tail_gap_multiplier, + simplify_topologies, + unmergeable_len, + constant_padding, + multipath_aln_out, + cutting_snarls, + dist_index, 
+ project, + allow_negative_scores); } - - void MultipathAlignmentGraph::align(const Alignment& alignment, VG& align_graph, BaseAligner* aligner, bool score_anchors_as_matches, - size_t max_alt_alns, bool dynamic_alt_alns, - function band_padding_function, - MultipathAlignment& multipath_aln_out) { + + void MultipathAlignmentGraph::deduplicate_alt_alns(vector>& alt_alns, + bool leftward, bool rightward) { - // Can only align if edges are present. - assert(has_reachability_edges); + // we use stable sort to keep the original score-ordering among alignments + // that take the same path, which is descending by score + stable_sort(alt_alns.begin(), alt_alns.end(), + [&](const pair& aln_1, const pair& aln_2) { + const auto& path_1 = aln_1.first, path_2 = aln_2.first; + int64_t i, j, incr; + if (leftward) { + i = path_1.mapping_size() - 1; + j = path_2.mapping_size() - 1; + incr = -1; + } + else { + i = 0; + j = 0; + incr = 1; + } + bool is_less = false; + for (; i >= 0 && j >= 0 && i < path_1.mapping_size() && j < path_2.mapping_size(); i += incr, j += incr) { + const auto& pos_1 = path_1.mapping(i).position(), pos_2 = path_2.mapping(j).position(); + if (pos_1.node_id() < pos_2.node_id() || + (pos_1.node_id() == pos_2.node_id() && pos_1.is_reverse() < pos_2.is_reverse())) { + is_less = true; + break; + } + else if (pos_1.node_id() > pos_2.node_id() || + (pos_1.node_id() == pos_2.node_id() && pos_1.is_reverse() > pos_2.is_reverse())) { + break; + } + } + // supersequences earlier in the case of paths of different lengths + return (is_less || ((i < path_1.mapping_size() && i >= 0) && (j == path_2.mapping_size() || j < 0))); + }); - // transfer over data from alignment - transfer_read_metadata(alignment, multipath_aln_out); + // move alignments that have the same path to the end of the vector, keeping the + // first one (which has the highest score) + auto new_end = unique(alt_alns.begin(), alt_alns.end(), + [&](const pair& aln_1, const pair& aln_2) { + const auto& path_1 = aln_1.first, path_2 = aln_2.first; + int64_t i, j, incr; + if (leftward) { + i = path_1.mapping_size() - 1; + j = path_2.mapping_size() - 1; + incr = -1; + } + else { + i = 0; + j = 0; + incr = 1; + } + // if this is a tail alignment, we allow paths of different lengths to be "equal" + // if one is a prefix of the other and lower-scoring + bool is_equal = (path_1.mapping_size() == path_2.mapping_size() || leftward || rightward); + for (; i >= 0 && j >= 0 && i < path_1.mapping_size() && j < path_2.mapping_size() && is_equal; i += incr, j += incr) { + const auto& pos_1 = path_1.mapping(i).position(), pos_2 = path_2.mapping(j).position(); + is_equal = (pos_1.node_id() == pos_2.node_id() && pos_1.is_reverse() == pos_2.is_reverse()); + } + // TODO: there has to be a more succinct way to check this condition + return (is_equal && + (path_1.mapping_size() == path_2.mapping_size() || + (path_1.mapping_size() > path_2.mapping_size() && aln_1.second > aln_2.second) || + (path_1.mapping_size() < path_2.mapping_size() && aln_1.second < aln_2.second))); + }); + // remove the duplicates at the end + alt_alns.resize(new_end - alt_alns.begin()); + } + + pair MultipathAlignmentGraph::zip_alignments(vector>& alt_alns, bool from_left, + const Alignment& alignment, const HandleGraph& align_graph, + string::const_iterator begin, const GSSWAligner* aligner) { #ifdef debug_multipath_alignment - cerr << "transferred over read information" << endl; + cerr << "attempting to zip alignments from " << (from_left ? 
"left" : "right") << endl; + for (auto& alt_aln : alt_alns) { + cerr << debug_string(alt_aln.first) << endl; + } #endif - // add a subpath for each of the exact match nodes - if (score_anchors_as_matches) { - for (int64_t j = 0; j < path_nodes.size(); j++) { - PathNode& path_node = path_nodes[j]; - Subpath* subpath = multipath_aln_out.add_subpath(); - *subpath->mutable_path() = path_node.path; - int32_t match_score = aligner->score_exact_match(path_node.begin, path_node.end, - alignment.quality().begin() + (path_node.begin - alignment.sequence().begin())); - - subpath->set_score(match_score + aligner->full_length_bonus * - ((path_node.begin == alignment.sequence().begin()) + - (path_node.end == alignment.sequence().end()))); - } + pair return_val; + + int64_t i, j, incr; + auto& path_1 = alt_alns.front().first; + if (from_left) { + incr = 1; + i = 0; + j = 0; } else { - for (size_t j = 0; j < path_nodes.size(); j++) { - PathNode& path_node = path_nodes[j]; - Subpath* subpath = multipath_aln_out.add_subpath(); - *subpath->mutable_path() = path_node.path; - - subpath->set_score(aligner->score_partial_alignment(alignment, align_graph, path_node.path, path_node.begin)); + incr = -1; + i = path_1.mapping_size() - 1; + if (i >= 0) { + j = path_1.mapping(i).edit_size() - 1; + } + else { + j = 0; } } + // past-the-last index of the longest match that concludes in a match/mismatvch + int64_t last_aligned_i, last_aligned_j; -#ifdef debug_multipath_alignment - cerr << "doing DP between MEMs" << endl; -#endif - - // perform alignment in the intervening sections - for (int64_t j = 0; j < path_nodes.size(); j++) { -#ifdef debug_multipath_alignment - cerr << "checking for intervening alignments from match node " << j << " with path " << pb2json(path_nodes[j].path) << " and sequence "; - for (auto iter = path_nodes[j].begin; iter != path_nodes[j].end; iter++) { - cerr << *iter; + // walk the first path to check for full prefix/suffix matches + bool found_mismatch = false, in_indel = false; + while (i >= 0 && i < path_1.mapping_size()) { + // check to make sure the positions of all of the alts match at this mapping + const auto& mapping_1 = path_1.mapping(i); + // start from the appropriate side of the mapping and check for matches of indels + if (from_left) { + j = 0; } - cerr << endl; -#endif - - PathNode& src_path_node = path_nodes[j]; - Subpath* src_subpath = multipath_aln_out.mutable_subpath(j); - - const Path& path = src_subpath->path(); - const Mapping& final_mapping = path.mapping(path.mapping_size() - 1); - const Position& final_mapping_position = final_mapping.position(); - // make a pos_t that points to the final base in the match - pos_t src_pos = make_pos_t(final_mapping_position.node_id(), - final_mapping_position.is_reverse(), - final_mapping_position.offset() + mapping_from_length(final_mapping)); - - // the longest gap that could be detected at this position in the read - size_t src_max_gap = aligner->longest_detectable_gap(alignment, src_path_node.end); - - unordered_set> edges_for_removal; - - for (const pair& edge : src_path_node.edges) { - PathNode& dest_path_node = path_nodes[edge.first]; - pos_t dest_pos = make_pos_t(multipath_aln_out.subpath(edge.first).path().mapping(0).position()); - -#ifdef debug_multipath_alignment - cerr << "forming intervening alignment for edge to node " << edge.first << endl; -#endif - - size_t intervening_length = dest_path_node.begin - src_path_node.end; - size_t max_dist = intervening_length + std::min(src_max_gap, 
aligner->longest_detectable_gap(alignment, dest_path_node.begin)); - -#ifdef debug_multipath_alignment - cerr << "read dist: " << intervening_length << ", source max gap: " << src_max_gap << ", dest max gap " << aligner->longest_detectable_gap(alignment, dest_path_node.begin) << endl; -#endif - - // extract the graph between the matches - VG connecting_graph; - unordered_map connect_trans = algorithms::extract_connecting_graph(&align_graph, // DAG with split strands - &connecting_graph, // graph to extract into - max_dist, // longest distance necessary - src_pos, // end of earlier match - dest_pos, // beginning of later match - false, // do not bother finding all cycles (it's a DAG) - true, // only include nodes on connecting paths - true); // enforce max distance strictly - - - if (connecting_graph.node_size() == 0) { - // the MEMs weren't connectable with a positive score after all, mark the edge for removal - edges_for_removal.insert(edge); - continue; + else { + j = mapping_1.edit_size() - 1; + } + const auto& pos_1 = mapping_1.position(); + auto offset_1 = from_left ? pos_1.offset() : pos_1.offset() + mapping_from_length(mapping_1); + for (size_t k = 1; k < alt_alns.size(); ++k) { + //cerr << "checking for matching position on aln " << k << endl; + const auto& path_2 = alt_alns[k].first; + int64_t i2; + if (from_left) { + i2 = i; } - - size_t num_alt_alns = dynamic_alt_alns ? min(max_alt_alns, algorithms::count_walks(&connecting_graph)) : - max_alt_alns; - - bool added_direct_connection = false; - // TODO a better way of choosing the number of alternate alignments - vector alt_alignments; - if (num_alt_alns > 0) { - - // transfer the substring between the matches to a new alignment - Alignment intervening_sequence; - intervening_sequence.set_sequence(alignment.sequence().substr(src_path_node.end - alignment.sequence().begin(), - dest_path_node.begin - src_path_node.end)); - -#ifdef debug_multipath_alignment - cerr << "making " << num_alt_alns << " alignments of sequence " << intervening_sequence.sequence() << " to connecting graph: " << pb2json(connecting_graph.graph) << endl; -#endif - - if (!alignment.quality().empty()) { - intervening_sequence.set_quality(alignment.quality().substr(src_path_node.end - alignment.sequence().begin(), - dest_path_node.begin - src_path_node.end)); - } - - aligner->align_global_banded_multi(intervening_sequence, alt_alignments, connecting_graph.graph, num_alt_alns, - band_padding_function(intervening_sequence, connecting_graph), true); + else { + i2 = path_2.mapping_size() - path_1.mapping_size() + i; } - - for (Alignment& connecting_alignment : alt_alignments) { -#ifdef debug_multipath_alignment - cerr << "translating connecting alignment: " << pb2json(connecting_alignment) << endl; -#endif - - const Path& aligned_path = connecting_alignment.path(); - const Mapping& first_mapping = aligned_path.mapping(0); - const Mapping& last_mapping = aligned_path.mapping(aligned_path.mapping_size() - 1); + if (i2 < 0 || i2 >= path_2.mapping_size()) { + //cerr << "no corresponding mapping in alt aln " << k << endl; + found_mismatch = true; + break; + } + const auto& mapping_2 = path_2.mapping(i2); + const auto& pos_2 = mapping_2.position(); + if (pos_1.node_id() != pos_2.node_id() + || pos_1.is_reverse() != pos_2.is_reverse() + || (from_left ?
pos_2.offset() : pos_2.offset() + mapping_from_length(mapping_2)) != offset_1) { + //cerr << "positions mismatch" << endl; + found_mismatch = true; + break; + } + } + if (found_mismatch) { + break; + } + while(j >= 0 && j < path_1.mapping(i).edit_size()) { + const auto& edit_1 = mapping_1.edit(j); + //cerr << "checking for matches to edit " << i << " " << j << ": " << debug_string(edit_1) << endl; + if (!in_indel) { + // we've matched up to here and the most recent match wasn't an indel + last_aligned_i = i; + last_aligned_j = j; + } + // check whether all of the other paths match the first path at the next edit + bool found_unmatchable = false; + for (size_t k = 1; k < alt_alns.size() && !found_mismatch; ++k) { + auto& path_2 = alt_alns[k].first; + // find the corresponding mapping index + int64_t i2; + if (from_left) { + i2 = i; + } + else { + i2 = path_2.mapping_size() - path_1.mapping_size() + i; + } + + if (i2 < 0 || i2 >= path_2.mapping_size()) { + //cerr << "no corresponding mapping in alt aln " << k << endl; + found_mismatch = true; + break; + } + const auto& mapping_2 = path_2.mapping(i2); + // find the corresponding edit index + int64_t j2; + if (from_left) { + j2 = j; + } + else { + j2 = mapping_2.edit_size() - mapping_1.edit_size() + j; + } + if (j2 < 0 || j2 >= mapping_2.edit_size()) { + //cerr << "no corresponding edit in alt aln " << k << endl; + found_mismatch = true; + break; + } + const auto& edit_2 = mapping_2.edit(j2); + //cerr << "comparing to alt aln " << k << " edit " << i2 << " " << j2 << ": " << debug_string(edit_2) << endl; + found_mismatch = (edit_1 != edit_2); + if ((from_left && j + 1 == mapping_1.edit_size()) || (!from_left && j == 0)) { + // check if there will be edits in this mapping that we won't hit by using the indexes + // of edits on path 1 + found_unmatchable = found_unmatchable || (mapping_1.edit_size() != mapping_2.edit_size()); + } + } + if (found_mismatch) { + break; + } + else { + j += incr; + in_indel = (edit_1.from_length() == 0 || edit_1.to_length() == 0); + if (found_unmatchable) { + found_mismatch = true; + break; + } + } + } + if (found_mismatch) { + break; + } + else { + i += incr; + if (from_left || i < 0) { + j = 0; + } + else { + j = path_1.mapping(i).edit_size() - 1; + } + } + } + + if (in_indel) { + //cerr << "last match is in an indel" << endl; + // the match concluded in an insertion or deletion, we need to make sure it doesn't + // continue onto the next edit + bool all_nexts_aligned = true; + if (i >= 0 && i < path_1.mapping_size()) { + // we didn't match the entire first path, so we need to actually check + const auto& edit = path_1.mapping(i).edit(j); + all_nexts_aligned = (edit.from_length() != 0 && edit.to_length() != 0); + } + for (size_t k = 1; k < alt_alns.size() && all_nexts_aligned; ++k) { + auto& path_2 = alt_alns[k].first; + int64_t i2 = path_2.mapping_size() - path_1.mapping_size() + i; + if (i2 >= 0 && i2 < path_2.mapping_size()) { + // we didn't run through the full path + const auto& mapping_2 = path_2.mapping(i2); + int64_t j2 = mapping_2.edit_size() - path_1.mapping(i).edit_size() + j; + if (j2 >= 0 && j2 < mapping_2.edit_size()) { + // we didn't run through the full mapping + const auto& edit = mapping_2.edit(j2); + all_nexts_aligned = (edit.from_length() != 0 && edit.to_length() != 0); + } + } + } + if (!all_nexts_aligned) { + // we need to backtrack to the last aligned base + i = last_aligned_i; + j = last_aligned_j; + } + } + + + if ((from_left && (i != 0 || j != 0)) || + (!from_left && (i != 
path_1.mapping_size() - 1 || j != path_1.mapping().back().edit_size() - 1))) { + //cerr << "matched up to " << i << " " << j << endl; + // we matched part of the path across all alternate alignments, we can zip it up + + // delete the corresponding parts from all the other paths + for (int64_t k = 1; k < alt_alns.size(); ++k) { + //cerr << "deleting from aln " << k << endl; + auto& path_2 = alt_alns[k].first; + if (from_left) { + // we have to remove part of the path from the beginning of the existing paths + int64_t num_mappings, num_edits; + if (j == 0) { + if (path_1.mapping(i - 1).edit_size() == path_2.mapping(i - 1).edit_size()) { + // this is the past-the-last on the mapping + num_mappings = i; + num_edits = 0; + } + else { + // the full mapping on path 1 didn't use up the final mapping on path 2 + num_mappings = i - 1; + num_edits = path_1.mapping(i - 1).edit_size(); + } + } + else { + if (i < path_2.mapping_size() && j == path_2.mapping(i).edit_size()) { + // past-the-last beyond the end of this mapping + num_mappings = i + 1; + num_edits = 0; + } + else { + // past-the-last in the middle of this mapping + num_mappings = i; + num_edits = j; + } + } + //cerr << "deleting " << num_mappings << " mappings and " << num_edits << " from the left" << endl; + + if (num_mappings < path_2.mapping_size() && num_edits != 0) { + // we have to remove part of the mapping + auto mapping = path_2.mutable_mapping(i); + int64_t from_len = 0; + for (int64_t l = 0; l < num_edits; ++l) { + from_len += mapping->edit(l).from_length(); + } + mapping->mutable_edit()->erase(mapping->mutable_edit()->begin(), + mapping->mutable_edit()->begin() + num_edits); + mapping->mutable_position()->set_offset(mapping->position().offset() + from_len); + } + // remove any full mappings that are shared + path_2.mutable_mapping()->erase(path_2.mutable_mapping()->begin(), + path_2.mutable_mapping()->begin() + num_mappings); + } + else { + // we remove from the back, more complicated because of needing to translate indexes + + int64_t i2 = path_2.mapping_size() - path_1.mapping_size() + i; + int64_t num_mappings, num_edits; + if (i2 < 0) { + // we delete all of path 2 + num_mappings = path_2.mapping_size(); + num_edits = 0; + } + else if (i < 0) { + // we ran through path 1 but not path 2 + if (path_1.mapping().front().edit_size() == path_2.mapping(i2 + 1).edit_size()) { + // we ran through the full mapping on path 2 + num_mappings = path_1.mapping_size(); + num_edits = 0; + } + else { + // we didn't run through the entire mapping on path 2 + num_mappings = path_1.mapping_size() - 1; + num_edits = path_1.mapping().front().edit_size(); + } + } + else { + // both of the ending mappings can be indexed + const auto& mapping_1 = path_1.mapping(i); + if (j == mapping_1.edit_size() - 1) { + // mismatch at the beginning of a mapping on path 1 + if (path_1.mapping(i + 1).edit_size() == path_2.mapping(i2 + 1).edit_size()) { + // the previous mappings on both paths end at the same edit + num_mappings = path_1.mapping_size() - i - 1; + num_edits = 0; + } + else { + // we didn't run through the entire mapping on path 2 + num_mappings = path_1.mapping_size() - i - 2; + num_edits = path_1.mapping(i + 1).edit_size(); + } + } + else { + // the mismatch happened after at least one edit in this mapping on path 1 + int64_t j2 = path_2.mapping(i2).edit_size() - mapping_1.edit_size() + j; + if (j2 < 0) { + // past-the-last beyond the end of this mapping + num_mappings = path_1.mapping_size() - i; + num_edits = 0; + } + else { + // past-the-last in 
the middle of this mapping + num_mappings = path_1.mapping_size() - i - 1; + num_edits = mapping_1.edit_size() - j - 1; + } + } + } + //cerr << "deleting " << num_mappings << " mappings and " << num_edits << " from the right" << endl; + + // delete the number of mappings and edits we decided + path_2.mutable_mapping()->resize(path_2.mapping_size() - num_mappings); + if (num_edits) { + auto& mapping_2 = path_2.mutable_mapping()->back(); + mapping_2.mutable_edit()->resize(mapping_2.edit_size() - num_edits); + } + } + } + + // figure out how much of the primary to move over to the zipped alignment + int64_t k, num_mappings, num_edits; + if (from_left) { + k = 0; + num_mappings = i; + num_edits = j; + if (i < path_1.mapping_size() && j >= path_1.mapping(i).edit_size()) { + ++num_mappings; + num_edits = 0; + } + } + else { + k = path_1.mapping_size() - 1; + num_mappings = k - i; + num_edits = 0; + if (j < 0) { + ++num_mappings; + } + else if (i >= 0) { + num_edits = path_1.mapping(i).edit_size() - j - 1; + } + } + //cerr << "need to copy " << num_mappings << " full mappings and " << num_edits << " edits" << endl; + // copy to the zipped alignment + for (int64_t m = 0; m < num_mappings; k += incr, ++m) { + //cerr << "copy mapping " << debug_string(*path_1.mutable_mapping(k)) << endl; + *return_val.first.add_mapping() = move(*path_1.mutable_mapping(k)); + } + if (num_edits != 0) { + auto mapping = path_1.mutable_mapping(k); + int64_t mapping_len = mapping_from_length(*mapping); + auto pos = mapping->mutable_position(); + auto copy_mapping = return_val.first.add_mapping(); + auto copy_pos = copy_mapping->mutable_position(); + for (int64_t l = from_left ? 0 : mapping->edit_size() - 1, e = 0; e < num_edits; l += incr, ++e) { + auto edit = mapping->mutable_edit(l); + //cerr << "copy edit " << debug_string(*edit) << endl; + *copy_mapping->add_edit() = move(*edit); + } + if (from_left) { + mapping->mutable_edit()->erase(mapping->mutable_edit()->begin(), + mapping->mutable_edit()->begin() + num_edits); + *copy_pos = *pos; + pos->set_offset(pos->offset() + mapping_from_length(*copy_mapping)); + } + else { + // the edits were added in reverse order, switch them back + reverse(copy_mapping->mutable_edit()->begin(), copy_mapping->mutable_edit()->end()); + + mapping->mutable_edit()->resize(mapping->edit_size() - num_edits); + copy_pos->set_node_id(pos->node_id()); + copy_pos->set_is_reverse(pos->is_reverse()); + copy_pos->set_offset(pos->offset() + (mapping_len - mapping_from_length(*copy_mapping))); + } + } + + string::const_iterator alignment_begin; + if (from_left) { + path_1.mutable_mapping()->erase(path_1.mutable_mapping()->begin(), + path_1.mutable_mapping()->begin() + num_mappings); + alignment_begin = begin; + } + else { + // the mappings were added in reverse order, switch them back + reverse(return_val.first.mutable_mapping()->begin(), return_val.first.mutable_mapping()->end()); + + path_1.mutable_mapping()->resize(path_1.mapping_size() - num_mappings); + alignment_begin = begin + path_to_length(path_1); + } + + // score the zipped alignment + return_val.second = aligner->score_partial_alignment(alignment, align_graph, return_val.first, alignment_begin); + + // remove this part of the score from the alt alignments + for (auto& alt_aln : alt_alns) { + alt_aln.second -= return_val.second; + } +#ifdef debug_multipath_alignment + cerr << "successfully made zipped alignment with score " << return_val.second << endl; + cerr << debug_string(return_val.first) << endl; +#endif + } + + return return_val; + } 
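The zip routine above factors a prefix (or suffix) that is identical across all alternative alignments out into one shared piece, whose score is then subtracted from each alternative; `decompose_alignments` below generalizes this to interior shared segments. As a rough illustration of the prefix case only, here is a minimal, self-contained sketch on plain strings. `zip_common_prefix` is a hypothetical helper, not part of vg; the real routine operates on `path_t` mappings and edits, guards against splitting indels at the boundary, and rescores the pieces.

```cpp
// Toy sketch of the "zip" idea: factor the longest common prefix out of a
// set of alternatives so it is stored (and scored) only once.
// Illustrative only; not the vg implementation.
#include <algorithm>
#include <string>
#include <utility>
#include <vector>

std::pair<std::string, std::vector<std::string>>
zip_common_prefix(std::vector<std::string> alts) {
    std::string shared;
    if (alts.empty()) {
        return {shared, std::move(alts)};
    }
    // walk forward while every alternative agrees on the next character
    for (size_t i = 0; i < alts.front().size(); ++i) {
        char c = alts.front()[i];
        bool all_match = std::all_of(alts.begin(), alts.end(),
                                     [&](const std::string& a) {
                                         return i < a.size() && a[i] == c;
                                     });
        if (!all_match) {
            break;
        }
        shared.push_back(c);
    }
    // remove the shared prefix from each alternative
    for (auto& a : alts) {
        a.erase(0, shared.size());
    }
    return {shared, std::move(alts)};
}
```

For example, with alternatives `{"ACGTA", "ACGG"}` this returns the shared prefix `"ACG"` and remainders `{"TA", "G"}`; the real code does the analogous factoring over graph mappings in either direction.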
+ + pair>, vector>>> + MultipathAlignmentGraph::decompose_alignments(const vector>& alt_alns, + const Alignment& alignment, const HandleGraph& align_graph, + string::const_iterator begin, const GSSWAligner* aligner) { + +#ifdef debug_decompose_algorithm + cerr << "attempting to decompose " << alt_alns.size() << " alignments" << endl; + for (size_t i = 0; i < alt_alns.size(); ++i) { + cerr << i << ": " << debug_string(alt_alns[i].first) << endl; + } +#endif + + // records (seq_begin, seq_end, repeated(node_id, rev, node_begin, node_end)) + // note: because we only keep the optimal non-redundant alignment, we are guaranteed that + // any alignment of the same read/ref interval will be equivalent-scoring and can thus + // be swapped + typedef tuple>> key_t; + + // hash function that can handle the vector + struct key_hash_t { + inline size_t operator()(const key_t& key) const { + size_t hsh = 0; + hash_combine(hsh, get<0>(key)); + hash_combine(hsh, get<1>(key)); + for (const auto& rec : get<2>(key)) { + hash_combine(hsh, rec); + } + return hsh; + } + }; + +#ifdef debug_decompose_algorithm + auto key_string = [](const key_t& key) { + stringstream sstrm; + sstrm << get<0>(key) << "-" << get<1>(key) << ",("; + for (size_t i = 0; i < get<2>(key).size(); ++i) { + if (i > 0) { + sstrm << ","; + } + auto rec = get<2>(key)[i]; + sstrm << get<0>(rec) << (get<1>(rec) ? "-" : "+") << ":" << get<2>(rec) << "-" << get<3>(rec); + } + sstrm << ")"; + return sstrm.str(); + }; +#endif + + // first we will count the number of occurrence of each edit to find those that occur on every + // alignment + unordered_map chunk_count; + + // note: this is unsatisfactorily complicated, but it's necessary to keep the deletions as atomic + // units so that the score is decomposable between whatever chunks we choose + auto flush_deletion = [&](const int64_t& del_start_j, const int64_t& del_start_k, + const int64_t& del_start_offset, const int64_t& j, + const int64_t& k, const int64_t& seq_idx, const path_t& path) { + key_t del_key; + get<0>(del_key) = seq_idx; + get<1>(del_key) = seq_idx; + for (int64_t jj = del_start_j; jj <= j; ++jj) { + + const auto& del_mapping = path.mapping(jj); + + // choose start and end indexes for edits on this mapping + int64_t kk = (jj == del_start_j ? del_start_k : 0); + int64_t kk_end = (jj == j ? k : (int64_t) del_mapping.edit_size()); + + // compute the range of the node that is aligned + int64_t off_start = (jj == del_start_j ? 
del_start_offset : del_mapping.position().offset()); + int64_t off_end = off_start; + for (; kk < kk_end; ++kk) { + off_end += del_mapping.edit(kk).from_length(); + } + + if (off_end != off_start) { + // the deletion covered part of this node, add a record to the deletion key + get<2>(del_key).emplace_back(del_mapping.position().node_id(), + del_mapping.position().is_reverse(), + off_start, off_end); + } + } + +#ifdef debug_decompose_algorithm + cerr << "recording deletion key " << key_string(del_key) << endl; +#endif + + // record a count of this key + chunk_count[del_key] += 1; + }; + +#ifdef debug_decompose_algorithm + cerr << "counting edits" << endl; +#endif + + for (int64_t i = 0; i < alt_alns.size(); ++i) { + + int64_t seq_idx = 0; + bool in_deletion = false; + + int64_t del_start_j = -1; + int64_t del_start_k = -1; + int64_t del_start_offset = 0; + for (int64_t j = 0; j < alt_alns[i].first.mapping_size(); ++j) { + const auto& mapping = alt_alns[i].first.mapping(j); + + int64_t offset = mapping.position().offset(); + for (int64_t k = 0; k < mapping.edit_size(); ++k) { + + const auto& edit = mapping.edit(k); + + if (edit.from_length() != 0 && edit.to_length() == 0) { + // this edit is a deletion + if (!in_deletion) { + // start a new deletion + del_start_j = j; + del_start_k = k; + del_start_offset = offset; + in_deletion = true; + } + } + else { + if (in_deletion) { + // flush the deletion that's being finished + flush_deletion(del_start_j, del_start_k, del_start_offset, + j, k, seq_idx, alt_alns[i].first); + + in_deletion = false; + } + + key_t edit_key; + get<0>(edit_key) = seq_idx; + get<1>(edit_key) = seq_idx + edit.to_length(); + get<2>(edit_key).emplace_back(mapping.position().node_id(), + mapping.position().is_reverse(), + offset, offset + edit.from_length()); + +#ifdef debug_decompose_algorithm + cerr << "recording normal key " << key_string(edit_key) << endl; +#endif + + chunk_count[edit_key] += 1; + } + + offset += edit.from_length(); + seq_idx += edit.to_length(); + } + } + + if (in_deletion) { + // flush the last deletion + flush_deletion(del_start_j, del_start_k, del_start_offset, + alt_alns[i].first.mapping_size() - 1, // this index is used inclusive + alt_alns[i].first.mapping().back().edit_size(), // and this one exclusive + seq_idx, alt_alns[i].first); + } + } + +#ifdef debug_decompose_algorithm + cerr << "identifying shared edits" << endl; +#endif + + // gather all of the edits/chunks that are shared across all of the alignments + vector shared_edits; + for (const auto& chunk_rec : chunk_count) { + if (chunk_rec.second == alt_alns.size()) { + shared_edits.push_back(chunk_rec.first); + } + } + + + pair>, vector>>> return_val; + + if (!shared_edits.empty()) { + +#ifdef debug_decompose_algorithm + cerr << "there are " << shared_edits.size() << " universally shared edits, beginning decompose routine" << endl; + +#endif + + // put them in order along the read + sort(shared_edits.begin(), shared_edits.end()); + +#ifdef debug_decompose_algorithm + for (auto edit : shared_edits) { + cerr << key_string(edit) << endl; + } +#endif + + // the index of the next edit we will add in shared_edits + size_t curr_shared_idx = 0; + // indexes of where we are in each of the alt alignments in + // records of (mapping idx, edit idx, seq idx, node idx) + // note: node idx is relative to the offset of the mapping + vector> curr_index(alt_alns.size(), + tuple(0, 0, 0, 0)); + + // adds a block of shared edits to the return value and advances the current index + // along every path past this 
value + auto add_shared_segments = [&]() { + +#ifdef debug_decompose_algorithm + cerr << "looking for next shared segment" << endl; +#endif + + // always add a shared segment, even if for a sentinel + return_val.first.emplace_back(); + auto& shared_path = return_val.first.back().first; + + // go through as many shared edits as we can in this shared path (i.e. until hitting + // a mismatch with at least one path + bool all_match = true; + while (curr_shared_idx < shared_edits.size() && all_match) { + auto& shared = shared_edits[curr_shared_idx]; + +#ifdef debug_decompose_algorithm + cerr << "looking for matches to shared edit " << key_string(shared) << endl; +#endif + + // check for match against the paths + for (size_t i = 0; i < alt_alns.size() && all_match; ++i) { + size_t j, k, seq_idx, node_idx; + tie(j, k, seq_idx, node_idx) = curr_index[i]; + const auto& mapping = alt_alns[i].first.mapping(j); + +#ifdef debug_decompose_algorithm + cerr << "comparing to alt alignment " << i << ", mapping " << j << ", edit " << k << ", which starts at sequence index " << seq_idx << ", relative node index " << node_idx << endl; +#endif + + all_match = (seq_idx == get<0>(shared) && + mapping.position().node_id() == get<0>(get<2>(shared).front()) && + mapping.position().is_reverse() == get<1>(get<2>(shared).front()) && + mapping.position().offset() + node_idx == get<2>(get<2>(shared).front())); + } + + if (all_match) { + // add an edit/edits to the shared path +#ifdef debug_decompose_algorithm + cerr << "all alt alignments match the next edit" << endl; +#endif + + path_mapping_t* mapping = nullptr; + if (shared_path.mapping_size() != 0 && + shared_path.mapping().back().position().node_id() == get<0>(get<2>(shared).front()) && + shared_path.mapping().back().position().is_reverse() == get<1>(get<2>(shared).front()) && + shared_path.mapping().back().position().offset() + mapping_from_length(shared_path.mapping().back()) + == get<2>(get<2>(shared).front())) { + + // we can extend the existing mapping by these edit(s) + mapping = shared_path.mutable_mapping(shared_path.mapping_size() - 1); + + } + else { + + // we need to make a new mapping + mapping = shared_path.add_mapping(); + mapping->mutable_position()->set_node_id(get<0>(get<2>(shared).front())); + mapping->mutable_position()->set_is_reverse(get<1>(get<2>(shared).front())); + mapping->mutable_position()->set_offset(get<2>(get<2>(shared).front())); + } + + // we can copy over from any particular path since this segment is shared + size_t j, k, seq_idx, node_idx; + tie(j, k, seq_idx, node_idx) = curr_index.front(); + for (size_t num_edits = 0; num_edits < get<2>(shared).size(); ++num_edits) { + + *mapping->add_edit() = alt_alns.front().first.mapping(j).edit(k); + ++k; + if (k == alt_alns.front().first.mapping(j).edit_size() && + num_edits + 1 != get<2>(shared).size()) { + ++j; + k = 0; + // new position has to match the next mapping (since shared) + mapping = shared_path.add_mapping(); + *mapping->mutable_position() = alt_alns.front().first.mapping(j).position(); + } + } + + // move on to the next shared edit + ++curr_shared_idx; + + // advance the current indexes along the alternative alignments through this edit + for (size_t i = 0; i < curr_index.size(); ++i) { + for (size_t num_edits = 0; num_edits < get<2>(shared).size(); ++num_edits) { + auto& idxs = curr_index[i]; + const auto& edit = alt_alns[i].first.mapping(get<0>(idxs)).edit(get<1>(idxs)); + get<2>(idxs) += edit.to_length(); + get<3>(idxs) += edit.from_length(); + ++get<1>(idxs); + if 
(get<1>(idxs) == alt_alns[i].first.mapping(get<0>(idxs)).edit_size()) { + ++get<0>(idxs); + get<1>(idxs) = 0; + get<3>(idxs) = 0; + } + } + } + } +#ifdef debug_decompose_algorithm + else { + cerr << "not all paths match the next shared edit (if any), terminating shared segment" << endl; + } +#endif + } +#ifdef debug_decompose_algorithm + cerr << "completed shared segment:" << endl; + cerr << "\t" << debug_string(shared_path) << endl; +#endif + }; + + // adds the unshared portion of alignments in between the last shared portion + // and the next one + auto add_unshared_segments = [&]() { + +#ifdef debug_decompose_algorithm + cerr << "looking for next block of unshared segments" << endl; +#endif + + // dummy values in case there is no next shared segment + size_t end_seq_idx = numeric_limits::max(); + nid_t end_node_id = 0; + bool end_is_rev = false; + size_t end_offset = numeric_limits::max(); + if (curr_shared_idx < shared_edits.size()) { + // but there is a next shared segment, so we set these t meaningful + // values + end_seq_idx = get<0>(shared_edits[curr_shared_idx]); + end_node_id = get<0>(get<2>(shared_edits[curr_shared_idx]).front()); + end_is_rev = get<1>(get<2>(shared_edits[curr_shared_idx]).front()); + end_offset = get<2>(get<2>(shared_edits[curr_shared_idx]).front()); + } + // note: all of the unshared segments end at the same place + +#ifdef debug_decompose_algorithm + cerr << "copying until hitting seq index " << end_seq_idx << " at position " << make_pos_t(end_node_id, end_is_rev, end_offset) << endl; +#endif + + // init all of the unshared semgents + return_val.second.emplace_back(alt_alns.size()); + // and then do the copying for each of them + for (size_t i = 0; i < alt_alns.size(); ++i) { + + auto& alt_path = alt_alns[i].first; + + auto& unshared_path = return_val.second.back()[i].first; + auto& idxs = curr_index[i]; + +#ifdef debug_decompose_algorithm + cerr << "copying from alt aln " << i << ", mapping index " << get<0>(idxs) << ", edit index " << get<1>(idxs) << ", seq index " << get<2>(idxs) << ", node relative index " << get<3>(idxs) << endl; +#endif + + // copy the alt path into the unshared path until reaching either the end of the alt + // path or the next shared segment + while (get<0>(idxs) != alt_path.mapping_size() + && !(get<2>(idxs) == end_seq_idx + && alt_path.mapping(get<0>(idxs)).position().node_id() == end_node_id + && alt_path.mapping(get<0>(idxs)).position().is_reverse() == end_is_rev + && alt_path.mapping(get<0>(idxs)).position().offset() + get<3>(idxs) == end_offset)) { + + // we haven't reached the end or the next shared segment + const auto& alt_path_mapping = alt_path.mapping(get<0>(idxs)); + const auto& alt_path_position = alt_path_mapping.position(); + const auto& alt_path_edit = alt_path_mapping.edit(get<1>(idxs)); + +#ifdef debug_decompose_algorithm + cerr << "have not hit end of unshared segment, adding edit " << debug_string(alt_path_edit) << endl; +#endif + + path_mapping_t* mapping = nullptr; + if (unshared_path.mapping_size() != 0 && + unshared_path.mapping().back().position().node_id() == alt_path_position.node_id() && + unshared_path.mapping().back().position().is_reverse() == alt_path_position.is_reverse() && + unshared_path.mapping().back().position().offset() + mapping_from_length(unshared_path.mapping().back()) + == alt_path_position.offset() + get<3>(idxs)) { + + // we can extend the existing mapping by these edit(s) + mapping = unshared_path.mutable_mapping(unshared_path.mapping_size() - 1); + + } + else { + // we need to make a 
new mapping + mapping = unshared_path.add_mapping(); + mapping->mutable_position()->set_node_id(alt_path_position.node_id()); + mapping->mutable_position()->set_is_reverse(alt_path_position.is_reverse()); + mapping->mutable_position()->set_offset(alt_path_position.offset() + get<3>(idxs)); + } + + // add the next edit + *mapping->add_edit() = alt_path_edit; + + // advance to the next edit index + get<2>(idxs) += alt_path_edit.to_length(); + get<3>(idxs) += alt_path_edit.from_length(); + ++get<1>(idxs); + if (get<1>(idxs) == alt_path_mapping.edit_size()) { + ++get<0>(idxs); + get<1>(idxs) = 0; + get<3>(idxs) = 0; + } + + } + } +#ifdef debug_decompose_algorithm + cerr << "completed unshared segments:" << endl; + for (auto& unshared : return_val.second.back()) { + cerr << "\t" << debug_string(unshared.first) << endl; + } +#endif + }; + +#ifdef debug_decompose_algorithm + cerr << "constructing shared and unshared segments" << endl; +#endif + + // when we want to stop iterating + auto finished = [&]() { + bool done = (curr_shared_idx == shared_edits.size()); + for (size_t i = 0; i < alt_alns.size() && done; ++i) { + done = (get<0>(curr_index[i]) == alt_alns[i].first.mapping_size()); + } + return done; + }; + + // alternate between the two segment additions + add_shared_segments(); + while (!finished()) { + add_unshared_segments(); + add_shared_segments(); + } + +#ifdef debug_decompose_algorithm + cerr << "rescoring alignment segments" << endl; +#endif + + // rescore all of the broken up segments + auto seq_pos = begin; + for (size_t i = 0; i < return_val.first.size(); ++i) { + if (i != 0) { + // score a block of unshared segments + auto& unshared_alt_alns = return_val.second[i - 1]; + for (size_t j = 0; j < unshared_alt_alns.size(); ++j) { +#ifdef debug_decompose_algorithm + cerr << "rescore unshared alt " << j << " in block " << i - 1 << endl; + cerr << debug_string(unshared_alt_alns[j].first) << endl; +#endif + unshared_alt_alns[j].second = aligner->score_partial_alignment(alignment, align_graph, + unshared_alt_alns[j].first, seq_pos); + } + // they all cover the same read interval, so we can choose an arbitrary alt here + seq_pos += path_to_length(unshared_alt_alns.front().first); + } +#ifdef debug_decompose_algorithm + cerr << "rescore shared segment " << i << endl; + cerr << debug_string(return_val.first[i].first) << endl; +#endif + // score a shared segment + return_val.first[i].second = aligner->score_partial_alignment(alignment, align_graph, + return_val.first[i].first, seq_pos); + seq_pos += path_to_length(return_val.first[i].first); + } + + // and sort the blocks of unshared segments + for (size_t i = 0; i < return_val.second.size(); ++i) { + auto& unshared_alt_alns = return_val.second[i]; + stable_sort(unshared_alt_alns.begin(), unshared_alt_alns.end(), + [](const pair& a, const pair& b) { + return a.second > b.second; + }); + } + } + + return return_val; + } + + void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGraph& align_graph, const GSSWAligner* aligner, + bool score_anchors_as_matches, size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, + double pessimistic_tail_gap_multiplier, bool simplify_topologies, size_t unmergeable_len, + function band_padding_function, + multipath_alignment_t& multipath_aln_out, SnarlManager* cutting_snarls, + SnarlDistanceIndex* dist_index, const function(id_t)>* project, + bool allow_negative_scores) { + + // TODO: magic number + // how many tails we need to have before we try the more complicated but + // expensive 
tail decomposition algorithm + size_t tail_decompose_threshold = 4; + + // Can only align if edges are present. + assert(has_reachability_edges); + + // transfer over data from alignment + transfer_read_metadata(alignment, multipath_aln_out); + +#ifdef debug_multipath_alignment + cerr << "transferred over read information" << endl; +#endif + // TODO: could I get away with moving the paths instead of copying them? + + // add a subpath for each of the exact match nodes + if (score_anchors_as_matches) { + for (int64_t j = 0; j < path_nodes.size(); j++) { + PathNode& path_node = path_nodes.at(j); + subpath_t* subpath = multipath_aln_out.add_subpath(); + *subpath->mutable_path() = path_node.path; + int32_t match_score = aligner->score_exact_match(path_node.begin, path_node.end, + alignment.quality().begin() + (path_node.begin - alignment.sequence().begin())); + + subpath->set_score(match_score + + (path_node.begin == alignment.sequence().begin() ? aligner->score_full_length_bonus(true, alignment) : 0) + + (path_node.end == alignment.sequence().end() ? aligner->score_full_length_bonus(false, alignment) : 0)); + } + } + else { + for (size_t j = 0; j < path_nodes.size(); j++) { + PathNode& path_node = path_nodes.at(j); + subpath_t* subpath = multipath_aln_out.add_subpath(); + *subpath->mutable_path() = path_node.path; + subpath->set_score(aligner->score_partial_alignment(alignment, align_graph, path_node.path, path_node.begin)); + } + } + + // the indexes of subpaths that we will not allow to merge at non-branching paths + unordered_set prohibited_merges; + + auto convert_and_deduplicate = [](const vector& alt_alns, + bool leftward, bool rightward) -> vector> { + + // init the deduplicated vector in STL types with move operators + vector> converted(alt_alns.size()); + for (size_t i = 0; i < alt_alns.size(); ++i) { + const auto& aln = alt_alns[i]; + converted[i].second = aln.score(); + from_proto_path(aln.path(), converted[i].first); + } + deduplicate_alt_alns(converted, leftward, rightward); + return converted; + }; + + +#ifdef debug_multipath_alignment + cerr << "doing DP between MEMs" << endl; +#endif + + // perform alignment in the intervening sections + for (int64_t j = 0; j < path_nodes.size(); j++) { +#ifdef debug_multipath_alignment + cerr << "checking for intervening alignments from match node " << j << " with path " << debug_string(path_nodes.at(j).path) << " and sequence "; + for (auto iter = path_nodes.at(j).begin; iter != path_nodes.at(j).end; iter++) { + cerr << *iter; + } + cerr << endl; +#endif + + PathNode& src_path_node = path_nodes.at(j); + subpath_t* src_subpath = multipath_aln_out.mutable_subpath(j); + + const path_t& path = src_subpath->path(); + const path_mapping_t& final_mapping = path.mapping(path.mapping_size() - 1); + const position_t& final_mapping_position = final_mapping.position(); + // make a pos_t that points to the final base in the match + pos_t src_pos = make_pos_t(final_mapping_position.node_id(), + final_mapping_position.is_reverse(), + final_mapping_position.offset() + mapping_from_length(final_mapping)); + + // the longest gap that could be detected at this position in the read + size_t src_max_gap = aligner->longest_detectable_gap(alignment, src_path_node.end); + + // This holds edges that we remove, because we couldn't actually get an alignment across them with a positive score. 
+ unordered_set> edges_for_removal; + + for (const pair& edge : src_path_node.edges) { + PathNode& dest_path_node = path_nodes.at(edge.first); + pos_t dest_pos = make_pos_t(multipath_aln_out.subpath(edge.first).path().mapping(0).position()); + +#ifdef debug_multipath_alignment + cerr << "forming intervening alignment for edge to node " << edge.first << endl; +#endif + + size_t intervening_length = dest_path_node.begin - src_path_node.end; + + // if negative score is allowed set maximum distance to the length between path nodes + // otherwise set it to the maximum gap length possible while retaining a positive score + size_t max_dist = allow_negative_scores ? + edge.second : + intervening_length + min(min(src_max_gap, aligner->longest_detectable_gap(alignment, dest_path_node.begin)), max_gap); + +#ifdef debug_multipath_alignment + cerr << "read dist: " << intervening_length << ", graph dist " << edge.second << " source max gap: " << src_max_gap << ", dest max gap " << aligner->longest_detectable_gap(alignment, dest_path_node.begin) << ", max allowed gap " << max_gap << endl; +#endif + + // extract the graph between the matches + bdsg::HashGraph connecting_graph; + auto connect_trans = algorithms::extract_connecting_graph(&align_graph, // DAG with split strands + &connecting_graph, // graph to extract into + max_dist, // longest distance necessary + src_pos, // end of earlier match + dest_pos, // beginning of later match + false); // do not enforce max distance strictly + + if (connecting_graph.get_node_count() == 0) { + // the MEMs weren't connectable with a positive score after all, mark the edge for removal +#ifdef debug_multipath_alignment + cerr << "Remove edge " << j << " -> " << edge.first << " because we got no nodes in the connecting graph " + << src_pos << " to " << dest_pos << endl; +#endif + edges_for_removal.insert(edge); + continue; + } + + size_t num_alt_alns = dynamic_alt_alns ? 
min(max_alt_alns, handlealgs::count_walks(&connecting_graph)) : + max_alt_alns; + + + // transfer the substring between the matches to a new alignment + Alignment intervening_sequence; + intervening_sequence.set_sequence(alignment.sequence().substr(src_path_node.end - alignment.sequence().begin(), + dest_path_node.begin - src_path_node.end)); + if (!alignment.quality().empty()) { + intervening_sequence.set_quality(alignment.quality().substr(src_path_node.end - alignment.sequence().begin(), + dest_path_node.begin - src_path_node.end)); + } + + // if we're doing dynamic alt alignments, possibly expand the number of tracebacks until we get an + // alignment to every path or hit the hard max + vector> deduplicated; + if (num_alt_alns > 0) { + + size_t num_alns_iter = num_alt_alns; + while (deduplicated.size() < num_alt_alns) { + + intervening_sequence.clear_path(); + +#ifdef debug_multipath_alignment + cerr << "making " << num_alns_iter << " alignments of sequence " << intervening_sequence.sequence() << " to connecting graph" << endl; + connecting_graph.for_each_handle([&](const handle_t& handle) { + cerr << connecting_graph.get_id(handle) << " " << connecting_graph.get_sequence(handle) << endl; + connecting_graph.follow_edges(handle, true, [&](const handle_t& prev) { + cerr << "\t" << connecting_graph.get_id(prev) << " <-" << endl; + }); + connecting_graph.follow_edges(handle, false, [&](const handle_t& next) { + cerr << "\t-> " << connecting_graph.get_id(next) << endl; + }); + }); +#endif + + vector alt_alignments; + aligner->align_global_banded_multi(intervening_sequence, alt_alignments, connecting_graph, num_alns_iter, + band_padding_function(intervening_sequence, connecting_graph), true); + + // remove alignments with the same path + deduplicated = convert_and_deduplicate(alt_alignments, false, false); + + if (num_alns_iter >= max_alt_alns || !dynamic_alt_alns) { + // we don't want to try again even if we didn't find every path yet + break; + } + else { + // if we didn't find every path, we'll try again with this many tracebacks + num_alns_iter = min(max_alt_alns, num_alns_iter * 2); + } + } + } + + bool added_direct_connection = false; + for (auto& connecting_alignment : deduplicated) { +#ifdef debug_multipath_alignment + cerr << "translating connecting alignment: " << debug_string(connecting_alignment.first) << ", score " << connecting_alignment.second << endl; +#endif + + auto& aligned_path = connecting_alignment.first; + const auto& first_mapping = aligned_path.mapping(0); + const auto& last_mapping = aligned_path.mapping(aligned_path.mapping_size() - 1); bool add_first_mapping = mapping_from_length(first_mapping) != 0 || mapping_to_length(first_mapping) != 0; - bool add_last_mapping = ((mapping_from_length(last_mapping) != 0 || mapping_to_length(last_mapping) != 0) - && aligned_path.mapping_size() > 1); + bool add_last_mapping = mapping_from_length(last_mapping) != 0 || mapping_to_length(last_mapping) != 0; if (!(add_first_mapping || add_last_mapping) && aligned_path.mapping_size() <= 2) { if (!added_direct_connection) { // edge case where there is a simple split but other non-simple edges intersect the target - // at the same place (so it passes the previous filter) + // at the same place (so it passes the previous deduplicating filter) // it actually doesn't need an alignment, just a connecting edge src_subpath->add_next(edge.first); added_direct_connection = true; @@ -2956,228 +5392,818 @@ namespace vg { } // create a subpath between the matches for this alignment - Subpath* 
connecting_subpath = multipath_aln_out.add_subpath(); - connecting_subpath->set_score(connecting_alignment.score()); - Path* subpath_path = connecting_subpath->mutable_path(); + subpath_t* connecting_subpath = multipath_aln_out.add_subpath(); + connecting_subpath->set_score(connecting_alignment.second); + + // get a pointer to the subpath again in case the vector reallocated + src_subpath = multipath_aln_out.mutable_subpath(j); + + // get rid of the ends if they are empty + if (!add_last_mapping) { + aligned_path.mutable_mapping()->pop_back(); + } + if (!add_first_mapping && !aligned_path.mapping().empty()) { + aligned_path.mutable_mapping()->erase(aligned_path.mutable_mapping()->begin()); + } - int32_t rank = 1; + *connecting_subpath->mutable_path() = move(aligned_path); + + // add the appropriate connections + src_subpath->add_next(multipath_aln_out.subpath_size() - 1); + connecting_subpath->add_next(edge.first); + + // translate the path into the space of the main graph unless the path is null + if (connecting_subpath->path().mapping_size() != 0) { + translate_node_ids(*connecting_subpath->mutable_path(), connect_trans); + path_mapping_t* first_subpath_mapping = connecting_subpath->mutable_path()->mutable_mapping(0); + if (first_subpath_mapping->position().node_id() == final_mapping.position().node_id()) { + first_subpath_mapping->mutable_position()->set_offset(offset(src_pos)); + } + } - // check to make sure the first is not an empty anchoring mapping - if (add_first_mapping) { - Mapping* mapping = subpath_path->add_mapping(); - *mapping = first_mapping; - mapping->set_rank(rank); #ifdef debug_multipath_alignment - cerr << "first mapping is not empty, formed mapping: " << pb2json(*mapping) << endl; + cerr << "subpath from " << j << " to " << edge.first << " at index " << multipath_aln_out.subpath_size() - 1 << ":" << endl; + cerr << debug_string(*connecting_subpath) << endl; #endif - rank++; + } + } + + if (!edges_for_removal.empty()) { + auto new_end = std::remove_if(src_path_node.edges.begin(), src_path_node.edges.end(), + [&](const pair& edge) { + return edges_for_removal.count(edge); + }); + src_path_node.edges.resize(new_end - src_path_node.edges.begin()); + } + } + + + // Now do the tails + + // We need to know what subpaths are real sources + unordered_set sources; + + // Actually align the tails + auto tail_alignments = align_tails(alignment, align_graph, aligner, max_alt_alns, dynamic_alt_alns, + max_gap, pessimistic_tail_gap_multiplier, 0, &sources); + + // TODO: merge and simplify the tail alignments? rescoring would be kind of a pain... 
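The tail-handling code below consumes either a simple two-ended zip (shared left end, one block of unshared alternatives, shared right end) or, above `tail_decompose_threshold` alternatives, a full decomposition into alternating shared segments and blocks of unshared alternatives, where the shared vector is one longer than the unshared vector and end segments may be empty placeholders. A toy sketch of how such a decomposition recombines into one concrete alternative, using plain strings in place of scored `path_t` fragments (all names and values are illustrative, not vg API):

```cpp
// Toy sketch of recombining a decomposition: shared segments alternate with
// blocks of unshared alternatives, i.e.
//   shared[0] + unshared[0][i] + shared[1] + unshared[1][j] + shared[2]
// Illustrative only; the real structures carry paths and partial scores.
#include <iostream>
#include <string>
#include <vector>

int main() {
    std::vector<std::string> shared = {"ACGT", "TTT", ""};
    std::vector<std::vector<std::string>> unshared = {{"A", "G"}, {"CC", "C"}};

    // reconstruct one concrete alternative by picking index 0 in every block
    std::string reconstructed = shared[0];
    for (size_t b = 0; b < unshared.size(); ++b) {
        reconstructed += unshared[b][0];
        reconstructed += shared[b + 1];
    }
    std::cout << reconstructed << std::endl;  // prints ACGTATTTCC
    return 0;
}
```

Because every alternative in a block covers the same read interval and the shared segments are common to all alternatives, scores stay additive across the pieces, which is what lets the code below score each segment independently and attach them as separate subpaths.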
+ + // Handle the right tails + for (auto& kv : tail_alignments[true]) { + // For each sink subpath number with its alignments + size_t j = kv.first; + + // remove alignments with the same path + auto deduplicated = convert_and_deduplicate(kv.second, false, true); + +#ifdef debug_multipath_alignment + cerr << "deduplicate " << kv.second.size() << " right tail alignments on " << j << " down to " << deduplicated.size() << " nonredundant alignments" << endl; +#endif + + PathNode& path_node = path_nodes.at(j); + + vector> shared_tail_alns; + vector>> unshared_tail_alns; + + if (deduplicated.size() <= 1) { + shared_tail_alns = move(deduplicated); + } + else if (deduplicated.size() < tail_decompose_threshold) { + // do the simpler zip algorithm + auto right_zip_aln = zip_alignments(deduplicated, false, alignment, align_graph, + path_node.end, aligner); + auto left_zip_aln = zip_alignments(deduplicated, true, alignment, align_graph, + path_node.end, aligner); + shared_tail_alns.emplace_back(move(left_zip_aln)); + shared_tail_alns.emplace_back(move(right_zip_aln)); + unshared_tail_alns.emplace_back(move(deduplicated)); + } + else { + // do the more complicated decompose algorithm + tie(shared_tail_alns, unshared_tail_alns) = decompose_alignments(deduplicated, alignment, align_graph, + path_node.end, aligner); + + if (shared_tail_alns.empty()) { + // nothing decomposed, no universally shared elements, just add placeholders + shared_tail_alns.resize(2); + unshared_tail_alns.emplace_back(move(deduplicated)); + } + else { + // the unshared blocks may now contain duplications + for (size_t i = 0; i < unshared_tail_alns.size(); ++i) { + // deduplicate as right tails in the final iteration if there's no final + // shared segment + deduplicate_alt_alns(unshared_tail_alns[i], false, + shared_tail_alns[i + 1].first.mapping_size() == 0); + } +#ifdef debug_multipath_alignment + cerr << "decompose into " << shared_tail_alns.size() << " shared segments with unshared blocks of sizes:" << endl; + for (size_t i = 0; i < unshared_tail_alns.size(); ++i) { + cerr << "\t" << unshared_tail_alns[i].size() << endl; } - // add all mapping in between the ends - for (size_t j = 1; j < aligned_path.mapping_size() - 1; j++) { - Mapping* mapping = subpath_path->add_mapping(); - *mapping = aligned_path.mapping(j); - mapping->set_rank(rank); +#endif + } + } + + // add subpaths for the tail alignments + add_decomposed_tail_alignments(alignment, align_graph, multipath_aln_out, prohibited_merges, + shared_tail_alns, unshared_tail_alns, j, false, unmergeable_len, + aligner, cutting_snarls, dist_index, project); + } + + // Now handle the left tails. 
+ // We need to handle all sources, whether or not they got alignments + for (auto& j : sources) { + + if (path_nodes[j].begin != alignment.sequence().begin()) { + + // There should be some alignments + // remove alignments with the same path + auto deduplicated = convert_and_deduplicate(tail_alignments[false][j], true, false); + +#ifdef debug_multipath_alignment + cerr << "deduplicate " << tail_alignments[false][j].size() << " left tail alignments on " << j << " down to " << deduplicated.size() << " nonredundant alignments" << endl; +#endif + // TODO: this is mostly repetitive with the right tails + vector> shared_tail_alns; + vector>> unshared_tail_alns; + + // zip together identical prefixes and suffixes of the tail alignment + if (deduplicated.size() <= 1) { + shared_tail_alns = move(deduplicated); + } + else if (deduplicated.size() < tail_decompose_threshold) { + // do the simpler zip algorithm + auto right_zip_aln = zip_alignments(deduplicated, false, alignment, align_graph, + alignment.sequence().begin(), aligner); + auto left_zip_aln = zip_alignments(deduplicated, true, alignment, align_graph, + alignment.sequence().begin(), aligner); + shared_tail_alns.emplace_back(move(left_zip_aln)); + shared_tail_alns.emplace_back(move(right_zip_aln)); + unshared_tail_alns.emplace_back(move(deduplicated)); + } + else { + // do the more complicated decompose algorithm + tie(shared_tail_alns, unshared_tail_alns) = decompose_alignments(deduplicated, alignment, align_graph, + alignment.sequence().begin(), aligner); + if (shared_tail_alns.empty()) { + // nothing decomposed, no universally shared elements, just add placeholders + shared_tail_alns.resize(2); + unshared_tail_alns.emplace_back(move(deduplicated)); + } + else { + + for (size_t i = 0; i < unshared_tail_alns.size(); ++i) { + // deduplicate as left tails in the final iteration if there's no final + // shared segment + deduplicate_alt_alns(unshared_tail_alns[i], + shared_tail_alns[i].first.mapping_size() == 0, false); + } +#ifdef debug_multipath_alignment + cerr << "decompose into " << shared_tail_alns.size() << " shared segments with unshared blocks of sizes:" << endl; + for (size_t i = 0; i < unshared_tail_alns.size(); ++i) { + cerr << "\t" << unshared_tail_alns[i].size() << endl; + } +#endif + } + } + + // add subpaths for the tail alignments + add_decomposed_tail_alignments(alignment, align_graph, multipath_aln_out, prohibited_merges, + shared_tail_alns, unshared_tail_alns, j, true, unmergeable_len, + aligner, cutting_snarls, dist_index, project); + + } + else { +#ifdef debug_multipath_alignment + cerr << "No tails left off of subpath " << j << "; it can be a start" << endl; +#endif + multipath_aln_out.add_start(j); + } + } + + if (simplify_topologies) { +#ifdef debug_multipath_alignment + cerr << "merging non-branching subpaths" << endl; + size_t num_pre_merge = multipath_aln_out.subpath_size(); +#endif + + merge_non_branching_subpaths(multipath_aln_out, &prohibited_merges); + +#ifdef debug_multipath_alignment + cerr << "reduce from " << num_pre_merge << " to " << multipath_aln_out.subpath_size() << " subpaths during merge" << endl; +#endif + } + } + + void MultipathAlignmentGraph::add_decomposed_tail_alignments(const Alignment& alignment, const HandleGraph& align_graph, + multipath_alignment_t& multipath_aln_out, + unordered_set& prohibited_merges, + vector>& shared_tail_alns, + vector>>& unshared_tail_alns, + size_t attachment_idx, bool to_left, size_t unmergeable_len, + const GSSWAligner* aligner, + SnarlManager* cutting_snarls, 
SnarlDistanceIndex* dist_index, + const function(id_t)>* project) { + + // TODO: i wonder if it would be cleaner/more general to use branches rather than snarls + // as the condition here... + + // add the alignment as subpath(s), possibly cutting it at snarl boundaries and marking + // those cuts as prohibited to merge. the subpaths are created in order at the end of + // the subpath vector and have edges between them + auto add_and_permanently_cut = [&](pair& aln, string::const_iterator begin) { + +#ifdef debug_multipath_alignment + cerr << "assessing need to permanently cut path" << endl; + cerr << debug_string(aln.first) << endl; +#endif + + vector segment_boundaries; + if (project && (cutting_snarls || dist_index)) { + // the intervals of the path that are inside snarls + auto cut_segments = get_cut_segments(aln.first, cutting_snarls, dist_index, *project, + unmergeable_len); + + // collect the (internal) indexes that follow cuts + segment_boundaries.reserve(cut_segments.size() * 2); + for (auto& cut_segment : cut_segments) { + if (cut_segment.first != 0) { + segment_boundaries.push_back(cut_segment.first); + } + if (cut_segment.second != cut_segment.first && cut_segment.second != aln.first.mapping_size()) { + segment_boundaries.push_back(cut_segment.second); + } + } + // make sure they're in order (only ever not in order if there are nested snarls) + if (!is_sorted(segment_boundaries.begin(), segment_boundaries.end())) { + sort(segment_boundaries.begin(), segment_boundaries.end()); + } + } + + // don't allow edge-spanning deletions to be broken up (breaks dynamic programmability of scores) + auto end = remove_if(segment_boundaries.begin(), segment_boundaries.end(), [&](size_t i) { + return (aln.first.mapping(i - 1).edit().back().to_length() == 0 && + aln.first.mapping(i).edit().front().to_length() == 0); + }); + +#ifdef debug_multipath_alignment + cerr << "need to cut path before mappings:" << endl; + for (auto it = segment_boundaries.begin(); it != end; ++it) { + cerr << "\t" << *it << endl; + } +#endif + + if (segment_boundaries.begin() == end) { + // we don't actually want to cut up this alignment at all + auto subpath = multipath_aln_out.add_subpath(); + *subpath->mutable_path() = move(aln.first); + subpath->set_score(aln.second); + } + else { + // make a subpath for the first segment, but don't do anything with it + size_t first_idx = multipath_aln_out.subpath_size(); + multipath_aln_out.add_subpath(); + + for (auto it = segment_boundaries.begin(), next = segment_boundaries.begin() + 1; it != end; ++it, ++next) { + // add an unmergeable link from previous subpath + multipath_aln_out.mutable_subpath()->back().add_next(multipath_aln_out.subpath_size()); + prohibited_merges.insert(multipath_aln_out.subpath_size() - 1); + // move path into the subpath + auto subpath = multipath_aln_out.add_subpath(); + for (size_t i = *it, n = (next == end ? 
aln.first.mapping_size() : *next); i < n; ++i) { + *subpath->mutable_path()->add_mapping() = move(*aln.first.mutable_mapping(i)); + } + } + // get the subpath here in case the vector reallocates + auto first_subpath = multipath_aln_out.mutable_subpath(first_idx); + aln.first.mutable_mapping()->resize(segment_boundaries.front()); + *first_subpath->mutable_path() = move(aln.first); + + // score the individual segments + auto b = begin; + for (size_t i = first_idx; i < multipath_aln_out.subpath_size(); ++i) { + auto subpath = multipath_aln_out.mutable_subpath(i); + subpath->set_score(aligner->score_partial_alignment(alignment, align_graph, subpath->path(), b)); + b += path_to_length(subpath->path()); + } + +#ifdef debug_multipath_alignment + for (auto i = first_idx; i < multipath_aln_out.subpath_size(); ++i) { + cerr << "added permanently cut subpath " << i << endl; + cerr << debug_string(multipath_aln_out.subpath(i)) << endl; + } +#endif + } + }; + +#ifdef debug_multipath_alignment + cerr << "adding decomposed tail alignments in " << shared_tail_alns.size() << " groups on " << (to_left ? "left" : "right") << " tail" << endl; +#endif + + + // the index of the previous shared segment + size_t shared_idx = attachment_idx; + // the range of indexes of the previous unshared segment + size_t unmerged_block_begin = attachment_idx; + size_t unmerged_block_end = attachment_idx + 1; + + // set up the iteration over the shared segments + int64_t k; + int64_t incr; + int64_t end; + if (to_left) { + k = shared_tail_alns.size() - 1; + incr = -1; + end = -1; + } + else { + k = 0; + incr = 1; + end = shared_tail_alns.size(); + } + auto& path_node = path_nodes[attachment_idx]; + auto end_pos = final_position(path_node.path); + + bool make_direct_connection = false; + string::const_iterator tail_begin = to_left ? 
path_node.begin : path_node.end; + for (; k != end; k += incr) { +#ifdef debug_multipath_alignment + cerr << "beginning iter " << k << " on decomposed tails (" << shared_tail_alns.size() << " total)" << endl; +#endif + + auto& shared = shared_tail_alns[k]; + auto shared_seq_end = tail_begin + incr * path_to_length(shared.first); + if (to_left) { + tail_begin = shared_seq_end; + } + + if (shared.first.mapping_size() != 0) { + if (k + incr == end + && shared.first.mapping_size() == 1 + && shared.first.mapping(0).edit_size() == 1 + && shared.first.mapping(0).edit(0).from_length() == 0) { + +#ifdef debug_multipath_alignment + cerr << "final shared segment is a softclip" << endl; +#endif + + // this subpath is a pure softclip, we need to move it onto the previous subpaths + // to match the expectations of the rest of the code + + const auto& softclip = shared.first.mapping(0).edit(0); + for (size_t l = unmerged_block_begin; l < unmerged_block_end; ++l) { + if (to_left) { + // have to insert in the beginning of the vector + auto edits = multipath_aln_out.mutable_subpath(l)->mutable_path()->mutable_mapping(0)->mutable_edit(); + edits->emplace(edits->begin(), softclip); + // the adjacent unshared segment is now a start + multipath_aln_out.add_start(l); + + } + else { + *multipath_aln_out.mutable_subpath(l)->mutable_path()->mutable_mapping()->back().add_edit() = softclip; + } +#ifdef debug_multipath_alignment + cerr << "added as an edit into subpath at index " << l << endl; +#endif + } + if (make_direct_connection) { + // we actually still need to make a separate subpath to preserve the connectivity structure + // since the predecessor has other successors + auto subpath = multipath_aln_out.add_subpath(); + *subpath->mutable_path() = move(shared.first); + subpath->set_score(shared.second); + if (to_left) { + subpath->add_next(shared_idx); + multipath_aln_out.add_start(multipath_aln_out.subpath_size() - 1); + } + else { + multipath_aln_out.mutable_subpath(shared_idx)->add_next(multipath_aln_out.subpath_size() - 1); + } #ifdef debug_multipath_alignment - cerr << "added middle mapping: " << pb2json(*mapping) << endl; + cerr << "made subpath " << multipath_aln_out.subpath_size() - 1 << " for softclip with score " << subpath->score() << ":" << endl; + cerr << debug_string(subpath->path()) << endl; #endif - rank++; } - // check to make sure the last is not an empty anchoring mapping or the same as the first - if (add_last_mapping) { - Mapping* mapping = subpath_path->add_mapping(); - *mapping = last_mapping; - mapping->set_rank(rank); + } + else { + + // make a subpath for this shared segment + size_t first_shared_idx = multipath_aln_out.subpath_size(); + if (k + incr == end) { + add_and_permanently_cut(shared, tail_begin); + } + else { + auto subpath = multipath_aln_out.add_subpath(); + *subpath->mutable_path() = move(shared.first); + subpath->set_score(shared.second); #ifdef debug_multipath_alignment - cerr << "final mapping is not empty, formed mapping: " << pb2json(*mapping) << endl; + cerr << "made subpath " << multipath_aln_out.subpath_size() - 1 << " for shared segment with score " << subpath->score() << ":" << endl; + cerr << debug_string(subpath->path()) << endl; #endif } - // add the appropriate connections - src_subpath->add_next(multipath_aln_out.subpath_size() - 1); - connecting_subpath->add_next(edge.first); + if (make_direct_connection) { + // there's an empty alignment in the previous block, so we make a connection + // directly from the previous shared segment + if (to_left) { + 
multipath_aln_out.mutable_subpath(multipath_aln_out.subpath_size() - 1)->add_next(shared_idx); + } + else { + multipath_aln_out.mutable_subpath(shared_idx)->add_next(first_shared_idx); + } + make_direct_connection = false; + } - // translate the path into the space of the main graph unless the path is null - if (connecting_subpath->path().mapping_size() != 0) { - translate_node_ids(*connecting_subpath->mutable_path(), connect_trans); - Mapping* first_subpath_mapping = connecting_subpath->mutable_path()->mutable_mapping(0); - if (first_subpath_mapping->position().node_id() == final_mapping.position().node_id()) { - first_subpath_mapping->mutable_position()->set_offset(offset(src_pos)); + // now we can update the shared segment index and forget about the old one + shared_idx = multipath_aln_out.subpath_size() - 1; + + // add edges on the subpath + for (size_t l = unmerged_block_begin; l < unmerged_block_end; ++l) { + if (to_left) { + multipath_aln_out.mutable_subpath(shared_idx)->add_next(l); + } + else { + multipath_aln_out.mutable_subpath(l)->add_next(first_shared_idx); } } + if (k == 0) { + if (!to_left) { + // we might need to adjust the initial position because of the way positions + // work in the subgraph extraction + auto first_mapping = multipath_aln_out.mutable_subpath(first_shared_idx)->mutable_path()->mutable_mapping(0); + if (first_mapping->position().node_id() == id(end_pos)) { + first_mapping->mutable_position()->set_offset(offset(end_pos)); + } + } + else { + // this is a start + multipath_aln_out.add_start(first_shared_idx); + } + } + } + } #ifdef debug_multipath_alignment - cerr << "subpath from " << j << " to " << edge.first << ":" << endl; - cerr << pb2json(*connecting_subpath) << endl; + else { + cerr << "shared segment at " << (k == 0 ? "left" : "right") << " side of the tail alignment is empty, skipping" << endl; + } #endif - } + if (!to_left) { + tail_begin = shared_seq_end; } - if (!edges_for_removal.empty()) { - auto new_end = std::remove_if(src_path_node.edges.begin(), src_path_node.edges.end(), - [&](const pair& edge) { - return edges_for_removal.count(edge); - }); - src_path_node.edges.resize(new_end - src_path_node.edges.begin()); + int64_t block_k = to_left ? k - 1 : k; + if (block_k >= 0 && block_k < unshared_tail_alns.size()) { + // there's a block of unshared alignments here + + auto& block = unshared_tail_alns[block_k]; + auto unshared_seq_end = tail_begin + incr * path_to_length(block.front().first); + if (to_left) { + tail_begin = unshared_seq_end; + } + + size_t curr_block_begin = multipath_aln_out.subpath_size(); + + // are the alignments we're adding going to be the frayed tails? + bool are_tails = false; + if (to_left && block_k == 0 && shared_tail_alns[0].first.mapping_size() == 0) { + are_tails = true; + } + else if (!to_left && block_k + 1 == unshared_tail_alns.size() + && shared_tail_alns[block_k + 1].first.mapping_size() == 0) { + are_tails = true; + } + +#ifdef debug_multipath_alignment + cerr << "processing block " << block_k << " of unshared segments, which contains " << block.size() << " alts that are " << (are_tails ? 
"" : "not") << " tails" << endl; +#endif + + for (size_t l = 0; l < block.size(); ++l) { + auto& unshared = block[l]; + if (unshared.first.mapping_size() != 0) { + // we have an alignment, so make a subpath + size_t first_idx = multipath_aln_out.subpath_size(); + if (are_tails) { + add_and_permanently_cut(unshared, tail_begin); + if (to_left) { + // leftward tails are starts + multipath_aln_out.add_start(first_idx); + } + } + else { + auto subpath = multipath_aln_out.add_subpath(); + *subpath->mutable_path() = move(unshared.first); + subpath->set_score(unshared.second); +#ifdef debug_multipath_alignment + cerr << "made subpath " << multipath_aln_out.subpath_size() - 1 << " for unshared segment with score " << subpath->score() << ":" << endl; + cerr << debug_string(subpath->path()) << endl; +#endif + } + size_t last_idx = multipath_aln_out.subpath_size() - 1; + + if (to_left) { + // TODO: the only reason using shared_idx works here is because we only + // ever do add_and_permanently_cut on the tails, otherwise this would + // select the wrong side of the interval of subpaths. inelegant. + multipath_aln_out.mutable_subpath(last_idx)->add_next(shared_idx); + } + else { + multipath_aln_out.mutable_subpath(shared_idx)->add_next(first_idx); + } + + // when we look for pure softclips, we'll trust that the add_and_permanently_cut + // routine would handle it correctly if the pure softclip was created by cutting + // the subpath, so we skip cases where the tail was cut up + if (are_tails && + last_idx == first_idx && + multipath_aln_out.subpath(first_idx).path().mapping_size() == 1 && + multipath_aln_out.subpath(first_idx).path().mapping(0).edit_size() == 1 && + multipath_aln_out.subpath(first_idx).path().mapping(0).edit(0).from_length() == 0) { + + // this is a pure softclip, the other parts of the code expect it to be located + // on the same node as the predecessor + const auto& prev_end_mapping = to_left ? 
multipath_aln_out.subpath(shared_idx).path().mapping().front() + : multipath_aln_out.subpath(shared_idx).path().mapping().back(); + const auto& prev_end_pos = prev_end_mapping.position(); + auto unshared_subpath = multipath_aln_out.mutable_subpath(first_idx); + auto softclip_pos = unshared_subpath->mutable_path()->mutable_mapping(0)->mutable_position(); + if (softclip_pos->node_id() != prev_end_pos.node_id()) { + softclip_pos->set_node_id(prev_end_pos.node_id()); + if (to_left) { + softclip_pos->set_offset(prev_end_pos.offset()); + } + else { + softclip_pos->set_offset(prev_end_pos.offset() + mapping_from_length(prev_end_mapping)); + } + } + } + else if (!to_left && shared_idx == attachment_idx) { + // this only happens if there is no shared prefix, so we potentially have to adjust + // initial offsets for the subgraph extraction + auto first_mapping = multipath_aln_out.mutable_subpath(first_idx)->mutable_path()->mutable_mapping(0); + if (first_mapping->position().node_id() == id(end_pos)) { + first_mapping->mutable_position()->set_offset(offset(end_pos)); + } + } + } + else { + // the next shared subpath should connect to this one + make_direct_connection = true; + } + } + + // set up the tracking variables for the next iteration + if (!to_left) { + tail_begin = unshared_seq_end; + } + unmerged_block_begin = curr_block_begin; + unmerged_block_end = multipath_aln_out.subpath_size(); + } + } + } + + // just to control the memory usage to a small value + const size_t MultipathAlignmentGraph::tail_gap_memo_max_size = 1000; + + // make the memo live in this .o file + thread_local unordered_map> MultipathAlignmentGraph::pessimistic_tail_gap_memo; + + int64_t MultipathAlignmentGraph::pessimistic_tail_gap(int64_t tail_length, double multiplier) { + int64_t gap_length; + if (tail_length >= tail_gap_memo_max_size) { + gap_length = multiplier * sqrt(tail_length); + } + else { + vector& memo = pessimistic_tail_gap_memo[multiplier]; + while (memo.size() <= tail_length) { + memo.emplace_back(multiplier * sqrt(memo.size())); } + gap_length = memo[tail_length]; } + return gap_length; + } + + unordered_map>> + MultipathAlignmentGraph::align_tails(const Alignment& alignment, const HandleGraph& align_graph, const GSSWAligner* aligner, + size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, double pessimistic_tail_gap_multiplier, + size_t min_paths, unordered_set* sources) { + +#ifdef debug_multipath_alignment + cerr << "doing tail alignments to:" << endl; + to_dot(cerr); +#endif + + // TODO: magic number + int64_t anchor_low_cmplx_len = 16; + + // multiplier to account for low complexity sequences + auto low_complexity_multiplier = [](string::const_iterator begin, string::const_iterator end) { + // TODO: magic numbers + static const double seq_cmplx_alpha = 0.05; + static const double max_multiplier = 32.0; + SeqComplexity<2> complexity(begin, end); + if (complexity.p_value(1) < seq_cmplx_alpha || complexity.p_value(2) < seq_cmplx_alpha) { + double repetitiveness = max(complexity.repetitiveness(1), complexity.repetitiveness(2)); + double low_complexity_len = repetitiveness * (end - begin); + // TODO: empirically-derived function, not very principled + return max(1.0, min(max_multiplier, 0.05 * low_complexity_len * low_complexity_len)); + } + else { + return 1.0; + } + }; + + // Make a structure to populate + unordered_map>> to_return; + auto& left_alignments = to_return[false]; + auto& right_alignments = to_return[true]; vector is_source_node(path_nodes.size(), true); for (size_t j = 0; j < 
path_nodes.size(); j++) { - PathNode& path_node = path_nodes[j]; - if (path_node.edges.empty()) { - if (path_node.end != alignment.sequence().end()) { - + PathNode& path_node = path_nodes.at(j); #ifdef debug_multipath_alignment - cerr << "doing right end alignment from sink node " << j << " with path " << pb2json(path_node.path) << " and sequence "; - for (auto iter = path_node.begin; iter != path_node.end; iter++) { - cerr << *iter; - } - cerr << endl; + cerr << "Visit PathNode " << j << " with " << path_node.edges.size() << " outbound edges" << endl; #endif - - Subpath* sink_subpath = multipath_aln_out.mutable_subpath(j); - - int64_t target_length = ((alignment.sequence().end() - path_node.end) + - aligner->longest_detectable_gap(alignment, path_node.end)); - pos_t end_pos = final_position(path_node.path); - // want past-the-last instead of last index here - get_offset(end_pos)++; - - VG tail_graph_extractor; - unordered_map tail_trans = algorithms::extract_extending_graph(&align_graph, - &tail_graph_extractor, - target_length, - end_pos, - false, // search forward - false); // no need to preserve cycles (in a DAG) - - size_t num_alt_alns = dynamic_alt_alns ? min(max_alt_alns, algorithms::count_walks(&tail_graph_extractor)) : - max_alt_alns; - - vector alt_alignments; - if (num_alt_alns > 0) { - - Graph& tail_graph = tail_graph_extractor.graph; - - // ensure invariants that gssw-based alignment expects - groom_graph_for_gssw(tail_graph); - - // get the sequence remaining in the right tail - Alignment right_tail_sequence; - right_tail_sequence.set_sequence(alignment.sequence().substr(path_node.end - alignment.sequence().begin(), - alignment.sequence().end() - path_node.end)); - if (!alignment.quality().empty()) { - right_tail_sequence.set_quality(alignment.quality().substr(path_node.end - alignment.sequence().begin(), - alignment.sequence().end() - path_node.end)); - } - + if (!path_node.edges.empty()) { + // We go places from here. 
+ for (const pair& edge : path_node.edges) { + // Make everywhere we go as not a source + is_source_node[edge.first] = false; #ifdef debug_multipath_alignment - cerr << "making " << num_alt_alns << " alignments of sequence: " << right_tail_sequence.sequence() << endl << "to right tail graph: " << pb2json(tail_graph) << endl; + cerr << "Edge " << j << " -> " << edge.first << " makes " << edge.first << " not a source" << endl; #endif - - if (tail_graph.node_size() == 0) { - // edge case for when a read keeps going past the end of a graph - alt_alignments.emplace_back(); - Alignment& tail_alignment = alt_alignments.back(); - tail_alignment.set_score(aligner->score_gap(right_tail_sequence.sequence().size())); - Mapping* insert_mapping = tail_alignment.mutable_path()->add_mapping(); - - // add a soft clip - Edit* edit = insert_mapping->add_edit(); - edit->set_to_length(right_tail_sequence.sequence().size()); - edit->set_sequence(right_tail_sequence.sequence()); - - // make it at the correct position - const Path& anchoring_path = path_nodes[j].path; - const Mapping& anchoring_mapping = anchoring_path.mapping(anchoring_path.mapping_size() - 1); - Position* anchoring_position = insert_mapping->mutable_position(); - anchoring_position->set_node_id(anchoring_mapping.position().node_id()); - anchoring_position->set_is_reverse(anchoring_mapping.position().is_reverse()); - anchoring_position->set_offset(anchoring_mapping.position().offset() + mapping_from_length(anchoring_mapping)); + } + } + else if (path_node.end != alignment.sequence().end()) { + #ifdef debug_multipath_alignment - cerr << "read overhangs end of graph, manually added softclip: " << pb2json(tail_alignment) << endl; + cerr << "doing right end alignment from sink node " << j << " with path " << debug_string(path_node.path) << " and sequence "; + for (auto iter = path_node.begin; iter != path_node.end; iter++) { + cerr << *iter; + } + cerr << endl; #endif - // the ID translator is empty, so add this ID here so it doesn't give an out of index error - id_t node_id = insert_mapping->position().node_id(); - tail_trans[node_id] = node_id; - } - else { - // align against the graph - - aligner->align_pinned_multi(right_tail_sequence, alt_alignments, tail_graph, true, num_alt_alns); - } + + // figure out how long we need to try to align out to + int64_t tail_length = alignment.sequence().end() - path_node.end; + int64_t gap = min(aligner->longest_detectable_gap(alignment, path_node.end), max_gap); + if (pessimistic_tail_gap_multiplier) { + gap = min(gap, pessimistic_tail_gap(tail_length, pessimistic_tail_gap_multiplier)); + } + int64_t target_length = tail_length + gap; + + + + pos_t end_pos = final_position(path_node.path); + + bdsg::HashGraph tail_graph; + unordered_map tail_trans = algorithms::extract_extending_graph(&align_graph, + &tail_graph, + target_length, + end_pos, + false, // search forward + false); // no need to preserve cycles (in a DAG) + + size_t num_alt_alns; + if (dynamic_alt_alns) { + size_t num_paths = handlealgs::count_walks(&tail_graph); + if (num_paths < min_paths) { + continue; + } + num_alt_alns = min(max_alt_alns, num_paths); + } + else { + num_alt_alns = max_alt_alns; + } + + if (num_alt_alns > 0) { + + // get the sequence remaining in the right tail + Alignment right_tail_sequence; + right_tail_sequence.set_sequence(alignment.sequence().substr(path_node.end - alignment.sequence().begin(), + alignment.sequence().end() - path_node.end)); + if (!alignment.quality().empty()) { + 
right_tail_sequence.set_quality(alignment.quality().substr(path_node.end - alignment.sequence().begin(), + alignment.sequence().end() - path_node.end)); } #ifdef debug_multipath_alignment - cerr << "made " << alt_alignments.size() << " tail alignments" << endl; + cerr << "making " << num_alt_alns << " alignments of sequence: " << right_tail_sequence.sequence() << endl << "to right tail graph" << endl; + tail_graph.for_each_handle([&](const handle_t& handle) { + cerr << tail_graph.get_id(handle) << " " << tail_graph.get_sequence(handle) << endl; + tail_graph.follow_edges(handle, true, [&](const handle_t& prev) { + cerr << "\t" << tail_graph.get_id(prev) << " <-" << endl; + }); + tail_graph.follow_edges(handle, false, [&](const handle_t& next) { + cerr << "\t-> " << tail_graph.get_id(next) << endl; + }); + }); #endif - const Mapping& final_mapping = path_node.path.mapping(path_node.path.mapping_size() - 1); - - for (Alignment& tail_alignment : alt_alignments) { - - sink_subpath->add_next(multipath_aln_out.subpath_size()); + // align against the graph + auto& alt_alignments = right_alignments[j]; + if (num_alt_alns == 1) { +#ifdef debug_multipath_alignment + cerr << "align right with dozeu with gap " << gap << endl; +#endif + // we can speed things up by using the dozeu pinned alignment + alt_alignments.emplace_back(move(right_tail_sequence)); + aligner->align_pinned(alt_alignments.back(), tail_graph, true, true, gap); + } + else { - Subpath* tail_subpath = multipath_aln_out.add_subpath(); - *tail_subpath->mutable_path() = tail_alignment.path(); - tail_subpath->set_score(tail_alignment.score()); +#ifdef debug_multipath_alignment + cerr << "align right with gssw" << endl; +#endif - translate_node_ids(*tail_subpath->mutable_path(), tail_trans); - Mapping* first_mapping = tail_subpath->mutable_path()->mutable_mapping(0); - if (first_mapping->position().node_id() == final_mapping.position().node_id()) { - first_mapping->mutable_position()->set_offset(offset(end_pos)); - } - else if (tail_alignment.path().mapping_size() == 1 && first_mapping->edit_size() == 1 - && first_mapping->edit(0).from_length() == 0 && first_mapping->edit(0).to_length() > 0 - && first_mapping->position().node_id() != final_mapping.position().node_id()) { - // this is a pure soft-clip on the beginning of the next node, we'll move it to the end - // of the match node to match invariants expected by other parts of the code base - Position* pos = first_mapping->mutable_position(); - pos->set_node_id(final_mapping.position().node_id()); - pos->set_is_reverse(final_mapping.position().is_reverse()); - pos->set_offset(final_mapping.position().offset() + mapping_from_length(final_mapping)); + // TODO: it would be cleaner to handle these in one multiplier, but i already tuned + // this for the tails and i don't feel like doing that again... + double tail_multiplier = low_complexity_multiplier(right_tail_sequence.sequence().begin(), + right_tail_sequence.sequence().end()); + double anchor_multiplier = low_complexity_multiplier(max(path_node.end - anchor_low_cmplx_len, + alignment.sequence().begin()), + path_node.end); + double multiplier = max(tail_multiplier, anchor_multiplier); + if (multiplier != 1.0) { + num_alt_alns = round(multiplier * num_alt_alns); +#ifdef debug_multipath_alignment + cerr << "increase num alns for low complexity sequence to " << num_alt_alns << endl; +#endif } + aligner->align_pinned_multi(right_tail_sequence, alt_alignments, tail_graph, true, num_alt_alns); + } + + // Translate back into non-extracted graph. 
+ // Make sure to account for having removed the left end of the cut node relative to end_pos + for (auto& aln : alt_alignments) { + // We always remove end_pos's offset, since we + // search forward from it to extract the subgraph, + // but that may be from the left or right end of + // its node depending on orientation. + translate_node_ids(*aln.mutable_path(), tail_trans, id(end_pos), offset(end_pos), is_rev(end_pos)); + } #ifdef debug_multipath_alignment - cerr << "subpath from " << j << " to right tail:" << endl; - cerr << pb2json(*tail_subpath) << endl; -#endif + cerr << "made " << alt_alignments.size() << " tail alignments" << endl; + for (size_t i = 0; i < alt_alignments.size(); ++i) { + cerr << i << ": " << pb2json(alt_alignments[i]) << endl; } - } - } - else { - for (const pair& edge : path_node.edges) { - is_source_node[edge.first] = false; +#endif } } } + // Now we know all the sources so we can do them and the left tails. for (size_t j = 0; j < path_nodes.size(); j++) { if (is_source_node[j]) { + // This is a source, whether or not it has a tail to the left. + +#ifdef debug_multipath_alignment + cerr << "PathNode " << j << " had no edges into it and is a source" << endl; +#endif + + if (sources != nullptr) { + // Remember it. + sources->insert(j); + } + PathNode& path_node = path_nodes[j]; if (path_node.begin != alignment.sequence().begin()) { - int64_t target_length = (path_node.begin - alignment.sequence().begin()) + - aligner->longest_detectable_gap(alignment, path_node.begin); + // figure out how far we need to try to align out to + int64_t tail_length = path_node.begin - alignment.sequence().begin(); + int64_t gap = min(aligner->longest_detectable_gap(alignment, path_node.begin), max_gap); + if (pessimistic_tail_gap_multiplier) { + gap = min(gap, pessimistic_tail_gap(tail_length, pessimistic_tail_gap_multiplier)); + } + int64_t target_length = tail_length + gap; + pos_t begin_pos = initial_position(path_node.path); - VG tail_graph_extractor; + bdsg::HashGraph tail_graph; unordered_map tail_trans = algorithms::extract_extending_graph(&align_graph, - &tail_graph_extractor, + &tail_graph, target_length, begin_pos, true, // search backward false); // no need to preserve cycles (in a DAG) - size_t num_alt_alns = dynamic_alt_alns ? 
min(max_alt_alns, algorithms::count_walks(&tail_graph_extractor)) : - max_alt_alns; + size_t num_alt_alns; + if (dynamic_alt_alns) { + size_t num_paths = handlealgs::count_walks(&tail_graph); + if (num_paths < min_paths) { + continue; + } + num_alt_alns = min(max_alt_alns, num_paths); + } + else { + num_alt_alns = max_alt_alns; + } - vector alt_alignments; if (num_alt_alns > 0) { - - Graph& tail_graph = tail_graph_extractor.graph; - // ensure invariants that gssw-based alignment expects - groom_graph_for_gssw(tail_graph); Alignment left_tail_sequence; left_tail_sequence.set_sequence(alignment.sequence().substr(0, path_node.begin - alignment.sequence().begin())); @@ -3185,178 +6211,202 @@ namespace vg { left_tail_sequence.set_quality(alignment.quality().substr(0, path_node.begin - alignment.sequence().begin())); } - #ifdef debug_multipath_alignment - cerr << "making " << num_alt_alns << " alignments of sequence: " << left_tail_sequence.sequence() << endl << "to left tail graph: " << pb2json(tail_graph) << endl; + cerr << "making " << num_alt_alns << " alignments of sequence: " << left_tail_sequence.sequence() << endl << "to left tail graph" << endl; + tail_graph.for_each_handle([&](const handle_t& handle) { + cerr << tail_graph.get_id(handle) << " " << tail_graph.get_sequence(handle) << endl; + tail_graph.follow_edges(handle, false, [&](const handle_t& next) { + cerr << "\t-> " << tail_graph.get_id(next) << endl; + }); + tail_graph.follow_edges(handle, true, [&](const handle_t& prev) { + cerr << "\t" << tail_graph.get_id(prev) << " <-" << endl; + }); + }); #endif - if (tail_graph.node_size() == 0) { - // edge case for when a read keeps going past the end of a graph - alt_alignments.emplace_back(); - Alignment& tail_alignment = alt_alignments.back(); - tail_alignment.set_score(aligner->score_gap(left_tail_sequence.sequence().size())); - Mapping* insert_mapping = tail_alignment.mutable_path()->add_mapping(); - - // add a soft clip - Edit* edit = insert_mapping->add_edit(); - edit->set_to_length(left_tail_sequence.sequence().size()); - edit->set_sequence(left_tail_sequence.sequence()); - - // make it at the correct position - *insert_mapping->mutable_position() = path_nodes[j].path.mapping(0).position(); + + // align against the graph + auto& alt_alignments = left_alignments[j]; + if (num_alt_alns == 1) { #ifdef debug_multipath_alignment - cerr << "read overhangs end of graph, manually added softclip: " << pb2json(tail_alignment) << endl; + cerr << "align left with dozeu using gap " << gap << endl; #endif - // the ID translator is empty, so add this ID here so it doesn't give an out of index error - id_t node_id = insert_mapping->position().node_id(); - tail_trans[node_id] = node_id; + // we can speed things up by using the dozeu pinned alignment + alt_alignments.emplace_back(move(left_tail_sequence)); + aligner->align_pinned(alt_alignments.back(), tail_graph, false, true, gap); } else { - aligner->align_pinned_multi(left_tail_sequence, alt_alignments, tail_graph, false, num_alt_alns); - } - } - #ifdef debug_multipath_alignment - cerr << "made " << alt_alignments.size() << " tail alignments" << endl; + cerr << "align left with gssw" << endl; #endif - const Mapping& first_mapping = path_node.path.mapping(0); - for (Alignment& tail_alignment : alt_alignments) { - Subpath* tail_subpath = multipath_aln_out.add_subpath(); - *tail_subpath->mutable_path() = tail_alignment.path(); - tail_subpath->set_score(tail_alignment.score()); - - tail_subpath->add_next(j); - 
multipath_aln_out.add_start(multipath_aln_out.subpath_size() - 1); - - translate_node_ids(*tail_subpath->mutable_path(), tail_trans); + double anchor_multiplier = low_complexity_multiplier(path_node.begin, + min(path_node.begin + anchor_low_cmplx_len, + alignment.sequence().end())); + double tail_multiplier = low_complexity_multiplier(left_tail_sequence.sequence().begin(), + left_tail_sequence.sequence().end()); + double multiplier = max(tail_multiplier, anchor_multiplier); + if (multiplier != 1.0) { + num_alt_alns = round(multiplier * num_alt_alns); #ifdef debug_multipath_alignment - cerr << "subpath from " << j << " to left tail:" << endl; - cerr << pb2json(*tail_subpath) << endl; + cerr << "increase num alns for low complexity sequence to " << num_alt_alns << endl; #endif - Mapping* final_mapping = tail_subpath->mutable_path()->mutable_mapping(tail_subpath->path().mapping_size() - 1); - if (tail_subpath->path().mapping_size() == 1 && final_mapping->edit_size() == 1 - && final_mapping->edit(0).from_length() == 0 && final_mapping->edit(0).to_length() > 0 - && final_mapping->position().node_id() != first_mapping.position().node_id()) { - // this is a pure soft clip on the end of the previous node, so we move it over to the - // beginning of the match node to match invariants in rest of code base - *final_mapping->mutable_position() = first_mapping.position(); + } + + aligner->align_pinned_multi(left_tail_sequence, alt_alignments, tail_graph, false, num_alt_alns); + } + + // Translate back into non-extracted graph. + // Make sure to account for having removed the right end of the cut node relative to begin_pos + for (auto& aln : alt_alignments) { + // begin_pos's offset is how much we keep, so we removed the node's length minus that. + // And by default it comes off the right side of the node. 
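+ // (For the right tail above, the removed amount is offset(end_pos) itself; here it is the rest of the node.)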
+ translate_node_ids(*aln.mutable_path(), tail_trans, + id(begin_pos), + align_graph.get_length(align_graph.get_handle(id(begin_pos))) - offset(begin_pos), + !is_rev(begin_pos)); + } +#ifdef debug_multipath_alignment + cerr << "made " << alt_alignments.size() << " tail alignments" << endl; + for (size_t i = 0; i < alt_alignments.size(); ++i) { + cerr << i << ": " << pb2json(alt_alignments[i]) << endl; } +#endif } } - else { - multipath_aln_out.add_start(j); - } - } - } - } - - void MultipathAlignmentGraph::groom_graph_for_gssw(Graph& graph) { - - // remove empty nodes - size_t end = graph.node_size(); - size_t idx = 0; - unordered_set removed_nodes; - while (idx < end) { - if (graph.node(idx).sequence().empty()) { - end--; - removed_nodes.insert(graph.node(idx).id()); - std::swap(*graph.mutable_node(idx), *graph.mutable_node(end)); - } - else { - idx++; } } - if (end != graph.node_size()) { - graph.mutable_node()->DeleteSubrange(end, graph.node_size() - end); - - // look for any edges connecting them and remove these too - end = graph.edge_size(); - idx = 0; - while (idx < end) { - Edge* edge = graph.mutable_edge(idx); - if (removed_nodes.count(edge->from()) || removed_nodes.count(edge->to())) { - end--; - std::swap(*edge, *graph.mutable_edge(end)); - } - else { - idx++; - } - } - graph.mutable_edge()->DeleteSubrange(end, graph.edge_size() - end); + + // GSSW does some weird things with N's that we want to normalize away + if (find(alignment.sequence().begin(), alignment.sequence().end(), 'N') != alignment.sequence().end()) { + for (bool side : {true, false}) { + for (pair>& tail_alignments : to_return[side]) { + for (Alignment& aln : tail_alignments.second) { + normalize_alignment(aln); + } + } + } + } + +#ifdef debug_multipath_alignment + cerr << "made alignments for " << to_return[false].size() << " source PathNodes and " << to_return[true].size() << " sink PathNodes" << endl; + if (sources != nullptr) { + cerr << "Identified " << sources->size() << " source PathNodes" << endl; } +#endif - // flip doubly reversing edges - for (size_t i = 0; i < graph.edge_size(); i++) { - Edge* edge = graph.mutable_edge(i); - if (edge->from_start() && edge->to_end()) { - id_t tmp = edge->from(); - edge->set_from(edge->to()); - edge->set_to(tmp); - edge->set_from_start(false); - edge->set_to_end(false); + // Return all the alignments, organized by tail and subpath + return to_return; + } + + bool MultipathAlignmentGraph::empty() const { + return path_nodes.empty(); + } + + size_t MultipathAlignmentGraph::size() const { + return path_nodes.size(); + } + + size_t MultipathAlignmentGraph::max_shift() const { + size_t shift = 0; + for (const auto& path_node : path_nodes) { + for (const auto& edge : path_node.edges) { + const auto& next_node = path_nodes[edge.first]; + shift = max(shift, abs((next_node.begin - path_node.end) - edge.second)); } } + return shift; + } + + void MultipathAlignmentGraph::prune_high_shift_edges(size_t prune_diff, bool prohibit_new_sources, bool prohibit_new_sinks) { - // associate node ids with their index - unordered_map node_idx; - for (size_t i = 0; i < graph.node_size(); i++) { - node_idx[graph.node(i).id()] = i; - } + vector min_shift_fwd(path_nodes.size(), numeric_limits::max()); + vector min_shift_rev(path_nodes.size(), numeric_limits::max()); - // construct adjacency list and compute in degrees - vector in_degree(graph.node_size(), 0); - vector> adj_list(graph.node_size()); - for (size_t i = 0; i < graph.edge_size(); i++) { - const Edge& edge = graph.edge(i); - size_t to_idx 
= node_idx[edge.to()]; - adj_list[node_idx[edge.from()]].push_back(to_idx); - in_degree[to_idx]++; + // compute the min shift with reverse DP and also compute in degrees + vector in_degree(path_nodes.size(), 0); + for (int64_t i = path_nodes.size() - 1; i >= 0; --i) { + auto& path_node = path_nodes[i]; + if (path_node.edges.empty()) { + min_shift_rev[i] = 0; + } + else { + for (auto& edge : path_node.edges) { + + ++in_degree[edge.first]; + + const auto& next_node = path_nodes[edge.first]; + size_t shift = abs((next_node.begin - path_node.end) - edge.second); + +#ifdef debug_shift_pruning + cerr << "shift DP reverse " << i << " <- " << edge.first << " with shift " << shift << " for total " << min_shift_rev[edge.first] + shift << endl; +#endif + + min_shift_rev[i] = min(min_shift_rev[i], min_shift_rev[edge.first] + shift); + } + } } - // get the topological ordering of the graph (Kahn's algorithm) - vector source_stack; - for (size_t i = 0; i < graph.node_size(); i++) { + // compute the min shift the forward direction and find the optimal shift + size_t opt_shift = numeric_limits::max(); + for (size_t i = 0; i < path_nodes.size(); ++i) { + if (in_degree[i] == 0) { - source_stack.push_back(i); + min_shift_fwd[i] = 0; } - } - - vector order(graph.node_size()); - size_t next = 0; - while (!source_stack.empty()) { - size_t src = source_stack.back(); - source_stack.pop_back(); - for (size_t dest : adj_list[src]) { - in_degree[dest]--; - if (in_degree[dest] == 0) { - source_stack.push_back(dest); + auto& path_node = path_nodes[i]; + if (path_node.edges.empty()) { + opt_shift = min(opt_shift, min_shift_fwd[i]); + } + else { + for (auto& edge : path_node.edges) { + const auto& next_node = path_nodes[edge.first]; + size_t shift = abs((next_node.begin - path_node.end) - edge.second); +#ifdef debug_shift_pruning + cerr << "shift DP forward " << i << " -> " << edge.first << " with shift " << shift << " for total " << min_shift_fwd[i] + shift << endl; +#endif + min_shift_fwd[edge.first] = min(min_shift_fwd[edge.first], min_shift_fwd[i] + shift); } } - - order[next] = src; - next++; } - - // identify the index that we want each node to end up at - vector index(order.size()); - for (size_t i = 0; i < order.size(); i++) { - index[order[i]] = i; +#ifdef debug_shift_pruning + cerr << "min forward and backward shift:" << endl; + for (size_t i = 0; i < path_nodes.size(); ++i) { + cerr << "\t" << i << ":\t" << min_shift_fwd[i] << "\t" << min_shift_rev[i] << endl; } +#endif - // in place permutation according to the topological order - for (size_t i = 0; i < graph.node_size(); i++) { - while (index[i] != i) { - std::swap(*graph.mutable_node(i), *graph.mutable_node(index[i])); - std::swap(index[i], index[index[i]]); + // prune edges as necessary + for (size_t i = 0; i < path_nodes.size(); ++i) { + auto& path_node = path_nodes[i]; + size_t removed = 0; + for (size_t j = 0; j < path_node.edges.size(); ++j) { + auto& edge = path_node.edges[j]; + const auto& next_node = path_nodes[edge.first]; + size_t shift = abs((next_node.begin - path_node.end) - edge.second); + + size_t min_edge_shift = min_shift_fwd[i] + shift + min_shift_rev[edge.first]; + + // TODO: we might choose a sub-optimal set of these by just greedily removing edges + if (min_edge_shift > opt_shift + prune_diff + && (!prohibit_new_sinks || removed + 1 < path_node.edges.size()) + && (!prohibit_new_sources || in_degree[edge.first] > 1)) { +#ifdef debug_shift_pruning + cerr << "removing edge " << i << " -> " << edge.first << " with min shift " << 
min_edge_shift << " compared to opt shift " << opt_shift << endl; +#endif + ++removed; + --in_degree[edge.first]; + } + else if (removed) { + path_node.edges[j - removed] = path_node.edges[j]; + } + } + if (removed) { + path_node.edges.resize(path_node.edges.size() - removed); } } } - bool MultipathAlignmentGraph::empty() { - return path_nodes.empty(); - } - - void MultipathAlignmentGraph::to_dot(ostream& out) const { + void MultipathAlignmentGraph::to_dot(ostream& out, const Alignment* alignment) const { // We track the VG graph nodes we talked about already. set mentioned_nodes; set> mentioned_edges; @@ -3366,18 +6416,24 @@ namespace vg { for (size_t i = 0; i < path_nodes.size(); i++) { // For each node, say the node itself as a mapping node, annotated with match length out << "m" << i << " [label=\"" << i << "\" shape=circle tooltip=\"" - << (path_nodes[i].end - path_nodes[i].begin) << " bp\"];" << endl; - for (pair edge : path_nodes[i].edges) { + << (path_nodes.at(i).end - path_nodes.at(i).begin) << " bp"; + if (alignment != nullptr) { + // Include info on where that falls in the query sequence + out << " (" << (path_nodes.at(i).begin - alignment->sequence().begin()) << " - " + << (path_nodes.at(i).end - alignment->sequence().begin()) << ")"; + } + out << "\"];" << endl; + for (pair edge : path_nodes.at(i).edges) { // For each edge from it, say where it goes and how far it skips out << "m" << i << " -> m" << edge.first << " [label=" << edge.second << "];" << endl; } - auto& path = path_nodes[i].path; + auto& path = path_nodes.at(i).path; for (size_t j = 0; j < path.mapping_size(); j++) { // For each mapping in the path, show the vg node in the graph too auto node_id = path.mapping(j).position().node_id(); if (!mentioned_nodes.count(node_id)) { - // This graph node eneds to be made + // This graph node needs to be made mentioned_nodes.insert(node_id); out << "g" << node_id << " [label=\"" << node_id << "\" shape=box];" << endl; } @@ -3401,6 +6457,24 @@ namespace vg { } out << "}" << endl; } + + bool MultipathAlignmentGraph::into_cutting_snarl(id_t node_id, bool is_rev, + SnarlManager* snarl_manager, SnarlDistanceIndex* dist_index) const { + + if (dist_index) { + auto result = dist_index->into_which_snarl(node_id, is_rev); + // points into a snarl and snarl is nontrivial + return get<0>(result) != 0 && !get<2>(result); + } + else if (snarl_manager) { + // no mechanism to check for triviality without traversing graph + return snarl_manager->into_which_snarl(node_id, is_rev); + } + else { + // no snarls provided + return false; + } + } vector> MultipathAlignmentGraph::get_connected_components() const { // Brea all the path_nodes into components using this union-find @@ -3408,7 +6482,7 @@ namespace vg { for (size_t i = 0; i < path_nodes.size(); i++) { // For each node - for (auto& edge : path_nodes[i].edges) { + for (auto& edge : path_nodes.at(i).edges) { // For each outgoing edge... // Union both ends into the same component. 
@@ -3427,7 +6501,7 @@ namespace vg { for (auto& path_node : component) { // For each path in the vg graph used by the component - auto& path = path_nodes[path_node].path; + auto& path = path_nodes.at(path_node).path; for (auto& mapping : path.mapping()) { // For each mapping in the path, remember its vg graph node diff --git a/src/multipath_alignment_graph.hpp b/src/multipath_alignment_graph.hpp index 634e22e4afb..bb7abbd9889 100644 --- a/src/multipath_alignment_graph.hpp +++ b/src/multipath_alignment_graph.hpp @@ -11,9 +11,9 @@ #include #include #include +#include -#include "vg.pb.h" -#include "vg.hpp" +#include #include "snarls.hpp" #include "multipath_mapper.hpp" @@ -25,7 +25,7 @@ namespace vg { public: string::const_iterator begin; string::const_iterator end; - Path path; + path_t path; // pairs of (target index, path length) vector> edges; @@ -42,41 +42,83 @@ namespace vg { /// making it a member. static unordered_multimap> create_injection_trans(const unordered_map>& projection_trans); + /// Create the constant injection translation data from a function instead + /// of a map + static unordered_multimap> create_injection_trans(const HandleGraph& graph, + const function(id_t)>& project); + + /// Create an identity projection translation from a DAG that did not + /// need to be modified during dagification. + static unordered_map> create_identity_projection_trans(const HandleGraph& graph); + + /// Create a lambda function that projects using a projection map + static function(id_t)> create_projector(const unordered_map>& projection_trans); + /// Construct a graph of the reachability between MEMs in a DAG-ified /// graph. If a GCSA is specified, use it to collapse MEMs whose /// lengths bump up against the GCSA's order limit on MEM length. /// Produces a graph with reachability edges. Assumes that the cluster /// is sorted primarily by length and secondarily lexicographically by /// read interval. - MultipathAlignmentGraph(VG& vg, const MultipathMapper::memcluster_t& hits, - const unordered_map>& projection_trans, + /// If a hit fails to be walked out in the graph, it is removed from hits. + MultipathAlignmentGraph(const HandleGraph& graph, MultipathMapper::memcluster_t& hits, + const function(id_t)>& project, const unordered_multimap>& injection_trans, - gcsa::GCSA* gcsa = nullptr); + vector& path_node_provenance, + size_t max_branch_trim_length = 0, gcsa::GCSA* gcsa = nullptr, + const MultipathMapper::match_fanouts_t* fanout_breaks = nullptr); /// Same as the previous constructor, but construct injection_trans implicitly and temporarily. 
- MultipathAlignmentGraph(VG& vg, const MultipathMapper::memcluster_t& hits, + MultipathAlignmentGraph(const HandleGraph& graph, MultipathMapper::memcluster_t& hits, const unordered_map>& projection_trans, - gcsa::GCSA* gcsa = nullptr); + vector& path_node_provenance, + size_t max_branch_trim_length = 0, gcsa::GCSA* gcsa = nullptr, + const MultipathMapper::match_fanouts_t* fanout_breaks = nullptr); - /// Construct a graph of the reachability between MEMs in a linearized + /// Same as the previous constructor, but construct injection_trans implicitly and temporarily + /// using a lambda for a projector + MultipathAlignmentGraph(const HandleGraph& graph, MultipathMapper::memcluster_t& hits, + const function(id_t)>& project, + vector& path_node_provenance, + size_t max_branch_trim_length = 0, gcsa::GCSA* gcsa = nullptr, + const MultipathMapper::match_fanouts_t* fanout_breaks = nullptr); + + /// Construct a graph of the reachability between aligned chunks in a linearized /// path graph. Produces a graph with reachability edges. - MultipathAlignmentGraph(VG& vg, const vector, Path>>& path_chunks, - const Alignment& alignment, const unordered_map>& projection_trans, - const unordered_multimap>& injection_trans); + MultipathAlignmentGraph(const HandleGraph& graph, const vector, Path>>& path_chunks, + const Alignment& alignment, const function(id_t)>& project, + const unordered_multimap>& injection_trans, bool realign_Ns = true, + bool preserve_tail_anchors = false, vector* path_node_provenance = nullptr); /// Same as the previous constructor, but construct injection_trans implicitly and temporarily - MultipathAlignmentGraph(VG& vg, const vector, Path>>& path_chunks, - const Alignment& alignment, const unordered_map>& projection_trans); + MultipathAlignmentGraph(const HandleGraph& graph, const vector, Path>>& path_chunks, + const Alignment& alignment, const unordered_map>& projection_trans, bool realign_Ns = true, + bool preserve_tail_anchors = false, vector* path_node_provenance = nullptr); - /// Make a multipath alignment graph using the path of a single-path alignment - MultipathAlignmentGraph(VG& vg, const Alignment& alignment, SnarlManager& snarl_manager, size_t max_snarl_cut_size, - const unordered_map>& projection_trans, + /// Same as the previous constructor, but construct injection_trans implicitly and temporarily + /// and using a lambda for a projector + MultipathAlignmentGraph(const HandleGraph& graph, const vector, Path>>& path_chunks, + const Alignment& alignment, const function(id_t)>& project, bool realign_Ns = true, + bool preserve_tail_anchors = false, vector* path_node_provenance = nullptr); + + /// Make a multipath alignment graph using the path of a single-path alignment. Only + /// one of snarl_manager and dist_index need be supplied. 
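+ /// Pass nullptr for whichever of the two is not used.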
+ MultipathAlignmentGraph(const HandleGraph& graph, const Alignment& alignment, SnarlManager* snarl_manager, + SnarlDistanceIndex* dist_index, size_t max_snarl_cut_size, + const function(id_t)>& project, const unordered_multimap>& injection_trans); /// Same as the previous constructor, but construct injection_trans implicitly and temporarily - MultipathAlignmentGraph(VG& vg, const Alignment& alignment, SnarlManager& snarl_manager, size_t max_snarl_cut_size, + MultipathAlignmentGraph(const HandleGraph& graph, const Alignment& alignment, SnarlManager* snarl_manager, + SnarlDistanceIndex* dist_index, size_t max_snarl_cut_size, const unordered_map>& projection_trans); + /// Same as the previous constructor, but construct injection_trans implicitly and temporarily + /// using a function instead of a map + MultipathAlignmentGraph(const HandleGraph& graph, const Alignment& alignment, SnarlManager* snarl_manager, + SnarlDistanceIndex* dist_index, size_t max_snarl_cut_size, + const function(id_t)>& project); + ~MultipathAlignmentGraph(); /// Fills input vector with node indices of a topological sort. @@ -84,57 +126,116 @@ namespace vg { void topological_sort(vector& order_out); /// Removes non-softclip indels from path nodes. Does not update edges--should be called - /// prior to adding computing edges. - void trim_hanging_indels(const Alignment& alignment); + /// prior to adding computing edges. If preserve tail anchors is true, then a null anchor (no + /// bases and no path) will be preserved if the read segment orresponds to the beginning or + /// end of the alignment sequence. + void trim_hanging_indels(const Alignment& alignment, bool trim_Ns = true, bool preserve_tail_anchors = false); - /// Removes all transitive edges from graph (reduces to minimum equivalent graph). + /// Removes all transitive edges from graph (reduces to minimum equivalent graph), + /// except for edges between path nodes that abut either on the graph or read. These + /// edges often correspond to overlap breaks in low complexity sequence, so retaining + /// them improves alignment in low-complexity regions like STR expansions. /// Note: reorders internal representation of adjacency lists. /// Reachability edges must be in the graph. void remove_transitive_edges(const vector& topological_order); /// Removes nodes and edges that are not part of any path that has an estimated score /// within some amount of the highest scoring path. Reachability edges must be present. - void prune_to_high_scoring_paths(const Alignment& alignment, const BaseAligner* aligner, - double max_suboptimal_score_ratio, const vector& topological_order); + void prune_to_high_scoring_paths(const Alignment& alignment, const GSSWAligner* aligner, + double max_suboptimal_score_ratio, const vector& topological_order, + vector& path_node_provenance); /// Clear reachability edges, so that add_reachability_edges can be run /// (possibly after modifying the graph). void clear_reachability_edges(); + /// Get the number of reachability edges in the graph. + size_t count_reachability_edges() const; + + /// Remove the ends of paths, up to a maximum length, if they cause the path + /// to extend past a branch point in the graph. + void trim_to_branch_points(const HandleGraph* graph, size_t max_trim_length = 1); + /// Cut the interior of snarls out of anchoring paths (and split /// alignment nodes accordingly) unless they are longer than the max - /// cut size. Reachability edges must be cleared. 
- void resect_snarls_from_paths(SnarlManager* cutting_snarls, const unordered_map>& projection_trans, - int64_t max_snarl_cut_size = 5); + /// cut size. Snarls can be stored either in a SnarlManager or a + /// SnarlDistanceIndex (only one need be supplied). + void resect_snarls_from_paths(SnarlManager* cutting_snarls, SnarlDistanceIndex* dist_index, + const function(id_t)>& project, int64_t max_snarl_cut_size = 5); + + /// Do some exploratory alignments of the tails of the graph, outside + /// the outermost existing anchors, and define new anchoring paths from + /// them. After this, you can call resect_snarls_from_paths, in order + /// to get better coverage of possible combinations of snarl traversals + /// in parts of the alignment that didn't originally have anchors. + /// Produces *only* perfect match anchors, so it is still safe to use + /// score_anchors_as_matches. The Alignment passed *must* be the same + /// Alignment that owns the sequence into which iterators were passed + /// when the MultipathAlignmentGraph was constructed! TODO: Shouldn't + /// the class hold a reference to the Alignment then? + void synthesize_tail_anchors(const Alignment& alignment, const HandleGraph& align_graph, const GSSWAligner* aligner, + size_t min_anchor_size, size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, + double pessimistic_tail_gap_multiplier); /// Add edges between reachable nodes and split nodes at overlaps - void add_reachability_edges(VG& vg, - const unordered_map>& projection_trans, - const unordered_multimap>& injection_trans); + void add_reachability_edges(const HandleGraph& vg, + const function(id_t)>& project, + const unordered_multimap>& injection_trans, + vector* path_node_provenance = nullptr); - /// Do intervening and tail alignments between the anchoring paths and store the result - /// in a MultipathAlignment. Reachability edges must be in the graph. - void align(const Alignment& alignment, VG& align_graph, BaseAligner* aligner, bool score_anchors_as_matches, - size_t max_alt_alns, bool dynamic_alt_alns, size_t band_padding, MultipathAlignment& multipath_aln_out); - - /// Do intervening and tail alignments between the anchoring paths and store the result - /// in a MultipathAlignment. Reachability edges must be in the graph. Also, choose the - /// band padding dynamically as a function of the inter-MEM sequence and graph - void align(const Alignment& alignment, VG& align_graph, BaseAligner* aligner, bool score_anchors_as_matches, - size_t max_alt_alns, bool dynamic_alt_alns, - function band_padding_function, - MultipathAlignment& multipath_aln_out); + /// Do intervening and tail alignments between the anchoring paths and + /// store the result in a multipath_alignment_t. Reachability edges must + /// be in the graph. The Alignment passed *must* be the same Alignment + /// that owns the sequence into which iterators were passed when the + /// MultipathAlignmentGraph was constructed! TODO: Shouldn't the class + /// hold a reference to the Alignment then? + /// + /// Note that the output alignment may NOT be in topologically-sorted + /// order, even if this MultipathAlignmentGraph is. You MUST sort it + /// with topologically_order_subpaths() before trying to run DP on it. 
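+ /// This overload takes a fixed band padding; the overload below chooses it dynamically.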
+ void align(const Alignment& alignment, const HandleGraph& align_graph, const GSSWAligner* aligner, bool score_anchors_as_matches, + size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, double pessimistic_tail_gap_multiplier, bool simplify_topologies, + size_t unmergeable_len, size_t band_padding, multipath_alignment_t& multipath_aln_out, SnarlManager* cutting_snarls = nullptr, + SnarlDistanceIndex* dist_index = nullptr, const function(id_t)>* project = nullptr, + bool allow_negative_scores = false); + + /// Do intervening and tail alignments between the anchoring paths and + /// store the result in a multipath_alignment_t. Reachability edges must + /// be in the graph. Also, choose the band padding dynamically as a + /// function of the inter-MEM sequence and graph. The Alignment passed + /// *must* be the same Alignment that owns the sequence into which + /// iterators were passed when the MultipathAlignmentGraph was + /// constructed! TODO: Shouldn't the class hold a reference to the + /// Alignment then? + /// + /// Note that the output alignment may NOT be in topologically-sorted + /// order, even if this MultipathAlignmentGraph is. You MUST sort it + /// with topologically_order_subpaths() before trying to run DP on it. + void align(const Alignment& alignment, const HandleGraph& align_graph, const GSSWAligner* aligner, bool score_anchors_as_matches, + size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, double pessimistic_tail_gap_multiplier, bool simplify_topologies, + size_t unmergeable_len, function band_padding_function, + multipath_alignment_t& multipath_aln_out, SnarlManager* cutting_snarls = nullptr, SnarlDistanceIndex* dist_index = nullptr, + const function(id_t)>* project = nullptr, bool allow_negative_scores = false); /// Converts a MultipathAlignmentGraph to a GraphViz Dot representation, output to the given ostream. - void to_dot(ostream& out) const; + /// If given the Alignment query we are working on, can produce information about subpath iterators. + void to_dot(ostream& out, const Alignment* alignment = nullptr) const; /// Get lists of the vg node IDs that participate in each connected component in the MultipathAlignmentGraph vector> get_connected_components() const; - /// Does the multipath alignment xgraph have any nodes? - bool empty(); + /// Does the multipath alignment graph have any nodes? + bool empty() const; + + size_t size() const; - private: + /// For a graph with reachability edges, identifies the largest difference between read interval and + /// reference distance + size_t max_shift() const; + + void prune_high_shift_edges(size_t prune_diff, bool prohibit_new_sources, bool prohibit_new_sinks); + + protected: /// Nodes representing walked MEMs in the graph vector path_nodes; @@ -147,30 +248,119 @@ namespace vg { /// If this is unset and you want it set, use add_reachability_edges(). bool has_reachability_edges = false; + /// Trim down the given PathNode of everything except softclips. + /// Return true if it all gets trimmed away and should be removed. + /// Fills in removed_start_from_length and/or removed_end_from_length + /// with the bases in the graph removed from the path on each end + /// during trimming, if set. If preserve tail anchors is true, then a null + /// anchor (no bases and no path) will be preserved if the read segment + /// corresponds to the beginning or end of the alignment sequence. 
+ static bool trim_and_check_for_empty(const Alignment& alignment, bool trim_Ns, PathNode& path_node, + bool preserve_tail_anchors, int64_t* removed_start_from_length = nullptr, + int64_t* removed_end_from_length = nullptr); /// Add the path chunks as nodes to the connectivity graph - void create_path_chunk_nodes(VG& vg, const vector, Path>>& path_chunks, - const Alignment& alignment, const unordered_map>& projection_trans, - const unordered_multimap>& injection_trans); + void create_path_chunk_nodes(const HandleGraph& graph, const vector, Path>>& path_chunks, + const Alignment& alignment, const function(id_t)>& project, + const unordered_multimap>& injection_trans, + vector* path_node_provenance = nullptr); /// Walk out MEMs into match nodes and filter out redundant sub-MEMs - void create_match_nodes(VG& vg, const MultipathMapper::memcluster_t& hits, - const unordered_map>& projection_trans, - const unordered_multimap>& injection_trans); + void create_match_nodes(const HandleGraph& graph, MultipathMapper::memcluster_t& hits, + const function(id_t)>& project, + const unordered_multimap>& injection_trans, + vector& path_node_provenance, + int64_t max_branch_trim_length, + const MultipathMapper::match_fanouts_t* fanout_breaks); + + /// If path nodes partially overlap, merge the sections that overlap into a single path node + void merge_partially_redundant_match_nodes(const unordered_map>& node_matches, + vector& path_node_provenance); + + void jitter_homopolymer_ends(const HandleGraph& graph, + vector& path_node_provenance, + const MultipathMapper::memcluster_t& hits, + int64_t max_branch_trim_length); /// Identifies runs of exact matches that are sub-maximal because they hit the order of the GCSA /// index and merges them into a single node, assumes that match nodes are sorted by length and /// then lexicographically by read interval, does not update edges - void collapse_order_length_runs(VG& vg, gcsa::GCSA* gcsa); + void collapse_order_length_runs(const HandleGraph& graph, gcsa::GCSA* gcsa, + vector& path_node_provenance); /// Reorders adjacency list representation of edges so that they follow the indicated /// ordering of their target nodes void reorder_adjacency_lists(const vector& order); - // Reorders the nodes of a Protobuf graph in topological order, flips doubly reversing edges, - // and removes empty sequence nodes (invariants required for gssw alignment) - // TODO: this is duplicative with VG::lazy_sort, but I don't want to construct a VG here - void groom_graph_for_gssw(Graph& graph); + /// Return the pessimistic gap length corresponding to a certain tail length and multiplier (proportional to + /// the square root of the tail length) + int64_t pessimistic_tail_gap(int64_t tail_length, double multiplier); + + /// Returns true if we're pointing into a snarl that we want to cut out of paths + bool into_cutting_snarl(id_t node_id, bool is_rev, + SnarlManager* snarl_manager, SnarlDistanceIndex* dist_index) const; + + /// Returns the intervals of the path that lie inside of snarls + vector> get_cut_segments(path_t& path, SnarlManager* cutting_snarls, SnarlDistanceIndex* dist_index, + const function(id_t)>& project, int64_t max_snarl_cut_size) const; + + /// Generate alignments of the tails of the query sequence, beyond the + /// sources and sinks. The Alignment passed *must* be the one that owns + /// the sequence we are working on. Returns a map from tail + /// (left=false, right=true), to a map from subpath number to all the + /// Alignments of the tail off of that subpath. 
Also computes the + /// source subpaths and adds their numbers to the given set if not + /// null. + /// If dynamic alignment count is also selected, can indicate a minimum number + /// of paths that must be in the extending graph in order to do an alignment + unordered_map>> + align_tails(const Alignment& alignment, const HandleGraph& align_graph, const GSSWAligner* aligner, + size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, double pessimistic_tail_gap_multiplier, + size_t min_paths, unordered_set* sources = nullptr); + + /// Removes alignments that follow the same path through the graph, retaining only the + /// highest scoring ones. If deduplicating leftward, then also removes paths that take a + /// longer path for no greater score in the leftward direction. Vice versa for rightward. + /// Assumes alignments are in descending order by score. + static void deduplicate_alt_alns(vector>& alt_alns, bool leftward, bool rightward); + + /// If a list of aligned subsequences is identical in a prefix/suffix, remove that + /// prefix/suffix from all of the alignments and return it as a separate alignment. + /// If there is no shared prefix/suffix, returns an empty path with 0 score. + static pair zip_alignments(vector>& alt_alns, bool from_left, + const Alignment& alignment, const HandleGraph& align_graph, + string::const_iterator begin, const GSSWAligner* aligner); + + /// Identifies regions that are shared across all of the alternative alignments, and then + /// splits those regions out into separate alignments, dividing the set of alternative + /// alignments accordingly. Return value consists of a vector of the shared segments and + /// a vector of vectors of the segments between. The length of the vector of shared segments + /// is one greater than that of the vector of between segments, so that they are interleaved. The first and/or + /// last of the alignments of shared segments may be empty if there is no shared prefix + /// or suffix across all the alignments. + /// If there are no shared segments at all, will return pair of empty vectors. + /// Blocks of unshared paths will be sorted into descending order by score. 
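+ /// (These shared and unshared segments are in the form consumed by add_decomposed_tail_alignments below.)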
+ static pair>, vector>>> + decompose_alignments(const vector>& alt_alns, + const Alignment& alignment, const HandleGraph& align_graph, + string::const_iterator begin, const GSSWAligner* aligner); + + void add_decomposed_tail_alignments(const Alignment& alignment, const HandleGraph& align_graph, + multipath_alignment_t& multipath_aln_out, + unordered_set& prohibited_merges, + vector>& shared_tail_alns, + vector>>& unshared_tail_alns, + size_t attachment_idx, bool to_left, size_t unmergeable_len, + const GSSWAligner* aligner, + SnarlManager* cutting_snarls = nullptr, + SnarlDistanceIndex* dist_index = nullptr, + const function(id_t)>* project = nullptr); + + /// Memo for the transcendental pessimistic tail gap function (thread local to maintain thread-safety) + static thread_local unordered_map> pessimistic_tail_gap_memo; + + /// The largest size we will memoize up to + static const size_t tail_gap_memo_max_size; }; } diff --git a/src/multipath_mapper.cpp b/src/multipath_mapper.cpp index 04c588100cb..55f765f3fbc 100644 --- a/src/multipath_mapper.cpp +++ b/src/multipath_mapper.cpp @@ -1,20 +1,50 @@ -// -// multipath_mapper.cpp -// -// -// +/** + * \file multipath_mapper.cpp + * + * Implements the MultipathMapper class + */ //#define debug_multipath_mapper //#define debug_multipath_mapper_alignment //#define debug_validate_multipath_alignments //#define debug_report_startup_training //#define debug_pretty_print_alignments +//#define debug_time_phases +//#define debug_log_splice_align_stats +//#define debug_output_distance_correlation +//#define debug_check_adapters + +#ifdef debug_time_phases +#include +#endif + +#ifdef debug_check_adapters +#include "ssw_aligner.hpp" +#endif #include "multipath_mapper.hpp" -#include "multipath_alignment_graph.hpp" -#include "algorithms/topological_sort.hpp" +#include "multipath_alignment_graph.hpp" +#include "kmp.hpp" +#include "hash_map.hpp" +#include "position.hpp" +#include "nodeside.hpp" +#include "path.hpp" +#include "utility.hpp" #include "annotation.hpp" +#include "statistics.hpp" + +#include "identity_overlay.hpp" +#include "reverse_graph.hpp" +#include "split_strand_graph.hpp" +#include "dagified_graph.hpp" + +#include "algorithms/count_covered.hpp" +#include "algorithms/extract_containing_graph.hpp" +#include "algorithms/locally_expand_graph.hpp" +#include "algorithms/jump_along_path.hpp" +#include "algorithms/ref_path_distance.hpp" +#include "algorithms/component.hpp" namespace vg { @@ -24,43 +54,47 @@ namespace vg { //size_t MultipathMapper::SECONDARY_RESCUE_ATTEMPT = 0; //size_t MultipathMapper::SECONDARY_RESCUE_TOTAL = 0; - MultipathMapper::MultipathMapper(xg::XG* xg_index, gcsa::GCSA* gcsa_index, gcsa::LCPArray* lcp_array, - haplo::ScoreProvider* haplo_score_provider, SnarlManager* snarl_manager) : - BaseMapper(xg_index, gcsa_index, lcp_array, haplo_score_provider), - snarl_manager(snarl_manager) + const size_t MultipathMapper::RESCUED = numeric_limits::max(); + + MultipathMapper::MultipathMapper(PathPositionHandleGraph* graph, gcsa::GCSA* gcsa_index, gcsa::LCPArray* lcp_array, + haplo::ScoreProvider* haplo_score_provider, SnarlManager* snarl_manager, + SnarlDistanceIndex* distance_index) : + BaseMapper(graph, gcsa_index, lcp_array, haplo_score_provider), + snarl_manager(snarl_manager), + distance_index(distance_index), + path_component_index(distance_index ? 
nullptr : new PathComponentIndex(graph)), + splice_stats(*get_regular_aligner()) { - // nothing to do + set_max_merge_supression_length(); } - + MultipathMapper::~MultipathMapper() { } void MultipathMapper::multipath_map(const Alignment& alignment, - vector& multipath_alns_out, - size_t max_alt_mappings) { - multipath_map_internal(alignment, mapping_quality_method, multipath_alns_out, max_alt_mappings); - } - - void MultipathMapper::multipath_map_internal(const Alignment& alignment, - MappingQualityMethod mapq_method, - vector& multipath_alns_out, - size_t max_alt_mappings) { + vector& multipath_alns_out) { #ifdef debug_multipath_mapper cerr << "multipath mapping read " << pb2json(alignment) << endl; cerr << "querying MEMs..." << endl; #endif - - // query MEMs using GCSA2 - double dummy1; double dummy2; - vector mems = find_mems_deep(alignment.sequence().begin(), alignment.sequence().end(), dummy1, dummy2, - 0, min_mem_length, mem_reseed_length, false, true, true, false); + + vector>> mem_fanouts; + auto mems = find_mems(alignment, &mem_fanouts); + unique_ptr fanouts(mem_fanouts.empty() ? nullptr : + new match_fanouts_t(record_fanouts(mems, mem_fanouts))); #ifdef debug_multipath_mapper cerr << "obtained MEMs:" << endl; for (MaximalExactMatch mem : mems) { cerr << "\t" << mem << " (" << mem.nodes.size() << " hits)" << endl; + if (fanouts.get() && fanouts->count(&mem)) { + cerr << "\t\tfan-outs:" << endl; + for (auto fanout : fanouts->at(&mem)) { + cerr << "\t\t\t" << (fanout.first - mem.begin) << ": " << *fanout.first << " -> " << fanout.second << endl; + } + } } cerr << "clustering MEMs..." << endl; #endif @@ -68,31 +102,23 @@ namespace vg { // TODO: use the automatic expected MEM length algorithm to restrict the MEMs used for clustering? // cluster the MEMs - vector clusters; - // memos for the results of expensive succinct operations that we may need to do multiple times - OrientedDistanceClusterer::paths_of_node_memo_t paths_of_node_memo; - OrientedDistanceClusterer::oriented_occurences_memo_t oriented_occurences_memo; - OrientedDistanceClusterer::handle_memo_t handle_memo; - // TODO: Making OrientedDistanceClusterers is the only place we actually - // need to distinguish between regular_aligner and qual_adj_aligner - if (adjust_alignments_for_base_quality) { - OrientedDistanceClusterer clusterer(alignment, mems, *get_qual_adj_aligner(), xindex, max_expected_dist_approx_error, - min_clustering_mem_length, unstranded_clustering, &paths_of_node_memo, &oriented_occurences_memo, &handle_memo); - clusters = clusterer.clusters(alignment, max_mapping_quality, log_likelihood_approx_factor, min_median_mem_coverage_for_split); - } - else { - OrientedDistanceClusterer clusterer(alignment, mems, *get_regular_aligner(), xindex, max_expected_dist_approx_error, - min_clustering_mem_length, unstranded_clustering, &paths_of_node_memo, &oriented_occurences_memo, &handle_memo); - clusters = clusterer.clusters(alignment, max_mapping_quality, log_likelihood_approx_factor, min_median_mem_coverage_for_split); - } + MemoizingGraph memoizing_graph(xindex); + unique_ptr distance_measurer = get_distance_measurer(memoizing_graph); + vector clusters = get_clusters(alignment, mems, &(*distance_measurer), fanouts.get()); #ifdef debug_multipath_mapper cerr << "obtained clusters:" << endl; for (int i = 0; i < clusters.size(); i++) { cerr << "\tcluster " << i << endl; - for (pair hit : clusters[i]) { + for (pair hit : clusters[i].first) { cerr << "\t\t" << hit.second << " " << hit.first->sequence() << endl; + if 
(fanouts.get() && fanouts->count(hit.first)) { + cerr << "\t\t\tfan-outs:" << endl; + for (auto fanout : fanouts->at(hit.first)) { + cerr << "\t\t\t\t" << (fanout.first - hit.first->begin) << ": " << *fanout.first << " -> " << fanout.second << endl; + } + } } } cerr << "extracting subgraphs..." << endl; @@ -101,13 +127,18 @@ namespace vg { // extract graphs around the clusters auto cluster_graphs = query_cluster_graphs(alignment, mems, clusters); - // actually perform the alignments and post-process to meet MultipathAlignment invariants - vector cluster_idxs = range_vector(cluster_graphs.size()); - align_to_cluster_graphs(alignment, mapq_method, cluster_graphs, multipath_alns_out, num_mapping_attempts, &cluster_idxs); + // actually perform the alignments and post-process to meet multipath_alignment_t invariants + // TODO: do i still need cluster_idx? i think it might have only been used for capping + vector multiplicities; + vector cluster_idxs; + align_to_cluster_graphs(alignment, cluster_graphs, multipath_alns_out, multiplicities, + num_mapping_attempts, fanouts.get(), &cluster_idxs); if (multipath_alns_out.empty()) { // add a null alignment so we know it wasn't mapped multipath_alns_out.emplace_back(); + cluster_graphs.emplace_back(); + multiplicities.push_back(1.0); to_multipath_alignment(alignment, multipath_alns_out.back()); // in case we're realigning GAMs that have paths already @@ -115,18 +146,14 @@ namespace vg { multipath_alns_out.back().clear_start(); } - if (likely_mismapping(multipath_alns_out.front())) { - // we can't distinguish this alignment from the longest MEM of a random sequence -#ifdef debug_multipath_mapper - cerr << "mapping is not distinguishable from a random sequence, snapping MAPQ to 0" << endl; -#endif - - multipath_alns_out.front().set_mapping_quality(0); + if (do_spliced_alignment && !likely_mismapping(multipath_alns_out.front())) { + find_spliced_alignments(alignment, multipath_alns_out, multiplicities, cluster_idxs, + mems, cluster_graphs, fanouts.get()); } - else { - // account for the possiblity that we missed the correct cluster due to sub-sampling MEM hits - // within the cluster - cap_mapping_quality_by_hit_sampling_probability(multipath_alns_out, cluster_idxs, cluster_graphs); + + if (agglomerate_multipath_alns) { + // we want the disconnected alignments combined into one + agglomerate_alignments(multipath_alns_out, &multiplicities); } // if we computed extra alignments to get a mapping quality, remove them @@ -134,44 +161,294 @@ namespace vg { multipath_alns_out.resize(max_alt_mappings); } - if (simplify_topologies) { - for (MultipathAlignment& multipath_aln : multipath_alns_out) { - merge_non_branching_subpaths(multipath_aln); - } + // mark unmapped reads and get rid of noise alignments + purge_unmapped_alignments(multipath_alns_out); + + for (size_t i = 1; i < multipath_alns_out.size(); ++i) { + multipath_alns_out[i].set_annotation("secondary", true); } if (strip_bonuses) { - for (MultipathAlignment& multipath_aln : multipath_alns_out) { + for (multipath_alignment_t& multipath_aln : multipath_alns_out) { strip_full_length_bonuses(multipath_aln); } } - // clean up the cluster graphs - for (auto cluster_graph : cluster_graphs) { - delete get<0>(cluster_graph); - } #ifdef debug_pretty_print_alignments cerr << "final alignments being returned:" << endl; - for (const MultipathAlignment& multipath_aln : multipath_alns_out) { + for (const multipath_alignment_t& multipath_aln : multipath_alns_out) { view_multipath_alignment(cerr, multipath_aln, *xindex); 
} #endif + +#ifdef mpmap_instrument_mem_statistics + size_t num_mems = mems.size(); + size_t min_mem_length = numeric_limits::max(); + size_t max_mem_length = 0; + double avg_mem_length = 0.0; + for (const auto& mem : mems) { + min_mem_length = min(min_mem_length, mem.length()); + max_mem_length = max(max_mem_length, mem.length()); + avg_mem_length += mem.length(); + } + avg_mem_length /= mems.size(); + double avg_mem_overlap = 0.0; + for (size_t i = 1; i < mems.size(); ++i) { + avg_mem_overlap += max(mems[i - 1].end - mems[i].begin, 0); + } + avg_mem_overlap /= (mems.size() - 1); + + vector hit_lengths; + for (const auto& mem : mems) { + for (const auto& n : mem.nodes) { + hit_lengths.push_back(mem.length()); + } + } + sort(hit_lengths.begin(), hit_lengths.end(), std::greater()); + + size_t num_clusters = clusters.size(); + vector winning_lengths; + size_t winning_cluster_num_mems = clusters.empty() ? 0 : clusters[cluster_idxs.front()].size(); + size_t winning_cluster_total_bases = 0; + size_t winning_cluster_min_mem_length = numeric_limits::max(); + size_t winning_cluster_max_mem_length = 0; + size_t winning_cluster_tail_bases = 0; + double winning_cluster_avg_intermem_gap = 0.0; + vector order; + + if (!clusters.empty()) { + for (const auto& hit : clusters[cluster_idxs.front()]) { + winning_cluster_min_mem_length = min(winning_cluster_min_mem_length, hit.first->length()); + winning_cluster_max_mem_length = max(winning_cluster_max_mem_length, hit.first->length()); + winning_cluster_total_bases += hit.first->length(); + } + for (size_t i = 0; i < clusters[cluster_idxs.front()].size(); ++i) { + order.push_back(i); + winning_lengths.push_back(clusters[cluster_idxs.front()][i].first->length()); + } + sort(order.begin(), order.end(), [&](size_t i, size_t j) { + return clusters[cluster_idxs.front()][i].first->begin < clusters[cluster_idxs.front()][j].first->begin; + }); + sort(winning_lengths.begin(), winning_lengths.end(), std::greater()); + + winning_cluster_tail_bases = ((clusters[cluster_idxs.front()][order.front()].first->begin - alignment.sequence().begin()) + + (alignment.sequence().end() - clusters[cluster_idxs.front()][order.back()].first->end)); + if (clusters[cluster_idxs.front()].size() == 0) { + winning_cluster_avg_intermem_gap = numeric_limits::quiet_NaN(); + } + else { + for (size_t i = 1; i < order.size(); ++i) { + winning_cluster_avg_intermem_gap += (clusters[cluster_idxs.front()][order[i]].first->begin + - clusters[cluster_idxs.front()][order[i - 1]].first->end); + } + winning_cluster_avg_intermem_gap /= order.size() - 1; + } + } + + vector secondary_lengths; + if (cluster_idxs.size() > 1 && clusters.size() > 1) { + for (const auto& hit : clusters.at(cluster_idxs[1])) { + secondary_lengths.push_back(hit.first->length()); + } + } + sort(secondary_lengths.begin(), secondary_lengths.end(), greater()); + + int64_t max_non_winning_mem_length = 0; + for (size_t i = 0; i < mems.size(); ++i) { + bool found = false; + if (!clusters.empty()) { + for (const auto hit : clusters[cluster_idxs.front()]) { + if (hit.first == &mems[i]) { + found = true; + break; + } + } + } + if (!found) { + max_non_winning_mem_length = max(max_non_winning_mem_length, mems[i].length()); + } + } + +#pragma omp critical + { + if 
(!_wrote_mem_stats_header) { + _mem_stats << "name\tread_len\tnum_mems\tmin_mem_length\tmax_mem_length\tavg_mem_length\tavg_mem_overlap\tnum_clusters\twinning_cluster_num_mems\twinning_cluster_min_mem_length\twinning_cluster_max_mem_length\twinning_cluster_total_bases\twinning_cluster_tail_bases\twinning_cluster_avg_intermem_gap\tmax_non_winning_mem_length\tmapping_quality\thit_lengths\twinning_lengths\tsecondary_lengths" << endl; + _wrote_mem_stats_header = true; + } + _mem_stats << alignment.name() << "\t" << alignment.sequence().size() << "\t" << num_mems << "\t" << min_mem_length << "\t" << max_mem_length << "\t" << avg_mem_length << "\t" << avg_mem_overlap << "\t" << num_clusters << "\t" << winning_cluster_num_mems << "\t" << winning_cluster_min_mem_length << "\t" << winning_cluster_max_mem_length << "\t" << winning_cluster_total_bases << "\t" << winning_cluster_tail_bases << "\t" << winning_cluster_avg_intermem_gap << "\t" << max_non_winning_mem_length << "\t" << multipath_alns_out.front().mapping_quality(); + _mem_stats << "\t"; + for (size_t i = 0; i < hit_lengths.size(); ++i) { + if (i > 0) { + _mem_stats << ","; + } + _mem_stats << hit_lengths[i]; + } + _mem_stats << "\t"; + for (size_t i = 0; i < winning_lengths.size(); ++i) { + if (i > 0) { + _mem_stats << ","; + } + _mem_stats << winning_lengths[i]; + } + _mem_stats << "\t"; + if (secondary_lengths.empty()) { + _mem_stats << "NA"; + } + else { + for (size_t i = 0; i < secondary_lengths.size(); ++i) { + if (i > 0) { + _mem_stats << ","; + } + _mem_stats << secondary_lengths[i]; + } + } + _mem_stats << endl; + } +#endif + } + + vector MultipathMapper::get_clusters(const Alignment& alignment, const vector& mems, + OrientedDistanceMeasurer* distance_measurer, + const match_fanouts_t* fanouts) const { + + // note: we don't want to generate the distance measurer in this function because we want + // to be able to re-use its memoization if we cluster pairs later + + // choose a clusterer (ordered by expected most likely use for better branch prediction) + unique_ptr clusterer; + if (!no_clustering && use_min_dist_clusterer && component_min_dist) { + clusterer = unique_ptr(new ComponentMinDistanceClusterer(distance_index)); + } + else if (!no_clustering && !use_min_dist_clusterer && !use_tvs_clusterer) { + clusterer = unique_ptr(new OrientedDistanceClusterer(*distance_measurer, + max_expected_dist_approx_error)); + } + else if (no_clustering) { + clusterer = unique_ptr(new NullClusterer()); + } + else if (use_min_dist_clusterer && !greedy_min_dist) { + clusterer = unique_ptr(new MinDistanceClusterer(distance_index)); + } + else if (use_min_dist_clusterer && greedy_min_dist) { + clusterer = unique_ptr(new GreedyMinDistanceClusterer(distance_index)); + } + else { + clusterer = unique_ptr(new TVSClusterer(xindex, distance_index)); + } + clusterer->max_gap = max_alignment_gap; + + // generate clusters + return clusterer->clusters(alignment, mems, get_aligner(!alignment.quality().empty()), + min_clustering_mem_length, max_mapping_quality, log_likelihood_approx_factor, + min_median_mem_coverage_for_split, 0.75, truncation_multiplicity_mq_limit, + fanouts); + } + + vector MultipathMapper::find_mems(const Alignment& alignment, + vector>>* mem_fanout_breaks) { + if (!use_stripped_match_alg && + (!use_fanout_match_alg || (use_fanout_match_alg && alignment.quality().empty()))) { + double dummy1, dummy2; + return find_mems_deep(alignment.sequence().begin(), alignment.sequence().end(), dummy1, dummy2, + 0, min_mem_length, mem_reseed_length, false, 
true, true, false); + } + else if (use_fanout_match_alg) { + return find_fanout_mems(alignment.sequence().begin(), alignment.sequence().end(), + alignment.quality().begin(), max_fans_out, max_fanout_base_quality, + mem_fanout_breaks); + } + else { + return find_stripped_matches(alignment.sequence().begin(), alignment.sequence().end(), + stripped_match_alg_strip_length, stripped_match_alg_max_length, + stripped_match_alg_target_count); + } + } + + vector, int64_t>> MultipathMapper::get_cluster_pairs(const Alignment& alignment1, + const Alignment& alignment2, + vector& cluster_graphs1, + vector& cluster_graphs2, + OrientedDistanceMeasurer* distance_measurer) { + // make vectors of cluster pointers to shim into the cluster pairing function + vector cluster_mems_1(cluster_graphs1.size()), cluster_mems_2(cluster_graphs2.size()); + for (size_t i = 0; i < cluster_mems_1.size(); i++) { + cluster_mems_1[i] = &(get<1>(cluster_graphs1[i])); + } + for (size_t i = 0; i < cluster_mems_2.size(); i++) { + cluster_mems_2[i] = &(get<1>(cluster_graphs2[i])); + } + + // Find the clusters that have a tie for the longest MEM, and create alternate anchor points for those clusters + vector> alt_anchors_1, alt_anchors_2; + for (size_t i = 0; i < cluster_mems_1.size(); i++) { + auto& mem_cluster = cluster_mems_1[i]->first; + for (size_t j = 1; j < mem_cluster.size(); j++) { + if (mem_cluster[j].first->length() + alt_anchor_max_length_diff >= mem_cluster.front().first->length()) { + alt_anchors_1.emplace_back(i, j); + } + else { + break; + } + } + } + for (size_t i = 0; i < cluster_mems_2.size(); i++) { + auto& mem_cluster = cluster_mems_2[i]->first; + for (size_t j = 1; j < mem_cluster.size(); j++) { + if (mem_cluster[j].first->length() + alt_anchor_max_length_diff >= mem_cluster.front().first->length()) { + alt_anchors_2.emplace_back(i, j); + } + else { + break; + } + } + } + + // Compute the pairs of cluster graphs and their approximate distances from each other + // (ordered by expected most likely use case for better branch prediction) + unique_ptr clusterer; + if (use_min_dist_clusterer && !no_clustering) { + // greedy and non-greedy algorithms are the same, so don't bother distinguishing + clusterer = unique_ptr(new MinDistanceClusterer(distance_index)); + } + else if (no_clustering) { + clusterer = unique_ptr(new NullClusterer()); + } + else if (!use_tvs_clusterer) { + clusterer = unique_ptr(new OrientedDistanceClusterer(*distance_measurer)); + } + else { + clusterer = unique_ptr(new TVSClusterer(xindex, distance_index)); + } + + return clusterer->pair_clusters(alignment1, alignment2, cluster_mems_1, cluster_mems_2, + alt_anchors_1, alt_anchors_2, + fragment_length_distr.mean(), + ceil(10.0 * fragment_length_distr.std_dev())); } void MultipathMapper::align_to_cluster_graphs(const Alignment& alignment, - MappingQualityMethod mapq_method, vector& cluster_graphs, - vector& multipath_alns_out, + vector& multipath_alns_out, + vector& multiplicities_out, size_t num_mapping_attempts, + const match_fanouts_t* fanouts, vector* cluster_idxs) { #ifdef debug_multipath_mapper - cerr << "aligning to subgraphs..." << endl; + cerr << "aligning to (up to) " << cluster_graphs.size() << " subgraphs..." << endl; #endif // we may need to compute an extra mapping above the one we'll report if we're computing mapping quality - size_t num_mappings_to_compute = mapq_method != None ? 
max(num_mapping_attempts, (size_t) 2) : num_mapping_attempts; + size_t num_mappings_to_compute = max(num_mapping_attempts, (size_t) 2); multipath_alns_out.clear(); @@ -195,86 +472,110 @@ namespace vg { } #ifdef debug_multipath_mapper_alignment - cerr << "performing alignment to subgraph " << pb2json(get<0>(cluster_graph)->graph) << endl; + cerr << "performing alignment to subgraph with coverage " << get<2>(cluster_graph) << " and multiplicity " << get<1>(cluster_graph).second << endl; #endif - + double multiplicity = cluster_multiplicity(get<1>(cluster_graph)); + size_t cluster_size = get<1>(cluster_graph).first.size(); multipath_alns_out.emplace_back(); - multipath_align(alignment, get<0>(cluster_graph), get<1>(cluster_graph), multipath_alns_out.back()); - + multipath_align(alignment, cluster_graph, multipath_alns_out.back(), fanouts); + multiplicities_out.emplace_back(multiplicity); num_mappings++; } - // if we didn't end up performing all of the cluster alignments, re-sync the list - if (cluster_idxs) { - if (cluster_idxs->size() > multipath_alns_out.size()) { - cluster_idxs->resize(multipath_alns_out.size()); + if (!multipath_alns_out.empty()) { + // find clusters whose likelihoods are approximately the same as the low end of the clusters we aligned + auto aligner = get_aligner(!alignment.quality().empty()); + int64_t score_diff = round(aligner->mapping_quality_score_diff(truncation_multiplicity_mq_limit)); + int64_t max_tail_idx = multipath_alns_out.size(); + while (max_tail_idx < cluster_graphs.size() + && get<2>(cluster_graphs[max_tail_idx]) >= get<2>(cluster_graphs[multipath_alns_out.size() - 1]) - score_diff) { + ++max_tail_idx; + } + if (max_tail_idx > multipath_alns_out.size()) { + // there are some (nearly) identical cluster that we ignored, so we'll account for them in the multiplicity + + // find the clusters that are approximately the same + int64_t min_tail_idx = multipath_alns_out.size() - 1; + while (min_tail_idx > 0 && + get<2>(cluster_graphs[min_tail_idx - 1]) <= get<2>(cluster_graphs[multipath_alns_out.size() - 1]) + score_diff) { + --min_tail_idx; + } + + // multiply their multiplicity by the inverse of the fraction aligned + double trunc_multiplicity = double(max_tail_idx - min_tail_idx) / double(multipath_alns_out.size() - min_tail_idx); + for (size_t i = min_tail_idx; i < multipath_alns_out.size(); ++i) { + multiplicities_out[i] *= trunc_multiplicity; + } } } + if (cluster_idxs) { + // initially all of the multipath alignments are in the order of the clusters + *cluster_idxs = range_vector(multipath_alns_out.size()); + } + #ifdef debug_multipath_mapper cerr << "splitting multicomponent alignments..." 
<< endl; #endif - // split up any alignments that ended up being disconnected - split_multicomponent_alignments(multipath_alns_out, cluster_idxs); + if (!suppress_multicomponent_splitting) { + // split up any alignments that ended up being disconnected + split_multicomponent_alignments(multipath_alns_out, &alignment, &cluster_graphs, cluster_idxs, &multiplicities_out); + } -#ifdef debug_multipath_mapper - cerr << "topologically ordering " << multipath_alns_out.size() << " multipath alignments" << endl; -#endif - for (MultipathAlignment& multipath_aln : multipath_alns_out) { + // remove low-scoring bits of complicated multipath alignments + for (auto& multipath_aln : multipath_alns_out) { topologically_order_subpaths(multipath_aln); + simplify_complicated_multipath_alignment(multipath_aln); } #ifdef debug_multipath_mapper cerr << "computing mapping quality and sorting mappings" << endl; #endif - sort_and_compute_mapping_quality(multipath_alns_out, mapq_method, cluster_idxs); + sort_and_compute_mapping_quality(multipath_alns_out, cluster_idxs, &multiplicities_out); - if (!multipath_alns_out.empty() ? likely_mismapping(multipath_alns_out.front()) : false) { + if (!multipath_alns_out.empty() && likely_mismapping(multipath_alns_out.front())) { multipath_alns_out.front().set_mapping_quality(0); } // for debugging: an expensive check for invariant validity that can be turned on // with a preprocessor flag #ifdef debug_validate_multipath_alignments - for (MultipathAlignment& multipath_aln : multipath_alns_out) { + for (multipath_alignment_t& multipath_aln : multipath_alns_out) { #ifdef debug_multipath_mapper cerr << "validating multipath alignment:" << endl; - cerr << pb2json(multipath_aln) << endl; + cerr << debug_string(multipath_aln) << endl; #endif if (!validate_multipath_alignment(multipath_aln, *xindex)) { cerr << "### WARNING ###" << endl; - cerr << "multipath alignment of read " << multipath_aln.name() << " failed to validate" << endl; + cerr << "multipath alignment of read " << multipath_aln.sequence() << " failed to validate" << endl; } } #endif } - void MultipathMapper::attempt_unpaired_multipath_map_of_pair(const Alignment& alignment1, const Alignment& alignment2, - vector>& multipath_aln_pairs_out, + bool MultipathMapper::attempt_unpaired_multipath_map_of_pair(const Alignment& alignment1, const Alignment& alignment2, + vector>& multipath_aln_pairs_out, vector>& ambiguous_pair_buffer) { // compute single ended mappings, and make sure we also compute mapping qualities to assess // mapping ambiguity - vector multipath_alns_1, multipath_alns_2; - multipath_map_internal(alignment1, mapping_quality_method == None ? Approx : mapping_quality_method, - multipath_alns_1, 1); - multipath_map_internal(alignment2, mapping_quality_method == None ? 
Approx : mapping_quality_method, - multipath_alns_2, 1); + vector multipath_alns_1, multipath_alns_2; + multipath_map(alignment1, multipath_alns_1); + multipath_map(alignment2, multipath_alns_2); bool is_ambiguous = true; if (!multipath_alns_1.empty() && !multipath_alns_2.empty()) { - MultipathAlignment& multipath_aln_1 = multipath_alns_1.front(); - MultipathAlignment& multipath_aln_2 = multipath_alns_2.front(); + multipath_alignment_t& multipath_aln_1 = multipath_alns_1.front(); + multipath_alignment_t& multipath_aln_2 = multipath_alns_2.front(); - auto match_score = get_aligner()->match; - auto full_length_bonus = get_aligner()->full_length_bonus; + auto aligner = get_aligner(!alignment1.quality().empty() && !alignment2.quality().empty()); // score possible of a perfect match (at full base quality) - int32_t max_score_1 = multipath_aln_1.sequence().size() * match_score + 2 * full_length_bonus * !strip_bonuses; - int32_t max_score_2 = multipath_aln_2.sequence().size() * match_score + 2 * full_length_bonus * !strip_bonuses; + int32_t max_score_1 = multipath_aln_1.sequence().size() * aligner->match + 2 * aligner->full_length_bonus * !strip_bonuses; + int32_t max_score_2 = multipath_aln_2.sequence().size() * aligner->match + 2 * aligner->full_length_bonus * !strip_bonuses; #ifdef debug_multipath_mapper cerr << "single ended mappings achieves scores " << optimal_alignment_score(multipath_aln_1) << " and " << optimal_alignment_score(multipath_aln_2) << ", looking for scores " << .8 * max_score_1 << " and " << .8 * max_score_2 << endl; @@ -283,7 +584,9 @@ namespace vg { // are these reads unambiguously mapped and well-aligned? // TODO: i don't like having constants floating around in here - if (multipath_aln_1.mapping_quality() >= min(max_mapping_quality, 45) + if (num_connected_components(multipath_aln_1) == 1 + && num_connected_components(multipath_aln_2) == 1 + && multipath_aln_1.mapping_quality() >= min(max_mapping_quality, 45) && multipath_aln_2.mapping_quality() >= min(max_mapping_quality, 45) && optimal_alignment_score(multipath_aln_1) >= .8 * max_score_1 && optimal_alignment_score(multipath_aln_2) >= .8 * max_score_2) { @@ -305,9 +608,6 @@ namespace vg { #endif multipath_aln_pairs_out.emplace_back(move(multipath_aln_1), move(multipath_aln_2)); - multipath_aln_pairs_out.front().first.set_paired_read_name(multipath_aln_pairs_out.front().second.name()); - multipath_aln_pairs_out.front().second.set_paired_read_name(multipath_aln_pairs_out.front().first.name()); - fragment_length_distr.register_fragment_length(fragment_length); is_ambiguous = false; @@ -323,6 +623,11 @@ namespace vg { #endif ambiguous_pair_buffer.emplace_back(alignment1, alignment2); + + if (ambiguous_pair_buffer.size() + fragment_length_distr.curr_sample_size() + == fragment_length_distr.max_sample_size() * fragment_length_warning_factor) { + cerr << "warning:[vg mpmap] Mapped " << ambiguous_pair_buffer.size() + fragment_length_distr.curr_sample_size() << " read pairs as unpaired reads to learn fragment length distribution, but only obtained " << fragment_length_distr.curr_sample_size() << " unambiguous, consistently mapped pairs. Often this indicates data issues, such as reads that are pre-sorted with unmappable reads at the front, reads that are not actually paired, or mismatched indexes." 
<< endl; + } } @@ -332,7 +637,7 @@ namespace vg { if (fragment_length_distr.is_finalized()) { cerr << "finalized read distribution with " << fragment_length_distr.max_sample_size() << " measurements on read pair " << alignment1.name() << ", " << alignment2.name() << endl; cerr << "mean: " << fragment_length_distr.mean() << endl; - cerr << "std dev: " << fragment_length_distr.stdev() << endl; + cerr << "std dev: " << fragment_length_distr.std_dev() << endl; cerr << "ambiguous buffer contains pairs:" << endl; for (pair& aln_pair : ambiguous_pair_buffer) { cerr << "\t" << aln_pair.first.name() << ", " << aln_pair.second.name() << endl; @@ -349,83 +654,125 @@ namespace vg { cerr << endl; } #endif + + return !is_ambiguous; } - bool MultipathMapper::attempt_rescue(const MultipathAlignment& multipath_aln, const Alignment& other_aln, - bool rescue_forward, MultipathAlignment& rescue_multipath_aln) { + bool MultipathMapper::attempt_rescue(const multipath_alignment_t& multipath_aln, const Alignment& other_aln, + bool rescue_forward, multipath_alignment_t& rescue_multipath_aln) { #ifdef debug_multipath_mapper - cerr << "attemping pair rescue in " << (rescue_forward ? "forward" : "backward") << " direction from " << pb2json(multipath_aln) << endl; + cerr << "attemping pair rescue in " << (rescue_forward ? "forward" : "backward") << " direction from " << debug_string(multipath_aln) << endl; #endif + bool succeeded = do_rescue_alignment(multipath_aln, other_aln, rescue_forward, rescue_multipath_aln, + fragment_length_distr.mean(), rescue_graph_std_devs); - // get the position to jump from and the distance to jump - Alignment opt_anchoring_aln; - optimal_alignment(multipath_aln, opt_anchoring_aln); - pos_t pos_from = rescue_forward ? initial_position(opt_anchoring_aln.path()) : final_position(opt_anchoring_aln.path()); - int64_t jump_dist = rescue_forward ? 
fragment_length_distr.mean() - other_aln.sequence().size() : -fragment_length_distr.mean(); + if (!succeeded) { + return false; + } - // get the seed position(s) for the rescue by jumping along paths - vector jump_positions = xindex->jump_along_closest_path(id(pos_from), is_rev(pos_from), offset(pos_from), jump_dist, 250); + auto aligner = get_aligner(!multipath_aln.quality().empty() && !other_aln.quality().empty()); + vector score(1, optimal_alignment_score(rescue_multipath_aln)); + int32_t solo_mapq = mapq_scaling_factor * aligner->compute_max_mapping_quality(score, mapping_quality_method == Approx); + int32_t adjusted_mapq = min(solo_mapq, min(max_mapping_quality, multipath_aln.mapping_quality())); + rescue_multipath_aln.set_mapping_quality(adjusted_mapq); #ifdef debug_multipath_mapper - cerr << "found jump positions:" << endl; - for (pos_t& pos : jump_positions) { - cerr << "\t" << pos << endl; - } + cerr << "converted multipath alignment is" << endl; + cerr << debug_string(rescue_multipath_aln) << endl; + cerr << "rescued alignment has effective match length " << pseudo_length(rescue_multipath_aln) << ", which gives p-value " << random_match_p_value(pseudo_length(rescue_multipath_aln), rescue_multipath_aln.sequence().size()) << endl; +#endif + + // TODO: magic number + if (solo_mapq < min(25, max_mapping_quality)) { +#ifdef debug_multipath_mapper + cerr << "rescue fails because raw_mapq " << solo_mapq << " < " << min(25, max_mapping_quality) << endl; #endif - if (jump_positions.empty()) { return false; } - // pull out the graph around the position(s) we jumped to - VG rescue_graph; - vector backward_dist(jump_positions.size(), 6 * fragment_length_distr.stdev()); - vector forward_dist(jump_positions.size(), 6 * fragment_length_distr.stdev() + other_aln.sequence().size()); - algorithms::extract_containing_graph(xindex, &rescue_graph, jump_positions, backward_dist, forward_dist); - + if (likely_misrescue(rescue_multipath_aln)) { #ifdef debug_multipath_mapper - cerr << "got rescue graph " << pb2json(rescue_graph.graph) << endl; + cerr << "rescue fails with p value above " << max_rescue_p_value << endl; #endif + return false; + } + + return true; + } + + bool MultipathMapper::do_rescue_alignment(const multipath_alignment_t& multipath_aln, const Alignment& other_aln, + bool rescue_forward, multipath_alignment_t& rescue_multipath_aln, + double rescue_mean_length, double num_std_devs) const { + bdsg::HashGraph rescue_graph; + extract_rescue_graph(multipath_aln, other_aln, rescue_forward, &rescue_graph, + rescue_mean_length, rescue_graph_std_devs); + + if (rescue_graph.get_node_count() == 0) { + return false; + } +#ifdef debug_multipath_mapper_alignment + cerr << "got rescue graph" << endl; + rescue_graph.for_each_handle([&](const handle_t& h) { + cerr << rescue_graph.get_id(h) << " " << rescue_graph.get_sequence(h) << endl; + rescue_graph.follow_edges(h, true, [&](const handle_t& p) { + cerr << "\t" << rescue_graph.get_id(p) << (rescue_graph.get_is_reverse(p) ? "-" : "+") << " <-" << endl; + }); + rescue_graph.follow_edges(h, false, [&](const handle_t& n) { + cerr << "\t-> " << rescue_graph.get_id(n) << (rescue_graph.get_is_reverse(n) ? 
"-" : "+") << endl; + }); + }); +#endif // TODO: repetitive code with multipath_align // the longest path we could possibly align to (full gap and a full sequence) - size_t target_length = other_aln.sequence().size() + get_aligner()->longest_detectable_gap(other_aln); + auto aligner = get_aligner(!multipath_aln.quality().empty() && !other_aln.quality().empty()); + size_t target_length = other_aln.sequence().size() + min(aligner->longest_detectable_gap(other_aln), max_alignment_gap); // convert from bidirected to directed - VG align_graph; - unordered_map > node_trans = algorithms::split_strands(&rescue_graph, &align_graph); - // if necessary, convert from cyclic to acylic - if (!algorithms::is_directed_acyclic(&rescue_graph)) { - unordered_map > dagify_trans; - align_graph = align_graph.dagify(target_length, // high enough that num SCCs is never a limiting factor - dagify_trans, - target_length, - 0); // no maximum on size of component - node_trans = align_graph.overlay_node_translations(dagify_trans, node_trans); + StrandSplitGraph align_digraph(&rescue_graph); + + // if necessary, convert from cyclic to acylic (this is expensive, so only do it if we need to) + IdentityOverlay undagified(&align_digraph); + unique_ptr dagified; + + ExpandingOverlayGraph* align_dag = nullptr; + if (handlealgs::is_directed_acyclic(&align_digraph)) { + align_dag = &undagified; + } + else { +#ifdef debug_multipath_mapper_alignment + cerr << "graph contains directed cycles, performing dagification" << endl; +#endif + dagified = unique_ptr(new DagifiedGraph(&align_digraph, target_length, max_dagify_duplications)); + align_dag = dagified.get(); } // put local alignment here Alignment aln = other_aln; - // in case we're realigning a GAM, get rid of the path + // in case we're realigning a GAM, get rid of the path and score aln.clear_path(); + aln.set_score(0); - algorithms::lazier_sort(&align_graph); - - get_aligner()->align(aln, align_graph.graph, true, false); + aligner->align(aln, *align_dag, true); // get the IDs back into the space of the reference graph - translate_oriented_node_ids(*aln.mutable_path(), node_trans); + function(id_t)> translator = [&](const id_t& node_id) { + handle_t original = align_digraph.get_underlying_handle(align_dag->get_underlying_handle(align_dag->get_handle(node_id))); + return make_pair(rescue_graph.get_id(original), rescue_graph.get_is_reverse(original)); + }; + translate_oriented_node_ids(*aln.mutable_path(), translator); #ifdef debug_multipath_mapper - cerr << "resecued direct alignment is" << endl; + cerr << "rescued direct alignment is" << endl; cerr << pb2json(aln) << endl; #endif - if (num_alt_alns > 1 && snarl_manager != nullptr) { + if (num_alt_alns > 1 && (snarl_manager != nullptr || distance_index != nullptr)) { // make an interesting multipath alignment by realigning the single path alignment inside snarls - make_nontrivial_multipath_alignment(aln, align_graph, node_trans, *snarl_manager, rescue_multipath_aln); + make_nontrivial_multipath_alignment(aln, *align_dag, translator, rescue_multipath_aln); } else { @@ -435,35 +782,118 @@ namespace vg { identify_start_subpaths(rescue_multipath_aln); - vector score(1, aln.score()); - int32_t raw_mapq = get_aligner()->compute_mapping_quality(score, mapping_quality_method == None || mapping_quality_method == Approx); - int32_t adjusted_mapq = min(raw_mapq, min(max_mapping_quality, multipath_aln.mapping_quality())); - rescue_multipath_aln.set_mapping_quality(adjusted_mapq); - -#ifdef debug_multipath_mapper - cerr << "converted 
multipath alignment is" << endl; - cerr << pb2json(rescue_multipath_aln) << endl; - cerr << "rescued alignment has effective match length " << pseudo_length(rescue_multipath_aln) / 3 << ", which gives p-value " << random_match_p_value(pseudo_length(rescue_multipath_aln) / 3, rescue_multipath_aln.sequence().size()) << endl; -#endif + return true; + } + void MultipathMapper::extract_rescue_graph(const multipath_alignment_t& multipath_aln, const Alignment& other_aln, + bool rescue_forward, MutableHandleGraph* rescue_graph, + double rescue_mean_length, double num_std_devs) const { - if (raw_mapq < min(25, max_mapping_quality)) { -#ifdef debug_multipath_mapper - cerr << "rescue fails because raw_mapq " << raw_mapq << " < " << min(25, max_mapping_quality) << endl; -#endif - return false; - } - - auto p_val = random_match_p_value(pseudo_length(rescue_multipath_aln) / 3, rescue_multipath_aln.sequence().size()); + // get the position to jump from and the distance to jump + Alignment opt_anchoring_aln; + optimal_alignment(multipath_aln, opt_anchoring_aln); - if (p_val >= max_mapping_p_value * 0.1) { + if (opt_anchoring_aln.path().mapping_size() == 0) { + return; + } + + if (get_rescue_graph_from_paths || !distance_index) { + // we're either not using the distance index or we don't have one + pos_t pos_from = rescue_forward ? initial_position(opt_anchoring_aln.path()) : final_position(opt_anchoring_aln.path()); + int64_t jump_dist = rescue_forward ? rescue_mean_length : -rescue_mean_length; + + // get the seed position(s) for the rescue by jumping along paths + vector jump_positions = algorithms::jump_along_closest_path(xindex, pos_from, jump_dist, 250); + #ifdef debug_multipath_mapper - cerr << "rescue fails because p value " << p_val << " >= " << (max_mapping_p_value * 0.1) << endl; + cerr << "found jump positions:" << endl; + for (pos_t& pos : jump_positions) { + cerr << "\t" << pos << endl; + } #endif - return false; + if (jump_positions.empty()) { + return; + } + + size_t search_dist_bwd, search_dist_fwd; + if (rescue_forward) { + search_dist_bwd = size_t(round(num_std_devs * fragment_length_distr.std_dev())) + other_aln.sequence().size(); + search_dist_fwd = num_std_devs * fragment_length_distr.std_dev(); + } + else { + search_dist_bwd = num_std_devs * fragment_length_distr.std_dev(); + search_dist_fwd = size_t(round(num_std_devs * fragment_length_distr.std_dev())) + other_aln.sequence().size(); + } + + vector backward_dist(jump_positions.size(), search_dist_bwd); + vector forward_dist(jump_positions.size(), search_dist_fwd); + algorithms::extract_containing_graph(xindex, rescue_graph, jump_positions, backward_dist, forward_dist, + num_alt_alns > 1 ? reversing_walk_length : 0); + + } + else { + // we have a distance index and we want to use it + + // get the set of nodes that we want to extrat + unordered_set subgraph_nodes_to_add; + int64_t min_distance = max(0.0, rescue_mean_length - other_aln.sequence().size() + - rescue_graph_std_devs * fragment_length_distr.std_dev()); + int64_t max_distance = rescue_mean_length + rescue_graph_std_devs * fragment_length_distr.std_dev(); + subgraph_in_distance_range(*distance_index, opt_anchoring_aln.path(), xindex, min_distance, max_distance, + subgraph_nodes_to_add, rescue_forward); + + // this algorithm is better matched to the GBWTGraph, we need to extract the subgraph manually now. + // we'll use an algorithm that tries to follow edges to find nodes. 
this way we minimize calls to XG's + // get_handle, and we have to follow all edges anyway to add them + + while (!subgraph_nodes_to_add.empty()) { + // there's at least one node that we haven't added yet + + // initialize a search out from an arbitrary unadded node + id_t node_id = *subgraph_nodes_to_add.begin(); + subgraph_nodes_to_add.erase(node_id); + handle_t start_handle = xindex->get_handle(node_id); + rescue_graph->create_handle(xindex->get_sequence(start_handle), + xindex->get_id(start_handle)); + vector stack(1, start_handle); + while (!stack.empty()) { + handle_t super_handle = stack.back(); + stack.pop_back(); + for (bool go_left : {true, false}) { + xindex->follow_edges(super_handle, go_left, [&](const handle_t& neighbor) { + + if (subgraph_nodes_to_add.count(xindex->get_id(neighbor))) { + // we've found a new node that we haven't added yet, add it to the graph + // and the queue, and erase from the nodes left to add + subgraph_nodes_to_add.erase(xindex->get_id(neighbor)); + rescue_graph->create_handle(xindex->get_sequence(xindex->forward(neighbor)), + xindex->get_id(neighbor)); + stack.push_back(neighbor); + } + + if (rescue_graph->has_node(xindex->get_id(neighbor))) { + // we always know that the handle we're coming from is in the subgraph, it + // seems that this neighbor is as well, so add the edge + + handle_t sub_handle = rescue_graph->get_handle(xindex->get_id(super_handle), + xindex->get_is_reverse(super_handle)); + handle_t sub_neighbor = rescue_graph->get_handle(xindex->get_id(neighbor), + xindex->get_is_reverse(neighbor)); + + // edges automatically deduplicate, so don't worry about checking whether + // it exists + if (go_left) { + rescue_graph->create_edge(sub_neighbor, sub_handle); + } + else { + rescue_graph->create_edge(sub_handle, sub_neighbor); + } + } + }); + } + } + } } - - return true; } void MultipathMapper::init_band_padding_memo() { @@ -474,194 +904,369 @@ namespace vg { band_padding_memo[i] = size_t(band_padding_multiplier * sqrt(i)) + 1; } } + + void MultipathMapper::set_alignment_scores(const int8_t* score_matrix, int8_t gap_open, int8_t gap_extend, + int8_t full_length_bonus) { + AlignerClient::set_alignment_scores(score_matrix, gap_open, gap_extend, full_length_bonus); + splice_stats.update_scoring(*get_regular_aligner()); + set_min_softclip_length_for_splice(min_softclip_length_for_splice); + set_log_odds_against_splice(no_splice_natural_log_odds); + set_max_merge_supression_length(); + } + - bool MultipathMapper::likely_mismapping(const MultipathAlignment& multipath_aln) { - - // empirically, we get better results by scaling the pseudo-length down, I have no good explanation for this probabilistically - auto p_val = random_match_p_value(pseudo_length(multipath_aln) / 3, multipath_aln.sequence().size()); - + bool MultipathMapper::likely_mismapping(const multipath_alignment_t& multipath_aln) { + if (!suppress_mismapping_detection) { + + auto p_val = random_match_p_value(pseudo_length(multipath_aln), multipath_aln.sequence().size()); + +#ifdef debug_multipath_mapper + cerr << "effective match length of read " << multipath_aln.sequence() << " is " << pseudo_length(multipath_aln) << " in read length " << multipath_aln.sequence().size() << ", yielding p-value " << p_val << endl; +#endif + + return p_val > max_mapping_p_value; + } + else { + return false; + } + } + + bool MultipathMapper::likely_misrescue(const multipath_alignment_t& multipath_aln) { + auto p_val = random_match_p_value(pseudo_length(multipath_aln), multipath_aln.sequence().size()); + 
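For intuition about the check above and the `random_match_p_value` definition just below: the best score achievable by a random match is modelled as a "max-exponential", i.e. the maximum of many exponential variables, whose CDF has the closed form (1 - e^(-rate*x))^shape, with `rate` linear in read length and `shape` log-linear in read length as fit by `calibrate_mismapping_detection`. The following is a minimal standalone sketch under that assumption; the helper name and the closed form mirror how the call is used here, but the real `max_exponential_cdf` lives elsewhere in vg and may differ in detail:

```
#include <cmath>
#include <cstdio>

// Assumed closed form: the max of `shape` i.i.d. Exponential(rate) variables
// has CDF (1 - e^{-rate*x})^shape. (Illustrative stand-in for vg's helper.)
static double max_exponential_cdf(double x, double rate, double shape) {
    return std::pow(1.0 - std::exp(-rate * x), shape);
}

// Sketch of the p-value with length-dependent parameters, mirroring the
// regression form used by the mapper: rate = a + b*L, shape = exp(c + d*L).
static double random_match_p_value_sketch(double match_score, double read_length,
                                          double rate_intercept, double rate_slope,
                                          double shape_intercept, double shape_slope) {
    double rate = rate_intercept + rate_slope * read_length;
    double shape = std::exp(shape_intercept + shape_slope * read_length);
    return 1.0 - max_exponential_cdf(match_score, rate, shape);
}

int main() {
    // Coefficients invented purely for illustration; real values come from calibration.
    double p = random_match_p_value_sketch(/*match_score=*/55.0, /*read_length=*/150.0,
                                           0.1, 0.001, 1.0, 0.02);
    std::printf("p-value of reaching this score by chance: %g\n", p);
    return 0;
}
```

Alignments whose p-value exceeds `max_mapping_p_value` (or `max_rescue_p_value` for rescues) are treated as likely random matches, which is exactly the comparison made in `likely_mismapping` and `likely_misrescue`.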
#ifdef debug_multipath_mapper - cerr << "effective match length of read " << multipath_aln.name() << " is " << pseudo_length(multipath_aln) / 3 << " in read length " << multipath_aln.sequence().size() << ", yielding p-value " << p_val << endl; + cerr << "effective match length of rescued read " << multipath_aln.sequence() << " is " << pseudo_length(multipath_aln) << " in read length " << multipath_aln.sequence().size() << ", yielding p-value " << p_val << endl; #endif - return p_val > max_mapping_p_value; + return p_val > max_rescue_p_value; } + - size_t MultipathMapper::pseudo_length(const MultipathAlignment& multipath_aln) const { - Alignment alignment; - optimal_alignment(multipath_aln, alignment); - const Path& path = alignment.path(); - - int64_t net_matches = 0; - for (size_t i = 0; i < path.mapping_size(); i++) { - const Mapping& mapping = path.mapping(i); - for (size_t j = 0; j < mapping.edit_size(); j++) { - const Edit& edit = mapping.edit(j); - - // skip soft clips - if (((i == 0 && j == 0) || (i == path.mapping_size() - 1 && j == mapping.edit_size() - 1)) - && edit.from_length() == 0 && edit.to_length() > 0) { - continue; - } - - // add matches and subtract mismatches/indels - if (edit.from_length() == edit.to_length() && edit.sequence().empty()) { - net_matches += edit.from_length(); - } - else { - net_matches -= max(edit.from_length(), edit.to_length()); - } - } - } - - return max(0, net_matches); + int64_t MultipathMapper::pseudo_length(const multipath_alignment_t& multipath_aln) const { + return optimal_alignment_score(multipath_aln); } // make the memo live in this .o file - thread_local unordered_map, double> MultipathMapper::p_value_memo; + thread_local unordered_map, double> MultipathMapper::p_value_memo; - double MultipathMapper::random_match_p_value(size_t match_length, size_t read_length) { + double MultipathMapper::random_match_p_value(int64_t match_length, size_t read_length) { // memoized to avoid transcendental functions (at least in cases where read lengths don't vary too much) auto iter = p_value_memo.find(make_pair(match_length, read_length)); if (iter != p_value_memo.end()) { return iter->second; } else { - double p_value = 1.0 - pow(1.0 - exp(-(match_length * pseudo_length_multiplier)), xindex->seq_length * read_length); - if (p_value_memo.size() < max_p_value_memo_size) { + double rate = max_exponential_rate_intercept + max_exponential_rate_slope * read_length; + double shape = exp(max_exponential_shape_intercept + max_exponential_shape_slope * read_length); + double p_value = 1.0 - max_exponential_cdf(match_length, rate, shape); + if (p_value_memo.size() < max_p_value_memo_size && !suppress_p_value_memoization) { p_value_memo[make_pair(match_length, read_length)] = p_value; } return p_value; } } - - void MultipathMapper::calibrate_mismapping_detection(size_t num_simulations, size_t simulated_read_length) { + void MultipathMapper::calibrate_mismapping_detection(size_t num_simulations, const vector& simulated_read_lengths) { + + // we are calibrating the parameters, so we don't want to memoize any p-values using the default values + suppress_p_value_memoization = true; + // we don't want to do base quality adjusted alignments for this stage since we are just simulating random sequences // with no base qualities bool reset_quality_adjustments = adjust_alignments_for_base_quality; adjust_alignments_for_base_quality = false; - // compute the pseudo length of a bunch of randomly generated sequences - vector lengths(num_simulations, 0.0); - double length_sum = 
0.0; - double max_length = numeric_limits::min(); -#pragma omp parallel for - for (size_t i = 0; i < num_simulations; i++) { - - Alignment alignment; - alignment.set_sequence(random_sequence(simulated_read_length)); - vector multipath_alns; - multipath_map(alignment, multipath_alns, 1); - - if (!multipath_alns.empty()) { - lengths[i] = pseudo_length(multipath_alns.front()); -#pragma omp critical - { - length_sum += lengths[i]; - max_length = max(lengths[i], max_length); - } - } - } + // these p-values will eventually be used internally where the scores still have the bonus applied, so we + // want to make sure we don't strip them off here + bool reset_strip_bonuses = strip_bonuses; + strip_bonuses = false; - // reset the memo of p-values (which we are calibrating) for any updates using the default parameter during the null mappings - p_value_memo.clear(); + // we want to avoid returning empty alignments that indicated unmapped reads + bool reset_suppress_mismapping_detection = suppress_mismapping_detection; + suppress_mismapping_detection = true; - // model the lengths as the maximum of genome_size * read_length exponential variables, which gives density function: - // - // GLS exp(-Sx) (1 - exp(-Sx))^(GL - 1) - // - // where: - // G = genome size - // R = read length - // S = scale parameter (which we optimize below) + // and we expect small MEMs, so don't filter them out + int reset_min_mem_length = min_mem_length; + size_t reset_min_clustering_mem_length = min_clustering_mem_length; + min_mem_length = 1; + min_clustering_mem_length = 1; + // and these reads are slow to map, but we only need primaries + size_t reset_max_alt_mappings = max_alt_mappings; + max_alt_mappings = 1; - // compute the log of the 1st and 2nd derivatives for the log likelihood (split up by positive and negative summands) - // we have to do it this wonky way because the exponentiated numbers get very large and cause overflow otherwise + // reset the memo of p-values (which we are calibrating) for any updates using the default parameter during the null mappings + p_value_memo.clear(); - double log_deriv_neg_part = log(length_sum); + // the logarithms of the MLE estimators at each read length + vector mle_max_exponential_rates; + vector log_mle_max_exponential_shapes; - function log_deriv_pos_part = [&](double scale) { - double accumulator = numeric_limits::lowest(); - for (size_t i = 0; i < lengths.size(); i++) { - double length = lengths[i]; - accumulator = add_log(accumulator, log(length) - scale * length - log(1.0 - exp(-scale * length))); + for (const size_t simulated_read_length : simulated_read_lengths) { + // compute the pseudo length of a bunch of randomly generated sequences + vector pseudo_lengths(num_simulations, 0.0); +#pragma omp parallel for + for (size_t i = 0; i < num_simulations; i++) { + + Alignment alignment; + alignment.set_sequence(pseudo_random_sequence(simulated_read_length, i * 716293332 + simulated_read_length)); + vector multipath_alns; + multipath_map(alignment, multipath_alns); + + if (!multipath_alns.empty()) { + pseudo_lengths[i] = pseudo_length(multipath_alns.front()); + } } - accumulator += log(xindex->seq_length * simulated_read_length - 1.0); - return add_log(accumulator, log(num_simulations / scale)); - }; - - function log_deriv2_neg_part = [&](double scale) { - double accumulator = numeric_limits::lowest(); - for (size_t i = 0; i < lengths.size(); i++) { - double length = lengths[i]; - accumulator = add_log(accumulator, 2.0 * log(length) - scale * length - 2.0 * log(1.0 - 
exp(-scale * length))); + + auto max_exp_params = fit_max_exponential(pseudo_lengths); + mle_max_exponential_rates.push_back(max_exp_params.first); + log_mle_max_exponential_shapes.push_back(log(max_exp_params.second)); + +#ifdef debug_report_startup_training + unordered_map length_counts; + for (auto length : pseudo_lengths) { + length_counts[round(length)]++; } - accumulator += log(xindex->seq_length * simulated_read_length - 1.0); - return add_log(accumulator, log(num_simulations / (scale * scale))); - }; - - // use Newton's method to find the MLE - double tolerance = 1e-10; - double scale = 1.0 / max_length; - double prev_scale = scale * (1.0 + 10.0 * tolerance); - while (abs(prev_scale / scale - 1.0) > tolerance) { - prev_scale = scale; - double log_d2 = log_deriv2_neg_part(scale); - double log_d_pos = log_deriv_pos_part(scale); - double log_d_neg = log_deriv_neg_part; - // determine if the value of the 1st deriv is positive or negative, and compute the - // whole ratio to the 2nd deriv from the positive and negative parts accordingly - if (log_d_pos > log_d_neg) { - scale += exp(subtract_log(log_d_pos, log_d_neg) - log_d2); - } - else { - scale -= exp(subtract_log(log_d_neg, log_d_pos) - log_d2); + vector> sorted_length_counts(length_counts.begin(), length_counts.end()); + sort(sorted_length_counts.begin(), sorted_length_counts.end()); + cerr << "data for length " << simulated_read_length << endl; + for (auto length_count : sorted_length_counts) { + cerr << "\t" << length_count.first << ": " << length_count.second << endl; } + cerr << "trained parameters for length " << simulated_read_length << ": " << endl; + cerr << "\tmax exp rate: " << max_exp_params.first << endl; + cerr << "\tmax exp shape: " << max_exp_params.second << endl; +#endif + } + + // make a design matrix for a log regression and a linear regression + vector> X(simulated_read_lengths.size()); + for (size_t i = 0; i < X.size(); ++i) { + X[i].resize(2, 1.0); + X[i][1] = simulated_read_lengths[i]; } + auto max_exp_rate_coefs = regress(X, mle_max_exponential_rates); + auto max_exp_shape_coefs = regress(X, log_mle_max_exponential_shapes); + + max_exponential_rate_intercept = max_exp_rate_coefs[0]; + max_exponential_rate_slope = max_exp_rate_coefs[1]; + max_exponential_shape_intercept = max_exp_shape_coefs[0]; + max_exponential_shape_slope = max_exp_shape_coefs[1]; + #ifdef debug_report_startup_training - cerr << "trained scale: " << scale << endl; + cerr << "final regression parameters:" << endl; + cerr << "\tmax exp rate = " << max_exponential_rate_intercept << " + " << max_exponential_rate_slope << " * L" << endl; + cerr << "\tmax exp shape = exp(" << max_exponential_shape_intercept << " + " << max_exponential_shape_slope << " * L)" << endl; #endif - // set the multipler to the maximimum likelihood - pseudo_length_multiplier = scale; - + // reset mapping parameters to their original values adjust_alignments_for_base_quality = reset_quality_adjustments; + strip_bonuses = reset_strip_bonuses; + min_clustering_mem_length = reset_min_clustering_mem_length; + min_mem_length = reset_min_mem_length; + max_alt_mappings = reset_max_alt_mappings; + suppress_p_value_memoization = false; + suppress_mismapping_detection = reset_suppress_mismapping_detection; + } + + void MultipathMapper::determine_distance_correlation() { + + // FIXME: very experimental, not sure if i actually want to do this + + if (!distance_index || ref_path_handles.empty()) { + return; + } + + vector refs(ref_path_handles.begin(), ref_path_handles.end()); + 
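As an aside on the calibration step just above: `regress` is applied to a two-column design matrix whose rows are `[1, L]`, so each fit yields an intercept and a slope per response (the rate directly, and the log of the shape). A minimal ordinary-least-squares sketch of a two-coefficient fit via the normal equations follows; it assumes, hypothetically, that `regress` is plain OLS, which the actual implementation may refine:

```
#include <array>
#include <cstddef>
#include <cstdio>
#include <vector>

// Fit y ~= b0 + b1 * x by ordinary least squares using the 2x2 normal equations.
// This sketches what a two-coefficient regress(X, y) over a [1, L] design computes.
static std::array<double, 2> ols_fit(const std::vector<double>& x, const std::vector<double>& y) {
    double n = x.size(), sx = 0.0, sy = 0.0, sxx = 0.0, sxy = 0.0;
    for (std::size_t i = 0; i < x.size(); ++i) {
        sx += x[i]; sy += y[i]; sxx += x[i] * x[i]; sxy += x[i] * y[i];
    }
    double denom = n * sxx - sx * sx;
    double slope = (n * sxy - sx * sy) / denom;
    double intercept = (sy - slope * sx) / n;
    return {intercept, slope};
}

int main() {
    // Hypothetical read lengths and fitted MLE rates, standing in for the simulated data.
    std::vector<double> read_lengths = {50, 100, 150, 250};
    std::vector<double> mle_rates = {0.21, 0.16, 0.12, 0.05};
    auto coefs = ols_fit(read_lengths, mle_rates);
    std::printf("rate ~= %g + %g * L\n", coefs[0], coefs[1]);
    return 0;
}
```

The same fit is applied to log(shape), which is why the shape parameter is exponentiated again when the p-value is evaluated.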
sort(refs.begin(), refs.end(), [&](path_handle_t a, path_handle_t b) { + return xindex->get_path_name(a) < xindex->get_path_name(b); + }); + vector ref_weights(refs.size()); + for (size_t i = 0; i < refs.size(); ++i) { + ref_weights[i] = xindex->get_path_length(refs[i]); + } + + SnarlOrientedDistanceMeasurer measurer(distance_index); + + discrete_distribution ref_distr(ref_weights.begin(), ref_weights.end()); + mt19937 gen; + gen.seed(749753582ul); + + vector> positions; + vector> distances; + + int num_measurements = 100 * 1000; + int radius = 50 * 1000; + for (int i = 0; i < num_measurements; ++i) { + + path_handle_t ref = refs[ref_distr(gen)]; + if (xindex->get_path_length(ref) < 2) { + continue; + } + + // don't include the past-the-last, which makes things tricky + uniform_int_distribution off_distr_1(1, xindex->get_path_length(ref) - 1); + size_t path_off_1 = off_distr_1(gen); + + uniform_int_distribution off_distr_2(max(path_off_1 - radius, 1), + min(path_off_1 + radius, xindex->get_path_length(ref) - 1)); + size_t path_off_2 = off_distr_2(gen); + + auto step_1 = xindex->get_step_at_position(ref, path_off_1); + auto step_2 = xindex->get_step_at_position(ref, path_off_2); + + pos_t pos_1(xindex->get_id(xindex->get_handle_of_step(step_1)), + xindex->get_is_reverse(xindex->get_handle_of_step(step_1)), + path_off_1 - xindex->get_position_of_step(step_1)); + pos_t pos_2(xindex->get_id(xindex->get_handle_of_step(step_2)), + xindex->get_is_reverse(xindex->get_handle_of_step(step_2)), + path_off_2 - xindex->get_position_of_step(step_2)); + + positions.emplace_back(); + get<0>(positions.back()) = ref; + get<1>(positions.back()) = path_off_1; + get<2>(positions.back()) = path_off_2; + get<3>(positions.back()) = pos_1; + get<4>(positions.back()) = pos_2; + + pos_t rev_pos_1(id(pos_1), !is_rev(pos_1), + xindex->get_length(xindex->get_handle_of_step(step_1)) - offset(pos_1)); + + vector row; + row.push_back(algorithms::ref_path_distance(xindex, pos_1, pos_2, ref_path_handles, + max_splice_ref_search_length)); + row.push_back(measurer.oriented_distance(pos_1, pos_2)); + row.push_back(algorithms::ref_path_distance(xindex, rev_pos_1, pos_2, ref_path_handles, + max_splice_ref_search_length)); + row.push_back(measurer.oriented_distance(rev_pos_1, pos_2)); + + distances.emplace_back(); + for (auto d : row) { + distances.back().push_back(d == numeric_limits::max() ? 
numeric_limits::quiet_NaN() : d); + } + + + } +#ifdef debug_output_distance_correlation + for (size_t j = 0; j < distances.size(); ++j) { + auto& pos_row = positions[j]; + cerr << xindex->get_path_name(get<0>(pos_row)); + cerr << "\t" << get<1>(pos_row); + cerr << "\t" << get<2>(pos_row); + cerr << "\t" << get<3>(pos_row); + cerr << "\t" << get<4>(pos_row); + auto& row = distances[j]; + for (size_t i = 0; i < row.size(); ++i) { + cerr << "\t"; + cerr << row[i]; + } + cerr << endl; + } +#endif + } + + unique_ptr MultipathMapper::get_distance_measurer(MemoizingGraph& memoizing_graph) const { + + unique_ptr distance_measurer; + if (distance_index) { +#ifdef debug_multipath_mapper + cerr << "using a snarl-based distance measurer (if doing oriented distance clustering)" << endl; +#endif + distance_measurer = unique_ptr(new SnarlOrientedDistanceMeasurer(distance_index)); + } + else { +#ifdef debug_multipath_mapper + cerr << "using a path-based distance measurer (if doing oriented distance clustering)" << endl; +#endif + distance_measurer = unique_ptr(new PathOrientedDistanceMeasurer(&memoizing_graph, + path_component_index.get())); + } + return distance_measurer; } - int64_t MultipathMapper::distance_between(const MultipathAlignment& multipath_aln_1, - const MultipathAlignment& multipath_aln_2, + int64_t MultipathMapper::distance_between(const multipath_alignment_t& multipath_aln_1, + const multipath_alignment_t& multipath_aln_2, bool full_fragment, bool forward_strand) const { + + if (multipath_aln_1.subpath_size() == 0 || multipath_aln_2.subpath_size() == 0) { + // Something is an unmapped alignment + return numeric_limits::max(); + } + Alignment aln_1; optimal_alignment(multipath_aln_1, aln_1); - pos_t pos_1 = initial_position(aln_1.path()); + pos_t pos_1; + if (aln_1.path().mapping_size() != 0) { + pos_1 = initial_position(aln_1.path()); + } + else { + // the whole thing is negative scoring, take an arbitray postition just to not break + pos_1 = make_pos_t(multipath_aln_1.subpath().front().path().mapping().front().position()); + } Alignment aln_2; optimal_alignment(multipath_aln_2, aln_2); - pos_t pos_2 = full_fragment ? final_position(aln_2.path()) : initial_position(aln_2.path()); + pos_t pos_2; + if (aln_2.path().mapping_size() != 0) { + pos_2 = full_fragment ? final_position(aln_2.path()) : initial_position(aln_2.path()); + } + else { + // the whole thing is negative scoring, take an arbitray postition just to not break + pos_2 = make_pos_t(multipath_aln_2.subpath().front().path().mapping().front().position()); + } #ifdef debug_multipath_mapper cerr << "measuring left-to-" << (full_fragment ? 
"right" : "left") << " end distance between " << pos_1 << " and " << pos_2 << endl; #endif - return xindex->closest_shared_path_oriented_distance(id(pos_1), offset(pos_1), is_rev(pos_1), - id(pos_2), offset(pos_2), is_rev(pos_2), - forward_strand); + + int64_t dist; + if (use_min_dist_clusterer || use_tvs_clusterer) { + assert(!forward_strand); + // measure the distance in both directions and choose the minimum (or the only) absolute distance + size_t forward_dist = minimum_distance(*distance_index,pos_1, pos_2, false, xindex); + size_t reverse_dist = minimum_distance(*distance_index,pos_2, pos_1, false, xindex); + if (forward_dist == std::numeric_limits::max() && reverse_dist == std::numeric_limits::max()) { + // unreachable both ways, convert to the sentinel that the client code expects + dist = numeric_limits::max(); + } + else if (forward_dist == std::numeric_limits::max() || + (reverse_dist < forward_dist && reverse_dist != std::numeric_limits::max())) { + dist = - (int64_t)reverse_dist; + } + else { + dist = forward_dist; + } + } + else { + PathOrientedDistanceMeasurer measurer(xindex); + dist = measurer.oriented_distance(pos_1, pos_2); + } + return dist; + } + + int64_t MultipathMapper::distance(const pos_t& pos_1, const pos_t& pos_2) const { + if (distance_index) { + size_t distance = minimum_distance(*distance_index, pos_1, pos_2, false, xindex); + return distance == std::numeric_limits::max() ? -1 : (int64_t)distance; + } + else { + return PathOrientedDistanceMeasurer(xindex).oriented_distance(pos_1, pos_2); + } } bool MultipathMapper::is_consistent(int64_t distance) const { - return (distance < fragment_length_distr.mean() + 10.0 * fragment_length_distr.stdev() - && distance > fragment_length_distr.mean() - 10.0 * fragment_length_distr.stdev()); + return (distance < fragment_length_distr.mean() + 10.0 * fragment_length_distr.std_dev() + && distance > fragment_length_distr.mean() - 10.0 * fragment_length_distr.std_dev()); } - bool MultipathMapper::are_consistent(const MultipathAlignment& multipath_aln_1, - const MultipathAlignment& multipath_aln_2) const { + bool MultipathMapper::are_consistent(const multipath_alignment_t& multipath_aln_1, + const multipath_alignment_t& multipath_aln_2) const { return is_consistent(distance_between(multipath_aln_1, multipath_aln_2, true)); } - bool MultipathMapper::share_terminal_positions(const MultipathAlignment& multipath_aln_1, - const MultipathAlignment& multipath_aln_2) const { + bool MultipathMapper::share_terminal_positions(const multipath_alignment_t& multipath_aln_1, + const multipath_alignment_t& multipath_aln_2) const { unordered_set terminal_positions; @@ -681,14 +1286,14 @@ namespace vg { // now look for matching ends for (size_t i = 0; i < multipath_aln_1.subpath_size(); i++) { - const Subpath& subpath = multipath_aln_1.subpath(i); + const subpath_t& subpath = multipath_aln_1.subpath(i); if (subpath.next_size() == 0) { terminal_positions.insert(final_position(subpath.path())); } } for (size_t i = 0; i < multipath_aln_2.subpath_size(); i++) { - const Subpath& subpath = multipath_aln_2.subpath(i); + const subpath_t& subpath = multipath_aln_2.subpath(i); if (subpath.next_size() == 0) { if (terminal_positions.count(final_position(subpath.path()))) { return true; @@ -699,99 +1304,32 @@ namespace vg { return false; } - void MultipathMapper::establish_strand_consistency(vector>& multipath_aln_pairs, - vector, int64_t>>& cluster_pairs, - OrientedDistanceClusterer::paths_of_node_memo_t* paths_of_node_memo, - 
OrientedDistanceClusterer::oriented_occurences_memo_t* oriented_occurences_memo, - OrientedDistanceClusterer::handle_memo_t* handle_memo) { - -#ifdef debug_multipath_mapper - cerr << "establishing consistency between mapped pairs" << endl; -#endif - - int64_t search_dist = 0.5 * fragment_length_distr.mean() + 5.0 * fragment_length_distr.stdev(); - vector> strand_assignments; - strand_assignments.reserve(multipath_aln_pairs.size()); - for (const pair& multipath_aln_pair : multipath_aln_pairs) { - Alignment optimal_aln_1, optimal_aln_2; - optimal_alignment(multipath_aln_pair.first, optimal_aln_1); - optimal_alignment(multipath_aln_pair.second, optimal_aln_2); - pos_t pos_1 = initial_position(optimal_aln_1.path()); - pos_t pos_2 = initial_position(optimal_aln_2.path()); - - strand_assignments.push_back(xindex->validate_strand_consistency(id(pos_1), offset(pos_1), is_rev(pos_1), - id(pos_2), offset(pos_2), is_rev(pos_2), - search_dist, paths_of_node_memo, - oriented_occurences_memo, handle_memo)); - -#ifdef debug_multipath_mapper - cerr << "pair has initial positions " << pos_1 << " and " << pos_2 << " on strands " << (strand_assignments.back().first ? "-" : "+") << " and " << (strand_assignments.back().second ? "-" : "+") << endl; -#endif - } - - size_t end = multipath_aln_pairs.size(); - for (size_t i = 0; i < end; ) { - // move strand inconsistent mappings to the end - if (strand_assignments[i].first != strand_assignments[i].second) { -#ifdef debug_multipath_mapper - cerr << "removing inconsistent strand at " << i << " and not advancing index" << endl; -#endif - std::swap(multipath_aln_pairs[i], multipath_aln_pairs[end - 1]); - std::swap(strand_assignments[i], strand_assignments[end - 1]); - std::swap(cluster_pairs[i], cluster_pairs[end - 1]); - - --end; - } - else { -#ifdef debug_multipath_mapper - cerr << "identifying " << i << " as consistent" << endl; -#endif - // reverse the distance if it's on the reverse strand - if (strand_assignments[i].first) { -#ifdef debug_multipath_mapper - cerr << "\tinverting distance " << cluster_pairs[i].second << " because on negative strand" << endl; -#endif - cluster_pairs[i].second = -cluster_pairs[i].second; - } - ++i; - } - } - - // remove the inconsistent mappings - if (end != multipath_aln_pairs.size()) { -#ifdef debug_multipath_mapper - cerr << "found " << multipath_aln_pairs.size() - end << " strand inconsitent pairs, removing now" << endl; -#endif - multipath_aln_pairs.resize(end); - cluster_pairs.resize(end); - } - } - bool MultipathMapper::align_to_cluster_graphs_with_rescue(const Alignment& alignment1, const Alignment& alignment2, vector& cluster_graphs1, vector& cluster_graphs2, - bool block_rescue_from_1, bool block_rescue_from_2, - vector>& multipath_aln_pairs_out, + vector& mems1, + vector& mems2, + vector>& multipath_aln_pairs_out, vector, int64_t>>& pair_distances, - size_t max_alt_mappings) { + vector& pair_multiplicities, + const match_fanouts_t* fanouts1, const match_fanouts_t* fanouts2) { - vector multipath_alns_1, multipath_alns_2; + // align the two ends independently + vector multipath_alns_1, multipath_alns_2; vector cluster_idxs_1, cluster_idxs_2; - if (!block_rescue_from_1) { - cluster_idxs_1 = range_vector(cluster_graphs1.size()); - align_to_cluster_graphs(alignment1, mapping_quality_method == None ? 
Approx : mapping_quality_method, - cluster_graphs1, multipath_alns_1, max_single_end_mappings_for_rescue, &cluster_idxs_1); - } - if (!block_rescue_from_2) { - cluster_idxs_2 = range_vector(cluster_graphs2.size()); - align_to_cluster_graphs(alignment2, mapping_quality_method == None ? Approx : mapping_quality_method, - cluster_graphs2, multipath_alns_2, max_single_end_mappings_for_rescue, &cluster_idxs_2); - } - - if (multipath_alns_1.empty() || multipath_alns_2.empty() ? false : - ((multipath_alns_1.front().mapping_quality() >= min(60, max_mapping_quality) - && multipath_alns_2.front().mapping_quality() >= min(60, max_mapping_quality)) ? - are_consistent(multipath_alns_1.front(), multipath_alns_2.front()) : false)) { + vector multiplicities_1, multiplicities_2; + align_to_cluster_graphs(alignment1, + cluster_graphs1, multipath_alns_1, multiplicities_1, max_single_end_mappings_for_rescue, + fanouts1, &cluster_idxs_1); + align_to_cluster_graphs(alignment2, + cluster_graphs2, multipath_alns_2, multiplicities_2, max_single_end_mappings_for_rescue, + fanouts2, &cluster_idxs_2); + + if (!multipath_alns_1.empty() && + !multipath_alns_2.empty() && + multipath_alns_1.front().mapping_quality() >= min(60, max_mapping_quality) && + multipath_alns_2.front().mapping_quality() >= min(60, max_mapping_quality) && + are_consistent(multipath_alns_1.front(), multipath_alns_2.front())) { // we are able to obtain confident matches that satisfy the pairing constraints #ifdef debug_multipath_mapper @@ -800,16 +1338,20 @@ namespace vg { multipath_aln_pairs_out.emplace_back(move(multipath_alns_1.front()), move(multipath_alns_2.front())); pair_distances.emplace_back(make_pair(cluster_idxs_1.front(), cluster_idxs_2.front()), distance_between(multipath_aln_pairs_out.back().first, multipath_aln_pairs_out.back().second, true)); + pair_multiplicities.emplace_back(min(multiplicities_1.front(), multiplicities_2.front())); return true; } - int32_t max_score_diff = get_aligner()->mapping_quality_score_diff(max_mapping_quality); + // figure out how many rescues we will do and could do from each side + + int32_t max_score_diff = get_aligner(!alignment1.quality().empty() && + !alignment2.quality().empty())->mapping_quality_score_diff(max_mapping_quality); int32_t top_score_1 = multipath_alns_1.empty() ? 0 : optimal_alignment_score(multipath_alns_1.front()); int32_t top_score_2 = multipath_alns_2.empty() ? 0 : optimal_alignment_score(multipath_alns_2.front()); - size_t num_rescuable_alns_1 = block_rescue_from_1 ? 0 : min(multipath_alns_1.size(), max_rescue_attempts); - size_t num_rescuable_alns_2 = block_rescue_from_2 ? 0 : min(multipath_alns_2.size(), max_rescue_attempts); + size_t num_rescuable_alns_1 = multipath_alns_1.size(); + size_t num_rescuable_alns_2 = multipath_alns_2.size(); for (size_t i = 0; i < num_rescuable_alns_1; i++){ if (likely_mismapping(multipath_alns_1[i]) || (i > 0 ? 
optimal_alignment_score(multipath_alns_1[i]) < top_score_1 - max_score_diff : false)) { @@ -824,31 +1366,40 @@ namespace vg { break; } } - - vector rescue_multipath_alns_1(num_rescuable_alns_2), rescue_multipath_alns_2(num_rescuable_alns_1); - - unordered_set rescued_from_1, rescued_from_2; + size_t num_to_rescue_1 = min(num_rescuable_alns_1, max_rescue_attempts); + size_t num_to_rescue_2 = min(num_rescuable_alns_2, max_rescue_attempts); #ifdef debug_multipath_mapper - cerr << "rescuing from " << num_rescuable_alns_1 << " read1's and " << num_rescuable_alns_2 << " read2's" << endl; + cerr << "rescuing from " << num_to_rescue_1 << " read1's and " << num_to_rescue_2 << " read2's" << endl; #endif - for (size_t i = 0; i < num_rescuable_alns_1; i++) { - MultipathAlignment rescue_multipath_aln; + // calculate the estimated multiplicity of a pair found from each of the two ends + double estimated_multiplicity_from_1 = num_to_rescue_1 > 0 ? double(num_rescuable_alns_1) / num_to_rescue_1 : 1.0; + double estimated_multiplicity_from_2 = num_to_rescue_2 > 0 ? double(num_rescuable_alns_2) / num_to_rescue_2 : 1.0; + + // actually doe the rescues and record which ones succeeded + vector rescue_multipath_alns_1(num_to_rescue_2), rescue_multipath_alns_2(num_to_rescue_1); + unordered_set rescued_from_1, rescued_from_2; + + for (size_t i = 0; i < num_to_rescue_1; i++) { + multipath_alignment_t rescue_multipath_aln; if (attempt_rescue(multipath_alns_1[i], alignment2, true, rescue_multipath_aln)) { rescued_from_1.insert(i); rescue_multipath_alns_2[i] = move(rescue_multipath_aln); } } - for (size_t i = 0; i < num_rescuable_alns_2; i++) { - MultipathAlignment rescue_multipath_aln; + for (size_t i = 0; i < num_to_rescue_2; i++) { + multipath_alignment_t rescue_multipath_aln; if (attempt_rescue(multipath_alns_2[i], alignment1, false, rescue_multipath_aln)) { rescued_from_2.insert(i); rescue_multipath_alns_1[i] = move(rescue_multipath_aln); } } + // follow some complicated logic to check if any of the rescued alignments are duplicates + // of the original alignments + bool found_consistent = false; if (!rescued_from_1.empty() && !rescued_from_2.empty()) { #ifdef debug_multipath_mapper @@ -868,11 +1419,11 @@ namespace vg { #ifdef debug_multipath_mapper cerr << "checking duplication between mapped read1 " << i << " and rescued read1 " << j << endl; #endif - if (abs(distance_between(multipath_alns_1[i], rescue_multipath_alns_1[j])) < 20) { + if (share_terminal_positions(multipath_alns_1[i], rescue_multipath_alns_1[j])) { #ifdef debug_multipath_mapper cerr << "found duplicate, now checking rescued read2 " << i << " and mapped read2 " << j << endl; #endif - if (abs(distance_between(rescue_multipath_alns_2[i], multipath_alns_2[j])) < 20) { + if (share_terminal_positions(rescue_multipath_alns_2[i], multipath_alns_2[j])) { #ifdef debug_multipath_mapper cerr << "found duplicate, marking entire pair as duplicate" << endl; #endif @@ -885,6 +1436,8 @@ namespace vg { if (dist != numeric_limits::max() && dist >= 0) { multipath_aln_pairs_out.emplace_back(move(multipath_alns_1[i]), move(multipath_alns_2[j])); pair_distances.emplace_back(make_pair(cluster_idxs_1[i], cluster_idxs_2[j]), dist); + pair_multiplicities.emplace_back(min(estimated_multiplicity_from_1 * multiplicities_1[i], + estimated_multiplicity_from_2 * multiplicities_2[j])); found_consistent = true; } @@ -901,7 +1454,8 @@ namespace vg { cerr << "adding read1 and rescued read2 " << i << " to output vector" << endl; #endif 
multipath_aln_pairs_out.emplace_back(move(multipath_alns_1[i]), move(rescue_multipath_alns_2[i])); - pair_distances.emplace_back(make_pair(cluster_idxs_1[i], cluster_graphs2.size()), dist); + pair_distances.emplace_back(make_pair(cluster_idxs_1[i], RESCUED), dist); + pair_multiplicities.emplace_back(estimated_multiplicity_from_1 * multiplicities_1[i]); found_consistent = true; } } @@ -919,7 +1473,8 @@ namespace vg { cerr << "adding rescued read1 and read2 " << j << " to output vector" << endl; #endif multipath_aln_pairs_out.emplace_back(move(rescue_multipath_alns_1[j]), move(multipath_alns_2[j])); - pair_distances.emplace_back(make_pair(cluster_graphs1.size(), cluster_idxs_2[j]), dist); + pair_distances.emplace_back(make_pair(RESCUED, cluster_idxs_2[j]), dist); + pair_multiplicities.emplace_back(estimated_multiplicity_from_2 * multiplicities_2[j]); found_consistent = true; } } @@ -932,7 +1487,8 @@ namespace vg { int64_t dist = distance_between(multipath_alns_1[i], rescue_multipath_alns_2[i], true); if (dist != numeric_limits::max() && dist >= 0) { multipath_aln_pairs_out.emplace_back(move(multipath_alns_1[i]), move(rescue_multipath_alns_2[i])); - pair_distances.emplace_back(make_pair(cluster_idxs_1[i], cluster_graphs2.size()), dist); + pair_distances.emplace_back(make_pair(cluster_idxs_1[i], RESCUED), dist); + pair_multiplicities.emplace_back(estimated_multiplicity_from_1 * multiplicities_1[i]); found_consistent = true; } } @@ -945,21 +1501,72 @@ namespace vg { int64_t dist = distance_between(rescue_multipath_alns_1[i], multipath_alns_2[i], true); if (dist != numeric_limits::max() && dist >= 0) { multipath_aln_pairs_out.emplace_back(move(rescue_multipath_alns_1[i]), move(multipath_alns_2[i])); - pair_distances.emplace_back(make_pair(cluster_graphs1.size(), cluster_idxs_2[i]), dist); + pair_distances.emplace_back(make_pair(RESCUED, cluster_idxs_2[i]), dist); + pair_multiplicities.emplace_back(estimated_multiplicity_from_2 * multiplicities_2[i]); found_consistent = true; } } } - if (found_consistent) { - // compute the paired mapping quality - sort_and_compute_mapping_quality(multipath_aln_pairs_out, pair_distances); + if (!found_consistent && do_spliced_alignment) { +#ifdef debug_multipath_mapper + cerr << "rescue failed, doing independent spliced alignment and then re-attempting pairing" << endl; +#endif + +#ifdef debug_time_phases + clock_t start = clock(); +#endif + + multipath_alignment_t* rescue_anchor_1 = nullptr; + multipath_alignment_t* rescue_anchor_2 = nullptr; + double rescue_multiplicity_1 = 1.0, rescue_multiplicity_2 = 1.0; + if (!multipath_alns_1.empty()) { + rescue_anchor_1 = &multipath_alns_1.front(); + rescue_multiplicity_1 = multiplicities_1.front(); + } + if (!multipath_alns_2.empty()) { + rescue_anchor_2 = &multipath_alns_2.front(); + rescue_multiplicity_2 = multiplicities_2.front(); + } + + // find splices independently, also use the mate to rescue missing splice segments + bool did_splice_1 = false, did_splice_2 = false; + if (!multipath_alns_1.empty() && !likely_mismapping(multipath_alns_1.front())) { + did_splice_1 = find_spliced_alignments(alignment1, multipath_alns_1, multiplicities_1, cluster_idxs_1, + mems1, cluster_graphs1, fanouts1, + rescue_anchor_2, true, rescue_multiplicity_2); + } + if (!multipath_alns_2.empty() && !likely_mismapping(multipath_alns_2.front())) { + did_splice_2 = find_spliced_alignments(alignment2, multipath_alns_2, multiplicities_2, cluster_idxs_2, + mems2, cluster_graphs2, fanouts2, + rescue_anchor_1, false, rescue_multiplicity_1); + } + + 
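For reference, the pair multiplicities attached earlier in this function are a simple ratio-based correction for rescues that were skipped: each end contributes the fraction of rescuable alignments that were actually attempted, and a pair recovered from both ends keeps the smaller estimate. A tiny worked example of that arithmetic, with invented numbers:

```
#include <algorithm>
#include <cstdio>

int main() {
    // Suppose 9 read-1 alignments could anchor a rescue but only max_rescue_attempts = 4
    // were tried, while all 3 read-2 anchors were tried.
    double estimated_multiplicity_from_1 = 9.0 / 4.0;   // 2.25
    double estimated_multiplicity_from_2 = 3.0 / 3.0;   // 1.0

    // Per-end multiplicities carried over from single-end alignment (invented values).
    double multiplicity_1 = 1.0, multiplicity_2 = 2.0;

    // A pair found independently from both ends takes the more conservative (smaller) value.
    double pair_multiplicity = std::min(estimated_multiplicity_from_1 * multiplicity_1,
                                        estimated_multiplicity_from_2 * multiplicity_2);
    std::printf("pair multiplicity = %g\n", pair_multiplicity);  // min(2.25, 2.0) = 2
    return 0;
}
```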
if (did_splice_1 || did_splice_2) { + // it may now be possible to identify some pairs as properly paired using the spliced alignment + found_consistent = retry_pairing_spliced_alignments(alignment1, alignment2, multipath_alns_1, multipath_alns_2, + cluster_idxs_1, cluster_idxs_2, multiplicities_1, + multiplicities_2, multipath_aln_pairs_out, + pair_distances, pair_multiplicities); + } + +#ifdef debug_time_phases + cerr << "\trescue failed, spliced alignment used: " << double(clock() - start) / CLOCKS_PER_SEC << " secs" << endl; +#endif } - else { + + if (!found_consistent) { + #ifdef debug_multipath_mapper cerr << "failed to successfully rescue from either read end, reporting independent mappings" << endl; #endif + // agglomerate them them independently if necessary + if (agglomerate_multipath_alns) { + agglomerate_alignments(multipath_alns_1, &multiplicities_1); + agglomerate_alignments(multipath_alns_2, &multiplicities_2); + } + // rescue failed, so we just report these as independent mappings size_t num_pairs_to_report = min(max_alt_mappings, max(multipath_alns_1.size(), multipath_alns_2.size())); @@ -971,13 +1578,13 @@ namespace vg { } else if (i < multipath_alns_1.size()) { - multipath_aln_pairs_out.emplace_back(move(multipath_alns_1[i]), MultipathAlignment()); + multipath_aln_pairs_out.emplace_back(move(multipath_alns_1[i]), multipath_alignment_t()); to_multipath_alignment(alignment2, multipath_aln_pairs_out.back().second); multipath_aln_pairs_out.back().second.clear_subpath(); multipath_aln_pairs_out.back().second.clear_start(); } else { - multipath_aln_pairs_out.emplace_back(MultipathAlignment(), move(multipath_alns_2[i])); + multipath_aln_pairs_out.emplace_back(multipath_alignment_t(), move(multipath_alns_2[i])); to_multipath_alignment(alignment1, multipath_aln_pairs_out.back().first); multipath_aln_pairs_out.back().first.clear_subpath(); multipath_aln_pairs_out.back().first.clear_start(); @@ -985,31 +1592,29 @@ namespace vg { } } + if (found_consistent) { + // compute the paired mapping quality + sort_and_compute_mapping_quality(multipath_aln_pairs_out, pair_distances, nullptr, &pair_multiplicities); + } + #ifdef debug_validate_multipath_alignments - for (pair& multipath_aln_pair : multipath_aln_pairs_out) { + for (pair& multipath_aln_pair : multipath_aln_pairs_out) { #ifdef debug_multipath_mapper cerr << "validating multipath alignments:" << endl; - cerr << pb2json(multipath_aln_pair.first) << endl; - cerr << pb2json(multipath_aln_pair.second) << endl; + cerr << debug_string(multipath_aln_pair.first) << endl; + cerr << debug_string(multipath_aln_pair.second) << endl; #endif if (!validate_multipath_alignment(multipath_aln_pair.first, *xindex)) { cerr << "### WARNING ###" << endl; - cerr << "multipath alignment of read " << multipath_aln_pair.first.name() << " failed to validate" << endl; + cerr << "multipath alignment of read " << multipath_aln_pair.first.sequence() << " failed to validate" << endl; } if (!validate_multipath_alignment(multipath_aln_pair.second, *xindex)) { cerr << "### WARNING ###" << endl; - cerr << "multipath alignment of read " << multipath_aln_pair.second.name() << " failed to validate" << endl; + cerr << "multipath alignment of read " << multipath_aln_pair.second.sequence() << " failed to validate" << endl; } } #endif - if (mapping_quality_method == None) { - for (pair& multipath_aln_pair : multipath_aln_pairs_out) { - multipath_aln_pair.first.clear_mapping_quality(); - multipath_aln_pair.second.clear_mapping_quality(); - } - } - return found_consistent; } 
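Editor's note: the secondary-rescue hunk that follows caps the number of rescue attempts but keeps counting how many clusters would have been eligible, then up-weights each rescued pair by that ratio times the cluster's own multiplicity. A minimal sketch of that bookkeeping, using hypothetical helpers `try_rescue` and `cluster_multiplicity_of` in place of the mapper's internals:

```cpp
#include <cstddef>
#include <vector>

// Hypothetical stand-ins for the per-cluster multiplicity and a rescue attempt.
double cluster_multiplicity_of(std::size_t cluster_idx) { (void)cluster_idx; return 1.0; }
bool try_rescue(std::size_t cluster_idx) { (void)cluster_idx; return false; }

// Returns one multiplicity per successful rescue.  If only `budget` of the
// eligible clusters are actually attempted, each success is up-weighted by
// (eligible / attempted) to account for the rescues we chose not to run, and
// then by the cluster's own multiplicity (from MEM hit sub-sampling).
std::vector<double> rescue_multiplicities(const std::vector<std::size_t>& eligible_clusters,
                                          std::size_t budget) {
    std::size_t num_rescuable = 0, num_rescues = 0;
    std::vector<std::size_t> rescued;
    for (std::size_t idx : eligible_clusters) {
        ++num_rescuable;                       // still counted when over budget
        if (num_rescues >= budget) continue;   // budget exhausted, keep counting
        ++num_rescues;
        if (try_rescue(idx)) rescued.push_back(idx);
    }
    double attempt_ratio = num_rescues ? double(num_rescuable) / double(num_rescues) : 1.0;
    std::vector<double> multiplicities;
    multiplicities.reserve(rescued.size());
    for (std::size_t idx : rescued) {
        multiplicities.push_back(attempt_ratio * cluster_multiplicity_of(idx));
    }
    return multiplicities;
}
```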
@@ -1017,14 +1622,15 @@ namespace vg { vector& cluster_graphs1, vector& cluster_graphs2, vector>& duplicate_pairs, - vector>& multipath_aln_pairs_out, - vector, int64_t>>& cluster_pairs) { + vector>& multipath_aln_pairs_out, + vector, int64_t>>& cluster_pairs, + vector& pair_multiplicities, + const match_fanouts_t* fanouts1, const match_fanouts_t* fanouts2) { #ifdef debug_multipath_mapper cerr << "using rescue to find secondary mappings" << endl; #endif - unordered_set paired_clusters_1, paired_clusters_2; for (size_t i = 0; i < multipath_aln_pairs_out.size(); i++) { @@ -1039,24 +1645,28 @@ namespace vg { paired_clusters_2.insert(duplicate_pairs[i].second); } - int32_t cluster_score_1 = get_aligner()->match * get<2>(cluster_graphs1[cluster_pairs.front().first.first]); - int32_t cluster_score_2 = get_aligner()->match * get<2>(cluster_graphs2[cluster_pairs.front().first.second]); - int32_t max_score_diff = secondary_rescue_score_diff * get_aligner()->mapping_quality_score_diff(max_mapping_quality); + auto aligner = get_aligner(!alignment1.quality().empty() && !alignment2.quality().empty()); + int32_t cluster_score_1 = aligner->match * get<2>(cluster_graphs1[cluster_pairs.front().first.first]); + int32_t cluster_score_2 = aligner->match * get<2>(cluster_graphs2[cluster_pairs.front().first.second]); + int32_t max_score_diff = secondary_rescue_score_diff * aligner->mapping_quality_score_diff(max_mapping_quality); - vector> rescued_secondaries; + vector> rescued_secondaries; vector, int64_t>> rescued_distances; + vector rescued_multiplicities; auto align_and_rescue = [&](const Alignment& anchor_aln, const Alignment& rescue_aln, vector& cluster_graphs, unordered_set& paired_clusters, - int32_t max_score, bool anchor_is_read_1) { + int32_t max_score, bool anchor_is_read_1, const match_fanouts_t* anchor_fanouts) { #ifdef debug_multipath_mapper cerr << "checking for rescues from read " << (anchor_is_read_1 ? 
1 : 2) << endl; #endif - + // remember how many pairs are already in the vector + size_t num_preexisting_pairs = rescued_secondaries.size(); + size_t num_rescuable = 0; size_t num_rescues = 0; - for (size_t i = 0; i < cluster_graphs.size() && num_rescues < secondary_rescue_attempts; i++) { + for (size_t i = 0; i < cluster_graphs.size(); ++i) { if (paired_clusters.count(i)) { // we already have a consistent pair from this cluster #ifdef debug_multipath_mapper @@ -1067,10 +1677,10 @@ namespace vg { } #ifdef debug_multipath_mapper - cerr << "cluster " << i << "'s approximate score is " << get<2>(cluster_graphs[i]) * get_aligner()->match << ", looking for " << max_score - max_score_diff << endl; + cerr << "cluster " << i << "'s approximate score is " << get<2>(cluster_graphs[i]) * aligner->match << ", looking for " << max_score - max_score_diff << endl; #endif - if (get<2>(cluster_graphs[i]) * get_aligner()->match < max_score - max_score_diff) { + if (get<2>(cluster_graphs[i]) * aligner->match < max_score - max_score_diff) { #ifdef debug_multipath_mapper cerr << "the approximate score of the remaining is too low to consider" << endl; #endif @@ -1078,32 +1688,44 @@ namespace vg { break; } + // count this one as potentially rescuable from + ++num_rescuable; + + if (num_rescues >= secondary_rescue_attempts) { + // we have used up our budget of rescue attempts + continue; + } + + ++num_rescues; + // TODO: repetitive with align_to_cluster_graphs // make the alignment - vector cluster_multipath_alns; + vector cluster_multipath_alns; cluster_multipath_alns.emplace_back(); - multipath_align(anchor_aln, get<0>(cluster_graphs[i]), get<1>(cluster_graphs[i]), cluster_multipath_alns.back()); + multipath_align(anchor_aln, cluster_graphs[i], cluster_multipath_alns.back(), anchor_fanouts); - // split it up if it turns out to be multiple components - split_multicomponent_alignments(cluster_multipath_alns); + if (!suppress_multicomponent_splitting) { + // split it up if it turns out to be multiple components + split_multicomponent_alignments(cluster_multipath_alns); + } // order the subpaths - for (MultipathAlignment& multipath_aln : cluster_multipath_alns) { + for (multipath_alignment_t& multipath_aln : cluster_multipath_alns) { topologically_order_subpaths(multipath_aln); } // if we split it up, move the best one to the front if (cluster_multipath_alns.size() > 1) { - sort_and_compute_mapping_quality(cluster_multipath_alns, None); + sort_and_compute_mapping_quality(cluster_multipath_alns); } // rescue from the alignment - MultipathAlignment rescue_multipath_aln; + multipath_alignment_t rescue_multipath_aln; if (!likely_mismapping(cluster_multipath_alns.front())) { bool rescued = attempt_rescue(cluster_multipath_alns.front(), rescue_aln, anchor_is_read_1, rescue_multipath_aln); #ifdef debug_multipath_mapper - cerr << "rescued alignment is " << pb2json(rescue_multipath_aln) << endl; + cerr << "rescued alignment is " << debug_string(rescue_multipath_aln) << endl; #endif if (rescued) { #ifdef debug_multipath_mapper @@ -1112,31 +1734,51 @@ namespace vg { if (anchor_is_read_1) { int64_t dist = distance_between(cluster_multipath_alns.front(), rescue_multipath_aln, true); if (dist >= 0 && dist != numeric_limits::max()) { + simplify_complicated_multipath_alignment(cluster_multipath_alns.front()); rescued_secondaries.emplace_back(move(cluster_multipath_alns.front()), move(rescue_multipath_aln)); - rescued_distances.emplace_back(make_pair(i, cluster_graphs2.size()), dist); + rescued_distances.emplace_back(make_pair(i, 
RESCUED), dist); } } else { int64_t dist = distance_between(rescue_multipath_aln, cluster_multipath_alns.front(), true); if (dist >= 0 && dist != numeric_limits::max()) { + simplify_complicated_multipath_alignment(cluster_multipath_alns.front()); rescued_secondaries.emplace_back(move(rescue_multipath_aln), move(cluster_multipath_alns.front())); - rescued_distances.emplace_back(make_pair(cluster_graphs1.size(), i), dist); + rescued_distances.emplace_back(make_pair(RESCUED, i), dist); } } - } else { + } #ifdef debug_multipath_mapper + else { cerr << "rescue failed" << endl; -#endif } - } else { +#endif + } #ifdef debug_multipath_mapper - cerr << "rescued alignment is likely a mismapping" << endl; + else { + cerr << "alignment we're rescuing from is likely a mismapping" << endl; + } #endif + } + + // estimate how many of these alignments there probably are in total + double rescue_multiplicity = double(num_rescuable) / double(num_rescues); + + // fill out the multiplicity with estimated multiplicity based on rescue and cluster + for (size_t i = num_preexisting_pairs; i < rescued_secondaries.size(); ++i) { + const auto& rescued_cluster_pair = rescued_distances[i]; + double clust_multiplicity; + if (rescued_cluster_pair.first.first == RESCUED) { + // the read 1 mapping is from a rescue, get the cluster multiplicity for read 2 + clust_multiplicity = cluster_multiplicity(get<1>(cluster_graphs2[rescued_cluster_pair.first.second])); } - - num_rescues++; + else { + // the read 2 mapping is from a rescue, get the cluster multiplicity for read 1 + clust_multiplicity = cluster_multiplicity(get<1>(cluster_graphs1[rescued_cluster_pair.first.first])); + } + rescued_multiplicities.push_back(rescue_multiplicity * clust_multiplicity); } //#pragma omp atomic @@ -1150,23 +1792,25 @@ namespace vg { }; // perform routine for both read ends - align_and_rescue(alignment1, alignment2, cluster_graphs1, paired_clusters_1, cluster_score_1, true); - align_and_rescue(alignment2, alignment1, cluster_graphs2, paired_clusters_2, cluster_score_2, false); + align_and_rescue(alignment1, alignment2, cluster_graphs1, paired_clusters_1, + cluster_score_1, true, fanouts1); + align_and_rescue(alignment2, alignment1, cluster_graphs2, paired_clusters_2, + cluster_score_2, false, fanouts2); #ifdef debug_validate_multipath_alignments - for (pair& multipath_aln_pair : rescued_secondaries) { + for (pair& multipath_aln_pair : rescued_secondaries) { #ifdef debug_multipath_mapper cerr << "validating rescued secondary multipath alignments:" << endl; - cerr << pb2json(multipath_aln_pair.first) << endl; - cerr << pb2json(multipath_aln_pair.second) << endl; + cerr << debug_string(multipath_aln_pair.first) << endl; + cerr << debug_string(multipath_aln_pair.second) << endl; #endif if (!validate_multipath_alignment(multipath_aln_pair.first, *xindex)) { cerr << "### WARNING ###" << endl; - cerr << "multipath alignment of read " << multipath_aln_pair.first.name() << " failed to validate" << endl; + cerr << "multipath alignment of read " << multipath_aln_pair.first.sequence() << " failed to validate" << endl; } if (!validate_multipath_alignment(multipath_aln_pair.second, *xindex)) { cerr << "### WARNING ###" << endl; - cerr << "multipath alignment of read " << multipath_aln_pair.second.name() << " failed to validate" << endl; + cerr << "multipath alignment of read " << multipath_aln_pair.second.sequence() << " failed to validate" << endl; } } #endif @@ -1182,8 +1826,8 @@ namespace vg { vector duplicate(rescued_secondaries.size(), false); for (size_t 
i = 1; i < rescued_secondaries.size(); i++) { for (size_t j = 0; j < i; j++) { - if (abs(distance_between(rescued_secondaries[i].first, rescued_secondaries[j].first)) < 20) { - if (abs(distance_between(rescued_secondaries[i].second, rescued_secondaries[j].second)) < 20) { + if (share_terminal_positions(rescued_secondaries[i].first, rescued_secondaries[j].first)) { + if (share_terminal_positions(rescued_secondaries[i].second, rescued_secondaries[j].second)) { duplicate[i] = true; duplicate[j] = true; } @@ -1198,6 +1842,7 @@ namespace vg { std::swap(rescued_secondaries[i], rescued_secondaries[end - 1]); std::swap(rescued_distances[i], rescued_distances[end - 1]); + std::swap(rescued_multiplicities[i], rescued_multiplicities[end - 1]); std::swap(duplicate[i], duplicate[end - 1]); end--; @@ -1211,10 +1856,12 @@ namespace vg { if (end < rescued_secondaries.size()) { rescued_secondaries.resize(end); rescued_distances.resize(end); + rescued_multiplicities.resize(end); } // merge the rescued secondaries into the return vector - merge_rescued_mappings(multipath_aln_pairs_out, cluster_pairs, rescued_secondaries, rescued_distances); + merge_rescued_mappings(multipath_aln_pairs_out, cluster_pairs, pair_multiplicities, + rescued_secondaries, rescued_distances, rescued_multiplicities); } else { #ifdef debug_multipath_mapper cerr << "no rescues succeeded" << endl; @@ -1222,15 +1869,20 @@ namespace vg { } } - void MultipathMapper::multipath_map_paired(const Alignment& alignment1, const Alignment& alignment2, - vector>& multipath_aln_pairs_out, - vector>& ambiguous_pair_buffer, - size_t max_alt_mappings) { - + bool MultipathMapper::multipath_map_paired(const Alignment& alignment1, const Alignment& alignment2, + vector>& multipath_aln_pairs_out, + vector>& ambiguous_pair_buffer) { + + //cerr << (to_string(omp_get_thread_num()) + " " + alignment1.name() + "\n"); #ifdef debug_multipath_mapper cerr << "multipath mapping paired reads " << pb2json(alignment1) << " and " << pb2json(alignment2) << endl; #endif +#ifdef debug_time_phases + cerr << "starting time measurement for pair " << alignment1.name() << endl; + clock_t start = clock(); +#endif + // empty the output vector (just for safety) multipath_aln_pairs_out.clear(); @@ -1242,19 +1894,20 @@ namespace vg { cerr << "no fragment length distribution yet, looking for unambiguous single ended pairs" << endl; #endif - attempt_unpaired_multipath_map_of_pair(alignment1, alignment2, multipath_aln_pairs_out, ambiguous_pair_buffer); - - return; + return attempt_unpaired_multipath_map_of_pair(alignment1, alignment2, multipath_aln_pairs_out, ambiguous_pair_buffer); } // the fragment length distribution has been estimated, so we can do full-fledged paired mode - - // query MEMs using GCSA2 - double dummy1, dummy2; - vector mems1 = find_mems_deep(alignment1.sequence().begin(), alignment1.sequence().end(), dummy1, dummy2, - 0, min_mem_length, mem_reseed_length, false, true, true, false); - vector mems2 = find_mems_deep(alignment2.sequence().begin(), alignment2.sequence().end(), dummy1, dummy2, - 0, min_mem_length, mem_reseed_length, false, true, true, false); + vector>> mem_fanouts1, mem_fanouts2; + auto mems1 = find_mems(alignment1, &mem_fanouts1); + auto mems2 = find_mems(alignment2, &mem_fanouts2); + unique_ptr fanouts1(mem_fanouts1.empty() ? nullptr + : new match_fanouts_t(record_fanouts(mems1, mem_fanouts1))); + unique_ptr fanouts2(mem_fanouts2.empty() ? 
nullptr + : new match_fanouts_t(record_fanouts(mems2, mem_fanouts2))); +#ifdef debug_time_phases + cerr << "collected MEMs, time elapsed: " << double(clock() - start) / CLOCKS_PER_SEC << " secs" << endl; +#endif #ifdef debug_multipath_mapper cerr << "obtained read1 MEMs:" << endl; @@ -1267,352 +1920,196 @@ namespace vg { } #endif - // find the count of the most unique match among the MEMs to assess how repetitive the sequence is - size_t min_match_count_1 = numeric_limits::max(); - size_t min_match_count_2 = numeric_limits::max(); - for (const MaximalExactMatch& mem : mems1) { - min_match_count_1 = min(min_match_count_1, mem.match_count); - } - for (const MaximalExactMatch& mem : mems2) { - min_match_count_2 = min(min_match_count_2, mem.match_count); - } + MemoizingGraph memoizing_graph(xindex); + unique_ptr distance_measurer = get_distance_measurer(memoizing_graph); - // initialize cluster variables - vector clusters1, clusters2; - vector cluster_graphs1, cluster_graphs2; - vector, int64_t>> cluster_pairs; - vector> duplicate_pairs; +#ifdef debug_multipath_mapper + cerr << "clustering MEMs on both read ends..." << endl; +#endif + + // try to rescue high count runs of order-length MEMs for both reads before clustering + rescue_high_count_order_length_mems(mems1, order_length_repeat_hit_max); + rescue_high_count_order_length_mems(mems2, order_length_repeat_hit_max); - // intialize memos for the results of expensive succinct operations that we may need to do multiple times - OrientedDistanceClusterer::paths_of_node_memo_t paths_of_node_memo; - OrientedDistanceClusterer::oriented_occurences_memo_t oriented_occurences_memo; - OrientedDistanceClusterer::handle_memo_t handle_memo; + // do the clustering + vector clusters1 = get_clusters(alignment1, mems1, &(*distance_measurer), fanouts1.get()); + vector clusters2 = get_clusters(alignment2, mems2, &(*distance_measurer), fanouts2.get()); - // do we want to try to only cluster one read end and rescue the other? - bool do_repeat_rescue_from_1 = min_match_count_2 > rescue_only_min && min_match_count_1 <= rescue_only_anchor_max; - bool do_repeat_rescue_from_2 = min_match_count_1 > rescue_only_min && min_match_count_2 <= rescue_only_anchor_max; +#ifdef debug_time_phases + cerr << "clustered MEMs, time elapsed: " << double(clock() - start) / CLOCKS_PER_SEC << " secs" << endl; +#endif + + // extract graphs around the clusters and get the assignments of MEMs to these graphs + vector cluster_graphs1 = query_cluster_graphs(alignment1, mems1, clusters1); + vector cluster_graphs2 = query_cluster_graphs(alignment2, mems2, clusters2); - bool rescued_order_length_runs_1 = false, rescued_order_length_runs_2 = false; +#ifdef debug_time_phases + cerr << "extracted subgraphs, time elapsed: " << double(clock() - start) / CLOCKS_PER_SEC << " secs" << endl; +#endif #ifdef debug_multipath_mapper - cerr << "min hit count on read 1: " << min_match_count_1 << ", on read 2: " << min_match_count_2 << ", doing rescue from read 1? " << (do_repeat_rescue_from_1 ? "yes" : "no") << ", from read 2? " << (do_repeat_rescue_from_2 ? 
"yes" : "no") << endl; + cerr << "obtained independent clusters:" << endl; + cerr << "read 1" << endl; + for (int i = 0; i < cluster_graphs1.size(); i++) { + cerr << "\tcluster " << i << " (multiplicity " << get<1>(cluster_graphs1[i]).second << ")" << endl; + for (pair hit : get<1>(cluster_graphs1[i]).first) { + cerr << "\t\t" << hit.second << " " << hit.first->sequence() << endl; + } + } + cerr << "read 2" << endl; + for (int i = 0; i < cluster_graphs2.size(); i++) { + cerr << "\tcluster " << i << " (multiplicity " << get<1>(cluster_graphs2[i]).second << ")" << endl; + for (pair hit : get<1>(cluster_graphs2[i]).first) { + cerr << "\t\t" << hit.second << " " << hit.first->sequence() << endl; + } + } #endif - if (do_repeat_rescue_from_1 || do_repeat_rescue_from_2) { - - // one side appears to be repetitive and the other non-repetitive, so try to only align the non-repetitive side - // and get the other side from rescue - - // try to rescue high count runs of order-length MEMs for any read we're going to perform clustering on - if (order_length_repeat_hit_max && do_repeat_rescue_from_1 && !rescued_order_length_runs_1) { - rescue_high_count_order_length_mems(mems1, order_length_repeat_hit_max); - rescued_order_length_runs_1 = true; + // we haven't already obtained a paired mapping by rescuing into a repeat, so we should try to get one + // by cluster pairing + + vector, int64_t>> cluster_pairs = get_cluster_pairs(alignment1, alignment2, + cluster_graphs1, cluster_graphs2, + &(*distance_measurer)); +#ifdef debug_time_phases + cerr << "paired clusters, time elapsed: " << double(clock() - start) / CLOCKS_PER_SEC << " secs" << endl; +#endif + +#ifdef debug_multipath_mapper + cerr << "obtained cluster pairs:" << endl; + for (int i = 0; i < cluster_pairs.size(); i++) { + cerr << "\tpair " << i << " at distance " << cluster_pairs[i].second << endl; + cerr << "\t\t read 1 (cluster " << cluster_pairs[i].first.first << ")" << endl; + for (pair hit : get<1>(cluster_graphs1[cluster_pairs[i].first.first]).first) { + cerr << "\t\t\t" << hit.second << " " << hit.first->sequence() << endl; } - if (order_length_repeat_hit_max && do_repeat_rescue_from_2 && !rescued_order_length_runs_2) { - rescue_high_count_order_length_mems(mems2, order_length_repeat_hit_max); - rescued_order_length_runs_2 = true; + cerr << "\t\t read 2 (cluster " << cluster_pairs[i].first.second << ")" << endl; + for (pair hit : get<1>(cluster_graphs2[cluster_pairs[i].first.second]).first) { + cerr << "\t\t\t" << hit.second << " " << hit.first->sequence() << endl; } + } +#endif + + // initialize some pair variables + vector pair_multiplicities; + vector> duplicate_pairs; + + bool proper_paired = true; + // do we find any pairs that satisfy the distance requirements? + if (!cluster_pairs.empty()) { + // We got some pairs that satisfy the distance requirements. 
+ + // only perform the mappings that satisfy the expectations on distance - attempt_rescue_of_repeat_from_non_repeat(alignment1, alignment2, mems1, mems2, do_repeat_rescue_from_1, do_repeat_rescue_from_2, - clusters1, clusters2, cluster_graphs1, cluster_graphs2, multipath_aln_pairs_out, - cluster_pairs, max_alt_mappings, &paths_of_node_memo, &oriented_occurences_memo, &handle_memo); + align_to_cluster_graph_pairs(alignment1, alignment2, cluster_graphs1, cluster_graphs2, + multipath_aln_pairs_out, cluster_pairs, pair_multiplicities, + duplicate_pairs, fanouts1.get(), fanouts2.get()); + +#ifdef debug_time_phases + cerr << "made alignments, time elapsed: " << double(clock() - start) / CLOCKS_PER_SEC << " secs" << endl; +#endif - if (multipath_aln_pairs_out.empty() && do_repeat_rescue_from_1 && !do_repeat_rescue_from_2) { - // we've clustered and extracted read 1, but rescue failed, so do the same for read 2 to prepare for the - // normal pair clustering routine + // do we produce at least one good looking pair alignments from the clustered clusters? + if (multipath_aln_pairs_out.empty() + || likely_mismapping(multipath_aln_pairs_out.front().first) + || likely_mismapping(multipath_aln_pairs_out.front().second)) { #ifdef debug_multipath_mapper - cerr << "repeat rescue failed from read 1, extracting clusters for read 2 and transitioning to standard clustering approach" << endl; + cerr << "pair may be mismapped, attempting individual end mappings" << endl; #endif + // we're not happy with the pairs we got, try to get a good pair by rescuing from single ended alignments - // rescue high count runs of order-length MEMs now that we're going to cluster here - if (order_length_repeat_hit_max && !rescued_order_length_runs_2) { - rescue_high_count_order_length_mems(mems2, order_length_repeat_hit_max); - rescued_order_length_runs_2 = true; - } + vector> rescue_aln_pairs; + vector, int64_t>> rescue_distances; + vector rescue_multiplicities; + bool rescued = align_to_cluster_graphs_with_rescue(alignment1, alignment2, cluster_graphs1, cluster_graphs2, mems1, + mems2, rescue_aln_pairs, rescue_distances, rescue_multiplicities, + fanouts1.get(), fanouts2.get()); +#ifdef debug_time_phases + cerr << "attempted rescue, time elapsed: " << double(clock() - start) / CLOCKS_PER_SEC << " secs" << endl; +#endif - // do the clustering - if (adjust_alignments_for_base_quality) { - OrientedDistanceClusterer clusterer2(alignment2, mems2, *get_qual_adj_aligner(), xindex, max_expected_dist_approx_error, min_clustering_mem_length, - unstranded_clustering, &paths_of_node_memo, &oriented_occurences_memo, &handle_memo); - clusters2 = clusterer2.clusters(alignment2, max_mapping_quality, log_likelihood_approx_factor, min_median_mem_coverage_for_split); + if (rescued) { + // we found consistent pairs by rescue, merge the two lists + +#ifdef debug_multipath_mapper + cerr << "found some rescue pairs, merging into current list of consistent mappings" << endl; +#endif + + merge_rescued_mappings(multipath_aln_pairs_out, cluster_pairs, pair_multiplicities, + rescue_aln_pairs, rescue_distances, rescue_multiplicities); + } - else { - OrientedDistanceClusterer clusterer2(alignment2, mems2, *get_regular_aligner(), xindex, max_expected_dist_approx_error, min_clustering_mem_length, - unstranded_clustering, &paths_of_node_memo, &oriented_occurences_memo, &handle_memo); - clusters2 = clusterer2.clusters(alignment2, max_mapping_quality, log_likelihood_approx_factor, min_median_mem_coverage_for_split); + else if (multipath_aln_pairs_out.empty() 
|| + (!(!likely_mismapping(multipath_aln_pairs_out.front().first) && + !likely_misrescue(multipath_aln_pairs_out.front().second)) || + !(!likely_misrescue(multipath_aln_pairs_out.front().first) && + !likely_mismapping(multipath_aln_pairs_out.front().second)))) { + + // rescue didn't find any consistent mappings and we didn't have any pairings + // that we would have accepted from rescue beforehand. just take the single ended + // mappings that were computed for the sake of rescue + + proper_paired = false; + std::swap(multipath_aln_pairs_out, rescue_aln_pairs); + + // Don't sort and compute mapping quality; preserve the single-ended MAPQs } - - cluster_graphs2 = query_cluster_graphs(alignment2, mems2, clusters2); } - - if (multipath_aln_pairs_out.empty() && do_repeat_rescue_from_2 && !do_repeat_rescue_from_1) { - // we've clustered and extracted read 2, but rescue failed, so do the same for read 1 to prepare for the - // normal pair clustering routine - -#ifdef debug_multipath_mapper - cerr << "repeat rescue failed from read 2, extracting clusters for read 1 and transitioning to standard clustering approach" << endl; -#endif + else { - // rescue high count runs of order-length MEMs now that we're going to cluster here - if (order_length_repeat_hit_max && !rescued_order_length_runs_1) { - rescue_high_count_order_length_mems(mems1, order_length_repeat_hit_max); - rescued_order_length_runs_1 = true; - } + // We don't think any of our hits are likely to be mismapped - // do the clustering - if (adjust_alignments_for_base_quality) { - OrientedDistanceClusterer clusterer1(alignment1, mems1, *get_qual_adj_aligner(), xindex, max_expected_dist_approx_error, min_clustering_mem_length, - unstranded_clustering, &paths_of_node_memo, &oriented_occurences_memo, &handle_memo); - clusters1 = clusterer1.clusters(alignment1, max_mapping_quality, log_likelihood_approx_factor, min_median_mem_coverage_for_split); - } - else { - OrientedDistanceClusterer clusterer1(alignment1, mems1, *get_regular_aligner(), xindex, max_expected_dist_approx_error, min_clustering_mem_length, - unstranded_clustering, &paths_of_node_memo, &oriented_occurences_memo, &handle_memo); - clusters1 = clusterer1.clusters(alignment1, max_mapping_quality, log_likelihood_approx_factor, min_median_mem_coverage_for_split); + if (multipath_aln_pairs_out.front().first.mapping_quality() >= max_mapping_quality - secondary_rescue_subopt_diff && + multipath_aln_pairs_out.front().second.mapping_quality() >= max_mapping_quality - secondary_rescue_subopt_diff) { + + // we're very confident about this pair, but it might be because we over-pruned at the clustering stage + // or because of problems with the seeds. we use this routine to use rescue on other very good looking + // independent end clusters + + attempt_rescue_for_secondaries(alignment1, alignment2, cluster_graphs1, cluster_graphs2, + duplicate_pairs, multipath_aln_pairs_out, cluster_pairs, + pair_multiplicities, fanouts1.get(), fanouts2.get()); + +#ifdef debug_time_phases + cerr << "attempted secondary rescue, time elapsed: " << double(clock() - start) / CLOCKS_PER_SEC << " secs" << endl; +#endif } - - cluster_graphs1 = query_cluster_graphs(alignment1, mems1, clusters1); } } else { - // we have reasonably unique hits on both reads, so cluster them both and extract the graphs (i.e. 
don't try - // to rely on rescue for either end yet) + // We got no pairs that satisfy the distance requirements + + // revert to independent single ended mappings, but skip any rescues that we already tried #ifdef debug_multipath_mapper - cerr << "clustering MEMs on both read ends..." << endl; + cerr << "could not find a consistent pair, reverting to single ended mapping" << endl; #endif - // try to rescue high count runs of order-length MEMs for both reads before clustering - if (order_length_repeat_hit_max && !rescued_order_length_runs_1) { - rescue_high_count_order_length_mems(mems1, order_length_repeat_hit_max); - rescued_order_length_runs_1 = true; - } - if (order_length_repeat_hit_max && !rescued_order_length_runs_2) { - rescue_high_count_order_length_mems(mems2, order_length_repeat_hit_max); - rescued_order_length_runs_2 = true; - } + vector rescue_multiplicities; + proper_paired = align_to_cluster_graphs_with_rescue(alignment1, alignment2, cluster_graphs1, cluster_graphs2, mems1, + mems2, multipath_aln_pairs_out, cluster_pairs, rescue_multiplicities, + fanouts1.get(), fanouts2.get()); - // do the clustering - if (adjust_alignments_for_base_quality) { - OrientedDistanceClusterer clusterer1(alignment1, mems1, *get_qual_adj_aligner(), xindex, max_expected_dist_approx_error, min_clustering_mem_length, - unstranded_clustering, &paths_of_node_memo, &oriented_occurences_memo, &handle_memo); - clusters1 = clusterer1.clusters(alignment1, max_mapping_quality, log_likelihood_approx_factor, min_median_mem_coverage_for_split); - OrientedDistanceClusterer clusterer2(alignment2, mems2, *get_qual_adj_aligner(), xindex, max_expected_dist_approx_error, min_clustering_mem_length, - unstranded_clustering, &paths_of_node_memo, &oriented_occurences_memo, &handle_memo); - clusters2 = clusterer2.clusters(alignment2, max_mapping_quality, log_likelihood_approx_factor, min_median_mem_coverage_for_split); - } - else { - OrientedDistanceClusterer clusterer1(alignment1, mems1, *get_regular_aligner(), xindex, max_expected_dist_approx_error, min_clustering_mem_length, - unstranded_clustering, &paths_of_node_memo, &oriented_occurences_memo, &handle_memo); - clusters1 = clusterer1.clusters(alignment1, max_mapping_quality, log_likelihood_approx_factor, min_median_mem_coverage_for_split); - OrientedDistanceClusterer clusterer2(alignment2, mems2, *get_regular_aligner(), xindex, max_expected_dist_approx_error, min_clustering_mem_length, - unstranded_clustering, &paths_of_node_memo, &oriented_occurences_memo, &handle_memo); - clusters2 = clusterer2.clusters(alignment2, max_mapping_quality, log_likelihood_approx_factor, min_median_mem_coverage_for_split); - } +#ifdef debug_time_phases + cerr << "no pairs, did alignments with rescue, time elapsed: " << double(clock() - start) / CLOCKS_PER_SEC << " secs" << endl; +#endif - // extract graphs around the clusters and get the assignments of MEMs to these graphs - cluster_graphs1 = query_cluster_graphs(alignment1, mems1, clusters1); - cluster_graphs2 = query_cluster_graphs(alignment2, mems2, clusters2); - } - -#ifdef debug_multipath_mapper - cerr << "obtained independent clusters:" << endl; - cerr << "read 1" << endl; - for (int i = 0; i < cluster_graphs1.size(); i++) { - cerr << "\tcluster " << i << endl; - for (pair hit : get<1>(cluster_graphs1[i])) { - cerr << "\t\t" << hit.second << " " << hit.first->sequence() << endl; - } - } - cerr << "read 2" << endl; - for (int i = 0; i < cluster_graphs2.size(); i++) { - cerr << "\tcluster " << i << endl; - for (pair hit : 
get<1>(cluster_graphs2[i])) { - cerr << "\t\t" << hit.second << " " << hit.first->sequence() << endl; + if (proper_paired) { + // we'll want to remember the multiplicities + pair_multiplicities = move(rescue_multiplicities); } } -#endif if (multipath_aln_pairs_out.empty()) { - // we haven't already obtained a paired mapping by rescuing into a repeat, so we should try to get one - // by cluster pairing + // we tried all of our tricks and still didn't find a mapping - // make vectors of cluster pointers to shim into the cluster pairing function - vector cluster_mems_1(cluster_graphs1.size()), cluster_mems_2(cluster_graphs2.size()); - for (size_t i = 0; i < cluster_mems_1.size(); i++) { - cluster_mems_1[i] = &(get<1>(cluster_graphs1[i])); - } - for (size_t i = 0; i < cluster_mems_2.size(); i++) { - cluster_mems_2[i] = &(get<1>(cluster_graphs2[i])); - } - - // Chebyshev bound for 99% of all fragments regardless of distribution - // TODO: I don't love having this internal aspect of the stranded/unstranded clustering outside the clusterer... - int64_t max_separation, min_separation; - if (unstranded_clustering) { - max_separation = (int64_t) ceil(abs(fragment_length_distr.mean()) + 10.0 * fragment_length_distr.stdev()); - min_separation = -max_separation; - } - else { - max_separation = (int64_t) ceil(fragment_length_distr.mean() + 10.0 * fragment_length_distr.stdev()); - min_separation = (int64_t) fragment_length_distr.mean() - 10.0 * fragment_length_distr.stdev(); - } - - // Find the clusters that have a tie for the longest MEM, and create alternate anchor points for those clusters - vector> alt_anchors_1, alt_anchors_2; - for (size_t i = 0; i < cluster_mems_1.size(); i++) { - auto& mem_cluster = *cluster_mems_1[i]; - for (size_t j = 1; j < mem_cluster.size(); j++) { - if (mem_cluster[j].first->length() + alt_anchor_max_length_diff >= mem_cluster.front().first->length()) { - alt_anchors_1.emplace_back(i, j); - } - else { - break; - } - } - } - for (size_t i = 0; i < cluster_mems_2.size(); i++) { - auto& mem_cluster = *cluster_mems_2[i]; - for (size_t j = 1; j < mem_cluster.size(); j++) { - if (mem_cluster[j].first->length() + alt_anchor_max_length_diff >= mem_cluster.front().first->length()) { - alt_anchors_2.emplace_back(i, j); - } - else { - break; - } - } - } - - // Compute the pairs of cluster graphs and their approximate distances from each other - cluster_pairs = OrientedDistanceClusterer::pair_clusters(alignment1, alignment2, - cluster_mems_1, cluster_mems_2, - alt_anchors_1, alt_anchors_2, - xindex, - min_separation, max_separation, - unstranded_clustering, - &paths_of_node_memo, &oriented_occurences_memo, &handle_memo); - - -#ifdef debug_multipath_mapper - cerr << "obtained cluster pairs:" << endl; - for (int i = 0; i < cluster_pairs.size(); i++) { - cerr << "\tpair " << i << " at distance " << cluster_pairs[i].second << endl; - cerr << "\t\t read 1 (cluster " << cluster_pairs[i].first.first << ")" << endl; - for (pair hit : get<1>(cluster_graphs1[cluster_pairs[i].first.first])) { - cerr << "\t\t\t" << hit.second << " " << hit.first->sequence() << endl; - } - cerr << "\t\t read 2 (cluster " << cluster_pairs[i].first.second << ")" << endl; - for (pair hit : get<1>(cluster_graphs2[cluster_pairs[i].first.second])) { - cerr << "\t\t\t" << hit.second << " " << hit.first->sequence() << endl; - } - } -#endif - - // do we find any pairs that satisfy the distance requirements? 
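Editor's note: in the pre-refactor code above, "satisfy the distance requirements" means the estimated separation of the two clusters falls inside a ±10-standard-deviation window of the trained fragment length distribution (the "Chebyshev bound" mentioned in the original comment). A sketch of that acceptance window, assuming the window width shown in the removed code:

```cpp
#include <cmath>
#include <cstdint>

// Acceptance window for paired clusters, mirroring the ±10σ bound used by the
// cluster pairing code above.  mean/stdev come from the trained fragment
// length distribution.
struct SeparationWindow {
    int64_t min_sep, max_sep;
};

SeparationWindow separation_window(double mean, double stdev, bool unstranded) {
    SeparationWindow w;
    if (unstranded) {
        // orientation unknown: allow the separation in either direction
        w.max_sep = (int64_t) std::ceil(std::abs(mean) + 10.0 * stdev);
        w.min_sep = -w.max_sep;
    } else {
        w.max_sep = (int64_t) std::ceil(mean + 10.0 * stdev);
        w.min_sep = (int64_t) (mean - 10.0 * stdev);
    }
    return w;
}

// A cluster pair "satisfies the distance requirements" when the estimated
// separation of the two ends falls inside this window.
bool consistent_pair(int64_t estimated_separation, const SeparationWindow& w) {
    return estimated_separation >= w.min_sep && estimated_separation <= w.max_sep;
}
```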
- if (!cluster_pairs.empty()) { - // only perform the mappings that satisfy the expectations on distance - - align_to_cluster_graph_pairs(alignment1, alignment2, cluster_graphs1, cluster_graphs2, cluster_pairs, - multipath_aln_pairs_out, duplicate_pairs, - &paths_of_node_memo, &oriented_occurences_memo, &handle_memo); - - // do we produce at least one good looking pair alignments from the clustered clusters? - if (multipath_aln_pairs_out.empty() ? true : (likely_mismapping(multipath_aln_pairs_out.front().first) || - likely_mismapping(multipath_aln_pairs_out.front().second))) { - -#ifdef debug_multipath_mapper - cerr << "one end of the pair may be mismapped, attempting individual end mappings" << endl; -#endif - // we're not happy with the pairs we got, try to get a good pair by rescuing from single ended alignments - // but block rescue from any sides that we already tried rescue from in the repeat rescue routine - - vector> rescue_aln_pairs; - vector, int64_t>> rescue_distances; - bool rescued = align_to_cluster_graphs_with_rescue(alignment1, alignment2, cluster_graphs1, cluster_graphs2, - do_repeat_rescue_from_1, do_repeat_rescue_from_2, - rescue_aln_pairs, rescue_distances, max_alt_mappings); - - // if we find consistent pairs by rescue, merge the two lists - if (rescued) { -#ifdef debug_multipath_mapper - cerr << "found some rescue pairs, merging into current list of consistent mappings" << endl; -#endif - - merge_rescued_mappings(multipath_aln_pairs_out, cluster_pairs, rescue_aln_pairs, rescue_distances); - - // if we still haven't found mappings that are distinguishable from matches to random sequences, - // don't let them have any mapping quality - if (likely_mismapping(multipath_aln_pairs_out.front().first) || - likely_mismapping(multipath_aln_pairs_out.front().second)) { - multipath_aln_pairs_out.front().first.set_mapping_quality(0); - multipath_aln_pairs_out.front().second.set_mapping_quality(0); - } - else { - // also account for the possiblity that we selected the wrong ends to rescue with - cap_mapping_quality_by_rescue_probability(multipath_aln_pairs_out, cluster_pairs, - cluster_graphs1, cluster_graphs2, false); - - // and for the possibility that we missed the correct cluster because of hit sub-sampling - // within the MEMs of the cluster - cap_mapping_quality_by_hit_sampling_probability(multipath_aln_pairs_out, cluster_pairs, - cluster_graphs1, cluster_graphs2, false); - } - } - else { - // rescue didn't find any consistent mappings, revert to the single ended mappings - std::swap(multipath_aln_pairs_out, rescue_aln_pairs); - } - } - else { - - // does it look like we might be overconfident about this pair because of our clustering strategy - bool do_secondary_rescue = (multipath_aln_pairs_out.front().first.mapping_quality() >= max_mapping_quality - secondary_rescue_subopt_diff && - multipath_aln_pairs_out.front().second.mapping_quality() >= max_mapping_quality - secondary_rescue_subopt_diff); - - if (do_secondary_rescue) { - // we're very confident about this pair, but it might be because we over-pruned at the clustering stage - // so we use this routine to use rescue on other very good looking independent end clusters - attempt_rescue_for_secondaries(alignment1, alignment2, cluster_graphs1, cluster_graphs2, - duplicate_pairs, multipath_aln_pairs_out, cluster_pairs); - - // account for the possiblity that we selected the wrong ends to rescue with - cap_mapping_quality_by_rescue_probability(multipath_aln_pairs_out, cluster_pairs, - cluster_graphs1, cluster_graphs2, true); 
- } - - // account for the possibility that we missed the correct cluster because of hit sub-sampling - // within the MEMs of the cluster - cap_mapping_quality_by_hit_sampling_probability(multipath_aln_pairs_out, cluster_pairs, - cluster_graphs1, cluster_graphs2, do_secondary_rescue); - - } - } - else { - // revert to independent single ended mappings, but skip any rescues that we already tried - -#ifdef debug_multipath_mapper - cerr << "could not find a consistent pair, reverting to single ended mapping" << endl; -#endif - - bool rescued = align_to_cluster_graphs_with_rescue(alignment1, alignment2, cluster_graphs1, cluster_graphs2, do_repeat_rescue_from_1, - do_repeat_rescue_from_2, multipath_aln_pairs_out, cluster_pairs, max_alt_mappings); - - if (rescued) { - // account for the possiblity that we selected the wrong ends to rescue with - cap_mapping_quality_by_rescue_probability(multipath_aln_pairs_out, cluster_pairs, - cluster_graphs1, cluster_graphs2, false); - } - } - } - - if (multipath_aln_pairs_out.empty()) { - // we tried all of our tricks and still didn't find a mapping - - // add a null alignment so we know it wasn't mapped - multipath_aln_pairs_out.emplace_back(); - to_multipath_alignment(alignment1, multipath_aln_pairs_out.back().first); - to_multipath_alignment(alignment2, multipath_aln_pairs_out.back().second); + // add a null alignment so we know it wasn't mapped + multipath_aln_pairs_out.emplace_back(); + to_multipath_alignment(alignment1, multipath_aln_pairs_out.back().first); + to_multipath_alignment(alignment2, multipath_aln_pairs_out.back().second); + pair_multiplicities.emplace_back(); + cluster_pairs.emplace_back(); // in case we're realigning GAMs that have paths already multipath_aln_pairs_out.back().first.clear_subpath(); @@ -1621,69 +2118,77 @@ namespace vg { multipath_aln_pairs_out.back().second.clear_start(); } + // do paired spliced alignment only if we have real pairs + if (proper_paired && do_spliced_alignment) { + find_spliced_alignments(alignment1, alignment2, multipath_aln_pairs_out, cluster_pairs, pair_multiplicities, + mems1, mems2, cluster_graphs1, cluster_graphs2); +#ifdef debug_time_phases + cerr << "formed spliced alignments, time elapsed: " << double(clock() - start) / CLOCKS_PER_SEC << " secs" << endl; +#endif + } + + // only agglomerate if the pairs are true pairs, otherwise it gets too complicated + // to estimate mapping qualities + if (proper_paired && agglomerate_multipath_alns) { + agglomerate_alignment_pairs(multipath_aln_pairs_out, cluster_pairs, pair_multiplicities); + } + // if we computed extra alignments to get a mapping quality or investigate ambiguous clusters, remove them if (multipath_aln_pairs_out.size() > max_alt_mappings) { multipath_aln_pairs_out.resize(max_alt_mappings); } - if (simplify_topologies) { - for (pair& multipath_aln_pair : multipath_aln_pairs_out) { - merge_non_branching_subpaths(multipath_aln_pair.first); - merge_non_branching_subpaths(multipath_aln_pair.second); + // mark if any of the alignments are just noise + purge_unmapped_alignments(multipath_aln_pairs_out, proper_paired); + + for (size_t i = 0; i < multipath_aln_pairs_out.size(); ++i) { + multipath_aln_pairs_out[i].first.set_annotation("proper_pair", proper_paired); + multipath_aln_pairs_out[i].second.set_annotation("proper_pair", proper_paired); + if (i != 0) { + multipath_aln_pairs_out[i].first.set_annotation("secondary", true); + multipath_aln_pairs_out[i].second.set_annotation("secondary", true); } } // remove the full length bonus if we don't want it 
in the final score if (strip_bonuses) { - for (pair& multipath_aln_pair : multipath_aln_pairs_out) { + for (pair& multipath_aln_pair : multipath_aln_pairs_out) { strip_full_length_bonuses(multipath_aln_pair.first); strip_full_length_bonuses(multipath_aln_pair.second); } } // Compute the fragment length distribution. - // TODO: make this machine-readable instead of a copy-able string. - string distribution = "-I " + to_string(fragment_length_distr.mean()) + " -D " + to_string(fragment_length_distr.stdev()); + string distribution = "-I " + to_string(fragment_length_distr.mean()) + " -D " + to_string(fragment_length_distr.std_dev()); - for (pair& multipath_aln_pair : multipath_aln_pairs_out) { - // add pair names to connect the paired reads - multipath_aln_pair.first.set_paired_read_name(multipath_aln_pair.second.name()); - multipath_aln_pair.second.set_paired_read_name(multipath_aln_pair.first.name()); - + for (pair& multipath_aln_pair : multipath_aln_pairs_out) { // Annotate with paired end distribution - set_annotation(&multipath_aln_pair.first, "fragment_length_distribution", distribution); - set_annotation(&multipath_aln_pair.second, "fragment_length_distribution", distribution); - } - - // clean up the VG objects on the heap - for (auto cluster_graph : cluster_graphs1) { - delete get<0>(cluster_graph); - } - for (auto cluster_graph : cluster_graphs2) { - delete get<0>(cluster_graph); + multipath_aln_pair.first.set_annotation("fragment_length_distribution", distribution); + multipath_aln_pair.second.set_annotation("fragment_length_distribution", distribution); } #ifdef debug_pretty_print_alignments cerr << "final alignments being returned:" << endl; - for (const pair& multipath_aln_pair : multipath_aln_pairs_out) { + for (const pair& multipath_aln_pair : multipath_aln_pairs_out) { cerr << "read 1: " << endl; view_multipath_alignment(cerr, multipath_aln_pair.first, *xindex); cerr << "read 2: " << endl; view_multipath_alignment(cerr, multipath_aln_pair.second, *xindex); } #endif + return proper_paired; } - void MultipathMapper::reduce_to_single_path(const MultipathAlignment& multipath_aln, vector& alns_out, - size_t max_alt_mappings) const { + void MultipathMapper::reduce_to_single_path(const multipath_alignment_t& multipath_aln, vector& alns_out, + size_t max_number) const { #ifdef debug_multipath_mapper - cerr << "linearizing multipath alignment" << endl; -#endif + cerr << "linearizing multipath alignment to assess positional diversity" << endl; +#endif // Compute a few optimal alignments using disjoint sets of subpaths. // This hopefully gives us a feel for the positional diversity of the MultipathMapping. // But we still may have duplicates or overlaps in vg node space. 
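Editor's note: the duplicates and overlaps mentioned above are filtered by checking how much of a suboptimal linearization touches nodes already used by the linearizations kept so far (the debug output below reports this as "overlapping X/Y"). A rough sketch of such a filter, with an illustrative node-ID set and overlap threshold rather than the mapper's actual settings:

```cpp
#include <cstdint>
#include <unordered_set>
#include <vector>

// A candidate single-path linearization of a multipath alignment.
struct Linearization {
    std::vector<int64_t> node_ids;   // nodes visited by this single-path alignment
    int32_t score = 0;
};

// Keep a suboptimal linearization only if it visits mostly new nodes.
// Candidates are assumed to be sorted from best to worst score.
std::vector<Linearization> filter_overlapping(std::vector<Linearization> candidates,
                                              double max_overlap_fraction = 0.5) {
    std::unordered_set<int64_t> used;
    std::vector<Linearization> kept;
    for (auto& cand : candidates) {
        std::size_t overlapped = 0;
        for (int64_t n : cand.node_ids) {
            overlapped += used.count(n);     // count nodes already claimed
        }
        if (kept.empty() ||
            overlapped < max_overlap_fraction * cand.node_ids.size()) {
            used.insert(cand.node_ids.begin(), cand.node_ids.end());
            kept.push_back(std::move(cand));
        }
    }
    return kept;
}
```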
- auto alns = optimal_alignments_with_disjoint_subpaths(multipath_aln, max_alt_mappings + 1); + auto alns = optimal_alignments_with_disjoint_subpaths(multipath_aln, max_number + 1); if (alns.empty()) { // This happens only if the read is totally unmapped @@ -1729,7 +2234,7 @@ namespace vg { } #ifdef debug_multipath_mapper - cerr << "found suboptimal mapping overlapping " << overlapped << "/" << alns[i].path().mapping_size() << " with score " + cerr << "found suboptimal mapping overlapping " << overlapped << "/" << alns[i].path().mapping_size() << " with score " << alns[i].score() << endl; cerr << "\t" << pb2json(alns[i]) << endl; #endif @@ -1751,519 +2256,2824 @@ namespace vg { #ifdef debug_multipath_mapper cerr << "overall found optimal mapping with score " << alns_out[0].score() << " plus " << (alns_out.size() - 1) - << " of " << max_alt_mappings << " alternate linearizations"; + << " of " << max_number << " alternate linearizations"; if (alns_out.size() >= 2) { cerr << " with best score " << alns_out[1].score(); } cerr << endl; -#endif +#endif - if (mapping_quality_method != None) { - // Now compute the MAPQ for the best alignment - auto placement_mapq = compute_raw_mapping_quality_from_scores(scores, mapping_quality_method); - // And min it in with what;s there already. - alns_out[0].set_mapping_quality(min(alns_out[0].mapping_quality(), placement_mapq)); - for (size_t i = 1; i < alns_out.size(); i++) { - // And zero all the others - alns_out[i].set_mapping_quality(0); - } + // Now compute the MAPQ for the best alignment + auto placement_mapqs = compute_raw_mapping_qualities_from_scores(scores, + !multipath_aln.quality().empty()); + // And min it in with what;s there already. + alns_out[0].set_mapping_quality(min(alns_out[0].mapping_quality(), placement_mapqs.front())); + for (size_t i = 1; i < alns_out.size(); i++) { + // And zero all the others + alns_out[i].set_mapping_quality(0); } } - - void MultipathMapper::split_multicomponent_alignments(vector& multipath_alns_out, - vector* cluster_idxs) const { + + vector> MultipathMapper::covered_intervals(const Alignment& alignment, + const clustergraph_t& cluster) const { - size_t num_original_alns = multipath_alns_out.size(); - for (size_t i = 0; i < num_original_alns; i++) { - - vector> comps = connected_components(multipath_alns_out[i]); - - if (comps.size() > 1) { -#ifdef debug_multipath_mapper - cerr << "splitting multicomponent alignment " << pb2json(multipath_alns_out[i]) << endl; -#endif - // split this multipath alignment into its connected components - for (size_t j = 1; j < comps.size(); j++) { - multipath_alns_out.emplace_back(); - extract_sub_multipath_alignment(multipath_alns_out[i], comps[j], - multipath_alns_out.back()); - // also label the split alignment with its cluster of origin, if we're keeping track of that - if (cluster_idxs) { - cluster_idxs->emplace_back(cluster_idxs->at(i)); - } - } - // put the first component into the original location - MultipathAlignment last_component; - extract_sub_multipath_alignment(multipath_alns_out[i], comps[0], last_component); - multipath_alns_out[i] = last_component; + // convert MEM subsequences to integer intervals + vector> mem_intervals; + mem_intervals.reserve(get<1>(cluster).first.size()); + for (const auto& hit : get<1>(cluster).first) { + mem_intervals.emplace_back(hit.first->begin - alignment.sequence().begin(), + hit.first->end - alignment.sequence().begin()); + } + + // put them in order + sort(mem_intervals.begin(), mem_intervals.end()); + + // do a sweep line + vector> 
interval_union; + int64_t begin, end; + tie(begin, end) = mem_intervals.front(); + for (size_t i = 1; i < mem_intervals.size(); ++i) { + if (mem_intervals[i].first > end) { + interval_union.emplace_back(begin, end); + tie(begin, end) = mem_intervals[i]; + } + else { + end = max(end, mem_intervals[i].second); } } + interval_union.emplace_back(begin, end); + return interval_union; } - - void MultipathMapper::attempt_rescue_of_repeat_from_non_repeat(const Alignment& alignment1, const Alignment& alignment2, - const vector& mems1, const vector& mems2, - bool do_repeat_rescue_from_1, bool do_repeat_rescue_from_2, - vector& clusters1, vector& clusters2, - vector& cluster_graphs1, vector& cluster_graphs2, - vector>& multipath_aln_pairs_out, - vector, int64_t>>& pair_distances, size_t max_alt_mappings, - OrientedDistanceClusterer::paths_of_node_memo_t* paths_of_node_memo, - OrientedDistanceClusterer::oriented_occurences_memo_t* oriented_occurences_memo, - OrientedDistanceClusterer::handle_memo_t* handle_memo) { + + bool MultipathMapper::test_splice_candidates(const Alignment& alignment, bool searching_left, + multipath_alignment_t& anchor_mp_aln, double* anchor_multiplicity_out, + SpliceStrand& strand, int64_t num_candidates, + const function& get_candidate, + const function& get_multiplicity, + const function& consume_candidate) const { + + - bool rescue_succeeded_from_1 = false, rescue_succeeded_from_2 = false; + /* + * The region around a candidate's end, which could contain a splice junction + */ + struct PrejoinSide { + int64_t candidate_idx; + SpliceRegion* splice_region; + pos_t search_pos; + int64_t clip_length; + int32_t untrimmed_score; + }; - if (do_repeat_rescue_from_1) { + /* + * Two consistent pairs of identified splice sites with a joining alignment + */ + struct PutativeJoin { + PutativeJoin(const PathPositionHandleGraph& graph, + const SpliceStats& splice_stats, const Alignment& opt, + const GSSWAligner& aligner, + const PrejoinSide& left, const PrejoinSide& right, + const tuple& left_location, + const tuple& right_location, + size_t motif_idx) + : joined_graph(graph, left.splice_region->get_subgraph(), + get<0>(left_location), get<1>(left_location), + right.splice_region->get_subgraph(), + get<0>(right_location), get<1>(right_location)), + left_search_dist(get<2>(left_location)), + right_search_dist(get<2>(right_location)), + left_clip_length(left.clip_length), + right_clip_length(right.clip_length), + left_candidate_idx(left.candidate_idx), + right_candidate_idx(right.candidate_idx), + estimated_intron_length(-1), + intron_score(0), + motif_idx(motif_idx), + untrimmed_score(left.untrimmed_score + right.untrimmed_score) + { + // memoize the best score + max_score = pre_align_max_score(aligner, splice_stats, opt); + } + + JoinedSpliceGraph joined_graph; + int64_t left_search_dist; + int64_t right_search_dist; + int64_t left_clip_length; + int64_t right_clip_length; + size_t left_candidate_idx; + size_t right_candidate_idx; + int32_t max_score; + int32_t untrimmed_score; + size_t motif_idx; + // intron stats start uninitialized until measuring length + int32_t intron_score; + int64_t estimated_intron_length; + // these two filled out after doing alignment + Alignment connecting_aln; + size_t splice_idx; + + int32_t fixed_score_components(const SpliceStats& splice_stats, + const Alignment& opt) { + return splice_stats.motif_score(motif_idx) + untrimmed_score - opt.score(); + } + + void set_intron_length(int64_t estimated_intron_length, + const SpliceStats& splice_stats) { + + 
estimated_intron_length = estimated_intron_length; + intron_score = splice_stats.intron_length_score(estimated_intron_length); + // memoize the max score again + max_score += intron_score; + } + + int32_t pre_align_max_score(const GSSWAligner& aligner, + const SpliceStats& splice_stats, + const Alignment& opt) { + // compute a bound on the best possible score this join could get + int64_t min_dist = joined_graph.min_link_length(); + int64_t max_dist = joined_graph.max_link_length(); + int32_t min_gap_penalty = 0; + int64_t link_length = left_clip_length + right_clip_length - opt.sequence().size(); + if (link_length < min_dist) { + min_gap_penalty = aligner.score_gap(min_dist - link_length); + } + else if (link_length > max_dist) { + min_gap_penalty = aligner.score_gap(link_length - max_dist); + } + return (min_gap_penalty + + aligner.score_exact_match(opt, opt.sequence().size() - left_clip_length, + link_length) + + fixed_score_components(splice_stats, opt)); + } + int32_t post_align_net_score(const SpliceStats& splice_stats, + const Alignment& opt) { + return fixed_score_components(splice_stats, opt) + connecting_aln.score() + intron_score; + } + }; + + /* + * Socket for iteration that either enumerates motifs exhaustively or by selecting the + * the most promising pairs if there are too many + */ + class MotifPairIterable { + public: + MotifPairIterable(size_t max_num_pairs, + const PrejoinSide& left_side, + const PrejoinSide& right_side, + size_t motif_num, size_t seq_len) : + motif_num(motif_num), + left_side(left_side), + right_side(right_side), + seq_len(seq_len) + { + const auto& left_sites = left_side.splice_region->candidate_splice_sites(motif_num); + const auto& right_sites = right_side.splice_region->candidate_splice_sites(motif_num); + + if (left_sites.size() * right_sites.size() > max_num_pairs) { #ifdef debug_multipath_mapper - cerr << "attempting repeat rescue from read 1" << endl; + cerr << "number of pairs " << left_sites.size() * right_sites.size() << " for motif " << motif_num << " is above maximum " << max_num_pairs << ", sampling downward" << endl; #endif - - // get the clusters for the non repeat - if (adjust_alignments_for_base_quality) { - OrientedDistanceClusterer clusterer1(alignment1, mems1, *get_qual_adj_aligner(), xindex, max_expected_dist_approx_error, min_clustering_mem_length, - unstranded_clustering, paths_of_node_memo, oriented_occurences_memo, handle_memo); - clusters1 = clusterer1.clusters(alignment1, max_mapping_quality, log_likelihood_approx_factor, min_median_mem_coverage_for_split); - } - else { - OrientedDistanceClusterer clusterer1(alignment1, mems1, *get_regular_aligner(), xindex, max_expected_dist_approx_error, min_clustering_mem_length, - unstranded_clustering, paths_of_node_memo, oriented_occurences_memo, handle_memo); - clusters1 = clusterer1.clusters(alignment1, max_mapping_quality, log_likelihood_approx_factor, min_median_mem_coverage_for_split); + + // there are too many pairs to just iterate over all of them, we have to select + // the most promising subset + + vector left_idxs = range_vector(left_sites.size()); + vector right_idxs = range_vector(right_sites.size()); + + // sort the indexes by min distance in opposite ordering + stable_sort(left_idxs.begin(), left_idxs.end(), [&](size_t i, size_t j) { + return get<2>(left_sites[i]) < get<2>(left_sites[j]); + }); + stable_sort(right_idxs.begin(), right_idxs.end(), [&](size_t i, size_t j) { + return get<2>(right_sites[i]) > get<2>(right_sites[j]); + }); + + // the difference from the ideal 
distance to the approximate graph distance, takes indexes + // in the left_idxs and right_idxs vectors + int64_t target_len = 2 * seq_len - left_side.clip_length - right_side.clip_length; + auto distance_diff = [&](size_t l, size_t r) { + return abs(get<2>(left_sites[left_idxs[l]]) + get<2>(right_sites[right_idxs[r]]) - target_len); + }; + + // sweep to identify pairs that most nearly align + // records of (left idx, right idx, searching left?) + vector> nearest_idx; + for (size_t l = 0, r = 0; l < left_sites.size(); ++l) { + while (r + 1 < right_idxs.size() && distance_diff(l, r + 1) < distance_diff(l, r)) { + ++r; + } + nearest_idx.emplace_back(l, r, true); + if (r + 1 < right_idxs.size()) { + nearest_idx.emplace_back(l, r + 1, false); + } + } + + // opposite order to get smallest differences first + auto cmp = [&](const tuple& a, const tuple& b) { + return distance_diff(get<0>(a), get<1>(a)) > distance_diff(get<0>(b), get<1>(b)); + }; + + // preprocess to find the minimum diff quickly + make_heap(nearest_idx.begin(), nearest_idx.end(), cmp); + + while (idx_pairs.size() < max_num_pairs && !nearest_idx.empty()) { + auto front = nearest_idx.front(); + idx_pairs.emplace_back(left_idxs[get<0>(front)], right_idxs[get<1>(front)]); + pop_heap(nearest_idx.begin(), nearest_idx.end(), cmp); + if (get<2>(front) && get<1>(front) > 0) { + // continue to the left + --get<1>(nearest_idx.back()); + push_heap(nearest_idx.begin(), nearest_idx.end(), cmp); + } + else if (!get<2>(front) && get<1>(front) + 1 < right_idxs.size()) { + // continue to the right + ++get<1>(nearest_idx.back()); + push_heap(nearest_idx.begin(), nearest_idx.end(), cmp); + } + else { + // we can't keep looking in this direction + nearest_idx.pop_back(); + } + } + } } - // extract the graphs around the clusters - cluster_graphs1 = query_cluster_graphs(alignment1, mems1, clusters1); - - // attempt rescue from these graphs - vector> rescued_pairs; - vector, int64_t>> rescued_distances; - rescue_succeeded_from_1 = align_to_cluster_graphs_with_rescue(alignment1, alignment2, cluster_graphs1, cluster_graphs2, - false, true, rescued_pairs, pair_distances, max_alt_mappings); - - // move the rescued pairs to the output vectors - if (rescue_succeeded_from_1) { + /* + * simple iterator interface + */ + class iterator { + public: + iterator(const MotifPairIterable& iteratee, bool begin) : iteratee(iteratee) { + if (!begin) { + if (iteratee.idx_pairs.empty()) { + i = (iteratee.left_side.splice_region->candidate_splice_sites(iteratee.motif_num).size() + * iteratee.right_side.splice_region->candidate_splice_sites(iteratee.motif_num).size()); + } + else { + i = iteratee.idx_pairs.size(); + } + } + } -#ifdef debug_multipath_mapper - cerr << "repeat rescue succeeded" << endl; -#endif + iterator& operator++() { + ++i; + return *this; + } - for (auto& multipath_aln_pair : rescued_pairs) { - multipath_aln_pairs_out.emplace_back(move(multipath_aln_pair)); + bool operator==(const iterator& other) const { + // this isn't valid if they are looking at different motifs or have a different maximum, + // but that should never happen within the limited scope of this function + return (&iteratee == &other.iteratee && i == other.i); } - for (auto& pair_distance : rescued_distances) { - pair_distances.emplace_back(pair_distance); + + bool operator!=(const iterator& other) const { + return !(*this == other); } - } - } - - if (do_repeat_rescue_from_2) { - // TODO: duplicative code + + pair, tuple> operator*() const { + if (iteratee.idx_pairs.empty()) { + size_t size = 
iteratee.left_side.splice_region->candidate_splice_sites(iteratee.motif_num).size(); + size_t j = i % size; + size_t k = i / size; + return make_pair(iteratee.left_side.splice_region->candidate_splice_sites(iteratee.motif_num)[j], + iteratee.right_side.splice_region->candidate_splice_sites(iteratee.motif_num)[k]); + } + else { + return make_pair(iteratee.left_side.splice_region->candidate_splice_sites(iteratee.motif_num)[iteratee.idx_pairs[i].first], + iteratee.right_side.splice_region->candidate_splice_sites(iteratee.motif_num)[iteratee.idx_pairs[i].second]); + } + } + + private: + size_t i = 0; + const MotifPairIterable& iteratee; + }; -#ifdef debug_multipath_mapper - cerr << "attempting repeat rescue from read 2" << endl; -#endif - // get the clusters for the non repeat - if (adjust_alignments_for_base_quality) { - OrientedDistanceClusterer clusterer2(alignment2, mems2, *get_qual_adj_aligner(), xindex, max_expected_dist_approx_error, min_clustering_mem_length, - unstranded_clustering, paths_of_node_memo, oriented_occurences_memo, handle_memo); - clusters2 = clusterer2.clusters(alignment2, max_mapping_quality, log_likelihood_approx_factor, min_median_mem_coverage_for_split); + // iterable methods + iterator begin() const { + return iterator(*this, true); } - else { - OrientedDistanceClusterer clusterer2(alignment2, mems2, *get_regular_aligner(), xindex, max_expected_dist_approx_error, min_clustering_mem_length, - unstranded_clustering, paths_of_node_memo, oriented_occurences_memo, handle_memo); - clusters2 = clusterer2.clusters(alignment2, max_mapping_quality, log_likelihood_approx_factor, min_median_mem_coverage_for_split); + iterator end() const { + return iterator(*this, false); } - // extract the graphs around the clusters - cluster_graphs2 = query_cluster_graphs(alignment2, mems2, clusters2); + private: - // attempt rescue from these graphs - vector> rescued_pairs; - vector, int64_t>> rescued_distances; - rescue_succeeded_from_2 = align_to_cluster_graphs_with_rescue(alignment1, alignment2, cluster_graphs1, cluster_graphs2, - true, false, rescued_pairs, pair_distances, max_alt_mappings); + friend class iterator; - // move the rescued pairs to the output vectors - if (rescue_succeeded_from_2) { - + size_t motif_num; + size_t seq_len; + const PrejoinSide& left_side; + const PrejoinSide& right_side; + vector> idx_pairs; + }; + + if (num_candidates == 0) { #ifdef debug_multipath_mapper - cerr << "repeat rescue succeeded" << endl; + cerr << "no splice candidate to attempt join with" << endl; + return false; #endif - for (auto& multipath_aln_pair : rescued_pairs) { - multipath_aln_pairs_out.emplace_back(move(multipath_aln_pair)); + } + + // we'll memoize the relatively expensive reference distance computations, since often + // there are multiple splice motifs on the same node + unordered_map, int64_t> ref_length_memo; + auto get_reference_dist = [&](const pos_t& pos_1, const pos_t& pos_2) -> int64_t { + + tuple key(id(pos_1), is_rev(pos_1), id(pos_2), is_rev(pos_2)); + + auto it = ref_length_memo.find(key); + if (it != ref_length_memo.end()) { + // the reference distance of these nodes is already memoized + return it->second - offset(pos_1) + offset(pos_2); + } + else { + int64_t dist = numeric_limits::max(); + if (xindex->get_path_count() != 0) { + // estimate the distance using the reference path + dist = algorithms::ref_path_distance(xindex, pos_1, pos_2, ref_path_handles, + max_splice_ref_search_length); + } - for (auto& pair_distance : rescued_distances) { - 
pair_distances.emplace_back(pair_distance); + +// if (distance_index && dist == numeric_limits::max()) { +// // FIXME: this will still sometimes produce finite distances for reads that +// // can't reach each other along a surjection path in cyclic graphs +// // FIXME: it can also find finite distances to different strands of a path, +// // but this might be okay sometimes? +// +// // they're probably still reachable if they got this far, get a worse estimate of the +// // distance from the distance index +// int64_t min_dist = distance_index->min_distance(pos_1, pos_2); +// if (min_dist >= 0) { +// dist = min_dist; +// } +// } + + if (dist != numeric_limits::max()) { + // not memoizing unreachable distances, since distance index should + // filter out most of those anyway, and they actually might change on + // different positions on the node + ref_length_memo[key] = dist + offset(pos_1) - offset(pos_2); } + return dist; } - } + }; - // re-sort the rescued alignments if we actually did it from both sides - if (rescue_succeeded_from_1 && rescue_succeeded_from_2) { - sort_and_compute_mapping_quality(multipath_aln_pairs_out, pair_distances); - } - // consider whether we should cap the mapping quality based on the chance that we rescued from the wrong clusters - if (rescue_succeeded_from_1 || rescue_succeeded_from_2) { - cap_mapping_quality_by_rescue_probability(multipath_aln_pairs_out, pair_distances, cluster_graphs1, cluster_graphs2, false); +#ifdef debug_multipath_mapper + cerr << "testing splice candidates for mp aln: " << endl; + cerr << debug_string(anchor_mp_aln) << endl; +#endif + + vector> splice_regions; + vector left_prejoin_sides, right_prejoin_sides; + + vector& anchor_prejoin_sides = searching_left ? right_prejoin_sides : left_prejoin_sides; + vector& candidate_prejoin_sides = searching_left ? 
left_prejoin_sides : right_prejoin_sides; + + // examine the region along the possible splice region of the anchor + Alignment opt; + optimal_alignment(anchor_mp_aln, opt); + if (opt.path().mapping_size() == 0) { + return false; } - } - - void MultipathMapper::merge_rescued_mappings(vector>& multipath_aln_pairs_out, - vector, int64_t>>& cluster_pairs, - vector>& rescued_multipath_aln_pairs, - vector, int64_t>>& rescued_cluster_pairs) const { + auto anchor_pos = trimmed_end(opt, max_splice_overhang, !searching_left, *xindex, + *get_aligner(!opt.quality().empty())); + + splice_regions.emplace_back(new SpliceRegion(get<0>(anchor_pos), searching_left, 2 * max_splice_overhang, + *xindex, dinuc_machine, splice_stats)); + + anchor_prejoin_sides.emplace_back(); + anchor_prejoin_sides.front().candidate_idx = -1; + anchor_prejoin_sides.front().splice_region = splice_regions.front().get(); + anchor_prejoin_sides.front().search_pos = get<0>(anchor_pos); + anchor_prejoin_sides.front().clip_length = get<1>(anchor_pos); + anchor_prejoin_sides.front().untrimmed_score = opt.score() - get<2>(anchor_pos); + +#ifdef debug_multipath_mapper + cerr << "anchor stats:" << endl; + cerr << "\tsearch pos " << anchor_prejoin_sides.front().search_pos << endl; + cerr << "\tclip length " << anchor_prejoin_sides.front().clip_length << endl; + cerr << "\topt score " << opt.score() << endl; + cerr << "\tuntrimmed score " << anchor_prejoin_sides.front().untrimmed_score << endl; +#endif - size_t num_unrescued_pairs = multipath_aln_pairs_out.size(); - for (size_t j = 0; j < rescued_multipath_aln_pairs.size(); j++) { + // examine the possible splice regions for the candidates + bool found_splice_aln = false; + for (int64_t i = 0; i < num_candidates; ++i) { - // make sure this pair isn't a duplicate with any of the original pairs - bool duplicate = false; - for (size_t i = 0; i < num_unrescued_pairs; i++) { + auto& candidate = get_candidate(i); + + if (candidate.subpath().empty()) { #ifdef debug_multipath_mapper - cerr << "checking if rescue pair " << j << " is duplicate of original pair " << i << endl; + cerr << "skipping empty candidate " << i << endl; #endif - if (abs(distance_between(multipath_aln_pairs_out[i].first, rescued_multipath_aln_pairs[j].first)) < 20) { - if (abs(distance_between(multipath_aln_pairs_out[i].second, rescued_multipath_aln_pairs[j].second)) < 20) { + continue; + } + #ifdef debug_multipath_mapper - cerr << "found a duplicate" << endl; + cerr << "extracting splice region for candidate " << i << ":" << endl; + cerr << debug_string(candidate) << endl; #endif - duplicate = true; - break; - } - } - } - if (!duplicate) { + Alignment candidate_opt; + optimal_alignment(candidate, candidate_opt); + if (candidate_opt.path().mapping_size() == 0) { #ifdef debug_multipath_mapper - cerr << "no duplicate, adding to return vector if distance is finite and positive" << endl; + cerr << "skipping negative scoring candidate " << i << endl; #endif - cluster_pairs.emplace_back(rescued_cluster_pairs[j]); - multipath_aln_pairs_out.emplace_back(move(rescued_multipath_aln_pairs[j])); + continue; } + + auto candidate_pos = trimmed_end(candidate_opt, max_splice_overhang, searching_left, *xindex, + *get_aligner(!opt.quality().empty())); + + splice_regions.emplace_back(new SpliceRegion(get<0>(candidate_pos), !searching_left, 2 * max_splice_overhang, + *xindex, dinuc_machine, splice_stats)); + + candidate_prejoin_sides.emplace_back(); + auto& candidate_side = candidate_prejoin_sides.back(); + candidate_side.candidate_idx = i; + 
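Editor's note: the get_reference_dist lambda defined a little above caches the relatively expensive reference-path distance once per ordered node/strand pair and re-derives position-level distances from the offsets. Below is a minimal standalone sketch of that offset-normalized memoization; Pos and compute_node_distance are hypothetical stand-ins for vg's pos_t and algorithms::ref_path_distance, not the project's actual API.

#include <cstdint>
#include <functional>
#include <limits>
#include <map>
#include <tuple>
#include <utility>

struct Pos { int64_t node_id; bool is_rev; int64_t offset; };

class RefDistanceMemo {
public:
    // compute_node_distance stands in for the expensive reference path distance query
    explicit RefDistanceMemo(std::function<int64_t(const Pos&, const Pos&)> compute_node_distance)
        : compute(std::move(compute_node_distance)) {}

    int64_t distance(const Pos& a, const Pos& b) {
        std::tuple<int64_t, bool, int64_t, bool> key(a.node_id, a.is_rev, b.node_id, b.is_rev);
        auto it = memo.find(key);
        if (it != memo.end()) {
            // the stored value is normalized to the node starts, so put the offsets back in
            return it->second - a.offset + b.offset;
        }
        int64_t dist = compute(a, b);
        if (dist != std::numeric_limits<int64_t>::max()) {
            // only cache reachable pairs; unreachability can depend on the exact offsets
            memo[key] = dist + a.offset - b.offset;
        }
        return dist;
    }

private:
    std::function<int64_t(const Pos&, const Pos&)> compute;
    std::map<std::tuple<int64_t, bool, int64_t, bool>, int64_t> memo;
};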
candidate_side.splice_region = splice_regions.back().get(); + candidate_side.search_pos = get<0>(candidate_pos); + candidate_side.clip_length = get<1>(candidate_pos); + candidate_side.untrimmed_score = candidate_opt.score() - get<2>(candidate_pos); + +#ifdef debug_multipath_mapper + cerr << "candidate stats:" << endl; + cerr << "\tsearch pos " << candidate_side.search_pos << endl; + cerr << "\tclip length " << candidate_side.clip_length << endl; + cerr << "\topt score " << candidate_opt.score() << endl; + cerr << "\tuntrimmed score " << candidate_side.untrimmed_score << endl; +#endif } - sort_and_compute_mapping_quality(multipath_aln_pairs_out, cluster_pairs); - } - - void MultipathMapper::cap_mapping_quality_by_rescue_probability(vector>& multipath_aln_pairs_out, - vector, int64_t>>& cluster_pairs, - vector& cluster_graphs1, - vector& cluster_graphs2, - bool from_secondary_rescue) const { - + // identify the possible joins down to a base level, including the intron length + vector putative_joins; + for (auto& left_prejoin_side : left_prejoin_sides) { + + auto& left_region = *left_prejoin_side.splice_region; + auto left_seed = left_region.get_seed_pos(); + auto left_seed_under = left_region.get_subgraph().get_underlying_handle(left_seed.first); + pos_t left_seed_pos(xindex->get_id(left_seed_under), xindex->get_is_reverse(left_seed_under), left_seed.second); + + for (auto& right_prejoin_side : right_prejoin_sides) { + #ifdef debug_multipath_mapper - cerr << "checking whether we should enter rescue mapping quality capping routine" << endl; + cerr << "resolving joins for left candidate " << left_prejoin_side.candidate_idx << ", right candidate " << right_prejoin_side.candidate_idx << endl; #endif - - // did we use an out-of-bounds cluster index to flag either end as coming from a rescue? - bool opt_aln_1_is_rescued = cluster_pairs.front().first.first >= cluster_graphs1.size(); - bool opt_aln_2_is_rescued = cluster_pairs.front().first.second >= cluster_graphs2.size(); + auto& right_region = *right_prejoin_side.splice_region; + auto right_seed = right_region.get_seed_pos(); + auto right_seed_under = right_region.get_subgraph().get_underlying_handle(right_seed.first); + pos_t right_seed_pos(xindex->get_id(right_seed_under), xindex->get_is_reverse(right_seed_under), right_seed.second); - // was the optimal cluster pair obtained by rescue? - if (opt_aln_1_is_rescued || opt_aln_2_is_rescued) { - // let's figure out if we should reduce its mapping quality to reflect the fact that we may not have selected the - // correct cluster as a rescue candidate - + if (distance_index) { + // check if these regions can reach each other + size_t test_dist = minimum_distance(*distance_index, left_seed_pos, right_seed_pos, false, xindex); #ifdef debug_multipath_mapper - cerr << "the optimal alignment is a rescue, checking if we need to cap the mapping quality" << endl; + cerr << "got distance index test distance " << test_dist << " between seed positions " << left_seed_pos << " and " << right_seed_pos << endl; #endif - - vector& anchor_clusters = opt_aln_1_is_rescued ? cluster_graphs2 : cluster_graphs1; - size_t anchor_idx = opt_aln_1_is_rescued ? 
cluster_pairs.front().first.second : cluster_pairs.front().first.first; - - // find the range of clusters that could plausibly be about as good as the one that rescue succeeded from - size_t plausible_clusters_end_idx = anchor_idx; - for (; plausible_clusters_end_idx < anchor_clusters.size(); plausible_clusters_end_idx++) { - if (get<2>(anchor_clusters[plausible_clusters_end_idx]) < get<2>(anchor_clusters[anchor_idx]) - plausible_rescue_cluster_coverage_diff) { - break; + if (test_dist == numeric_limits::max() || test_dist > max_intron_length) { +#ifdef debug_multipath_mapper + cerr << "test distance shows that this pair of candidates cannot reach each other" << endl; +#endif + continue; + } } - } - - // figure out which index corresponds to the end of the range we would have rescued - size_t max_rescues_attempted_idx; - if (from_secondary_rescue) { - // find the indexes that were added by pair clustering - unordered_set paired_idxs; - for (auto& cluster_pair : cluster_pairs) { - paired_idxs.insert(opt_aln_1_is_rescued ? cluster_pair.first.second : cluster_pair.first.first); + + // figure out how many pairs there are of each motif and across all motifs + size_t total_num_pairs = 0; + vector num_candidate_pairs(splice_stats.motif_size(), 0); + for (size_t j = 0; j < splice_stats.motif_size(); ++j) { + if (strand != Undetermined && splice_stats.motif_is_reverse(j) != (strand == Reverse)) { + // we can only find splicing at junctions that have a consistent strand + continue; + } + + num_candidate_pairs[j] = (left_region.candidate_splice_sites(j).size() + * right_region.candidate_splice_sites(j).size()); + total_num_pairs += num_candidate_pairs[j]; + +#ifdef debug_multipath_mapper + cerr << "motif " << j << " will have num candidate pairs: " << left_region.candidate_splice_sites(j).size() << " * " << right_region.candidate_splice_sites(j).size() << " = " << num_candidate_pairs[j] << endl; +#endif } - // the "budget" of rescues we could have performed - size_t rescues_left = secondary_rescue_attempts; - size_t i = 0; - for (; i < anchor_clusters.size(); i++) { - // did we skip thi index because it was already in a pair? 
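Editor's note: MotifPairIterable (defined earlier and constructed below once the per-motif budget is known) limits how many splice-site pairs are examined by emitting index pairs in order of how closely the two sides' summed distances match the target read length. The following is a simplified, self-contained sketch of that best-pairs-first enumeration; it assumes both distance vectors are sorted ascending and seeds each row with a binary search rather than the linear sweep used in the real code.

#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <queue>
#include <tuple>
#include <utility>
#include <vector>

// emit up to max_pairs index pairs (i, j), ordered by |left[i] + right[j] - target|
std::vector<std::pair<size_t, size_t>>
closest_index_pairs(const std::vector<int64_t>& left, const std::vector<int64_t>& right,
                    int64_t target, size_t max_pairs) {
    std::vector<std::pair<size_t, size_t>> out;
    if (left.empty() || right.empty()) {
        return out;
    }
    auto diff = [&](size_t i, size_t j) { return std::llabs(left[i] + right[j] - target); };
    // (difference, left index, right index, walking toward smaller j?)
    using entry_t = std::tuple<int64_t, size_t, size_t, bool>;
    std::priority_queue<entry_t, std::vector<entry_t>, std::greater<entry_t>> heap;
    for (size_t i = 0; i < left.size(); ++i) {
        // the best j for this i sits at the crossover of right[j] and target - left[i];
        // seed one rightward walker and one leftward walker so every j is reachable exactly once
        size_t j0 = std::lower_bound(right.begin(), right.end(), target - left[i]) - right.begin();
        if (j0 < right.size()) {
            heap.emplace(diff(i, j0), i, j0, false);
        }
        if (j0 > 0) {
            heap.emplace(diff(i, j0 - 1), i, j0 - 1, true);
        }
    }
    while (out.size() < max_pairs && !heap.empty()) {
        entry_t best = heap.top();
        heap.pop();
        size_t i = std::get<1>(best), j = std::get<2>(best);
        bool leftward = std::get<3>(best);
        out.emplace_back(i, j);
        // continue this walker in its direction; within a direction the difference is non-decreasing
        if (leftward && j > 0) {
            heap.emplace(diff(i, j - 1), i, j - 1, true);
        }
        else if (!leftward && j + 1 < right.size()) {
            heap.emplace(diff(i, j + 1), i, j + 1, false);
        }
    }
    return out;
}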
- if (!paired_idxs.count(i)) { - if (rescues_left) { - // we would have tried a secondary rescue here - rescues_left--; + // apportion the effort we'll spend across motifs + vector motif_max_num_pairs; + if (total_num_pairs < max_motif_pairs) { + // we can afford to do all of the candidates + motif_max_num_pairs = move(num_candidate_pairs); + } + else { + // we have to budget out the number of candidates, so we have this procedure + // to apportion them among motifs based the motif frequency + // note: each round has to either clear out the entire budget or clear out all + // instances of at least one motif pair, which guarantees eventual termination + // in `splice_stats.motif_size()` rounds + // (the rounding up can actually cause us to break the maximum, but it + // shouldn't be by very much, and rounding up is necessary to guarantee the + // clearing property) + + motif_max_num_pairs.resize(splice_stats.motif_size(), 0); + + int64_t budget_remaining = max_motif_pairs; + while (budget_remaining > 0) { +#ifdef debug_multipath_mapper + cerr << "new round of apportioning motif pair budget:" << endl; +#endif + // compute the normalizing factor for the frequency among the motifs that + // still have motifs to assign + double total_freq = 0.0; + for (size_t j = 0; j < splice_stats.motif_size(); ++j) { + if (motif_max_num_pairs[j] < num_candidate_pairs[j]) { + total_freq += splice_stats.motif_frequency(j); + } } - else { - // we would have run out of secondary rescues here - break; + int64_t next_budget_remaining = budget_remaining; + for (size_t j = 0; j < splice_stats.motif_size(); ++j) { + int64_t motif_remaining = num_candidate_pairs[j] - motif_max_num_pairs[j]; + if (motif_remaining > 0) { + // figure out how many pairs out of the budget to give to this motif on this + // round + double round_fraction = splice_stats.motif_frequency(j) / total_freq; + int64_t round_pairs = min(ceil(round_fraction * budget_remaining), + motif_remaining); + // give it out from the budget + next_budget_remaining -= round_pairs; + motif_max_num_pairs[j] += round_pairs; +#ifdef debug_multipath_mapper + cerr << "\tadd " << round_pairs << " to motif " << j << ": " << motif_max_num_pairs[j] << endl; +#endif + } + + } + budget_remaining = next_budget_remaining; + } +#ifdef debug_multipath_mapper + cerr << "final apportionment:" << endl; + for (size_t j = 0; j < splice_stats.motif_size(); ++j) { + + size_t left = left_region.candidate_splice_sites(j).size(); + size_t right = right_region.candidate_splice_sites(j).size(); + cerr << "motif " << j << ": " << motif_max_num_pairs[j] << " pairs out of " << left * right << endl; + } +#endif + } + + + for (size_t j = 0; j < splice_stats.motif_size(); ++j) { + if (motif_max_num_pairs[j] == 0) { + // skip this one + continue; + } + + // let the iterable decide which motif pairs we should look at + MotifPairIterable motif_pairs(motif_max_num_pairs[j], left_prejoin_side, + right_prejoin_side, j, alignment.sequence().size()); + + for (auto it = motif_pairs.begin(), end = motif_pairs.end(); it != end; ++it) { + + tuple left_location, right_location; + tie(left_location, right_location) = *it; + + auto l_under = left_region.get_subgraph().get_underlying_handle(get<0>(left_location)); + auto r_under = right_region.get_subgraph().get_underlying_handle(get<0>(right_location)); + + pos_t l_pos(xindex->get_id(l_under), xindex->get_is_reverse(l_under), get<1>(left_location)); + pos_t r_pos(xindex->get_id(r_under), xindex->get_is_reverse(r_under), get<1>(right_location)); + +#ifdef 
debug_multipath_mapper
+                        cerr << "\tchecking shared motif " << j << " which has positions " << l_pos << ", and " << r_pos << endl;
+#endif
+
+                        putative_joins.emplace_back(*xindex, splice_stats, opt,
+                                                    *get_aligner(!alignment.quality().empty()),
+                                                    left_prejoin_side, right_prejoin_side,
+                                                    left_location, right_location, j);
+
+                        if (putative_joins.back().max_score < no_splice_log_odds) {
+#ifdef debug_multipath_mapper
+                            cerr << "\tscore bound of " << putative_joins.back().max_score << " ensures insignificant spliced alignment against prior log odds " << no_splice_log_odds << " before measuring intron length" << endl;
+#endif
+
+                            // this has no chance of becoming significant, let's skip it
+                            putative_joins.pop_back();
+                            continue;
+                        }
+
+                        // measure the intron length
+                        int64_t dist = get_reference_dist(l_pos, r_pos);
+                        if (dist <= 0 || dist > max_intron_length || dist == numeric_limits<int64_t>::max()) {
+#ifdef debug_multipath_mapper
+                            cerr << "\tinconsistent intron length " << dist << ", skipping putative join" << endl;
+#endif
+                            putative_joins.pop_back();
+                            continue;
+                        }
+
+                        putative_joins.back().set_intron_length(dist, splice_stats);
+
+                        // TODO: enforce pairing constraints?
+
+#ifdef debug_multipath_mapper
+                        cerr << "\tshared motif has a spliceable path of length " << dist << " (intron score: " << putative_joins.back().intron_score << "), adding as a putative join with score bound " << putative_joins.back().max_score << endl;
+#endif
+                        if (putative_joins.back().max_score < no_splice_log_odds) {
+#ifdef debug_multipath_mapper
+                            cerr << "\tscore bound of " << putative_joins.back().max_score << " ensures insignificant spliced alignment against prior log odds " << no_splice_log_odds << " before doing alignment" << endl;
+#endif
+
+                            // this has no chance of becoming significant, let's skip it
+                            putative_joins.pop_back();
                        }
                    }
                }
-                // this is the first index we didn't rescue
-                max_rescues_attempted_idx = i;
            }
-            else {
-                // simpler without secondary rescue, we would have just rescued up to the maximum allowable
-                max_rescues_attempted_idx = max_rescue_attempts;
+        }
+
+#ifdef debug_multipath_mapper
+        cerr << "realigning across " << putative_joins.size() << " putative join regions" << endl;
+#endif
+
+        // TODO: allow multiple splices in a multipath alignment
+        int32_t best_net_score = -1;
+        int64_t best_intron_length = numeric_limits<int64_t>::max();
+        unique_ptr<PutativeJoin> best_join;
+
+        auto score_bound_comp = [](const PutativeJoin& join_1, const PutativeJoin& join_2) {
+            return join_1.max_score < join_2.max_score;
+        };
+
+        // order the joins by the highest upper bound on net score
+        make_heap(putative_joins.begin(), putative_joins.end(), score_bound_comp);
+
+        while (!putative_joins.empty() && putative_joins.front().max_score >= best_net_score) {
+
+            auto& join = putative_joins.front();
+
+            size_t connect_begin = alignment.sequence().size() - join.left_clip_length;
+            size_t connect_len = join.left_clip_length + join.right_clip_length - alignment.sequence().size();
+
+            join.connecting_aln.set_sequence(alignment.sequence().substr(connect_begin, connect_len));
+            if (!alignment.quality().empty()) {
+                join.connecting_aln.set_quality(alignment.quality().substr(connect_begin, connect_len));
            }
+            // TODO: multi alignment?
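Editor's note: the budgeting loop above splits max_motif_pairs across motifs in proportion to motif frequency, rounding up so that every round either spends the remaining budget or saturates at least one motif. A compact sketch of the same apportionment, with generic inputs (available pair counts, frequencies, budget) standing in for splice_stats:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// assign up to 'budget' pairs across motifs, proportionally to motif frequency,
// never exceeding the number of pairs actually available for each motif
std::vector<int64_t> apportion_pair_budget(const std::vector<int64_t>& available,
                                           const std::vector<double>& frequency,
                                           int64_t budget) {
    std::vector<int64_t> assigned(available.size(), 0);
    while (budget > 0) {
        // normalize frequencies over the motifs that still have unassigned pairs
        double total_freq = 0.0;
        for (size_t j = 0; j < available.size(); ++j) {
            if (assigned[j] < available[j]) {
                total_freq += frequency[j];
            }
        }
        if (total_freq == 0.0) {
            break;  // every motif is saturated
        }
        int64_t next_budget = budget;
        for (size_t j = 0; j < available.size(); ++j) {
            int64_t remaining = available[j] - assigned[j];
            if (remaining > 0) {
                // rounding up can overshoot slightly, but it guarantees that each round
                // either spends the budget or clears out at least one motif
                int64_t share = std::min<int64_t>((int64_t) std::ceil(frequency[j] / total_freq * budget),
                                                  remaining);
                assigned[j] += share;
                next_budget -= share;
            }
        }
        budget = next_budget;
    }
    return assigned;
}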
+ auto alnr = get_aligner(!alignment.quality().empty()); + alnr->align_global_banded(join.connecting_aln, join.joined_graph, 1); + + // the total score of extending the anchor by the candidate + int32_t net_score = join.post_align_net_score(splice_stats, opt); + #ifdef debug_multipath_mapper - cerr << "performed up to " << max_rescues_attempted_idx << " out of " << plausible_clusters_end_idx << " plausible rescues" << endl; + cerr << "next candidate spliced alignment with score bound " << join.max_score << " has net score " << net_score << " after realigning read interval " << connect_begin << ":" << (connect_begin + connect_len) << ", must get " << no_splice_log_odds << " for significance" << endl; #endif - if (max_rescues_attempted_idx < plausible_clusters_end_idx) { - // we didn't attempt rescue from all of the plausible clusters, so we have to account for the possibility that - // we missed the mapping just because of the subset of rescues we attempted + if (net_score > no_splice_log_odds) { + // this is a statistically significant spliced alignment + + // find which mapping is immediately after the splice + auto path = join.connecting_aln.mutable_path(); + auto splice_id = join.joined_graph.get_id(join.joined_graph.right_splice_node()); + join.splice_idx = 1; + while (path->mapping(join.splice_idx).position().node_id() != splice_id) { + ++join.splice_idx; + } - double fraction_considered = double(max_rescues_attempted_idx) / double(plausible_clusters_end_idx); - int32_t rescue_mapq = round(prob_to_phred(1.0 - fraction_considered)); + // and translate into the original ID space + join.joined_graph.translate_node_ids(*path); + if (net_score > best_net_score || + (net_score == best_net_score && join.estimated_intron_length < best_intron_length)) { #ifdef debug_multipath_mapper - cerr << "capping mapping quality at " << rescue_mapq << endl; + cerr << "this score from motif " << join.motif_idx << " and splice site " << path->mapping(join.splice_idx - 1).position().node_id() << (path->mapping(join.splice_idx - 1).position().is_reverse() ? "-" : "+") << " -> " << path->mapping(join.splice_idx).position().node_id() << (path->mapping(join.splice_idx).position().is_reverse() ? "-" : "+") << " is the best so far, beating previous best " << best_net_score << endl; #endif - - // cap the mapping quality at this value - multipath_aln_pairs_out.front().first.set_mapping_quality(min(multipath_aln_pairs_out.front().first.mapping_quality(), - rescue_mapq)); - multipath_aln_pairs_out.front().second.set_mapping_quality(min(multipath_aln_pairs_out.front().second.mapping_quality(), - rescue_mapq)); + best_intron_length = join.estimated_intron_length; + best_net_score = net_score; + best_join = unique_ptr(new PutativeJoin(move(join))); + + } } + + // queue up the next join in the heap front + pop_heap(putative_joins.begin(), putative_joins.end(), score_bound_comp); + putative_joins.pop_back(); } - } - - double MultipathMapper::prob_equivalent_clusters_hits_missed(const memcluster_t& cluster) const { - // we will approximate the probability of missing a cluster with the same MEMs as this one due to hit sampling - // by the probability of missing the correct hit for all of the MEMs in this one - double prob_of_missing_all = 1.0; - for (const pair& hit : cluster) { - const MaximalExactMatch& mem = *hit.first; - prob_of_missing_all *= (mem.queried_count >= mem.match_count ? 
0.0 : 1.0 - double(mem.queried_count) / double(mem.match_count)); + +#ifdef debug_multipath_mapper + cerr << "pruned " << putative_joins.size() << " putative joins for having low score bounds" << endl; +#endif + + if (best_join.get() == nullptr) { +#ifdef debug_multipath_mapper + cerr << "no splice candidates were statistically significant" << endl; +#endif + return false; } - return prob_of_missing_all; - } - - void MultipathMapper::cap_mapping_quality_by_hit_sampling_probability(vector>& multipath_aln_pairs_out, - vector, int64_t>>& cluster_pairs, - vector& cluster_graphs1, - vector& cluster_graphs2, - bool did_secondary_rescue) const { + + // greedily fix the strand + // TODO: ideally we'd probably try fixing it each way and see which is better + strand = (splice_stats.motif_is_reverse(best_join->motif_idx) ? Reverse : Forward); + + *anchor_multiplicity_out = min(get_multiplicity(best_join->left_candidate_idx), + get_multiplicity(best_join->right_candidate_idx)); + anchor_mp_aln = fuse_spliced_alignments(alignment, + consume_candidate(best_join->left_candidate_idx), + consume_candidate(best_join->right_candidate_idx), + alignment.sequence().size() - best_join->left_clip_length, + best_join->connecting_aln, best_join->splice_idx, + splice_stats.motif_score(best_join->motif_idx) + best_join->intron_score - no_splice_log_odds, + *get_aligner(!alignment.quality().empty()), *xindex); #ifdef debug_multipath_mapper - cerr << "checking whether to cap mapping quality based on hit sub-sampling" << endl; + cerr << "found significant splice join, fused mp aln:" << endl; + cerr << debug_string(anchor_mp_aln) << endl; #endif - // did we use an out-of-bounds cluster index to flag either end as coming from a rescue? - bool opt_aln_1_is_rescued = cluster_pairs.front().first.first >= cluster_graphs1.size(); - bool opt_aln_2_is_rescued = cluster_pairs.front().first.second >= cluster_graphs2.size(); +#ifdef debug_validate_multipath_alignments +#ifdef debug_multipath_mapper + cerr << "validating spliced alignment:" << endl; + cerr << debug_string(anchor_mp_aln) << endl; +#endif + if (!validate_multipath_alignment(anchor_mp_aln, *xindex)) { + cerr << "### WARNING ###" << endl; + cerr << "multipath alignment of read " << anchor_mp_aln.sequence() << " failed to validate" << endl; + } +#endif + + return true; + } + + void MultipathMapper::align_to_splice_candidates(const Alignment& alignment, + vector& cluster_graphs, + const vector& mems, + const vector& cluster_candidates, + const vector>& hit_candidates, + const pair& primary_interval, + bool searching_left, + bool is_read_1, + unordered_map>& unaligned_candidate_bank, + vector& candidates_out, + const match_fanouts_t* mem_fanouts) const { + + + // make an alignment to the a candidate's cluster + auto align_to_candidate = [&](clustergraph_t& cluster_graph, + const candidate_id_t& candidate_id) { + + auto& banked_candidate = unaligned_candidate_bank[candidate_id]; + + multipath_align(alignment, cluster_graph, banked_candidate.first, mem_fanouts); + topologically_order_subpaths(banked_candidate.first); + + banked_candidate.second = cluster_multiplicity(get<1>(cluster_graph)); + }; + + // get the candidate alignment associated with the ID, making the alignment if necessary + auto get_candidate = [&](const candidate_id_t& candidate_id) -> multipath_alignment_t& { + auto it = unaligned_candidate_bank.find(candidate_id); + if (it != unaligned_candidate_bank.end()) { + // the candidate is already in the bank + return it->second.first; + } + else { + if 
(get<1>(candidate_id) >= 0) { + // this is an unaligned cluster + align_to_candidate(cluster_graphs[get<1>(candidate_id)], candidate_id); + } + else { + // this is an unclustered hit + vector dummy_clusters(1); + dummy_clusters.front().first.emplace_back(make_pair(get<2>(candidate_id), + get<3>(candidate_id))); + dummy_clusters.front().second = 1.0; + auto dummy_cluster_graphs = query_cluster_graphs(alignment, mems, dummy_clusters); + align_to_candidate(dummy_cluster_graphs.front(), candidate_id); + } + return unaligned_candidate_bank[candidate_id].first; + } + }; + + // decide if a candidate should be evaluated as a potential splice + auto consider_candidate = [&](const candidate_id_t& candidate_id) { + + const multipath_alignment_t& candidate = get_candidate(candidate_id); + + // TODO: repetitive with identify + // check if the fully realized alignment still looks approx disjoint with the primary + auto interval = aligned_interval(candidate); + if (searching_left) { + if (interval.second >= primary_interval.first + 2 * max_softclip_overlap || + min(interval.second, primary_interval.first) - interval.first < min_softclip_length_for_splice) { +#ifdef debug_multipath_mapper + cerr << "rejecting candidate because of overlap" << endl; + cerr << "\tprimary interval: " << primary_interval.first << " " << primary_interval.second << endl; + cerr << "\tcandidate interval: " << interval.first << " " << interval.second << endl; +#endif + return; + } + } + else { + if (interval.first < primary_interval.second - 2 * max_softclip_overlap || + interval.second - max(interval.first, primary_interval.second) < min_softclip_length_for_splice) { +#ifdef debug_multipath_mapper + cerr << "rejecting candidate because of overlap" << endl; + cerr << "\tprimary interval: " << primary_interval.first << " " << primary_interval.second << endl; + cerr << "\tcandidate interval: " << interval.first << " " << interval.second << endl; +#endif + return; + } + } + // did not reject the candidate + candidates_out.push_back(candidate_id); + return; + }; - // what is the chance that we would have missed a cluster with the same MEMs because of hit sub-sampling - double prob_missing_equiv_cluster_1 = 0.0, prob_missing_equiv_cluster_2 = 0.0; - if (!opt_aln_1_is_rescued) { - prob_missing_equiv_cluster_1 = prob_equivalent_clusters_hits_missed(get<1>(cluster_graphs1[cluster_pairs.front().first.first])); + // unaligned clusters + for (auto i : cluster_candidates) { +#ifdef debug_multipath_mapper + size_t num_before_align = candidates_out.size(); +#endif + + // make the associated ID and evaluate it + candidate_id_t candidate_id(is_read_1, i, nullptr, pos_t()); + consider_candidate(candidate_id); + +#ifdef debug_multipath_mapper + if (candidates_out.size() > num_before_align) { + cerr << "added cluster splice candidate " << i << ":" << endl; + cerr << debug_string(get_candidate(candidates_out.back())) << endl; + } +#endif } - if (!opt_aln_2_is_rescued) { - prob_missing_equiv_cluster_2 = prob_equivalent_clusters_hits_missed(get<1>(cluster_graphs2[cluster_pairs.front().first.second])); + + // unclustered hits + for (const auto& hit : hit_candidates) { + +#ifdef debug_multipath_mapper + size_t num_before_align = candidates_out.size(); +#endif + // make the associated ID and evaluate it + candidate_id_t candidate_id(is_read_1, -1, hit.first, hit.second); + consider_candidate(candidate_id); + +#ifdef debug_multipath_mapper + if (candidates_out.size() > num_before_align) { + cerr << "made alignment to a seed splice candidate " << 
hit.first->sequence() << " " << hit.second << endl; + cerr << debug_string(get_candidate(candidates_out.back())) << endl; + } +#endif } +#ifdef debug_validate_multipath_alignments + for (size_t i = 0; i < candidates_out.size(); ++i) { + const auto& multipath_aln = get_candidate(candidates_out[i]); #ifdef debug_multipath_mapper - cerr << "estimate probability of missing correct cluster from hit sampling for read 1 as " << prob_missing_equiv_cluster_1 << " and read 2 as " << prob_missing_equiv_cluster_2 << endl; + cerr << "validating " << i << "-th splice candidate alignment:" << endl; + cerr << debug_string(multipath_aln) << endl; +#endif + if (!validate_multipath_alignment(multipath_aln, *xindex)) { + cerr << "### WARNING ###" << endl; + cerr << "multipath alignment of read " << multipath_aln.sequence() << " failed to validate" << endl; + } + } #endif + } + + bool MultipathMapper::attempt_rescue_for_splice_segment(const Alignment& alignment, const pair& primary_interval, + const multipath_alignment_t& rescue_anchor, + bool rescue_left, multipath_alignment_t& rescued_out) const { - if (prob_missing_equiv_cluster_1 == 0.0 && prob_missing_equiv_cluster_2 == 0.0) { - // we can bail out now if we don't think hit sub-sampling was a factor - return; + // extract the portion of the read that we want to form a spliced alignment with + string::const_iterator begin, end; + if (rescue_left) { + begin = alignment.sequence().begin() + max(0, primary_interval.second - max_softclip_overlap); + end = alignment.sequence().end(); + } + else { + begin = alignment.sequence().begin(); + end = alignment.sequence().begin() + min(primary_interval.first + max_softclip_overlap, + alignment.sequence().size()); + } + Alignment splice_aln; + splice_aln.set_sequence(string(begin, end)); + if (!alignment.quality().empty()) { + splice_aln.set_quality(string(alignment.quality().begin() + (begin - alignment.sequence().begin()), + alignment.quality().begin() + (end - alignment.sequence().begin()))); } - // what is the chance that we would have missed a pair? - double prob_missing_pair = 0.0; - if (opt_aln_1_is_rescued) { #ifdef debug_multipath_mapper - cerr << "optimal mapping is a rescue on read 1, sampling error only applies to read 2" << endl; + cerr << "attempting to rescue a spliced alignment segment to " << (rescue_left ? 
"left" : "right") << ":" << endl; + cerr << pb2json(splice_aln) << endl; #endif - - prob_missing_pair = prob_missing_equiv_cluster_2; + + // adjust the mean length to account for the part we're not realigning + double rescue_mean_length = max(fragment_length_distr.mean() + - (alignment.sequence().size() - splice_aln.sequence().size()), 0.0); + + // try to align + bool succeeded = do_rescue_alignment(rescue_anchor, splice_aln, !rescue_left, rescued_out, + rescue_mean_length, splice_rescue_graph_std_devs); + + if (!succeeded) { + // we couldn't do the alignment + return false; } - else if (opt_aln_2_is_rescued) { + + // check if we got enough new, disjoint matches to be worth looking at (we can be + // pretty permissive here because we will test this candidate for signficance in next + // step of spliced alignment algorithm + auto rescued_interval = aligned_interval(rescued_out); + auto aligned_length = rescued_interval.second - rescued_interval.first; + auto threshold = log(2.0 * splice_rescue_graph_std_devs * fragment_length_distr.std_dev()) / log(4.0); + if (aligned_length < threshold) { #ifdef debug_multipath_mapper - cerr << "optimal mapping is a rescue on read 2, sampling error only applies to read 1" << endl; + cerr << "rescue candidate with aligned length " << aligned_length << " does not reach significance threshold of " << threshold << ", not checking for spliced alignments" << endl; #endif - - prob_missing_pair = prob_missing_equiv_cluster_1; + return false; } - else if (!did_secondary_rescue) { - // in this case we are assured that there was no rescue (except perhaps a failed rescue - // that was trying to improve over a likely mismapping--a rare case where we won't mind - // being pessimistic about the mapping quality) + if (rescue_left) { + // we need to adjust offsets to make it match the full read + rescued_interval.first += (begin - alignment.sequence().begin()); + rescued_interval.second += (begin - alignment.sequence().begin()); - prob_missing_pair = 1.0 - (1.0 - prob_missing_equiv_cluster_1) * (1.0 - prob_missing_equiv_cluster_2); + succeeded = (rescued_interval.first >= primary_interval.second - max_softclip_overlap + && rescued_interval.second - max(rescued_interval.first, primary_interval.second) >= min_splice_rescue_matches + && rescued_interval.second > primary_interval.first); } else { - // the complicated case: - - // we did secondary rescue, so now we must account for the fact that we *could* have recovered a cluster that we missed - // because of MEM sub-sampling through the secondary rescue code path. - - // we will first cap the estimate of the probability of missing an equivalent cluster by the probability of rescuing - // from the other read's cluster (assuming the other cluster is correct) - + succeeded = (rescued_interval.second <= primary_interval.first + max_softclip_overlap + && min(rescued_interval.second, primary_interval.first) >= min_splice_rescue_matches + && rescued_interval.first < primary_interval.first); + } #ifdef debug_multipath_mapper - cerr << "we did secondary rescue, so we must also account for the ability to rescue a sub-sampled cluster" << endl; + cerr << "rescue candidate covers read interval " << rescued_interval.first << ":" << rescued_interval.second << " compared to primary interval " << primary_interval.first << ":" << primary_interval.second << ", considered successful? 
" << succeeded << endl; #endif - - // checks if one cluster is a subset of another - auto cluster_contained_in = [](const memcluster_t& sub, const memcluster_t& super) { - size_t i = 0; - if (sub.size() <= super.size()) { - // query_cluster_graphs sorts the cluster hits so that they are ordered by length - // and then lexicographically by read interval - for (size_t j = 0; i < sub.size() && j < super.size(); j++) { - const MaximalExactMatch* mem_sub = sub[i].first; - const MaximalExactMatch* mem_super = super[j].first; - if (mem_sub == mem_super) { - i++; + + if (succeeded) { +#ifdef debug_multipath_mapper + cerr << "re-introducing trimmed sequence to rescued multipath alignment:" << endl; + cerr << debug_string(rescued_out) << endl; +#endif + + // add in the soft-clips for the part of the read we trimmed off + if (rescue_left) { + // the softclip should come at the beginning of the rescued alignment + rescued_out.mutable_sequence()->insert(rescued_out.mutable_sequence()->begin(), + alignment.sequence().begin(), begin); + if (!alignment.quality().empty()) { + rescued_out.mutable_quality()->insert(rescued_out.mutable_quality()->begin(), + alignment.quality().begin(), + alignment.quality().begin() + (begin - alignment.sequence().begin())); + } + for (auto i : rescued_out.start()) { + auto subpath = rescued_out.mutable_subpath(i); + auto mapping = subpath->mutable_path()->mutable_mapping(0); + if (mapping->edit().front().from_length() == 0) { + // there's already a softclip, just expand it + auto edit = mapping->mutable_edit(0); + edit->mutable_sequence()->insert(edit->mutable_sequence()->begin(), + alignment.sequence().begin(), begin); + edit->set_to_length(edit->sequence().size()); + } + else { + // add a new edit for the softclip + edit_t softclip; + softclip.set_sequence(string(alignment.sequence().begin(), begin)); + softclip.set_to_length(softclip.sequence().size()); + softclip.set_from_length(0); + mapping->mutable_edit()->insert(mapping->edit().begin(), softclip); + // we need to take away the full length bonus + subpath->set_score(subpath->score() + - get_aligner(alignment.quality().empty())->score_full_length_bonus(true, splice_aln)); + } + } + } + else { + // the softclip should come at the end of the rescued alignment + *rescued_out.mutable_sequence() += string(end, alignment.sequence().end()); + if (!alignment.quality().empty()) { + *rescued_out.mutable_quality() += string(alignment.quality().begin() + (end - alignment.sequence().begin()), + alignment.quality().end()); + } + for (size_t i = 0; i < rescued_out.subpath_size(); ++i) { + auto subpath = rescued_out.mutable_subpath(i); + if (subpath->next().empty()) { + auto& mapping = subpath->mutable_path()->mutable_mapping()->back(); + if (mapping.edit().back().from_length() == 0) { + // expand existing softclip + auto& edit = mapping.mutable_edit()->back(); + *edit.mutable_sequence() += string(end, alignment.sequence().end()); + edit.set_to_length(edit.sequence().size()); } - else if (mem_sub->length() > mem_super->length() || (mem_sub->length() == mem_super->length() && - make_pair(mem_sub->begin, mem_sub->end) > make_pair(mem_super->begin, mem_super->end))) { - // we've passed the place in the vector where this hit should have occurred - break; + else { + // add new edit for softclip + auto softclip = mapping.add_edit(); + softclip->set_sequence(string(end, alignment.sequence().end())); + softclip->set_to_length(softclip->sequence().size()); + softclip->set_from_length(0); + // we need to take away the full length bonus + 
subpath->set_score(subpath->score() + - get_aligner(alignment.quality().empty())->score_full_length_bonus(false, splice_aln)); } - } } - // did we find all of the MEMs in the superset? - return i == sub.size();; - }; - - // TODO: figuring out which clusters we rescued from requires me to duplicate the secondary rescue code... + } +#ifdef debug_multipath_mapper + cerr << "final rescued multipath alignment segment:" << endl; + cerr << debug_string(rescued_out) << endl; +#endif + } + return succeeded; + } + + void MultipathMapper::identify_aligned_splice_candidates(const Alignment& alignment, bool search_left, + const pair& primary_interval, + const vector& multipath_alns, + const vector& cluster_idxs, + const vector& current_index, int64_t anchor, + unordered_set& clusters_used_out, + vector& mp_aln_candidates_out) const { + + // just to make sure we don't bother looking at earlier clusters (which have already had a chance to splice) + for (size_t idx = 0; idx <= anchor; ++idx) { + int64_t i = current_index[idx]; + if (i >= 0) { + clusters_used_out.insert(cluster_idxs[i]); + } + } + + for (size_t idx = anchor + 1; idx < current_index.size(); ++idx) { - // keep track of which clusters already have consistent pairs (so we wouldn't have rescued from them) - unordered_set paired_clusters_1, paired_clusters_2; - for (size_t i = 0; i < multipath_aln_pairs_out.size(); i++) { - paired_clusters_1.insert(cluster_pairs[i].first.first); - paired_clusters_2.insert(cluster_pairs[i].first.second); + int64_t i = current_index[idx]; + if (i < 0) { + continue; } - // the score parameters from the secondary rescue code that we use to decide which clusters to rescue from - int32_t cluster_score_1 = get_aligner()->match * get<2>(cluster_graphs1[cluster_pairs.front().first.first]); - int32_t cluster_score_2 = get_aligner()->match * get<2>(cluster_graphs2[cluster_pairs.front().first.second]); - int32_t max_score_diff = secondary_rescue_score_diff * get_aligner()->mapping_quality_score_diff(max_mapping_quality); + // check that the alignment is mostly disjoint of the primary and that + // that the independent aligned portion is significant to call this a potential splice alignment + auto interval = aligned_interval(multipath_alns[i]); + +#ifdef debug_multipath_mapper + cerr << "aligned candidate " << i << " has interval " << interval.first << " " << interval.second << endl; +#endif + - // we'll count up how many of the equivalent clusters we would have done secondary rescue from - size_t num_equivalent_1 = 0; - size_t num_equivalent_2 = 0; - size_t num_rescued_equivalent_1 = 0; - size_t num_rescued_equivalent_2 = 0; + if (search_left) { + if (interval.second < primary_interval.first + max_softclip_overlap && + min(interval.second, primary_interval.first) - interval.first >= min_softclip_length_for_splice) { + + mp_aln_candidates_out.push_back(i); + clusters_used_out.insert(cluster_idxs[i]); + } + } + else { + if (interval.first >= primary_interval.second - max_softclip_overlap && + interval.second - max(interval.first, primary_interval.second) >= min_softclip_length_for_splice) { + + mp_aln_candidates_out.push_back(i); + clusters_used_out.insert(cluster_idxs[i]); + } + } + } + +#ifdef debug_multipath_mapper + cerr << "found fully aligned splice candidates:" << endl; + for (auto i : mp_aln_candidates_out) { + cerr << "\t" << i << endl; + } +#endif + } + + void MultipathMapper::identify_aligned_splice_candidates(const Alignment& alignment, bool read_1, bool search_left, + const pair& primary_interval, + const 
vector>& multipath_aln_pairs, + const vector, int64_t>>& cluster_pairs, + const vector& current_index, int64_t anchor, + unordered_set& clusters_used_out, + vector& mp_aln_candidates_out) const { + + + // just to make sure we don't bother looking at earlier clusters (which have already had a chance to splice) + for (size_t idx = 0; idx <= anchor; ++idx) { + int64_t i = current_index[idx]; + if (i >= 0) { + clusters_used_out.insert(read_1 ? cluster_pairs[i].first.first : cluster_pairs[i].first.second); + } + } + + for (size_t idx = anchor + 1; idx < current_index.size(); ++idx) { - size_t num_rescues_1 = 0; - for (size_t i = 0; i < cluster_graphs1.size(); i++) { - // is this an equivalent cluster? - bool equiv = cluster_contained_in(get<1>(cluster_graphs1[cluster_pairs.front().first.first]), get<1>(cluster_graphs1[i])); - // and would we have done used it for a secondary rescue? - bool did_rescue = (!paired_clusters_1.count(i) && num_rescues_1 < secondary_rescue_attempts - && get<2>(cluster_graphs1[i]) * get_aligner()->match >= cluster_score_1 - max_score_diff); - // keep track of the counts - num_rescues_1 += did_rescue; - num_equivalent_1 += equiv; - num_rescued_equivalent_1 += did_rescue && equiv; + int64_t i = current_index[idx]; + if (i < 0) { + continue; } - size_t num_rescues_2 = 0; - for (size_t i = 0; i < cluster_graphs2.size(); i++) { - // is this an equivalent cluster? - bool equiv = cluster_contained_in(get<1>(cluster_graphs2[cluster_pairs.front().first.second]), get<1>(cluster_graphs2[i])); - // and would we have done used it for a secondary rescue? - bool did_rescue = (!paired_clusters_2.count(i) && num_rescues_2 < secondary_rescue_attempts - && get<2>(cluster_graphs2[i]) * get_aligner()->match >= cluster_score_2 - max_score_diff); - // keep track of the counts - num_rescues_2 += did_rescue; - num_equivalent_2 += equiv; - num_rescued_equivalent_2 += did_rescue && equiv; + const multipath_alignment_t& mp_aln = read_1 ? multipath_aln_pairs[i].first : multipath_aln_pairs[i].second; + + // check that the alignment is mostly disjoint of the primary and that + // that the independent aligned portion is significant to call this a potential splice alignment + auto interval = aligned_interval(mp_aln); + +#ifdef debug_multipath_mapper + cerr << "aligned candidate " << i << " has interval " << interval.first << " " << interval.second << endl; +#endif + if (search_left) { + if (interval.second < primary_interval.first + max_softclip_overlap && + min(interval.second, primary_interval.first) - interval.first >= min_softclip_length_for_splice) { + + size_t cluster = read_1 ? cluster_pairs[i].first.first : cluster_pairs[i].first.second; + if (!clusters_used_out.count(cluster)) { + mp_aln_candidates_out.push_back(i); + clusters_used_out.insert(cluster); + } + } + } + else { + if (interval.first >= primary_interval.second - max_softclip_overlap && + interval.second - max(interval.first, primary_interval.second) >= min_softclip_length_for_splice) { + + size_t cluster = read_1 ? 
cluster_pairs[i].first.first : cluster_pairs[i].first.second; + if (!clusters_used_out.count(cluster)) { + mp_aln_candidates_out.push_back(i); + clusters_used_out.insert(cluster); + } + } + } + } + +#ifdef debug_multipath_mapper + cerr << "found fully aligned splice candidates:" << endl; + for (auto i : mp_aln_candidates_out) { + cerr << "\t" << i << endl; + } +#endif + + } + + void MultipathMapper::identify_unaligned_splice_candidates(const Alignment& alignment, bool search_left, + const pair& primary_interval, + const vector& mems, + const vector& cluster_graphs, + const unordered_set& clusters_already_used, + vector& cluster_candidates_out, + vector>& hit_candidates_out) const { + +#ifdef debug_multipath_mapper + cerr << "looking for unaligned splice candidates" << endl; +#endif + + for (size_t i = 0; i < cluster_graphs.size(); ++i) { + + if (clusters_already_used.count(i) || get<1>(cluster_graphs[i]).first.empty()) { + continue; + } + + auto intervals = covered_intervals(alignment, cluster_graphs[i]); + +#ifdef debug_multipath_mapper + cerr << "cluster candidate " << i << " has intervals" << endl; + for (auto interval : intervals) { + cerr << "\t" << interval.first << " " << interval.second << endl; + } +#endif + + // check to make sure cluster doesn't overlap too much with the alignment + // and also covers a sufficient part of the alignment's softclip + + if (search_left) { + int64_t overlap = intervals.back().second - primary_interval.first; + int64_t ind_cov = 0; + for (size_t i = 0; i < intervals.size(); ++i) { + auto& interval = intervals[i]; + if (interval.first >= primary_interval.first) { + break; + } + else if (interval.second >= primary_interval.first) { + ind_cov += primary_interval.first - interval.first; + break; + } + else { + ind_cov += interval.second - interval.first; + } + } + + if (overlap < max_softclip_overlap && ind_cov >= min_softclip_length_for_splice) { + cluster_candidates_out.emplace_back(i); + } + } + else { + int64_t overlap = primary_interval.second - intervals.front().first; + int64_t ind_cov = 0; + for (int64_t i = intervals.size() - 1; i >= 0; --i) { + auto& interval = intervals[i]; + if (interval.second <= primary_interval.second) { + break; + } + else if (interval.first <= primary_interval.second) { + ind_cov += interval.second - primary_interval.second; + break; + } + else { + ind_cov += interval.second - interval.first; + } + } + + if (overlap < max_softclip_overlap && ind_cov >= min_softclip_length_for_splice) { + cluster_candidates_out.emplace_back(i); + } + } + } + +#ifdef debug_multipath_mapper + cerr << "found unaligned cluster candidates:" << endl; + for (auto i : cluster_candidates_out) { + cerr << "\t" << i << endl; + } +#endif + + // if we already processed a hit's cluster, we don't need to look at the hit + unordered_set> hits_already_used; + for (const auto& i : cluster_candidates_out) { + // make sure it isn't a phony cluster number from rescue + if (i < cluster_graphs.size()) { + for (const auto& hit : get<1>(cluster_graphs[i]).first) { + hits_already_used.insert(hit); + } + } + } + for (const auto& i : clusters_already_used) { + // make sure it isn't a phony cluster number from rescue + if (i < cluster_graphs.size()) { + for (const auto& hit : get<1>(cluster_graphs[i]).first) { + hits_already_used.insert(hit); + } + } + } + + // TODO: tie in mem fanouts? 
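Editor's note: for the unaligned cluster candidates above, the left-side test reduces to two quantities over the candidate's covered read intervals: how far it overhangs into the primary alignment, and how much of the read it covers independently before the primary interval begins. A self-contained sketch of that check, assuming the intervals are sorted by start position:

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

bool is_left_splice_candidate(const std::vector<std::pair<int64_t, int64_t>>& intervals,
                              int64_t primary_begin, int64_t max_softclip_overlap,
                              int64_t min_softclip_length_for_splice) {
    if (intervals.empty()) {
        return false;
    }
    // how far the candidate reaches into (or past) the primary alignment
    int64_t overlap = intervals.back().second - primary_begin;
    // read bases covered strictly before the primary alignment starts
    int64_t independent_coverage = 0;
    for (const auto& interval : intervals) {
        if (interval.first >= primary_begin) {
            break;
        }
        independent_coverage += std::min(interval.second, primary_begin) - interval.first;
    }
    return overlap < max_softclip_overlap
        && independent_coverage >= min_softclip_length_for_splice;
}

The right-side test mirrors this, measuring overhang relative to the end of the primary interval and independent coverage after it.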
+ + for (size_t i = 0; i < mems.size(); ++i) { + + const auto& mem = mems[i]; + + if (mem.length() < min_softclip_length_for_splice) { + continue; + } + if (search_left) { + int64_t overlap = (mem.end - alignment.sequence().begin()) - primary_interval.first; + int64_t ind_cov = (min(mem.end - alignment.sequence().begin(), primary_interval.first) + - (mem.begin - alignment.sequence().begin())); + + if (overlap < max_softclip_overlap && ind_cov >= min_softclip_length_for_splice) { + for (auto gcsa_node : mem.nodes) { + pos_t pos = make_pos_t(gcsa_node); + if (!hits_already_used.count(make_pair(&mem, pos))) { + hit_candidates_out.emplace_back(&mem, pos); + } + } + } + } + else { + int64_t overlap = primary_interval.second - (mem.begin - alignment.sequence().begin()); + int64_t ind_cov = ((mem.end - alignment.sequence().begin()) + - max(mem.begin - alignment.sequence().begin(), primary_interval.second)); + + if (overlap < max_softclip_overlap && ind_cov >= min_softclip_length_for_splice) { + for (auto gcsa_node : mem.nodes) { + pos_t pos = make_pos_t(gcsa_node); + if (!hits_already_used.count(make_pair(&mem, pos))) { + hit_candidates_out.emplace_back(&mem, pos); + } + } + } + } + } +#ifdef debug_multipath_mapper + cerr << "found unclustered hit candidates:" << endl; + for (auto hit : hit_candidates_out) { + cerr << "\t" << hit.first->sequence() << " " << hit.second << endl; + } +#endif + } + + bool MultipathMapper::find_spliced_alignments(const Alignment& alignment, + vector& multipath_alns_out, + vector& multiplicities, + vector& cluster_idxs, + const vector& mems, + vector& cluster_graphs, + const match_fanouts_t* fanouts, + const multipath_alignment_t* rescue_anchor, + bool rescue_left, + double rescue_multiplicity) { + + if (multipath_alns_out.empty()) { + return false; + } + + // TODO: it would be better to use likelihoods rather than scores (esp. in paired) + + // choose the score cutoff based on the original unspliced mappings + int32_t min_score_to_attempt = (optimal_alignment_score(multipath_alns_out.front()) + - get_aligner()->mapping_quality_score_diff(max_mapping_quality)); + + vector current_index(multipath_alns_out.size(), 0); + for (int64_t i = 1; i < multipath_alns_out.size(); ++i) { + current_index[i] = i; + } + vector original_index = current_index; + + // a shared bank of candidates that we can re-use across alignments + unordered_map> unaligned_candidate_bank; + + // we'll keep track of whether any spliced alignments succeeded + bool any_splices = false; + // so far we haven't restricted to splice motifs on any particular strand + SpliceStrand strand = Undetermined; + + for (size_t j = 0; j < current_index.size(); ) { + if (current_index[j] < 0) { + // this alignment has been consumed as a splice candidate + ++j; + continue; + } + +#ifdef debug_multipath_mapper + cerr << "deciding whether to look for spliced alignments on mp aln " << current_index[j] << endl; +#endif + + if (optimal_alignment_score(multipath_alns_out[current_index[j]]) < min_score_to_attempt) { + // the rest of the alignments are too low-scoring to look at + break; + } + + auto interval = aligned_interval(multipath_alns_out[current_index[j]]); + if (interval.first == interval.second) { + // this anchor is unmapped + ++j; + continue; + } + // TODO: repetitive with paired version + auto alnr = get_aligner(!alignment.quality().empty()); + int64_t left_max_score = (alnr->score_exact_match(alignment, 0, interval.first) + + (interval.first == 0 ? 
0 : alnr->score_full_length_bonus(true, alignment))); + int64_t right_max_score = (alnr->score_exact_match(alignment, interval.second, + alignment.sequence().size() - interval.second) + + (interval.second == alignment.sequence().size() ? 0 : alnr->score_full_length_bonus(false, alignment))); + bool search_left = left_max_score >= min_softclipped_score_for_splice; + bool search_right = right_max_score >= min_softclipped_score_for_splice; + + if (!(search_left || search_right)) { +#ifdef debug_multipath_mapper + cerr << "soft clips are not sufficiently large to look for spliced alignment on interval " << interval.first << ":" << interval.second << " with max tail scores " << left_max_score << " and " << right_max_score << ", max score required: " << min_softclipped_score_for_splice << endl; +#endif + ++j; + continue; + } + +#ifdef debug_multipath_mapper + cerr << "looking for spliced alignments, to left? " << search_left << ", to right? " << search_right << ", interval " << interval.first << ":" << interval.second << ", with max tail scores " << left_max_score << " and " << right_max_score << ", max score required: " << min_softclipped_score_for_splice << endl; +#endif + + bool found_splice_for_anchor = false; + + for (bool do_left : {true, false}) { + if ((do_left && !search_left) || (!do_left && !search_right)) { + continue; + } + // try to figure out the softclip that we're seeing looks like it could be due to readthrough + // of a adapter across the paired reads + // TODO: magic number + static const int64_t adapter_overlap_slosh = 2; + if (!do_left && !read_1_adapter.empty()) { + size_t begin = max(0, interval.second - adapter_overlap_slosh); + size_t end = min(begin + read_1_adapter.size() + 2 * adapter_overlap_slosh, + alignment.sequence().size()); + size_t pos = kmp_search(alignment.sequence().c_str() + begin, end - begin, + read_1_adapter.c_str(), read_1_adapter.size(), + read_1_adapter_lps); + if (pos != string::npos) { + // this softclip is bracketed by a known adapter sequence, it is much more likely + // that it should be adapter trimmed rather than meriting a spliced alignment + continue; + } + } + + // move the anchor out of the vector to protect it from any shuffling that goes on + multipath_alignment_t splice_anchor = move(multipath_alns_out[current_index[j]]); + + // identify which alignments, clusters, and hits could be part of a spliced alignment + vector mp_aln_candidates; + unordered_set clusters_used; + vector cluster_candidates; + vector> hit_candidates; + identify_aligned_splice_candidates(alignment, do_left, interval, multipath_alns_out, cluster_idxs, + current_index, j, clusters_used, mp_aln_candidates); + identify_unaligned_splice_candidates(alignment, do_left, interval, mems, + cluster_graphs, clusters_used, cluster_candidates, + hit_candidates); + + // make alignments for any unaligned candidates + vector unaligned_candidates; + align_to_splice_candidates(alignment, cluster_graphs, mems, cluster_candidates, hit_candidates, + interval, do_left, true, unaligned_candidate_bank, unaligned_candidates, + fanouts); + + // set up functions for the spliced alignment decision process + + function get_candidate = [&](int64_t i) -> const multipath_alignment_t& { + if (i < mp_aln_candidates.size()) { + return multipath_alns_out[mp_aln_candidates[i]]; + } + else { + return unaligned_candidate_bank.at(unaligned_candidates[i - mp_aln_candidates.size()]).first; + } + }; + + function get_multiplicity = [&](int64_t i) { + if (i < 0) { + return multiplicities[current_index[j]]; + } 
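Editor's note: the search_left/search_right decision above asks whether a softclipped tail could, if aligned perfectly somewhere else, recover enough score to justify a spliced-alignment attempt. A simplified sketch under a flat match-score model; match_score and full_length_bonus are illustrative stand-ins for the aligner's (possibly quality-adjusted) parameters.

#include <cstdint>
#include <utility>

// aligned_begin and aligned_end delimit the read interval that the primary alignment covers
std::pair<bool, bool> decide_splice_search(int64_t read_length, int64_t aligned_begin,
                                           int64_t aligned_end, int64_t match_score,
                                           int64_t full_length_bonus,
                                           int64_t min_softclipped_score_for_splice) {
    // the most score the left tail could recover if it aligned perfectly elsewhere;
    // the full length bonus only applies if the tail actually reaches the read end
    int64_t left_max = match_score * aligned_begin
        + (aligned_begin == 0 ? 0 : full_length_bonus);
    // likewise for the right tail
    int64_t right_max = match_score * (read_length - aligned_end)
        + (aligned_end == read_length ? 0 : full_length_bonus);
    bool search_left = left_max >= min_softclipped_score_for_splice;
    bool search_right = right_max >= min_softclipped_score_for_splice;
    return {search_left, search_right};
}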
+ else if (i < mp_aln_candidates.size()) { + return multiplicities[mp_aln_candidates[i]]; + } + else { + return unaligned_candidate_bank.at(unaligned_candidates[i - mp_aln_candidates.size()]).second; + } + }; + + // TODO: this solution is kinda ugly + multipath_alignment_t tmp; + function consume_candidate = [&](int64_t i) -> multipath_alignment_t&& { + if (i < 0) { + // consume the anchor + return move(splice_anchor); + } + else if (i < mp_aln_candidates.size()) { + +#ifdef debug_multipath_mapper + cerr << "consuming alignment at index " << mp_aln_candidates[i] << endl; +#endif + + // pull the alignment out + tmp = move(multipath_alns_out[mp_aln_candidates[i]]); + + // replace it in the vectors and clear the final position + multipath_alns_out[mp_aln_candidates[i]] = move(multipath_alns_out.back()); + multiplicities[mp_aln_candidates[i]] = multiplicities.back(); + cluster_idxs[mp_aln_candidates[i]] = cluster_idxs.back(); + multipath_alns_out.pop_back(); + multiplicities.pop_back(); + cluster_idxs.pop_back(); + + // do the bookkeeping to track the original indexes + int64_t consuming_original_index = original_index[mp_aln_candidates[i]]; + int64_t moving_original_index = original_index[multipath_alns_out.size()]; + + current_index[moving_original_index] = mp_aln_candidates[i]; + current_index[consuming_original_index] = -1; + + original_index[mp_aln_candidates[i]] = moving_original_index; + original_index[multipath_alns_out.size()] = -1; + + // TODO: also do bookkeeping on the clusters to claim the hits + +#ifdef debug_multipath_mapper + cerr << "current indexes are now:" << endl; + for (size_t i = 0; i < current_index.size(); ++i) { + cerr << "\t" << i << " " << current_index[i] << endl; + } + cerr << "original indexes are now:" << endl; + for (size_t i = 0; i < original_index.size(); ++i) { + cerr << "\t" << i << " " << original_index[i] << endl; + } +#endif + + return move(tmp); + } + else { + // remove the candidate from the bank + auto candidate_id = unaligned_candidates[i - mp_aln_candidates.size()]; + tmp = move(unaligned_candidate_bank.at(candidate_id).first); + unaligned_candidate_bank.erase(candidate_id); + // and return it + return move(tmp); + } + }; + + double anchor_multiplicity = multiplicities[current_index[j]]; + bool did_splice = test_splice_candidates(alignment, do_left, splice_anchor, &anchor_multiplicity, + strand, mp_aln_candidates.size() + unaligned_candidates.size(), + get_candidate, get_multiplicity, consume_candidate); + + if (!did_splice && rescue_anchor && do_left != rescue_left) { + // we didn't find any splice junctions, but we might be able to rescue the spliced portion off + // of the other read + did_splice = find_rescuable_spliced_alignments(alignment, splice_anchor, anchor_multiplicity, + strand, *rescue_anchor, rescue_multiplicity, + rescue_left, interval); + } + +#ifdef debug_log_splice_align_stats + string line = alignment.name() + '\t' + to_string(did_splice) + '\t' + to_string(interval.first) + '\t' + to_string(interval.second) + '\t' + to_string(do_left) + '\t' + to_string(mp_aln_candidates.size()) + '\t' + to_string(cluster_candidates.size()) + '\t' + to_string(hit_candidates.size()) + '\n'; +#pragma omp critical + cerr << line; +#endif + + if (did_splice) { + // we may need to update the multiplicity based on the splicing + multiplicities[current_index[j]] = anchor_multiplicity; + } + + any_splices = any_splices || did_splice; + found_splice_for_anchor = found_splice_for_anchor || did_splice; + + // move the alignment back now that all of the 
shuffling is finished + multipath_alns_out[current_index[j]] = move(splice_anchor); + } + + if (!found_splice_for_anchor) { + // there's no more splicing to be found for this anchor alignment + ++j; + } + } + + if (any_splices) { + // we'll need to re-score and re-sort + sort_and_compute_mapping_quality(multipath_alns_out, + &cluster_idxs, &multiplicities); + } + + return any_splices; + } + + bool MultipathMapper::find_spliced_alignments(const Alignment& alignment1, const Alignment& alignment2, + vector>& multipath_aln_pairs_out, + vector, int64_t>>& cluster_pairs, + vector& pair_multiplicities, + const vector& mems1, const vector& mems2, + vector& cluster_graphs1, vector& cluster_graphs2, + const match_fanouts_t* fanouts) { + + if (multipath_aln_pairs_out.empty()) { + return false; + } + +#ifdef debug_check_adapters + bool attempt_1_left = false, attempt_1_right = false, attempt_2_left = false, attempt_2_right = false; + bool success_1_left = false, success_1_right = false, success_2_left = false, success_2_right = false; + bool adapter_1_left = false, adapter_1_right = false, adapter_2_left = false, adapter_2_right = false; +#endif + + // TODO: it would be better to use likelihoods rather than scores (esp. in paired) + + // choose the score cutoff based on the original unspliced mappings + int32_t min_score_to_attempt_1 = (optimal_alignment_score(multipath_aln_pairs_out.front().first) + - get_aligner()->mapping_quality_score_diff(max_mapping_quality)); + int32_t min_score_to_attempt_2 = (optimal_alignment_score(multipath_aln_pairs_out.front().second) + - get_aligner()->mapping_quality_score_diff(max_mapping_quality)); + + vector current_index(multipath_aln_pairs_out.size(), 0); + for (int64_t i = 1; i < multipath_aln_pairs_out.size(); ++i) { + current_index[i] = i; + } + vector original_index = current_index; + + // a shared bank of candidates that we can re-use across alignments + unordered_map> unaligned_candidate_bank; + + // we'll keep track of whether any spliced alignments succeeded + bool any_splices = false; + + // so far we haven't restricted to splice motifs on any particular strand + SpliceStrand strand = Undetermined; + + for (size_t j = 0; j < current_index.size(); ++j) { + if (current_index[j] < 0) { + // this alignment has been consumed as a splice candidate + continue; + } + + if (optimal_alignment_score(multipath_aln_pairs_out[current_index[j]].first) < min_score_to_attempt_1 + && optimal_alignment_score(multipath_aln_pairs_out[current_index[j]].second) < min_score_to_attempt_2) { + // the rest of the alignments are too low scoring to consider + break; + } + +#ifdef debug_multipath_mapper + cerr << "determining whether to make spliced alignment for pair at index " << current_index[j] << endl; +#endif + +#ifdef debug_check_adapters + bool attempt_1_left = false, attempt_1_right = false, attempt_2_left = false, attempt_2_right = false; + bool success_1_left = false, success_1_right = false, success_2_left = false, success_2_right = false; +#endif + + for (int read_num = 0; read_num < 2; ) { + + bool do_read_1 = (read_num == 0); + + // decide if this read looks like it could benefit from a spliced alignment + auto interval = aligned_interval(do_read_1 ? 
multipath_aln_pairs_out[current_index[j]].first + : multipath_aln_pairs_out[current_index[j]].second); + if (interval.first == interval.second) { + // this anchor is unmapped + read_num++; + continue; + } + + auto alnr = get_aligner(!alignment1.quality().empty() && !alignment2.quality().empty()); + + const Alignment& aln = do_read_1 ? alignment1 : alignment2; + + int64_t left_max_score = (alnr->score_exact_match(aln, 0, interval.first) + + (interval.first == 0 ? 0 : alnr->score_full_length_bonus(true, aln))); + int64_t right_max_score = (alnr->score_exact_match(aln, interval.second, + aln.sequence().size() - interval.second) + + (interval.second == aln.sequence().size() ? 0 : alnr->score_full_length_bonus(false, aln))); + bool search_left = left_max_score >= min_softclipped_score_for_splice; + bool search_right = right_max_score >= min_softclipped_score_for_splice; + +#ifdef debug_check_adapters + if (do_read_1) { + attempt_1_left |= search_left; + attempt_1_right |= search_right; + } + else { + attempt_2_left |= search_left; + attempt_2_right |= search_right; + } +#endif +#ifdef debug_multipath_mapper + cerr << "on read " << (do_read_1 ? 1 : 2) << " looking for spliced alignments, to left? " << search_left << ", to right? " << search_right << ", interval " << interval.first << ":" << interval.second << " with max tail scores " << left_max_score << " " << right_max_score << ", and required max score " << min_softclipped_score_for_splice << endl; +#endif + + bool found_splice_for_anchor = false; + + for (bool do_left : {true, false}) { + if ((do_left && !search_left) || (!do_left && !search_right)) { + continue; + } + // try to figure out the softclip that we're seeing looks like it could be due to readthrough + // of a adapter across the paired reads + // TODO: magic number + static const int64_t adapter_overlap_slosh = 2; + if (do_read_1 && !do_left && !read_1_adapter.empty()) { + size_t begin = max(0, interval.second - adapter_overlap_slosh); + size_t end = min(begin + read_1_adapter.size() + 2 * adapter_overlap_slosh, + aln.sequence().size()); + size_t pos = kmp_search(aln.sequence().c_str() + begin, end - begin, + read_1_adapter.c_str(), read_1_adapter.size(), + read_1_adapter_lps); + if (pos != string::npos) { + // this softclip is bracketed by a known adapter sequence, it is much more likely + // that it should be adapter trimmed rather than meriting a spliced alignment + continue; + } + } + if (!do_read_1 && do_left && !read_2_adapter.empty()) { + size_t end = min(interval.first + adapter_overlap_slosh, aln.sequence().size()); + size_t begin = max(0, end - read_2_adapter.size() - 2 * adapter_overlap_slosh); + size_t pos = kmp_search(aln.sequence().c_str() + begin, end - begin, + read_2_adapter.c_str(), read_2_adapter.size(), + read_2_adapter_lps); + if (pos != string::npos) { + // this softclip is bracketed by a known adapter sequence, it is much more likely + // that it should be adapter trimmed rather than meriting a spliced alignment + continue; + } + } + + // select the candidate sources for the corresponding read + // note: we move the whole anchor to protect it from any shuffling within the vector + // so we will have to move it back at the end of this iteration + multipath_alignment_t anchor_mp_aln; + const vector* mems; + vector* cluster_graphs; + if (do_read_1) { + anchor_mp_aln = move(multipath_aln_pairs_out[current_index[j]].first); + mems = &mems1; + cluster_graphs = &cluster_graphs1; + } + else { + anchor_mp_aln = move(multipath_aln_pairs_out[current_index[j]].second); 
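For reference, the adapter checks above call kmp_search() over a small window around the softclip boundary (interval.second or interval.first padded by adapter_overlap_slosh), using a precomputed longest-proper-prefix-suffix table for the adapter (read_1_adapter_lps / read_2_adapter_lps). The implementation of kmp_search is not part of this hunk; the sketch below shows the standard Knuth-Morris-Pratt routine it presumably corresponds to. The names build_lps and kmp_find are hypothetical and used only for this illustration.

```
#include <cstddef>
#include <string>
#include <vector>

// Longest-proper-prefix-that-is-also-a-suffix table for the adapter (the "lps" array).
std::vector<size_t> build_lps(const std::string& pattern) {
    std::vector<size_t> lps(pattern.size(), 0);
    size_t len = 0;
    for (size_t i = 1; i < pattern.size(); ) {
        if (pattern[i] == pattern[len]) {
            lps[i++] = ++len;
        }
        else if (len != 0) {
            len = lps[len - 1];   // fall back in the pattern without advancing i
        }
        else {
            lps[i++] = 0;
        }
    }
    return lps;
}

// Offset of the first occurrence of pattern in text[0, text_len), or string::npos if absent.
size_t kmp_find(const char* text, size_t text_len,
                const std::string& pattern, const std::vector<size_t>& lps) {
    size_t i = 0, j = 0;
    while (i < text_len && !pattern.empty()) {
        if (text[i] == pattern[j]) {
            ++i;
            ++j;
            if (j == pattern.size()) {
                return i - j;     // full match found
            }
        }
        else if (j != 0) {
            j = lps[j - 1];       // reuse the part of the pattern already matched
        }
        else {
            ++i;
        }
    }
    return std::string::npos;
}
```

If the adapter occurs anywhere in that padded window, the softclip is attributed to adapter readthrough and the spliced-alignment search for that side is skipped.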
+ mems = &mems2; + cluster_graphs = &cluster_graphs2; + } + + // identify the splice candidate + vector mp_aln_candidates; + unordered_set clusters_used; + vector cluster_candidates; + vector> hit_candidates; + identify_aligned_splice_candidates(aln, do_read_1, do_left, interval, multipath_aln_pairs_out, cluster_pairs, + current_index, j, clusters_used, mp_aln_candidates); + identify_unaligned_splice_candidates(aln, do_left, interval, *mems, + *cluster_graphs, clusters_used, cluster_candidates, + hit_candidates); + + // align splice candidates that haven't been aligned yet + vector unaligned_candidates; + align_to_splice_candidates(aln, *cluster_graphs, *mems, cluster_candidates, hit_candidates, + interval, do_left, do_read_1, unaligned_candidate_bank, unaligned_candidates, + fanouts); + + + function get_candidate = [&](int64_t i) -> const multipath_alignment_t& { + if (i < mp_aln_candidates.size()) { + if (do_read_1) { + return multipath_aln_pairs_out[mp_aln_candidates[i]].first; + } + else { + return multipath_aln_pairs_out[mp_aln_candidates[i]].second; + } + } + else { + return unaligned_candidate_bank.at(unaligned_candidates[i - mp_aln_candidates.size()]).first; + } + }; + + function get_multiplicity = [&](int64_t i) -> double { + if (i < 0) { + return pair_multiplicities[current_index[j]]; + } + else if (i < mp_aln_candidates.size()) { + return pair_multiplicities[i]; + } + else { + return unaligned_candidate_bank.at(unaligned_candidates[i - mp_aln_candidates.size()]).second; + } + }; + + // TODO: this solution is kinda ugly + multipath_alignment_t tmp; + function consume_candidate = [&](int64_t i) -> multipath_alignment_t&& { + if (i < 0) { +#ifdef debug_multipath_mapper + cerr << "consuming anchor for read " << (do_read_1 ? 1 : 2) << " in pair at current index " << current_index[j] << " and original index " << original_index[current_index[j]] << endl; +#endif + // consume the anchor + return move(anchor_mp_aln); + } + else if (i < mp_aln_candidates.size()) { +#ifdef debug_multipath_mapper + cerr << "consuming read " << (do_read_1 ? 1 : 2) << " of pair " << mp_aln_candidates[i] << endl; +#endif + + // look to see if the opposite side of this pair exists in multiple pairs + size_t opposite_cluster = do_read_1 ? cluster_pairs[mp_aln_candidates[i]].first.second + : cluster_pairs[mp_aln_candidates[i]].first.first; + bool opposite_duplicated = false; + if (opposite_cluster != RESCUED) { + // we don't want to count rescued alignments as the same cluster as each other + for (size_t j = 0; j < cluster_pairs.size() && !opposite_duplicated; ++j) { + opposite_duplicated = (j != mp_aln_candidates[i] && + ((do_read_1 && cluster_pairs[j].first.second == opposite_cluster) + || (!do_read_1 && cluster_pairs[j].first.first == opposite_cluster))); + } + } + + if (opposite_duplicated) { + // the other side will continue to live in its other pair, so we can scavenge + // this multipath alignment +#ifdef debug_multipath_mapper + cerr << "the opposite side is duplicated, removing pair" << endl; + cerr << "pair is index " << i << " among candidates, which is current index " << mp_aln_candidates[i] << " and original index " << original_index[mp_aln_candidates[i]] << endl; + cerr << "will swap with current index " << multipath_aln_pairs_out.size() - 1 << ", which has original index " << original_index[multipath_aln_pairs_out.size() - 1] << endl; +#endif + + tmp = do_read_1 ? 
move(multipath_aln_pairs_out[mp_aln_candidates[i]].first) + : move(multipath_aln_pairs_out[mp_aln_candidates[i]].second); + + // replace it in the vectors and clear the final position + multipath_aln_pairs_out[mp_aln_candidates[i]] = move(multipath_aln_pairs_out.back()); + pair_multiplicities[mp_aln_candidates[i]] = pair_multiplicities.back(); + cluster_pairs[mp_aln_candidates[i]] = cluster_pairs.back(); + multipath_aln_pairs_out.pop_back(); + pair_multiplicities.pop_back(); + cluster_pairs.pop_back(); + + // do the bookkeeping to track the original indexes + int64_t consuming_original_index = original_index[mp_aln_candidates[i]]; + int64_t moving_original_index = original_index[multipath_aln_pairs_out.size()]; + + current_index[moving_original_index] = mp_aln_candidates[i]; + current_index[consuming_original_index] = -1; + + original_index[mp_aln_candidates[i]] = moving_original_index; + original_index[multipath_aln_pairs_out.size()] = -1; + // TODO: also do bookkeeping on the clusters to claim the hits + } + else { + // we don't want to mess up pairs, so just copy it out + tmp = do_read_1 ? multipath_aln_pairs_out[mp_aln_candidates[i]].first + : multipath_aln_pairs_out[mp_aln_candidates[i]].second; + } + +#ifdef debug_multipath_mapper + cerr << "pair current indexes are now:" << endl; + for (size_t i = 0; i < current_index.size(); ++i) { + cerr << "\t" << i << " " << current_index[i] << endl; + } + cerr << "pair original indexes are now:" << endl; + for (size_t i = 0; i < original_index.size(); ++i) { + cerr << "\t" << i << " " << original_index[i] << endl; + } + cerr << "pair order:" << endl; + for (auto& mp_aln_pair : multipath_aln_pairs_out) { + pos_t p1, p2; + Alignment aln1, aln2; + optimal_alignment(mp_aln_pair.first, aln1); + optimal_alignment(mp_aln_pair.second, aln2); + if (aln1.path().mapping_size() != 0) { + p1 = make_pos_t(aln1.path().mapping(0).position()); + } + if (aln2.path().mapping_size() != 0) { + p2 = make_pos_t(aln2.path().mapping(0).position()); + } + cerr << "\t" << p1 << " " << p2 << endl; + } +#endif + + return move(tmp); + } + else { + // remove the candidate from the bank + auto candidate_id = unaligned_candidates[i - mp_aln_candidates.size()]; + tmp = move(unaligned_candidate_bank.at(candidate_id).first); + unaligned_candidate_bank.erase(candidate_id); + // and return it + return move(tmp); + } + }; + + // see if we can actually make spliced alignments + double anchor_multiplicity = pair_multiplicities[current_index[j]]; + bool spliced_side = test_splice_candidates(aln, do_left, anchor_mp_aln, &anchor_multiplicity, + strand, mp_aln_candidates.size() + unaligned_candidates.size(), + get_candidate, get_multiplicity, consume_candidate); + + if (!spliced_side && do_read_1 != do_left) { + // we might be able to rescue a spliced alignment segment + const auto& rescue_anchor = do_read_1 ? 
multipath_aln_pairs_out[current_index[j]].second + : multipath_aln_pairs_out[current_index[j]].first; + spliced_side = find_rescuable_spliced_alignments(aln, anchor_mp_aln, anchor_multiplicity, + strand, rescue_anchor, anchor_multiplicity, + do_read_1, interval); + } + + if (spliced_side) { + // we may need to update the pair's multiplicity + pair_multiplicities[current_index[j]] = anchor_multiplicity; + } + +#ifdef debug_check_adapters + if (do_read_1) { + if (do_left) { + success_1_left |= spliced_side; + } + else { + success_1_right |= spliced_side; + } + } + else { + if (do_left) { + success_2_left |= spliced_side; + } + else { + success_2_right |= spliced_side; + } + } +#endif + + any_splices = any_splices || spliced_side; + found_splice_for_anchor = found_splice_for_anchor || spliced_side; + + // move the anchor back now that we've done all of the shuffling we were going to do + if (do_read_1) { + multipath_aln_pairs_out[current_index[j]].first = move(anchor_mp_aln); + } + else { + multipath_aln_pairs_out[current_index[j]].second = move(anchor_mp_aln); + } + } + + if (!found_splice_for_anchor) { + // there are no more splices to be found for this read end + ++read_num; + } + } + +#ifdef debug_check_adapters + vector adapter_aln_scores; + vector adapters{"AGATCGGAAGAG"}; // the illumina universal adapter + adapters.push_back(reverse_complement(adapters.back())); + for (auto mp_aln : {multipath_aln_pairs_out[current_index[j]].first, multipath_aln_pairs_out[current_index[j]].second}) { + for (auto adapter : adapters) { + adapter_aln_scores.push_back(SSWAligner().align(mp_aln.sequence(), adapter).score()); + } + } + string line = (alignment1.name() + + "\t" + to_string(attempt_1_left) + "\t" + to_string(attempt_1_right) + + "\t" + to_string(attempt_2_left) + "\t" + to_string(attempt_2_right) + + "\t" + to_string(success_1_left) + "\t" + to_string(success_1_right) + + "\t" + to_string(success_2_left) + "\t" + to_string(success_2_right)); + for (auto s : adapter_aln_scores) { + line += (string("\t") + to_string(s)); + } + line += "\n"; + cerr << line; +#endif + } + + if (any_splices) { + sort_and_compute_mapping_quality(multipath_aln_pairs_out, cluster_pairs, nullptr, &pair_multiplicities); + } + + return any_splices; + } + + bool MultipathMapper::find_rescuable_spliced_alignments(const Alignment& alignment, + multipath_alignment_t& splice_anchor, + double& anchor_multiplicity, + SpliceStrand& strand, + const multipath_alignment_t& rescue_anchor, + double rescue_multiplicity, + bool rescue_left, + const pair& primary_interval) const { + + auto rescue_interval = aligned_interval(rescue_anchor); + + // TODO: it's possible that the original estimated mapping qualities aren't reflective + // of the score ordering anymore now that we may have added spliced alignments, but this + // should be a rare problem and we want to be sparing with rescues + if (rescue_anchor.mapping_quality() >= min(30, max_mapping_quality) + && ((rescue_left && rescue_interval.first == 0) || + (!rescue_left && rescue_interval.second == rescue_anchor.sequence().size()))) { + + multipath_alignment_t rescued; + bool succeeded = attempt_rescue_for_splice_segment(alignment, primary_interval, rescue_anchor, + rescue_left, rescued); + + if (succeeded) { + // set up simple functions to provide the rescued alignment as a candidate + function get_rescued_candidate = [&](int64_t i) -> const multipath_alignment_t& { + if (i < 0) { + // TODO: is this branch actually necessary? 
+ return splice_anchor; + } + else { + return rescued; + } + }; + function get_rescued_multiplicity = [&](int64_t i) { + if (i < 0) { + return anchor_multiplicity; + } + else { + return rescue_multiplicity; + } + }; + function consume_rescued = [&](int64_t i) -> multipath_alignment_t&& { + if (i < 0) { + // consume the anchor + return move(splice_anchor); + } + else { + return move(rescued); + } + }; +#ifdef debug_multipath_mapper + cerr << "testing rescued spliced alignment segment as a candidate" << endl; +#endif + + // do a test again with only the rescued candidate + return test_splice_candidates(alignment, !rescue_left, splice_anchor, &anchor_multiplicity, strand, 1, + get_rescued_candidate, get_rescued_multiplicity, consume_rescued); + + } + } + return false; + } + + bool MultipathMapper::retry_pairing_spliced_alignments(const Alignment& alignment1, const Alignment& alignment2, + vector& multipath_alns_1, + vector& multipath_alns_2, + const vector& cluster_idxs_1, + const vector& cluster_idxs_2, + const vector& multiplicities_1, + const vector& multiplicities_2, + vector>& multipath_aln_pairs_out, + vector, int64_t>>& cluster_pairs_out, + vector& pair_multiplicities_out) const { + +#ifdef debug_multipath_mapper + cerr << "trying to re-pair spliced alignments" << endl; +#endif + + MemoizingGraph memoizing_graph(xindex); + unique_ptr distance_measurer = get_distance_measurer(memoizing_graph); + + // we want to restrict our attention to alignments that have been spliced + vector is_spliced_1(multipath_alns_1.size()), is_spliced_2(multipath_alns_2.size()); + for (size_t i = 0; i < multipath_alns_1.size(); ++i) { + is_spliced_1[i] = contains_connection(multipath_alns_1[i]); + } + for (size_t i = 0; i < multipath_alns_2.size(); ++i) { + is_spliced_2[i] = contains_connection(multipath_alns_2[i]); + } + + // TODO: this could all be made more efficient by not doing these computations + // multiple times + bool found_consistent = false; + auto attempt_to_pair = [&](size_t i, size_t j) { + + if (multipath_alns_1[i].subpath_size() == 0 || multipath_alns_2[j].subpath_size() == 0) { + return; + } + + Alignment opt_1, opt_2; + optimal_alignment(multipath_alns_1[i], opt_1); + optimal_alignment(multipath_alns_2[j], opt_2); + if (opt_1.path().mapping_size() == 0 || opt_2.path().mapping_size() == 0) { + return; + } + + pos_t inner_pos_1 = final_position(opt_1.path()); + int64_t aligned_length_1 = path_from_length(opt_1.path()); + + pos_t inner_pos_2 = initial_position(opt_2.path()); + int64_t aligned_length_2 = path_from_length(opt_2.path()); + +#ifdef debug_multipath_mapper + cerr << "trying to re-pair alns " << i << " and " << j << " with inner positions " << inner_pos_1 << " and " << inner_pos_2 << ", and aligned lengths " << aligned_length_1 << " and " << aligned_length_2 << endl; +#endif + if (aligned_length_1 == 0 || aligned_length_2 == 0) { + return; + } + + int64_t dist = distance_measurer->oriented_distance(inner_pos_1, inner_pos_2); + if (dist != numeric_limits::max()) { + int64_t total_dist = dist + aligned_length_1 + aligned_length_2; +#ifdef debug_multipath_mapper + cerr << "re-estimated disatnce: " << total_dist << endl; +#endif + if (is_consistent(total_dist)) { + // note: we're kind of abusing cluster pairs here by temporarily making it + // point to alignments instead of clusters + cluster_pairs_out.emplace_back(make_pair(i, j), total_dist); + found_consistent = true; +#ifdef debug_multipath_mapper + cerr << "re-pair succeeded" << endl; +#endif + } + } + }; + + for (size_t i = 0; i < 
is_spliced_1.size(); ++i) { + if (is_spliced_1[i]) { + for (size_t j = 0; j < multipath_alns_2.size(); ++j) { + attempt_to_pair(i, j); + } + } + } + for (size_t j = 0; j < multipath_alns_2.size(); ++j) { + if (is_spliced_2[j]) { + for (size_t i = 0; i < multipath_alns_1.size(); ++i) { + if (!is_spliced_1[i]) { + // we haven't tried this pairing in the opposite direction + attempt_to_pair(i, j); + } + } + } + } + + if (found_consistent) { + // count up how many rescued pairs use each original alignment so we can + // know when it's safe to consume one + unordered_map left_count, right_count; + for (const auto& cluster_pair : cluster_pairs_out) { + ++left_count[cluster_pair.first.first]; + ++right_count[cluster_pair.first.second]; + } + + for (auto& cluster_pair : cluster_pairs_out) { + // either copy or move the individual end mappings + multipath_aln_pairs_out.emplace_back(); + if (--left_count[cluster_pair.first.first] == 0) { + multipath_aln_pairs_out.back().first = move(multipath_alns_1[cluster_pair.first.first]); + } + else { + multipath_aln_pairs_out.back().first = multipath_alns_1[cluster_pair.first.first]; + } + if (--right_count[cluster_pair.first.second] == 0) { + multipath_aln_pairs_out.back().second = move(multipath_alns_2[cluster_pair.first.second]); + } + else { + multipath_aln_pairs_out.back().second = multipath_alns_2[cluster_pair.first.second]; + } + + // pair is at least as unique as either of its two ends + pair_multiplicities_out.emplace_back(min(multiplicities_1[cluster_pair.first.first], + multiplicities_2[cluster_pair.first.second])); + + // fix cluster pair to point at clusters instead of mappings + cluster_pair.first.first = cluster_idxs_1[cluster_pair.first.first]; + cluster_pair.first.second = cluster_idxs_2[cluster_pair.first.second]; + } + } + return found_consistent; + } + + void MultipathMapper::agglomerate(size_t idx, multipath_alignment_t& agglomerating, const multipath_alignment_t& multipath_aln, + vector& agglomerated_group, unordered_set& agg_start_positions, + unordered_set& agg_end_positions) const { + // does it look like we've already agglomerated a mapping at this position + vector start_positions, end_positions; + for (auto j : multipath_aln.start()) { + start_positions.emplace_back(initial_position(multipath_aln.subpath(j).path())); + } + for (size_t j = 0; j < multipath_aln.subpath_size(); ++j) { + if (multipath_aln.subpath(j).next_size() == 0) { + end_positions.emplace_back(final_position(multipath_aln.subpath(j).path())); + } + } + + bool do_agglomerate = true; + for (size_t j = 0; j < start_positions.size() && do_agglomerate; ++j) { + do_agglomerate = !agg_start_positions.count(start_positions[j]); + } + for (size_t j = 0; j < end_positions.size() && do_agglomerate; ++j) { + do_agglomerate = !agg_end_positions.count(end_positions[j]); + } + + if (do_agglomerate) { + // we want to merge this one in with the primary mapping + agglomerated_group.push_back(idx); + // skip the merge step if it's the first mapping + if (idx > 0) { + append_multipath_alignment(agglomerating, multipath_aln); + } + // record that the aggregated mapping now has these start and end positions + agg_start_positions.insert(start_positions.begin(), start_positions.end()); + agg_end_positions.insert(end_positions.begin(), end_positions.end()); + } + } + + void MultipathMapper::agglomerate_alignments(vector& multipath_alns_out, + vector* multiplicities) { + + if (multipath_alns_out.empty()) { + return; + } + + // the likelihoods of each alignment, which we assume to be sorted + 
vector scores = mapping_likelihoods(multipath_alns_out); + auto alnr = get_aligner(!multipath_alns_out.front().quality().empty()); + double min_score = scores.front() - alnr->mapping_quality_score_diff(max_mapping_quality); + + size_t i; + vector agglomerated_group; + unordered_set agg_start_positions, agg_end_positions; + for (i = 0; i < multipath_alns_out.size(); ++i) { + // is the score good enough to agglomerate? + if (scores[i] < min_score || likely_mismapping(multipath_alns_out[i])) { + // none of the following will be either + break; + } + + // apply the agglomeration procedure + agglomerate(i, multipath_alns_out.front(), + multipath_alns_out[i], agglomerated_group, + agg_start_positions, agg_end_positions); + } + + if (i > 1) { + + // figure out the mapping quality for the whole aggregated alignment + double raw_mapq = alnr->compute_group_mapping_quality(scores, agglomerated_group, + multiplicities); + int32_t mapq = min(max_mapping_quality, int32_t(mapq_scaling_factor * raw_mapq)); + multipath_alns_out.front().set_mapping_quality(mapq); + + multipath_alns_out.front().set_annotation("disconnected", true); + + // move the remaining alignments up in the return vector and resize the remnants away + for (size_t j = i, k = 1; j < multipath_alns_out.size(); ++j, ++k) { + multipath_alns_out[k] = move(multipath_alns_out[j]); + } + multipath_alns_out.resize(multipath_alns_out.size() - i + 1); + } + } + + void MultipathMapper::agglomerate_alignment_pairs(vector>& multipath_aln_pairs_out, + vector, int64_t>>& cluster_pairs, + vector& multiplicities) { + + if (multipath_aln_pairs_out.empty()) { + return; + } + + // the likelihoods of each alignment, which we assume to be sorted + vector scores = pair_mapping_likelihoods(multipath_aln_pairs_out, cluster_pairs); + auto alnr = get_aligner(!multipath_aln_pairs_out.front().first.quality().empty() + && !multipath_aln_pairs_out.front().second.quality().empty()); + double min_score = scores.front() - alnr->mapping_quality_score_diff(max_mapping_quality); + + size_t i; + vector agglomerated_group_1, agglomerated_group_2; + unordered_set agg_start_positions_1, agg_end_positions_1, agg_start_positions_2, agg_end_positions_2; + for (i = 0; i < multipath_aln_pairs_out.size(); ++i) { + // is the score good enough to agglomerate? 
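compute_group_mapping_quality() is not shown in this diff, so the following is only a rough, self-contained sketch of the general idea (an assumption about its behavior, not vg's actual implementation): convert the log-scale likelihood scores to probabilities using the aligner's log base, weight each alignment by its multiplicity, and Phred-scale the probability that the correct mapping lies outside the agglomerated group. The scaling by mapq_scaling_factor and the cap at max_mapping_quality happen in the caller, as in the code above.

```
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <limits>
#include <vector>

// Hedged sketch: Phred-scaled probability that none of the alignments in 'group' is correct,
// treating 'scores' as likelihoods in log base 'log_base' and weighting by 'multiplicities'.
double group_mapping_quality_sketch(const std::vector<double>& scores,
                                    const std::vector<size_t>& group,
                                    const std::vector<double>& multiplicities,
                                    double log_base) {
    std::vector<bool> in_group(scores.size(), false);
    for (size_t i : group) {
        in_group[i] = true;
    }
    // stabilize the exponentiation against the best score
    double max_score = *std::max_element(scores.begin(), scores.end());
    double total_mass = 0.0, group_mass = 0.0;
    for (size_t i = 0; i < scores.size(); ++i) {
        double mass = multiplicities[i] * std::exp(log_base * (scores[i] - max_score));
        total_mass += mass;
        if (in_group[i]) {
            group_mass += mass;
        }
    }
    double prob_wrong = 1.0 - group_mass / total_mass;
    if (prob_wrong <= 0.0) {
        // the group accounts for all of the probability mass; effectively unbounded,
        // so the caller's cap at max_mapping_quality takes over
        return std::numeric_limits<double>::infinity();
    }
    return -10.0 * std::log10(prob_wrong);
}
```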
+ if (scores[i] < min_score || + (likely_mismapping(multipath_aln_pairs_out[i].first) && likely_mismapping(multipath_aln_pairs_out[i].second))) { + // none of the following will be either + break; + } + + auto& multipath_aln_pair = multipath_aln_pairs_out[i]; + + // repeat agglomeration procedure for both ends of the pair + agglomerate(i, multipath_aln_pairs_out.front().first, + multipath_aln_pair.first, agglomerated_group_1, + agg_start_positions_1, agg_end_positions_1); + agglomerate(i, multipath_aln_pairs_out.front().second, + multipath_aln_pair.second, agglomerated_group_2, + agg_start_positions_2, agg_end_positions_2); + } + + if (i > 1) { + + // figure out the mapping quality for the whole aggregated alignment + double raw_mapq_1 = alnr->compute_group_mapping_quality(scores, agglomerated_group_1, + &multiplicities); + double raw_mapq_2 = alnr->compute_group_mapping_quality(scores, agglomerated_group_2, + &multiplicities); + int32_t mapq_1 = min(max_mapping_quality, int32_t(mapq_scaling_factor * raw_mapq_1)); + int32_t mapq_2 = min(max_mapping_quality, int32_t(mapq_scaling_factor * raw_mapq_2)); + multipath_aln_pairs_out.front().first.set_mapping_quality(mapq_1); + multipath_aln_pairs_out.front().second.set_mapping_quality(mapq_2); + multipath_aln_pairs_out.front().first.set_annotation("disconnected", true); + multipath_aln_pairs_out.front().second.set_annotation("disconnected", true); + + // move the remaining alignments up in the return vector and resize the remnants away + for (size_t j = i, k = 1; j < multipath_aln_pairs_out.size(); ++j, ++k) { + multipath_aln_pairs_out[k] = move(multipath_aln_pairs_out[j]); + } + multipath_aln_pairs_out.resize(multipath_aln_pairs_out.size() - i + 1); + } + } + + void MultipathMapper::purge_unmapped_alignments(vector& multipath_alns_out) { + for (size_t i = 0; i < multipath_alns_out.size(); ++i) { + // TODO: could do this more efficiently by bisect search + if (likely_mismapping(multipath_alns_out[i])) { + // we can't distinguish this alignment from the longest MEM of a random sequence + // so we don't report this mapping + if (i == 0) { +#ifdef debug_multipath_mapper + cerr << "mapping is not distinguishable from a random sequence, reporting as unmapped" << endl; +#endif + // leave an unmapped placeholder + multipath_alns_out.resize(1); + clear_alignment(multipath_alns_out.front()); + } + else { + // truncate the output from this point on + multipath_alns_out.resize(i); + } + break; + } + } + } + + void MultipathMapper::purge_unmapped_alignments(vector>& multipath_aln_pairs_out, bool proper_paired) { + + // decide if the read is unmapped + if (proper_paired) { + for (size_t i = 0; i < multipath_aln_pairs_out.size(); ++i) { + // if they're part of a proper pair, we count even pretty bad alignments + // as mapped + if (likely_mismapping(multipath_aln_pairs_out[i].first) && + likely_mismapping(multipath_aln_pairs_out[i].second)) { + // this pair is actually unmapped + // TODO: maybe we should accept it though, in a large genome they are very + // unlikely to be paired by chance... 
+ if (i == 0) { + // the read is completely unmapped, get rid of multimappings + multipath_aln_pairs_out.resize(1); + clear_alignment(multipath_aln_pairs_out.front().first); + clear_alignment(multipath_aln_pairs_out.front().second); + } + else { + // truncate the list here + multipath_aln_pairs_out.resize(i); + } + break; + } + } + } + else { + // we have to do some complicated logic to look for unmapped reads in both primary + // and secondary because of vg's silly interleaving logic... + + // find where unmapped reads begin in each list + size_t i, j; + for (i = 0; i < multipath_aln_pairs_out.size(); ++i) { + if (multipath_aln_pairs_out[i].first.subpath().empty() + || likely_mismapping(multipath_aln_pairs_out[i].first)) { + break; + } + } + for (j = 0; j < multipath_aln_pairs_out.size(); ++j) { + if (multipath_aln_pairs_out[j].second.subpath().empty() + || likely_mismapping(multipath_aln_pairs_out[j].second)) { + break; + } + } + + if (i != multipath_aln_pairs_out.size() || j != multipath_aln_pairs_out.size()) { + // we need to clear out unmapped reads in at least one of the read lists + + // truncate + multipath_aln_pairs_out.resize(max(1, max(i, j))); + // add placeholders for unmapped reads if necessary + for (; i < multipath_aln_pairs_out.size(); ++i) { + clear_alignment(multipath_aln_pairs_out[i].first); + } + for (; j < multipath_aln_pairs_out.size(); ++j) { + clear_alignment(multipath_aln_pairs_out[j].second); + } + } + } + } + + void MultipathMapper::simplify_complicated_multipath_alignment(multipath_alignment_t& multipath_aln) const { + + if (multipath_aln.subpath_size() > prune_subpaths_multiplier * multipath_aln.sequence().size()) { + // this is a very complicated multipath alignment relative to the length of the sequence, + // so we'll see if we can maybe get rid of parts of it for having low score + + auto aligner = get_aligner(!multipath_aln.quality().empty()); + int32_t max_diff = ceil(aligner->mapping_quality_score_diff(max_mapping_quality) / mapq_scaling_factor); + +#ifdef debug_multipath_mapper + cerr << "multipath alignment has " << multipath_aln.subpath_size() << " subpaths, which is large relative to sequence length of " << multipath_aln.sequence().size() << ", attempting to simplify by pruning low-scoring sections, diff = " << max_diff << endl; +#endif + + remove_low_scoring_sections(multipath_aln, max_diff); + + // TODO: it would be nice to merge non branching subpaths here, but we don't know what + // the prohibited merges were... 
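Stepping back to the non-proper-paired branch of purge_unmapped_alignments above: the interleaved bookkeeping there (separate cutoffs i and j for the two read ends, a shared truncation point, and placeholders for whichever end runs out first) is compact but easy to misread. Below is a minimal toy sketch of the same truncate-and-placeholder step, with std::string standing in for a multipath alignment and an empty string playing the role of the unmapped placeholder; the names are hypothetical and only restate the logic shown above.

```
#include <algorithm>
#include <cstddef>
#include <string>
#include <utility>
#include <vector>

// first_bad_1 / first_bad_2: first index at which each read end is unmapped or likely mismapped
void purge_unmapped_sketch(std::vector<std::pair<std::string, std::string>>& pairs,
                           size_t first_bad_1, size_t first_bad_2) {
    if (first_bad_1 == pairs.size() && first_bad_2 == pairs.size()) {
        return;  // both ends stay mapped all the way down the list, nothing to purge
    }
    // keep everything before the later of the two cutoffs, but always keep at least one pair
    pairs.resize(std::max<size_t>(1, std::max(first_bad_1, first_bad_2)));
    // clear whichever ends fall at or past their own cutoff
    for (size_t i = first_bad_1; i < pairs.size(); ++i) {
        pairs[i].first.clear();
    }
    for (size_t j = first_bad_2; j < pairs.size(); ++j) {
        pairs[j].second.clear();
    }
}
// For example, with 6 pairs where read 1 goes bad at index 2 and read 2 at index 4,
// the list is truncated to 4 pairs and the read-1 slots at indexes 2 and 3 become placeholders.
```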
+#ifdef debug_multipath_mapper + cerr << "after pruning, alignment has " << multipath_aln.subpath_size() << " subpaths" << endl; +#endif + } + } + + void MultipathMapper::split_multicomponent_alignments(vector& multipath_alns_out, + const Alignment* alignment, + vector* cluster_graphs, + vector* cluster_idxs, + vector* multiplicities) const { + + size_t num_original_alns = multipath_alns_out.size(); + vector split_idxs; + for (size_t i = 0; i < num_original_alns; i++) { + + vector> comps = connected_components(multipath_alns_out[i]); + + if (comps.size() > 1) { +#ifdef debug_multipath_mapper + cerr << "splitting multicomponent alignment " << debug_string(multipath_alns_out[i]) << endl; +#endif + // split this multipath alignment into its connected components + for (size_t j = 1; j < comps.size(); j++) { + split_idxs.push_back(multipath_alns_out.size()); + multipath_alns_out.emplace_back(); + extract_sub_multipath_alignment(multipath_alns_out[i], comps[j], + multipath_alns_out.back()); + // also label the split alignment with its cluster of origin, if we're keeping track of that + if (cluster_idxs) { + cluster_idxs->emplace_back(cluster_idxs->at(i)); + } + if (multiplicities) { + multiplicities->emplace_back(multiplicities->at(i)); + } + } + // put the first component into the original location + multipath_alignment_t last_component; + extract_sub_multipath_alignment(multipath_alns_out[i], comps[0], last_component); + multipath_alns_out[i] = move(last_component); + split_idxs.push_back(i); + } + } + + if (alignment && cluster_graphs && cluster_idxs && do_spliced_alignment && !split_idxs.empty()) { + // we only do this in spliced alignment because we want to clustering to + // unclaim certain hits so they can be seen as spliced alignment candidates + + vector split_mp_alns(split_idxs.size()); + vector cluster_assignments(split_idxs.size()); + for (size_t i = 0; i < split_idxs.size(); ++i) { + auto& mp_aln = multipath_alns_out[split_idxs[i]]; + // TODO: we need to have these be ordered to find the MEMs, but this will be wastefully repeated later + topologically_order_subpaths(mp_aln); + split_mp_alns[i] = &mp_aln; + cluster_assignments[i] = &(*cluster_idxs)[split_idxs[i]]; + } + + vector all_cluster_assignments(cluster_idxs->size()); + for (size_t i = 0; i < all_cluster_assignments.size(); ++i) { + all_cluster_assignments[i] = &(*cluster_idxs)[i]; + } + + reassign_split_clusters(*alignment, *cluster_graphs, split_mp_alns, cluster_assignments, + all_cluster_assignments); + } + } + + void MultipathMapper::reassign_split_clusters(const Alignment& alignment, + vector& cluster_graphs, + const vector& split_mp_alns, + const vector& cluster_assignments, + const vector& all_cluster_assignments) const { +#ifdef debug_multipath_mapper + cerr << "reassigning split clusters for mp alns" << endl; + for (auto mp_aln : split_mp_alns) { + cerr << debug_string(*mp_aln) << endl; + } +#endif + + // it's often possible to divvy up the hits into smaller clusters now + + // reorganize the split alignments by their original cluster + unordered_map> original_cluster; + for (size_t i = 0; i < cluster_assignments.size(); ++i) { + original_cluster[*cluster_assignments[i]].push_back(i); +#ifdef debug_multipath_mapper + cerr << "mp aln " << i << " associated with cluster " << *cluster_assignments[i] << endl; +#endif + } + + bool any_new_clusters = false; + for (const auto& record : original_cluster) { + +#ifdef debug_multipath_mapper + cerr << "reassigning clusters for mp alns originally from cluster " << record.first << 
":" << endl; + for (auto i : record.second) { + cerr << "\t" << i << endl; + } +#endif + + bool all_hits_found = true; + vector> hits_found_in_aln(record.second.size()); + + // note: we use the ugly "get" function every time because we're also going to be modifying + // the cluster graphs vector and we don't want there to be trouble with a local reference + + for (size_t i = 0; i < get<1>(cluster_graphs[record.first]).first.size() && all_hits_found; ++i) { + all_hits_found = false; + auto& hit = get<1>(cluster_graphs[record.first]).first[i]; + for (size_t j = 0; j < record.second.size(); ++j) { +#ifdef debug_multipath_mapper + cerr << "checking for " << i << "-th hit in this cluster the in " << j << "-th mp aln" << endl; +#endif + + // check if the i-th MEM is contained in the j-th split up component + if (contains_match(*split_mp_alns[record.second[j]], hit.second, + hit.first->begin - alignment.sequence().begin(), hit.first->length())) { + +#ifdef debug_multipath_mapper + cerr << "hit found" << endl; +#endif + + all_hits_found = true; + hits_found_in_aln[j].push_back(i); + } + } + } + + if (!all_hits_found) { + // our partition is incomplete, which makes the process of divvying up the + // hits too complicated (some will get lost), so just skip it + // TODO: is it really a problem if we lose some hits? +#ifdef debug_multipath_mapper + cerr << "not all hits are still found, skipping reassignment" << endl; +#endif + continue; + } + + map, size_t> new_cluster; + for (size_t j = 0; j < record.second.size(); ++j) { +#ifdef debug_multipath_mapper + cerr << "reassigning cluster of " << j << "-th mp aln, " << record.second[j] << endl; +#endif + if (hits_found_in_aln[j].size() == get<1>(cluster_graphs[record.first]).first.size()) { + // this alignment still contains the whole cluster, so we don't need + // to point it at anything else +#ifdef debug_multipath_mapper + cerr << "still in original cluster " << record.first << endl; +#endif + continue; + } + + auto it = new_cluster.find(hits_found_in_aln[j]); + if (it == new_cluster.end()) { +#ifdef debug_multipath_mapper + cerr << "making a new cluster at index " << cluster_graphs.size() << " with hits:" << endl; + for (auto hit_idx : hits_found_in_aln[j]) { + cerr << "\t" << hit_idx << endl; + } +#endif + + // this is the first time we're encountering this combination of hits, so we need to make + // a corresponding cluster + it = new_cluster.insert(make_pair(move(hits_found_in_aln[j]), cluster_graphs.size())).first; + + // and now make the cluster graph itself + cluster_graphs.emplace_back(); + auto& cluster_graph = cluster_graphs.back(); + get<0>(cluster_graph) = unique_ptr(new bdsg::HashGraph()); + handlealgs::copy_handle_graph(get<0>(cluster_graphs[record.first]).get(), + get<0>(cluster_graph).get()); + + for (auto i : it->first) { + get<1>(cluster_graph).first.emplace_back(get<1>(cluster_graphs[record.first]).first[i]); + } + get<1>(cluster_graph).second = get<1>(cluster_graphs[record.first]).second; + +#ifdef debug_multipath_mapper + cerr << "\tmultiplicity " << get<1>(cluster_graph).second << endl; +#endif + + set_read_coverage(cluster_graph); + +#ifdef debug_multipath_mapper + cerr << "assign total coverage " << get<2>(cluster_graphs.back()) << endl; +#endif + any_new_clusters = true; + } +#ifdef debug_multipath_mapper + cerr << "assigned to cluster " << it->second << endl; +#endif + // reassign the cluster idx to the newer cluster + *cluster_assignments[record.second[j]] = it->second; + } + } + + if (any_new_clusters) { + // we reassigned 
some clusters, now let's check if we can clear any of the old clusters + + // clear the original clusters out of the map if at least one alignment is + // still using them + for (auto cluster_ptr : all_cluster_assignments) { + auto it = original_cluster.find(*cluster_ptr); + if (it != original_cluster.end()) { + original_cluster.erase(it); + } + } + + // any remaining clusters aren't being used by any alignment and they can + // have their hits released + for (const auto& record : original_cluster) { +#ifdef debug_multipath_mapper + cerr << "cluster " << record.first << " has no more assignees, unclaiming all of its hits" << endl; +#endif + get<1>(cluster_graphs[record.first]).first.clear(); + get<2>(cluster_graphs[record.first]) = 0; + } + + + // reorder the clusters based on the new coverage + vector order(cluster_graphs.size(), 0); + for (size_t i = 1; i < cluster_graphs.size(); ++i) { + order[i] = i; + } + + stable_sort(order.begin(), order.end(), [&](size_t i, size_t j) { + return get<2>(cluster_graphs[i]) > get<2>(cluster_graphs[j]); + }); + + vector index(order.size()); + for (size_t i = 0; i < order.size(); ++i) { + index[order[i]] = i; + } + +#ifdef debug_multipath_mapper + cerr << "reordering clusters based on coverage" << endl; + for (size_t i = 0; i < index.size(); ++i) { + cerr << "\t" << i << " -> " << index[i] << endl; + } +#endif + + for (size_t* cluster_assignment : all_cluster_assignments) { + if (*cluster_assignment != RESCUED) { + *cluster_assignment = index[*cluster_assignment]; + } + } + + for (size_t i = 0; i < index.size(); ++i) { + while (index[i] != i) { + std::swap(cluster_graphs[index[i]], cluster_graphs[i]); + std::swap(index[index[i]], index[i]); + + } + } + +#ifdef debug_multipath_mapper + cerr << "new cluster ordering" << endl; + for (int i = 0; i < cluster_graphs.size(); i++) { + cerr << "cluster " << i << ", coverage " << get<2>(cluster_graphs[i]) << ", multiplicity " << get<1>(cluster_graphs[i]).second << endl; + for (pair hit : get<1>(cluster_graphs[i]).first) { + cerr << "\t" << hit.second << " " << hit.first->sequence() << endl; + } + } +#endif + } + } + + void MultipathMapper::merge_rescued_mappings(vector>& multipath_aln_pairs_out, + vector, int64_t>>& cluster_pairs, + vector& pair_multiplicities, + vector>& rescued_multipath_aln_pairs, + vector, int64_t>>& rescued_cluster_pairs, + vector& rescued_multiplicities) const { + + size_t num_unrescued_pairs = multipath_aln_pairs_out.size(); + + for (size_t j = 0; j < rescued_multipath_aln_pairs.size(); j++) { + + // make sure this pair isn't a duplicate with any of the original pairs + bool duplicate = false; + for (size_t i = 0; i < num_unrescued_pairs; i++) { +#ifdef debug_multipath_mapper + cerr << "checking if rescue pair " << j << " is duplicate of original pair " << i << endl; +#endif + if (share_terminal_positions(multipath_aln_pairs_out[i].first, rescued_multipath_aln_pairs[j].first)) { + if (share_terminal_positions(multipath_aln_pairs_out[i].second, rescued_multipath_aln_pairs[j].second)) { +#ifdef debug_multipath_mapper + cerr << "found a duplicate" << endl; +#endif + duplicate = true; + break; + } + } + } + + if (!duplicate) { +#ifdef debug_multipath_mapper + cerr << "no duplicate, adding to return vector if distance is finite and positive" << endl; +#endif + multipath_aln_pairs_out.emplace_back(move(rescued_multipath_aln_pairs[j])); + cluster_pairs.emplace_back(rescued_cluster_pairs[j]); + pair_multiplicities.emplace_back(rescued_multiplicities[j]); + } + } + + 
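The cluster-reordering pass near the end of reassign_split_clusters above is a standard idiom worth spelling out: order[k] holds the old position of the cluster that should end up at position k, index is its inverse (index[old] = new), the surviving cluster assignments are remapped through index, and then the swap loop moves each cluster directly to its destination. A minimal, self-contained sketch of the same idiom on a toy element type (the function name and types are hypothetical):

```
#include <algorithm>
#include <cstddef>
#include <string>
#include <utility>
#include <vector>

// Reorder 'items' by descending 'coverage', moving each (potentially heavy) item exactly
// once per cycle of the permutation, as in the reassign_split_clusters code above.
void reorder_by_coverage(std::vector<std::string>& items, const std::vector<int>& coverage) {
    // order[k] = old index of the item that should land at position k
    std::vector<size_t> order(items.size());
    for (size_t i = 0; i < order.size(); ++i) {
        order[i] = i;
    }
    std::stable_sort(order.begin(), order.end(), [&](size_t i, size_t j) {
        return coverage[i] > coverage[j];
    });
    // index = inverse permutation: index[old] = new position
    std::vector<size_t> index(order.size());
    for (size_t k = 0; k < order.size(); ++k) {
        index[order[k]] = k;
    }
    // walk each slot, swapping its occupant toward its destination until the slot is settled
    for (size_t i = 0; i < index.size(); ++i) {
        while (index[i] != i) {
            std::swap(items[index[i]], items[i]);
            std::swap(index[index[i]], index[i]);
        }
    }
}
```

Note that in the original code the remapping of all_cluster_assignments through index has to happen before the swap loop, because the swaps consume index as they go (it ends up as the identity permutation).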
sort_and_compute_mapping_quality(multipath_aln_pairs_out, cluster_pairs, nullptr, &pair_multiplicities); + } + + double MultipathMapper::estimate_missed_rescue_multiplicity(size_t which_pair, + const vector, int64_t>>& cluster_pairs, + const vector& cluster_graphs1, + const vector& cluster_graphs2, + bool from_secondary_rescue) const { +#ifdef debug_multipath_mapper + cerr << "checking whether we should enter rescue multiplicity routine" << endl; +#endif + + + double multiplicity = 1.0; + + // did we use an out-of-bounds cluster index to flag either end as coming from a rescue? + bool opt_aln_1_is_rescued = cluster_pairs[which_pair].first.first == RESCUED; + bool opt_aln_2_is_rescued = cluster_pairs[which_pair].first.second == RESCUED; + + // was the optimal cluster pair obtained by rescue? + if (opt_aln_1_is_rescued || opt_aln_2_is_rescued) { + // let's figure out if we should reduce its mapping quality to reflect the fact that we may not have selected the + // correct cluster as a rescue candidate + +#ifdef debug_multipath_mapper + cerr << "the optimal alignment is a rescue, checking if we need to cap the mapping quality" << endl; +#endif + + const vector& anchor_clusters = opt_aln_1_is_rescued ? cluster_graphs2 : cluster_graphs1; + size_t anchor_idx = opt_aln_1_is_rescued ? cluster_pairs[which_pair].first.second : cluster_pairs[which_pair].first.first; + + // find the range of clusters that could plausibly be about as good as the one that rescue succeeded from + size_t plausible_clusters_end_idx = anchor_idx; + for (; plausible_clusters_end_idx < anchor_clusters.size(); plausible_clusters_end_idx++) { + if (get<2>(anchor_clusters[plausible_clusters_end_idx]) < get<2>(anchor_clusters[anchor_idx]) - plausible_rescue_cluster_coverage_diff) { + break; + } + } + + // TODO: it's a bit ugly/fragile to have the secondary rescue logic recapitulated here + + // figure out which index corresponds to the end of the range we would have rescued + size_t max_rescues_attempted_idx; + if (from_secondary_rescue) { + // find the indexes that were added by pair clustering + unordered_set paired_idxs; + for (auto& cluster_pair : cluster_pairs) { + paired_idxs.insert(opt_aln_1_is_rescued ? cluster_pair.first.second : cluster_pair.first.first); + } + + // the "budget" of rescues we could have performed + size_t rescues_left = secondary_rescue_attempts; + size_t i = 0; + for (; i < anchor_clusters.size(); i++) { + // did we skip thi index because it was already in a pair? 
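As a worked illustration of the multiplicity estimate this routine builds up to (hypothetical numbers; the formula itself appears just below): if ten anchor clusters have read coverage within plausible_rescue_cluster_coverage_diff of the cluster that rescue actually succeeded from, but the rescue budget would only have reached four of them, then the surviving pair is treated as one of roughly 10 / 4 = 2.5 equally plausible pairs, and that factor is later folded into the mapping quality as a multiplicity.

```
#include <algorithm>
#include <iostream>

int main() {
    // hypothetical values for illustration only
    double plausible_clusters_end_idx = 10.0;  // clusters about as well covered as the rescue anchor
    double max_rescues_attempted_idx = 4.0;    // how many of them the rescue budget would have tried
    double multiplicity = std::max(1.0, plausible_clusters_end_idx / max_rescues_attempted_idx);
    std::cout << multiplicity << std::endl;    // prints 2.5
    return 0;
}
```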
+ if (!paired_idxs.count(i)) { + if (rescues_left) { + // we would have tried a secondary rescue here + rescues_left--; + } + else { + // we would have run out of secondary rescues here + break; + } + } + } + // this is the first index we didn't rescue + max_rescues_attempted_idx = i; + } + else { + // simpler without secondary rescue, we would have just rescued up to the maximum allowable + max_rescues_attempted_idx = max_rescue_attempts; } -#ifdef debug_multipath_mapper - cerr << "did secondary rescue from " << num_rescued_equivalent_1 << " of " << num_equivalent_1 << " equivalent clusters on read 1 and " << num_rescued_equivalent_2 << " of " << num_equivalent_2 << " equivalent clusters on read 2" << endl; -#endif - - // compute the probability that we failed to rescue each of the clusters from the other - double prob_missed_rescue_of_cluster_1 = 1.0 - (1.0 - prob_missing_equiv_cluster_2) * double(num_rescued_equivalent_2) / double(num_equivalent_2); - double prob_missed_rescue_of_cluster_2 = 1.0 - (1.0 - prob_missing_equiv_cluster_1) * double(num_rescued_equivalent_1) / double(num_equivalent_1); #ifdef debug_multipath_mapper - cerr << "estimate the probability we missed rescue of cluster 1 at " << prob_missed_rescue_of_cluster_1 << " and cluster 2 at " << prob_missed_rescue_of_cluster_2 << endl; + cerr << "performed up to " << max_rescues_attempted_idx << " out of " << plausible_clusters_end_idx << " plausible rescues" << endl; #endif - // possibly lower our estimate of the probabilty of missing a cluster if there was a high probabilty - // that we would have found it during secondary rescue - prob_missing_equiv_cluster_1 = min(prob_missed_rescue_of_cluster_1, prob_missing_equiv_cluster_1); - prob_missing_equiv_cluster_2 = min(prob_missed_rescue_of_cluster_2, prob_missing_equiv_cluster_2); - - prob_missing_pair = 1.0 - (1.0 - prob_missing_equiv_cluster_1) * (1.0 - prob_missing_equiv_cluster_2); + multiplicity = max(1.0, (double)(plausible_clusters_end_idx) / (double)(max_rescues_attempted_idx)); } -#ifdef debug_multipath_mapper - cerr << "estimate probability that hit sampling caused us to miss correct cluster pair at " << prob_missing_pair << endl; -#endif - - if (prob_missing_pair > 0.0) { - int32_t hit_sampling_mapq = round(prob_to_phred(prob_missing_pair)); - -#ifdef debug_multipath_mapper - cerr << "capping mapping quality to " << hit_sampling_mapq << endl; -#endif - - // cap the mapping quality at this value - multipath_aln_pairs_out.front().first.set_mapping_quality(min(multipath_aln_pairs_out.front().first.mapping_quality(), - hit_sampling_mapq)); - multipath_aln_pairs_out.front().second.set_mapping_quality(min(multipath_aln_pairs_out.front().second.mapping_quality(), - hit_sampling_mapq)); + return multiplicity; + } + + double MultipathMapper::cluster_multiplicity(const memcluster_t& cluster) const { + if (cluster.first.empty()) { + return cluster.second; + } + double max_fraction_sampled = 0.0; + for (const pair& hit : cluster.first) { + const MaximalExactMatch& mem = *hit.first; + max_fraction_sampled = max(max_fraction_sampled, (double)(mem.queried_count) / (double)(mem.match_count)); } + return cluster.second / max_fraction_sampled; } - - void MultipathMapper::cap_mapping_quality_by_hit_sampling_probability(vector& multipath_alns_out, - vector& cluster_idxs, - vector& cluster_graphs) const { - - clustergraph_t& opt_cluster = cluster_graphs[cluster_idxs.front()]; - - // what is the chance that we would have missed a cluster with the same MEMs because of hit sub-sampling - 
double prob_missing_equiv_cluster = prob_equivalent_clusters_hits_missed(get<1>(opt_cluster)); - - -#ifdef debug_multipath_mapper - cerr << "estimate probability that hit sampling caused us to miss correct cluster at " << prob_missing_equiv_cluster << endl; -#endif - - if (prob_missing_equiv_cluster > 0.0) { - int32_t hit_sampling_mapq = round(prob_to_phred(prob_missing_equiv_cluster)); - -#ifdef debug_multipath_mapper - cerr << "capping mapping quality to " << hit_sampling_mapq << endl; -#endif - - // cap the mapping quality at this value - multipath_alns_out.front().set_mapping_quality(min(multipath_alns_out.front().mapping_quality(), - hit_sampling_mapq)); + + double MultipathMapper::pair_cluster_multiplicity(const memcluster_t& cluster_1, const memcluster_t& cluster_2) const { + return min(cluster_multiplicity(cluster_1), cluster_multiplicity(cluster_2)); + } + + MultipathMapper::match_fanouts_t MultipathMapper::record_fanouts(const vector& mems, + vector>>& fanouts) const { + + match_fanouts_t match_fanouts; + if (!fanouts.empty()) { + assert(fanouts.size() == mems.size()); + for (size_t i = 0; i < mems.size(); ++i) { + if (!fanouts[i].empty()) { + match_fanouts[&mems[i]] = move(fanouts[i]); + } + } } + return match_fanouts; } - void MultipathMapper::split_multicomponent_alignments(vector>& multipath_aln_pairs_out, - vector, int64_t>>& cluster_pairs) const { + void MultipathMapper::split_multicomponent_alignments(const Alignment& alignment1, const Alignment& alignment2, + vector>& multipath_aln_pairs_out, + vector& cluster_graphs1, + vector& cluster_graphs2, + vector, int64_t>>& cluster_pairs, + vector& pair_multiplicities) const { size_t original_num_pairs = multipath_aln_pairs_out.size(); + vector split_idxs_1, split_idxs_2; for (size_t i = 0; i < original_num_pairs; i++) { vector> connected_components_1 = connected_components(multipath_aln_pairs_out[i].first); vector> connected_components_2 = connected_components(multipath_aln_pairs_out[i].second); #ifdef debug_multipath_mapper cerr << "finding connected components for mapping:" << endl; - cerr << pb2json(multipath_aln_pairs_out[i].first) << endl; - cerr << pb2json(multipath_aln_pairs_out[i].second) << endl; +#endif +#ifdef debug_multipath_mapper_alignment + view_multipath_alignment_as_dot(cerr, multipath_aln_pairs_out[i].first); + view_multipath_alignment_as_dot(cerr, multipath_aln_pairs_out[i].second); +#endif +#ifdef debug_multipath_mapper cerr << "read 1 connected components:" << endl; for (vector& comp : connected_components_1) { cerr << "\t"; @@ -2282,7 +5092,7 @@ namespace vg { } #endif // we will put pairs of split up components in here - vector> split_multipath_alns; + vector> split_multipath_alns; if (connected_components_1.size() > 1 && connected_components_2.size() > 1) { #ifdef debug_multipath_mapper @@ -2305,7 +5115,7 @@ namespace vg { #endif // only need to split first end for (size_t j = 0; j < connected_components_1.size(); j++) { - split_multipath_alns.emplace_back(MultipathAlignment(), multipath_aln_pairs_out[i].second); + split_multipath_alns.emplace_back(multipath_alignment_t(), multipath_aln_pairs_out[i].second); extract_sub_multipath_alignment(multipath_aln_pairs_out[i].first, connected_components_1[j], split_multipath_alns.back().first); } @@ -2316,7 +5126,7 @@ namespace vg { #endif // only need to split second end for (size_t j = 0; j < connected_components_2.size(); j++) { - split_multipath_alns.emplace_back(multipath_aln_pairs_out[i].first, MultipathAlignment()); + 
split_multipath_alns.emplace_back(multipath_aln_pairs_out[i].first, multipath_alignment_t()); extract_sub_multipath_alignment(multipath_aln_pairs_out[i].second, connected_components_2[j], split_multipath_alns.back().second); } @@ -2326,18 +5136,17 @@ namespace vg { if (!split_multipath_alns.empty()) { bool replaced_original = false; - for (pair& split_multipath_aln_pair : split_multipath_alns) { + for (pair& split_multipath_aln_pair : split_multipath_alns) { // we also need to measure the disance for scoring - int64_t dist = distance_between(split_multipath_aln_pair.first, split_multipath_aln_pair.second, - true, unstranded_clustering); + int64_t dist = distance_between(split_multipath_aln_pair.first, split_multipath_aln_pair.second, true); // if we can't measure a distance, then don't add the pair if (dist != numeric_limits::max()) { #ifdef debug_multipath_mapper cerr << "adding component pair at distance " << dist << ":" << endl; - cerr << pb2json(split_multipath_aln_pair.first) << endl; - cerr << pb2json(split_multipath_aln_pair.second) << endl; + cerr << debug_string(split_multipath_aln_pair.first) << endl; + cerr << debug_string(split_multipath_aln_pair.second) << endl; #endif if (!replaced_original) { @@ -2345,53 +5154,117 @@ namespace vg { multipath_aln_pairs_out[i] = move(split_multipath_aln_pair); cluster_pairs[i].second = dist; replaced_original = true; + if (connected_components_1.size() > 1) { + split_idxs_1.push_back(i); + } + if (connected_components_2.size() > 1) { + split_idxs_2.push_back(i); + } } else { // append the rest of them to the end + if (connected_components_1.size() > 1) { + split_idxs_1.push_back(multipath_aln_pairs_out.size()); + } + if (connected_components_2.size() > 1) { + split_idxs_2.push_back(multipath_aln_pairs_out.size()); + } multipath_aln_pairs_out.emplace_back(move(split_multipath_aln_pair)); cluster_pairs.emplace_back(cluster_pairs[i].first, dist); + pair_multiplicities.emplace_back(pair_multiplicities[i]); } } } } } + + if (do_spliced_alignment) { + // we only do this in spliced alignment because we want to clustering to + // unclaim certain hits so they can be seen as spliced alignment candidates + + if (!split_idxs_1.empty()) { + + vector split_mp_alns_1(split_idxs_1.size()); + vector cluster_assignments_1(split_idxs_1.size()); + for (size_t i = 0; i < split_idxs_1.size(); ++i) { + auto& mp_aln = multipath_aln_pairs_out[split_idxs_1[i]].first; + // TODO: we need to have these be ordered to find the MEMs, but this will be wastefully repeated later + topologically_order_subpaths(mp_aln); + split_mp_alns_1[i] = &mp_aln; + cluster_assignments_1[i] = &cluster_pairs[split_idxs_1[i]].first.first; + } + + vector all_cluster_assignments_1(cluster_pairs.size()); + for (size_t i = 0; i < all_cluster_assignments_1.size(); ++i) { + all_cluster_assignments_1[i] = &cluster_pairs[i].first.first; + } + + reassign_split_clusters(alignment1, cluster_graphs1, split_mp_alns_1, cluster_assignments_1, + all_cluster_assignments_1); + } + + if (!split_idxs_2.empty()) { + vector split_mp_alns_2(split_idxs_2.size()); + vector cluster_assignments_2(split_idxs_2.size()); + for (size_t i = 0; i < split_idxs_2.size(); ++i) { + auto& mp_aln = multipath_aln_pairs_out[split_idxs_2[i]].second; + // TODO: we need to have these be ordered to find the MEMs, but this will be wastefully repeated later + topologically_order_subpaths(mp_aln); + split_mp_alns_2[i] = &mp_aln; + cluster_assignments_2[i] = &cluster_pairs[split_idxs_2[i]].first.second; + } + + vector 
all_cluster_assignments_2(cluster_pairs.size()); + for (size_t i = 0; i < all_cluster_assignments_2.size(); ++i) { + all_cluster_assignments_2[i] = &cluster_pairs[i].first.second; + } + + reassign_split_clusters(alignment2, cluster_graphs2, split_mp_alns_2, cluster_assignments_2, + all_cluster_assignments_2); + } + } } void MultipathMapper::align_to_cluster_graph_pairs(const Alignment& alignment1, const Alignment& alignment2, vector& cluster_graphs1, vector& cluster_graphs2, + vector>& multipath_aln_pairs_out, vector, int64_t>>& cluster_pairs, - vector>& multipath_aln_pairs_out, + vector& pair_multiplicities, vector>& duplicate_pairs_out, - OrientedDistanceClusterer::paths_of_node_memo_t* paths_of_node_memo, - OrientedDistanceClusterer::oriented_occurences_memo_t* oriented_occurences_memo, - OrientedDistanceClusterer::handle_memo_t* handle_memo) { + const match_fanouts_t* fanouts1, const match_fanouts_t* fanouts2) { assert(multipath_aln_pairs_out.empty()); + auto aligner = get_aligner(!alignment1.quality().empty() && !alignment2.quality().empty()); auto get_pair_approx_likelihood = [&](const pair, int64_t>& cluster_pair) { return ((get<2>(cluster_graphs1[cluster_pair.first.first]) - + get<2>(cluster_graphs2[cluster_pair.first.second])) * get_aligner()->match * get_aligner()->log_base - + fragment_length_log_likelihood(cluster_pair.second)); + + get<2>(cluster_graphs2[cluster_pair.first.second])) * aligner->match + + fragment_length_log_likelihood(cluster_pair.second) / aligner->log_base); }; // sort the pairs descending by approximate likelihood stable_sort(cluster_pairs.begin(), cluster_pairs.end(), [&](const pair, int64_t>& a, const pair, int64_t>& b) { - // compute approximate likelihood in similar way to how the mapping quality routine will - double likelihood_1 = get_pair_approx_likelihood(a); - double likelihood_2 = get_pair_approx_likelihood(b); - size_t hash_1 = wang_hash, int64_t>>()(a); - size_t hash_2 = wang_hash, int64_t>>()(b); - return (likelihood_1 > likelihood_2 || (likelihood_1 == likelihood_2 && hash_1 < hash_2)); - }); + // compute approximate likelihood in similar way to how the mapping quality routine will + double likelihood_1 = get_pair_approx_likelihood(a); + double likelihood_2 = get_pair_approx_likelihood(b); + size_t hash_1 = wang_hash, int64_t>>()(a); + size_t hash_2 = wang_hash, int64_t>>()(b); + return (likelihood_1 > likelihood_2 || (likelihood_1 == likelihood_2 && hash_1 < hash_2)); + }); #ifdef debug_multipath_mapper + cerr << "sorting cluster pairs by approximate likelihood:" << endl; + for (size_t i = 0; i < cluster_pairs.size(); i++) { + cerr << i << "-th cluster: " << cluster_pairs[i].first.first << " " << cluster_pairs[i].first.second << ", likelihood " << get_pair_approx_likelihood(cluster_pairs[i]) << endl; + } + cerr << "aligning to cluster pairs..." << endl; #endif // we may need to compute an extra mapping above the one we'll report if we're computing mapping quality - size_t num_mappings_to_compute = mapping_quality_method != None ? max(num_mapping_attempts, (size_t) 2) : num_mapping_attempts; + size_t num_mappings_to_compute = max(num_mapping_attempts, (size_t) 2); // TODO: some cluster pairs will produce redundant subgraph pairs. // We'll end up with redundant pairs being output. @@ -2407,35 +5280,43 @@ namespace vg { // For each cluster pair const pair, int64_t>& cluster_pair = cluster_pairs[i]; + // TODO: using a multiplier here instead of a difference is pretty ugly, really. 
it also has + // weird effects, like not producing any alignments if the log likelihood is negative (which + // shouldn't matter). but in practice that only happens on very small clusters with bad fragment + // lengths. + // if we have a cluster graph pair with small enough MEM coverage // compared to the best one or we've made the maximum number of // alignments we stop producing alternate alignments if (get_pair_approx_likelihood(cluster_pair) < mem_coverage_min_ratio * get_pair_approx_likelihood(cluster_pairs.front()) || num_mappings >= num_mappings_to_compute) { - // remove the rest of the cluster pairs so we maintain the invariant that there are the + // remove the rest of the cluster pairs to establish the invariant that there are the // same number of cluster pairs as alternate mappings cluster_pairs.resize(i); break; } + #ifdef debug_multipath_mapper cerr << "doing pair " << cluster_pair.first.first << " " << cluster_pair.first.second << endl; #endif // create multipath alignments to fill multipath_aln_pairs_out.emplace_back(); - + pair_multiplicities.push_back(pair_cluster_multiplicity(get<1>(cluster_graphs1[cluster_pair.first.first]), + get<1>(cluster_graphs2[cluster_pair.first.second]))); + auto prev_1 = previous_multipath_alns_1.find(cluster_pair.first.first); if (prev_1 == previous_multipath_alns_1.end()) { // we haven't done this alignment yet, so we have to complete it for the first time - VG* vg1 = get<0>(cluster_graphs1[cluster_pair.first.first]); - memcluster_t& graph_mems1 = get<1>(cluster_graphs1[cluster_pair.first.first]); #ifdef debug_multipath_mapper - cerr << "performing alignment to subgraph " << pb2json(vg1->graph) << endl; + cerr << "performing alignment of read 1 to subgraph" << endl; #endif - multipath_align(alignment1, vg1, graph_mems1, multipath_aln_pairs_out.back().first); + multipath_align(alignment1, cluster_graphs1[cluster_pair.first.first], + multipath_aln_pairs_out.back().first, + fanouts1); // keep track of the fact that we have completed this multipath alignment previous_multipath_alns_1[cluster_pair.first.first] = i; @@ -2453,14 +5334,13 @@ namespace vg { auto prev_2 = previous_multipath_alns_2.find(cluster_pair.first.second); if (prev_2 == previous_multipath_alns_2.end()) { // we haven't done this alignment yet, so we have to complete it for the first time - VG* vg2 = get<0>(cluster_graphs2[cluster_pair.first.second]); - memcluster_t& graph_mems2 = get<1>(cluster_graphs2[cluster_pair.first.second]); #ifdef debug_multipath_mapper - cerr << "performing alignment to subgraph " << pb2json(vg2->graph) << endl; + cerr << "performing alignment of read 2 to subgraph" << endl; #endif - multipath_align(alignment2, vg2, graph_mems2, multipath_aln_pairs_out.back().second); + multipath_align(alignment2, cluster_graphs2[cluster_pair.first.second], + multipath_aln_pairs_out.back().second, fanouts2); // keep track of the fact that we have completed this multipath alignment previous_multipath_alns_2[cluster_pair.first.second] = i; @@ -2476,50 +5356,300 @@ namespace vg { num_mappings++; } - // split up any multi-component multipath alignments - split_multicomponent_alignments(multipath_aln_pairs_out, cluster_pairs); + if (!multipath_aln_pairs_out.empty()) { + + double likelihood_diff = aligner->mapping_quality_score_diff(truncation_multiplicity_mq_limit); + double tail_likelihood = get_pair_approx_likelihood(cluster_pairs[multipath_aln_pairs_out.size() - 1]); + + // find clusters whose likelihoods are approximately the same as the low end of the clusters we aligned + 
int64_t max_tail_idx = multipath_aln_pairs_out.size(); + while (max_tail_idx < cluster_pairs.size() + && get_pair_approx_likelihood(cluster_pairs[max_tail_idx]) >= tail_likelihood - likelihood_diff) { + ++max_tail_idx; + } + + if (max_tail_idx > multipath_aln_pairs_out.size()) { + // there are some (nearly) identical cluster pairs that we ignored, so we'll account for them in the multiplicity + + // find the pairs that are approximately the same as the last + int64_t min_tail_idx = multipath_aln_pairs_out.size() - 1; + while (min_tail_idx > 0 && + get_pair_approx_likelihood(cluster_pairs[min_tail_idx]) <= tail_likelihood + likelihood_diff) { + --min_tail_idx; + } + + // multiply their multiplicity by the inverse of the fraction aligned + double trunc_multiplicity = double(max_tail_idx - min_tail_idx) / double(multipath_aln_pairs_out.size() - min_tail_idx); + for (size_t i = min_tail_idx; i < multipath_aln_pairs_out.size(); ++i) { + pair_multiplicities[i] *= trunc_multiplicity; + } + } + } + + if (!suppress_multicomponent_splitting) { + // split up any multi-component multipath alignments + split_multicomponent_alignments(alignment1, alignment2, + multipath_aln_pairs_out, cluster_graphs1, cluster_graphs2, + cluster_pairs, pair_multiplicities); + + // it's possible to get supraquadratic growth in the number of alignments from doing this, so + // we add a bit of restraint + size_t max_split_pairs = num_mappings_to_compute * num_mappings_to_compute; + if (multipath_aln_pairs_out.size() > max_split_pairs) { +#ifdef debug_multipath_mapper + cerr << "too many pairs after splitting multicomponent alignments, truncating from " << multipath_aln_pairs_out.size() << " to " << max_split_pairs << " pairs" << endl; +#endif + + // TODO: repetitive with the previous truncation routine + // TODO: these likelihood were just computed in the sort routine, it would be nice to do + // be able to re-use the results + + // figure out a truncation multiplier for this pruning step + auto scores = pair_mapping_likelihoods(multipath_aln_pairs_out, cluster_pairs); + double score_diff = aligner->mapping_quality_score_diff(truncation_multiplicity_mq_limit); + size_t last_idx = max_split_pairs - 1; + size_t min_tail_idx = last_idx; + while (min_tail_idx > 0 && scores[min_tail_idx - 1] <= scores[last_idx] + score_diff) { + --min_tail_idx; + } + size_t max_tail_idx = max_split_pairs; + while (max_tail_idx < scores.size() && scores[max_tail_idx] >= scores[last_idx] - score_diff) { + ++max_tail_idx; + } + double trunc_multiplicity = double(max_tail_idx - min_tail_idx) / double(max_split_pairs - min_tail_idx); + + // discard the low-likelihood pairs + multipath_aln_pairs_out.resize(max_split_pairs); + cluster_pairs.resize(max_split_pairs); + pair_multiplicities.resize(max_split_pairs); + + // increase the multiplicity for pairs near the cutoff point + for (size_t i = min_tail_idx; i < pair_multiplicities.size(); ++i) { + pair_multiplicities[i] *= trunc_multiplicity; + } + } + } // downstream algorithms assume multipath alignments are topologically sorted (including the scoring // algorithm in the next step) - for (pair& multipath_aln_pair : multipath_aln_pairs_out) { + for (auto& multipath_aln_pair : multipath_aln_pairs_out) { topologically_order_subpaths(multipath_aln_pair.first); topologically_order_subpaths(multipath_aln_pair.second); - } - - // if we haven't been checking strand consistency, enforce it now at the end - if (unstranded_clustering) { - establish_strand_consistency(multipath_aln_pairs_out, cluster_pairs, 
paths_of_node_memo, oriented_occurences_memo, handle_memo); + simplify_complicated_multipath_alignment(multipath_aln_pair.first); + simplify_complicated_multipath_alignment(multipath_aln_pair.second); } // put pairs in score sorted order and compute mapping quality of best pair using the score - sort_and_compute_mapping_quality(multipath_aln_pairs_out, cluster_pairs, &duplicate_pairs_out); + sort_and_compute_mapping_quality(multipath_aln_pairs_out, cluster_pairs, + &duplicate_pairs_out, &pair_multiplicities); #ifdef debug_validate_multipath_alignments - for (pair& multipath_aln_pair : multipath_aln_pairs_out) { + for (pair& multipath_aln_pair : multipath_aln_pairs_out) { #ifdef debug_multipath_mapper cerr << "validating multipath alignments:" << endl; - cerr << pb2json(multipath_aln_pair.first) << endl; - cerr << pb2json(multipath_aln_pair.second) << endl; + cerr << debug_string(multipath_aln_pair.first) << endl; + cerr << debug_string(multipath_aln_pair.second) << endl; #endif if (!validate_multipath_alignment(multipath_aln_pair.first, *xindex)) { cerr << "### WARNING ###" << endl; - cerr << "multipath alignment of read " << multipath_aln_pair.first.name() << " failed to validate" << endl; + cerr << "multipath alignment of read " << multipath_aln_pair.first.sequence() << " failed to validate" << endl; } if (!validate_multipath_alignment(multipath_aln_pair.second, *xindex)) { cerr << "### WARNING ###" << endl; - cerr << "multipath alignment of read " << multipath_aln_pair.second.name() << " failed to validate" << endl; + cerr << "multipath alignment of read " << multipath_aln_pair.second.sequence() << " failed to validate" << endl; } } #endif } - - auto MultipathMapper::query_cluster_graphs(const Alignment& alignment, - const vector& mems, - const vector& clusters) -> vector { + + pair, bool> MultipathMapper::extract_maximal_graph(const Alignment& alignment, + const memcluster_t& mem_cluster) const { + + // Figure out the aligner to use + auto aligner = get_aligner(!alignment.quality().empty()); + // get the seed hits + const auto& cluster = mem_cluster.first; + + vector positions; + vector forward_max_dist; + vector backward_max_dist; + + positions.reserve(cluster.size()); + forward_max_dist.reserve(cluster.size()); + backward_max_dist.reserve(cluster.size()); + + for (auto& mem_hit : cluster) { + // get the start position of the MEM + positions.push_back(mem_hit.second); + // search far enough away to get any hit detectable without soft clipping + forward_max_dist.push_back(min(aligner->longest_detectable_gap(alignment, mem_hit.first->end), max_alignment_gap) + + (alignment.sequence().end() - mem_hit.first->begin)); + backward_max_dist.push_back(min(aligner->longest_detectable_gap(alignment, mem_hit.first->begin), max_alignment_gap) + + (mem_hit.first->begin - alignment.sequence().begin())); + } + + // TODO: a progressive expansion of the subgraph if the MEM hit is already contained in + // a cluster graph somewhere? + + // extract the subgraph within the search distance + + unique_ptr cluster_graph(new bdsg::HashGraph()); + + algorithms::extract_containing_graph(xindex, cluster_graph.get(), positions, forward_max_dist, backward_max_dist, + num_alt_alns > 1 ? reversing_walk_length : 0); + + return move(make_pair(move(cluster_graph), cluster.size() == 1)); + } + + // TODO: entirely duplicative with MultipathAlignmentGraph... 
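As a rough illustration of the arithmetic in `extract_maximal_graph` above, the following self-contained sketch computes the forward/backward search radii for a single MEM hit: the aligner's longest detectable gap (capped by the maximum alignment gap) plus the read sequence remaining on that side. `MemHit`, the scoring constants, and this simplified `longest_detectable_gap` are stand-ins, not vg's actual types or parameters.

```cpp
// Illustrative sketch only; not vg's implementation.
#include <algorithm>
#include <cstdint>
#include <iostream>

struct MemHit {
    int64_t read_begin;  // offset of the MEM's first base in the read
    int64_t read_end;    // offset one past the MEM's last base
};

// stand-in for the aligner's longest_detectable_gap: the longest gap whose penalty
// could still be recovered by matching the remaining bases (toy linear scoring)
int64_t longest_detectable_gap(int64_t bases_remaining,
                               int64_t match = 1, int64_t gap_open = 6, int64_t gap_extend = 1) {
    int64_t budget = bases_remaining * match - gap_open;
    return budget > 0 ? budget / gap_extend : 0;
}

int main() {
    const int64_t read_length = 150;
    const int64_t max_alignment_gap = 100;  // analogous to the max_alignment_gap cap above
    MemHit hit{40, 70};

    // search far enough away that the rest of the read could align without soft clipping
    int64_t forward_max_dist = std::min(longest_detectable_gap(read_length - hit.read_end), max_alignment_gap)
                               + (read_length - hit.read_begin);
    int64_t backward_max_dist = std::min(longest_detectable_gap(hit.read_begin), max_alignment_gap)
                                + hit.read_begin;

    std::cout << "forward " << forward_max_dist << ", backward " << backward_max_dist << std::endl;
    return 0;
}
```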
+ const size_t MultipathMapper::gap_memo_max_size = 1000; + thread_local unordered_map> MultipathMapper::pessimistic_gap_memo; + int64_t MultipathMapper::pessimistic_gap(int64_t length, double multiplier) const { + int64_t gap_length; + if (length >= gap_memo_max_size) { + gap_length = multiplier * sqrt(length); + } + else { + vector& memo = pessimistic_gap_memo[multiplier]; + while (memo.size() <= length) { + memo.emplace_back(multiplier * sqrt(memo.size())); + } + gap_length = memo[length]; + } + return gap_length; + } + + pair, bool> MultipathMapper::extract_restrained_graph(const Alignment& alignment, + const memcluster_t& mem_cluster) const { // Figure out the aligner to use - BaseAligner* aligner = get_aligner(); + auto aligner = get_aligner(!alignment.quality().empty()); + // get the seed hits + const auto& cluster = mem_cluster.first; + + // the MEMs are size sorted, we want to know the read order so we can + // use the inter-MEM distance to figure out how much to extract + vector order(cluster.size(), 0); + for (size_t i = 1; i < order.size(); ++i) { + order[i] = i; + } + stable_sort(order.begin(), order.end(), [&](size_t i, size_t j) { + return cluster[i].first->begin < cluster[j].first->begin; + }); + + // and we'll also want to + vector index(order.size()); + for (size_t i = 0; i < index.size(); ++i) { + index[order[i]] = i; + } + + vector positions(cluster.size()); + + // determine an initial restrained set of distances to extract from + vector forward_dist(cluster.size()), backward_dist(cluster.size()); + for (size_t i = 0; i < cluster.size(); ++i) { + size_t idx = index[i]; + if (idx == 0) { + // this is the left tail + if (use_pessimistic_tail_alignment) { + int64_t tail_length = cluster[i].first->begin - alignment.sequence().begin(); + backward_dist[i] = tail_length + pessimistic_gap(tail_length, pessimistic_gap_multiplier); + } + else { + backward_dist[i] = aligner->longest_detectable_gap(alignment, cluster[i].first->begin); + } + } + else { + // there is another MEM leftward + int64_t between_length = max(0, cluster[i].first->begin - cluster[order[idx - 1]].first->end); + backward_dist[i] = between_length + pessimistic_gap(between_length, pessimistic_gap_multiplier); + } + + if (idx + 1 == cluster.size()) { + // this is the right tail + if (use_pessimistic_tail_alignment) { + int64_t tail_length = alignment.sequence().end() - cluster[i].first->end; + forward_dist[i] = tail_length + pessimistic_gap(tail_length, pessimistic_gap_multiplier) + cluster[i].first->length(); + } + else { + forward_dist[i] = aligner->longest_detectable_gap(alignment, cluster[i].first->end) + cluster[i].first->length(); + } + } + else { + // there is another MEM rightward + int64_t between_length = max(0, cluster[order[idx + 1]].first->begin - cluster[i].first->end); + forward_dist[i] = between_length + pessimistic_gap(between_length, pessimistic_gap_multiplier) + cluster[i].first->length(); + } + + positions[i] = cluster[i].second; + } + + // expand the restrained search distances until we extract a connected graph or + // expand the distances up to the maximum detectable length + + unique_ptr cluster_graph; + bool do_extract = true; + bool connected = false; + while (do_extract) { + + // get rid of the old graph (if there is one) + cluster_graph = unique_ptr(new bdsg::HashGraph()); + + // extract according to the current search distances + algorithms::extract_containing_graph(xindex, cluster_graph.get(), positions, forward_dist, backward_dist, + num_alt_alns > 1 ? 
reversing_walk_length : 0); + + // we can avoid a costly algorithm when the cluster was extracted from one position (and therefore + // must be connected) + if (cluster.size() == 1 || handlealgs::is_weakly_connected(cluster_graph.get())) { + // we consider enough of the graph extracted once it is connected + // stop doing further extraction + do_extract = false; + connected = true; + } + else { + // double the search distances, up to the maximum detectable gap + bool any_dists_changed = false; + for (size_t i = 0; i < cluster.size(); ++i) { + size_t bwd_dist = min(backward_dist[i] * 2, + aligner->longest_detectable_gap(alignment, cluster[i].first->begin)); + size_t fwd_dist = min(forward_dist[i] * 2, + aligner->longest_detectable_gap(alignment, cluster[i].first->end) + cluster[i].first->length()); + if (bwd_dist > backward_dist[i]) { + backward_dist[i] = bwd_dist; + any_dists_changed = true; + } + if (fwd_dist > forward_dist[i]) { + forward_dist[i] = fwd_dist; + any_dists_changed = true; + } + } + // do another extraction as long as we increased at least one search distance + do_extract = any_dists_changed; + } + } + return move(make_pair(move(cluster_graph), connected)); + } + + pair, bool> MultipathMapper::extract_cluster_graph(const Alignment& alignment, + const memcluster_t& mem_cluster) const { + if (restrained_graph_extraction) { + return extract_restrained_graph(alignment, mem_cluster); + } + else { + return extract_maximal_graph(alignment, mem_cluster); + } + } + + vector MultipathMapper::query_cluster_graphs(const Alignment& alignment, + const vector& mems, + const vector& clusters) const { + + // some settings want us to not merge clusters that have overlapping nodes, and + // we can also save some bookkeeping work if we neglect to do cluster merging + // when there's only one cluster anyway + bool do_merge_suppression = suppress_cluster_merging || clusters.size() <= 1; // We populate this with all the cluster graphs.
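The `extract_restrained_graph` loop above boils down to "extract, test weak connectivity, double the capped distances, repeat". Below is a control-flow sketch under that reading; `extract_and_check_connected` is a toy predicate standing in for `extract_containing_graph` plus `handlealgs::is_weakly_connected`, and the caps stand in for the longest-detectable-gap limits. (In the real code the starting distances come from inter-MEM spacing plus the sqrt-scaled `pessimistic_gap` heuristic defined just above.)

```cpp
// Control-flow sketch only; not vg's code.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// toy stand-in: pretend the subgraph becomes connected once every forward distance reaches 64
bool extract_and_check_connected(const std::vector<int64_t>& fwd, const std::vector<int64_t>&) {
    for (int64_t d : fwd) {
        if (d < 64) return false;
    }
    return true;
}

void restrained_extraction(std::vector<int64_t>& fwd, std::vector<int64_t>& bwd,
                           const std::vector<int64_t>& fwd_cap, const std::vector<int64_t>& bwd_cap) {
    bool do_extract = true;
    while (do_extract) {
        if (extract_and_check_connected(fwd, bwd)) {
            // enough graph has been pulled in once it is connected
            break;
        }
        // otherwise double every search distance, but never past its cap
        bool any_dists_changed = false;
        for (size_t i = 0; i < fwd.size(); ++i) {
            int64_t new_fwd = std::min(fwd[i] * 2, fwd_cap[i]);
            int64_t new_bwd = std::min(bwd[i] * 2, bwd_cap[i]);
            any_dists_changed = any_dists_changed || new_fwd > fwd[i] || new_bwd > bwd[i];
            fwd[i] = new_fwd;
            bwd[i] = new_bwd;
        }
        // stop once no distance was able to grow any further
        do_extract = any_dists_changed;
    }
}

int main() {
    std::vector<int64_t> fwd{10, 12}, bwd{8, 9};
    std::vector<int64_t> fwd_cap{200, 200}, bwd_cap{200, 200};
    restrained_extraction(fwd, bwd, fwd_cap, bwd_cap);
    std::cout << "final forward distances: " << fwd[0] << ", " << fwd[1] << std::endl;
    return 0;
}
```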
vector cluster_graphs_out; @@ -2528,11 +5658,13 @@ namespace vg { // cluster and we use this to record which one unordered_map node_id_to_cluster; - // to hold the clusters as they are (possibly) merged - unordered_map cluster_graphs; + // to hold the clusters as they are (possibly) merged, bools indicate + // whether we've verified that the graph is connected + // doubles are the cluster graph's multiplicity + unordered_map, bool, double>> cluster_graphs; // to keep track of which clusters have been merged - UnionFind union_find(clusters.size()); + UnionFind union_find(clusters.size(), false); // (for the suppressed merge code path) // maps the hits that make up a cluster to the index of the cluster @@ -2545,56 +5677,29 @@ namespace vg { #endif // gather the parameters for subgraph extraction from the MEM hits + auto& cluster = clusters[i]; + auto extracted = extract_cluster_graph(alignment, cluster); + tuple, bool, double> cluster_graph(move(extracted.first), extracted.second, cluster.second); - const memcluster_t& cluster = clusters[i]; - vector positions; - vector forward_max_dist; - vector backward_max_dist; - - positions.reserve(cluster.size()); - forward_max_dist.reserve(cluster.size()); - backward_max_dist.reserve(cluster.size()); - - for (auto& mem_hit : cluster) { - // get the start position of the MEM - positions.push_back(mem_hit.second); - // search far enough away to get any hit detectable without soft clipping - forward_max_dist.push_back(aligner->longest_detectable_gap(alignment, mem_hit.first->end) - + (alignment.sequence().end() - mem_hit.first->begin)); - backward_max_dist.push_back(aligner->longest_detectable_gap(alignment, mem_hit.first->begin) - + (mem_hit.first->begin - alignment.sequence().begin())); - } - - - // TODO: a progressive expansion of the subgraph if the MEM hit is already contained in - // a cluster graph somewhere? 
- - // extract the subgraph within the search distance - - VG* cluster_graph = new VG(); - algorithms::extract_containing_graph(xindex, cluster_graph, positions, forward_max_dist, - backward_max_dist); - Graph& graph = cluster_graph->graph; - - // check if this subgraph overlaps with any previous subgraph (indicates a probable clustering failure where // one cluster was split into multiple clusters) unordered_set overlapping_graphs; - if (!suppress_cluster_merging) { - for (size_t j = 0; j < graph.node_size(); j++) { - id_t node_id = graph.node(j).id(); + if (!do_merge_suppression) { + get<0>(cluster_graph)->for_each_handle([&](const handle_t& handle) { + id_t node_id = get<0>(cluster_graph)->get_id(handle); if (node_id_to_cluster.count(node_id)) { overlapping_graphs.insert(node_id_to_cluster[node_id]); } else { node_id_to_cluster[node_id] = i; } - } + return true; + }); } else { // assign the hits to clusters - for (auto& mem_hit : cluster) { + for (auto& mem_hit : cluster.first) { hit_to_cluster[mem_hit] = i; } } @@ -2605,7 +5710,7 @@ namespace vg { #ifdef debug_multipath_mapper cerr << "cluster graph does not overlap with any other cluster graphs, adding as cluster " << i << endl; #endif - cluster_graphs[i] = cluster_graph; + cluster_graphs[i] = move(cluster_graph); } else { // this graph overlaps at least one other graph, so we merge them into one @@ -2627,33 +5732,44 @@ namespace vg { cerr << "merging as cluster " << remaining_idx << endl; #endif - VG* merging_graph; + bdsg::HashGraph* merging_graph; + bool all_connected; + double multiplicity; if (remaining_idx == i) { // the new graph was chosen to remain, so add it to the record - cluster_graphs[i] = cluster_graph; - merging_graph = cluster_graph; + cluster_graphs[i] = move(cluster_graph); + merging_graph = get<0>(cluster_graph).get(); + all_connected = get<1>(cluster_graph); + multiplicity = get<2>(cluster_graph); } else { // the new graph will be merged into an existing graph - merging_graph = cluster_graphs[remaining_idx]; - merging_graph->extend(graph); - delete cluster_graph; + merging_graph = get<0>(cluster_graphs[remaining_idx]).get(); + + // add in the new graph + handlealgs::extend(get<0>(cluster_graph).get(), merging_graph); + all_connected = get<1>(cluster_graphs[remaining_idx]) && get<1>(cluster_graph); + multiplicity = min(get<2>(cluster_graphs[remaining_idx]), get<2>(cluster_graph)); } // merge any other chained graphs into the remaining graph for (size_t j : overlapping_graphs) { if (j != remaining_idx) { - VG* removing_graph = cluster_graphs[j]; - merging_graph->extend(removing_graph->graph); - delete removing_graph; + auto removing_graph = move(cluster_graphs[j]); + handlealgs::extend(get<0>(removing_graph).get(), merging_graph); + all_connected = all_connected && get<1>(removing_graph); + multiplicity = min(multiplicity, get<2>(removing_graph)); cluster_graphs.erase(j); } } + get<1>(cluster_graphs[remaining_idx]) = all_connected; + get<2>(cluster_graphs[remaining_idx]) = multiplicity; - Graph& merged_graph = merging_graph->graph; - for (size_t j = 0; j < merged_graph.node_size(); j++) { - node_id_to_cluster[merged_graph.node(j).id()] = remaining_idx; - } + // update the node-to-cluster mapping + merging_graph->for_each_handle([&](const handle_t& handle) { + node_id_to_cluster[merging_graph->get_id(handle)] = remaining_idx; + return true; + }); } } @@ -2665,10 +5781,12 @@ namespace vg { unordered_map> multicomponent_splits; size_t max_graph_idx = 0; - for (const pair cluster_graph : cluster_graphs) { - vector> 
connected_components = algorithms::weakly_connected_components(cluster_graph.second); - if (connected_components.size() > 1) { - multicomponent_graphs.emplace_back(cluster_graph.first, std::move(connected_components)); + for (const auto& cluster_graph : cluster_graphs) { + if (!get<1>(cluster_graph.second)) { + vector> connected_components = handlealgs::weakly_connected_components(get<0>(cluster_graph.second).get()); + if (connected_components.size() > 1) { + multicomponent_graphs.emplace_back(cluster_graph.first, std::move(connected_components)); + } } max_graph_idx = max(cluster_graph.first, max_graph_idx); } @@ -2689,52 +5807,48 @@ namespace vg { #endif for (size_t i = 0; i < multicomponent_graph.second.size(); i++) { - cluster_graphs[max_graph_idx + i] = new VG(); + cluster_graphs[max_graph_idx + i] = make_tuple(unique_ptr(new bdsg::HashGraph()), true, + get<2>(cluster_graphs[multicomponent_graph.first])); } - Graph& joined_graph = cluster_graphs[multicomponent_graph.first]->graph; - // divvy up the nodes - for (size_t i = 0; i < joined_graph.node_size(); i++) { - const Node& node = joined_graph.node(i); + auto joined_graph = get<0>(cluster_graphs[multicomponent_graph.first]).get(); + joined_graph->for_each_handle([&](const handle_t& handle) { for (size_t j = 0; j < multicomponent_graph.second.size(); j++) { - if (multicomponent_graph.second[j].count(node.id())) { - cluster_graphs[max_graph_idx + j]->add_node(node); + if (multicomponent_graph.second[j].count(joined_graph->get_id(handle))) { + get<0>(cluster_graphs[max_graph_idx + j])->create_handle(joined_graph->get_sequence(handle), + joined_graph->get_id(handle)); // if we're suppressing cluster merging, we don't maintain this index - if (!suppress_cluster_merging) { - node_id_to_cluster[node.id()] = max_graph_idx + j; + if (!do_merge_suppression) { + node_id_to_cluster[joined_graph->get_id(handle)] = max_graph_idx + j; } break; } } - } + return true; + }); // divvy up the edges - for (size_t i = 0; i < joined_graph.edge_size(); i++) { - const Edge& edge = joined_graph.edge(i); + joined_graph->for_each_edge([&](const edge_t& edge) { for (size_t j = 0; j < multicomponent_graph.second.size(); j++) { - if (multicomponent_graph.second[j].count(edge.from())) { - cluster_graphs[max_graph_idx + j]->add_edge(edge); + if (multicomponent_graph.second[j].count(joined_graph->get_id(edge.first))) { + auto comp_graph = get<0>(cluster_graphs[max_graph_idx + j]).get(); + comp_graph->create_edge(comp_graph->get_handle(joined_graph->get_id(edge.first), + joined_graph->get_is_reverse(edge.first)), + comp_graph->get_handle(joined_graph->get_id(edge.second), + joined_graph->get_is_reverse(edge.second))); break; } } - } - -#ifdef debug_multipath_mapper - cerr << "split graphs:" << endl; - for (size_t i = 0; i < multicomponent_graph.second.size(); i++) { - cerr << "component " << max_graph_idx + i << ":" << endl; - cerr << pb2json(cluster_graphs[max_graph_idx + i]->graph) << endl; - } -#endif + return true; + }); // remove the old graph - delete cluster_graphs[multicomponent_graph.first]; cluster_graphs.erase(multicomponent_graph.first); - if (suppress_cluster_merging) { + if (do_merge_suppression) { // we need to re-assign the hits to the new cluster graphs - for (auto& mem_hit : clusters[multicomponent_graph.first]) { + for (auto& mem_hit : clusters[multicomponent_graph.first].first) { for (size_t i = 0; i < multicomponent_graph.second.size(); i++) { if (multicomponent_graph.second[i].count(id(mem_hit.second))) { hit_to_cluster[mem_hit] = max_graph_idx + 
i; @@ -2751,26 +5865,28 @@ namespace vg { // vector each MEM cluster ended up in cluster_graphs_out.reserve(cluster_graphs.size()); unordered_map cluster_to_idx; - for (const auto& cluster_graph : cluster_graphs) { + for (pair, bool, double>>& cluster_graph : cluster_graphs) { + cluster_to_idx[cluster_graph.first] = cluster_graphs_out.size(); + cluster_graphs_out.emplace_back(); + get<0>(cluster_graphs_out.back()) = move(get<0>(cluster_graph.second)); + get<1>(cluster_graphs_out.back()).second = get<2>(cluster_graph.second); #ifdef debug_multipath_mapper - cerr << "adding cluster graph " << cluster_graph.first << " to return vector at index " << cluster_graphs_out.size() << endl; + cerr << "adding cluster graph " << cluster_graph.first << " to return vector at index " << cluster_graphs_out.size() << " with multiplicity " << get<1>(cluster_graphs_out.back()).second << endl; #endif - cluster_to_idx[cluster_graph.first] = cluster_graphs_out.size(); - cluster_graphs_out.emplace_back(cluster_graph.second, memcluster_t(), 0); } #ifdef debug_multipath_mapper cerr << "computing MEM assignments to cluster graphs" << endl; #endif - if (!suppress_cluster_merging) { + if (!do_merge_suppression) { // which MEMs are in play for which cluster? for (const MaximalExactMatch& mem : mems) { for (gcsa::node_type hit : mem.nodes) { id_t node_id = gcsa::Node::id(hit); if (node_id_to_cluster.count(node_id)) { size_t cluster_idx = cluster_to_idx[node_id_to_cluster[node_id]]; - get<1>(cluster_graphs_out[cluster_idx]).push_back(make_pair(&mem, make_pos_t(hit))); + get<1>(cluster_graphs_out[cluster_idx]).first.push_back(make_pair(&mem, make_pos_t(hit))); #ifdef debug_multipath_mapper cerr << "\tMEM " << mem.sequence() << " at " << make_pos_t(hit) << " found in cluster " << node_id_to_cluster[node_id] << " at index " << cluster_idx << endl; #endif @@ -2791,10 +5907,11 @@ namespace vg { // identify all of the clusters that contain each node unordered_map> node_id_to_cluster_idxs; for (size_t i = 0; i < cluster_graphs_out.size(); i++) { - Graph& graph = get<0>(cluster_graphs_out[i])->graph; - for (size_t j = 0; j < graph.node_size(); j++) { - node_id_to_cluster_idxs[graph.node(j).id()].push_back(i); - } + auto cluster_graph = get<0>(cluster_graphs_out[i]).get(); + cluster_graph->for_each_handle([&](const handle_t& handle){ + node_id_to_cluster_idxs[cluster_graph->get_id(handle)].push_back(i); + return true; + }); } for (const MaximalExactMatch& mem : mems) { @@ -2803,7 +5920,7 @@ namespace vg { // force the hits that generated a cluster to be assigned to it auto iter = hit_to_cluster.find(mem_hit); if (iter != hit_to_cluster.end()) { - get<1>(cluster_graphs_out[cluster_to_idx[iter->second]]).push_back(mem_hit); + get<1>(cluster_graphs_out[cluster_to_idx[iter->second]]).first.push_back(mem_hit); #ifdef debug_multipath_mapper cerr << "\tMEM " << mem.sequence() << " at " << mem_hit.second << " assigned as seed to cluster at index " << cluster_to_idx[iter->second] << endl; #endif @@ -2813,7 +5930,7 @@ namespace vg { auto id_iter = node_id_to_cluster_idxs.find(id(mem_hit.second)); if (id_iter != node_id_to_cluster_idxs.end()) { for (size_t cluster_idx : id_iter->second) { - get<1>(cluster_graphs_out[cluster_idx]).push_back(mem_hit); + get<1>(cluster_graphs_out[cluster_idx]).first.push_back(mem_hit); #ifdef debug_multipath_mapper cerr << "\tMEM " << mem.sequence() << " at " << mem_hit.second << " found in cluster at index " << cluster_idx << endl; #endif @@ -2827,179 +5944,359 @@ namespace vg { // compute the read coverage 
of each cluster graph and sort the assigned MEMs by length // and then lexicographically by read index for (size_t i = 0; i < cluster_graphs_out.size(); i++) { - auto& cluster_graph = cluster_graphs_out[i]; - get<2>(cluster_graph) = read_coverage(get<1>(cluster_graph)); -#ifdef debug_multipath_mapper - cerr << "compute read coverage of cluster at index " << i << " to be " << get<2>(cluster_graph) << endl; -#endif - sort(get<1>(cluster_graph).begin(), get<1>(cluster_graph).end(), - [](const pair& hit_1, - const pair& hit_2) { - return hit_1.first->length() > hit_2.first->length() || - (hit_1.first->length() == hit_2.first->length() && - (hit_1.first->begin < hit_2.first->begin || - (hit_1.first->begin == hit_2.first->begin && hit_1.first->end < hit_2.first->end))); - }); + set_read_coverage(cluster_graphs_out[i]); +#ifdef debug_multipath_mapper + cerr << "compute read coverage of cluster at index " << i << " to be " << get<2>(cluster_graphs_out[i]) << endl; +#endif } // find the node ID range for the cluster graphs to help set up a stable, system-independent ordering // note: technically this is not quite a total ordering, but it should be close to one - unordered_map> node_range; - node_range.reserve(cluster_graphs_out.size()); + unordered_map graph_hash; + graph_hash.reserve(cluster_graphs_out.size()); for (const auto& cluster_graph : cluster_graphs_out) { - node_range[get<0>(cluster_graph)] = make_pair(get<0>(cluster_graph)->min_node_id(), - get<0>(cluster_graph)->max_node_id()); + graph_hash[get<0>(cluster_graph).get()] = wang_hash>()(make_pair(get<0>(cluster_graph)->min_node_id(), + get<0>(cluster_graph)->max_node_id())); } // sort the cluster graphs descending by unique sequence coverage, breaking ties by scrambling according to a hash - stable_sort(cluster_graphs_out.begin(), cluster_graphs_out.end(), - [&](const clustergraph_t& cluster_graph_1, - const clustergraph_t& cluster_graph_2) { - return (get<2>(cluster_graph_1) > get<2>(cluster_graph_2) || - (get<2>(cluster_graph_1) == get<2>(cluster_graph_2) && - wang_hash>()(node_range[get<0>(cluster_graph_1)]) < wang_hash>()(node_range[get<0>(cluster_graph_2)]))); - }); - - return move(cluster_graphs_out); + sort(cluster_graphs_out.begin(), cluster_graphs_out.end(), [&](const clustergraph_t& cluster_graph_1, + const clustergraph_t& cluster_graph_2) { + return (get<2>(cluster_graph_1) > get<2>(cluster_graph_2) || + (get<2>(cluster_graph_1) == get<2>(cluster_graph_2) && + graph_hash[get<0>(cluster_graph_1).get()] < graph_hash[get<0>(cluster_graph_2).get()])); + }); + + return cluster_graphs_out; + } + + bool MultipathMapper::expand_for_softclips(clustergraph_t& cluster_graph, + const multipath_alignment_t& multipath_aln) const { + + // TODO: this post-hoc check might make it possible to extract less graph initially... 
+ // but that could get fiddly when there is a mismatch at the graph boundary (makes soft clip) + // TODO: this should work on non-branching path tips, not node tips + + unordered_map expansion_points; + auto& graph = *get<0>(cluster_graph); + + // find expansion points on source subpaths + for (auto i : multipath_aln.start()) { + const auto& mapping = multipath_aln.subpath(i).path().mapping().front(); + const auto& edit = mapping.edit().front(); + if (edit.from_length() == 0 && edit.to_length() != 0) { + // this is soft-clipped + size_t max_len = get_aligner()->longest_detectable_gap(multipath_aln.sequence().size(), edit.to_length()); + if (mapping.position().offset() < max_len) { + // the clipped sequence's alignment might not fit on this node + handle_t handle = graph.get_handle(mapping.position().node_id(), mapping.position().is_reverse()); + bool is_source = graph.follow_edges(handle, true, [](const handle_t& h) {return false;}); + if (is_source) { + // this soft clip might arise because we didn't extract enough graph + auto flipped = graph.flip(handle); + expansion_points[flipped] = max(expansion_points[flipped], + max_len - mapping.position().offset()); + } + } + } + } + // find expansion points on sink subpaths + for (const auto& subpath : multipath_aln.subpath()) { + if (subpath.next().empty()) { + const auto& mapping = subpath.path().mapping().back(); + const auto& edit = mapping.edit().back(); + if (edit.from_length() == 0 && edit.to_length() != 0) { + // this is soft-clipped + handle_t handle = graph.get_handle(mapping.position().node_id(), mapping.position().is_reverse()); + int64_t final_offset = mapping.position().offset() + mapping_from_length(mapping); + size_t max_len = get_aligner()->longest_detectable_gap(multipath_aln.sequence().size(), edit.to_length()); + if (graph.get_length(handle) - final_offset < max_len) { + // the clipped sequence's alignment might not fit on this node + bool is_sink = graph.follow_edges(handle, false, [](const handle_t& h) {return false;}); + if (is_sink) { + // this soft clip might arise because we didn't extract enough graph + expansion_points[handle] = max(expansion_points[handle], + max_len - (graph.get_length(handle) - final_offset)); + } + } + } + } + } + if (!expansion_points.empty()) { + int64_t num_edges_before_expansion = graph.get_edge_count(); + for (const pair& expansion_point : expansion_points) { + // we might need to extend the subgraph here to get a full alignment + algorithms::locally_expand_graph(*xindex, graph, expansion_point.first, expansion_point.second); + } + return num_edges_before_expansion != graph.get_edge_count(); + } + else { + return false; + } } - void MultipathMapper::multipath_align(const Alignment& alignment, VG* vg, - memcluster_t& graph_mems, - MultipathAlignment& multipath_aln_out) const { + void MultipathMapper::multipath_align(const Alignment& alignment, clustergraph_t& cluster_graph, + multipath_alignment_t& multipath_aln_out, + const match_fanouts_t* fanouts) const { + // we put this in a loop so that we can check to make sure we didn't miss part of the + // alignment because there wasn't enough graph extracted + bool new_graph_material = true; + while (new_graph_material) { + // there are parts of this graph we haven't tried to align to yet + + multipath_aln_out.mutable_subpath()->clear(); + multipath_aln_out.mutable_start()->clear(); + + auto graph = get<0>(cluster_graph).get(); + auto& graph_mems = get<1>(cluster_graph); + +#ifdef debug_multipath_mapper_alignment + cerr << "constructing alignment 
graph for cluster of " << get<1>(cluster_graph).first.size() << " hits" << endl; +#endif + + if (graph_mems.first.empty()) { +#ifdef debug_multipath_mapper_alignment + cerr << "cluster is empty, aborting" << endl; +#endif + transfer_read_metadata(alignment, multipath_aln_out); + return; + } + + // the longest path we could possibly align to (full gap and a full sequence) + auto aligner = get_aligner(!alignment.quality().empty()); + size_t target_length = alignment.sequence().size() + min(aligner->longest_detectable_gap(alignment), max_alignment_gap); + + // check if we can get away with using only one strand of the graph + bool use_single_stranded = handlealgs::is_single_stranded(graph); + bool mem_strand = false; + if (use_single_stranded) { + mem_strand = is_rev(graph_mems.first[0].second); + for (size_t i = 1; i < graph_mems.first.size(); i++) { + if (is_rev(graph_mems.first[i].second) != mem_strand) { + use_single_stranded = false; + break; + } + } + } + + // make the graph we need to align to +#ifdef debug_multipath_mapper_alignment + cerr << "use_single_stranded: " << use_single_stranded << " mem_strand: " << mem_strand << endl; +#endif + +#ifdef debug_multipath_mapper_alignment + cerr << "initial alignment graph with " << graph->get_node_count() << " nodes and " << graph->get_edge_count() << ":" << endl; + graph->for_each_handle([&](const handle_t& h) { + cerr << graph->get_id(h) << " " << graph->get_sequence(h) << endl; + graph->follow_edges(h, false, [&](const handle_t& n) { + cerr << "\t-> " << graph->get_id(n) << " " << (graph->get_is_reverse(n) ? "-" : "+") << endl; + }); + graph->follow_edges(h, true, [&](const handle_t& n) { + cerr << "\t " << graph->get_id(n) << " " << (graph->get_is_reverse(n) ? "-" : "+") << " <-" << endl; + }); + }); +#endif + + // make our options for single stranded graphs + IdentityOverlay fwd_graph(graph); + ReverseGraph rev_graph(graph, true); + StrandSplitGraph split_graph(graph); + + // choose which one we want + ExpandingOverlayGraph* align_digraph = nullptr; + if (!use_single_stranded) { + align_digraph = &split_graph; + } + else if (mem_strand) { + align_digraph = &rev_graph; + } + else { + align_digraph = &fwd_graph; + } + + // if necessary, convert from cyclic to acylic (can be expensive to construct, only do it + // if we need to) + + IdentityOverlay undagified(align_digraph); + unique_ptr dagified; + + ExpandingOverlayGraph* align_dag = nullptr; + if (handlealgs::is_directed_acyclic(align_digraph)) { + align_dag = &undagified; + } + else { +#ifdef debug_multipath_mapper_alignment + cerr << "graph contains directed cycles, performing dagification" << endl; +#endif + dagified = unique_ptr(new DagifiedGraph(align_digraph, target_length, max_dagify_duplications)); + align_dag = dagified.get(); + } + + // a function to translate from the transformed graphs ID space to the original graph's + function(id_t)> translator = [&](const id_t node_id) { + handle_t original = align_digraph->get_underlying_handle(align_dag->get_underlying_handle(align_dag->get_handle(node_id))); + return make_pair(graph->get_id(original), graph->get_is_reverse(original)); + }; + +#ifdef debug_multipath_mapper_alignment + cerr << "final alignment graph of size " << align_dag->get_node_count() << " nodes and " << align_dag->get_edge_count() << ":" << endl; + align_dag->for_each_handle([&](const handle_t& h) { + auto tr = translator(align_dag->get_id(h)); + cerr << align_dag->get_id(h) << " (" << tr.first << (tr.second ? 
"-" : "+") << ") " << align_dag->get_sequence(h) << endl; + align_dag->follow_edges(h, false, [&](const handle_t& n) { + cerr << "\t-> " << align_dag->get_id(n) << " " << (align_dag->get_is_reverse(n) ? "-" : "+") << endl; + }); + align_dag->follow_edges(h, true, [&](const handle_t& n) { + cerr << "\t " << align_dag->get_id(n) << " " << (align_dag->get_is_reverse(n) ? "-" : "+") << " <-" << endl; + }); + }); +#endif + + // construct a graph that summarizes reachability between MEMs + +#ifdef debug_multipath_mapper_alignment + cerr << "making multipath alignment MEM graph" << endl; +#endif + + vector hit_provenance; + MultipathAlignmentGraph multi_aln_graph(*align_dag, graph_mems, translator, hit_provenance, + max_branch_trim_length, gcsa, fanouts); + + { + // Compute a topological order over the graph + vector topological_order; + multi_aln_graph.topological_sort(topological_order); + + // it's sometimes possible for transitive edges to survive the original construction algorithm, so remove them + multi_aln_graph.remove_transitive_edges(topological_order); + + // prune this graph down the paths that have reasonably high likelihood + size_t size_before_prune = multi_aln_graph.size(); + multi_aln_graph.prune_to_high_scoring_paths(alignment, aligner, max_suboptimal_path_score_ratio, + topological_order, hit_provenance); + + if (multi_aln_graph.size() != size_before_prune && do_spliced_alignment) { + // we pruned away some path nodes, so let's check if we pruned away any entire hits + // and, if so, un-claim them from this cluster + + vector found(graph_mems.first.size(), false); + for (auto i : hit_provenance) { + found[i] = true; + } + + // if necessary remove the ones we didn't keep + size_t removed = 0; + for (size_t i = 0; i < graph_mems.first.size(); ++i) { + if (!found[i]) { +#ifdef debug_multipath_alignment + cerr << "completely pruned hit from cluster: " << graph_mems.first[i].second << " " << graph_mems.first[i].first->sequence() << endl; +#endif + ++removed; + } + else if (removed) { + graph_mems.first[i - removed] = graph_mems.first[i]; + } + } + graph_mems.first.resize(graph_mems.first.size() - removed); + + // we may need to recompute the coverage of the cluster because some MEMs were pruned out of it + set_read_coverage(cluster_graph); + } + } + + if (snarl_manager || distance_index) { + // We want to do snarl cutting + + if (!suppress_tail_anchors) { + #ifdef debug_multipath_mapper_alignment - cerr << "constructing alignment graph" << endl; + cerr << "Synthesizing tail anchors for snarl cutting" << endl; #endif - - // the longest path we could possibly align to (full gap and a full sequence) - size_t target_length = alignment.sequence().size() + get_aligner()->longest_detectable_gap(alignment); - - // convert from bidirected to directed - unordered_map > node_trans; - VG align_graph; - - // check if we can get away with using only one strand of the graph - bool use_single_stranded = algorithms::is_single_stranded(vg); - bool mem_strand = false; - if (use_single_stranded) { - mem_strand = is_rev(graph_mems[0].second); - for (size_t i = 1; i < graph_mems.size(); i++) { - if (is_rev(graph_mems[i].second) != mem_strand) { - use_single_stranded = false; - break; + + // Make fake anchor paths to cut the snarls out of in the tails + multi_aln_graph.synthesize_tail_anchors(alignment, *align_dag, aligner, min_tail_anchor_length, num_alt_alns, + false, max_alignment_gap, + use_pessimistic_tail_alignment ? 
pessimistic_gap_multiplier : 0.0); + } - } - } - - // make the graph we need to align to + #ifdef debug_multipath_mapper_alignment - cerr << "use_single_stranded: " << use_single_stranded << " mem_strand: " << mem_strand << endl; + cerr << "MultipathAlignmentGraph going into snarl cutting:" << endl; + multi_aln_graph.to_dot(cerr, &alignment); #endif - if (use_single_stranded) { - if (mem_strand) { - align_graph = vg->reverse_complement_graph(node_trans); - } - else { - // if we are using only the forward strand of the current graph, a make trivial node translation so - // the later code's expectations are met - // TODO: can we do this without the copy constructor? - align_graph = *vg; - vg->identity_translation(node_trans); + + // Do the snarl cutting, which modifies the nodes in the multipath alignment graph + if (max_snarl_cut_size) { + multi_aln_graph.resect_snarls_from_paths(snarl_manager, distance_index, translator, max_snarl_cut_size); + } } - } - else { - node_trans = algorithms::split_strands(vg, &align_graph); - } - - // if necessary, convert from cyclic to acylic - if (!algorithms::is_directed_acyclic(vg)) { - unordered_map > dagify_trans; - align_graph = align_graph.dagify(target_length, // high enough that num SCCs is never a limiting factor - dagify_trans, - target_length, - 0); // no maximum on size of component - node_trans = align_graph.overlay_node_translations(dagify_trans, node_trans); - } - - // put the internal graph in topological order for the MultipathAlignmentGraph algorithm - algorithms::lazier_sort(&align_graph); - + + #ifdef debug_multipath_mapper_alignment - cerr << "making multipath alignment MEM graph" << endl; + cerr << "MultipathAlignmentGraph going into alignment:" << endl; + multi_aln_graph.to_dot(cerr, &alignment); + + for (auto& ids : multi_aln_graph.get_connected_components()) { + cerr << "Component: "; + for (auto& id : ids) { + cerr << id << " "; + } + cerr << endl; + } #endif - - // construct a graph that summarizes reachability between MEMs - // First we need to reverse node_trans - auto node_inj = MultipathAlignmentGraph::create_injection_trans(node_trans); - MultipathAlignmentGraph multi_aln_graph(align_graph, graph_mems, node_trans, node_inj, gcsa); - - { - // Compute a topological order over the graph - vector topological_order; - multi_aln_graph.topological_sort(topological_order); - // it's sometimes possible for transitive edges to survive the original construction algorithm, so remove them - multi_aln_graph.remove_transitive_edges(topological_order); + function choose_band_padding = [&](const Alignment& seq, const HandleGraph& graph) { + size_t read_length = seq.sequence().size(); + return read_length < band_padding_memo.size() ? band_padding_memo.at(read_length) + : size_t(band_padding_multiplier * sqrt(read_length)) + 1; + }; + + // do the connecting alignments and fill out the multipath_alignment_t object + multi_aln_graph.align(alignment, *align_dag, aligner, true, num_alt_alns, dynamic_max_alt_alns, max_alignment_gap, + use_pessimistic_tail_alignment ? pessimistic_gap_multiplier : 0.0, simplify_topologies, + max_tail_merge_supress_length, choose_band_padding, multipath_aln_out, snarl_manager, + distance_index, &translator); + + // Note that we do NOT topologically order the multipath_alignment_t. The + // caller has to do that, after it is finished breaking it up into + // connected components or whatever. 
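The `choose_band_padding` callback above applies a sqrt-of-read-length rule, with a memo table consulted for short reads. A standalone illustration follows; the memo size and multiplier here are arbitrary, not vg's configured values.

```cpp
// Illustrative sketch of the band padding rule; constants are assumptions.
#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
    const double band_padding_multiplier = 1.0;

    // a memo just caches the same formula for short read lengths
    std::vector<size_t> band_padding_memo(501);
    for (size_t len = 0; len < band_padding_memo.size(); ++len) {
        band_padding_memo[len] = static_cast<size_t>(band_padding_multiplier * std::sqrt(static_cast<double>(len))) + 1;
    }

    // look short reads up in the memo, fall back to the closed form for long ones
    auto choose_band_padding = [&](size_t read_length) {
        return read_length < band_padding_memo.size()
               ? band_padding_memo[read_length]
               : static_cast<size_t>(band_padding_multiplier * std::sqrt(static_cast<double>(read_length))) + 1;
    };

    std::cout << choose_band_padding(150) << " " << choose_band_padding(10000) << std::endl;
    return 0;
}
```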
- // prune this graph down the paths that have reasonably high likelihood - multi_aln_graph.prune_to_high_scoring_paths(alignment, get_aligner(), - max_suboptimal_path_score_ratio, topological_order); - } - - if (snarl_manager) { - // We want to do snarl cutting - - // Do the snarl cutting, which modifies the nodes in the multipath alignment graph - multi_aln_graph.resect_snarls_from_paths(snarl_manager, node_trans, max_snarl_cut_size); - } - #ifdef debug_multipath_mapper_alignment - cerr << "MultipathAlignmentGraph going into alignment:" << endl; - multi_aln_graph.to_dot(cerr); - - for (auto& ids : multi_aln_graph.get_connected_components()) { - cerr << "Component: "; - for (auto& id : ids) { - cerr << id << " "; - } - cerr << endl; - } + cerr << "multipath alignment before translation: " << debug_string(multipath_aln_out) << endl; #endif - - function choose_band_padding = [&](const Alignment& seq, const HandleGraph& graph) { - size_t read_length = seq.sequence().end() - seq.sequence().begin(); - return read_length < band_padding_memo.size() ? band_padding_memo.at(read_length) - : size_t(band_padding_multiplier * sqrt(read_length)) + 1; - }; - - // do the connecting alignments and fill out the MultipathAlignment object - multi_aln_graph.align(alignment, align_graph, get_aligner(), true, num_alt_alns, dynamic_max_alt_alns, choose_band_padding, multipath_aln_out); - - + for (size_t j = 0; j < multipath_aln_out.subpath_size(); j++) { + translate_oriented_node_ids(*multipath_aln_out.mutable_subpath(j)->mutable_path(), translator); + } + #ifdef debug_multipath_mapper_alignment - cerr << "multipath alignment before translation: " << pb2json(multipath_aln_out) << endl; + cerr << "completed multipath alignment: " << debug_string(multipath_aln_out) << endl; #endif - for (size_t j = 0; j < multipath_aln_out.subpath_size(); j++) { - translate_oriented_node_ids(*multipath_aln_out.mutable_subpath(j)->mutable_path(), node_trans); - } - + + // check if we need to expand the graph to get a full alignment + new_graph_material = expand_for_softclips(cluster_graph, multipath_aln_out); #ifdef debug_multipath_mapper_alignment - cerr << "completed multipath alignment: " << pb2json(multipath_aln_out) << endl; + if (new_graph_material) { + cerr << "found soft clip near graph boundary, expanding subraph and re-aligning" << endl; + } #endif + } } - void MultipathMapper::make_nontrivial_multipath_alignment(const Alignment& alignment, VG& subgraph, - unordered_map>& translator, - SnarlManager& snarl_manager, MultipathAlignment& multipath_aln_out) const { + void MultipathMapper::make_nontrivial_multipath_alignment(const Alignment& alignment, const HandleGraph& subgraph, + const function(id_t)>& translator, + multipath_alignment_t& multipath_aln_out) const { #ifdef debug_multipath_mapper_alignment cerr << "attempting to make nontrivial alignment for " << alignment.name() << endl; #endif + auto aligner = get_aligner(!alignment.quality().empty()); + // create an alignment graph with the internals of snarls removed - MultipathAlignmentGraph multi_aln_graph(subgraph, alignment, snarl_manager, max_snarl_cut_size, translator); + MultipathAlignmentGraph multi_aln_graph(subgraph, alignment, snarl_manager, distance_index, max_snarl_cut_size, translator); // remove any transitive edges that may have found their way in there // TODO: is this necessary? all edges should be across snarls, how could they be transitive? from trimmed indels maybe? 
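Once alignment against the single-stranded/dagified overlay finishes, each subpath position is translated back into the original graph's ID space (the `translate_oriented_node_ids` calls above). Here is a minimal, self-contained picture of that translation step, using a hypothetical `Position` struct and a plain hash map in place of vg's translator lambda and path types.

```cpp
// Minimal illustration of post-alignment ID translation; types are stand-ins.
#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <utility>
#include <vector>

struct Position {
    int64_t node_id;
    bool is_reverse;
};

void translate_oriented_positions(std::vector<Position>& path,
                                  const std::unordered_map<int64_t, std::pair<int64_t, bool>>& translator) {
    for (Position& pos : path) {
        const auto& tr = translator.at(pos.node_id);
        pos.node_id = tr.first;
        // the orientation of the transformed node composes with the stored orientation
        pos.is_reverse = (pos.is_reverse != tr.second);
    }
}

int main() {
    // transformed node 11 is the reverse-strand copy of original node 2
    std::unordered_map<int64_t, std::pair<int64_t, bool>> translator{{10, {1, false}}, {11, {2, true}}};
    std::vector<Position> path{{10, false}, {11, false}};
    translate_oriented_positions(path, translator);
    for (const auto& pos : path) {
        std::cout << pos.node_id << (pos.is_reverse ? "-" : "+") << " ";
    }
    std::cout << std::endl;
    return 0;
}
```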
@@ -3014,8 +6311,10 @@ namespace vg { : size_t(band_padding_multiplier * sqrt(read_length)) + 1; }; - // do the connecting alignments and fill out the MultipathAlignment object - multi_aln_graph.align(alignment, subgraph, get_aligner(), false, num_alt_alns, dynamic_max_alt_alns, choose_band_padding, multipath_aln_out); + // do the connecting alignments and fill out the multipath_alignment_t object + multi_aln_graph.align(alignment, subgraph, aligner, false, num_alt_alns, dynamic_max_alt_alns, max_alignment_gap, + use_pessimistic_tail_alignment ? pessimistic_gap_multiplier : 0.0, simplify_topologies, + max_tail_merge_supress_length, choose_band_padding, multipath_aln_out); for (size_t j = 0; j < multipath_aln_out.subpath_size(); j++) { translate_oriented_node_ids(*multipath_aln_out.mutable_subpath(j)->mutable_path(), translator); @@ -3024,48 +6323,64 @@ namespace vg { topologically_order_subpaths(multipath_aln_out); #ifdef debug_multipath_mapper_alignment - cerr << "completed multipath alignment: " << pb2json(multipath_aln_out) << endl; + cerr << "completed multipath alignment: " << debug_string(multipath_aln_out) << endl; #endif } - int64_t MultipathMapper::read_coverage(const memcluster_t& mem_hits) { - if (mem_hits.empty()) { - return 0; + void MultipathMapper::set_read_coverage(clustergraph_t& cluster_graph) { + + auto& mem_hits = get<1>(cluster_graph); + if (mem_hits.first.empty()) { + get<2>(cluster_graph) = 0; + return; + } + + // lexicographic comparison for the sort this algorithm needs + auto lex_cmp = [](const pair& hit_1, + const pair& hit_2) { + return (hit_1.first->begin < hit_2.first->begin || + (hit_1.first->begin == hit_2.first->begin && hit_1.first->end < hit_2.first->end)); + }; + + // length first comparison for later steps + auto length_first_cmp = [](const pair& hit_1, + const pair& hit_2) { + return (hit_1.first->length() > hit_2.first->length() || + (hit_1.first->length() == hit_2.first->length() && + (hit_1.first->begin < hit_2.first->begin || + (hit_1.first->begin == hit_2.first->begin && hit_1.first->end < hit_2.first->end)))); + }; + + + if (!is_sorted(get<1>(cluster_graph).first.begin(), get<1>(cluster_graph).first.end(), lex_cmp)) { + stable_sort(get<1>(cluster_graph).first.begin(), get<1>(cluster_graph).first.end(), lex_cmp); } vector> mem_read_segments; - mem_read_segments.reserve(mem_hits.size()); - for (auto& mem_hit : mem_hits) { + mem_read_segments.reserve(mem_hits.first.size()); + for (auto& mem_hit : mem_hits.first) { mem_read_segments.emplace_back(mem_hit.first->begin, mem_hit.first->end); } - std::sort(mem_read_segments.begin(), mem_read_segments.end()); - auto curr_begin = mem_read_segments[0].first; - auto curr_end = mem_read_segments[0].second; + get<2>(cluster_graph) = algorithms::count_covered(mem_read_segments); - int64_t total = 0; - for (size_t i = 1; i < mem_read_segments.size(); i++) { - if (mem_read_segments[i].first >= curr_end) { - total += (curr_end - curr_begin); - curr_begin = mem_read_segments[i].first; - curr_end = mem_read_segments[i].second; - } - else if (mem_read_segments[i].second > curr_end) { - curr_end = mem_read_segments[i].second; - } - } - return total + (curr_end - curr_begin); + stable_sort(get<1>(cluster_graph).first.begin(), get<1>(cluster_graph).first.end(), length_first_cmp); } - void MultipathMapper::strip_full_length_bonuses(MultipathAlignment& multipath_aln) const { + void MultipathMapper::strip_full_length_bonuses(multipath_alignment_t& multipath_aln) const { - int32_t full_length_bonus = 
get_aligner()->full_length_bonus; + // TODO: this could technically be wrong if only one read in a pair has qualities + const auto& aligner = *get_aligner(!multipath_aln.quality().empty()); // strip bonus from source paths if (multipath_aln.start_size()) { // use the precomputed list of sources if we have it for (size_t i = 0; i < multipath_aln.start_size(); i++) { - Subpath* source_subpath = multipath_aln.mutable_subpath(multipath_aln.start(i)); - if (edit_is_insertion(source_subpath->path().mapping(0).edit(0))) { - source_subpath->set_score(source_subpath->score() - full_length_bonus); + subpath_t* source_subpath = multipath_aln.mutable_subpath(multipath_aln.start(i)); + const edit_t& edit = source_subpath->path().mapping(0).edit(0); + if (edit.to_length() != 0 && edit.from_length() != 0) { + source_subpath->set_score(source_subpath->score() + - aligner.score_full_length_bonus(true, multipath_aln.sequence().begin(), + multipath_aln.sequence().end(), + multipath_aln.quality().begin())); } } } @@ -3073,7 +6388,7 @@ namespace vg { // find sources vector is_source(multipath_aln.subpath_size(), true); for (size_t i = 0; i < multipath_aln.subpath_size(); i++) { - const Subpath& subpath = multipath_aln.subpath(i); + const subpath_t& subpath = multipath_aln.subpath(i); for (size_t j = 0; j < subpath.next_size(); j++) { is_source[subpath.next(j)] = false; } @@ -3083,74 +6398,51 @@ namespace vg { if (!is_source[i]) { continue; } - Subpath* source_subpath = multipath_aln.mutable_subpath(i); - if (edit_is_insertion(source_subpath->path().mapping(0).edit(0))) { - source_subpath->set_score(source_subpath->score() - full_length_bonus); + subpath_t* source_subpath = multipath_aln.mutable_subpath(i); + const edit_t& edit = source_subpath->path().mapping(0).edit(0); + if (edit.to_length() != 0 && edit.from_length() != 0) { + source_subpath->set_score(source_subpath->score() + - aligner.score_full_length_bonus(true, multipath_aln.sequence().begin(), + multipath_aln.sequence().end(), + multipath_aln.quality().begin())); } } } // strip bonus from sink paths for (size_t i = 0; i < multipath_aln.subpath_size(); i++) { - Subpath* subpath = multipath_aln.mutable_subpath(i); + subpath_t* subpath = multipath_aln.mutable_subpath(i); if (subpath->next_size() == 0) { - const Mapping& final_mapping = subpath->path().mapping(subpath->path().mapping_size() - 1); - if (edit_is_insertion(final_mapping.edit(final_mapping.edit_size() - 1))) { - subpath->set_score(subpath->score() - full_length_bonus); + const path_mapping_t& final_mapping = subpath->path().mapping(subpath->path().mapping_size() - 1); + const edit_t& edit = final_mapping.edit(final_mapping.edit_size() - 1); + if (edit.to_length() != 0 && edit.from_length() != 0) { + subpath->set_score(subpath->score() + - aligner.score_full_length_bonus(false, multipath_aln.sequence().begin(), + multipath_aln.sequence().end(), + multipath_aln.quality().begin())); } } } } - - int32_t MultipathMapper::compute_raw_mapping_quality_from_scores(const vector& scores, MappingQualityMethod mapq_method) const { - - // We should never actually compute a MAPQ with the None method. If we try, it means something has gonbe wrong. - assert(mapq_method != None); - - // TODO: BaseAligner's mapping quality computation insists on sometimes appending a 0 to your score list. - // We don't want to take a mutable score list, so we copy it here. - // This can be removed when BaseAligner is fixed to take const score lists. 
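`strip_full_length_bonuses` above only subtracts an end bonus when the terminal edit of a source or sink subpath actually aligns read bases to the graph. The sketch below captures that shape with stand-in `Edit`/`Subpath` types and a single flat bonus value; the real code uses the aligner's quality-aware, per-end bonus on `multipath_alignment_t`.

```cpp
// Simplified sketch of full-length bonus stripping; types and bonus are stand-ins.
#include <cstdint>
#include <vector>

struct Edit { int32_t from_length; int32_t to_length; };
struct Subpath {
    std::vector<Edit> edits;  // flattened path edits, front() == leftmost
    std::vector<int> next;    // indices of successor subpaths
    int32_t score;
};

void strip_full_length_bonuses(std::vector<Subpath>& subpaths, int32_t full_length_bonus) {
    // a subpath is a source if nothing points to it
    std::vector<bool> is_source(subpaths.size(), true);
    for (const Subpath& s : subpaths) {
        for (int n : s.next) is_source[n] = false;
    }
    for (size_t i = 0; i < subpaths.size(); ++i) {
        // strip the left-end bonus from sources whose first edit is aligned
        if (is_source[i]) {
            const Edit& e = subpaths[i].edits.front();
            if (e.from_length != 0 && e.to_length != 0) subpaths[i].score -= full_length_bonus;
        }
        // strip the right-end bonus from sinks whose last edit is aligned
        if (subpaths[i].next.empty()) {
            const Edit& e = subpaths[i].edits.back();
            if (e.from_length != 0 && e.to_length != 0) subpaths[i].score -= full_length_bonus;
        }
    }
}

int main() {
    // subpath 0 feeds into subpath 1; 0 starts with an aligned edit, 1 ends with a soft clip
    std::vector<Subpath> subpaths{
        {{{10, 10}}, {1}, 25},  // source with aligned first edit: loses the left bonus
        {{{0, 5}}, {}, 5}       // sink ending in an insertion (soft clip): keeps its score
    };
    strip_full_length_bonuses(subpaths, 5);
    return (subpaths[0].score == 20 && subpaths[1].score == 5) ? 0 : 1;
}
```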
- vector mutable_scores(scores.begin(), scores.end()); - - int32_t raw_mapq; - if (mapping_quality_method == Adaptive) { - raw_mapq = get_aligner()->compute_mapping_quality(mutable_scores, mutable_scores.size() < 2 ? true : - (mutable_scores[1] < mutable_scores[0] - - get_aligner()->mapping_quality_score_diff(max_mapping_quality))); - } - else { - raw_mapq = get_aligner()->compute_mapping_quality(mutable_scores, mapping_quality_method == Approx); - } - - // arbitrary scaling, seems to help performance - raw_mapq *= mapq_scaling_factor; - -#ifdef debug_multipath_mapper - cerr << "scores yield a raw MAPQ of " << raw_mapq << endl; -#endif - return raw_mapq; - } - - void MultipathMapper::sort_and_compute_mapping_quality(vector& multipath_alns, - MappingQualityMethod mapq_method, - vector* cluster_idxs) const { - if (multipath_alns.empty()) { - return; - } + vector MultipathMapper::mapping_likelihoods(vector& multipath_alns) const { - // Only do the population MAPQ if it might disambiguate two paths (since it's not - // as cheap as just using the score), or if we set the setting to always do it. + // Only do the population MAPQ if it might disambiguate two paths + // (since it's not as cheap as just using the score), or if we set the + // setting to always do it. bool include_population_component = (use_population_mapqs && (multipath_alns.size() > 1 || always_check_population)); - // records whether all of the pathsenumerated across all multipath alignments followed the edges in the index - bool all_paths_pop_consistent = true; - - double log_base = get_aligner()->log_base; - - // The score of the optimal Alignment for each MultipathAlignment, not adjusted for population - vector base_scores(multipath_alns.size(), 0.0); - // The scores of the best Alignment for each MultipathAlignment, adjusted for population. - // These can be negativem but will be bumped up to all be positive later. + // Records whether, for each multipath alignment, at least one of the + // paths enumerated followed only edges in the index. We count totally + // unmapped reads as pop consistent, as all 0 edges they cross are pop + // consistent. + bool all_multipaths_pop_consistent = true; + + double log_base = get_aligner(!multipath_alns.front().quality().empty())->log_base; + + // The score of the optimal Alignment for each multipath_alignment_t, not adjusted for population + vector scores(multipath_alns.size(), 0.0); + // The scores of the best Alignment for each multipath_alignment_t, adjusted for population. + // These can be negative but will be bumped up to all be positive later. vector pop_adjusted_scores; if (include_population_component) { pop_adjusted_scores.resize(multipath_alns.size()); @@ -3159,7 +6451,7 @@ namespace vg { // We need to track the score adjustments so we can compensate for // negative values, turning the largest penalty into a 0 bonus. double min_adjustment = numeric_limits::max(); - + for (size_t i = 0; i < multipath_alns.size(); i++) { // Score all the multipath alignment candidates, optionally using @@ -3167,125 +6459,500 @@ namespace vg { // We will query the population database for this alignment if it // is turned on and it succeeded for the others. - bool query_population = include_population_component && all_paths_pop_consistent; + bool query_population = include_population_component && all_multipaths_pop_consistent; - // Generate the top alignment, or the top population_max_paths - // alignments if we are doing multiple alignments for population - // scoring. 
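
A few lines below, get_rr_memo supplies the recombination memo keyed by (recombination penalty, haplotype count); its definition near the end of this .cpp hunk stores the memos in a thread-local map so each thread keeps its own cache and no locking is needed. A minimal sketch of that parameter-keyed memoization pattern, with ExpensiveMemo standing in for the real haploMath::RRMemo:

```cpp
#include <cstddef>
#include <iostream>
#include <map>
#include <utility>

// Stand-in for an expensive-to-build scoring memo (the real code caches
// haploMath::RRMemo objects keyed the same way).
struct ExpensiveMemo {
    ExpensiveMemo(double penalty, size_t population) {
        std::cout << "building memo for penalty " << penalty
                  << ", population " << population << std::endl;
    }
};

// One cache per thread, keyed by the parameter pair, so a given memo is
// constructed at most once per thread.
ExpensiveMemo& get_memo(double penalty, size_t population) {
    thread_local std::map<std::pair<double, size_t>, ExpensiveMemo> memos;
    auto result = memos.try_emplace(std::make_pair(penalty, population), penalty, population);
    return result.first->second;
}

int main() {
    get_memo(9.0, 5008);
    get_memo(9.0, 5008); // reuses the cached memo; "building" prints only once
}
```
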
- auto wanted_alignments = query_population ? population_max_paths : 1; - auto alignments = optimal_alignments(multipath_alns[i], wanted_alignments); - assert(!alignments.empty()); + /// Get all the linearizations we are going to work with, possibly with duplicates. + /// The first alignment will be optimal. + vector alignments; + int32_t aln_score = -1; + + if (query_population) { + // We want to do population scoring + if (!top_tracebacks && haplo_score_provider->has_incremental_search()) { + // We can use incremental haplotype search to find all the linearizations consistent with haplotypes + // Make sure to also always include the optimal alignment first, even if inconsistent. + // And also include up to population_max_paths non-consistent but hopefully scorable paths + alignments = haplotype_consistent_alignments(multipath_alns[i], *haplo_score_provider, population_max_paths, + population_paths_hard_cap, true); + } else { + // We will just find the top n best-alignment-scoring linearizations and hope some match haplotypes + alignments = optimal_alignments(multipath_alns[i], population_max_paths); + } + + aln_score = alignments.front().score(); + } else { + // Just compute a single optimal alignment + aln_score = optimal_alignment_score(multipath_alns[i]); + } #ifdef debug_multipath_mapper - cerr << "Got " << alignments.size() << " / " << wanted_alignments << " tracebacks for multipath " << i << endl; + cerr << "Got " << alignments.size() << " tracebacks for multipath " << i << endl; #endif #ifdef debug_multipath_mapper_alignment - cerr << pb2json(multipath_alns[i]) << endl; + cerr << debug_string(multipath_alns[i]) << endl; #endif - + + // Now, we may have been fed a multipath_alignment_t where the best + // single path alignment is to leave it unmapped alltogether. Maybe + // we cut out a really terrible bit of the alignment graph somehow. + // We used to fail an assert in that case, but we can handle it as + // just an unmapped read with score 0. + // Collect the score of the optimal alignment, to use if population // scoring fails for a multipath alignment. Put it in the optimal // base score. - base_scores[i] = alignments[0].score(); + scores[i] = max(aln_score, 0); if (query_population) { + // Work out the population size. Use the override, then try the score provider, and then fall back to the xg. + auto haplotype_count = force_haplotype_count; + + if (haplotype_count == 0) { + haplotype_count = haplo_score_provider->get_haplotype_count(); + } + + if (haplotype_count == 0 || haplotype_count == -1) { + // The score provider doesn't ahve a haplotype count. Fall back to the count in the XG. + // No longer available! + //haplotype_count = xindex->get_haplotype_count(); + } + + if (haplotype_count == 0 || haplotype_count == -1) { + // We really should have a haplotype count + throw runtime_error("Cannot score any haplotypes with a 0 or -1 haplotype count; are haplotypes available?"); + } + // Make sure to grab the memo - auto& memo = get_rr_memo(recombination_penalty, xindex->get_haplotype_count()); + auto& memo = get_rr_memo(recombination_penalty, haplotype_count); + + // Now we need to score the linearizations. The pop-adjusted + // score of the pop-scorable linearization with the best + // pop-adjusted score lives in pop_adjusted_scores[i]. + double& best_linearization_total_score = pop_adjusted_scores[i]; + // The population score for that alignment lives here. + double best_linearization_pop_score = 0; + // We set this to true if we found a best linearization. 
+ bool have_best_linearization = false; - // Now compute population scores for all the top paths - vector alignment_pop_scores(alignments.size(), 0.0); for (size_t j = 0; j < alignments.size(); j++) { // Score each alignment if possible auto pop_score = haplo_score_provider->score(alignments[j].path(), memo); #ifdef debug_multipath_mapper cerr << "Got pop score " << pop_score.first << ", " << pop_score.second << " for alignment " << j - << " score " << alignments[j].score() << " of multipath " << i << endl; + << " score " << alignments[j].score() << " of multipath " << i << endl; #endif #ifdef debug_multipath_mapper_alignment cerr << pb2json(alignments[j]) << endl; #endif - - alignment_pop_scores[j] = pop_score.first / log_base; - if (std::isnan(alignment_pop_scores[j]) && pop_score.second) { + if (std::isnan(pop_score.first) && pop_score.second) { // This shouldn't happen. Bail out on haplotype adjustment for this read and warn. cerr << "warning:[vg::MultipathMapper]: NAN population score obtained for read " - << alignments[j].name() << " with ostensibly successful query. Changing to failure." << endl; + << alignments[j].name() << " with ostensibly successful query. Changing to failure." << endl; pop_score.second = false; } if (std::isnan(alignments[j].score())) { // This is even worse! The alignment score itself is somehow NAN. cerr << "warning:[vg::MultipathMapper]: NAN alignment score obtained in alignment being considered for read " - << alignments[j].name() << ". This should never happen! Bailing out on population scoring." << endl; + << alignments[j].name() << ". This should never happen! Changing to failure." << endl; pop_score.second = false; } - - all_paths_pop_consistent &= pop_score.second; + + if (pop_score.second) { + // If the alignment was pop-score-able, mix it in as a candidate for the best linearization + + // Compute its pop-adjusted score. + // Make sure to account for the aligner's log base to have consistent point values. + double total_score = alignments[j].score() + pop_score.first / log_base; + + if (!have_best_linearization || total_score > best_linearization_total_score) { + // This is the new best linearization + + best_linearization_total_score = total_score; + best_linearization_pop_score = pop_score.first / log_base; + have_best_linearization = true; + } + + } + + // Otherwise, skip it + } + + + if (!have_best_linearization) { + // If we have no best linear pop-scored Alignment, bail out on population score correction for this read entirely. + // We probably have a placement in a region not covered by the haplotype index at all. + all_multipaths_pop_consistent = false; + continue; + } + + // Otherwise, we have population scores. + +#ifdef debug_multipath_mapper + cerr << "Best population-adjusted linearization score is " << best_linearization_total_score << endl; +#endif + + // Save the population score from the best total score Alignment. + // TODO: This is not the pop score of the linearization that the multipath_alignment_t wants to give us by default. + multipath_alns[i].set_annotation("haplotype_score", best_linearization_pop_score); + + // The multipath's base score is the base score of the + // best-base-score linear alignment. This is the "adjustment" + // we apply to the multipath's score to make it match the + // pop-adjusted score of the best-pop-adjusted-score linear + // alignment. 
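
The loop around here keeps, for each multipath alignment, the linearization whose alignment score plus population score (converted to score units via the aligner's log base) is highest, and abandons population adjustment entirely if no linearization could be scored. A compact sketch of that selection step, with made-up scores and an illustrative log base:

```cpp
#include <iostream>
#include <limits>
#include <vector>

struct Linearization {
    double aln_score;     // alignment score of this traceback
    double pop_ln;        // haplotype log-likelihood (natural log)
    bool   pop_scorable;  // did the haplotype score query succeed?
};

int main() {
    double log_base = 1.38; // illustrative aligner log base
    std::vector<Linearization> candidates = {
        {60.0, -25.0, true},
        {58.0,  -4.0, true},   // slightly worse alignment, far more haplotype-consistent
        {61.0,   0.0, false},  // not scorable against the haplotype index
    };

    bool have_best = false;
    double best_total = -std::numeric_limits<double>::infinity();
    for (const auto& c : candidates) {
        if (!c.pop_scorable) continue;            // skip unscorable linearizations
        double total = c.aln_score + c.pop_ln / log_base;
        if (!have_best || total > best_total) {
            best_total = total;
            have_best = true;
        }
    }

    if (have_best) {
        double base_score = 60.0;                 // score of the optimal linearization
        double adjustment = best_total - base_score;
        std::cout << "best pop-adjusted total: " << best_total
                  << ", adjustment vs. base score: " << adjustment << std::endl;
    } else {
        std::cout << "no scorable linearization; population adjustment abandoned" << std::endl;
    }
}
```

Tracking the most negative such adjustment across all candidates is what later allows the whole score vector to be shifted so that no pop-adjusted score ends up negative.
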
+ double adjustment = best_linearization_total_score - scores[i]; + + // See if we have a new minimum adjustment value, for the adjustment applicable to the chosen traceback. + min_adjustment = min(min_adjustment, adjustment); + } + } + + if (include_population_component && all_multipaths_pop_consistent) { + // We will go ahead with pop scoring for this read + +#ifdef debug_multipath_mapper + cerr << "Haplotype consistency score is being used." << endl; +#endif + + for (auto& score : pop_adjusted_scores) { + // Adjust the adjusted scores up/down by the minimum adjustment to ensure no scores are negative + score -= min_adjustment; + } + + for (auto& mpaln : multipath_alns) { + // Remember that we did use population scoring on all these multipath_alignment_ts + mpaln.set_annotation("haplotype_score_used", true); + } + } else { + // Clean up pop score annotations and remove scores on all the reads. + +#ifdef debug_multipath_mapper + cerr << "Haplotype consistency score is not being used." << endl; +#endif + + for (auto& mpaln : multipath_alns) { + mpaln.clear_annotation("haplotype_score_used"); + mpaln.clear_annotation("haplotype_score"); + } + } + + // Select whether to use base or adjusted scores depending on whether + // we did population-aware alignment and succeeded for all the + // multipath alignments. + if (include_population_component && all_multipaths_pop_consistent) { + scores = move(pop_adjusted_scores); + } + return scores; + } + + vector MultipathMapper::pair_mapping_likelihoods(vector>& multipath_aln_pairs, + const vector, int64_t>>& cluster_pairs) const { + +#ifdef debug_multipath_mapper + cerr << "computing paired read likelihoods" << endl; +#endif + + // Only do the population MAPQ if it might disambiguate two paths (since it's not + // as cheap as just using the score), or if we set the setting to always do it. + bool include_population_component = (use_population_mapqs && + (multipath_aln_pairs.size() > 1 || always_check_population)); + // Records whether, for each multipath alignment pair, at least one of + // the paths enumerated for each end followed only edges in the index. + // We count totally unmapped reads as pop consistent, as all 0 edges + // they cross are pop consistent. + bool all_multipaths_pop_consistent = true; + + double log_base = get_aligner(!multipath_aln_pairs.front().first.quality().empty() && + !multipath_aln_pairs.front().second.quality().empty())->log_base; + + // the scores of the optimal alignments and fragments, ignoring population + vector scores(multipath_aln_pairs.size(), 0.0); + + // the scores of the optimal alignments and fragments, accounting for population + vector pop_adjusted_scores; + if (include_population_component) { + pop_adjusted_scores.resize(multipath_aln_pairs.size()); + } + // population + fragment score, for when population adjustment is used, to make scores nonnegative + double min_extra_score = numeric_limits::max(); + // just fragment score, for running without population adjustment, to make scores nonnegative + double min_frag_score = numeric_limits::max(); + + + for (size_t i = 0; i < multipath_aln_pairs.size(); i++) { + // For each pair of read placements + pair& multipath_aln_pair = multipath_aln_pairs[i]; + + // We will query the population database for this alignment pair if it + // is turned on and it succeeded for the others. 
+ bool query_population = include_population_component && all_multipaths_pop_consistent; + + // Generate the top alignments on each side, or the top + // population_max_paths alignments if we are doing multiple + // alignments for population scoring. + vector> alignments; + int32_t aln_score_1 = -1, aln_score_2 = -1; + + if (query_population) { + // We want to do population scoring + alignments.resize(2); + if (!top_tracebacks && haplo_score_provider->has_incremental_search()) { + // We can use incremental haplotype search to find all the linearizations consistent with haplotypes + // Make sure to also always include the optimal alignment first, even if inconsistent. + // Also pad out with population_max_paths inconsistent or unscorable paths + alignments[0] = haplotype_consistent_alignments(multipath_aln_pair.first, *haplo_score_provider, population_max_paths, + population_paths_hard_cap, true); + alignments[1] = haplotype_consistent_alignments(multipath_aln_pair.second, *haplo_score_provider, population_max_paths, + population_paths_hard_cap, true); + } else { + // We will just find the top n best-alignment-scoring linearizations and hope some match haplotypes + alignments[0] = optimal_alignments(multipath_aln_pair.first, population_max_paths); + alignments[1] = optimal_alignments(multipath_aln_pair.second, population_max_paths); + } + + if (!alignments[0].empty()) { + aln_score_1 = alignments[0].front().score(); + } + if (!alignments[1].empty()) { + aln_score_2 = alignments[1].front().score(); + } + +#ifdef debug_multipath_mapper + + cerr << "Got " << alignments[0].size() << " and " << alignments[1].size() << " linearizations on each end" << endl; +#endif + } else { + // Just compute a single optimal alignment + aln_score_1 = optimal_alignment_score(multipath_aln_pair.first); + aln_score_2 = optimal_alignment_score(multipath_aln_pair.second); + } + + + // We used to fail an assert if either list of optimal alignments + // was empty, but now we handle it as if that side is an unmapped + // read with score 0. + + // Compute the optimal alignment score ignoring population + int32_t alignment_score = max(aln_score_1, 0) + max(aln_score_2, 0); + + // This is the contribution to the alignment's score from the fragment length distribution + double frag_score; + if (aln_score_1 == -1 || aln_score_2 == -1) { + // Actually there should be no fragment score, because one or both ends are unmapped + frag_score = 0; + } else { + // compute the fragment distribution's contribution to the score + frag_score = fragment_length_log_likelihood(cluster_pairs[i].second) / log_base; + } + min_frag_score = min(frag_score, min_frag_score); + + // Record the base score, including fragment contribution + scores[i] = alignment_score + frag_score; + + if (query_population) { + // We also want to select the optimal population-scored alignment on each side and compute a pop-adjusted score. + + // Work out the population size. Use the override, then try the score provider, and then fall back to the xg. + auto haplotype_count = force_haplotype_count; + + if (haplotype_count == 0) { + haplotype_count = haplo_score_provider->get_haplotype_count(); + } + + if (haplotype_count == 0 || haplotype_count == -1) { + // The score provider doesn't ahve a haplotype count. Fall back to the count in the XG. 
+ //haplotype_count = xindex->get_haplotype_count(); + } + + if (haplotype_count == 0 || haplotype_count == -1) { + // We really should have a haplotype count + throw runtime_error("Cannot score any haplotypes with a 0 or -1 haplotype count; are haplotypes available?"); + } + + // Make sure to grab the memo + auto& memo = get_rr_memo(recombination_penalty, haplotype_count); + + // Now we need to score the linearizations. + + // This is the best pop-adjusted linearization score for each end. + double best_total_score[2] = {0, 0}; + // This is the pop score that goes with it + double best_pop_score[2] = {0, 0}; + // We set this to true if we find a best linearization for each end. + bool have_best_linearization[2] = {false, false}; + // Note that for unmapped reads, the total and pop scores will stay 0. + + for (int end : {0, 1}) { + // For each read in the pair + + for (size_t j = 0; j < alignments[end].size(); j++) { + // For each alignment of the read in this location + + // Pop score the alignment + auto pop_score = haplo_score_provider->score(alignments[end][j].path(), memo); + + if (std::isnan(pop_score.first) && pop_score.second) { + // This shouldn't happen. Bail out on haplotype adjustment for this read and warn. + cerr << "warning:[vg::MultipathMapper]: NAN population adjusted score obtained for paired read " + << alignments[end][j].name() << " with ostensibly successful query. Changing to failure." << endl; + pop_score.second = false; + } + + if (std::isnan(alignments[end][j].score())) { + // This is even worse! The alignment score itself is somehow NAN. + cerr << "warning:[vg::MultipathMapper]: NAN alignment score obtained in alignment being considered for paired read " + << alignments[end][j].name() << ". This should never happen! Changing to failure." << endl; + pop_score.second = false; + } + +#ifdef debug_multipath_mapper + cerr << "Linearization " << j << " on end " << end << " gets pop score " << pop_score.first + << " and alignment score " << alignments[end][j].score() << endl; +#endif + + if (pop_score.second) { + // If the alignment was pop-score-able, mix it in as a candidate for the best linearization + + // Compute its pop-adjusted score. + // Make sure to account for the aligner's log base to have consistent point values. + double total_score = alignments[end][j].score() + pop_score.first / log_base; + + if (!have_best_linearization[end] || total_score > best_total_score[end]) { + // This is the new best linearization + + best_total_score[end] = total_score; + best_pop_score[end] = pop_score.first / log_base; + have_best_linearization[end] = true; + } + + } + } } - if (!all_paths_pop_consistent) { - // If we failed, bail out on population score correction for the whole MultipathAlignment. - - // Go and do the next MultipathAlignment since we have the base score for this one + if ((!alignments[0].empty() && !have_best_linearization[0]) || + (!alignments[1].empty() && !have_best_linearization[1])) { + // If we couldn't find a linearization for each mapped end that we could score, bail on pop scoring. + all_multipaths_pop_consistent = false; continue; } - // Otherwise, we have population scores. 
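
The frag_score folded into each pair's likelihood is the unnormalized Gaussian log-likelihood of the observed fragment length under the estimated fragment length distribution, divided by the aligner's log base so it lives in score units; this matches the fragment_length_log_likelihood helper later in the hunk. A small sketch under those assumptions (mean, standard deviation, and log base are illustrative values):

```cpp
#include <iostream>

// Unnormalized Gaussian log-likelihood of a fragment length. The constant
// normalization term is dropped because it is shared by all candidate pairs
// and cancels when scores are compared.
double fragment_length_log_likelihood(double length, double mean, double std_dev) {
    double dev = length - mean;
    return -dev * dev / (2.0 * std_dev * std_dev);
}

int main() {
    double mean = 450.0, std_dev = 60.0; // illustrative fragment length distribution
    double log_base = 1.38;              // illustrative aligner log base

    // A fragment near the mean contributes almost nothing; an outlier is penalized.
    std::cout << fragment_length_log_likelihood(455.0, mean, std_dev) / log_base << "\n";
    std::cout << fragment_length_log_likelihood(800.0, mean, std_dev) / log_base << std::endl;
}
```
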
+ // Compute the total pop adjusted score for this multipath_alignment_t + pop_adjusted_scores[i] = best_total_score[0] + best_total_score[1] + frag_score; - // Pick the best adjusted score and its difference from the best unadjusted score - pop_adjusted_scores[i] = numeric_limits::min(); - double adjustment; - for (size_t j = 0; j < alignments.size(); j++) { - // Compute the adjusted score for each alignment - auto adjusted_score = alignments[j].score() + alignment_pop_scores[j]; - - if (adjusted_score > pop_adjusted_scores[i]) { - // It is the best, so use it. - // TODO: somehow know we want this Alignment when collapsing the MultipathAlignment later. - pop_adjusted_scores[i] = adjusted_score; - adjustment = pop_adjusted_scores[i] - base_scores[i]; - } - } - - // Save the population score from the best total score read, - // even if population scoring doesn't get used. TODO: When - // flattening the multipath alignment back to single path, we - // should replace this score or make sure to use this winning - // single path alignment. - auto max_scoring_it = std::max_element(pop_adjusted_scores.begin(), pop_adjusted_scores.end()); - auto max_scoring_index = max_scoring_it - pop_adjusted_scores.begin(); - set_annotation(multipath_alns[i], "haplotype_score", alignment_pop_scores[max_scoring_index]); + // Save the pop scores without the base scores to the multipath alignments. + // TODO: Should we be annotating unmapped reads with 0 pop scores when the other read in the pair is mapped? + multipath_aln_pair.first.set_annotation("haplotype_score", best_pop_score[0]); + multipath_aln_pair.second.set_annotation("haplotype_score", best_pop_score[1]); - // See if we have a new minimum adjustment value, for the adjustment applicable to the chosen traceback. - min_adjustment = min(min_adjustment, adjustment); + assert(!std::isnan(best_total_score[0])); + assert(!std::isnan(best_total_score[1])); + assert(!std::isnan(frag_score)); + assert(!std::isnan(pop_adjusted_scores[i])); + + // How much was extra over the score of the top-base-score alignment on each side? + // This might be negative if e.g. that alignment looks terrible population-wise but we take it anyway. + auto extra = pop_adjusted_scores[i] - alignment_score; + + // Record our extra score if it was a new minimum + min_extra_score = min(extra, min_extra_score); } } - if (include_population_component && all_paths_pop_consistent) { - for (auto& score : pop_adjusted_scores) { - // Adjust the adjusted scores up/down by the minimum adjustment to ensure no scores are negative - score -= min_adjustment; - } - - for (auto& mpaln : multipath_alns) { - // Remember that we did use population scoring on all these MultipathAlignments - set_annotation(mpaln, "haplotype_score_used", true); + // Decide which scores to use depending on whether we have pop adjusted scores we want to use + if (include_population_component && all_multipaths_pop_consistent) { + scores = move(pop_adjusted_scores); + } + + // Pull the min frag or extra score out of the score so it will be nonnegative + double zero_point = (include_population_component && all_multipaths_pop_consistent) ? min_extra_score : min_frag_score; + for (auto& score : scores) { + score -= zero_point; + } + + if (include_population_component && all_multipaths_pop_consistent) { + // Record that we used the population score +#ifdef debug_multipath_mapper + cerr << "Haplotype consistency score is being used." 
<< endl; +#endif + for (auto& multipath_aln_pair : multipath_aln_pairs) { + // We have to do it on each read in each pair. + // TODO: Come up with a simpler way to dump annotations in based on what happens during mapping. + multipath_aln_pair.first.set_annotation("haplotype_score_used", true); + multipath_aln_pair.second.set_annotation("haplotype_score_used", true); } } else { - // Clean up pop score annotations and remove scores on all the reads. - for (auto& mpaln : multipath_alns) { - clear_annotation(mpaln, "haplotype_score_used"); - clear_annotation(mpaln, "haplotype_score"); + // Clean up pop score annotations if present and remove scores from all the reads +#ifdef debug_multipath_mapper + cerr << "Haplotype consistency score is not being used." << endl; +#endif + for (auto& multipath_aln_pair : multipath_aln_pairs) { + // We have to do it on each read in each pair. + multipath_aln_pair.first.clear_annotation("haplotype_score_used"); + multipath_aln_pair.first.clear_annotation("haplotype_score"); + multipath_aln_pair.second.clear_annotation("haplotype_score_used"); + multipath_aln_pair.second.clear_annotation("haplotype_score"); } } + return scores; + } + + vector MultipathMapper::compute_raw_mapping_qualities_from_scores(const vector& scores, bool have_qualities, + const vector* multiplicities) const { + + auto aligner = get_aligner(have_qualities); - // Select whether to use base or adjusted scores depending on whether - // we did population-aware alignment and succeeded for all the - // multipath alignments. - auto& scores = (include_population_component && all_paths_pop_consistent) ? pop_adjusted_scores : base_scores; + vector raw_mapqs; + + if (scores.size() > 1 && max_alt_mappings > 1) { + // we want MAPQs for all of the multi-mapped reads, so we need the exact algorithm + raw_mapqs = aligner->compute_all_mapping_qualities(scores, multiplicities); + } + else { + // we only need a MAPQ for the primary + + bool use_exact = (mapping_quality_method == Exact); + if (!use_exact && scores.size() >= 2 + && (scores[1] > scores[0] || + (mapping_quality_method == Adaptive && scores[1] < scores[0] - get_aligner()->mapping_quality_score_diff(max_mapping_quality)))) { + use_exact = true; + } + + raw_mapqs.push_back(aligner->compute_first_mapping_quality(scores, !use_exact, multiplicities)); + } + + // arbitrary scaling, seems to help performance + for (auto& raw_mapq : raw_mapqs) { + raw_mapq *= mapq_scaling_factor; + +#ifdef debug_multipath_mapper + cerr << "scores yield a raw MAPQ of " << raw_mapq << endl; +#endif + } + + return raw_mapqs; + + } + + void MultipathMapper::sort_and_compute_mapping_quality(vector& multipath_alns, + vector* cluster_idxs, + vector* multiplicities) const { + if (cluster_idxs) { + assert(cluster_idxs->size() == multipath_alns.size()); + } + if (multiplicities) { + assert(multiplicities->size() == multipath_alns.size()); + } + + if (multipath_alns.empty()) { + return; + } + + // get the log-likelihoods of each mapping + vector scores = mapping_likelihoods(multipath_alns); // find the order of the scores vector order(multipath_alns.size(), 0); @@ -3293,9 +6960,12 @@ namespace vg { order[i] = i; } // Sort, shuffling based on the aligned sequence to break ties. 
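
compute_raw_mapping_qualities_from_scores above chooses between the aligner's exact and approximate mapping-quality routines and then applies mapq_scaling_factor. As a generic illustration of how a Phred-scaled mapping quality can be derived from a vector of log-scale scores (a softmax posterior for the top candidate; this is the idea only, not vg's Aligner implementation):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

// Phred-scaled confidence in the first (best) candidate: softmax the scores
// into posteriors and report -10 * log10(probability the best candidate is wrong).
int32_t phred_mapq_of_first(const std::vector<double>& scores, double log_base, int32_t cap) {
    if (scores.size() < 2) {
        return cap; // nothing to be confused with
    }
    double best = *std::max_element(scores.begin(), scores.end());
    double total = 0.0;
    for (double s : scores) {
        total += std::exp((s - best) * log_base); // shift by best for numerical stability
    }
    double p_first = std::exp((scores[0] - best) * log_base) / total;
    double p_wrong = std::max(1.0 - p_first, 1e-30);
    int32_t mapq = static_cast<int32_t>(-10.0 * std::log10(p_wrong));
    return std::min(mapq, cap);
}

int main() {
    // Best score 60 vs. runner-up 50: confident. Tied scores: MAPQ ~ 3.
    std::cout << phred_mapq_of_first({60.0, 50.0, 40.0}, 1.38, 60) << "\n";
    std::cout << phred_mapq_of_first({60.0, 60.0}, 1.38, 60) << std::endl;
}
```
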
+ LazyRNG rng([&]() { + return make_shuffle_seed(multipath_alns.front()); + }); sort_shuffling_ties(order.begin(), order.end(), [&](const size_t i, const size_t j) { return scores[i] > scores[j]; }, - [&](const size_t seed_source) {return multipath_alns[seed_source].sequence(); }); + rng); // translate the order to an index vector index(multipath_alns.size()); @@ -3303,7 +6973,7 @@ namespace vg { index[order[i]] = i; } - // put the scores, clusters-of-origin, and alignments in order + // put the scores, multiplicities, clusters-of-origin, and alignments in order for (size_t i = 0; i < multipath_alns.size(); i++) { while (index[i] != i) { std::swap(scores[index[i]], scores[i]); @@ -3311,8 +6981,10 @@ namespace vg { if (cluster_idxs) { std::swap((*cluster_idxs)[index[i]], (*cluster_idxs)[i]); } + if (multiplicities) { + std::swap((*multiplicities)[index[i]], (*multiplicities)[i]); + } std::swap(index[index[i]], index[i]); - } } @@ -3328,6 +7000,9 @@ namespace vg { if (cluster_idxs) { (*cluster_idxs)[i - removed_so_far] = (*cluster_idxs)[i]; } + if (multiplicities) { + (*multiplicities)[i - removed_so_far] = (*multiplicities)[i]; + } } } if (removed_so_far) { @@ -3336,6 +7011,9 @@ namespace vg { if (cluster_idxs) { cluster_idxs->resize(cluster_idxs->size() - removed_so_far); } + if (multiplicities) { + multiplicities->resize(multiplicities->size() - removed_so_far); + } } #ifdef debug_multipath_mapper @@ -3343,189 +7021,95 @@ namespace vg { for (size_t i = 0; i < scores.size(); i++) { Alignment aln; optimal_alignment(multipath_alns[i], aln); - cerr << "\t" << scores[i] << " " << make_pos_t(aln.path().mapping(0).position()) << endl; + cerr << "\t" << scores[i] << " " << (aln.path().mapping_size() ? make_pos_t(aln.path().mapping(0).position()) : pos_t()); + if (multiplicities) { + cerr << ", multiplicity " << multiplicities->at(i); + } + cerr << endl; } #endif - if (mapq_method != None) { - // Sometimes we are passed None, which means to not update the MAPQs at all. But otherwise, we do MAPQs. 
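
The sorting code here uses a pattern that recurs in the paired version below: build an order array of sorted ranks, invert it into index, then apply the permutation in place with swaps so that all of the parallel vectors (alignments, scores, cluster indexes, multiplicities) stay in sync. A standalone sketch of that reordering trick:

```cpp
#include <algorithm>
#include <iostream>
#include <numeric>
#include <string>
#include <vector>

int main() {
    std::vector<double> scores = {10.0, 30.0, 20.0};
    std::vector<std::string> names = {"a", "b", "c"}; // a parallel vector that must stay aligned

    // order[rank] = original position of the item with that rank (best first)
    std::vector<size_t> order(scores.size());
    std::iota(order.begin(), order.end(), 0);
    std::sort(order.begin(), order.end(),
              [&](size_t i, size_t j) { return scores[i] > scores[j]; });

    // index[original position] = rank, i.e. where that item should end up
    std::vector<size_t> index(scores.size());
    for (size_t i = 0; i < order.size(); ++i) {
        index[order[i]] = i;
    }

    // Apply the permutation in place, swapping every parallel vector together.
    for (size_t i = 0; i < scores.size(); ++i) {
        while (index[i] != i) {
            std::swap(scores[index[i]], scores[i]);
            std::swap(names[index[i]], names[i]);
            std::swap(index[index[i]], index[i]);
        }
    }

    for (size_t i = 0; i < scores.size(); ++i) {
        std::cout << names[i] << " " << scores[i] << std::endl; // b 30, c 20, a 10
    }
}
```
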
- // Compute and set the mapping quality - int32_t raw_mapq = compute_raw_mapping_quality_from_scores(scores, mapq_method); - multipath_alns.front().set_mapping_quality(min(raw_mapq, max_mapping_quality)); + // Compute and set the mapping quality + vector uncapped_mapqs = compute_raw_mapping_qualities_from_scores(scores, !multipath_alns.front().quality().empty(), + multiplicities); + for (size_t i = 0; i < uncapped_mapqs.size(); ++i) { + multipath_alns[i].set_mapping_quality(min(uncapped_mapqs[i], max_mapping_quality)); + } + + if (report_allelic_mapq) { + // figure out what the mapping quality would be for the lowest-scoring combination of + // alleles + int32_t allelic_diff = optimal_alignment_score(multipath_alns.front()) - worst_alignment_score(multipath_alns.front()); + if (allelic_diff != 0) { + scores[0] -= allelic_diff; + vector uncapped_allelic_mapqs = compute_raw_mapping_qualities_from_scores(scores, + !multipath_alns.front().quality().empty(), + multiplicities); + auto uncapped_allelic_mapq = uncapped_allelic_mapqs.front(); + int32_t allelic_mapq = min(uncapped_allelic_mapqs.front(), max_mapping_quality); + if (allelic_mapq != multipath_alns.front().mapping_quality()) { + // other alleles do not place this read as confidently + multipath_alns.front().set_annotation("allelic_mapq", (double) allelic_mapq); + } + scores[0] += allelic_diff; + } + } + + if (report_group_mapq) { + // TODO: this can include alignments that are later removed as being insigificant, but they shouldn't + // affect the sum much at all + size_t num_reporting = min(multipath_alns.size(), max_alt_mappings); + vector reporting_idxs(num_reporting, 0); + for (size_t i = 1; i < num_reporting; ++i) { + reporting_idxs[i] = i; + } + double raw_mapq = get_aligner(!multipath_alns.front().quality().empty())->compute_group_mapping_quality(scores, reporting_idxs, + multiplicities); + // TODO: for some reason set_annotation will accept a double but not an int + double group_mapq = min(max_mapping_quality, mapq_scaling_factor * raw_mapq); + + for (size_t i = 0; i < num_reporting; ++i) { + multipath_alns[i].set_annotation("group_mapq", group_mapq); + } } } // TODO: pretty duplicative with the unpaired version - void MultipathMapper::sort_and_compute_mapping_quality(vector>& multipath_aln_pairs, + void MultipathMapper::sort_and_compute_mapping_quality(vector>& multipath_aln_pairs, vector, int64_t>>& cluster_pairs, - vector>* duplicate_pairs_out) const { + vector>* duplicate_pairs_out, + vector* multiplicities) const { #ifdef debug_multipath_mapper cerr << "Sorting and computing mapping qualities for paired reads" << endl; #endif assert(multipath_aln_pairs.size() == cluster_pairs.size()); + if (multiplicities) { + assert(multipath_aln_pairs.size() == multiplicities->size()); + } if (multipath_aln_pairs.empty()) { return; } - // Only do the population MAPQ if it might disambiguate two paths (since it's not - // as cheap as just using the score), or if we set the setting to always do it. 
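
report_group_mapq above attaches a single quality to the whole set of reported alignments, and report_allelic_mapq re-derives the quality after discounting the gap between the best and worst allele combinations. One way to picture a group mapping quality is as the Phred-scaled probability that the true placement lies outside the reported set; the following is a generic sketch of that idea, not vg's compute_group_mapping_quality:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

// Phred-scaled confidence that the correct placement is somewhere in the
// reported group of candidates (indices into the score vector).
int32_t group_mapping_quality(const std::vector<double>& scores,
                              const std::vector<size_t>& group,
                              double log_base, int32_t cap) {
    double best = *std::max_element(scores.begin(), scores.end());
    double total = 0.0;
    for (double s : scores) {
        total += std::exp((s - best) * log_base);
    }
    double group_mass = 0.0;
    for (size_t i : group) {
        group_mass += std::exp((scores[i] - best) * log_base);
    }
    double p_outside = std::max(1.0 - group_mass / total, 1e-30);
    return std::min(static_cast<int32_t>(-10.0 * std::log10(p_outside)), cap);
}

int main() {
    std::vector<double> scores = {60.0, 59.0, 40.0, 38.0};
    // Reporting the top two alignments as a group: the group is confident even
    // though neither alignment on its own would be.
    std::cout << group_mapping_quality(scores, {0, 1}, 1.38, 60) << std::endl;
}
```
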
- bool include_population_component = (use_population_mapqs && (multipath_aln_pairs.size() > 1 || always_check_population)); - // records whether of the paths followed the edges in the index - bool all_paths_pop_consistent = true; - - double log_base = get_aligner()->log_base; - - // the scores of the optimal alignments and fragments, ignoring population - vector base_scores(multipath_aln_pairs.size(), 0.0); - - // the scores of the optimal alignments and fragments, accounting for population - vector pop_adjusted_scores; - if (include_population_component) { - pop_adjusted_scores.resize(multipath_aln_pairs.size()); - } - // population + fragment score, for when population adjustment is used, to make scores nonnegative - double min_extra_score = numeric_limits::max(); - // just fragment score, for running without population adjustment, to make scores nonnegative - double min_frag_score = numeric_limits::max(); - - for (size_t i = 0; i < multipath_aln_pairs.size(); i++) { - pair& multipath_aln_pair = multipath_aln_pairs[i]; - - // We will query the population database for this alignment if it - // is turned on and it succeeded for the others. - bool query_population = include_population_component && all_paths_pop_consistent; - - // Generate the top alignments on each side, or the top - // population_max_paths alignments if we are doing multiple - // alignments for population scoring. - auto alignments1 = optimal_alignments(multipath_aln_pair.first, query_population ? population_max_paths : 1); - auto alignments2 = optimal_alignments(multipath_aln_pair.second, query_population ? population_max_paths : 1); - assert(!alignments1.empty()); - assert(!alignments2.empty()); - - // Compute the optimal alignment score ignoring population - int32_t alignment_score = alignments1[0].score() + alignments2[0].score(); - - // compute the fragment distribution's contribution to the score - double frag_score = fragment_length_log_likelihood(cluster_pairs[i].second) / log_base; - min_frag_score = min(frag_score, min_frag_score); - - // Record the base score, including fragment contribution - base_scores[i] = alignment_score + frag_score; - - if (query_population) { - // We also want to select the optimal population-scored alignment on each side and compute a pop-adjusted score. - - // Make sure to grab the memo - auto& memo = get_rr_memo(recombination_penalty, xindex->get_haplotype_count()); - - // What's the base + population score for each alignment? - // We need to consider them together because there's a trade off between recombinations and mismatches. - vector base_pop_scores1(alignments1.size()); - vector base_pop_scores2(alignments2.size()); - - for (size_t j = 0; j < alignments1.size(); j++) { - // Pop score the first alignments - auto pop_score = haplo_score_provider->score(alignments1[j].path(), memo); - base_pop_scores1[j] = alignments1[j].score() + pop_score.first / log_base; - - if (std::isnan(base_pop_scores1[j]) && pop_score.second) { - // This shouldn't happen. Bail out on haplotype adjustment for this read and warn. - cerr << "warning:[vg::MultipathMapper]: NAN population adjusted score obtained for paired read " - << alignments1[j].name() << " with ostensibly successful query. Changing to failure." 
<< endl; - pop_score.second = false; - } - - all_paths_pop_consistent &= pop_score.second; - } - - for (size_t j = 0; j < alignments2.size(); j++) { - // Pop score the second alignments - auto pop_score = haplo_score_provider->score(alignments2[j].path(), memo); - base_pop_scores2[j] = alignments2[j].score() + pop_score.first / log_base; - - if (std::isnan(base_pop_scores2[j]) && pop_score.second) { - // This shouldn't happen. Bail out on haplotype adjustment for this read and warn. - cerr << "warning:[vg::MultipathMapper]: NAN population adjusted score obtained for paired read " - << alignments2[j].name() << " with ostensibly successful query. Changing to failure." << endl; - pop_score.second = false; - } - - all_paths_pop_consistent &= pop_score.second; - } - - if (!all_paths_pop_consistent) { - // If we couldn't score everything, bail - continue; - } - - // Pick the best alignment on each side - auto best_index1 = max_element(base_pop_scores1.begin(), base_pop_scores1.end()) - base_pop_scores1.begin(); - auto best_index2 = max_element(base_pop_scores2.begin(), base_pop_scores2.end()) - base_pop_scores2.begin(); - - // Compute the total pop adjusted score for this MultipathAlignment - pop_adjusted_scores[i] = base_pop_scores1[best_index1] + base_pop_scores2[best_index2] + frag_score; - - // Save the pop scores to the multipath alignments - set_annotation(multipath_aln_pair.first, "haplotype_score", base_pop_scores1[best_index1]); - set_annotation(multipath_aln_pair.second, "haplotype_score", base_pop_scores2[best_index2]); - - assert(!std::isnan(base_pop_scores1[best_index1])); - assert(!std::isnan(base_pop_scores2[best_index2])); - assert(!std::isnan(frag_score)); - assert(!std::isnan(pop_adjusted_scores[i])); - - // How much was extra over the score of the top-base-score alignment on each side? - // This might be negative if e.g. that alignment looks terrible population-wise but we take it anyway. - auto extra = pop_adjusted_scores[i] - alignment_score; - - // Record our extra score if it was a new minimum - min_extra_score = min(extra, min_extra_score); - } - } - - // Decide which scores to use depending on whether we have pop adjusted scores we want to use - auto& scores = (include_population_component && all_paths_pop_consistent) ? pop_adjusted_scores : base_scores; - - for (auto& score : scores) { - // Pull the min frag or extra score out of the score so it will be nonnegative - score -= (include_population_component && all_paths_pop_consistent) ? min_extra_score : min_frag_score; - } - - if (include_population_component && all_paths_pop_consistent) { - // Record that we used the population score - for (auto& multipath_aln_pair : multipath_aln_pairs) { - // We have to do it on each read in each pair. - // TODO: Come up with a simpler way to dump annotations in based on what happens during mapping. - set_annotation(multipath_aln_pair.first, "haplotype_score_used", true); - set_annotation(multipath_aln_pair.second, "haplotype_score_used", true); - } - } else { - // Clean up pop score annotations if present and remove scores from all the reads - for (auto& multipath_aln_pair : multipath_aln_pairs) { - // We have to do it on each read in each pair. 
- clear_annotation(multipath_aln_pair.first, "haplotype_score_used"); - clear_annotation(multipath_aln_pair.first, "haplotype_score"); - clear_annotation(multipath_aln_pair.second, "haplotype_score_used"); - clear_annotation(multipath_aln_pair.second, "haplotype_score"); - } - } + // get the log-likelihoods of each mapping + vector scores = pair_mapping_likelihoods(multipath_aln_pairs, cluster_pairs); // find the order of the scores vector order(multipath_aln_pairs.size(), 0); for (size_t i = 1; i < multipath_aln_pairs.size(); i++) { order[i] = i; } + // Sort, shuffling based on the aligned sequence to break ties. + LazyRNG rng([&]() { + return make_shuffle_seed(multipath_aln_pairs.front()); + }); sort_shuffling_ties(order.begin(), order.end(), [&](const size_t i, const size_t j) { - return scores[i] > scores[j]; + return scores[i] > scores[j]; }, - [&](const size_t seed_source) { - return multipath_aln_pairs[seed_source].first.sequence() + multipath_aln_pairs[seed_source].second.sequence(); - }); + rng); // translate the order to an index vector index(multipath_aln_pairs.size()); @@ -3539,6 +7123,9 @@ namespace vg { std::swap(scores[index[i]], scores[i]); std::swap(cluster_pairs[index[i]], cluster_pairs[i]); std::swap(multipath_aln_pairs[index[i]], multipath_aln_pairs[i]); + if (multiplicities) { + std::swap((*multiplicities)[index[i]], (*multiplicities)[i]); + } std::swap(index[index[i]], index[i]); } @@ -3550,127 +7137,300 @@ namespace vg { Alignment aln1, aln2; optimal_alignment(multipath_aln_pairs[i].first, aln1); optimal_alignment(multipath_aln_pairs[i].second, aln2); - auto start1 = aln1.path().mapping(0).position().node_id(); - auto start2 = aln2.path().mapping(0).position().node_id(); + int64_t id1 = 0, id2 = 0; + bool rev1 = false, rev2 = false; + if (aln1.path().mapping_size()) { + const auto& pos = aln1.path().mapping(0).position(); + id1 = pos.node_id(); + rev1 = pos.is_reverse(); + } + else { + const auto& pos = multipath_aln_pairs[i].first.subpath(0).path().mapping(0).position(); + id1 = pos.node_id(); + rev1 = pos.is_reverse(); + } + if (aln2.path().mapping_size()) { + const auto& pos = aln2.path().mapping(0).position(); + id2 = pos.node_id(); + rev2 = pos.is_reverse(); + } + else { + const auto& pos = multipath_aln_pairs[i].second.subpath(0).path().mapping(0).position(); + id2 = pos.node_id(); + rev2 = pos.is_reverse(); + } - cerr << "\tpos:" << start1 << "(" << aln1.score() << ")-" << start2 << "(" << aln2.score() << ")" + cerr << "\tpos:" << id1 << (rev1 ? "-" : "+") << "(" << aln1.score() << ") - " << id2 << (rev2 ? "-" : "+") << "(" << aln2.score() << ")" << " align:" << optimal_alignment_score(multipath_aln_pairs[i].first) + optimal_alignment_score(multipath_aln_pairs[i].second) - << ", length: " << cluster_pairs[i].second; - if (include_population_component && all_paths_pop_consistent) { - cerr << ", pop: " << scores[i] - base_scores[i]; - } - cerr << ", combined: " << scores[i] << endl; - } -#endif - - if (mapping_quality_method != None) { - // Compute the raw mapping quality - int32_t raw_mapq = compute_raw_mapping_quality_from_scores(scores, mapping_quality_method); - // Limit it to the max. 
- int32_t mapq = min(raw_mapq, max_mapping_quality); - multipath_aln_pairs.front().first.set_mapping_quality(mapq); - multipath_aln_pairs.front().second.set_mapping_quality(mapq); - - if (multipath_aln_pairs.size() > 1) { - // find the duplicates of the optimal pair (initially mark with only the pair itself) - vector duplicates_1(1, 0); - vector duplicates_2(1, 0); - vector to_remove; - for (size_t i = 1; i < multipath_aln_pairs.size(); i++) { - bool duplicate_1 = share_terminal_positions(multipath_aln_pairs[0].first, multipath_aln_pairs[i].first); - bool duplicate_2 = share_terminal_positions(multipath_aln_pairs[0].second, multipath_aln_pairs[i].second); - if (duplicate_1 && duplicate_2) { -#ifdef debug_multipath_mapper - cerr << "found double end duplication at index " << i << endl; -#endif - // this pair is a complete duplication (not just one end) we want it gone - to_remove.push_back(i); - if (duplicate_pairs_out) { - duplicate_pairs_out->push_back(cluster_pairs[i].first); - } - } - else if (duplicate_1) { + << ", length: " << cluster_pairs[i].second; + cerr << ", combined: " << scores[i]; + if (multiplicities) { + cerr << ", multiplicity: " << multiplicities->at(i); + } + cerr << endl; + } +#endif + + // Compute the raw mapping quality + vector uncapped_mapqs = compute_raw_mapping_qualities_from_scores(scores, + !multipath_aln_pairs.front().first.quality().empty() && + !multipath_aln_pairs.front().second.quality().empty(), + multiplicities); + // Limit it to the max. + for (size_t i = 0; i < uncapped_mapqs.size(); ++i) { + int32_t mapq = min(uncapped_mapqs[i], max_mapping_quality); + multipath_aln_pairs[i].first.set_mapping_quality(mapq); + multipath_aln_pairs[i].second.set_mapping_quality(mapq); + } + + + int32_t allelic_diff_1 = 0, allelic_diff_2 = 0; + if (report_allelic_mapq) { + // figure out what the mapping quality would be for the lowest-scoring combination of alleles + allelic_diff_1 = (optimal_alignment_score(multipath_aln_pairs.front().first) + - worst_alignment_score(multipath_aln_pairs.front().first)); + allelic_diff_2 = (optimal_alignment_score(multipath_aln_pairs.front().second) + - worst_alignment_score(multipath_aln_pairs.front().second)); + if (allelic_diff_1 != 0 || allelic_diff_2 != 0) { + scores[0] -= allelic_diff_1 + allelic_diff_2; + vector uncapped_allelic_mapqs = compute_raw_mapping_qualities_from_scores(scores, + !multipath_aln_pairs.front().first.quality().empty() && + !multipath_aln_pairs.front().second.quality().empty(), + multiplicities); + auto uncapped_allelic_mapq = uncapped_allelic_mapqs.front(); + int32_t allelic_mapq = min(uncapped_allelic_mapq, max_mapping_quality); + if (allelic_mapq != multipath_aln_pairs.front().first.mapping_quality() || + allelic_mapq != multipath_aln_pairs.front().second.mapping_quality()) { + // other alleles might not place this read as confidently + multipath_aln_pairs.front().first.set_annotation("allelic_mapq", (double) allelic_mapq); + multipath_aln_pairs.front().second.set_annotation("allelic_mapq", (double) allelic_mapq); + } + scores[0] += allelic_diff_1 + allelic_diff_2; + } + } + + + if (multipath_aln_pairs.size() > 1) { + // TODO: it would be nice to also look for duplicates with other pairs, but i don't love + // the quadratic work that this would require... 
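
The duplicate handling that follows marks alternative pairs sharing terminal positions with the optimal pair, drops the double-ended duplicates, and later folds the single-ended ones into a grouped mapping quality. A heavily simplified stand-in for that test, reducing each alignment to its terminal graph positions (vg's share_terminal_positions inspects the source and sink subpaths of full multipath alignments):

```cpp
#include <iostream>
#include <vector>

// A simplified stand-in for a graph position: node, offset, and strand.
struct GraphPos {
    long node_id;
    long offset;
    bool is_reverse;
    bool operator==(const GraphPos& o) const {
        return node_id == o.node_id && offset == o.offset && is_reverse == o.is_reverse;
    }
};

// An alignment reduced to just its terminal positions.
struct Terminals {
    GraphPos start;
    GraphPos end;
};

// Treat two alignments as duplicates if they share a start or an end position.
bool share_terminals(const Terminals& a, const Terminals& b) {
    return a.start == b.start || a.end == b.end;
}

int main() {
    Terminals optimal = {{17, 3, false}, {22, 9, false}};
    std::vector<Terminals> others = {
        {{17, 3, false}, {25, 1, false}},  // shares the start: a duplicate candidate
        {{40, 0, true},  {41, 5, true}},   // unrelated placement
    };
    for (size_t i = 0; i < others.size(); ++i) {
        std::cout << "candidate " << i
                  << (share_terminals(optimal, others[i]) ? " duplicates" : " does not duplicate")
                  << " the optimal end" << std::endl;
    }
}
```
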
+ + // find the duplicates of the optimal pair (initially mark with only the pair itself) + vector duplicates_1(1, 0); + vector duplicates_2(1, 0); + vector to_remove; + for (size_t i = 1; i < multipath_aln_pairs.size(); i++) { + bool duplicate_1 = share_terminal_positions(multipath_aln_pairs[0].first, multipath_aln_pairs[i].first); + bool duplicate_2 = share_terminal_positions(multipath_aln_pairs[0].second, multipath_aln_pairs[i].second); + if (duplicate_1 && duplicate_2) { #ifdef debug_multipath_mapper - cerr << "found left end duplication at index " << i << endl; + cerr << "found double end duplication at index " << i << endl; #endif - duplicates_1.push_back(i); + // this pair is a complete duplication (not just one end) we want it gone + to_remove.push_back(i); + if (duplicate_pairs_out) { + duplicate_pairs_out->push_back(cluster_pairs[i].first); } - else if (duplicate_2) { + } + else if (duplicate_1) { #ifdef debug_multipath_mapper - cerr << "found right end duplication at index " << i << endl; + cerr << "found left end duplication at index " << i << endl; #endif - duplicates_2.push_back(i); - } + duplicates_1.push_back(i); } + else if (duplicate_2) { +#ifdef debug_multipath_mapper + cerr << "found right end duplication at index " << i << endl; +#endif + duplicates_2.push_back(i); + } + } + + if (!to_remove.empty()) { - if (!to_remove.empty()) { - // remove the full duplicates from all relevant vectors - for (size_t i = 1, removed_so_far = 0; i < multipath_aln_pairs.size(); i++) { - if (removed_so_far < to_remove.size() ? i == to_remove[removed_so_far] : false) { - removed_so_far++; - } - else if (removed_so_far > 0) { - // move these items into their new position - multipath_aln_pairs[i - removed_so_far] = move(multipath_aln_pairs[i]); - scores[i - removed_so_far] = move(scores[i]); - cluster_pairs[i - removed_so_far] = move(cluster_pairs[i]); - } + // remove the full duplicates from all relevant vectors + for (size_t i = 1, removed_so_far = 0; i < multipath_aln_pairs.size(); i++) { + if (removed_so_far < to_remove.size() ? i == to_remove[removed_so_far] : false) { + removed_so_far++; } - - // remove the end positions that are now empty - multipath_aln_pairs.resize(multipath_aln_pairs.size() - to_remove.size()); - scores.resize(scores.size() - to_remove.size()); - cluster_pairs.resize(cluster_pairs.size() - to_remove.size()); - - // update the indexes of the marked single-end duplicates - for (size_t i = 0, removed_so_far = 0; i < duplicates_1.size(); i++) { - while (removed_so_far < to_remove.size() ? to_remove[removed_so_far] < duplicates_1[i] : false) { - removed_so_far++; + else if (removed_so_far > 0) { + // move these items into their new position + multipath_aln_pairs[i - removed_so_far] = move(multipath_aln_pairs[i]); + scores[i - removed_so_far] = scores[i]; + cluster_pairs[i - removed_so_far] = move(cluster_pairs[i]); + if (multiplicities) { + (*multiplicities)[i - removed_so_far] = (*multiplicities)[i]; } - duplicates_1[i] -= removed_so_far; } - - for (size_t i = 0, removed_so_far = 0; i < duplicates_2.size(); i++) { - while (removed_so_far < to_remove.size() ? 
to_remove[removed_so_far] < duplicates_2[i] : false) { - removed_so_far++; - } - duplicates_2[i] -= removed_so_far; + } + + // remove the end positions that are now empty + multipath_aln_pairs.resize(multipath_aln_pairs.size() - to_remove.size()); + scores.resize(scores.size() - to_remove.size()); + cluster_pairs.resize(cluster_pairs.size() - to_remove.size()); + if (multiplicities) { + multiplicities->resize(multiplicities->size() - to_remove.size()); + } + + // update the indexes of the marked single-end duplicates + for (size_t i = 0, removed_so_far = 0; i < duplicates_1.size(); i++) { + while (removed_so_far < to_remove.size() ? to_remove[removed_so_far] < duplicates_1[i] : false) { + removed_so_far++; } + duplicates_1[i] -= removed_so_far; } - // did we find any duplicates with the optimal pair? - if (duplicates_1.size() > 1 || duplicates_2.size() > 1 || !to_remove.empty()) { - // compute the mapping quality of the whole group of duplicates for each end - int32_t raw_mapq_1 = get_aligner()->compute_group_mapping_quality(scores, duplicates_1); - int32_t raw_mapq_2 = get_aligner()->compute_group_mapping_quality(scores, duplicates_2); - - // arbitrary scaling, seems to help performance - raw_mapq_1 *= mapq_scaling_factor; - raw_mapq_2 *= mapq_scaling_factor; - + for (size_t i = 0, removed_so_far = 0; i < duplicates_2.size(); i++) { + while (removed_so_far < to_remove.size() ? to_remove[removed_so_far] < duplicates_2[i] : false) { + removed_so_far++; + } + duplicates_2[i] -= removed_so_far; + } + } + + // did we find any duplicates with the optimal pair? + if (duplicates_1.size() > 1 || duplicates_2.size() > 1 || !to_remove.empty()) { + // compute the mapping quality of the whole group of duplicates for each end + auto aligner = get_aligner(!multipath_aln_pairs.front().first.quality().empty() && + !multipath_aln_pairs.front().second.quality().empty()); + + int32_t raw_mapq_1 = aligner->compute_group_mapping_quality(scores, duplicates_1, multiplicities); + int32_t raw_mapq_2 = aligner->compute_group_mapping_quality(scores, duplicates_2, multiplicities); + +#ifdef debug_multipath_mapper + cerr << "deduplicated raw MAPQs are " << raw_mapq_1 << " and " << raw_mapq_2 << endl; +#endif + + // arbitrary scaling, seems to help performance + int32_t mapq_1 = min(raw_mapq_1 * mapq_scaling_factor, max_mapping_quality); + int32_t mapq_2 = min(raw_mapq_2 * mapq_scaling_factor, max_mapping_quality); + #ifdef debug_multipath_mapper - cerr << "deduplicated raw MAPQs are " << raw_mapq_1 << " and " << raw_mapq_2 << endl; + cerr << "processed MAPQs are " << mapq_1 << " and " << mapq_2 << endl; #endif + + multipath_aln_pairs.front().first.set_mapping_quality(mapq_1); + multipath_aln_pairs.front().second.set_mapping_quality(mapq_2); + + if (report_allelic_mapq && (allelic_diff_1 != 0 || allelic_diff_2)) { + for (auto i : duplicates_1) { + scores[i] -= allelic_diff_1; + } + for (auto i : duplicates_2) { + scores[i] -= allelic_diff_2; + } + + int32_t raw_allelic_mapq_1 = aligner->compute_group_mapping_quality(scores, duplicates_1, multiplicities); + int32_t raw_allelic_mapq_2 = aligner->compute_group_mapping_quality(scores, duplicates_2, multiplicities); - int32_t mapq_1 = min(raw_mapq_1, max_mapping_quality); - int32_t mapq_2 = min(raw_mapq_2, max_mapping_quality); + int32_t allelic_mapq_1 = min(raw_mapq_1 * mapq_scaling_factor, max_mapping_quality); + int32_t allelic_mapq_2 = min(raw_mapq_2 * mapq_scaling_factor, max_mapping_quality); - multipath_aln_pairs.front().first.set_mapping_quality(mapq_1); - 
multipath_aln_pairs.front().second.set_mapping_quality(mapq_2); + if (allelic_mapq_1 != mapq_1) { + // other alleles might not place this read as confidently + multipath_aln_pairs.front().first.set_annotation("allelic_mapq", (double) allelic_mapq_1); + } + if (allelic_mapq_2 != mapq_2) { + // other alleles might not place this read as confidently + multipath_aln_pairs.front().second.set_annotation("allelic_mapq", (double) allelic_mapq_2); + } + + for (auto i : duplicates_1) { + scores[i] += allelic_diff_1; + } + for (auto i : duplicates_2) { + scores[i] += allelic_diff_2; + } } } } + + if (report_group_mapq) { + // TODO: this can include alignments that are later removed as being insigificant, but they shouldn't + // affect the sum much at all + size_t num_reporting = min(multipath_aln_pairs.size(), max_alt_mappings); + vector reporting_idxs(num_reporting, 0); + for (size_t i = 1; i < num_reporting; ++i) { + reporting_idxs[i] = i; + } + auto aligner = get_aligner(!multipath_aln_pairs.front().first.quality().empty() && + !multipath_aln_pairs.front().second.quality().empty()); + double raw_mapq = aligner->compute_group_mapping_quality(scores, reporting_idxs, + multiplicities); + + // TODO: for some reason set_annotation will accept a double but not an int + double group_mapq = min(max_mapping_quality, mapq_scaling_factor * raw_mapq); + for (size_t i = 0; i < num_reporting; ++i) { + multipath_aln_pairs[i].first.set_annotation("group_mapq", group_mapq); + multipath_aln_pairs[i].second.set_annotation("group_mapq", group_mapq); + } + } } double MultipathMapper::fragment_length_log_likelihood(int64_t length) const { double dev = length - fragment_length_distr.mean(); - return -dev * dev / (2.0 * fragment_length_distr.stdev() * fragment_length_distr.stdev()); + return -dev * dev / (2.0 * fragment_length_distr.std_dev() * fragment_length_distr.std_dev()); } void MultipathMapper::set_automatic_min_clustering_length(double random_mem_probability) { - min_clustering_mem_length = max(log(1.0 - pow(random_mem_probability, 1.0 / xindex->seq_length)) / log(0.25), 1); + min_clustering_mem_length = max(log(1.0 - pow(random_mem_probability, 1.0 / total_seq_length)) / log(0.25), 1); } - + + void MultipathMapper::set_min_softclip_length_for_splice(size_t length) { + min_softclip_length_for_splice = length; + + // find the lowest score that could correspond to a high quality match of this length + // TODO: kinda ugly, but whatever + string dummy_a(length, 'A'); + string dummy_c(length, 'C'); + string dummy_g(length, 'G'); + string dummy_t(length, 'T'); + int32_t score_a = get_regular_aligner()->score_exact_match(dummy_a); + int32_t score_c = get_regular_aligner()->score_exact_match(dummy_c); + int32_t score_g = get_regular_aligner()->score_exact_match(dummy_g); + int32_t score_t = get_regular_aligner()->score_exact_match(dummy_t); + int32_t lowest_score = min(score_a, min(score_c, min(score_g, score_t))); + + // add in the full length bonus, this is the criterion we will actually check against + string dummy_qual(length, char(40)); + min_softclipped_score_for_splice = lowest_score + get_regular_aligner()->score_full_length_bonus(false, dummy_a.begin(), + dummy_a.end(), + dummy_qual.begin()); + } + + void MultipathMapper::set_log_odds_against_splice(double log_odds) { + no_splice_natural_log_odds = log_odds; + no_splice_log_odds = round(log_odds / get_regular_aligner()->log_base); + } + + void MultipathMapper::set_intron_length_distribution(const vector& intron_mixture_weights, + const vector>& 
intron_component_params) { + splice_stats.update_intron_length_distribution(intron_mixture_weights, intron_component_params, *get_aligner()); + } + + void MultipathMapper::set_max_merge_supression_length() { + max_tail_merge_supress_length = ceil(double(get_regular_aligner()->match) / double(get_regular_aligner()->mismatch)); + } + + void MultipathMapper::set_read_1_adapter(const string& adapter) { + // TODO: magic number + string trimmed_adapter = adapter; + if (trimmed_adapter.size() > 12) { + trimmed_adapter.resize(12); + } + read_1_adapter = trimmed_adapter; + read_1_adapter_lps = make_prefix_suffix_table(read_1_adapter.c_str(), read_1_adapter.size()); + } + + void MultipathMapper::set_read_2_adapter(const string& adapter) { + // TODO: magic number + string trimmed_adapter = adapter; + if (trimmed_adapter.size() > 12) { + trimmed_adapter.resize(12); + } + read_2_adapter = reverse_complement(trimmed_adapter); + read_2_adapter_lps = make_prefix_suffix_table(read_2_adapter.c_str(), read_2_adapter.size()); + } + // make the memos live in this .o file thread_local unordered_map, haploMath::RRMemo> MultipathMapper::rr_memos; @@ -3685,20 +7445,8 @@ namespace vg { return rr_memos.at(make_pair(recombination_penalty, population_size)); } } - - double MultipathMapper::read_coverage_z_score(int64_t coverage, const Alignment& alignment) const { - /* algebraically equivalent to - * - * Coverage - ReadLen / 4 - * ------------------------------- - * sqrt(ReadLen * 1/4 * (1 - 1/4)) - * - * from the Normal approximation to a Binomal(ReadLen, 1/4) - */ - double root_len = sqrt(alignment.sequence().size()); - return 0.5773502691896258 * (4.0 * coverage / root_len - root_len); - } } + diff --git a/src/multipath_mapper.hpp b/src/multipath_mapper.hpp index aad825d2822..38ee19d239a 100644 --- a/src/multipath_mapper.hpp +++ b/src/multipath_mapper.hpp @@ -1,46 +1,39 @@ -// -// multipath_mapper.hpp -// -// -// +/** + * \file multipath_mapper.hpp + * + * Defines the MultipathMapper class + */ #ifndef multipath_mapper_hpp #define multipath_mapper_hpp -#include "hash_map.hpp" +#include +#include +#include +#include +#include +#include + #include "mapper.hpp" -#include "gssw_aligner.hpp" +#include "aligner.hpp" #include "types.hpp" #include "multipath_alignment.hpp" -#include "xg.hpp" -#include "vg.pb.h" -#include "position.hpp" -#include "nodeside.hpp" -#include "path.hpp" -#include "edit.hpp" #include "snarls.hpp" #include "haplotypes.hpp" +#include "snarl_distance_index.hpp" +#include "path_component_index.hpp" +#include "splicing.hpp" +#include "memoizing_graph.hpp" -#include "algorithms/extract_containing_graph.hpp" -#include "algorithms/extract_connecting_graph.hpp" -#include "algorithms/extract_extending_graph.hpp" -#include "algorithms/topological_sort.hpp" -#include "algorithms/weakly_connected_components.hpp" -#include "algorithms/is_acyclic.hpp" -#include "algorithms/is_single_stranded.hpp" -#include "algorithms/split_strands.hpp" -#include "algorithms/count_walks.hpp" -#include -#include +// note: only activated for single end mapping +//#define mpmap_instrument_mem_statistics using namespace std; using namespace haplo; using namespace structures; namespace vg { - - class MultipathMapper : public BaseMapper { public: @@ -49,30 +42,30 @@ namespace vg { // Interface //////////////////////////////////////////////////////////////////////// - MultipathMapper(xg::XG* xg_index, gcsa::GCSA* gcsa_index, gcsa::LCPArray* lcp_array, - haplo::ScoreProvider* haplo_score_provider = nullptr, SnarlManager* snarl_manager 
= nullptr); + MultipathMapper(PathPositionHandleGraph* graph, gcsa::GCSA* gcsa_index, gcsa::LCPArray* lcp_array, + haplo::ScoreProvider* haplo_score_provider = nullptr, SnarlManager* snarl_manager = nullptr, + SnarlDistanceIndex* distance_index = nullptr); ~MultipathMapper(); /// Map read in alignment to graph and make multipath alignments. void multipath_map(const Alignment& alignment, - vector& multipath_alns_out, - size_t max_alt_mappings); + vector& multipath_alns_out); /// Map a paired read to the graph and make paired multipath alignments. Assumes reads are on the /// same strand of the DNA/RNA molecule. If the fragment length distribution is still being estimated /// and the pair cannot be mapped unambiguously, adds the reads to a buffer for ambiguous pairs and /// does not output any multipath alignments. - void multipath_map_paired(const Alignment& alignment1, const Alignment& alignment2, - vector>& multipath_aln_pairs_out, - vector>& ambiguous_pair_buffer, - size_t max_alt_mappings); + /// Returns true if the output is properly paired, or false if it is independent end mappings. + bool multipath_map_paired(const Alignment& alignment1, const Alignment& alignment2, + vector>& multipath_aln_pairs_out, + vector>& ambiguous_pair_buffer); - /// Given a mapped MultipathAlignment, reduce it to up to - /// max_alt_mappings + 1 nonoverlapping single path alignments, with + /// Given a mapped multipath_alignment_t, reduce it to up to + /// max_number + 1 nonoverlapping single path alignments, with /// mapping qualities accounting for positional uncertainty between /// them. /// Even if the read is unmapped, there will always be at least one (possibly score 0) output alignment. - void reduce_to_single_path(const MultipathAlignment& multipath_aln, vector& alns_out, size_t max_alt_mappings) const; + void reduce_to_single_path(const multipath_alignment_t& multipath_aln, vector& alns_out, size_t max_number) const; /// Sets the minimum clustering MEM length to the approximate length that a MEM would have to be to /// have at most the given probability of occurring in random sequence of the same size as the graph @@ -80,40 +73,102 @@ namespace vg { /// Map random sequences against the graph to calibrate a parameterized distribution that detects /// when mappings are likely to have occurred by chance - void calibrate_mismapping_detection(size_t num_simulations = 1000, size_t simulated_read_length = 150); + void calibrate_mismapping_detection(size_t num_simulations, const vector& simulated_read_lengths); + + /// Experimental: skeleton code for predicting path distance from minimum distance + void determine_distance_correlation(); /// Should be called once after construction, or any time the band padding multiplier is changed void init_band_padding_memo(); + using AlignerClient::set_alignment_scores; + + /// Set the aligner scoring parameters and create the stored aligner instances. The + /// score matrix should be a 4 x 4 array in the order (ACGT) + void set_alignment_scores(const int8_t* score_matrix, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus); + + /// How big of a softclip should lead us to attempt spliced alignment? + void set_min_softclip_length_for_splice(size_t length); + + /// What should the prior odds against a spliced alignment be? 
+ void set_log_odds_against_splice(double log_odds); + + /// Use a non-default intron length distribution + void set_intron_length_distribution(const vector& intron_mixture_weights, + const vector>& intron_component_params); + + /// Decide how long of a tail alignment we want before we allow its subpath to be merged + void set_max_merge_supression_length(); + + /// Use adapter sequences to help identify soft-clips that should not be splice-aligned, sequences + /// should be ~12 bp presented in the orientation that a trimmable sequence is found in the + /// sequencing data (reverse complement to the actual sequence, since it is encountered on the + /// other read) + void set_read_1_adapter(const string& adapter); + void set_read_2_adapter(const string& adapter); + // parameters + size_t max_branch_trim_length = 1; + bool agglomerate_multipath_alns = false; int64_t max_snarl_cut_size = 5; + size_t max_tail_merge_supress_length = 4; + bool suppress_tail_anchors = false; + size_t min_tail_anchor_length = 3; double band_padding_multiplier = 1.0; + bool use_pessimistic_tail_alignment = false; + double pessimistic_gap_multiplier = 0.0; + bool restrained_graph_extraction = false; size_t max_expected_dist_approx_error = 8; int32_t num_alt_alns = 4; + size_t max_dagify_duplications = 10; double mem_coverage_min_ratio = 0.5; + double truncation_multiplicity_mq_limit = 7.0; double max_suboptimal_path_score_ratio = 2.0; size_t num_mapping_attempts = 48; double log_likelihood_approx_factor = 1.0; size_t min_clustering_mem_length = 0; + bool use_stripped_match_alg = false; + size_t stripped_match_alg_strip_length = 16; + size_t stripped_match_alg_max_length = 0; + size_t stripped_match_alg_target_count = 5; + bool use_fanout_match_alg = false; + int max_fanout_base_quality = 20; + int max_fans_out = 5; size_t max_p_value_memo_size = 500; - size_t band_padding_memo_size = 500; - double pseudo_length_multiplier = 1.65; - double max_mapping_p_value = 0.00001; - bool unstranded_clustering = true; + size_t band_padding_memo_size = 2000; + double max_exponential_rate_intercept = 0.612045; + double max_exponential_rate_slope = 0.000555181; + double max_exponential_shape_intercept = 12.136; + double max_exponential_shape_slope = 0.0113637; + double max_mapping_p_value = 0.0001; + double max_rescue_p_value = 0.1; + size_t max_alt_mappings = 1; size_t max_single_end_mappings_for_rescue = 64; size_t max_rescue_attempts = 32; size_t plausible_rescue_cluster_coverage_diff = 5; size_t secondary_rescue_attempts = 4; double secondary_rescue_score_diff = 1.0; - double mapq_scaling_factor = 1.0 / 4.0; + bool get_rescue_graph_from_paths = true; + double rescue_graph_std_devs = 6.0; + double splice_rescue_graph_std_devs = 3.0; + double mapq_scaling_factor = 1.0; + bool report_group_mapq = false; + bool report_allelic_mapq = false; // There must be a ScoreProvider provided, and a positive population_max_paths, if this is true bool use_population_mapqs = false; + // If this is nonzero, it takes precedence over any haplotype count + // available from the score provider or the XG index. If neither of + // those has a haplotype count, this must be set for haplotype scoring + // to work. + size_t force_haplotype_count = 0; // If this is set, use_population_mapqs must be set, and we will always // try to compute population scores, even if there is nothing to // disambiguate. This lets us get an accurate count of scorable reads. 
bool always_check_population = false; size_t population_max_paths = 10; + size_t population_paths_hard_cap = 1000; + bool top_tracebacks = false; // Note that, like the haplotype scoring code, we work with recombiantion penalties in exponent form. double recombination_penalty = 20.7; // 20.7 = 9 * 2.3 size_t rescue_only_min = 128; @@ -122,9 +177,32 @@ namespace vg { int32_t secondary_rescue_subopt_diff = 10; size_t min_median_mem_coverage_for_split = 0; bool suppress_cluster_merging = false; + bool suppress_multicomponent_splitting = false; size_t alt_anchor_max_length_diff = 5; bool dynamic_max_alt_alns = false; bool simplify_topologies = false; + double prune_subpaths_multiplier = 2.0; + bool use_tvs_clusterer = false; + bool use_min_dist_clusterer = false; + bool greedy_min_dist = false; + bool component_min_dist = false; + bool no_clustering = false; + // length of reversing walks during graph extraction + size_t reversing_walk_length = 0; + bool suppress_p_value_memoization = false; + size_t fragment_length_warning_factor = 0; + size_t max_alignment_gap = 5000; + bool suppress_mismapping_detection = false; + bool do_spliced_alignment = false; + int64_t max_softclip_overlap = 8; + int64_t max_splice_overhang = 3; + int64_t min_splice_rescue_matches = 6; + // about 250k + int64_t max_intron_length = 1 << 18; + int64_t max_splice_ref_search_length = 32; + // the maximum number of pairs of each motif that we will consider during spliced alignment + size_t max_motif_pairs = 1024; + unordered_set ref_path_handles; //static size_t PRUNE_COUNTER; //static size_t SUBGRAPH_TOTAL; @@ -132,99 +210,125 @@ namespace vg { //static size_t SECONDARY_RESCUE_ATTEMPT; //static size_t SECONDARY_RESCUE_TOTAL; - /// We often pass around clusters of MEMs and their graph positions. - using memcluster_t = vector>; + /// We often pass around clusters of MEMs and their graph positions, paired with a multiplicity + using memcluster_t = pair>, double>; /// This represents a graph for a cluster, and holds a pointer to the /// actual extracted graph, a list of assigned MEMs, and the number of /// bases of read coverage that that MEM cluster provides (which serves /// as a priority). - using clustergraph_t = tuple; + using clustergraph_t = tuple, memcluster_t, size_t>; + + /// Represents the mismatches that were allowed in "MEMs" from the fanout + /// match algorithm + using match_fanouts_t = unordered_map>>; + + /// Unique identifier for an unaligned splicing candidate. Specified by: + /// - Cluster candidate: (is read 1, cluster index, nullptr, pos_t()) + /// - Hit candidate: (is read 1, -1, MEM, position) + using candidate_id_t = tuple; protected: - /// Wrapped internal function that allows some code paths to circumvent the current - /// mapping quality method option. - void multipath_map_internal(const Alignment& alignment, - MappingQualityMethod mapq_method, - vector& multipath_alns_out, - size_t max_alt_mappings); + /// Enum for the strand of a splice alignment's splice motifs + enum SpliceStrand {Undetermined, Forward, Reverse}; /// Before the fragment length distribution has been estimated, look for an unambiguous mapping of /// the reads using the single ended routine. If we find one record the fragment length and report /// the pair, if we don't find one, add the read pair to a buffer instead of the output vector. 
- void attempt_unpaired_multipath_map_of_pair(const Alignment& alignment1, const Alignment& alignment2, - vector>& multipath_aln_pairs_out, + /// Returns true if we successfully find a measurable pair. + bool attempt_unpaired_multipath_map_of_pair(const Alignment& alignment1, const Alignment& alignment2, + vector>& multipath_aln_pairs_out, vector>& ambiguous_pair_buffer); - /// Extracts a section of graph at a distance from the MultipathAlignment based on the fragment length + /// Extracts a section of graph at a distance from the multipath_alignment_t based on the fragment length /// distribution and attempts to align the other paired read to it. If rescuing forward, assumes the - /// provided MultipathAlignment is the first read and vice versa if rescuing backward. Rescue constructs - /// a conventional local alignment with gssw and converts the Alignment to a MultipathAlignment. The - /// MultipathAlignment will be stored in the object passed by reference as an argument. - bool attempt_rescue(const MultipathAlignment& multipath_aln, const Alignment& other_aln, - bool rescue_forward, MultipathAlignment& rescue_multipath_aln); - + /// provided multipath_alignment_t is the first read and vice versa if rescuing backward. Rescue constructs + /// a conventional local alignment with gssw and converts the Alignment to a multipath_alignment_t. The + /// multipath_alignment_t will be stored in the object passed by reference as an argument. + bool attempt_rescue(const multipath_alignment_t& multipath_aln, const Alignment& other_aln, + bool rescue_forward, multipath_alignment_t& rescue_multipath_aln); + + /// Make an alignment to a rescue graph and translate it back to the original node space + /// Returns false if the alignment fails, but does not check statistical significance + bool do_rescue_alignment(const multipath_alignment_t& multipath_aln, const Alignment& other_aln, + bool rescue_forward, multipath_alignment_t& rescue_multipath_aln, + double rescue_mean_length, double num_std_devs) const; + + /// Use the algorithm implied by the mapper settings to extract a subgraph to perform a rescue alignment against + void extract_rescue_graph(const multipath_alignment_t& multipath_aln, const Alignment& other_aln, + bool rescue_forward, MutableHandleGraph* rescue_graph, + double rescue_mean_length, double num_std_devs) const; /// After clustering MEMs, extracting graphs, and assigning hits to cluster graphs, perform - /// multipath alignment + /// multipath alignment. + /// Produces topologically sorted multipath_alignment_ts. void align_to_cluster_graphs(const Alignment& alignment, - MappingQualityMethod mapq_method, vector& cluster_graphs, - vector& multipath_alns_out, + vector& multipath_alns_out, + vector& multiplicities_out, size_t num_mapping_attempts, + const match_fanouts_t* fanouts = nullptr, vector* cluster_idxs = nullptr); /// After clustering MEMs, extracting graphs, assigning hits to cluster graphs, and determining /// which cluster graph pairs meet the fragment length distance constraints, perform multipath /// alignment + /// Produces topologically sorted multipath_alignment_ts. 
void align_to_cluster_graph_pairs(const Alignment& alignment1, const Alignment& alignment2, vector& cluster_graphs1, vector& cluster_graphs2, + vector>& multipath_aln_pairs_out, vector, int64_t>>& cluster_pairs, - vector>& multipath_aln_pairs_out, + vector& pair_multiplicities, vector>& duplicate_pairs_out, - OrientedDistanceClusterer::paths_of_node_memo_t* paths_of_node_memo = nullptr, - OrientedDistanceClusterer::oriented_occurences_memo_t* oriented_occurences_memo = nullptr, - OrientedDistanceClusterer::handle_memo_t* handle_memo = nullptr); + const match_fanouts_t* fanouts1, const match_fanouts_t* fanouts2); /// Align the read ends independently, but also try to form rescue alignments for each from /// the other. Return true if output obeys pair consistency and false otherwise. + /// Produces topologically sorted multipath_alignment_ts. bool align_to_cluster_graphs_with_rescue(const Alignment& alignment1, const Alignment& alignment2, vector& cluster_graphs1, vector& cluster_graphs2, - bool block_rescue_from_1, bool block_rescue_from_2, - vector>& multipath_aln_pairs_out, - vector, int64_t>>& pair_distances, - size_t max_alt_mappings); - - /// Use the rescue routine on strong suboptimal clusters to see if we can find a good secondary + vector& mems1, + vector& mems2, + vector>& multipath_aln_pairs_out, + vector, int64_t>>& pair_distances_out, + vector& pair_multiplicities_out, + const match_fanouts_t* fanouts1, const match_fanouts_t* fanouts2); + + /// Use the rescue routine on strong suboptimal clusters to see if we can find a good secondary. + /// Produces topologically sorted multipath_alignment_ts. void attempt_rescue_for_secondaries(const Alignment& alignment1, const Alignment& alignment2, vector& cluster_graphs1, vector& cluster_graphs2, vector>& duplicate_pairs, - vector>& multipath_aln_pairs_out, - vector, int64_t>>& cluster_pairs); - - /// Cluster and extract subgraphs for (possibly) only one end, meant to be a non-repeat, and use them to rescue - /// an alignment for the other end, meant to be a repeat - void attempt_rescue_of_repeat_from_non_repeat(const Alignment& alignment1, const Alignment& alignment2, - const vector& mems1, const vector& mems2, - bool do_repeat_rescue_from_1, bool do_repeat_rescue_from_2, - vector& clusters1, vector& clusters2, - vector& cluster_graphs1, vector& cluster_graphs2, - vector>& multipath_aln_pairs_out, - vector, int64_t>>& pair_distances, - size_t max_alt_mappings, - OrientedDistanceClusterer::paths_of_node_memo_t* paths_of_node_memo = nullptr, - OrientedDistanceClusterer::oriented_occurences_memo_t* oriented_occurences_memo = nullptr, - OrientedDistanceClusterer::handle_memo_t* handle_memo = nullptr); + vector>& multipath_aln_pairs_out, + vector, int64_t>>& cluster_pairs, + vector& pair_multiplicities, + const match_fanouts_t* fanouts1, const match_fanouts_t* fanouts2); /// Merge the rescued mappings into the output vector and deduplicate pairs - void merge_rescued_mappings(vector>& multipath_aln_pairs_out, + void merge_rescued_mappings(vector>& multipath_aln_pairs_out, vector, int64_t>>& cluster_pairs, - vector>& rescued_multipath_aln_pairs, - vector, int64_t>>& rescued_cluster_pairs) const; + vector& pair_multiplicities, + vector>& rescued_multipath_aln_pairs, + vector, int64_t>>& rescued_cluster_pairs, + vector& rescued_multiplicities) const; + + /// Use the oriented distance clusterer or the TVS clusterer to cluster MEMs depending on parameters. + /// If using the oriented distance clusterer, must also provide an oriented distance measurer. 
+ vector get_clusters(const Alignment& alignment, const vector& mems, + OrientedDistanceMeasurer* distance_measurer = nullptr, + const match_fanouts_t* fanouts = nullptr) const; + + /// Use the oriented distance clusterer or the TVS clusterer to cluster pairs of clusters. Assumes that + /// the fragment length distribution has been estimated and fixed. + vector, int64_t>> get_cluster_pairs(const Alignment& alignment1, + const Alignment& alignment2, + vector& cluster_graphs1, + vector& cluster_graphs2, + OrientedDistanceMeasurer* distance_measurer = nullptr); /// Extracts a subgraph around each cluster of MEMs that encompasses any /// graph position reachable (according to the Mapper's aligner) with @@ -235,129 +339,363 @@ namespace vg { /// caller must delete the VG objects produced! vector query_cluster_graphs(const Alignment& alignment, const vector& mems, - const vector& clusters); - - /// If there are any MultipathAlignments with multiple connected components, split them - /// up and add them to the return vector - void split_multicomponent_alignments(vector& multipath_alns_out, - vector* cluster_idxs = nullptr) const; - - /// If there are any MultipathAlignments with multiple connected components, split them + const vector& clusters) const; + + /// Return a graph (on the heap) that contains a cluster. The paired bool + /// indicates whether the graph is known to be connected (but it is possible + /// for the graph to be connected and have it return false) + pair, bool> extract_cluster_graph(const Alignment& alignment, + const memcluster_t& mem_cluster) const; + + /// Extract a graph that is guaranteed to contain all local alignments that include + /// the MEMs of the cluster. The paired bool indicates whether the graph is + /// known to be connected (but it is possible for the graph to be connected and have + /// it return false) + pair, bool> extract_maximal_graph(const Alignment& alignment, + const memcluster_t& mem_cluster) const; + + /// Extract a graph with an algorithm that tries to extract not much more than what + /// is required to contain the cluster in a single connected component (can be slower + /// than the maximal algorithm for alignments that require large indels), The paired bool + /// indicates whether the graph is known to be connected (but it is possible + /// for the graph to be connected and have it return false) + pair, bool> extract_restrained_graph(const Alignment& alignment, + const memcluster_t& mem_cluster) const; + + /// Returns the union of the intervals on the read that a cluster cover in sorted order + vector> covered_intervals(const Alignment& alignment, + const clustergraph_t& cluster) const; + + /// If there are any multipath_alignment_ts with multiple connected components, split them + /// up and add them to the return vector. + /// Properly handles multipath_alignment_ts that are unmapped. + /// Does not depend on or guarantee topological order in the multipath_alignment_ts. 
+ void split_multicomponent_alignments(vector& multipath_alns_out, + const Alignment* alignment = nullptr, + vector* cluster_graphs = nullptr, + vector* cluster_idxs = nullptr, + vector* multiplicities = nullptr) const; + + /// If there are any multipath_alignment_ts with multiple connected components, split them /// up and add them to the return vector, also measure the distance between them and add - /// a record to the cluster pairs vector - void split_multicomponent_alignments(vector>& multipath_aln_pairs_out, - vector, int64_t>>& cluster_pairs) const; - + /// a record to the cluster pairs vector. + /// Properly handles multipath_alignment_ts that are unmapped. + /// Does not depend on or guarantee topological order in the multipath_alignment_ts. + void split_multicomponent_alignments(const Alignment& alignment1, const Alignment& alignment2, + vector>& multipath_aln_pairs_out, + vector& cluster_graphs1, + vector& cluster_graphs2, + vector, int64_t>>& cluster_pairs, + vector& multiplicities) const; + + /// If the alignment seems very complicated, try to simplify low-scoring parts out of it + void simplify_complicated_multipath_alignment(multipath_alignment_t& multipath_aln) const; + + /// Helper function to be called by split_multicomponent_alignments to reassign hits to the + /// split clusters + void reassign_split_clusters(const Alignment& alignment, + vector& cluster_graphs, + const vector& split_mp_alns, + const vector& cluster_assignments, + const vector& all_cluster_assignments) const; + + /// Combine all of the significant alignments into one. Requires alignments to be sorted by + /// significance already + void agglomerate_alignments(vector& multipath_alns_out, + vector* multiplicities = nullptr); + + /// Combine all of the significant alignments into one pair. 
Requires alignments to be sorted by + /// significance already + void agglomerate_alignment_pairs(vector>& multipath_aln_pairs_out, + vector, int64_t>>& cluster_pairs, + vector& multiplicities); + + /// Before returning, remove alignments that are likely noise and add a placeholder + /// for an unmapped read if necessary + void purge_unmapped_alignments(vector& multipath_alns_out); + + /// Before returning, remove alignments that are likely noise and add placeholders + /// for unmapped reads if necessary + void purge_unmapped_alignments(vector>& multipath_aln_pairs_out, + bool proper_paired); + + /// The internal agglomeration procedure + void agglomerate(size_t idx, multipath_alignment_t& agglomerating, const multipath_alignment_t& multipath_aln, + vector& agglomerated_group, unordered_set& agg_start_positions, + unordered_set& agg_end_positions) const; + + /// Look for spliced alignments among the results of various stages in the mapping algorithm + /// Returns true if any spliced alignments were made + bool find_spliced_alignments(const Alignment& alignment, vector& multipath_alns_out, + vector& multiplicities, vector& cluster_idxs, + const vector& mems, vector& cluster_graphs, + const match_fanouts_t* fanouts = nullptr, + const multipath_alignment_t* rescue_anchor = nullptr, + bool rescue_left = false, + double rescue_multiplicity = 1.0); + + /// Look for spliced alignments among the results of various stages in the mapping algorithm for pairs + /// Returns true if any spliced alignments were made + bool find_spliced_alignments(const Alignment& alignment1, const Alignment& alignment2, + vector>& multipath_aln_pairs_out, + vector, int64_t>>& cluster_pairs, + vector& pair_multiplicities, + const vector& mems1, const vector& mems2, + vector& cluster_graphs1, vector& cluster_graphs2, + const match_fanouts_t* fanouts = nullptr); + + /// Find candidates for spliced alignment sections for a given multipath alignment among the + /// aligned clusters + void identify_aligned_splice_candidates(const Alignment& alignment, bool search_left, + const pair& primary_interval, + const vector& multipath_alns, + const vector& cluster_idxs, + const vector& current_index, int64_t anchor, + unordered_set& clusters_used_out, + vector& mp_aln_candidates_out) const; + + /// Find candidates for spliced alignment sections for a given multipath alignment among the + /// aligned cluster pairs + void identify_aligned_splice_candidates(const Alignment& alignment, bool read_1, bool search_left, + const pair& primary_interval, + const vector>& multipath_aln_pairs, + const vector, int64_t>>& cluster_pairs, + const vector& current_index, int64_t anchor, + unordered_set& clusters_used_out, + vector& mp_aln_candidates_out) const; + + /// Find candidates for spliced alignment sections for a given multipath alignment among the + /// unaligned clusters and MEMs + void identify_unaligned_splice_candidates(const Alignment& alignment, bool search_left, + const pair& primary_interval, + const vector& mems, + const vector& cluster_graphs, + const unordered_set& clusters_already_used, + vector& cluster_candidates_out, + vector>& hit_candidates_out) const; + + /// Make alignments for the splice alignment cancidates from MEMs and unaligned clusters + void align_to_splice_candidates(const Alignment& alignment, + vector& cluster_graphs, + const vector& mems, + const vector& cluster_candidates, + const vector>& hit_candidates, + const pair& primary_interval, + bool searching_left, + bool is_read_1, + unordered_map>& 
unaligned_candidate_bank, + vector& candidates_out, + const match_fanouts_t* mem_fanouts = nullptr) const; + + /// Check whether splice segment candidates can form a statistically significant spliced + /// alignment. Returns true if a spliced alignment is made + bool test_splice_candidates(const Alignment& alignment, bool searching_left, + multipath_alignment_t& anchor_mp_aln, double* anchor_multiplicity_out, + SpliceStrand& strand, int64_t num_candidates, + const function& get_candidate, + const function& get_multiplicity, + const function& consume_candidate) const; + + /// Try to rescue an anchor for a missing spliced alignment section between + /// the reads in a pair + bool attempt_rescue_for_splice_segment(const Alignment& alignment, + const pair& primary_interval, + const multipath_alignment_t& rescue_anchor, + bool rescue_left, multipath_alignment_t& rescued_out) const; + + /// See if we can find a spliced alignment segment by aligning between the pair + bool find_rescuable_spliced_alignments(const Alignment& alignment, + multipath_alignment_t& splice_anchor, + double& anchor_multiplicity, + SpliceStrand& strand, + const multipath_alignment_t& rescue_anchor, + double rescue_multiplicity, + bool rescue_left, + const pair& primary_interval) const; + + /// Check if any of the unpaired spliced alignments can make pairs now + /// If any pairs are identified, can invalidate the input alignments + bool retry_pairing_spliced_alignments(const Alignment& alignment1, const Alignment& alignment2, + vector& multipath_alns_1, + vector& multipath_alns_2, + const vector& cluster_idxs_1, + const vector& cluster_idxs_2, + const vector& multiplicities_1, + const vector& multiplicities_2, + vector>& multipath_aln_pairs_out, + vector, int64_t>>& cluster_pairs_out, + vector& pair_multiplicities_out) const; + /// Make a multipath alignment of the read against the indicated graph and add it to /// the list of multimappings. - void multipath_align(const Alignment& alignment, VG* vg, - memcluster_t& graph_mems, - MultipathAlignment& multipath_aln_out) const; + /// Does NOT necessarily produce a multipath_alignment_t in topological order. + void multipath_align(const Alignment& alignment, + clustergraph_t& cluster_graph, + multipath_alignment_t& multipath_aln_out, + const match_fanouts_t* fanouts) const; + + /// If any softclips could have arisen because not enough graph was extracted, extract + /// extra graph in those areas. Returns true if the graph was expanded. + bool expand_for_softclips(clustergraph_t& cluster_graph, + const multipath_alignment_t& multipath_aln) const; /// Removes the sections of an Alignment's path within snarls and re-aligns them with multiple traceback - /// to create a multipath alignment with non-trivial topology - void make_nontrivial_multipath_alignment(const Alignment& alignment, VG& subgraph, - unordered_map>& translator, - SnarlManager& snarl_manager, MultipathAlignment& multipath_aln_out) const; + /// to create a multipath alignment with non-trivial topology. + /// Guarantees that the resulting multipath_alignment_t is in topological order. 
+ void make_nontrivial_multipath_alignment(const Alignment& alignment, const HandleGraph& subgraph, + const function(id_t)>& translator, + multipath_alignment_t& multipath_aln_out) const; /// Remove the full length bonus from all source or sink subpaths that received it - void strip_full_length_bonuses(MultipathAlignment& multipath_aln) const; + void strip_full_length_bonuses(multipath_alignment_t& multipath_aln) const; + + /// Returns a vector of log-likelihoods for each mapping + vector mapping_likelihoods(vector& multipath_alns) const; + + /// Returns a vector of log-likelihoods for each pair mapping + vector pair_mapping_likelihoods(vector>& multipath_aln_pairs, + const vector, int64_t>>& cluster_pairs) const; /// Compute a mapping quality from a list of scores, using the selected method. - int32_t compute_raw_mapping_quality_from_scores(const vector& scores, MappingQualityMethod mapq_method) const; + /// Optionally considers non-present duplicates of the scores encoded as multiplicities + /// Depending on settings, may only return mapping qualities for a prefix of the scores + vector compute_raw_mapping_qualities_from_scores(const vector& scores, bool have_qualities, + const vector* multiplicities = nullptr) const; + - /// Sorts mappings by score and store mapping quality of the optimal alignment in the MultipathAlignment object + /// Sorts mappings by score and store mapping quality of the optimal alignment in the multipath_alignment_t object /// Optionally also sorts a vector of indexes to keep track of the cluster-of-origin - void sort_and_compute_mapping_quality(vector& multipath_alns, MappingQualityMethod mapq_method, - vector* cluster_idxs = nullptr) const; + /// Allows multipath alignments where the best single path alignment is leaving the read unmapped. + /// multipath_alignment_ts MUST be topologically sorted. + void sort_and_compute_mapping_quality(vector& multipath_alns, + vector* cluster_idxs = nullptr, vector* multiplicities = nullptr) const; - /// Sorts mappings by score and store mapping quality of the optimal alignment in the MultipathAlignment object + /// Sorts mappings by score and store mapping quality of the optimal alignment in the multipath_alignment_t object /// If there are ties between scores, breaks them by the expected distance between pairs as computed by the /// OrientedDistanceClusterer::cluster_pairs function (modified cluster_pairs vector) - void sort_and_compute_mapping_quality(vector>& multipath_aln_pairs, + /// Allows multipath alignments where the best single path alignment is leaving the read unmapped. + /// multipath_alignment_ts MUST be topologically sorted. 
+ /// Optionally considers non-present duplicates of the scores encoded as multiplicities + void sort_and_compute_mapping_quality(vector>& multipath_aln_pairs, vector, int64_t>>& cluster_pairs, - vector>* duplicate_pairs_out = nullptr) const; - - /// Estimates the probability that the correct cluster was not chosen as a cluster to rescue from and caps the - /// mapping quality to the minimum of the current mapping quality and this probability (in Phred scale) - void cap_mapping_quality_by_rescue_probability(vector>& multipath_aln_pairs_out, - vector, int64_t>>& cluster_pairs, - vector& cluster_graphs1, - vector& cluster_graphs2, - bool from_secondary_rescue) const; - - /// Estimates the probability that the correct cluster was not identified because of sub-sampling MEM hits and - /// caps the mapping quality to this probability (in Phred scale) - void cap_mapping_quality_by_hit_sampling_probability(vector& multipath_alns_out, - vector& cluster_idxs, - vector& cluster_graphs) const; - - /// Estimates the probability that the correct cluster pair was not identified because of sub-sampling MEM hits and - /// caps the mapping quality to this probability (in Phred scale) - void cap_mapping_quality_by_hit_sampling_probability(vector>& multipath_aln_pairs_out, - vector, int64_t>>& cluster_pairs, - vector& cluster_graphs1, - vector& cluster_graphs2, - bool did_secondary_rescue) const; - - /// Estimates the probability that a cluster with the same hits would have been missed because of - /// subsampling high-count SMEMs - double prob_equivalent_clusters_hits_missed(const memcluster_t& cluster) const; + vector>* duplicate_pairs_out = nullptr, + vector* pair_multiplicities = nullptr) const; + + /// Estimates the number of equivalent mappings (including this one), which we may not have seen due to + /// unexplored rescues. + double estimate_missed_rescue_multiplicity(size_t which_pair, + const vector, int64_t>>& cluster_pairs, + const vector& cluster_graphs1, + const vector& cluster_graphs2, + bool from_secondary_rescue) const; + + /// Estimates the number of equivalent mappings (including this one), which we may not have seen due to + /// limits on the numbers of hits returns for a MEM + double cluster_multiplicity(const memcluster_t& cluster) const; + + /// Estimates the number of equivalent pair mappings (including this one), which we may not have seen due to + /// limits on the numbers of hits returns for a MEM + double pair_cluster_multiplicity(const memcluster_t& cluster_1, const memcluster_t& cluster_2) const; /// Computes the log-likelihood of a given fragment length in the trained distribution double fragment_length_log_likelihood(int64_t length) const; /// Computes the number of read bases a cluster of MEM hits covers. 
- static int64_t read_coverage(const memcluster_t& mem_hits); + static void set_read_coverage(clustergraph_t& cluster_graph); + + /// Would an alignment this good be expected against a graph this big by chance alone + bool likely_mismapping(const multipath_alignment_t& multipath_aln); /// Would an alignment this good be expected against a graph this big by chance alone - bool likely_mismapping(const MultipathAlignment& multipath_aln); + bool likely_misrescue(const multipath_alignment_t& multipath_aln); /// A scaling of a score so that it approximately follows the distribution of the longest match in p-value test - size_t pseudo_length(const MultipathAlignment& multipath_aln) const; + int64_t pseudo_length(const multipath_alignment_t& multipath_aln) const; /// The approximate p-value for a match length of the given size against the current graph - double random_match_p_value(size_t match_length, size_t read_length); + double random_match_p_value(int64_t match_length, size_t read_length); + + /// Reorganizes the fan-out breaks into the format that MultipathAlignmentGraph wants it in + match_fanouts_t record_fanouts(const vector& mems, + vector>>& fanouts) const; + + /// Get a distance measurer based on the configuration of the mapper + unique_ptr get_distance_measurer(MemoizingGraph& memoizing_graph) const; /// Compute the approximate distance between two multipath alignments - int64_t distance_between(const MultipathAlignment& multipath_aln_1, const MultipathAlignment& multipath_aln_2, + /// If either is unmapped, or the distance cannot be obtained, returns numeric_limits::max() + int64_t distance_between(const multipath_alignment_t& multipath_aln_1, const multipath_alignment_t& multipath_aln_2, bool full_fragment = false, bool forward_strand = false) const; + int64_t distance(const pos_t& pos_1, const pos_t& pos_2) const; + /// Are two multipath alignments consistently placed based on the learned fragment length distribution? - bool are_consistent(const MultipathAlignment& multipath_aln_1, const MultipathAlignment& multipath_aln_2) const; + bool are_consistent(const multipath_alignment_t& multipath_aln_1, const multipath_alignment_t& multipath_aln_2) const; /// Is this a consistent inter-pair distance based on the learned fragment length distribution? bool is_consistent(int64_t distance) const; - /// Computes the Z-score of the number of matches against an equal length random DNA string. - double read_coverage_z_score(int64_t coverage, const Alignment& alignment) const; - /// Return true if any of the initial positions of the source Subpaths are shared between the two /// multipath alignments - bool share_terminal_positions(const MultipathAlignment& multipath_aln_1, const MultipathAlignment& multipath_aln_2) const; + bool share_terminal_positions(const multipath_alignment_t& multipath_aln_1, const multipath_alignment_t& multipath_aln_2) const; /// Get a thread_local RRMemo with these parameters - haploMath::RRMemo& get_rr_memo(double recombination_penalty, size_t population_size) const;; + haploMath::RRMemo& get_rr_memo(double recombination_penalty, size_t population_size) const; /// Detects if each pair can be assigned to a consistent strand of a path, and if not removes them. 
Also /// inverts the distances in the cluster pairs vector according to the strand - void establish_strand_consistency(vector>& multipath_aln_pairs, - vector, int64_t>>& cluster_pairs, - OrientedDistanceClusterer::paths_of_node_memo_t* paths_of_node_memo = nullptr, - OrientedDistanceClusterer::oriented_occurences_memo_t* oriented_occurences_memo = nullptr, - OrientedDistanceClusterer::handle_memo_t* handle_memo = nullptr); + void establish_strand_consistency(vector>& multipath_aln_pairs, + vector, int64_t>>& cluster_pairs); + /// A restrained estimate of the amount of gap we would like to align for a read tail + int64_t pessimistic_gap(int64_t length, double multiplier) const; + + /// Return exact matches according to the object's parameters + /// If using the fan-out algorithm, we can optionally leave fan-out MEMs in tact and + /// return a vector of their breaks. + vector find_mems(const Alignment& alignment, + vector>>* mem_fanout_breaks = nullptr); + + string read_1_adapter = ""; + string read_2_adapter = ""; + vector read_1_adapter_lps; + vector read_2_adapter_lps; + + int64_t min_softclip_length_for_splice = 20; + int64_t min_softclipped_score_for_splice = 25; + + // log odds against finding a spliced alignment, in natural log + double no_splice_natural_log_odds = 22.55; + // log odds against finding a spliced alignment, in same log base as score + int32_t no_splice_log_odds = 16; + + DinucleotideMachine dinuc_machine; + SpliceStats splice_stats; SnarlManager* snarl_manager; + SnarlDistanceIndex* distance_index; + unique_ptr path_component_index; + + static const size_t RESCUED; /// Memos used by population model static thread_local unordered_map, haploMath::RRMemo> rr_memos; // a memo for the transcendental p-value function (thread local to maintain threadsafety) - static thread_local unordered_map, double> p_value_memo; + static thread_local unordered_map, double> p_value_memo; + + // a memo for the transcendental restrained extraction function (thread local to maintain threadsafety) + static thread_local unordered_map> pessimistic_gap_memo; + static const size_t gap_memo_max_size; // a memo for transcendental band padidng function (gets initialized at construction) vector band_padding_memo; + +#ifdef mpmap_instrument_mem_statistics + public: + ofstream _mem_stats; + bool _wrote_mem_stats_header = false; +#endif }; } diff --git a/src/nested_traversal_finder.cpp b/src/nested_traversal_finder.cpp index c19bc4962eb..07b1ddf290c 100644 --- a/src/nested_traversal_finder.cpp +++ b/src/nested_traversal_finder.cpp @@ -60,8 +60,8 @@ vector NestedTraversalFinder::find_traversals(const Snarl& site) }; // Get our contained nodes and edges - unordered_set nodes; - unordered_set edges; + unordered_set nodes; + unordered_set edges; // Grab them, including child boundaries but not our boundaries (which we're // guaranteed to visit) @@ -69,8 +69,8 @@ vector NestedTraversalFinder::find_traversals(const Snarl& site) for(auto it = nodes.begin(); it != nodes.end(); ) { // For each node - if (snarl_manager.into_which_snarl((*it)->id(), false) || - snarl_manager.into_which_snarl((*it)->id(), true)) { + if (snarl_manager.into_which_snarl(*it, false) || + snarl_manager.into_which_snarl(*it, true)) { // If the node is a child boundary, don't use it. Use visits to the // child instead. 
@@ -82,12 +82,18 @@ vector NestedTraversalFinder::find_traversals(const Snarl& site) } } - for (Node* node : nodes) { + for (id_t node_id : nodes) { // Find bubbles for nodes + Node* node = augmented.graph.get_node(node_id); emit_path(find_bubble(node, nullptr, nullptr, site)); } - for (Edge* edge : edges) { + for (const edge_t& edge_handle : edges) { + Edge* edge = augmented.graph.get_edge( + NodeTraversal(augmented.graph.get_node(augmented.graph.get_id(edge_handle.first)), + augmented.graph.get_is_reverse(edge_handle.first)), + NodeTraversal(augmented.graph.get_node(augmented.graph.get_id(edge_handle.second)), + augmented.graph.get_is_reverse(edge_handle.second))); // Find bubbles for edges emit_path(find_bubble(nullptr, edge, nullptr, site)); } @@ -182,7 +188,7 @@ Support NestedTraversalFinder::min_support_in_path(const vector& path) { function support_for_visit = [&](const Visit& v) { if (v.node_id()) { // This is a node visit - Node* node = augmented.graph.get_node(v.node_id()); + id_t node = v.node_id(); // Return the support for it, or 0 if it's not in the map. return augmented.node_supports.count(node) ? augmented.node_supports.at(node) : Support(); @@ -205,8 +211,12 @@ Support NestedTraversalFinder::min_support_in_path(const vector& path) { min_support = support_min(min_support, support_for_visit(*next)); // check the edge support - Edge* edge = augmented.graph.get_edge(to_left_side(*cur), to_right_side(*next)); - assert(edge != NULL); + NodeSide from_side = to_right_side(*cur); + NodeSide to_side = to_left_side(*next); + edge_t edge = augmented.graph.edge_handle(augmented.graph.get_handle(from_side.node, !from_side.is_end), + augmented.graph.get_handle(to_side.node, to_side.is_end)); + + assert(augmented.graph.has_edge(edge.first, edge.second)); Support edge_support = augmented.edge_supports.count(edge) ? augmented.edge_supports.at(edge) : Support(); min_support = support_min(min_support, edge_support); } @@ -278,8 +288,11 @@ set>> NestedTraversalFinder::search_left(const Visit& r continue; } - // Check the edge to it to make sure it has coverage - Edge* edge = augmented.graph.get_edge(to_right_side(extension), to_left_side(to_extend_from)); + // Check the edge to it to make sure it has coverage + NodeSide from_side = to_right_side(extension); + NodeSide to_side = to_left_side(to_extend_from); + edge_t edge = augmented.graph.edge_handle(augmented.graph.get_handle(from_side.node, !from_side.is_end), + augmented.graph.get_handle(to_side.node, to_side.is_end)); if (!augmented.edge_supports.count(edge) || total(augmented.edge_supports.at(edge)) == 0) { // This edge is not supported, so don't explore this extension. @@ -289,7 +302,7 @@ set>> NestedTraversalFinder::search_left(const Visit& r // Look up the node we're entering (either the snarl boundary or // just the node we're going to visit), so we can check to make // sure it has coverage. - Node* node = augmented.graph.get_node(to_right_side(extension).node); + id_t node = to_right_side(extension).node; if (!augmented.node_supports.count(node) || total(augmented.node_supports.at(node)) == 0) { // This node is not supported, so don't explore this extension. 
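The NestedTraversalFinder hunks above repeat the same NodeSide-to-edge_t conversion at every edge support lookup. A minimal sketch of that pattern factored into a helper, assuming vg's `HandleGraph` and `NodeSide` types; the `edge_between` name and free-function form are my own illustration, not something this patch adds:

```cpp
#include "handle.hpp"    // vg's HandleGraph typedefs (handle_t, edge_t)
#include "nodeside.hpp"  // NodeSide {node, is_end}

namespace vg {

// Build the canonical edge_t for an edge leaving `from` and entering `to`.
// Leaving a node through its end means the node is read forward, so the "from"
// handle is oriented opposite to is_end, while the "to" handle matches is_end,
// exactly as in the converted lookups above.
inline edge_t edge_between(const HandleGraph& graph, const NodeSide& from, const NodeSide& to) {
    return graph.edge_handle(graph.get_handle(from.node, !from.is_end),
                             graph.get_handle(to.node, to.is_end));
}

}
```

With such a helper, the support check above would read `augmented.edge_supports.count(edge_between(augmented.graph, to_right_side(*cur), to_left_side(*next)))`, keeping the orientation logic in one place.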
diff --git a/src/nodeside.hpp b/src/nodeside.hpp index 445c13130a7..de09308ab6f 100644 --- a/src/nodeside.hpp +++ b/src/nodeside.hpp @@ -4,7 +4,7 @@ #include #include -#include "vg.pb.h" +#include #include "types.hpp" #include "hash_map.hpp" diff --git a/src/nodetraversal.hpp b/src/nodetraversal.hpp index ce58e00f886..126318a2986 100644 --- a/src/nodetraversal.hpp +++ b/src/nodetraversal.hpp @@ -1,7 +1,7 @@ #ifndef VG_NODETRAVERSAL_HPP_INCLUDED #define VG_NODETRAVERSAL_HPP_INCLUDED -#include "vg.pb.h" +#include #include "hash_map.hpp" diff --git a/src/null_masking_graph.cpp b/src/null_masking_graph.cpp new file mode 100644 index 00000000000..507164c1eb5 --- /dev/null +++ b/src/null_masking_graph.cpp @@ -0,0 +1,95 @@ +/** + * \file null_masking_graph.cpp: contains the implementation of NullMaskingGraph + */ + + +#include "null_masking_graph.hpp" + + +namespace vg { + +using namespace std; + +NullMaskingGraph::NullMaskingGraph(const HandleGraph* graph) : graph(graph) { + graph->for_each_handle([&](const handle_t& handle) { + if (graph->get_length(handle) == 0) { + num_null_nodes++; + } + }); +} + +bool NullMaskingGraph::has_node(id_t node_id) const { + bool found_node = false; + if (graph->has_node(node_id)) { + if (graph->get_length(graph->get_handle(node_id)) > 0) { + found_node = true; + } + } + return found_node; +} + +handle_t NullMaskingGraph::get_handle(const id_t& node_id, bool is_reverse) const { + // TODO: should we throw an assert that it's non-empty? + return graph->get_handle(node_id, is_reverse); +} + +id_t NullMaskingGraph::get_id(const handle_t& handle) const { + return graph->get_id(handle); +} + +bool NullMaskingGraph::get_is_reverse(const handle_t& handle) const { + return graph->get_is_reverse(handle); +} + +handle_t NullMaskingGraph::flip(const handle_t& handle) const { + return graph->flip(handle); +} + +size_t NullMaskingGraph::get_length(const handle_t& handle) const { + return graph->get_length(handle); +} + +string NullMaskingGraph::get_sequence(const handle_t& handle) const { + return graph->get_sequence(handle); +} + +bool NullMaskingGraph::follow_edges_impl(const handle_t& handle, bool go_left, + const function& iteratee) const { + + return graph->follow_edges(handle, go_left, [&](const handle_t& next) { + bool keep_going = true; + if (graph->get_length(next) > 0) { + keep_going = iteratee(next); + } + return keep_going; + }); +} + +bool NullMaskingGraph::for_each_handle_impl(const function& iteratee, bool parallel) const { + return graph->for_each_handle([&](const handle_t& handle) { + bool keep_going = true; + if (graph->get_length(handle) > 0) { + keep_going = iteratee(handle); + } + return keep_going; + }, parallel); +} + +size_t NullMaskingGraph::get_node_count() const { + return graph->get_node_count() - num_null_nodes; +} + +id_t NullMaskingGraph::min_node_id() const { + return graph->min_node_id(); +} + +id_t NullMaskingGraph::max_node_id() const { + return graph->max_node_id(); +} + +handle_t NullMaskingGraph::get_underlying_handle(const handle_t& handle) const { + return handle; +} + +} + diff --git a/src/null_masking_graph.hpp b/src/null_masking_graph.hpp new file mode 100644 index 00000000000..5889b98ee68 --- /dev/null +++ b/src/null_masking_graph.hpp @@ -0,0 +1,97 @@ +#ifndef VG_NULL_MASKING_GRAPH_HPP_INCLUDED +#define VG_NULL_MASKING_GRAPH_HPP_INCLUDED + +/** \file + * null_masking_graph.hpp: defines a handle graph implementation that hides nodes + * that have no sequence in another graph + */ + +#include "handle.hpp" + +namespace vg { + +using 
namespace std; + +/** + * A HandleGraph implementation that wraps some other handle graph and hides any + * nodes that have no sequence associated with them. + */ +class NullMaskingGraph : public ExpandingOverlayGraph { +public: + + /// Initialize with the graph we want to mask null nodes in + NullMaskingGraph(const HandleGraph* graph); + + /// Default constructor -- not actually functional + NullMaskingGraph() = default; + + /// Default destructor + ~NullMaskingGraph() = default; + + ////////////////////////// + /// HandleGraph interface + ////////////////////////// + + // Method to check if a node exists by ID + bool has_node(id_t node_id) const; + + /// Look up the handle for the node with the given ID in the given orientation + handle_t get_handle(const id_t& node_id, bool is_reverse = false) const; + + /// Get the ID from a handle + id_t get_id(const handle_t& handle) const; + + /// Get the orientation of a handle + bool get_is_reverse(const handle_t& handle) const; + + /// Invert the orientation of a handle (potentially without getting its ID) + handle_t flip(const handle_t& handle) const; + + /// Get the length of a node + size_t get_length(const handle_t& handle) const; + + /// Get the sequence of a node, presented in the handle's local forward + /// orientation. + string get_sequence(const handle_t& handle) const; + + /// Loop over all the handles to next/previous (right/left) nodes. Passes + /// them to a callback which returns false to stop iterating and true to + /// continue. Returns true if we finished and false if we stopped early. + bool follow_edges_impl(const handle_t& handle, bool go_left, const function& iteratee) const; + + /// Loop over all the nodes in the graph in their local forward + /// orientations, in their internal stored order. Stop if the iteratee + /// returns false. Can be told to run in parallel, in which case stopping + /// after a false return value is on a best-effort basis and iteration + /// order is not defined. + bool for_each_handle_impl(const function& iteratee, bool parallel = false) const; + + /// Return the number of nodes in the graph. + size_t get_node_count() const; + + /// Return the smallest ID in the graph, or some smaller number if the + /// smallest ID is unavailable. Return value is unspecified if the graph is empty. + id_t min_node_id() const; + + /// Return the largest ID in the graph, or some larger number if the + /// largest ID is unavailable. Return value is unspecified if the graph is empty. 
+ id_t max_node_id() const; + + /////////////////////////////////// + /// ExpandingOverlayGraph interface + /////////////////////////////////// + + /// Returns the handle in the underlying graph that corresponds to a handle in the + /// overlay + handle_t get_underlying_handle(const handle_t& handle) const; + +private: + /// The graph we're masking empty nodes in + const HandleGraph* graph = nullptr; + + /// The total number of null nodes + size_t num_null_nodes = 0; +}; +} + +#endif diff --git a/src/packer.cpp b/src/packer.cpp index e15474e9e27..2533d7e7298 100644 --- a/src/packer.cpp +++ b/src/packer.cpp @@ -1,21 +1,152 @@ +#include +#include #include "packer.hpp" +#include "statistics.hpp" +#include "../vg.hpp" + +//#define debug +using namespace vg::io; namespace vg { -Packer::Packer(void) : xgidx(nullptr) { } +const int Packer::maximum_quality = 60; +const int Packer::lru_cache_size = 4096; + +size_t Packer::estimate_data_width(size_t expected_coverage) { + return std::ceil(std::log2(2 * expected_coverage)); +} + +size_t Packer::estimate_batch_size(size_t num_threads) { + size_t batch_size = max((size_t)128, (size_t)(pow(2, 14 - log2(num_threads)))); + if (batch_size % 2 != 0) { + ++batch_size; + } + return batch_size; +} + +size_t Packer::estimate_bin_count(size_t num_threads) { + return pow(2, log2(num_threads) + 14); +} + +Packer::Packer(const HandleGraph* graph) : graph(graph), data_width(8), cov_bin_size(0), edge_cov_bin_size(0), num_bases_dynamic(0), base_locks(nullptr), num_edges_dynamic(0), edge_locks(nullptr), node_quality_locks(nullptr), tmpfstream_locks(nullptr) { } + +Packer::Packer(const HandleGraph* graph, bool record_bases, bool record_edges, bool record_edits, bool record_qualities, size_t bin_size, size_t coverage_bins, size_t data_width) : + graph(graph), data_width(data_width), bin_size(bin_size), record_bases(record_bases), record_edges(record_edges), record_edits(record_edits), record_qualities(record_qualities) { + // get the size of the base coverage counter + num_bases_dynamic = 0; + if (record_bases) { + graph->for_each_handle([&](const handle_t& handle) { num_bases_dynamic += graph->get_length(handle); }); + } + // get the size of the edge coverage counter + num_edges_dynamic = 0; + if (record_edges) { + const VectorizableHandleGraph* vec_graph = dynamic_cast(graph); + assert(vec_graph != nullptr); + graph->for_each_edge([&](const edge_t& edge) { + auto edge_index = vec_graph->edge_index(edge); +#ifdef debug + cerr << "Observed edge at index " << edge_index << endl; +#endif + num_edges_dynamic = std::max(num_edges_dynamic, edge_index); + }); + ++num_edges_dynamic; // add one so our size is greater than the max element + } + // get the size of the node qualitiy counter + num_nodes_dynamic = 0; + size_t qual_coverage_bins = 0; + if (record_qualities) { + num_nodes_dynamic = graph->get_node_count() + 1; // add one for 1-based ranks + if (num_bases_dynamic > 0) { + // scale down the quality bins to be proportional in size with the base coverage bins if we can + qual_coverage_bins = coverage_bins * ((double)num_nodes_dynamic / (double)num_bases_dynamic); + qual_coverage_bins = max((size_t)1, qual_coverage_bins); + } + } + + // only bin if we need to + if (num_edges_dynamic <= coverage_bins || num_bases_dynamic <= coverage_bins) { + coverage_bins = 1; + qual_coverage_bins = 1; + } + assert(coverage_bins > 0); + + // coverage counter for each bin (totally independent from the edit coverage bins) + // they are initialized on-demand to better support sparse use-cases 
+ coverage_dynamic.resize(coverage_bins, nullptr); + edge_coverage_dynamic.resize(coverage_bins, nullptr); + node_quality_dynamic.resize(qual_coverage_bins, nullptr); + + // need this for every lookup, so store here + cov_bin_size = coverage_dynamic.size() > 0 ? num_bases_dynamic / coverage_dynamic.size() : 0; + edge_cov_bin_size = edge_coverage_dynamic.size() > 0 ? num_edges_dynamic / edge_coverage_dynamic.size() : 0; + node_qual_bin_size = node_quality_dynamic.size() > 0 ? num_nodes_dynamic / node_quality_dynamic.size() : 0; + + // mutexes for coverage + base_locks = new std::mutex[coverage_dynamic.size()]; + edge_locks = new std::mutex[edge_coverage_dynamic.size()]; + node_quality_locks = new std::mutex[node_quality_dynamic.size()]; + tmpfstream_locks = nullptr; + + // count the bins if binning + if (bin_size) { + n_bins = num_bases_dynamic / bin_size + 1; + } + if (record_edits) { + tmpfstream_locks = new std::mutex[n_bins]; + // open tmpfile if needed + ensure_edit_tmpfiles_open(); + } -Packer::Packer(xg::XG* xidx, size_t binsz) : xgidx(xidx), bin_size(binsz) { - coverage_dynamic = gcsa::CounterArray(xgidx->seq_length, 8); - if (binsz) n_bins = xgidx->seq_length / bin_size + 1; + // speed up quality computation if necessary + for (size_t i = 0; i < get_thread_count(); ++i) { + quality_cache.push_back(new LRUCache, int>(lru_cache_size)); + } + +#ifdef debug + cerr << "Packing across " << num_edges_dynamic << " edge slots and " << num_bases_dynamic << " base slots in " << coverage_bins << " bins" << endl; +#endif } -Packer::~Packer(void) { +void Packer::clear() { + for (auto& counter : coverage_dynamic) { + delete counter; + counter = nullptr; + } + for (auto& counter : edge_coverage_dynamic) { + delete counter; + counter = nullptr; + } + for (auto& counter : node_quality_dynamic) { + delete counter; + counter = nullptr; + } + delete [] base_locks; + base_locks = nullptr; + delete [] edge_locks; + edge_locks = nullptr; + delete [] node_quality_locks; + node_quality_locks = nullptr; + delete [] tmpfstream_locks; + tmpfstream_locks = nullptr; close_edit_tmpfiles(); remove_edit_tmpfiles(); + for (auto& lru_cache : quality_cache) { + delete lru_cache; + lru_cache = nullptr; + } +} + +Packer::~Packer() { + clear(); } void Packer::load_from_file(const string& file_name) { ifstream in(file_name); + if (!in) { + stringstream ss; + ss << "Error [Packer]: unable to read pack file: \"" << file_name << "\"" << endl; + throw runtime_error(ss.str()); + } load(in); } @@ -28,10 +159,12 @@ void Packer::load(istream& in) { sdsl::read_member(bin_size, in); sdsl::read_member(n_bins, in); coverage_civ.load(in); + edge_coverage_civ.load(in); edit_csas.resize(n_bins); for (size_t i = 0; i < n_bins; ++i) { edit_csas[i].load(in); } + node_quality_civ.load(in); // We can only load compacted. 
is_compacted = true; } @@ -58,7 +191,7 @@ void Packer::merge_from_files(const vector& file_names) { assert(n_bins == c.get_n_bins()); } c.write_edits(tmpfstreams); - collect_coverage(c); + collect_coverage({&c}); } } @@ -79,8 +212,8 @@ void Packer::merge_from_dynamic(vector& packers) { assert(n_bins == c.get_n_bins()); } c.write_edits(tmpfstreams); - collect_coverage(c); } + collect_coverage(packers); } size_t Packer::get_bin_size(void) const { @@ -117,11 +250,50 @@ void Packer::write_edits(ostream& out, size_t bin) const { } } -void Packer::collect_coverage(const Packer& c) { +void Packer::collect_coverage(const vector& packers) { // assume the same basis vector assert(!is_compacted); - for (size_t i = 0; i < c.graph_length(); ++i) { - coverage_dynamic.increment(i, c.coverage_at_position(i)); + if (record_bases) { +#pragma omp parallel for + for (size_t i = 0; i < coverage_dynamic.size(); ++i) { + size_t base_offset = i * cov_bin_size; + size_t bin_size = coverage_bin_size(i); + for (size_t j = 0; j < bin_size; ++j) { + size_t inc_cov = 0; + for (size_t k = 0; k < packers.size(); ++k) { + inc_cov += packers[k]->coverage_at_position(j + base_offset); + } + increment_coverage(j + base_offset, inc_cov); + } + } + } + if (record_edges) { +#pragma omp parallel for + for (size_t i = 0; i < edge_coverage_dynamic.size(); ++i) { + size_t edge_base_offset = i * edge_cov_bin_size; + size_t edge_bin_size = edge_coverage_bin_size(i); + for (size_t j = 0; j < edge_bin_size; ++j) { + size_t inc_edge_cov = 0; + for (size_t k = 0; k < packers.size(); ++k) { + inc_edge_cov += packers[k]->edge_coverage(j + edge_base_offset); + } + increment_edge_coverage(j + edge_base_offset, inc_edge_cov); + } + } + } + if (record_qualities) { +#pragma omp parallel for + for (size_t i = 1; i < node_quality_dynamic.size(); ++i) { + size_t qual_base_offset = i * node_qual_bin_size; + size_t qual_bin_size = node_quality_bin_size(i); + for (size_t j = 0; j < qual_bin_size; ++j) { + size_t inc_qual_cov = 0; + for (size_t k = 0; k < packers.size(); ++k) { + inc_qual_cov += packers[k]->total_node_quality(j + qual_base_offset); + } + increment_node_quality(j + qual_base_offset, inc_qual_cov); + } + } } } @@ -134,15 +306,17 @@ size_t Packer::serialize(std::ostream& out, written += sdsl::write_member(bin_size, out, child, "bin_size_" + name); written += sdsl::write_member(edit_csas.size(), out, child, "n_bins_" + name); written += coverage_civ.serialize(out, child, "graph_coverage_" + name); + written += edge_coverage_civ.serialize(out, child, "edge_coverage_" +name); for (auto& edit_csa : edit_csas) { written += edit_csa.serialize(out, child, "edit_csa_" + name); } + written += node_quality_civ.serialize(out, child, "node_quality_" + name); sdsl::structure_tree::add_size(child, written); return written; } void Packer::make_compact(void) { - // pack the dynamic countarry and edit coverage into the compact data structure + // pack the dynamic countarray and edit coverage into the compact data structure if (is_compacted) { #ifdef debug cerr << "Packer is already compact" << endl; @@ -155,15 +329,70 @@ void Packer::make_compact(void) { } // sync edit file close_edit_tmpfiles(); + // temporaries for construction - size_t basis_length = coverage_dynamic.size(); + size_t basis_length = coverage_size(); int_vector<> coverage_iv; - util::assign(coverage_iv, int_vector<>(basis_length)); - for (size_t i = 0; i < coverage_dynamic.size(); ++i) { - coverage_iv[i] = coverage_dynamic[i]; + size_t edge_coverage_length = edge_vector_size(); + 
int_vector<> edge_coverage_iv; + size_t node_quality_length = node_quality_vector_size(); + int_vector<> node_quality_iv; + +#ifdef debug + cerr << "Concatenating entries for " << basis_length << " bases and " << edge_coverage_length << " edges" << endl; +#endif + +#pragma omp parallel + { +#pragma omp single + { +#pragma omp task + { + util::assign(coverage_iv, int_vector<>(basis_length)); + } +#pragma omp task + { + util::assign(edge_coverage_iv, int_vector<>(edge_coverage_length)); + } +#pragma omp task + { + util::assign(node_quality_iv, int_vector<>(node_quality_length)); + } + } + } +#pragma omp parallel for + for (size_t i = 0; i < basis_length; ++i) { + coverage_iv[i] = coverage_at_position(i); + } +#pragma omp parallel for + for (size_t i = 0; i < edge_coverage_length; ++i) { + edge_coverage_iv[i] = edge_coverage(i); + } +#pragma omp parallel for + for (size_t i = 1; i < node_quality_length; ++i) { + node_quality_iv[i] = average_node_quality(i); } + + #pragma omp parallel + { +#pragma omp single + { +#pragma omp task + { + util::assign(coverage_civ, coverage_iv); + } +#pragma omp task + { + util::assign(edge_coverage_civ, edge_coverage_iv); + } +#pragma omp task + { + util::assign(node_quality_civ, node_quality_iv); + } + } + } + edit_csas.resize(edit_tmpfile_names.size()); - util::assign(coverage_civ, coverage_iv); construct_config::byte_algo_sa = SE_SAIS; #pragma omp parallel for for (size_t i = 0; i < edit_tmpfile_names.size(); ++i) { @@ -181,10 +410,14 @@ void Packer::make_dynamic(void) { is_compacted = false; } -bool Packer::is_dynamic(void) { +bool Packer::is_dynamic(void) const { return !is_compacted; } +const HandleGraph* Packer::get_graph() const { + return graph; +} + void Packer::ensure_edit_tmpfiles_open(void) { if (tmpfstreams.empty()) { string base = "vg-pack_"; @@ -222,49 +455,128 @@ void Packer::remove_edit_tmpfiles(void) { } } -void Packer::add(const Alignment& aln, bool record_edits) { - // open tmpfile if needed - ensure_edit_tmpfiles_open(); +void Packer::add(const Alignment& aln, int min_mapq, int min_baseq, int trim_ends) { + // mapping quality threshold filter + int mapping_quality = aln.mapping_quality(); + if (mapping_quality < min_mapq) { + return; + } // count the nodes, edges, and edits - for (auto& mapping : aln.path().mapping()) { + Mapping prev_mapping; + bool has_prev_mapping = false; + int prev_bq_total = 0; + int prev_bq_count = 0; + size_t position_in_read = 0; + size_t read_length = aln.sequence().length(); + if (trim_ends > 0 && read_length == 0) { + // could happen in gaf, where we don't bother parsing the sequence + for (auto& mapping : aln.path().mapping()) { + for (auto& edit : mapping.edit()) { + read_length += edit.to_length(); + } + } + } + size_t trim_last = read_length + 1 < trim_ends ? 
0 : read_length - trim_ends - 1; + + size_t cur_pos = 0; + for (size_t mi = 0; mi < aln.path().mapping_size(); ++mi) { + auto& mapping = aln.path().mapping(mi); if (!mapping.has_position()) { #ifdef debug cerr << "Mapping has no position" << endl; #endif + has_prev_mapping = false; continue; } // skip nodes outside of our graph, assuming this may be a subgraph - if (!xgidx->has_node(mapping.position().node_id())) { + if (!graph->has_node(mapping.position().node_id())) { + has_prev_mapping = false; continue; } size_t i = position_in_basis(mapping.position()); - for (auto& edit : mapping.edit()) { - if (edit_is_match(edit)) { + size_t node_quality_index = node_index(mapping.position().node_id()); + size_t total_node_quality = 0; + // keep track of average base quality in the mapping + int bq_total = 0; + int bq_count = 0; + int ei = 0; + size_t prev_position_in_read = position_in_read; //snapshot position at first base of mapping + if (record_bases || record_qualities || trim_ends > 0) { + for (auto& edit : mapping.edit()) { + if (edit_is_match(edit)) { #ifdef debug - cerr << "Recording a match" << endl; + cerr << "Recording a match" << endl; #endif + int direction = mapping.position().is_reverse() ? -1 : 1; + for (size_t j = 0; j < edit.from_length(); ++j, ++position_in_read) { + int64_t coverage_idx = i + direction * j; + int base_quality = compute_quality(aln, position_in_read); + bq_total += base_quality; + ++bq_count; + // base quality threshold filter (only if we found some kind of quality) + if (record_bases && (base_quality < 0 || base_quality >= min_baseq) && + position_in_read >= trim_ends && position_in_read <= trim_last) { + increment_coverage(coverage_idx); + if (record_qualities && mapping_quality > 0) { + total_node_quality += mapping_quality; + } + } + } + } else if (record_edits) { + // we represent things on the forward strand + string pos_repr = pos_key(i); + string edit_repr = edit_value(edit, mapping.position().is_reverse()); + size_t bin = bin_for_position(i); + std::lock_guard guard(tmpfstream_locks[bin]); + *tmpfstreams[bin] << pos_repr << edit_repr; + } if (mapping.position().is_reverse()) { - for (size_t j = 0; j < edit.from_length(); ++j) { - coverage_dynamic.increment(i-j); - } + i -= edit.from_length(); } else { - for (size_t j = 0; j < edit.from_length(); ++j) { - coverage_dynamic.increment(i+j); - } + i += edit.from_length(); + } + if (!edit_is_match(edit)) { + position_in_read += edit.to_length(); } - } else if (record_edits) { - // we represent things on the forward strand - string pos_repr = pos_key(i); - string edit_repr = edit_value(edit, mapping.position().is_reverse()); - size_t bin = bin_for_position(i); - *tmpfstreams[bin] << pos_repr << edit_repr; + ++ei; } - if (mapping.position().is_reverse()) { - i -= edit.from_length(); - } else { - i += edit.from_length(); + if (total_node_quality > 0) { + increment_node_quality(node_quality_index, total_node_quality); } } + + if (record_edges && has_prev_mapping && prev_mapping.position().node_id() != mapping.position().node_id() && + (prev_position_in_read - 1 >= trim_ends && prev_position_in_read <= trim_last)) { + // Note: we are effectively ignoring edits here. So an edge is covered even + // if there's a sub or indel at either of its ends in the path. 
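The end-trimming filter used in add() above keeps a base only when it sits at least trim_ends bases away from both ends of the read. A tiny standalone illustration of the assumed semantics (the helper `keep_base` is hypothetical, and the short-read guard is my own restatement of the underflow check):

```
#include <cassert>
#include <cstddef>

// Keep a read offset p only if it is at least trim_ends bases from each end.
bool keep_base(size_t p, size_t read_length, size_t trim_ends) {
    if (read_length < 2 * trim_ends + 1) {
        return false;                       // read too short: everything trimmed
    }
    size_t trim_last = read_length - trim_ends - 1;
    return p >= trim_ends && p <= trim_last;
}

int main() {
    // 10 bp read, trim 2 from each end: offsets 2..7 are counted
    assert(!keep_base(1, 10, 2));
    assert(keep_base(2, 10, 2));
    assert(keep_base(7, 10, 2));
    assert(!keep_base(8, 10, 2));
}
```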
+ Edge e; + e.set_from(prev_mapping.position().node_id()); + e.set_from_start(prev_mapping.position().is_reverse()); + e.set_to(mapping.position().node_id()); + e.set_to_end(mapping.position().is_reverse()); + size_t edge_idx = edge_index(e); +#ifdef debug + cerr << "Observed visit to edge " << pb2json(e) << " at index " << edge_idx << endl; +#endif + if (edge_idx != 0) { + // heuristic: for an edge, we average out the base qualities from the matches in its two flanking mappings + int avg_base_quality = -1; + if (!aln.quality().empty()) { + if (bq_count + prev_bq_count == 0) { + avg_base_quality = 0; + } else { + avg_base_quality = (float)(bq_total + prev_bq_total) / (bq_count + prev_bq_count); + } + } + // base quality threshold filter (only if we found some kind of quality) + if (avg_base_quality < 0 || avg_base_quality >= min_baseq) { + increment_edge_coverage(edge_idx); + } + } + } + + prev_mapping = mapping; + has_prev_mapping = true; } } @@ -272,10 +584,11 @@ void Packer::add(const Alignment& aln, bool record_edits) { size_t Packer::position_in_basis(const Position& pos) const { // get position on the forward strand if (pos.is_reverse()) { - return (int64_t)xg_node_start(pos.node_id(), xgidx) - + (int64_t)reverse(pos, xg_node_length(pos.node_id(), xgidx)).offset() - 1; + return (int64_t)dynamic_cast(graph)->node_vector_offset(pos.node_id()) + + (int64_t)reverse(pos, graph->get_length(graph->get_handle(pos.node_id()))).offset() - 1; } else { - return (int64_t)xg_node_start(pos.node_id(), xgidx) + (int64_t)pos.offset(); + return (int64_t)dynamic_cast(graph)->node_vector_offset(pos.node_id()) + + (int64_t)pos.offset(); } } @@ -335,19 +648,243 @@ string Packer::unescape_delim(const string& s, char d) const { return unescaped; } -size_t Packer::graph_length(void) const { - if (is_compacted) { +size_t Packer::coverage_size(void) const { + if (is_compacted){ return coverage_civ.size(); + } + else{ + return num_bases_dynamic; + } +} + +size_t Packer::edge_vector_size(void) const{ + if (is_compacted){ + return edge_coverage_civ.size(); + } + else{ + return num_edges_dynamic; + } +} + +size_t Packer::node_quality_vector_size(void) const { + if (is_compacted) { + return node_quality_civ.size(); + } else { + return num_nodes_dynamic; + } +} + +pair Packer::coverage_bin_offset(size_t i) const { + size_t bin = min((size_t)(i / cov_bin_size), (size_t)(coverage_dynamic.size() - 1)); + // last bin can have different size so we don't use mod + size_t offset = i - bin * cov_bin_size; + return make_pair(bin, offset); +} + +pair Packer::edge_coverage_bin_offset(size_t i) const { + size_t bin = min((size_t)(i / edge_cov_bin_size), (size_t)(edge_coverage_dynamic.size() - 1)); + // last bin can have different size so we don't use mod + size_t offset = i - bin * edge_cov_bin_size; + return make_pair(bin, offset); +} + +pair Packer::node_quality_bin_offset(size_t i) const { + size_t bin = min((size_t)(i / node_qual_bin_size), (size_t)(node_quality_dynamic.size() - 1)); + // last bin can have different size so we don't use mod + size_t offset = i - bin * node_qual_bin_size; + return make_pair(bin, offset); +} + +size_t Packer::coverage_bin_size(size_t i) const { + size_t bin_size = cov_bin_size; + if (i == coverage_dynamic.size() - 1) { + bin_size += num_bases_dynamic % coverage_dynamic.size(); + } + return bin_size; +} + +size_t Packer::edge_coverage_bin_size(size_t i) const { + size_t bin_size = edge_cov_bin_size; + if (i == edge_coverage_dynamic.size() - 1) { + bin_size += num_edges_dynamic % 
edge_coverage_dynamic.size(); + } + return bin_size; +} + +size_t Packer::node_quality_bin_size(size_t i) const { + size_t bin_size = node_qual_bin_size; + if (i == node_quality_dynamic.size() - 1) { + bin_size += num_nodes_dynamic % node_quality_dynamic.size(); + } + return bin_size; +} + +void Packer::init_coverage_bin(size_t i) { + if (coverage_dynamic[i] == nullptr) { + coverage_dynamic[i] = new gcsa::CounterArray(coverage_bin_size(i), data_width); + } +} + +void Packer::init_edge_coverage_bin(size_t i) { + if (edge_coverage_dynamic[i] == nullptr) { + edge_coverage_dynamic[i] = new gcsa::CounterArray(edge_coverage_bin_size(i), data_width); + } +} + +void Packer::init_node_quality_bin(size_t i) { + if (node_quality_dynamic[i] == nullptr) { + // add some bits to the data width because we have to add a quality and not 1 each time + node_quality_dynamic[i] = new gcsa::CounterArray(node_quality_bin_size(i), data_width + 12); + } +} + +void Packer::increment_coverage(size_t i) { + pair bin_offset = coverage_bin_offset(i); + std::lock_guard guard(base_locks[bin_offset.first]); + init_coverage_bin(bin_offset.first); + coverage_dynamic.at(bin_offset.first)->increment(bin_offset.second); +} + +void Packer::increment_coverage(size_t i, size_t v) { + if (v > 0) { + pair bin_offset = coverage_bin_offset(i); + std::lock_guard guard(base_locks[bin_offset.first]); + init_coverage_bin(bin_offset.first); + coverage_dynamic.at(bin_offset.first)->increment(bin_offset.second, v); + } +} + +void Packer::increment_edge_coverage(size_t i) { + pair bin_offset = edge_coverage_bin_offset(i); + std::lock_guard guard(edge_locks[bin_offset.first]); + init_edge_coverage_bin(bin_offset.first); + edge_coverage_dynamic.at(bin_offset.first)->increment(bin_offset.second); +#ifdef debug + cerr << "Set coverage of edge " << i << " at " << bin_offset.first << "/" << bin_offset.second << " to " << (*edge_coverage_dynamic.at(bin_offset.first))[bin_offset.second] << endl; +#endif +} + +void Packer::increment_edge_coverage(size_t i, size_t v) { + if (v > 0) { + pair bin_offset = edge_coverage_bin_offset(i); + std::lock_guard guard(edge_locks[bin_offset.first]); + init_edge_coverage_bin(bin_offset.first); + edge_coverage_dynamic.at(bin_offset.first)->increment(bin_offset.second, v); +#ifdef debug + cerr << "Set coverage of edge " << i << " at " << bin_offset.first << "/" << bin_offset.second << " to " << (*edge_coverage_dynamic.at(bin_offset.first))[bin_offset.second] << endl; +#endif + } +} + +void Packer::increment_node_quality(size_t i, size_t v) { + if (v > 0) { + pair bin_offset = node_quality_bin_offset(i); + std::lock_guard guard(node_quality_locks[bin_offset.first]); + init_node_quality_bin(bin_offset.first); + node_quality_dynamic.at(bin_offset.first)->increment(bin_offset.second, v); +#ifdef debug + cerr << "Set quality of node " << i << " at " << bin_offset.first << "/" << bin_offset.second << " to " << (*node_quality_dynamic.at(bin_offset.first))[bin_offset.second] << endl; +#endif + } +} + +bool Packer::has_qualities() const { + if (is_compacted) { + for (size_t i = 0; i < node_quality_civ.size(); ++i) { + if (node_quality_civ[i] > 0) { + return true; + } + } } else { - return coverage_dynamic.size(); + for (size_t i = 0; i < node_quality_dynamic.size(); ++i) { + if (node_quality_dynamic[i] != nullptr) { + for (size_t j = 0; j < node_quality_dynamic[i]->size(); ++j) { + if ((*node_quality_dynamic.at(i))[j] > 0) { + return true; + } + } + } + } } + return false; } size_t Packer::coverage_at_position(size_t i) const { 
if (is_compacted) { return coverage_civ[i]; } else { - return coverage_dynamic[i]; + pair bin_offset = coverage_bin_offset(i); + if (coverage_dynamic[bin_offset.first] == nullptr) { + return 0; + } else { + return (*coverage_dynamic.at(bin_offset.first))[bin_offset.second]; + } + } +} + +size_t Packer::edge_coverage(size_t i) const { + if (is_compacted){ + return edge_coverage_civ[i]; + } + else{ + pair bin_offset = edge_coverage_bin_offset(i); + if (edge_coverage_dynamic[bin_offset.first] == nullptr) { + return 0; + } else { + return (*edge_coverage_dynamic.at(bin_offset.first))[bin_offset.second]; + } + } +} + +size_t Packer::edge_coverage(Edge& e) const { + size_t pos = edge_index(e); + return edge_coverage(pos); +} + +size_t Packer::total_node_quality(size_t i) const { + if (is_compacted) { + size_t avg_qual = average_node_quality(i); + Position pos; + pos.set_node_id(index_to_node(i)); + pos.set_is_reverse(false); + size_t cov_pos = position_in_basis(pos); + size_t node_len = graph->get_length(graph->get_handle(pos.node_id())); + size_t base = position_in_basis(pos); + size_t coverage = 0; + for (size_t i = 0; i < node_len; ++i) { + coverage += coverage_at_position(base + i); + } + return avg_qual * coverage; + } else { + pair bin_offset = node_quality_bin_offset(i); + if (node_quality_dynamic[bin_offset.first] == nullptr) { + return 0; + } else { + return (*node_quality_dynamic.at(bin_offset.first))[bin_offset.second]; + } + } +} + +size_t Packer::average_node_quality(size_t i) const { + if (is_compacted) { + return node_quality_civ[i]; + } else { + Position pos; + pos.set_node_id(index_to_node(i)); + pos.set_is_reverse(false); + size_t cov_pos = position_in_basis(pos); + size_t node_len = graph->get_length(graph->get_handle(pos.node_id())); + size_t base = position_in_basis(pos); + size_t total_coverage = 0; + for (size_t i = 0; i < node_len; ++i) { + total_coverage += coverage_at_position(base + i); + } + if (total_coverage == 0) { + assert(total_node_quality(i) == 0); + return 0; + } else { + return (size_t)std::lround(total_node_quality(i) / total_coverage); + } } } @@ -377,13 +914,36 @@ vector Packer::edits_at_position(size_t i) const { } string value = unescape_delims(extract(edit_csa, b, e)); Edit edit; - edit.ParseFromString(value); + vg::io::ProtobufIterator::parse_from_string(edit, value); edits.push_back(edit); } return edits; } -ostream& Packer::as_table(ostream& out, bool show_edits) { +size_t Packer::edge_index(const Edge& e) const { + edge_t edge = graph->edge_handle(graph->get_handle(e.from(), e.from_start()), + graph->get_handle(e.to(), e.to_end())); + + if (!graph->has_edge(edge)) { + // We can only query the edge index for edges that exist. This edge doesn't. 
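The two node-quality representations used above differ in what they store: the dynamic structure accumulates a per-node quality total, while the compacted structure keeps the rounded average, from which an approximate total can be recovered by multiplying back the node's base coverage. A small self-contained illustration of that round trip (my own sketch; the helper names are made up):

```
#include <cassert>
#include <cmath>
#include <cstddef>
#include <numeric>
#include <vector>

// Average quality stored in the compacted structure: total / node coverage.
size_t average_quality(size_t quality_total, const std::vector<size_t>& per_base_coverage) {
    size_t coverage = std::accumulate(per_base_coverage.begin(), per_base_coverage.end(), size_t(0));
    return coverage ? (size_t)std::lround((double)quality_total / coverage) : 0;
}

// Approximate total recovered from the compacted average and node coverage.
size_t approximate_total(size_t average, const std::vector<size_t>& per_base_coverage) {
    size_t coverage = std::accumulate(per_base_coverage.begin(), per_base_coverage.end(), size_t(0));
    return average * coverage;
}

int main() {
    std::vector<size_t> node_coverage = {3, 3, 4};          // 10 counted bases on a 3 bp node
    size_t total = 582;                                     // summed MAPQ-derived qualities
    size_t avg = average_quality(total, node_coverage);
    assert(avg == 58);                                      // 582 / 10 rounds to 58
    assert(approximate_total(avg, node_coverage) == 580);   // lossy round trip
}
```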
+ return 0; + } + + return dynamic_cast(graph)->edge_index(edge); +} + +size_t Packer::node_index(nid_t node_id) const { + if (!graph->has_node(node_id)) { + return 0; + } + return dynamic_cast(graph)->id_to_rank(node_id); +} + +nid_t Packer::index_to_node(size_t i) const { + return dynamic_cast(graph)->rank_to_id(i); +} + +ostream& Packer::as_table(ostream& out, bool show_edits, vector node_ids) { #ifdef debug cerr << "Packer table of " << coverage_civ.size() << " rows:" << endl; #endif @@ -396,8 +956,11 @@ ostream& Packer::as_table(ostream& out, bool show_edits) { out << endl; // write the coverage as a vector for (size_t i = 0; i < coverage_civ.size(); ++i) { - id_t node_id = xgidx->node_at_seq_pos(i+1); - size_t offset = i - xgidx->node_start(node_id); + nid_t node_id = dynamic_cast(graph)->node_at_vector_offset(i+1); + if (!node_ids.empty() && find(node_ids.begin(), node_ids.end(), node_id) == node_ids.end()) { + continue; + } + size_t offset = i - dynamic_cast(graph)->node_vector_offset(node_id); out << i << "\t" << node_id << "\t" << offset << "\t" << coverage_civ[i]; if (show_edits) { out << "\t" << count(edit_csas[bin_for_position(i)], pos_key(i)); @@ -408,6 +971,83 @@ ostream& Packer::as_table(ostream& out, bool show_edits) { return out; } +ostream& Packer::as_edge_table(ostream& out, vector node_ids) { +#ifdef debug + cerr << "Packer edge table of " << edge_coverage_civ.size() << " rows:" << endl; +#endif + + out << "from.id" << "\t" + << "from.start" << "\t" + << "to.id" << "\t" + << "to.end" << "\t" + << "coverage" << endl; + graph->for_each_edge([&](const edge_t& handle_edge) { + Edge edge; + edge.set_from(graph->get_id(handle_edge.first)); + edge.set_from_start(graph->get_is_reverse(handle_edge.first)); + edge.set_to(graph->get_id(handle_edge.second)); + edge.set_to_end(graph->get_is_reverse(handle_edge.second)); + + if (!node_ids.empty() && + (find(node_ids.begin(), node_ids.end(), edge.from()) == node_ids.end() || + find(node_ids.begin(), node_ids.end(), edge.to()) == node_ids.end())) { + + // We need to skip this edge because it deals with nodes outsode of our set. + return true; + } + + // Otherwise, we need to use the edge; all edges are visited exactly once. + // But we want to output it smaller node ID first, which is not guaranteed. 
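Edge and base coverage are both gated by the read qualities, which are folded together with the MAPQ in combine_qualities() further down in this hunk under an independence assumption: the base is right only if both the base call and the mapping are right, so P[error] = 1 - (1 - P[base error]) * (1 - P[map error]), re-encoded as Phred and clamped (to 60 in the code below). A standalone restatement of that arithmetic, not the vg implementation, with a made-up helper name:

```
#include <algorithm>
#include <cassert>
#include <cmath>

// Combine a base quality and a mapping quality assuming independent errors.
int combined_phred(int base_q, int map_q, int cap = 60) {
    double p_base_err = std::pow(10.0, -base_q / 10.0);
    double p_map_err = std::pow(10.0, -map_q / 10.0);
    double p_err = 1.0 - (1.0 - p_base_err) * (1.0 - p_map_err);
    int q = (int)std::lround(-10.0 * std::log10(p_err));
    return std::min(q, cap);
}

int main() {
    // Two "one in a thousand" error sources combine to roughly one in five hundred.
    assert(combined_phred(30, 30) == 27);
    // A very confident mapping cannot rescue a poor base call.
    assert(combined_phred(5, 60) == 5);
    // Very high inputs are clamped at the cap.
    assert(combined_phred(90, 90) == 60);
}
```

The swap performed just below in as_edge_table() is the usual vg edge canonicalization: to report the smaller node ID first, the endpoints are exchanged and both orientation flags are inverted, which describes the same attachment points traversed the other way.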
+ if (edge.from() > edge.to()) { + { + // Swap the from and to around + nid_t temp = edge.from(); + edge.set_from(edge.to()); + edge.set_to(temp); + } + { + // And the flags + bool temp = edge.from_start(); + edge.set_from_start(!edge.to_end()); + edge.set_to_end(!temp); + } + } + + // TODO: we don't canonicalize self loops at all + + // Print out the edge + out << edge.from() << "\t" + << edge.from_start() << "\t" + << edge.to() << "\t" + << edge.to_end() << "\t" + << edge_coverage_civ[edge_index(edge)] + << endl; + + // Look at the enxt edge + return true; + }); + return out; +} + +ostream& Packer::as_quality_table(ostream& out, vector node_ids) { +#ifdef debug + cerr << "Packer quality table of " << node_quality_civ.size() << " rows:" << endl; +#endif + + out << "node.rank" << "\t" + << "node.id" << "\t" + << "avg-mapq"; + out << endl; + for (size_t i = 1; i < node_quality_civ.size(); ++i) { + nid_t node_id = index_to_node(i); + if (!node_ids.empty() && find(node_ids.begin(), node_ids.end(), node_id) == node_ids.end()) { + continue; + } + out << i << "\t" << node_id << "\t" << node_quality_civ[i] << endl; + } + return out; +} + ostream& Packer::show_structure(ostream& out) { out << coverage_civ << endl; // graph coverage (compacted coverage_dynamic) for (auto& edit_csa : edit_csas) { @@ -418,8 +1058,42 @@ ostream& Packer::show_structure(ostream& out) { return out; } -size_t Packer::coverage_size(void) { - return coverage_civ.size(); +int Packer::compute_quality(const Alignment& aln, size_t position_in_read) const { + int map_quality = (int)aln.mapping_quality(); + int base_quality = -1; + if (!aln.quality().empty()) { + base_quality = (int)aln.quality()[position_in_read]; + } + return combine_qualities(map_quality, base_quality); } +int Packer::combine_qualities(int map_quality, int base_quality) const { + if (base_quality < 0) { + // no base quality in read: just return the mapping quality + return map_quality; + } else { + if (base_quality == 0 || map_quality == 0) { + return 0; + } + + // look up the mapping and base quality in the cache to avoid recomputing + auto& qual_cache = *quality_cache[omp_get_thread_num()]; + pair cached = qual_cache.retrieve(make_pair(map_quality, base_quality)); + if (cached.second == true) { + return cached.first; + } else { + // assume independence: P[Correct] = P[Correct Base] * P[Correct Map] + // --> P[Error] = 1 - (1 - P[Base Error]) * (1 - P[Map Error]) + double p_err = logprob_invert(logprob_invert(phred_to_logprob(base_quality)) + + logprob_invert(phred_to_logprob(map_quality))); + // clamp our quality to 60 + int qual = min((int)logprob_to_phred(p_err), (int)maximum_quality); + // update the cache + qual_cache.put(make_pair(map_quality, base_quality), qual); + return qual; + } + } +} + + } diff --git a/src/packer.hpp b/src/packer.hpp index 90316f52e8d..17792572287 100644 --- a/src/packer.hpp +++ b/src/packer.hpp @@ -5,27 +5,62 @@ #include #include #include +#include #include "omp.h" -#include "xg.hpp" +#include "lru_cache.h" #include "alignment.hpp" #include "path.hpp" #include "position.hpp" -#include "json2pb.h" +#include "vg/io/json2pb.h" #include "graph.hpp" #include "gcsa/internal.h" -#include "xg_position.hpp" +#include "sdsl/csa_wt.hpp" +#include "sdsl/suffix_arrays.hpp" #include "utility.hpp" namespace vg { using namespace sdsl; +/// Packer collects coverage of a GAM using compressed indexes +/// Any combination of these 3 types of information can be stored +/// - base coverage : number of reads aligning to a given base (offset in node) 
in the graph +/// - edge coverage : number of reads aligning to a given edge in the graph +/// - edits : a list of edits at a given base in the graph +/// In memory, the coverages are stored in SDSL int vectors (dynamic) and on disk they are compressed int vectors class Packer { public: - Packer(void); - Packer(xg::XG* xidx, size_t bin_size = 0); - ~Packer(void); - xg::XG* xgidx; + + /// Some helper functions to heuristically estimate input parameters for constructor + static size_t estimate_data_width(size_t expected_coverage); + static size_t estimate_batch_size(size_t num_threads); + static size_t estimate_bin_count(size_t num_threads); + + /// Create a Packer (to read from a file) + Packer(const HandleGraph* graph = nullptr); + + /// Create a Packer (to write to) + /// graph : Must implement the VectorizableHandleGraph interface + /// record_bases : Store the base coverage + /// record_edges : Store the edge coverage + /// record_edits : Store the edits + /// record_qualities : Store the average MAPQ for each node rank + /// bin_size : Bin coverage into bins + /// coverage_bins : Use this many coverage objects. Using one / thread allows faster merge + /// coverage_locks : Number of mutexes to use for each of node and edge coverage. + /// data_width : Number of bits per entry in the dynamic coverage vector. Higher values get stored in a map + Packer(const HandleGraph* graph, bool record_bases, bool record_edges, bool record_edits, bool record_qualities, + size_t bin_size = 0, size_t coverage_bins = 1, size_t data_width = 8); + ~Packer(); + void clear(); + + /// Add coverage from given alignment to the indexes + /// aln : given alignemnt + /// min_mapq : ignore alignments with mapping_quality below this value + /// min_baseq : ignore bases in the alignment if their read quality is below this value + /// trim_ends : ignore first and last bases + void add(const Alignment& aln, int min_mapq = 0, int min_baseq = 0, int trim_ends = 0); + void merge_from_files(const vector& file_names); void merge_from_dynamic(vector& packers); void load_from_file(const string& file_name); @@ -36,31 +71,103 @@ class Packer { std::string name = ""); void make_compact(void); void make_dynamic(void); - void add(const Alignment& aln, bool record_edits = true); - size_t graph_length(void) const; size_t position_in_basis(const Position& pos) const; string pos_key(size_t i) const; string edit_value(const Edit& edit, bool revcomp) const; vector edits_at_position(size_t i) const; size_t coverage_at_position(size_t i) const; - void collect_coverage(const Packer& c); - ostream& as_table(ostream& out, bool show_edits = true); + void collect_coverage(const vector& packers); + ostream& as_table(ostream& out, bool show_edits, vector node_ids); + ostream& as_edge_table(ostream& out, vector node_ids); + ostream& as_quality_table(ostream& out, vector node_ids); ostream& show_structure(ostream& out); // debugging void write_edits(vector& out) const; // for merge void write_edits(ostream& out, size_t bin) const; // for merge size_t get_bin_size(void) const; size_t get_n_bins(void) const; - bool is_dynamic(void); - size_t coverage_size(void); + bool is_dynamic(void) const; + const HandleGraph* get_graph() const; + size_t coverage_size(void) const ; + void increment_coverage(size_t i); + void increment_coverage(size_t i, size_t v); + + size_t edge_coverage(Edge& e) const; + size_t edge_coverage(size_t i) const; + size_t edge_vector_size(void) const; + /// Return the 1-based index of the given edge in our vectorization order, + /// 
or 0 if the edge does not exist in the graph. + size_t edge_index(const Edge& e) const; + void increment_edge_coverage(size_t i); + void increment_edge_coverage(size_t i, size_t v); + + /// total node quality (faster from dynamic) + size_t total_node_quality(size_t i) const; + /// average node quality (faster from static) + size_t average_node_quality(size_t i) const; + size_t node_quality_vector_size(void) const; + /// Return the 1-based node rank or 0 if node not in graph + size_t node_index(nid_t node_id) const; + /// and back + nid_t index_to_node(size_t i) const; + void increment_node_quality(size_t i, size_t v); + /// return true if there's at least one nonzero quality in the structure + bool has_qualities() const; + private: + /// map from absolute position to positions in the binned arrays + pair coverage_bin_offset(size_t i) const; + pair edge_coverage_bin_offset(size_t i) const; + pair node_quality_bin_offset(size_t i) const; + /// get the size of a bin + size_t coverage_bin_size(size_t i) const; + size_t edge_coverage_bin_size(size_t i) const; + size_t node_quality_bin_size(size_t i) const; + /// initialize coverage bins on demand + void init_coverage_bin(size_t i); + void init_edge_coverage_bin(size_t i); + void init_node_quality_bin(size_t i); + + void ensure_edit_tmpfiles_open(void); void close_edit_tmpfiles(void); void remove_edit_tmpfiles(void); bool is_compacted = false; + + // base graph + const HandleGraph* graph; + + // data width for counter arrays + size_t data_width; + // bin sizes (last bins may be a bit bigger) + size_t cov_bin_size; + size_t edge_cov_bin_size; + size_t node_qual_bin_size; + // dynamic model - gcsa::CounterArray coverage_dynamic; + // base coverage. we bin to make merging faster + vector coverage_dynamic; + // total length of above vectors + size_t num_bases_dynamic; + // one mutex per element of coverage_dynamic + std::mutex* base_locks; + // edge coverage.
we bin to make merging faster + vector edge_coverage_dynamic; + // total length of above + size_t num_edges_dynamic; + // one mutex per element of edge_coverage_dynamic + std::mutex* edge_locks; + // node qualities + // in the dynamic structure, we keep count of total quality for a node + // and it gets averaged out in the static, so slightly different + // semantics than the node and edge coverage which store the same information + vector node_quality_dynamic; + // total length of above + size_t num_nodes_dynamic; + // one mutex per element of node_quality_dynamic + std::mutex* node_quality_locks; + vector edit_tmpfile_names; vector tmpfstreams; + std::mutex* tmpfstream_locks; // which bin should we use size_t bin_for_position(size_t i) const; size_t n_bins = 1; @@ -68,7 +175,9 @@ class Packer { size_t edit_length = 0; size_t edit_count = 0; enc_vector<> coverage_civ; // graph coverage (compacted coverage_dynamic) - // + vlc_vector<> edge_coverage_civ; // edge coverage (compacted edge_coverage_dynamic) + vlc_vector<> node_quality_civ; // averge mapq for each node rank (compacted node_quality_dynamic) + // edits vector, 32, 32, sa_order_sa_sampling<>, isa_sampling<>, succinct_byte_alphabet<> > > edit_csas; // make separators that are somewhat unusual, as we escape these char delim1 = '\xff'; @@ -79,6 +188,22 @@ class Packer { // take each double delimiter back to a single string unescape_delim(const string& s, char d) const; string unescape_delims(const string& s) const; + + // toggles: + bool record_bases; + bool record_edges; + bool record_edits; + bool record_qualities; + + // Combine the MAPQ and base quality (if available) for a given position in the read + int compute_quality(const Alignment& aln, size_t position_in_read) const; + int combine_qualities(int map_quality, int base_quality) const; + + // Avoid recomputing qualities in above (one per thread) + mutable vector, int>*> quality_cache; + static const int maximum_quality; + static const int lru_cache_size; + }; // for making a combined matrix output and maybe doing other fun operations diff --git a/src/path.cpp b/src/path.cpp index a32524f3422..10ad343af2e 100644 --- a/src/path.cpp +++ b/src/path.cpp @@ -1,10 +1,59 @@ #include "path.hpp" -#include "stream.hpp" +#include #include "region.hpp" +#include + +using namespace vg::io; namespace vg { -const std::regex Paths::is_alt("_alt_.+_[0-9]+"); +const std::function Paths::is_alt = [](const string& path_name) { + // Really we want things that match the regex "_alt_.+_[0-9]+" + // But std::regex was taking loads and loads of time (probably matching .+) so we're replacing it with special-purpose code. + + string prefix("_alt_"); + + if (path_name.length() < prefix.length() || !std::equal(prefix.begin(), prefix.end(), path_name.begin())) { + // We lack the prefix + return false; + } + + // Otherwise it's almost certainly an alt, but make sure it ends with numbers after '_' to be sure. + + size_t found_digits = 0; + for (auto it = path_name.rbegin(); it != path_name.rend() && *it != '_'; ++it) { + // Scan in reverse until '_' (which we know exists) + if (*it < '0' || *it > '9') { + // Out of range character + return false; + } + found_digits++; + } + + // If there were any digits, and ony digits, it matches. 
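The is_alt predicate rewritten above (and completed on the next hunk line) replaces a slow std::regex with a prefix check plus a trailing-digit scan. A small standalone restatement, with asserts as usage; `is_alt_name` is a made-up name and this is my sketch, not the vg lambda itself:

```
#include <cassert>
#include <string>

// A name is treated as an alt path if it starts with "_alt_" and ends in a
// non-empty run of digits immediately after an underscore.
bool is_alt_name(const std::string& name) {
    const std::string prefix = "_alt_";
    if (name.compare(0, prefix.size(), prefix) != 0) {
        return false;               // missing the "_alt_" prefix
    }
    size_t digits = 0;
    for (auto it = name.rbegin(); it != name.rend() && *it != '_'; ++it) {
        if (*it < '0' || *it > '9') {
            return false;           // non-digit before the last underscore
        }
        ++digits;
    }
    return digits > 0;              // at least one trailing digit required
}

int main() {
    assert(is_alt_name("_alt_7f2a9c_3"));
    assert(!is_alt_name("_alt_seq_"));   // no digits after the final underscore
    assert(!is_alt_name("chr20"));       // wrong prefix
}
```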
+ return (found_digits > 0); + +}; + +string Paths::strip_subrange(const string& path_name, subrange_t* out_subrange) { + subrange_t subrange = PathMetadata::parse_subrange(path_name); + string base_name; + if (subrange == PathMetadata::NO_SUBRANGE) { + base_name = path_name; + } else { + PathSense sense; + string sample; + string locus; + size_t haplotype; + size_t phase_block; + PathMetadata::parse_path_name(path_name, sense, sample, locus, haplotype, phase_block, subrange); + base_name = PathMetadata::create_path_name(sense, sample, locus, haplotype, phase_block, PathMetadata::NO_SUBRANGE); + } + if (out_subrange) { + *out_subrange = subrange; + } + return base_name; +} mapping_t::mapping_t(void) : traversal(0), length(0), rank(1) { } @@ -50,6 +99,10 @@ void mapping_t::set_is_reverse(bool is_rev) { traversal = abs(traversal) * (is_rev ? -1 : 1); } +ostream& operator<<(ostream& out, mapping_t mapping) { + return out << mapping.node_id() << " " << (mapping.is_reverse() ? "rev" : "fwd"); +} + Paths::Paths(void) { max_path_id = 0; // noop @@ -59,7 +112,7 @@ void Paths::load(istream& in) { function lambda = [this](Path& p) { this->extend(p); }; - stream::for_each(in, lambda); + vg::io::for_each(in, lambda); } void Paths::write(ostream& out) { @@ -81,8 +134,8 @@ void Paths::write(ostream& out) { } return path; }; - stream::write(out, _paths.size(), lambda); - stream::finish(out); + vg::io::write(out, _paths.size(), lambda); + vg::io::finish(out); } void Paths::to_graph(Graph& g) { @@ -124,13 +177,23 @@ void Paths::for_each(const function& lambda) { } } -void Paths::for_each_name(const function& lambda) { +void Paths::for_each_name(const function& lambda) const { for (auto& p : _paths) { const string& name = p.first; lambda(name); } } +bool Paths::for_each_name_stoppable(const function& lambda) const { + for (auto& p : _paths) { + const string& name = p.first; + if (!lambda(name)) { + return false; + } + } + return true; +} + void Paths::for_each_mapping(const function& lambda) { for (auto& p : _paths) { auto& path = p.second; @@ -141,7 +204,7 @@ void Paths::for_each_mapping(const function& lambda) { } void Paths::for_each_stream(istream& in, const function& lambda) { - stream::for_each(in, lambda); + vg::io::for_each(in, lambda); } void Paths::make_circular(const string& name) { @@ -152,9 +215,10 @@ void Paths::make_linear(const string& name) { circular.erase(name); } -void Paths::extend(const Path& p, bool warn_on_duplicates) { +void Paths::extend(const Path& p, bool warn_on_duplicates, bool rebuild_indexes) { const string& name = p.name(); - auto& path = get_create_path(name); + // Make sure we preserve empty paths + get_create_path(name); for (int i = 0; i < p.mapping_size(); ++i) { const Mapping& m = p.mapping(i); append_mapping(name, m, warn_on_duplicates); @@ -162,30 +226,15 @@ void Paths::extend(const Path& p, bool warn_on_duplicates) { if (p.is_circular()) { make_circular(name); } - // re-sort? - sort_by_mapping_rank(); - rebuild_mapping_aux(); -} - -// one of these should go away -void Paths::extend(const Paths& p, bool warn_on_duplicates) { - for (auto& l : p._paths) { - const string& name = l.first; - auto& path = l.second; - // Make sure we preserve empty paths - get_create_path(name); - for (auto& m : path) { - append_mapping(name, m.to_mapping(), warn_on_duplicates); - } - if (p.circular.count(name)) { - make_circular(name); - } + if (rebuild_indexes) { + // re-sort? 
+ sort_by_mapping_rank(); + rebuild_mapping_aux(); } - sort_by_mapping_rank(); - rebuild_mapping_aux(); } -void Paths::append(const Paths& paths, bool warn_on_duplicates) { +// one of these should go away +void Paths::extend(const Paths& paths, bool warn_on_duplicates, bool rebuild_indexes) { for (auto& p : paths._paths) { const string& name = p.first; auto& path = p.second; @@ -198,22 +247,34 @@ void Paths::append(const Paths& paths, bool warn_on_duplicates) { make_circular(name); } } - sort_by_mapping_rank(); - rebuild_mapping_aux(); + if (rebuild_indexes) { + sort_by_mapping_rank(); + rebuild_mapping_aux(); + } +} + +void Paths::extend(const vector & paths, bool warn_on_duplicates, bool rebuild_indexes) { + for (auto& p : paths) { + extend(p, warn_on_duplicates, false); + } + if (rebuild_indexes) { + sort_by_mapping_rank(); + rebuild_mapping_aux(); + } +} + +void Paths::append(const Paths& paths, bool warn_on_duplicates, bool rebuild_indexes) { + extend(paths, warn_on_duplicates, rebuild_indexes); } -void Paths::append(const Graph& g, bool warn_on_duplicates) { +void Paths::append(const Graph& g, bool warn_on_duplicates, bool rebuild_indexes) { for (int i = 0; i < g.path_size(); ++i) { - const Path& p = g.path(i); // Make sure we preserve empty paths - get_create_path(p.name()); - for (int j = 0; j < p.mapping_size(); ++j) { - const Mapping& m = p.mapping(j); - append_mapping(p.name(), m, warn_on_duplicates); - if (p.is_circular()) { - make_circular(p.name()); - } - } + extend(g.path(i), warn_on_duplicates, false); + } + if (rebuild_indexes) { + sort_by_mapping_rank(); + rebuild_mapping_aux(); } } @@ -222,8 +283,12 @@ Path& append_path(Path& a, const Path& b) { return a; } -bool Paths::has_mapping(const string& name, size_t rank) { - return mappings_by_rank.count(name) && mappings_by_rank[name].count(rank); +bool Paths::has_mapping(const string& name, int32_t rank) { + auto iter = mappings_by_rank.find(name); + if (iter != mappings_by_rank.end()) { + return iter->second.count(rank); + } + return false; } void Paths::append_mapping(const string& name, const mapping_t& m, bool warn_on_duplicates) { @@ -296,18 +361,31 @@ void Paths::append_mapping(const string& name, const mapping_t& m, bool warn_on_ } } -int64_t Paths::get_path_id(const string& name) { - auto f = name_to_id.find(name); - if (f == name_to_id.end()) { - ++max_path_id; - name_to_id[name] = max_path_id; - id_to_name[max_path_id] = name; +int64_t Paths::get_path_id(const string& name) const { + int64_t path_id; +#pragma omp critical (path_id_map) + { + // in order to keep the critical section inside above if (so it's only touched when initializing) + // we need the second check here + if (!name_to_id.count(name)) { + // Assign an ID. + // These members are mutable. + ++max_path_id; + id_to_name[max_path_id] = name; + name_to_id[name] = max_path_id; + } + path_id = name_to_id[name]; } - return name_to_id[name]; + return path_id; } -const string& Paths::get_path_name(int64_t id) { - return id_to_name[id]; +const string& Paths::get_path_name(int64_t id) const { + const string* name; +#pragma omp critical (path_id_map) + { + name = &id_to_name[id]; + } + return *name; } void Paths::append_mapping(const string& name, id_t id, bool is_reverse, size_t length, size_t rank, bool warn_on_duplicates) { @@ -327,16 +405,29 @@ void Paths::prepend_mapping(const string& name, const Mapping& m, bool warn_on_d // get or create the path with this name list& pt = get_create_path(name); - // TODO: Implement dealing with no rank. 
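get_path_id() above hands out numeric IDs lazily the first time a name is seen, behind an OpenMP critical section, so that the lookups can stay logically const while still being safe to call from several threads. A minimal sketch of the same pattern (mine, not vg's; a std::mutex stands in for the `#pragma omp critical` section, and the class name is made up):

```
#include <cassert>
#include <cstdint>
#include <map>
#include <mutex>
#include <string>

// Lazily assign stable numeric IDs to path names, thread-safely.
class PathIdRegistry {
public:
    int64_t get_id(const std::string& name) const {
        std::lock_guard<std::mutex> guard(mutex_);
        auto it = name_to_id_.find(name);
        if (it == name_to_id_.end()) {
            // first time we see this name: assign the next ID
            int64_t id = ++max_id_;
            id_to_name_[id] = name;
            it = name_to_id_.emplace(name, id).first;
        }
        return it->second;
    }

    const std::string& get_name(int64_t id) const {
        std::lock_guard<std::mutex> guard(mutex_);
        return id_to_name_.at(id);
    }

private:
    // mutable so lookups can remain logically const, as in Paths
    mutable std::mutex mutex_;
    mutable int64_t max_id_ = 0;
    mutable std::map<std::string, int64_t> name_to_id_;
    mutable std::map<int64_t, std::string> id_to_name_;
};

int main() {
    PathIdRegistry reg;
    assert(reg.get_id("chr1") == 1);
    assert(reg.get_id("chr2") == 2);
    assert(reg.get_id("chr1") == 1);     // already assigned, unchanged
    assert(reg.get_name(2) == "chr2");
}
```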
- // We can't prepend a mapping that doesn't have a rank set. We would like to - // generate ranks, but we can't keep decrementing the first rank - // indefinitely, and that might not be correct. Also, what rank would we use - // for the only mapping in a path? - assert(m.rank()); + // TODO: I'm not sure if this is the best way for handling ranks, but the ranks + // are really a chunked serialization thing, not an in-memory construct. Moreover, + // we're ideally going to move away from using the VG graph in the future, so I don't + // expect this will even come up. Mostly just trying to meet the HandleGraph interface + // in the interim. + int32_t rank = m.rank(); + if (rank == 0) { + // no given rank, decrement the first rank, skipping over 0 to preserve it as + // a sentinel + if (pt.empty()) { + rank = 1; + } + else if (pt.front().rank != 1) { + rank = pt.front().rank - 1; + } + else { + rank = -1; + } + } // now if we haven't already supplied a mapping // add it - if (!has_mapping(name, m.rank())) { + if (!has_mapping(name, rank)) { // If we don't have a rank set or we don't have a mapping in this path // with that rank, we need to add the mapping. @@ -354,7 +445,7 @@ void Paths::prepend_mapping(const string& name, const Mapping& m, bool warn_on_d } else if (warn_on_duplicates) { // This mapping duplicates the rank of an existing mapping. // We're not going to keep it, so we should complain. - cerr << "[vg] warning: path " << name << " rank " << m.rank() << " appears multiple times. Skipping." << endl; + cerr << "[vg] warning: path " << name << " rank " << rank << " appears multiple times. Skipping." << endl; } } @@ -423,7 +514,7 @@ pair Paths::replace_mapping(mapping_t* m, pair& id_mapping) { +void Paths::swap_node_ids(const std::function& get_new_id) { for (auto& p : _paths) { const string& name = p.first; list& path = p.second; for (auto& m : path) { // Look up the replacement ID - auto replacement = id_mapping.find(m.node_id()); - if(replacement != id_mapping.end()) { - // If there is a replacement, use it. - m.set_node_id((*replacement).second); + auto replacement = get_new_id(m.node_id()); + if(replacement != 0) { + // If there is a nonzero replacement, use it. 
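The rank chosen by prepend_mapping() above when the incoming mapping carries no rank is: one less than the current first rank, except that 0 is skipped because it serves as the "no rank" sentinel. A tiny illustration of just that decision (the helper `prepend_rank` is hypothetical):

```
#include <cassert>
#include <cstdint>
#include <deque>

// Pick a rank for a mapping prepended to a path whose existing ranks are given.
int32_t prepend_rank(const std::deque<int32_t>& existing_ranks) {
    if (existing_ranks.empty()) {
        return 1;                       // first mapping on the path
    }
    if (existing_ranks.front() != 1) {
        return existing_ranks.front() - 1;
    }
    return -1;                          // can't use 0, jump straight to -1
}

int main() {
    assert(prepend_rank({}) == 1);
    assert(prepend_rank({5, 6, 7}) == 4);
    assert(prepend_rank({1, 2, 3}) == -1);
    assert(prepend_rank({-1, 1, 2}) == -2);
}
```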
+ m.set_node_id(replacement); } } } rebuild_node_mapping(); } +void Paths::swap_node_ids(hash_map& id_mapping) { + swap_node_ids([&](const nid_t& id) -> nid_t { + auto it = id_mapping.find(id); + if (it == id_mapping.end()) { + // Not found + return 0; + } else { + // Use the result + return it->second; + } + }); +} + void Paths::reassign_node(id_t new_id, mapping_t* m) { // erase the old node id node_mapping[m->node_id()][mapping_path_id(m)].erase(m); @@ -672,17 +776,21 @@ list& Paths::get_create_path(const string& name) { } } -bool Paths::has_node_mapping(id_t id) { +bool Paths::has_node_mapping(id_t id) const { return node_mapping.find(id) != node_mapping.end(); } -bool Paths::has_node_mapping(Node* n) { +bool Paths::has_node_mapping(Node* n) const { return node_mapping.find(n->id()) != node_mapping.end(); } map>& Paths::get_node_mapping(id_t id) { return node_mapping[id]; } + +const map>& Paths::get_node_mapping(id_t id) const { + return node_mapping.at(id); +} map>& Paths::get_node_mapping(Node* n) { return node_mapping[n->id()]; @@ -1112,6 +1220,15 @@ Path& extend_path(Path& path1, const Path& path2) { // concatenates paths Path concat_paths(const Path& path1, const Path& path2) { + + if (path1.mapping_size() == 0) { + return path2; + } else if (path2.mapping_size() == 0) { + return path1; + } + + // Otherwise there are mappings in both and we have real work to do + Path res = path1; //cerr << "-------------------- concat thing ------------------" << endl; //cerr << pb2json(path1) << endl << pb2json(path2) << endl; @@ -1265,7 +1382,7 @@ Path simplify(const Path& p, bool trim_internal_deletions) { auto& m = r.mapping(i); int curr_to_length = mapping_to_length(m); // skip bits at the beginning and end - if (!seen_to_length && !curr_to_length + if ((!seen_to_length && !curr_to_length) || seen_to_length == total_to_length) continue; Mapping n; *n.mutable_position() = m.position(); @@ -1384,16 +1501,8 @@ Mapping simplify(const Mapping& m, bool trim_internal_deletions) { // if the edit types are the same, merge them if (edit_is_empty(f)) { continue; - } else if ((edit_is_match(e) && edit_is_match(f)) - || (edit_is_sub(e) && edit_is_sub(f)) - || (edit_is_deletion(e) && edit_is_deletion(f)) - || (edit_is_insertion(e) && edit_is_insertion(f))) { - // will be 0 for insertions, and + for the rest - e.set_from_length(e.from_length()+f.from_length()); - // will be 0 for deletions, and + for the rest - e.set_to_length(e.to_length()+f.to_length()); - // will be empty for both or have sequence for both - e.set_sequence(e.sequence() + f.sequence()); + } else if (edits_are_compatible(e, f)) { + merge_edits_in_place(e, f); } else { // mismatched types are just put on *n.add_edit() = e; @@ -1409,6 +1518,22 @@ Mapping simplify(const Mapping& m, bool trim_internal_deletions) { return n; } +bool edits_are_compatible(const Edit& e, const Edit& f) { + return (edit_is_match(e) && edit_is_match(f)) + || (edit_is_sub(e) && edit_is_sub(f)) + || (edit_is_deletion(e) && edit_is_deletion(f)) + || (edit_is_insertion(e) && edit_is_insertion(f)); +} + +void merge_edits_in_place(Edit& e, const Edit& f) { + // will be 0 for insertions, and + for the rest + e.set_from_length(e.from_length() + f.from_length()); + // will be 0 for deletions, and + for the rest + e.set_to_length(e.to_length() + f.to_length()); + // will be empty for both or have sequence for both + e.set_sequence(e.sequence() + f.sequence()); +} + Mapping merge_adjacent_edits(const Mapping& m) { Mapping n; @@ -1513,6 +1638,21 @@ Path trim_hanging_ends(const 
Path& p) { return r; } +bool mappings_equivalent(const Mapping& m1, const Mapping& m2) { + bool equivalent = (m1.position().node_id() == m2.position().node_id() + && m1.position().is_reverse() == m2.position().is_reverse() + && m1.position().offset() == m2.position().offset() + && m1.edit_size() == m2.edit_size()); + for (size_t i = 0; i < m1.edit_size() && equivalent; ++i) { + const auto& e1 = m1.edit(i); + const auto& e2 = m2.edit(i); + equivalent = (e1.from_length() == e2.from_length() + && e1.to_length() == e2.to_length() + && e1.sequence() == e2.sequence()); + } + return equivalent; +} + bool mapping_ends_in_deletion(const Mapping& m){ return m.edit_size() >= 1 && edit_is_deletion(m.edit(m.edit_size()-1)); } @@ -1525,6 +1665,10 @@ bool mapping_is_total_deletion(const Mapping& m) { return m.edit_size() == 1 && edit_is_deletion(m.edit(0)); } +bool mapping_is_total_insertion(const Mapping& m) { + return m.edit_size() == 1 && edit_is_insertion(m.edit(0)); +} + bool mapping_is_simple_match(const Mapping& m) { return m.edit_size() == 1 && edit_is_match(m.edit(0)); } @@ -1572,14 +1716,33 @@ const string mapping_sequence(const Mapping& mp, const string& node_seq) { const string mapping_sequence(const Mapping& mp, const Node& n) { if (!mp.has_position() || !mp.position().node_id()) { - assert(mp.edit_size()==1); - return mp.edit(0).sequence(); + // With no grap position we must be a pure insert. + // n is undefined. + // But we might have multiple edits. + std::stringstream s; + for (auto& e : mp.edit()) { + // We can't have any from bases if we have no graph position. + assert(e.from_length() == 0); + s << e.sequence(); + } + return s.str(); } assert(mp.position().node_id() == n.id()); auto& node_seq = n.sequence(); return mapping_sequence(mp, node_seq); } +// convert the path to a sequence +string path_sequence(const HandleGraph& graph, const Path& path) { + string seq; + for (int i = 0; i < path.mapping_size(); ++i) { + auto& m = path.mapping(i); + handle_t h = graph.get_handle(m.position().node_id(), m.position().is_reverse()); + seq.append(graph.get_sequence(h)); + } + return seq; +} + Mapping reverse_complement_mapping(const Mapping& m, const function& node_length) { // Make a new reversed mapping @@ -2033,7 +2196,7 @@ bool maps_to_node(const Path& p, id_t id) { } // returns the start position, or an empty position if the path has no mappings with positions -Position path_start(const Path& path) { +Position path_start_position(const Path& path) { for (size_t i = 0; i < path.mapping_size(); ++i) { auto& mapping = path.mapping(i); if (mapping.has_position()) return mapping.position(); @@ -2055,7 +2218,7 @@ string path_to_string(Path p){ } // determine the path end -Position path_end(const Path& path) { +Position path_end_position(const Path& path) { Position pos; if (!path.mapping_size()) return pos; auto& last = path.mapping(path.mapping_size()-1); @@ -2182,6 +2345,31 @@ void translate_node_ids(Path& path, const unordered_map& translator) } } +void translate_node_ids(Path& path, const unordered_map& translator, id_t cut_node, size_t bases_removed, bool from_right) { + // First just translate the IDs + translate_node_ids(path, translator); + + + for (size_t i = 0; i < path.mapping_size(); i++) { + // Scan the whole path again. We can't count on the cut node only being in the first and last mappings. 
+ Position* position = path.mutable_mapping(i)->mutable_position(); + if (position->node_id() == cut_node) { + // Then adjust offsets to account for the cut on the original node + + // If the position in the path is counting from the same end of the + // node that we didn't keep after the cut, we have to bump up its + // offset. + if ((!position->is_reverse() && !from_right) || // We cut off the left of the node, and we're counting from the left + (position->is_reverse() && from_right)) { // We cut off the right of the node, and we're counting from the right + // Update the offset to reflect the removed bases + position->set_offset(position->offset() + bases_removed); + } + } + } + + +} + void translate_oriented_node_ids(Path& path, const unordered_map>& translator) { for (size_t i = 0; i < path.mapping_size(); i++) { Position* position = path.mutable_mapping(i)->mutable_position(); @@ -2190,22 +2378,62 @@ void translate_oriented_node_ids(Path& path, const unordered_mapset_is_reverse(translation.second != position->is_reverse()); } } + +void translate_oriented_node_ids(Path& path, const function(id_t)>& translator) { + for (size_t i = 0; i < path.mapping_size(); i++) { + Position* position = path.mutable_mapping(i)->mutable_position(); + const pair& translation = translator(position->node_id()); + position->set_node_id(translation.first); + position->set_is_reverse(translation.second != position->is_reverse()); + } +} + + +void translate_node_ids(path_t& path, const unordered_map& translator) { + for (size_t i = 0; i < path.mapping_size(); i++) { + position_t* position = path.mutable_mapping(i)->mutable_position(); + position->set_node_id(translator.at(position->node_id())); + } +} +void translate_oriented_node_ids(path_t& path, const unordered_map>& translator) { + for (size_t i = 0; i < path.mapping_size(); i++) { + position_t* position = path.mutable_mapping(i)->mutable_position(); + const pair& translation = translator.at(position->node_id()); + position->set_node_id(translation.first); + position->set_is_reverse(translation.second != position->is_reverse()); + } +} + +void translate_oriented_node_ids(path_t& path, const function(id_t)>& translator) { + for (size_t i = 0; i < path.mapping_size(); i++) { + position_t* position = path.mutable_mapping(i)->mutable_position(); + const pair& translation = translator(position->node_id()); + position->set_node_id(translation.first); + position->set_is_reverse(translation.second != position->is_reverse()); + } +} pos_t initial_position(const Path& path) { - if (!path.mapping_size()) { - return pos_t(); + pos_t pos; + if (path.mapping_size()) { + const Position& position = path.mapping(0).position(); + get_id(pos) = position.node_id(); + get_is_rev(pos) = position.is_reverse(); + get_offset(pos) = position.offset(); } - return path.mapping_size() ? 
make_pos_t(path.mapping(0).position()) : pos_t(); + return pos; } pos_t final_position(const Path& path) { - if (!path.mapping_size()) { - return pos_t(); + pos_t pos; + if (path.mapping_size()) { + const Mapping& mapping = path.mapping(path.mapping_size() - 1); + const Position& position = mapping.position(); + get_id(pos) = position.node_id(); + get_is_rev(pos) = position.is_reverse(); + get_offset(pos) = position.offset() + mapping_from_length(mapping); } - const Mapping& mapping = path.mapping(path.mapping_size() - 1); - return make_pos_t(mapping.position().node_id(), - mapping.position().is_reverse(), - mapping.position().offset() + mapping_from_length(mapping) - 1); + return pos; } Path path_from_node_traversals(const list& traversals) { @@ -2237,11 +2465,11 @@ Path path_from_node_traversals(const list& traversals) { return toReturn; } -void remove_paths(Graph& graph, const std::regex& paths_to_take, std::list* matching) { +void remove_paths(Graph& graph, const function& paths_to_take, std::list* matching) { std::list non_matching; for (size_t i = 0; i < graph.path_size(); i++) { - if (std::regex_match(graph.path(i).name(), paths_to_take)) { + if (paths_to_take(graph.path(i).name())) { if (matching != nullptr) { matching->push_back(graph.path(i)); } @@ -2256,4 +2484,309 @@ void remove_paths(Graph& graph, const std::regex& paths_to_take, std::list } } +Path path_from_path_handle(const PathHandleGraph& graph, path_handle_t path_handle) { + Path path; + path.set_name(graph.get_path_name(path_handle)); + size_t rank = 1; + for (handle_t handle : graph.scan_path(path_handle)) { + Mapping* mapping = path.add_mapping(); + mapping->mutable_position()->set_node_id(graph.get_id(handle)); + mapping->mutable_position()->set_is_reverse(graph.get_is_reverse(handle)); + mapping->set_rank(rank++); + Edit* edit = mapping->add_edit(); + edit->set_from_length(graph.get_length(handle)); + edit->set_to_length(graph.get_length(handle)); + } + return path; +} + +// Wrap a Path in an Alignment +Alignment alignment_from_path(const HandleGraph& graph, const Path& path) { + Alignment aln; + *aln.mutable_path() = path; + aln.set_name(aln.path().name()); + aln.set_sequence(path_sequence(graph, path)); + return aln; +} + +void from_proto_edit(const Edit& proto_edit, edit_t& edit) { + edit.set_from_length(proto_edit.from_length()); + edit.set_to_length(proto_edit.to_length()); + edit.set_sequence(proto_edit.sequence()); +} + +void to_proto_edit(const edit_t& edit, Edit& proto_edit) { + proto_edit.set_from_length(edit.from_length()); + proto_edit.set_to_length(edit.to_length()); + proto_edit.set_sequence(edit.sequence()); +} + +void from_proto_mapping(const Mapping& proto_mapping, path_mapping_t& mapping) { + const auto& position = proto_mapping.position(); + auto position_copy = mapping.mutable_position(); + position_copy->set_node_id(position.node_id()); + position_copy->set_offset(position.offset()); + position_copy->set_is_reverse(position.is_reverse()); + for (const auto& edit : proto_mapping.edit()) { + from_proto_edit(edit, *mapping.add_edit()); + } +} + +void to_proto_mapping(const path_mapping_t& mapping, Mapping& proto_mapping) { + const auto& position = mapping.position(); + auto position_copy = proto_mapping.mutable_position(); + position_copy->set_node_id(position.node_id()); + position_copy->set_offset(position.offset()); + position_copy->set_is_reverse(position.is_reverse()); + for (const auto& edit : mapping.edit()) { + to_proto_edit(edit, *proto_mapping.add_edit()); + } +} + +void 
from_proto_path(const Path& proto_path, path_t& path) { + for (const auto& mapping : proto_path.mapping()) { + from_proto_mapping(mapping, *path.add_mapping()); + } +} +void to_proto_path(const path_t& path, Path& proto_path) { + for (const auto& mapping : path.mapping()) { + auto mapping_copy = proto_path.add_mapping(); + to_proto_mapping(mapping, *mapping_copy); + mapping_copy->set_rank(proto_path.mapping_size()); + } +} + +int mapping_from_length(const path_mapping_t& mapping) { + int length = 0; + for (const auto& edit : mapping.edit()) { + length += edit.from_length(); + } + return length; +} + +int path_from_length(const path_t& path) { + int length = 0; + for (const auto& mapping : path.mapping()) { + length += mapping_from_length(mapping); + } + return length; +} + +int mapping_to_length(const path_mapping_t& mapping) { + int length = 0; + for (const auto& edit : mapping.edit()) { + length += edit.to_length(); + } + return length; +} + +int path_to_length(const path_t& path) { + int length = 0; + for (const auto& mapping : path.mapping()) { + length += mapping_to_length(mapping); + } + return length; +} + + +void reverse_complement_mapping_in_place(path_mapping_t* m, + const function& node_length) { + + position_t* pos = m->mutable_position(); + pos->set_is_reverse(!pos->is_reverse()); + pos->set_offset(node_length(pos->node_id()) - pos->offset() - mapping_from_length(*m)); + + size_t swap_size = m->edit_size() / 2; + for (size_t i = 0, j = m->edit_size() - 1; i < swap_size; i++, j--) { + edit_t* e1 = m->mutable_edit(i); + edit_t* e2 = m->mutable_edit(j); + + int64_t from_length_tmp = e1->from_length(); + int64_t to_length_tmp = e1->to_length(); + string sequence_tmp = e1->sequence(); + + e1->set_from_length(e2->from_length()); + e1->set_to_length(e2->to_length()); + e1->set_sequence(reverse_complement(e2->sequence())); + + e2->set_from_length(from_length_tmp); + e2->set_to_length(to_length_tmp); + e2->set_sequence(reverse_complement(sequence_tmp)); + } + + + if (m->edit_size() % 2) { + edit_t* e = m->mutable_edit(swap_size); + reverse_complement_in_place(*e->mutable_sequence()); + } +} + +path_mapping_t reverse_complement_mapping(const path_mapping_t& m, + const function& node_length) { + + path_mapping_t reversed; + position_t* rev_pos = reversed.mutable_position(); + rev_pos->set_node_id(m.position().node_id()); + rev_pos->set_is_reverse(!m.position().is_reverse()); + rev_pos->set_offset(node_length(m.position().node_id()) - m.position().offset() - mapping_from_length(m)); + + for (int64_t i = m.edit_size() - 1; i >= 0; i--) { + const edit_t& e = m.edit(i); + edit_t* rev_edit = reversed.add_edit(); + rev_edit->set_from_length(e.from_length()); + rev_edit->set_to_length(e.to_length()); + rev_edit->set_sequence(reverse_complement(e.sequence())); + } + + return reversed; +} + +path_t reverse_complement_path(const path_t& path, + const function& node_length) { + + // Make a new reversed path + path_t reversed; + + for (int64_t i = path.mapping_size() - 1; i >= 0; i--) { + // For each mapping in reverse order, put it in reverse complemented and + // measured from the other end of the node. 
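The offset arithmetic used when a mapping is reverse-complemented, as in reverse_complement_mapping() above, measures the new offset from the other end of the node: new_offset = node_length - old_offset - mapping_from_length. A short worked example (standalone sketch, made-up helper name):

```
#include <cassert>
#include <cstdint>

// Offset of a mapping after flipping it onto the opposite strand of its node.
int64_t flipped_offset(int64_t node_length, int64_t offset, int64_t from_length) {
    return node_length - offset - from_length;
}

int main() {
    // A mapping consuming 5 bases of a 10 bp node starting at offset 2 covers
    // positions 2..6; read from the opposite strand it starts at offset 3 and
    // covers positions 3..7 of the reverse complement.
    assert(flipped_offset(10, 2, 5) == 3);
    // A full-length mapping keeps offset 0 on both strands.
    assert(flipped_offset(10, 0, 10) == 0);
}
```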
+ *reversed.add_mapping() = reverse_complement_mapping(path.mapping(i), node_length); + } + + return reversed; +} + +void reverse_complement_path_in_place(path_t* path, + const function& node_length) { + + size_t swap_size = path->mapping_size() / 2; + for (size_t i = 0, j = path->mapping_size() - 1; i < swap_size; i++, j--) { + path_mapping_t* m1 = path->mutable_mapping(i); + path_mapping_t* m2 = path->mutable_mapping(j); + + reverse_complement_mapping_in_place(m1, node_length); + reverse_complement_mapping_in_place(m2, node_length); + + std::swap(*m1, *m2); + } + + if (path->mapping_size() % 2) { + reverse_complement_mapping_in_place(path->mutable_mapping(swap_size), node_length); + } +} + +pos_t initial_position(const path_t& path) { + pos_t pos; + if (path.mapping_size()) { + const position_t& position = path.mapping(0).position(); + get_id(pos) = position.node_id(); + get_is_rev(pos) = position.is_reverse(); + get_offset(pos) = position.offset(); + } + return pos; +} + +pos_t final_position(const path_t& path) { + pos_t pos; + if (path.mapping_size()) { + const path_mapping_t& mapping = path.mapping(path.mapping_size() - 1); + const position_t& position = mapping.position(); + get_id(pos) = position.node_id(); + get_is_rev(pos) = position.is_reverse(); + get_offset(pos) = position.offset() + mapping_from_length(mapping); + } + return pos; +} + +string debug_string(const path_t& path) { + string to_return = "{"; + if (!path.mapping().empty()) { + to_return += "mapping: ["; + for (size_t i = 0; i < path.mapping_size(); ++i) { + if (i > 0) { + to_return += ", "; + } + to_return += debug_string(path.mapping(i)); + } + to_return += "]"; + } + to_return += "}"; + return to_return; +} + +string debug_string(const path_mapping_t& mapping) { + string to_return = "{pos: " + debug_string(mapping.position()); + if (!mapping.edit().empty()) { + to_return += ", edit: ["; + for (size_t i = 0; i < mapping.edit_size(); ++i) { + if (i > 0) { + to_return += ", "; + } + to_return += debug_string(mapping.edit(i)); + } + to_return += "]"; + } + to_return += "}"; + return to_return; +} + +string debug_string(const edit_t& edit) { + string to_return = "{fl: " + to_string(edit.from_length()) + ", tl: " + to_string(edit.to_length()); + if (!edit.sequence().empty()) { + to_return += ", seq: " + edit.sequence(); + } + to_return += "}"; + return to_return; +} + +int corresponding_length_internal(const path_t& path, int given_length, bool is_from_length, bool from_end) { + int from_length = 0; + if (path.mapping().empty()) { + return from_length; + } + int incr, i_begin; + if (from_end) { + i_begin = path.mapping_size() - 1; + incr = -1; + } + else { + incr = 1; + i_begin = 0; + } + int remaining = given_length; + int other_length_total = 0; + for (int i = i_begin; i >= 0 && i < path.mapping_size() && remaining != 0; i += incr) { + const auto& mapping = path.mapping(i); + int j_begin = from_end ? 
mapping.edit_size() - 1 : 0; + for (int j = j_begin; j >= 0 && j < mapping.edit_size() && remaining != 0; j += incr) { + const edit_t& edit = mapping.edit(j); + int walking_length, other_length; + if (is_from_length) { + walking_length = edit.from_length(); + other_length = edit.to_length(); + } + else { + walking_length = edit.to_length(); + other_length = edit.from_length(); + } + if (remaining >= walking_length) { + remaining -= walking_length; + other_length_total += other_length; + } + else { + other_length_total += (remaining * other_length) / walking_length; + remaining = 0; + } + } + } + return other_length_total; +} +int corresponding_to_length(const path_t& path, int from_length, bool from_end) { + return corresponding_length_internal(path, from_length, true, from_end); +} + +int corresponding_from_length(const path_t& path, int to_length, bool from_end) { + return corresponding_length_internal(path, to_length, false, from_end); +} + } diff --git a/src/path.hpp b/src/path.hpp index e57812a070d..b50208186b0 100644 --- a/src/path.hpp +++ b/src/path.hpp @@ -1,4 +1,4 @@ -#ifndef VG_PATH_HPP_INCLUDED + #ifndef VG_PATH_HPP_INCLUDED #define VG_PATH_HPP_INCLUDED #include @@ -7,10 +7,9 @@ #include #include #include -#include -#include "json2pb.h" -#include "vg.pb.h" -#include "edit.hpp" +#include "vg/io/json2pb.h" +#include +#include "vg/io/edit.hpp" #include "hash_map.hpp" #include "utility.hpp" #include "types.hpp" @@ -37,11 +36,19 @@ class mapping_t { void set_is_reverse(bool is_rev); }; +/// Allow a mapping_t to be printed, for debugging purposes +ostream& operator<<(ostream& out, mapping_t mapping); + class Paths { public: - // This regex matches the names of alt paths. - const static std::regex is_alt; + // This predicate matches the names of alt paths. + // We used to use a regex but that's a very slow way to check a prefix. + const static function is_alt; + + // Use the path metadata api to strip a subrange from a path name. + // If desired, the subrange can be stored in start and end + static string strip_subrange(const string& path_name, subrange_t* subrange = nullptr); Paths(void); @@ -75,11 +82,19 @@ class Paths { // This maps from path name to the list of Mappings for that path. map > _paths; - int64_t max_path_id; - map name_to_id; - int64_t get_path_id(const string& name); - map id_to_name; - const string& get_path_name(int64_t id); + +private: + // These need to be private because path names are lazily assigned IDs by the accessors + // They also need to be mutable because we want our accessors to treat our object as logical const + mutable int64_t max_path_id; + mutable map name_to_id; + mutable map id_to_name; +public: + /// Get the lazily assigned numeric ID for a path, by name. + int64_t get_path_id(const string& name) const; + /// Get the name of a path, by numeric ID. + const string& get_path_name(int64_t id) const; + // This maps from mapping_t* pointer to its iterator in its list of Mappings // for its path and the id of the path. // The list in question is stored above in _paths. @@ -93,7 +108,7 @@ class Paths { // We need this in order to make sure we aren't adding duplicate mappings // with the same rank in the same path. Maps from path name and rank to // Mapping pointer. - map> mappings_by_rank; + map> mappings_by_rank; // This maps from node ID, then path name, then rank and orientation, to // Mapping pointers for the mappings on that path to that node. 
hash_map>> node_mapping; @@ -133,7 +148,7 @@ class Paths { void remove_path(const string& name); void keep_paths(const set& name); void remove_node(id_t id); - bool has_path(const string& name); + bool has_path(const string& name) const; void to_json(ostream& out); list& get_path(const string& name); list& get_create_path(const string& name); @@ -141,14 +156,15 @@ class Paths { // Does the given path have a mapping meeting the given criteria? // Is there a mapping in the given path with the given assigned rank? Note // that the rank passed may not be 0. - bool has_mapping(const string& name, size_t rank); + bool has_mapping(const string& name, int32_t rank); // We used to be able to search for a Mapping by value, but that's not // efficient if the Mappings don't have ranks, and it never checked the // edits for equality anyway. - bool has_node_mapping(id_t id); - bool has_node_mapping(Node* n); + bool has_node_mapping(id_t id) const; + bool has_node_mapping(Node* n) const; map >& get_node_mapping(Node* n); map >& get_node_mapping(id_t id); + const map >& get_node_mapping(id_t id) const; map > get_node_mapping_by_path_name(Node* n); map > get_node_mapping_by_path_name(id_t id); map > get_node_mappings_by_rank(id_t id); @@ -203,19 +219,24 @@ class Paths { void append_mapping(const string& name, id_t id, bool is_reverse, size_t length, size_t rank = 0, bool warn_on_duplicates = false); // TODO: Adapt this to use mapping_t instead. void prepend_mapping(const string& name, const Mapping& m, bool warn_on_duplicates = false); - void prepend_mapping(const string& name, id_t id, bool is_reverse, size_t length, size_t rank, bool warn_on_duplicates = false); + void prepend_mapping(const string& name, id_t id, bool is_reverse, size_t length, size_t rank = 0, bool warn_on_duplicates = false); size_t get_next_rank(const string& name); - void append(const Paths& p, bool warn_on_duplicates = false); - void append(const Graph& g, bool warn_on_duplicates = false); - void extend(const Paths& p, bool warn_on_duplicates = false); - void extend(const Path& p, bool warn_on_duplicates = false); + void append(const Paths& paths, bool warn_on_duplicates = false, bool rebuild_indexes = true); + void append(const Graph& g, bool warn_on_duplicates = false, bool rebuild_indexes = true); + void extend(const Paths& paths, bool warn_on_duplicates = false, bool rebuild_indexes = true); + void extend(const Path& p, bool warn_on_duplicates = false, bool rebuild_indexes = true); + void extend(const vector & paths, bool warn_on_duplicates = false, bool rebuild_indexes = true); void for_each(const function& lambda); // Loop over the names of paths without actually extracting the Path objects. - void for_each_name(const function& lambda); + void for_each_name(const function& lambda) const; + // Like for_each_name but allows stopping early. + // TODO: Use the libhandlegraph unified iteratee pattern here. + bool for_each_name_stoppable(const function& lambda) const; void for_each_stream(istream& in, const function& lambda); void increment_node_ids(id_t inc); - // Replace the node IDs used as keys with those used as values. + // Replace the node IDs according to a mapping from old ID to new ID. // This is only efficient to do in a batch. 
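+    // The callback overload below computes each replacement ID on the fly; the hash_map overload takes a precomputed old-to-new table.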
+ void swap_node_ids(const std::function& get_new_id); void swap_node_ids(hash_map& id_mapping); // sets the mapping to the new id // erases current (old index information) @@ -225,6 +246,9 @@ class Paths { string path_to_string(Path p); Path& increment_node_mapping_ids(Path& p, id_t inc); +/// Append the second path onto the end of the first, without combining +/// mappings or simplifying. Modifies and returns a reference to the first +/// path. Path& append_path(Path& a, const Path& b); const Paths paths_from_graph(Graph& g); int path_to_length(const Path& path); @@ -235,14 +259,18 @@ Position first_path_position(const Path& path); Position last_path_position(const Path& path); int to_length(const Mapping& m); int from_length(const Mapping& m); +bool mappings_equivalent(const Mapping& m1, const Mapping& m2); bool mapping_ends_in_deletion(const Mapping& m); bool mapping_starts_in_deletion(const Mapping& m); +bool mapping_is_total_insertion(const Mapping& m); bool mapping_is_total_deletion(const Mapping& m); bool mapping_is_simple_match(const Mapping& m); bool path_is_simple_match(const Path& p); // convert the mapping to the particular node into the sequence implied by the mapping const string mapping_sequence(const Mapping& m, const string& node_seq); const string mapping_sequence(const Mapping& m, const Node& n); +// convert the path to a sequence +string path_sequence(const HandleGraph& graph, const Path& path); // Reverse-complement a Mapping and all the Edits in it. A function to get node // lengths is needed, because the mapping will need to count its position from // the other end of the node. @@ -269,6 +297,11 @@ Path simplify(const Path& p, bool trim_internal_deletions = true); /// actually set. Mapping simplify(const Mapping& m, bool trim_internal_deletions = true); +/// Return true if two edits could be combined into one (assuming adjacency). +bool edits_are_compatible(const Edit& e, const Edit& f); +/// Glom the second edit into the first, assuming adjacency. +void merge_edits_in_place(Edit& e, const Edit& f); + /// Merge adjacent edits of the same type Path merge_adjacent_edits(const Path& m); /// Merge adjacent edits of the same type @@ -296,8 +329,8 @@ pair cut_path(const Path& path, const Position& pos); pair cut_path(const Path& path, size_t offset); bool maps_to_node(const Path& p, id_t id); // the position that starts just after the path ends -Position path_start(const Path& path); -Position path_end(const Path& path); +Position path_start_position(const Path& path); +Position path_end_position(const Path& path); bool adjacent_mappings(const Mapping& m1, const Mapping& m2); // Return true if a mapping is a perfect match (i.e. contains no non-match edits) bool mapping_is_match(const Mapping& m); @@ -310,23 +343,250 @@ double overlap(const Path& p1, const Path& p2); // helps estimate overapls quickly void decompose(const Path& path, map& ref_positions, map& edits); -// switches the node ids in the path to the ones indicated by the translator +/// Switches the node ids in the path to the ones indicated by the translator void translate_node_ids(Path& path, const unordered_map& translator); -// switches the node ids and orientations in the path to the ones indicated by the translator +/// Replaces the node IDs in the path with the ones indicated by the +/// translator. Supports a single cut node in the source graph, where the given +/// number of bases of the given node were removed from its left or right side +/// when making the source graph from the destination graph. 
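+/// Here cut_node names the node that was cut, bases_removed is how many bases were trimmed,
+/// and from_right selects which side of the node they were removed from.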
+void translate_node_ids(Path& path, const unordered_map& translator, id_t cut_node, size_t bases_removed, bool from_right); +/// Switches the node ids and orientations in the path to the ones indicated by the translator void translate_oriented_node_ids(Path& path, const unordered_map>& translator); +/// Switches node ids and orientations in the path to the ones indicated by the translator +void translate_oriented_node_ids(Path& path, const function(id_t)>& translator); // the first position on the path pos_t initial_position(const Path& path); // the last position on the path pos_t final_position(const Path& path); - + // Turn a list of node traversals into a path Path path_from_node_traversals(const list& traversals); -// Remove the paths with names matching the regex from the graph. +// Remove the paths with names matching the predicate from the graph. // Store them in the list unless it is nullptr. -void remove_paths(Graph& graph, const std::regex& paths_to_take, std::list* matching); +void remove_paths(Graph& graph, const function& paths_to_take, std::list* matching); + +// Get a Path from a handle graph +Path path_from_path_handle(const PathHandleGraph& graph, path_handle_t path_handle); + +// Wrap a Path in an Alignment +Alignment alignment_from_path(const HandleGraph& graph, const Path& path); + + +/* + * STL implementations of the protobuf object for use in in-memory operations + */ +class edit_t { +public: + edit_t() = default; + edit_t(const edit_t&) = default; + edit_t(edit_t&&) = default; + ~edit_t() = default; + edit_t& operator=(const edit_t&) = default; + edit_t& operator=(edit_t&&) = default; + inline int32_t from_length() const; + inline void set_from_length(int32_t l); + inline int32_t to_length() const; + inline void set_to_length(int32_t l); + inline const string& sequence() const; + inline void set_sequence(const string& s); + inline string* mutable_sequence(); + inline bool operator==(const edit_t& other) const; + inline bool operator!=(const edit_t& other) const; +private: + int32_t _from_length; + int32_t _to_length; + string _sequence; +}; + +// the mapping_t name is already taken +class path_mapping_t { +public: + path_mapping_t() = default; + path_mapping_t(const path_mapping_t&) = default; + path_mapping_t(path_mapping_t&&) = default; + ~path_mapping_t() = default; + path_mapping_t& operator=(const path_mapping_t&) = default; + path_mapping_t& operator=(path_mapping_t&&) = default; + inline const position_t& position() const; + inline position_t* mutable_position(); + inline const vector& edit() const; + inline const edit_t& edit(size_t i) const; + inline vector* mutable_edit(); + inline edit_t* mutable_edit(size_t i); + inline edit_t* add_edit(); + inline size_t edit_size() const; + inline bool operator==(const path_mapping_t& other) const; + inline bool operator!=(const path_mapping_t& other) const; +private: + position_t _position; + vector _edit; +}; + +class path_t { +public: + path_t() = default; + path_t(const path_t&) = default; + path_t(path_t&&) = default; + ~path_t() = default; + path_t& operator=(const path_t&) = default; + path_t& operator=(path_t&&) = default; + inline const vector& mapping() const; + inline const path_mapping_t& mapping(size_t i) const; + inline vector* mutable_mapping(); + inline path_mapping_t* mutable_mapping(size_t i); + inline path_mapping_t* add_mapping(); + inline void clear_mapping(); + inline size_t mapping_size() const; + inline bool operator==(const path_t& other) const; + inline bool operator!=(const path_t& other) 
const; +private: + vector _mapping; +}; + +void from_proto_edit(const Edit& proto_edit, edit_t& edit); +void to_proto_edit(const edit_t& edit, Edit& proto_edit); +void from_proto_mapping(const Mapping& proto_mapping, path_mapping_t& mapping); +void to_proto_mapping(const path_mapping_t& mapping, Mapping& proto_mapping); +void from_proto_path(const Path& proto_path, path_t& path); +void to_proto_path(const path_t& path, Path& proto_path); + + +// repeated functions for the new path_t class +void translate_node_ids(path_t& path, const unordered_map& translator); +void translate_oriented_node_ids(path_t& path, const unordered_map>& translator); +void translate_oriented_node_ids(path_t& path, const function(id_t)>& translator); + +int mapping_from_length(const path_mapping_t& mapping); +int path_from_length(const path_t& path); +int mapping_to_length(const path_mapping_t& mapping); +int path_to_length(const path_t& path); + +path_mapping_t reverse_complement_mapping(const path_mapping_t& m, + const function& node_length); +path_t reverse_complement_path(const path_t& path, + const function& node_length); +void reverse_complement_mapping_in_place(path_mapping_t* m, + const function& node_length); +void reverse_complement_path_in_place(path_t* path, + const function& node_length); + +// the first position on the path +pos_t initial_position(const path_t& path); +// the last position on the path +pos_t final_position(const path_t& path); + +int corresponding_to_length(const path_t& path, int from_length, bool from_end); +int corresponding_from_length(const path_t& path, int to_length, bool from_end); + +string debug_string(const path_t& path); +string debug_string(const path_mapping_t& mapping); +string debug_string(const edit_t& edit); +/* + * Implementations of inline methods + */ + +/* + * edit_t + */ +inline int32_t edit_t::from_length() const { + return _from_length; +} +inline void edit_t::set_from_length(int32_t l) { + _from_length = l; +} +inline int32_t edit_t::to_length() const { + return _to_length; +} +inline void edit_t::set_to_length(int32_t l) { + _to_length = l; +} +inline const string& edit_t::sequence() const { + return _sequence; +} +inline void edit_t::set_sequence(const string& s) { + _sequence = s; +} +inline string* edit_t::mutable_sequence() { + return &_sequence; +} +inline bool edit_t::operator==(const edit_t& other) const { + return (_to_length == other._to_length + && _from_length == other._from_length + && _sequence == other._sequence); +} +inline bool edit_t::operator!=(const edit_t& other) const { + return !(*this == other); +} + +/* + * path_mapping_t + */ +inline const position_t& path_mapping_t::position() const { + return _position; +} +inline position_t* path_mapping_t::mutable_position() { + return &_position; +} +inline const vector& path_mapping_t::edit() const { + return _edit; +} +inline const edit_t& path_mapping_t::edit(size_t i) const { + return _edit[i]; +} +inline vector* path_mapping_t::mutable_edit() { + return &_edit; +} +inline edit_t* path_mapping_t::add_edit() { + _edit.emplace_back(); + return &_edit.back(); +} +inline edit_t* path_mapping_t::mutable_edit(size_t i) { + return &_edit[i]; +} +inline size_t path_mapping_t::edit_size() const { + return _edit.size(); +} +inline bool path_mapping_t::operator==(const path_mapping_t& other) const { + return (_position == other._position && _edit == other._edit); +} +inline bool path_mapping_t::operator!=(const path_mapping_t& other) const { + return !(*this == other); +} + +/* + * path_t + */ +inline 
const vector& path_t::mapping() const { + return _mapping; +} +inline const path_mapping_t& path_t::mapping(size_t i) const { + return _mapping[i]; +} +inline vector* path_t::mutable_mapping() { + return &_mapping; +} +inline path_mapping_t* path_t::mutable_mapping(size_t i) { + return &_mapping[i]; +} +inline path_mapping_t* path_t::add_mapping() { + _mapping.emplace_back(); + return &_mapping.back(); +} +inline void path_t::clear_mapping() { + _mapping.clear(); +} +inline size_t path_t::mapping_size() const { + return _mapping.size(); +} +inline bool path_t::operator==(const path_t& other) const { + return _mapping == other._mapping; +} +inline bool path_t::operator!=(const path_t& other) const { + return !(*this == other); +} } #endif diff --git a/src/path_component_index.cpp b/src/path_component_index.cpp new file mode 100644 index 00000000000..24bf52e7774 --- /dev/null +++ b/src/path_component_index.cpp @@ -0,0 +1,39 @@ +#include "path_component_index.hpp" + +#include +#include "sdsl/bit_vectors.hpp" +#include "algorithms/component.hpp" + +//#define debug_component_index + +namespace vg { + + PathComponentIndex::PathComponentIndex() { + // Nothing to do + } + + PathComponentIndex::PathComponentIndex(const PathHandleGraph* graph) { + + component_path_sets = algorithms::component_paths(*graph); + + // make it so we can index into this with the path rank directly + component_path_set_of_path.reserve(graph->get_path_count()); + + // index from the paths to their component set + for (size_t i = 0; i < component_path_sets.size(); i++) { + for (const path_handle_t& path : component_path_sets[i]) { + if (component_path_set_of_path.count(path)) { + cerr << "warning:[PathComponentIndex] Graph contains path " << graph->get_path_name(path) << " that spans multiple connected components. This path must follow edges that are not included in the graph. The PathComponentIndex may not be semantically meaningful for this graph." << endl; + continue; + } + component_path_set_of_path[path] = i; + } + } + } + + bool PathComponentIndex::paths_on_same_component(const path_handle_t& path_1, + const path_handle_t& path_2) const { + + return component_path_sets.at(component_path_set_of_path.at(path_1)).count(path_2); + } +} diff --git a/src/path_component_index.hpp b/src/path_component_index.hpp new file mode 100644 index 00000000000..5d267c56156 --- /dev/null +++ b/src/path_component_index.hpp @@ -0,0 +1,50 @@ +#ifndef VG_PATH_COMPONENT_INDEX_HPP_INCLUDED +#define VG_PATH_COMPONENT_INDEX_HPP_INCLUDED + +/** \file + * + * Contains an index that maps embedded paths to the connected components of a graph + */ + +#include +#include +#include + +#include "handle.hpp" + +namespace vg { + + using namespace std; + + /* + * A class that can keep track of which embedded paths are on which + * component of the graph. + */ + class PathComponentIndex { + public: + + /// Constructor + PathComponentIndex(const PathHandleGraph* graph); + + /// Returns true if the paths are on the same connected component of the graph + bool paths_on_same_component(const path_handle_t& path_1, + const path_handle_t& path_2) const; + + + + private: + + /// We make the default constructor private so that it can be used + /// in moves, etc.
but isn't exposed + PathComponentIndex(); + + /// Memoized sets of the paths that co-occur on a connected component + vector> component_path_sets; + + /// An index from a path to the set of paths that occur on the same + /// connected component as it + unordered_map component_path_set_of_path; + }; +} + +#endif diff --git a/src/path_index.cpp b/src/path_index.cpp index 4a754c0e573..8e215a22e4c 100644 --- a/src/path_index.cpp +++ b/src/path_index.cpp @@ -149,8 +149,8 @@ PathIndex::PathIndex(const list& mappings, VG& vg) { } -PathIndex::PathIndex(const Path& path, const xg::XG& index) { - // Trace the given path in the given XG graph, collecting sequence +PathIndex::PathIndex(const Path& path, const HandleGraph& graph) { + // Trace the given path in the given graph, collecting sequence // We're going to build the sequence string std::stringstream seq_stream; @@ -174,7 +174,7 @@ PathIndex::PathIndex(const Path& path, const xg::XG& index) { #pragma omp critical (cerr) std::cerr << "Node " << mapping.position().node_id() << " rank " << mapping.rank() << " starts at base " << path_base << " with " - << index.node_sequence(mapping.position().node_id()) << std::endl; + << graph.get_sequence(graph.get_handle(mapping.position().node_id())) << std::endl; #endif // Make sure ranks are monotonically increasing along the path, or @@ -191,7 +191,7 @@ PathIndex::PathIndex(const Path& path, const xg::XG& index) { node_occurrences[mapping.position().node_id()].push_back(by_start.find(path_base)); // Find the node's sequence - std::string node_sequence = index.node_sequence(mapping.position().node_id()); + std::string node_sequence = graph.get_sequence(graph.get_handle(mapping.position().node_id())); while(path_base == 0 && node_sequence.size() > 0 && (node_sequence[0] != 'A' && node_sequence[0] != 'T' && node_sequence[0] != 'C' && @@ -230,7 +230,7 @@ PathIndex::PathIndex(const Path& path, const xg::XG& index) { // Record the length of the last mapping's node, since there's no next mapping to work it out from last_node_length = path.mapping_size() > 0 ? 
- index.node_length(path.mapping(path.mapping_size() - 1).position().node_id()) : + graph.get_length(graph.get_handle(path.mapping(path.mapping_size() - 1).position().node_id())) : 0; // Create the actual reference sequence we will use @@ -262,15 +262,18 @@ PathIndex::PathIndex(VG& vg, const string& path_name, bool extract_sequence) { } } -PathIndex::PathIndex(const xg::XG& index, const string& path_name, bool extract_sequence) { +PathIndex::PathIndex(const PathHandleGraph& graph, const string& path_name, bool extract_sequence) { // Make sure the path is present - assert(index.path_rank(path_name) != 0); + assert(graph.has_path(path_name)); + // Make a Protobuf path object + auto path = path_from_path_handle(graph, graph.get_path_handle(path_name)); + if (extract_sequence) { // Constructor dispatch hack - *this = PathIndex(index.path(path_name), index); + *this = PathIndex(path, graph); } else { - *this = PathIndex(index.path(path_name)); + *this = PathIndex(path); } } @@ -296,13 +299,42 @@ void PathIndex::update_mapping_positions(VG& vg, const string& path_name) { } } -bool PathIndex::path_contains_node(int64_t node_id){ +bool PathIndex::path_contains_node(int64_t node_id) const { if (by_id.find(node_id) != by_id.end()){ return true; } return false; } +bool PathIndex::path_contains_node_in_orientation(int64_t node_id, bool is_reverse) const { + return find_in_orientation(node_id, is_reverse) != end(); +} + +PathIndex::iterator PathIndex::find_in_orientation(int64_t node_id, bool is_reverse) const { + auto found_occurrences = node_occurrences.find(node_id); + + if (found_occurrences == node_occurrences.end()) { + // There are no occurrences + return end(); + } + + for (auto& occ : found_occurrences->second) { + // Do a linear scan for the correct orientation. + // TODO: index by orientation + if (occ->second.is_end == is_reverse) { + // We found an occurrence in the requested orientation. + return occ; + } + } + + return end(); +} + +pair PathIndex::get_contained_orientations(int64_t node_id) const { + // TODO: Do scans manually to be twice as fast! + return make_pair(path_contains_node_in_orientation(node_id, false), path_contains_node_in_orientation(node_id, true)); +} + NodeSide PathIndex::at_position(size_t position) const { return find_position(position)->second; } diff --git a/src/path_index.hpp b/src/path_index.hpp index db52dfd40fd..349d77fc08c 100644 --- a/src/path_index.hpp +++ b/src/path_index.hpp @@ -14,7 +14,7 @@ #include #include "vg.hpp" -#include "xg.hpp" +#include "handle.hpp" namespace vg { @@ -53,14 +53,14 @@ struct PathIndex { /// pull sequence from the given vg. PathIndex(const list& mappings, VG& vg); - /// Index a path and pull sequence from an XG index. - PathIndex(const Path& path, const xg::XG& vg); + /// Index a path and pull sequence from a graph. + PathIndex(const Path& path, const HandleGraph& graph); /// Make a PathIndex from a path in a graph PathIndex(VG& vg, const string& path_name, bool extract_sequence = false); /// Make a PathIndex from a path in an indexed graph - PathIndex(const xg::XG& index, const string& path_name, bool extract_sequence = false); + PathIndex(const PathHandleGraph& graph, const string& path_name, bool extract_sequence = false); /// Rebuild the mapping positions map by tracing all the paths in the given /// graph. TODO: We ought to move this functionality to the Paths object and @@ -71,12 +71,22 @@ struct PathIndex { /// be greater than the path length. 
NodeSide at_position(size_t position) const; - // Check whether a node is on the reference path. - bool path_contains_node(int64_t node_id); + /// Check whether a node is on the reference path. + bool path_contains_node(int64_t node_id) const ; + + /// Check whether a node is on the reference path in a given path-relative orientation. + bool path_contains_node_in_orientation(int64_t node_id, bool is_reverse) const; + + /// Return two flags for if the path contains the given node in forward and + /// reverse orientation. + pair get_contained_orientations(int64_t node_id) const; /// We keep iterators to node occurrences along the ref path. using iterator = map::const_iterator; + /// Find the first occurrence of the given node in the given orientation + iterator find_in_orientation(int64_t node_id, bool is_reverse) const; + /// Get the iterator to the first node occurrence on the indexed path. iterator begin() const; /// Get the iterator to the last node occurrence on the indexed path. diff --git a/src/path_subgraph.cpp b/src/path_subgraph.cpp new file mode 100644 index 00000000000..f876a7ea961 --- /dev/null +++ b/src/path_subgraph.cpp @@ -0,0 +1,264 @@ +/** + * \file path_subgraph.cpp: contains the implementation of PathSubgraph + */ + + +#include "path_subgraph.hpp" +#include "path.hpp" +#include +#include + +namespace vg { + +using namespace std; + + PathSubgraph::PathSubgraph(const HandleGraph* base, const Path& path) : super(base), defining_path(path) { + // Check our input + assert(defining_path.mapping_size() > 0); + } + + bool PathSubgraph::has_node(id_t node_id) const { + bool result = (node_id > 0 && node_id <= defining_path.mapping_size()); +#ifdef debug + cerr << "Have node " << node_id << ": " << result << endl; +#endif + return result; + } + + vector PathSubgraph::get_topological_order() const { + vector order; + order.reserve(defining_path.mapping_size()); + for (id_t i = 1; i <= defining_path.mapping_size(); i++) { + // Make one handle per node in the path + order.push_back(get_handle(i, false)); + } + +#ifdef debug + cerr << "Path: " << pb2json(defining_path) << endl; + cerr << "Order:"; + for (auto& h : order) { + cerr << " " << get_id(h) << (get_is_reverse(h) ? "-" : "+"); + } + cerr << endl; +#endif + + return order; + } + + handle_t PathSubgraph::get_handle(const id_t& node_id, bool is_reverse) const { + assert(node_id >= 1 && node_id <= defining_path.mapping_size()); + handle_t handle = handlegraph::number_bool_packing::pack(node_id, is_reverse); + assert(get_id(handle) == node_id); + assert(get_is_reverse(handle) == is_reverse); + return handle; + } + + id_t PathSubgraph::get_id(const handle_t& handle) const { + return handlegraph::number_bool_packing::unpack_number(handle); + } + + bool PathSubgraph::get_is_reverse(const handle_t& handle) const { + return handlegraph::number_bool_packing::unpack_bit(handle); + } + + handle_t PathSubgraph::flip(const handle_t& handle) const { + handle_t flipped = handlegraph::number_bool_packing::toggle_bit(handle); + assert(get_is_reverse(flipped) != get_is_reverse(handle)); + assert(get_id(flipped) == get_id(handle)); + return flipped; + } + + size_t PathSubgraph::get_length(const handle_t& handle) const { + size_t index = (size_t)get_id(handle) - 1; + // No need to go back to the backing graph; the path knows lengths. 
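+        // Each subgraph node covers exactly the bases its mapping consumes, so the length is that mapping's from_length.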
+ return mapping_from_length(defining_path.mapping(index)); + } + + string PathSubgraph::get_sequence(const handle_t& handle) const { + // Find the backing node in its local forward orientation + size_t index = (size_t)get_id(handle) - 1; + assert(index >= 0 && index < defining_path.mapping_size()); + auto& pos = defining_path.mapping(index).position(); + handle_t backing_handle = super->get_handle(pos.node_id(), false); + + // Get its sequence in its local forward orientation + string backing_sequence = super->get_sequence(backing_handle); + + // Work out what range of that sequence we want. + size_t wanted_length = get_length(handle); + size_t backing_first = 0; + +#ifdef debug + cerr << "Start selecting " << wanted_length << " bp starting at " << backing_first << endl; +#endif + + // For every offset, even 0 + + // Work out whether we should do it from the + // start or end of the backing sequence in its local forward orientation. + // If the path visits the node forward, we cut from the start. + // Otherwise, we cut from the end. + bool cut_from_start = !pos.is_reverse(); + + // Reposition the window + // accordingly. + size_t budge; + if (cut_from_start) { + // Account for the space at the start of the node consumed by the offset + budge = pos.offset(); + } else { + // Leave only the space at the end of the node consumed by the offset. + // Budge by all the unwanted bases not consumed by the offset. + budge = backing_sequence.size() - wanted_length - pos.offset(); + } + +#ifdef debug + cerr << "Budge by " << budge << endl; +#endif + + backing_first += budge; + +#ifdef debug + cerr << "End selecting " << wanted_length << " bp starting at " << backing_first << endl; +#endif + + // Pull out and reverse complement if necessary + string wanted_sequence = backing_sequence.substr(backing_first, wanted_length); + if (get_is_reverse(handle) != pos.is_reverse()) { + // If we reverse the backing sequence only once, flip it. + wanted_sequence = reverse_complement(wanted_sequence); +#ifdef debug + cerr << "Flip it" << endl; +#endif + } + +#ifdef debug + cerr << "Mapping " << pb2json(defining_path.mapping(index)) << " on sequence " + << backing_sequence << " visited " << (get_is_reverse(handle) ? "rev" : "fwd") + << " produces " << wanted_sequence << endl; +#endif + + // Return it + return wanted_sequence; + } + + bool PathSubgraph::follow_edges_impl(const handle_t& handle, bool go_left, const function& iteratee) const { + // There's only ever 0 or 1 edges + size_t index = (size_t)get_id(handle) - 1; + assert(index >= 0 && index < defining_path.mapping_size()); + bool backward = get_is_reverse(handle); + + if (index == 0 && ((go_left && !backward) || (!go_left && backward))) { + // Hit left edge + return true; + } + + if (index == defining_path.mapping_size() - 1 && ((go_left && backward) || (!go_left && !backward))) { + // Hit right edge + return true; + } + + // Otherwise we can go somewhere + if ((go_left && backward) || (!go_left && !backward)) { + // Going forward in path + index++; + } else { + index--; + } + + assert(index >= 0 && index < defining_path.mapping_size()); + + // Go there and return the bool flag + return iteratee(get_handle(index + 1, backward)); + } + + bool PathSubgraph::for_each_handle_impl(const function& iteratee, bool parallel) const { + // TODO: implement parallel mode. + // Paths should be short so we shouldn't really need it. 
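+        // There is one handle per mapping on the defining path, numbered from 1, so a plain serial loop visits them all.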
+ for (size_t i = 0; i < defining_path.mapping_size(); i++) { + // Try visiting each path visit + if (!iteratee(get_handle(i + 1, false))) { + // Stop early + return false; + } + } + return true; + } + + size_t PathSubgraph::get_node_count() const { + return defining_path.mapping_size(); + } + + id_t PathSubgraph::min_node_id() const { + return 1; + } + + id_t PathSubgraph::max_node_id() const { + return defining_path.mapping_size(); + } + + handle_t PathSubgraph::get_underlying_handle(const handle_t& handle) const { + // Look up the defining Mapping we are visiting + auto& defining_mapping = defining_path.mapping(get_id(handle) - 1); + + // Get the handle corresponding to this mapping in our path. + return super->get_handle(defining_mapping.position().node_id(), defining_mapping.position().is_reverse()); + } + + Path PathSubgraph::translate_down(const Path& path_against_subgraph) const { + Path translated; + + for (auto& subgraph_mapping : path_against_subgraph.mapping()) { + // Translate each mapping + Mapping* translated_mapping = translated.add_mapping(); + + // Look up the defining Mapping we are visiting + auto& defining_mapping = defining_path.mapping(subgraph_mapping.position().node_id() - 1); + + // TODO: simplify out repeated code here once we're sure each case is really correct + if (defining_mapping.position().is_reverse() == false && subgraph_mapping.position().is_reverse() == false) { + // We're in the forward orientation all the way through. + // If there's an offset in the defining mapping, we need to add that to the offset in the subgraph mapping. + // If the defining mapping has a short length, we don't care because we know the subgraph mapping won't be longer. + translated_mapping->mutable_position()->set_node_id(defining_mapping.position().node_id()); + translated_mapping->mutable_position()->set_offset(defining_mapping.position().offset() + subgraph_mapping.position().offset()); + // The result will be forward + } else if (defining_mapping.position().is_reverse() == false && subgraph_mapping.position().is_reverse() == true) { + // We're in the forward orientation against the backing graph but the reverse orientation against the path. + // Any shortness in the path mapping from length needs to be turned into an offset and added to the backing path offset. + // Any offset in it will be ignored. + size_t shortness = mapping_from_length(defining_mapping) - mapping_from_length(subgraph_mapping) - subgraph_mapping.position().offset(); + + translated_mapping->mutable_position()->set_node_id(defining_mapping.position().node_id()); + translated_mapping->mutable_position()->set_offset(defining_mapping.position().offset() + shortness); + // We come out backward + translated_mapping->mutable_position()->set_is_reverse(true); + } else if (defining_mapping.position().is_reverse() == true && subgraph_mapping.position().is_reverse() == false) { + // We're in the reverse orientation against the backing graph, and the mapping to the path agrees with that. + // We need to sum the offsets and ignore shortness + translated_mapping->mutable_position()->set_node_id(defining_mapping.position().node_id()); + translated_mapping->mutable_position()->set_offset(defining_mapping.position().offset() + subgraph_mapping.position().offset()); + // And we will stay reverse + translated_mapping->mutable_position()->set_is_reverse(true); + } else { + // We're in the reverse orientation in the backing graph, but then flip back against that.
+ // We need to add shortness to offset + size_t shortness = mapping_from_length(defining_mapping) - mapping_from_length(subgraph_mapping) - subgraph_mapping.position().offset(); + + translated_mapping->mutable_position()->set_node_id(defining_mapping.position().node_id()); + translated_mapping->mutable_position()->set_offset(defining_mapping.position().offset() + shortness); + // We come out in the forward orientation + translated_mapping->mutable_position()->set_is_reverse(true); + } + + // The edits always stay the same + for (auto& edit : subgraph_mapping.edit()) { + *translated_mapping->add_edit() = edit; + } + } + + return translated; + } + +} + diff --git a/src/path_subgraph.hpp b/src/path_subgraph.hpp new file mode 100644 index 00000000000..24c689b6606 --- /dev/null +++ b/src/path_subgraph.hpp @@ -0,0 +1,108 @@ +#ifndef VG_PATH_SUBGRAPH_HPP_INCLUDED +#define VG_PATH_SUBGRAPH_HPP_INCLUDED + +/** \file + * path_subgraph.hpp: represents a subgraph defined by a path in another graph. + */ + +#include "handle.hpp" +#include +#include +#include + +namespace vg { + +using namespace std; + + /** + * A HandleGraph implementation that represents a subgraph of another HandleGraph, defined by a Path. + * The leading and trailing nodes are cut according to the Path. + * Supports translation of other Paths from this graph into the base graph. + * + * Nodes are numbered 1 to n along the path; multiple visits on the path to the same backing node will become distinct. + */ + class PathSubgraph : public handlegraph::ExpandingOverlayGraph { + public: + + /// Create a PathSubgraph describing the subgraph of the given graph + /// defined by the given path. The path must not be empty. The path + /// can take part of the start and end nodes, but all mappings must be + /// perfect matches, and all adjacent mappings must properly cross a real + /// edge. + PathSubgraph(const HandleGraph* base, const Path& path); + + /// Get a topological order very easily, since the path defines one. + vector get_topological_order() const; + + ////////////////////////// + /// HandleGraph interface + ////////////////////////// + + /// Method to check if a node exists by ID + virtual bool has_node(id_t node_id) const; + + /// Look up the handle for the node with the given ID in the given orientation + virtual handle_t get_handle(const id_t& node_id, bool is_reverse = false) const; + + /// Get the ID from a handle + virtual id_t get_id(const handle_t& handle) const; + + /// Get the orientation of a handle + virtual bool get_is_reverse(const handle_t& handle) const; + + /// Invert the orientation of a handle (potentially without getting its ID) + virtual handle_t flip(const handle_t& handle) const; + + /// Get the length of a node + virtual size_t get_length(const handle_t& handle) const; + + /// Get the sequence of a node, presented in the handle's local forward + /// orientation. + virtual string get_sequence(const handle_t& handle) const; + + protected: + /// Loop over all the handles to next/previous (right/left) nodes. Passes + /// them to a callback which returns false to stop iterating and true to + /// continue. Returns true if we finished and false if we stopped early. + virtual bool follow_edges_impl(const handle_t& handle, bool go_left, const function& iteratee) const; + + /// Loop over all the nodes in the graph in their local forward + /// orientations, in their internal stored order. Stop if the iteratee + /// returns false. 
Can be told to run in parallel, in which case stopping + /// after a false return value is on a best-effort basis and iteration + /// order is not defined. + virtual bool for_each_handle_impl(const function& iteratee, bool parallel = false) const; + + public: + /// Return the number of nodes in the graph + virtual size_t get_node_count() const; + + /// Return the smallest ID in the graph, or some smaller number if the + /// smallest ID is unavailable. Return value is unspecified if the graph is empty. + virtual id_t min_node_id() const; + + /// Return the largest ID in the graph, or some larger number if the + /// largest ID is unavailable. Return value is unspecified if the graph is empty. + virtual id_t max_node_id() const; + + ////////////////////////// + /// ExpandingOverlayGraph interface + ////////////////////////// + + /// Get the handle in the backing graph that the given handle in this graph represents. + virtual handle_t get_underlying_handle(const handle_t& handle) const; + + ////////////////////////// + /// Additional Interface + ////////////////////////// + + /// Translate a Path against us to a Path against the base graph + Path translate_down(const Path& path_against_subgraph) const; + + private: + const HandleGraph* super = nullptr; + Path defining_path; + }; +} + +#endif diff --git a/src/phase_unfolder.cpp b/src/phase_unfolder.cpp index 4525424d935..675b3684c63 100644 --- a/src/phase_unfolder.cpp +++ b/src/phase_unfolder.cpp @@ -1,5 +1,6 @@ #include "phase_unfolder.hpp" #include "progress_bar.hpp" +#include "algorithms/disjoint_components.hpp" #include #include @@ -8,72 +9,84 @@ namespace vg { -PhaseUnfolder::PhaseUnfolder(const xg::XG& xg_index, const gbwt::GBWT& gbwt_index, vg::id_t next_node) : - xg_index(xg_index), gbwt_index(gbwt_index), mapping(next_node) { - assert(this->mapping.begin() > this->xg_index.get_max_id()); +PhaseUnfolder::PhaseUnfolder(const PathHandleGraph& path_graph, const gbwt::GBWT& gbwt_index, vg::id_t next_node) : + path_graph(path_graph), gbwt_index(gbwt_index), mapping(next_node) { + assert(this->mapping.begin() > this->path_graph.max_node_id()); } -void PhaseUnfolder::unfold(VG& graph, bool show_progress) { - std::list components = this->complement_components(graph, show_progress); - +void PhaseUnfolder::unfold(MutableHandleGraph& graph, bool show_progress) { + + std::list components = this->complement_components(graph, show_progress); + size_t haplotype_paths = 0; - VG unfolded; - for (VG& component : components) { + bdsg::HashGraph unfolded; + for (MutableHandleGraph& component : components) { haplotype_paths += this->unfold_component(component, graph, unfolded); } if (show_progress) { std::cerr << "Unfolded graph: " - << unfolded.node_count() << " nodes, " << unfolded.edge_count() << " edges on " + << unfolded.get_node_count() << " nodes, " << unfolded.get_edge_count() << " edges on " << haplotype_paths << " paths" << std::endl; } - - graph.extend(unfolded); + + handlealgs::extend(&unfolded, &graph); } -void PhaseUnfolder::restore_paths(VG& graph, bool show_progress) const { - - for (size_t path_rank = 1; path_rank <= this->xg_index.max_path_rank(); path_rank++) { - const xg::XGPath& path = this->xg_index.get_path(this->xg_index.path_name(path_rank)); - if (path.ids.size() == 0) { - continue; - } - - gbwt::node_type prev = gbwt::Node::encode(path.node(0), path.is_reverse(0)); - for (size_t i = 1; i < path.ids.size(); i++) { - gbwt::node_type curr = gbwt::Node::encode(path.node(i), path.is_reverse(i)); - Edge candidate = make_edge(prev, 
curr); - if (!graph.has_edge(candidate)) { - graph.add_node(this->xg_index.node(candidate.from())); - graph.add_node(this->xg_index.node(candidate.to())); - graph.add_edge(candidate); +void PhaseUnfolder::restore_paths(MutableHandleGraph& graph, bool show_progress) const { + // we include generic to also pick up transcript paths + this->path_graph.for_each_path_matching({PathSense::GENERIC, PathSense::REFERENCE}, {}, {}, + [&](const path_handle_t& path) { + handle_t prev; + bool first = true; + this->path_graph.for_each_step_in_path(path, [&](const step_handle_t& step) { + handle_t handle = this->path_graph.get_handle_of_step(step); + vg::id_t id = this->path_graph.get_id(handle); + handle_t curr; + if (!graph.has_node(id)) { + curr = graph.create_handle(this->path_graph.get_sequence(this->path_graph.forward(handle)), id); + if (this->path_graph.get_is_reverse(handle)) { + curr = graph.flip(curr); + } + } + else { + curr = graph.get_handle(id, this->path_graph.get_is_reverse(handle)); + } + if (first) { + // nothing to do on the first step + first = false; + } else { + edge_t candidate = make_pair(prev, curr); + if (!graph.has_edge(candidate)) { + graph.create_edge(candidate); + } } prev = curr; - } - } + }); + }); if (show_progress) { - std::cerr << "Restored graph: " << graph.node_count() << " nodes, " << graph.edge_count() << " edges" << std::endl; + std::cerr << "Restored graph: " << graph.get_node_count() << " nodes" << std::endl; } } -size_t path_size(const xg::XGPath& path) { - return path.ids.size(); +vg::id_t path_node(const vector>& path, size_t i) { + return path[i].first; } -size_t path_size(const gbwt::vector_type& path) { - return path.size(); +vg::id_t path_node(const gbwt::vector_type& path, size_t i) { + return gbwt::Node::id(path[i]); } -vg::id_t path_node(const xg::XGPath& path, size_t i) { - return path.node(i); +size_t path_size(const vector>& path) { + return path.size(); } -vg::id_t path_node(const gbwt::vector_type& path, size_t i) { - return gbwt::Node::id(path[i]); +size_t path_size(const gbwt::vector_type& path) { + return path.size(); } -bool path_reverse(const xg::XGPath& path, size_t i) { - return path.is_reverse(i); +bool path_reverse(const vector>& path, size_t i) { + return path[i].second; } bool path_reverse(const gbwt::vector_type& path, size_t i) { @@ -92,8 +105,13 @@ struct PathBranch { } }; +std::ostream& operator<<(std::ostream& out, PathBranch branch) { + out << "(" << branch.offset << ", " << branch.curr << ", " << branch.next << ")"; + return out; +} + template -bool verify_path(const PathType& path, VG& unfolded, const hash_map>& reverse_mapping) { +bool verify_path(const PathType& path, MutableHandleGraph& unfolded, const hash_map>& reverse_mapping) { if (path_size(path) < 2) { return true; @@ -138,7 +156,7 @@ bool verify_path(const PathType& path, VG& unfolded, const hash_map duplicates.
hash_map> reverse_mapping; @@ -179,7 +197,7 @@ size_t PhaseUnfolder::verify_paths(VG& unfolded, bool show_progress) const { gcsa::removeDuplicates(mapping.second, false); } - size_t total_paths = this->xg_index.max_path_rank() + this->gbwt_index.sequences(), verified = 0, failures = 0; + size_t total_paths = this->path_graph.get_path_count() + this->gbwt_index.sequences(), verified = 0, failures = 0; std::set failed_threads; ProgressBar* progress = nullptr; size_t progress_step = std::max(total_paths / 100, static_cast(32)); @@ -188,23 +206,34 @@ size_t PhaseUnfolder::verify_paths(VG& unfolded, bool show_progress) const { progress->Progressed(verified); } - #pragma omp parallel for schedule(dynamic, 1) - for (size_t i = 0; i < total_paths; i++) { - bool successful = true; - if (i < this->xg_index.max_path_rank()) { - const xg::XGPath& path = this->xg_index.get_path(this->xg_index.path_name(i + 1)); - successful = verify_path(path, unfolded, reverse_mapping); - } else { - path_type path = this->gbwt_index.extract(i - this->xg_index.max_path_rank()); - successful = verify_path(path, unfolded, reverse_mapping); + this->path_graph.for_each_path_handle([&](const path_handle_t& path_handle) { + vector> path; + this->path_graph.for_each_step_in_path(path_handle, [&](const step_handle_t& step) { + handle_t handle = this->path_graph.get_handle_of_step(step); + path.push_back(make_pair(this->path_graph.get_id(handle), + this->path_graph.get_is_reverse(handle))); + }); + bool successful = verify_path(path, unfolded, reverse_mapping); + if (!successful) { + failures++; + } + verified++; + if (show_progress && (verified % progress_step == 0 || verified >= total_paths)) { + progress->Progressed(verified); } - #pragma omp critical + }); + +#pragma omp parallel for schedule(dynamic, 1) + for (size_t i = 0; i < this->gbwt_index.sequences(); i++) { + + path_type path = this->gbwt_index.extract(i); + bool successful = verify_path(path, unfolded, reverse_mapping); + +#pragma omp critical { if (!successful) { failures++; - if (i >= this->xg_index.max_path_rank()) { - failed_threads.insert(i - this->xg_index.max_path_rank()); - } + failed_threads.insert(i); } verified++; if (show_progress && (verified % progress_step == 0 || verified >= total_paths)) { @@ -247,68 +276,113 @@ void PhaseUnfolder::read_mapping(const std::string& filename) { } this->mapping.load(in); in.close(); - assert(this->mapping.begin() > this->xg_index.get_max_id()); + assert(this->mapping.begin() > this->path_graph.max_node_id()); } vg::id_t PhaseUnfolder::get_mapping(vg::id_t node) const { return this->mapping(node); } -std::list PhaseUnfolder::complement_components(VG& graph, bool show_progress) { - VG complement; +std::list PhaseUnfolder::complement_components(MutableHandleGraph& graph, bool show_progress) { + + bdsg::HashGraph complement; - // Add missing edges supported by XG paths. 
- for (size_t path_rank = 1; path_rank <= this->xg_index.max_path_rank(); path_rank++) { - const xg::XGPath& path = this->xg_index.get_path(this->xg_index.path_name(path_rank)); - if (path.ids.size() == 0) { - continue; + // checks whether the graph contains an edge + auto graph_has_edge = [&](const vg::id_t from_id, const vg::id_t to_id, + const bool from_rev, const bool to_rev) { + if (graph.has_node(from_id) && graph.has_node(to_id)) { + return graph.has_edge(graph.get_handle(from_id, from_rev), graph.get_handle(to_id, to_rev)); } - gbwt::node_type prev = gbwt::Node::encode(path.node(0), path.is_reverse(0)); - for (size_t i = 1; i < path.ids.size(); i++) { - gbwt::node_type curr = gbwt::Node::encode(path.node(i), path.is_reverse(i)); - Edge candidate = make_edge(prev, curr); - if (!graph.has_edge(candidate)) { - complement.add_node(this->xg_index.node(candidate.from())); - complement.add_node(this->xg_index.node(candidate.to())); - complement.add_edge(candidate); - } - prev = curr; + return false; + }; + + // checks whether an edge from the PathHandleGraph is in the graph + auto graph_has_path_graph_edge = [&](const handle_t& from, const handle_t& to) { + return graph_has_edge(path_graph.get_id(from), path_graph.get_id(to), + path_graph.get_is_reverse(from), path_graph.get_is_reverse(to)); + }; + + // checks whether an edge from the GBWT is in the graph + auto graph_has_gbwt_edge = [&](const gbwt::node_type& from, const gbwt::node_type& to) { + return graph_has_edge(gbwt::Node::id(from), gbwt::Node::id(to), + gbwt::Node::is_reverse(from), gbwt::Node::is_reverse(to)); + }; + + // takes a handle to the PathHandleGraph and returns the equivalent handle in + // the complement, making the node if necessary + auto get_or_make_complement_handle = [&](const handle_t& counterpart) { + vg::id_t id = path_graph.get_id(counterpart); + if (!complement.has_node(id)) { + complement.create_handle(path_graph.get_sequence(path_graph.forward(counterpart)), id); } - } + return complement.get_handle(id, path_graph.get_is_reverse(counterpart)); + }; + + // takes an edge in the XG and ensures that it exists in the complement + auto make_complement_edge = [&](const handle_t& from, const handle_t& to) { + handle_t comp_from = get_or_make_complement_handle(from); + handle_t comp_to = get_or_make_complement_handle(to); + if (!complement.has_edge(comp_from, comp_to)) { + complement.create_edge(comp_from, comp_to); + } + }; + + // Add missing edges supported by XG paths. + this->path_graph.for_each_path_handle([&](const path_handle_t& path) { + handle_t prev; + bool first = true; + this->path_graph.for_each_step_in_path(path, [&](const step_handle_t& step) { + handle_t handle = this->path_graph.get_handle_of_step(step); + if (!first) { + if (!graph_has_path_graph_edge(prev, handle)) { + make_complement_edge(prev, handle); + } + } + else { + first = false; + } + prev = handle; + }); + }); - // Add missing edges supported by GBWT threads. + // Add missing edges supported by GBWT threads, but only if the nodes exist + // in the original graph. 
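+    // GBWT threads can mention nodes that the backing graph does not contain at all; edges touching such nodes are skipped rather than invented.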
for (gbwt::comp_type comp = 1; comp < this->gbwt_index.effective(); comp++) { gbwt::node_type gbwt_node = this->gbwt_index.toNode(comp); + if (!this->path_graph.has_node(gbwt::Node::id(gbwt_node))) { + continue; + } + std::vector outgoing = this->gbwt_index.edges(gbwt_node); for (gbwt::edge_type outedge : outgoing) { - if (outedge.first == gbwt::ENDMARKER) { + if (outedge.first == gbwt::ENDMARKER || !this->path_graph.has_node(gbwt::Node::id(outedge.first))) { continue; } - Edge candidate = make_edge(gbwt_node, outedge.first); - if (!graph.has_edge(candidate)) { - complement.add_node(this->xg_index.node(candidate.from())); - complement.add_node(this->xg_index.node(candidate.to())); - complement.add_edge(candidate); + if (!graph_has_gbwt_edge(gbwt_node, outedge.first)) { + make_complement_edge(path_graph.get_handle(gbwt::Node::id(gbwt_node), + gbwt::Node::is_reverse(gbwt_node)), + path_graph.get_handle(gbwt::Node::id(outedge.first), + gbwt::Node::is_reverse(outedge.first))); } } } - std::list components; - complement.disjoint_subgraphs(components); + std::list components = algorithms::disjoint_components(complement); if (show_progress) { std::cerr << "Complement graph: " - << complement.node_count() << " nodes, " << complement.edge_count() << " edges in " + << complement.get_node_count() << " nodes, " << complement.get_edge_count() << " edges in " << components.size() << " components" << std::endl; } return components; } -size_t PhaseUnfolder::unfold_component(VG& component, VG& graph, VG& unfolded) { +size_t PhaseUnfolder::unfold_component(MutableHandleGraph& component, MutableHandleGraph& graph, MutableHandleGraph& unfolded) { // Find the border nodes shared between the component and the graph. - component.for_each_node([&](Node* node) { - if (graph.has_node(node->id())) { - this->border.insert(node->id()); - } + component.for_each_handle([&](const handle_t& handle) { + vg::id_t id = component.get_id(handle); + if (graph.has_node(id)) { + this->border.insert(id); + } }); // Generate the paths starting from each border node. @@ -317,14 +391,16 @@ size_t PhaseUnfolder::unfold_component(VG& component, VG& graph, VG& unfolded) { } // Generate the threads for each node. - component.for_each_node([&](Node* node) { - this->generate_threads(component, node->id()); + component.for_each_handle([&](const handle_t& handle) { + this->generate_threads(component, component.get_id(handle)); }); - + auto insert_node = [&](gbwt::node_type node) { - Node temp = this->xg_index.node(this->get_mapping(gbwt::Node::id(node))); - temp.set_id(gbwt::Node::id(node)); - unfolded.add_node(temp); + // create a new node + if (!unfolded.has_node(gbwt::Node::id(node))) { + handle_t temp = this->path_graph.get_handle(this->get_mapping(gbwt::Node::id(node)));; + unfolded.create_handle(this->path_graph.get_sequence(temp), gbwt::Node::id(node)); + } }; // Create the unfolded component from the tries. 
@@ -335,7 +411,7 @@ size_t PhaseUnfolder::unfold_component(VG& component, VG& graph, VG& unfolded) { } insert_node(to); if (from != gbwt::ENDMARKER) { - unfolded.add_edge(make_edge(from, to)); + unfolded.create_edge(make_edge(unfolded, from, to)); } } for (auto mapping : this->suffixes) { @@ -343,13 +419,13 @@ size_t PhaseUnfolder::unfold_component(VG& component, VG& graph, VG& unfolded) { insert_node(from); if (to != gbwt::ENDMARKER) { insert_node(to); - unfolded.add_edge(make_edge(from, to)); + unfolded.create_edge(make_edge(unfolded, from, to)); } } for (auto edge : this->crossing_edges) { insert_node(edge.first); insert_node(edge.second); - unfolded.add_edge(make_edge(edge.first, edge.second)); + unfolded.create_edge(make_edge(unfolded, edge.first, edge.second)); } size_t haplotype_paths = this->crossing_edges.size(); @@ -361,59 +437,80 @@ size_t PhaseUnfolder::unfold_component(VG& component, VG& graph, VG& unfolded) { return haplotype_paths; } -void PhaseUnfolder::generate_paths(VG& component, vg::id_t from) { - - for (size_t path_rank = 1; path_rank <= this->xg_index.max_path_rank(); path_rank++) { - const xg::XGPath& path = this->xg_index.get_path(this->xg_index.path_name(path_rank)); - - std::vector occurrences = this->xg_index.node_ranks_in_path(from, path_rank); - for (size_t occurrence : occurrences) { - // Forward. - { - gbwt::node_type prev = gbwt::Node::encode(path.node(occurrence), path.is_reverse(occurrence)); - path_type buffer(1, prev); - for (size_t i = occurrence + 1; i < path.ids.size(); i++) { - gbwt::node_type curr = gbwt::Node::encode(path.node(i), path.is_reverse(i)); - Edge candidate = make_edge(prev, curr); - if (!component.has_edge(candidate)) { - break; // Found a maximal path. - } - buffer.push_back(curr); - if (this->border.find(gbwt::Node::id(curr)) != this->border.end()) { - break; // Found a border-to-border path. - } - prev = curr; +void PhaseUnfolder::generate_paths(MutableHandleGraph& component, vg::id_t from) { + + handle_t from_handle = this->path_graph.get_handle(from); + this->path_graph.for_each_step_on_handle(from_handle, [&](const step_handle_t& _step) { + // Forward. + { + step_handle_t step = _step; + handle_t handle = this->path_graph.get_handle_of_step(step); + vg::id_t id = this->path_graph.get_id(handle); + bool is_rev = this->path_graph.get_is_reverse(handle); + gbwt::node_type prev = gbwt::Node::encode(id, is_rev); + path_type buffer(1, prev); + while (this->path_graph.has_next_step(step)) { + step = this->path_graph.get_next_step(step); + handle = this->path_graph.get_handle_of_step(step); + id = this->path_graph.get_id(handle); + is_rev = this->path_graph.get_is_reverse(handle); + if (!component.has_node(id)) { + break; // Found a maximal path, no matching node. + } + gbwt::node_type curr = gbwt::Node::encode(id, is_rev); + edge_t candidate = make_edge(component, prev, curr); + if (!component.has_edge(candidate)) { + break; // Found a maximal path, no matching edge. } - bool to_border = (this->border.find(gbwt::Node::id(buffer.back())) != this->border.end()); - this->reference_paths.push_back(buffer); - this->insert_path(buffer, true, to_border); + buffer.push_back(curr); + if (this->border.find(gbwt::Node::id(curr)) != this->border.end()) { + break; // Found a border-to-border path. + } + prev = curr; } + + bool to_border = (this->border.find(gbwt::Node::id(buffer.back())) != this->border.end()); + this->reference_paths.push_back(buffer); + this->insert_path(buffer, true, to_border); + } - // Backward. 
- { - gbwt::node_type prev = gbwt::Node::encode(path.node(occurrence), !path.is_reverse(occurrence)); - path_type buffer(1, prev); - for (size_t i = occurrence; i > 0 ; i--) { - gbwt::node_type curr = gbwt::Node::encode(path.node(i - 1), !path.is_reverse(i - 1)); - Edge candidate = make_edge(prev, curr); - if (!component.has_edge(candidate)) { - break; // Found a maximal path. - } - buffer.push_back(curr); - if (this->border.find(gbwt::Node::id(curr)) != this->border.end()) { - break; // Found a border-to-border path. - } - prev = curr; + // Backward. + { + step_handle_t step = _step; + handle_t handle = this->path_graph.get_handle_of_step(step); + vg::id_t id = this->path_graph.get_id(handle); + bool is_rev = this->path_graph.get_is_reverse(handle); + gbwt::node_type prev = gbwt::Node::encode(id, !is_rev); + path_type buffer(1, prev); + while (this->path_graph.has_previous_step(step)) { + step = this->path_graph.get_previous_step(step); + handle = this->path_graph.get_handle_of_step(step); + id = this->path_graph.get_id(handle); + is_rev = this->path_graph.get_is_reverse(handle); + if (!component.has_node(id)) { + break; // Found a maximal path, no matching node. + } + gbwt::node_type curr = gbwt::Node::encode(id, !is_rev); + edge_t candidate = make_edge(component, prev, curr); + if (!component.has_edge(candidate)) { + break; // Found a maximal path, no matching edge. } - bool to_border = (this->border.find(gbwt::Node::id(buffer.back())) != this->border.end()); - this->reference_paths.push_back(buffer); - this->insert_path(buffer, true, to_border); + buffer.push_back(curr); + if (this->border.find(gbwt::Node::id(curr)) != this->border.end()) { + break; // Found a border-to-border path. + } + prev = curr; } + + bool to_border = (this->border.find(gbwt::Node::id(buffer.back())) != this->border.end()); + this->reference_paths.push_back(buffer); + this->insert_path(buffer, true, to_border); } - } + + }); } -void PhaseUnfolder::generate_threads(VG& component, vg::id_t from) { +void PhaseUnfolder::generate_threads(MutableHandleGraph& component, vg::id_t from) { bool is_internal = (this->border.find(from) == this->border.end()); this->create_state(from, false, is_internal); @@ -431,17 +528,14 @@ void PhaseUnfolder::generate_threads(VG& component, vg::id_t from) { continue; // The path reached a border. } - std::vector edges = component.edges_of(component.get_node(node)); bool was_extended = false; - for (Edge* edge : edges) { - if (edge->from() == node && edge->from_start() == is_reverse) { - was_extended |= this->extend_state(state, edge->to(), edge->to_end()); - } - else if (edge->to() == node && edge->to_end() != is_reverse) { - was_extended |= this->extend_state(state, edge->from(), !edge->from_start()); - } - } - + handle_t from = component.get_handle(node, is_reverse); + component.follow_edges(from, false, [&](const handle_t& handle) { + was_extended |= this->extend_state(state, component.get_id(handle), component.get_is_reverse(handle)); + }); + component.follow_edges(from, true, [&](const handle_t& handle) { + was_extended |= this->extend_state(state, component.get_id(handle), !component.get_is_reverse(handle)); + }); if (!was_extended) { this->extend_path(state.second); // Maximal path. 
} @@ -480,7 +574,6 @@ PhaseUnfolder::path_type canonical_orientation(const PhaseUnfolder::path_type& p } void PhaseUnfolder::extend_path(const path_type& path) { - if (path.size() < 2) { return; } @@ -503,8 +596,8 @@ void PhaseUnfolder::extend_path(const path_type& path) { const path_type& reference = this->reference_paths[ref]; bool found = false; for (size_t i = 0; i < reference.size(); i++) { - Edge candidate = make_edge(reference[i], to_extend.front()); - if (this->xg_index.has_edge(candidate)) { + edge_t candidate = make_edge(path_graph, reference[i], to_extend.front()); + if (this->path_graph.has_edge(candidate.first, candidate.second)) { to_extend.insert(to_extend.begin(), reference.begin(), reference.begin() + i + 1); from_border = true; found = true; @@ -523,8 +616,8 @@ void PhaseUnfolder::extend_path(const path_type& path) { const path_type& reference = this->reference_paths[ref]; bool found = false; for (size_t i = 0; i < reference.size(); i++) { - Edge candidate = make_edge(to_extend.back(), reference[i]); - if (this->xg_index.has_edge(candidate)) { + edge_t candidate = make_edge(path_graph, to_extend.back(), reference[i]); + if (this->path_graph.has_edge(candidate.first, candidate.second)) { to_extend.insert(to_extend.end(), reference.begin() + i, reference.end()); to_border = true; found = true; @@ -553,7 +646,7 @@ void PhaseUnfolder::insert_path(const path_type& path, bool from_border, bool to the node and insert the mapping into the corresponding trie. Finally create a crossing edge between the full prefix and the full suffix. */ - + // Prefixes. gbwt::node_type from = to_insert.front(); if (!from_border) { diff --git a/src/phase_unfolder.hpp b/src/phase_unfolder.hpp index 667baf335b7..6155a6f8ac4 100644 --- a/src/phase_unfolder.hpp +++ b/src/phase_unfolder.hpp @@ -7,8 +7,9 @@ */ #include "vg.hpp" -#include "xg.hpp" +#include "handle.hpp" #include "hash_map.hpp" +#include "gbwt_helper.hpp" #include #include @@ -16,8 +17,8 @@ #include #include -#include #include +#include namespace vg { @@ -41,7 +42,7 @@ class PhaseUnfolder { * These indexes must represent the same original graph. 'next_node' should * usually be max_node_id() + 1 in the original graph. */ - PhaseUnfolder(const xg::XG& xg_index, const gbwt::GBWT& gbwt_index, vg::id_t next_node); + PhaseUnfolder(const PathHandleGraph& path_graph, const gbwt::GBWT& gbwt_index, vg::id_t next_node); /** * Unfold the pruned regions in the input graph: @@ -56,21 +57,21 @@ class PhaseUnfolder { * * - Extend the input graph with the unfolded components. */ - void unfold(VG& graph, bool show_progress = false); + void unfold(MutableHandleGraph& graph, bool show_progress = false); /** * Restore the edges on XG paths. This is effectively the same as * unfolding with an empty GBWT index, except that the inserted nodes will * have their original identifiers. */ - void restore_paths(VG& graph, bool show_progress = false) const; + void restore_paths(MutableHandleGraph& graph, bool show_progress = false) const; /** * Verify that the graph contains the XG paths and the GBWT threads in the * backing indexes. Returns the number of paths for which the verification * failed. Uses OMP threads. */ - size_t verify_paths(VG& unfolded, bool show_progress = false) const; + size_t verify_paths(MutableHandleGraph& unfolded, bool show_progress = false) const; /** * Write the mapping to the specified file with a header. The file will @@ -94,9 +95,9 @@ class PhaseUnfolder { /** * Create an edge between two node orientations. 
*/ - static Edge make_edge(gbwt::node_type from, gbwt::node_type to) { - return xg::make_edge(gbwt::Node::id(from), gbwt::Node::is_reverse(from), - gbwt::Node::id(to), gbwt::Node::is_reverse(to)); + static edge_t make_edge(const HandleGraph& graph, gbwt::node_type from, gbwt::node_type to) { + return make_pair(graph.get_handle(gbwt::Node::id(from), gbwt::Node::is_reverse(from)), + graph.get_handle(gbwt::Node::id(to), gbwt::Node::is_reverse(to))); } private: @@ -105,14 +106,14 @@ class PhaseUnfolder { * GBWT index but not in the input graph. Split the complement into * disjoint components and return the components. */ - std::list complement_components(VG& graph, bool show_progress); + std::list complement_components(MutableHandleGraph& graph, bool show_progress); /** * Generate all border-to-border paths in the component supported by the * indexes. Unfold the paths by duplicating the inner nodes so that the * paths become disjoint, except for their shared prefixes/suffixes. */ - size_t unfold_component(VG& component, VG& graph, VG& unfolded); + size_t unfold_component(MutableHandleGraph& component, MutableHandleGraph& graph, MutableHandleGraph& unfolded); /** * Generate all paths supported by the XG index passing through the given @@ -120,7 +121,7 @@ class PhaseUnfolder { * paths into the set in the canonical orientation, and use them as * reference paths for extending threads. */ - void generate_paths(VG& component, vg::id_t from); + void generate_paths(MutableHandleGraph& component, vg::id_t from); /** * Generate all paths supported by the GBWT index from the given node until @@ -129,7 +130,7 @@ class PhaseUnfolder { * passing through it. Otherwise consider only the threads starting from * it, and do not output threads reaching a border. */ - void generate_threads(VG& component, vg::id_t from); + void generate_threads(MutableHandleGraph& component, vg::id_t from); /** * Create or extend the state with the given node orientation, and insert @@ -157,7 +158,7 @@ class PhaseUnfolder { gbwt::node_type get_suffix(gbwt::node_type node, gbwt::node_type to); /// XG and GBWT indexes for the original graph. - const xg::XG& xg_index; + const PathHandleGraph& path_graph; const gbwt::GBWT& gbwt_index; /// Mapping from duplicated nodes to original ids. 
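For illustration, a minimal usage sketch (not part of the patch) of the handle-based make_edge shown above: a gbwt::node_type packs a node id and an orientation, each endpoint is decoded into an oriented handle, and the resulting edge_t pair can be queried with HandleGraph::has_edge, as extend_path does earlier in this patch. The function name gbwt_edge_exists is hypothetical; get_handle, has_edge, and the gbwt::Node helpers are the calls already used in the patched code:

// Decode two GBWT node encodings into handles and test whether the graph has that edge.
bool gbwt_edge_exists(const HandleGraph& graph, gbwt::node_type from, gbwt::node_type to) {
    handle_t h_from = graph.get_handle(gbwt::Node::id(from), gbwt::Node::is_reverse(from));
    handle_t h_to   = graph.get_handle(gbwt::Node::id(to),   gbwt::Node::is_reverse(to));
    // An edge_t is simply a pair of oriented handles.
    edge_t candidate = std::make_pair(h_from, h_to);
    return graph.has_edge(candidate.first, candidate.second);
}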
diff --git a/src/phased_genome.cpp b/src/phased_genome.cpp index aeb921f06fd..49cc5341b2e 100644 --- a/src/phased_genome.cpp +++ b/src/phased_genome.cpp @@ -4,6 +4,8 @@ #include "phased_genome.hpp" +//#define debug_phased_genome + using namespace std; namespace vg { @@ -33,7 +35,7 @@ namespace vg { } } - PhasedGenome::PhasedGenome(SnarlManager& snarl_manager) : snarl_manager(snarl_manager) { + PhasedGenome::PhasedGenome(const SnarlManager& snarl_manager) : snarl_manager(&snarl_manager) { // nothing to do } @@ -45,13 +47,42 @@ namespace vg { } + PhasedGenome::PhasedGenome(PhasedGenome& rhs){ + + *this = rhs; + } + PhasedGenome& PhasedGenome::operator = (PhasedGenome& phased_genome){ + + snarl_manager = phased_genome.snarl_manager; + + for (Haplotype* haplotype : haplotypes) { + delete haplotype; + } + node_locations.clear(); + site_starts.clear(); + site_ends.clear(); + haplotypes.clear(); + + for(int i = 0; i < phased_genome.haplotypes.size(); i++ ){ + // build haplotypes + Haplotype* new_haplo = new Haplotype(phased_genome.begin(i), phased_genome.end(i)); + + haplotypes.push_back(new_haplo); + } + + // build indices on the new object + build_indices(); + + return phased_genome; + } + void PhasedGenome::build_indices() { #ifdef debug_phased_genome cerr << "[PhasedGenome::build_indices]: building node id to site index" << endl; #endif // construct the start and end of site indices - for (const Snarl* snarl : snarl_manager.top_level_snarls()) { + for (const Snarl* snarl : snarl_manager->top_level_snarls()) { build_site_indices_internal(snarl); } @@ -86,7 +117,7 @@ namespace vg { // are we leaving or entering the site? if (site_start_sides.count(site)) { #ifdef debug_phased_genome - cerr << "[PhasedGenome::build_indices]: leaving at start of site " << site->start.node->id() << "->" << site->end.node->id() << endl; + cerr << "[PhasedGenome::build_indices]: leaving at start of site " << site->start().node_id() << "->" << site->end().node_id() << endl; #endif // leaving: put the site in the index in the orientation of haplotype travesal HaplotypeNode* other_side_node = site_start_sides[site]; @@ -95,7 +126,7 @@ namespace vg { } else { #ifdef debug_phased_genome - cerr << "[PhasedGenome::build_indices]: entering at start of site " << site->start.node->id() << "->" << site->end.node->id() << endl; + cerr << "[PhasedGenome::build_indices]: entering at start of site " << site->start().node_id() << "->" << site->end().node_id() << endl; #endif // entering: mark the node in the haplotype path where we entered site_end_sides[site] = haplo_node; @@ -107,7 +138,7 @@ namespace vg { // are we leaving or entering the site? 
if (site_end_sides.count(site)) { #ifdef debug_phased_genome - cerr << "[PhasedGenome::build_indices]: leaving at end of site " << site->start.node->id() << "->" << site->end.node->id() << endl; + cerr << "[PhasedGenome::build_indices]: leaving at end of site " << site->start().node_id() << "->" << site->end().node_id() << endl; #endif // leaving: put the site in the index in the orientation of haplotype travesal HaplotypeNode* other_side_node = site_end_sides[site]; @@ -116,7 +147,7 @@ namespace vg { } else { #ifdef debug_phased_genome - cerr << "[PhasedGenome::build_indices]: entering at end of site " << site->start.node->id() << "->" << site->end.node->id() << endl; + cerr << "[PhasedGenome::build_indices]: entering at end of site " << site->start().node_id() << "->" << site->end().node_id() << endl; #endif // entering: mark the node in the haplotype path where we entered site_start_sides[site] = haplo_node; @@ -140,7 +171,7 @@ namespace vg { site_ends[snarl->end().node_id()] = snarl; // recurse through child sites - for (const Snarl* subsnarl : snarl_manager.children_of(snarl)) { + for (const Snarl* subsnarl : snarl_manager->children_of(snarl)) { build_site_indices_internal(subsnarl); } } @@ -182,7 +213,7 @@ namespace vg { } // update index for child sites - for (const Snarl* child_site : snarl_manager.children_of(&site)) { + for (const Snarl* child_site : snarl_manager->children_of(&site)) { swap_label(*child_site, haplotype_1, haplotype_2); } } @@ -198,7 +229,48 @@ namespace vg { PhasedGenome::iterator PhasedGenome::end(int which_haplotype) { return iterator(0, which_haplotype, nullptr); } - + + vector PhasedGenome::get_haplotypes_with_snarl(const Snarl* snarl_to_find){ + + // a vector that will hold the haplotype IDs of haplotypes found to traverse through the snarl + vector matched_haplotype_ids; + + // interate through the vector of haplotype pointers and do a lookup for snarl_to_find + // if found then we add it to the list of matched haplotypes + unordered_map >::iterator it; + id_t id = 0; + for (Haplotype* haplotype : haplotypes){ + bool found = haplotype->sites.count(snarl_to_find); + + if(found){ + // add the ID to the haplotype to the vector + matched_haplotype_ids.push_back(id); + } + id++; + } + + return matched_haplotype_ids; + + } + + void PhasedGenome::print_phased_genome(){ + // output number of haplotypes contained in phased genome + size_t haplo_num = num_haplotypes(); + //cerr << "The haplotype num is: " << haplo_num << endl; + + // iterate through the genome and all its haplotypes + for(int i = 0; i < haplo_num; i++){ + cerr << "Haplotype ID: " << i <id() << ": " <<(*iter).node->sequence() < PhasedGenome::get_allele(const Snarl& site, int which_haplotype) { Haplotype& haplotype = *haplotypes[which_haplotype]; @@ -277,12 +349,230 @@ namespace vg { #endif // update index for child sites - for (const Snarl* child_site : snarl_manager.children_of(&site)) { + for (const Snarl* child_site : snarl_manager->children_of(&site)) { swap_label(*child_site, haplo_1, haplo_2); } } + + double PhasedGenome::read_log_likelihood(const multipath_alignment_t& multipath_aln, double log_base) { +#ifdef debug_phased_genome + cerr << "[PhasedGenome::read_log_likelihood] computing read likelihood with log base " << log_base << endl; + cerr << "read:" << endl; + cerr << debug_string(multipath_aln) << endl; + +#endif + + if (multipath_aln.mapping_quality() == 0) { + // this is the answer we'll produce anyway and handling it as an edge case + // avoids numerical problems at the end + return 
0.0; + } + + // an accumulator that we will sum the log-likelihood of each alignment into + double log_likelihood = numeric_limits::lowest(); + + // iteration functions to facilitate iterating on forward/reverse strands + auto move_right = [](HaplotypeNode*& path_node) { path_node = path_node->next; }; + auto move_left = [](HaplotypeNode*& path_node) { path_node = path_node->prev; }; + + /* + * STEP 1: find which subpaths are represented on the current haplotypes and which + * subpaths are adjacent to each order (expressed as "links") + */ + + // records of: (subpath coming from, which match of that subpath, node the next should be on, should it be matching the orientation?) + vector>> possible_forward_links(multipath_aln.subpath_size()); + + // for each subpath + // for each copy of it in the phased genome + // the (subpath index, which copy)'s that this copy links back to + vector>>> backward_links(multipath_aln.subpath_size()); + + for (size_t i = 0; i < multipath_aln.subpath_size(); ++i) { +#ifdef debug_phased_genome + cerr << "[PhasedGenome::read_log_likelihood] looking for matches of subpath " << i << endl; + +#endif + + const subpath_t& subpath = multipath_aln.subpath(i); + const path_t& path = subpath.path(); + + // check all of the locations of this subpath among the haplotypes + for (HaplotypeNode* starting_haplo_node : node_locations[path.mapping(0).position().node_id()]) { + + // are we traversing forward or backward along the haplotype? + bool matches_orientation = (starting_haplo_node->node_traversal.backward == path.mapping(0).position().is_reverse()); + auto move_forward = matches_orientation ? move_right : move_left; + +#ifdef debug_phased_genome + cerr << "[PhasedGenome::read_log_likelihood] found starting haplo node that has " << (matches_orientation ? "matching" : "reverse") << " orientation at " << starting_haplo_node << " with trav " << starting_haplo_node->node_traversal << endl; +#endif + + // determine whether the rest of the subpath matches + bool full_match = true; + HaplotypeNode* haplo_node = starting_haplo_node; + for (size_t j = 1; j < path.mapping_size(); ++j) { + + // advance to the haplo node that we would find next + move_forward(haplo_node); + +#ifdef debug_phased_genome + cerr << "[PhasedGenome::read_log_likelihood] moving forward to haplo node at " << haplo_node << " with trav " << haplo_node->node_traversal << endl; +#endif + + const position_t& pos = path.mapping(j).position(); + if (haplo_node->node_traversal.node->id() != pos.node_id() || + (haplo_node->node_traversal.backward == pos.is_reverse()) != matches_orientation) { + // the subpath doesn't match the path of the haplotype here + full_match = false; + break; + } + } + + if (!full_match) { + // scores aren't necessarily dynamic programmable except on full matches, so + // we'll leave this alone + // TODO: allow for alignments to partial subpaths? 
+ continue; + } + +#ifdef debug_phased_genome + cerr << "[PhasedGenome::read_log_likelihood] found a full match" << endl; +#endif + + const path_mapping_t& final_mapping = path.mapping(path.mapping_size() - 1); + size_t final_offset = mapping_from_length(final_mapping) + final_mapping.position().offset(); + if (final_offset == haplo_node->node_traversal.node->sequence().size()) { + // the last mapping hits the end of its node, so we expect to find + // the next subpath on the following node + move_forward(haplo_node); +#ifdef debug_phased_genome + cerr << "[PhasedGenome::read_log_likelihood] any subsequent match is expected to be on the next haplo node at " << haplo_node << endl; +#endif + } + + // we found a full match of this subpath on the phased genome, add a match + size_t match_num = backward_links[i].size(); + backward_links[i].emplace_back(); + vector>& links = backward_links[i].back(); + + // let's check if we could have extended along any of the forward links we + // previously identified + for (auto& possible_link : possible_forward_links[i]) { + // TODO: make this a multimap from (node,orientation) instead? + if (get<2>(possible_link) == starting_haplo_node && + get<3>(possible_link) == matches_orientation) { + // we started on haplo node and orientation that we would have expected + // if we followed this forward link + links.emplace_back(get<0>(possible_link), get<1>(possible_link)); +#ifdef debug_phased_genome + cerr << "[PhasedGenome::read_log_likelihood] confirmed a link from subpath " << get<0>(possible_link) << ", match number " << get<1>(possible_link) << endl; +#endif + } + } + + // add a candidate forward link from here to each subsequent subpath + for (size_t j : subpath.next()) { +#ifdef debug_phased_genome + cerr << "[PhasedGenome::read_log_likelihood] adding a possible link to " << j << endl; +#endif + possible_forward_links[j].emplace_back(i, match_num, haplo_node, matches_orientation); + } + } + } + + /* + * STEP 2: follow the links we discovered in step 1 to compute the + * scores of the alignments consistent with the current haplotypes + * and add them to the log likelihood + */ + + // to keep track of which partial alignments we've already accounted for + vector> traversed(multipath_aln.subpath_size()); + for (size_t i = 0; i < traversed.size(); ++i) { + traversed[i].resize(backward_links[i].size(), false); + } + + // iterate backwards over the backward facing links for each subpath + for (int64_t i = backward_links.size() - 1; i >= 0; --i) { + auto& links = backward_links[i]; + // iterate over the links for each match we found for this subpath + for (size_t j = 0; j < links.size(); ++j) { + if (traversed[i][j]) { + // we already scored the alignments corresponding to these + // links + continue; + } +#ifdef debug_phased_genome + cerr << "[PhasedGenome::read_log_likelihood] checking backward links from " << i << ", match num " << j << endl; +#endif + + // use DFS to generate all longest-possible alignments along + // the backward links + + // records of (subpath idx, copy of subpath, index of next link to take) + vector> stack; + stack.emplace_back(i, j, 0); + while (!stack.empty()) { + auto& record = stack.back(); +#ifdef debug_phased_genome + cerr << "[PhasedGenome::read_log_likelihood] unstacking " << get<0>(record) << ", " << get<1>(record) << ", " << get<2>(record) << endl; +#endif + + auto& links = backward_links[get<0>(record)][get<1>(record)]; + if (get<2>(record) < links.size()) { + auto next = links[get<2>(record)++]; + 
stack.emplace_back(next.first, next.second, 0); +#ifdef debug_phased_genome + cerr << "[PhasedGenome::read_log_likelihood] following link to " << next.first << ", " << next.second << endl; +#endif + } + else { + // we've finished traversing this + traversed[get<0>(record)][get<1>(record)] = true; +#ifdef debug_phased_genome + cerr << "[PhasedGenome::read_log_likelihood] finished traversing " << get<0>(record) << ", " << get<1>(record) << endl; +#endif + if (links.empty()) { + // this is the final subpath in the alignment, so the stack now + // represents a valid alignment. we can compute the highest scoring + // segment of the alignment with dynamic programming +#ifdef debug_phased_genome + cerr << "[PhasedGenome::read_log_likelihood] completed an alignment, current alignment is " << endl; + for (auto& stack_record : stack) { + cerr << "\t" << get<0>(stack_record) << ", " << get<1>(stack_record) << ", " << get<2>(stack_record) << endl; + } +#endif + + int32_t current_score = multipath_aln.subpath(get<0>(stack[0])).score(); + int32_t max_score = current_score; + for (size_t k = 1; k < stack.size(); ++k) { + int32_t subpath_score = multipath_aln.subpath(get<0>(stack[k])).score(); + current_score = max(current_score + subpath_score, subpath_score); + max_score = max(max_score, current_score); + } + + // TODO: do i need to check if the same alignment sub sequence gets + // double-counted here? + log_likelihood = add_log(log_likelihood, max_score * log_base); +#ifdef debug_phased_genome + cerr << "[PhasedGenome::read_log_likelihood] added max score " << max_score << " to update likelihood to " << log_likelihood << endl; +#endif + } + stack.pop_back(); + } + } + } + } + + // adjust by the mapping quality (likelihood = (1-err)*l + err) + double mapping_err_log_prob = phred_to_logprob(multipath_aln.mapping_quality()); + return add_log(subtract_log(0.0, mapping_err_log_prob) + log_likelihood, + mapping_err_log_prob); + } - int32_t PhasedGenome::optimal_score_on_genome(const MultipathAlignment& multipath_aln, VG& graph) { + int32_t PhasedGenome::optimal_score_on_genome(const multipath_alignment_t& multipath_aln, VG& graph) { + // must have identified start subpaths before computing optimal score assert(multipath_aln.start_size() > 0); @@ -297,8 +587,8 @@ namespace vg { unordered_map< pair, vector> candidate_start_positions; for (int i = 0; i < multipath_aln.start_size(); i++) { // a starting subpath in the multipath alignment - const Subpath& start_subpath = multipath_aln.subpath(multipath_aln.start(i)); - const Position& start_pos = start_subpath.path().mapping(0).position(); + const subpath_t& start_subpath = multipath_aln.subpath(multipath_aln.start(i)); + const position_t& start_pos = start_subpath.path().mapping(0).position(); #ifdef debug_phased_genome cerr << "[PhasedGenome::optimal_score_on_genome]: looking for candidate start positions for subpath " << multipath_aln.start(i) << " on node " << start_pos.node_id() << endl; @@ -356,20 +646,27 @@ namespace vg { cerr << "[PhasedGenome::optimal_score_on_genome]: checking subpath " << i << " for consistent paths" << endl; #endif - const Subpath& subpath = multipath_aln.subpath(i); + const subpath_t& subpath = multipath_aln.subpath(i); // iterate through mappings in this subpath (assumes one mapping per node) bool subpath_follows_path = true; - for (int j = 0; j < subpath.path().mapping_size(); j++, move_forward(subpath_node)) { + for (int j = 0; j < subpath.path().mapping_size(); j++) { // check if mapping corresponds to the next node in the path 
in the correct orientation - const Position& position = subpath.path().mapping(j).position(); + const path_mapping_t& mapping = subpath.path().mapping(j); + const position_t& position = mapping.position(); if (position.node_id() != subpath_node->node_traversal.node->id() || ((position.is_reverse() == subpath_node->node_traversal.backward) != oriented_forward)) { subpath_follows_path = false; - break; + #ifdef debug_phased_genome cerr << "[PhasedGenome::optimal_score_on_genome]: subpath " << i << " is inconsistent with haplotype" << endl; -#endif + +#endif + break; + } + + if (position.offset() + mapping_from_length(mapping) == subpath_node->node_traversal.node->sequence().size()) { + move_forward(subpath_node); } } @@ -377,7 +674,7 @@ namespace vg { if (subpath_follows_path) { #ifdef debug_phased_genome cerr << "[PhasedGenome::optimal_score_on_genome]: subpath " << i << " is consistent with haplotype" << endl; -#endif +#endif int32_t extended_prefix_score = subpath_prefix_score[i] + subpath.score(); if (subpath.next_size() == 0) { #ifdef debug_phased_genome @@ -393,14 +690,6 @@ namespace vg { #ifdef debug_phased_genome cerr << "[PhasedGenome::optimal_score_on_genome]: non sink path, extending score of " << extended_prefix_score << endl; #endif - // edge case: check if subpath_node was improperly incremented from a mapping that ended in the - // middle of a node - Position end_pos = last_path_position(subpath.path()); - if (end_pos.offset() != graph.get_node(end_pos.node_id())->sequence().length()) { - move_backward(subpath_node); - } - // TODO: this could be a problem if the next node is the end of a chromosome (will seg fault - // because can't get the previous node from nullptr) // mark which node the next subpath starts at for (int j = 0; j < subpath.next_size(); j++) { diff --git a/src/phased_genome.hpp b/src/phased_genome.hpp index 85a768644c2..a8300efa1c9 100644 --- a/src/phased_genome.hpp +++ b/src/phased_genome.hpp @@ -9,18 +9,20 @@ #ifndef phased_genome_hpp #define phased_genome_hpp + #include #include #include #include -#include "vg.pb.h" +#include #include "vg.hpp" #include "nodetraversal.hpp" #include "genotypekit.hpp" #include "hash_map.hpp" #include "snarls.hpp" +#include "multipath_alignment.hpp" +#include "statistics.hpp" -//#define debug_phased_genome using namespace std; @@ -49,8 +51,17 @@ namespace vg { */ /// Constructor - PhasedGenome(SnarlManager& snarl_manager); + PhasedGenome(const SnarlManager& snarl_manager); ~PhasedGenome(); + /// overloaded constructor + PhasedGenome(PhasedGenome& phased_genome); + /// move assignment ctor + PhasedGenome(PhasedGenome&& other) = delete; + /// move assignment operator + PhasedGenome& operator =(PhasedGenome&& phased_genome) = delete; + + /// copy assignment operator + PhasedGenome& operator =(PhasedGenome& phased_genome); /// Build a haplotype in place from an iterator that returns NodeTraversal objects from its /// dereference operator (allows construction without instantiating the haplotype elsewhere) @@ -80,6 +91,13 @@ namespace vg { /// Iterator representing the past-the-last position of the given haplotype, with the last /// position being the right telomere node. 
iterator end(int which_haplotype); + + /// Check which haplotypes a snarl is found in + // Returns a list of haplotype IDs + vector get_haplotypes_with_snarl(const Snarl* snarl_to_find); + + /// Prints out the haplotypes, and node values + void print_phased_genome(); /* * HAPLOTYPE EDITING METHODS @@ -109,17 +127,21 @@ namespace vg { /// Returns the score of the highest scoring alignment contained in the multipath alignment /// that is restricted to the phased genome's paths through the variation graph. /// - /// Note: assumes that MultipathAlignment has 'start' field filled in - int32_t optimal_score_on_genome(const MultipathAlignment& multipath_aln, VG& graph); + /// Note: assumes that multipath_alignment_t has 'start' field filled in + int32_t optimal_score_on_genome(const multipath_alignment_t& multipath_aln, VG& graph); // TODO: make a local subalignment optimal score function (main obstacle is scoring partial subpaths) + /// Returns the sum of the log-likelihoods of all of the alignments expressed in a multipath + /// alignment, given a + double read_log_likelihood(const multipath_alignment_t& multipath_aln, double log_base); + private: struct HaplotypeNode; class Haplotype; - SnarlManager& snarl_manager; + const SnarlManager* snarl_manager; /// All haplotypes in the genome (generally 2 per chromosome) vector haplotypes; @@ -252,7 +274,9 @@ namespace vg { iterator(size_t rank, int haplotype_number, HaplotypeNode* haplo_node); public: - + + using value_type = NodeTraversal; + /// Default constructor iterator(); /// Copy constructor @@ -546,6 +570,15 @@ namespace vg { } } +namespace std{ + template<> + struct iterator_traits{ + using value_type = vg::NodeTraversal; + using iterator_category = forward_iterator_tag; + }; +} + + #endif /* phased_genome_hpp */ diff --git a/src/pileup.cpp b/src/pileup.cpp deleted file mode 100644 index 882cf0b38b5..00000000000 --- a/src/pileup.cpp +++ /dev/null @@ -1,827 +0,0 @@ -#include -#include -#include -#include "json2pb.h" -#include "pileup.hpp" -#include "stream.hpp" - -using namespace std; - -namespace vg { - -void Pileups::clear() { - for (auto& p : _node_pileups) { - delete p.second; - } - _node_pileups.clear(); - - for (auto& p : _edge_pileups) { - delete p.second; - } - _edge_pileups.clear(); - _min_quality_count = 0; - _max_mismatch_count = 0; - _bases_count = 0; -} - -void Pileups::to_json(ostream& out) { - out << "{\"node_pileups\": ["; - for (NodePileupHash::iterator i = _node_pileups.begin(); i != _node_pileups.end();) { - out << pb2json(*i->second); - ++i; - if (i != _node_pileups.end()) { - out << ","; - } - } - out << "]," << endl << "\"edge_pileups\": ["; - for (EdgePileupHash::iterator i = _edge_pileups.begin(); i != _edge_pileups.end();) { - out << pb2json(*i->second); - ++i; - if (i != _edge_pileups.end()) { - out << ","; - } - } - out << "]}" << endl; -} - -void Pileups::load(istream& in) { - function lambda = [this](Pileup& pileup) { - extend(pileup); - }; - stream::for_each(in, lambda); -} - -void Pileups::write(ostream& out, size_t chunk_size) { - - int64_t count = max(_node_pileups.size(), _edge_pileups.size()) / chunk_size; - if (max(_node_pileups.size(), _edge_pileups.size()) % chunk_size != 0) { - ++count; - } - - NodePileupHash::iterator node_it = _node_pileups.begin(); - EdgePileupHash::iterator edge_it = _edge_pileups.begin(); - Pileup pileup; - - // note: this won't work at all in parallel but write is single threaded... 
- function lambda = [&](size_t i) -> Pileup& { - pileup.clear_node_pileups(); - pileup.clear_edge_pileups(); - for (size_t j = 0; j < chunk_size && node_it != _node_pileups.end(); ++j, ++node_it) { - NodePileup* np = pileup.add_node_pileups(); - *np = *node_it->second; - } - // unlike for Graph, we don't bother to try to group edges with nodes they attach - for (size_t j = 0; j < chunk_size && edge_it != _edge_pileups.end(); ++j, ++edge_it) { - EdgePileup* ep = pileup.add_edge_pileups(); - *ep = *edge_it->second; - } - - return pileup; - }; - - stream::write(out, count, lambda); - stream::finish(out); -} - -void Pileups::for_each_node_pileup(const function& lambda) { - for (auto& p : _node_pileups) { - lambda(*p.second); - } -} - -void Pileups::for_each_edge_pileup(const function& lambda) { - for (auto& p : _edge_pileups) { - lambda(*p.second); - } -} - -EdgePileup* Pileups::get_edge_pileup(pair sides) { - if (sides.second < sides.first) { - std::swap(sides.first, sides.second); - } - auto p = _edge_pileups.find(sides); - return p != _edge_pileups.end() ? p->second : NULL; -} - -// get a pileup. if it's null, create a new one and insert it. -EdgePileup* Pileups::get_create_edge_pileup(pair sides) { - if (sides.second < sides.first) { - std::swap(sides.first, sides.second); - } - EdgePileup* p = get_edge_pileup(sides); - if (p == NULL) { - p = new EdgePileup(); - p->mutable_edge()->set_from(sides.first.node); - p->mutable_edge()->set_from_start(!sides.first.is_end); - p->mutable_edge()->set_to(sides.second.node); - p->mutable_edge()->set_to_end(sides.second.is_end); - _edge_pileups[sides] = p; - } - return p; -} - - -void Pileups::extend(Pileup& pileup) { - for (int i = 0; i < pileup.node_pileups_size(); ++i) { - insert_node_pileup(new NodePileup(pileup.node_pileups(i))); - } - for (int i = 0; i < pileup.edge_pileups_size(); ++i) { - insert_edge_pileup(new EdgePileup(pileup.edge_pileups(i))); - } -} - -bool Pileups::insert_node_pileup(NodePileup* pileup) { - NodePileup* existing = get_node_pileup(pileup->node_id()); - if (existing != NULL) { - merge_node_pileups(*existing, *pileup); - delete pileup; - } else { - _node_pileups[pileup->node_id()] = pileup; - } - return existing == NULL; -} - -bool Pileups::insert_edge_pileup(EdgePileup* pileup) { - EdgePileup* existing = get_edge_pileup(NodeSide::pair_from_edge(*pileup->mutable_edge())); - if (existing != NULL) { - merge_edge_pileups(*existing, *pileup); - delete pileup; - } else { - _edge_pileups[NodeSide::pair_from_edge(*pileup->mutable_edge())] = pileup; - } - return existing == NULL; -} - -void Pileups::compute_from_alignment(Alignment& alignment) { - const Path& path = alignment.path(); - int64_t read_offset = 0; - vector mismatch_counts; - count_mismatches(*_graph, path, mismatch_counts); - // element i = location of rank i in the mapping array - vector ranks(path.mapping_size() + 1, -1); - // keep track of read offset of mapping array element i - vector in_read_offsets(path.mapping_size()); - vector out_read_offsets(path.mapping_size()); - // keep track of last mapping, offset of match, and open deletion for - // calling deletion endpoints (which are beside, but not on the base offsets they get written to) - pair last_match(NULL, -1); - pair last_del(NULL, -1); - pair open_del(NULL, -1); - for (int i = 0; i < path.mapping_size(); ++i) { - const Mapping& mapping = path.mapping(i); - int rank = mapping.rank() <= 0 ? 
i + 1 : mapping.rank(); - if (_graph->has_node(mapping.position().node_id())) { - const Node* node = _graph->get_node(mapping.position().node_id()); - NodePileup* pileup = get_create_node_pileup(node); - int64_t node_offset = mapping.position().offset(); - // utilize forward-relative node offset (old way), which - // is not consistent with current protobuf. conversion here. - if (mapping.position().is_reverse()) { - node_offset = node->sequence().length() - 1 - node_offset; - } - // If we mismatch alignments and graphs, we can get into trouble. - assert(node_offset >= 0); - in_read_offsets[i] = read_offset; - for (int j = 0; j < mapping.edit_size(); ++j) { - const Edit& edit = mapping.edit(j); - const Edit* next_edit = NULL; - if (j + 1 < mapping.edit_size()) { - next_edit = &mapping.edit(j + 1); - } else if (i + 1 < path.mapping_size() && path.mapping(i + 1).edit_size() > 0) { - next_edit = &path.mapping(i + 1).edit(0); - } - // process all pileups in edit. - // update the offsets as we go - compute_from_edit(*pileup, node_offset, read_offset, *node, - alignment, mapping, edit, next_edit, mismatch_counts, - last_match, last_del, open_del); - } - out_read_offsets[i] = read_offset - 1; - - if (rank <= 0 || rank >= ranks.size() || ranks[rank] != -1) { - cerr << "Error determining rank of mapping " << i << " in path " << path.name() << ": " - << pb2json(mapping) << endl; - } - else { - ranks[rank] = i; - } - } else { - // node not in graph. that's okay, we do nothing but update the read_offset to - // not trigger assert at end of this function - for (int j = 0; j < mapping.edit_size(); ++j) { - read_offset += mapping.edit(j).to_length(); - } - ranks[rank] = -1; - } - } - // loop again over all the edges crossed by the mapping alignment, using - // the offsets and ranking information we got in the first pass - for (int i = 2; i < ranks.size(); ++i) { - int rank1_idx = ranks[i-1]; - int rank2_idx = ranks[i]; - if ((rank1_idx > 0 || rank2_idx > 0) && (rank1_idx >= 0 && rank2_idx >= 0)) { - auto& m1 = path.mapping(rank1_idx); - auto& m2 = path.mapping(rank2_idx); - // only count edges bookended by matches - size_t m1eds = m1.edit_size(); - if ((m1eds == 0 || m1.edit(m1eds - 1).from_length() == m1.edit(m1eds - 1).to_length()) && - (m2.edit_size() == 0 || m2.edit(0).from_length() == m2.edit(0).to_length())) { - auto s1 = NodeSide(m1.position().node_id(), (m1.position().is_reverse() ? false : true)); - auto s2 = NodeSide(m2.position().node_id(), (m2.position().is_reverse() ? 
true : false)); - // no quality gives a free pass from quality filter - char edge_qual = 127; - if (!alignment.quality().empty()) { - char from_qual = alignment.quality()[out_read_offsets[rank1_idx]]; - char to_qual = alignment.quality()[in_read_offsets[rank2_idx]]; - edge_qual = combined_quality(min(from_qual, to_qual), alignment.mapping_quality()); - } - if (edge_qual >= _min_quality) { - EdgePileup* edge_pileup = get_create_edge_pileup(pair(s1, s2)); - if (edge_pileup->num_reads() < _max_depth) { - edge_pileup->set_num_reads(edge_pileup->num_reads() + 1); - if (!m1.position().is_reverse()) { - edge_pileup->set_num_forward_reads(edge_pileup->num_forward_reads() + 1); - } - if (!alignment.quality().empty()) { - *edge_pileup->mutable_qualities() += edge_qual; - } - } - } - } - } - } - - assert(alignment.sequence().empty() || - alignment.path().mapping_size() == 0 || - read_offset == alignment.sequence().length()); - -} - -void Pileups::compute_from_edit(NodePileup& pileup, int64_t& node_offset, - int64_t& read_offset, - const Node& node, const Alignment& alignment, - const Mapping& mapping, const Edit& edit, - const Edit* next_edit, - const vector& mismatch_counts, - pair& last_match, - pair& last_del, - pair& open_del) { - string seq = edit.sequence(); - // is the mapping reversed wrt read sequence? use for iterating - bool map_reverse = mapping.position().is_reverse(); - - // ***** MATCH ***** - if (edit.from_length() == edit.to_length()) { - assert (edit.from_length() > 0); - make_match(seq, edit.from_length(), map_reverse); - assert(seq.length() == edit.from_length()); - int64_t delta = map_reverse ? -1 : 1; - for (int64_t i = 0; i < edit.from_length(); ++i) { - if (pass_filter(alignment, read_offset, 1, mismatch_counts)) { - // Don't go outside the node - if (node_offset >= _graph->get_node(pileup.node_id())->sequence().size()) { - cerr << "error [vg::Pileups] node_offset of " << node_offset << " on " << pileup.node_id() << " is too big for node of size " << _graph->get_node(pileup.node_id())->sequence().size() << endl; - cerr << "Alignment: " << pb2json(alignment) << endl; - throw runtime_error("Node offset too large in alignment"); - } - BasePileup* base_pileup = get_create_base_pileup(pileup, node_offset); - if (base_pileup->num_bases() < _max_depth) { - // reference_base if empty - if (base_pileup->num_bases() == 0) { - base_pileup->set_ref_base(node.sequence()[node_offset]); - } else { - assert(base_pileup->ref_base() == node.sequence()[node_offset]); - } - // add base to bases field (converting to ,. 
if match) - char base = seq[i]; - *base_pileup->mutable_bases() += base; - // add quality if there - if (!alignment.quality().empty()) { - *base_pileup->mutable_qualities() += min((int32_t)alignment.quality()[read_offset], (int32_t)alignment.mapping_quality()); - } - // pileup size increases by 1 - base_pileup->set_num_bases(base_pileup->num_bases() + 1); - } - // close off any open deletion - if (open_del.first != NULL) { - string del_seq; - make_delete(del_seq, map_reverse, last_match, mapping, node_offset); - int64_t dp_node_id; - int64_t dp_node_offset; - // store in canonical position - if (make_pair(make_pair(last_del.first->position().node_id(), last_del.second), - last_del.first->position().is_reverse()) < - make_pair(make_pair(open_del.first->position().node_id(), open_del.second), - open_del.first->position().is_reverse())) { - dp_node_id = last_del.first->position().node_id(); - dp_node_offset = last_del.second; - } else { - dp_node_id = open_del.first->position().node_id(); - dp_node_offset = open_del.second; - } - Node* dp_node = _graph->get_node(dp_node_id); - // Don't go outside the node - assert(dp_node_offset < dp_node->sequence().size()); - NodePileup* dp_node_pileup = get_create_node_pileup(dp_node); - BasePileup* dp_base_pileup = get_create_base_pileup(*dp_node_pileup, dp_node_offset); - if (dp_base_pileup->num_bases() < _max_depth) { - // reference_base if empty - if (dp_base_pileup->num_bases() == 0) { - dp_base_pileup->set_ref_base(dp_node->sequence()[dp_node_offset]); - } else { - assert(dp_base_pileup->ref_base() == dp_node->sequence()[dp_node_offset]); - } - *dp_base_pileup->mutable_bases() += del_seq; - if (!alignment.quality().empty()) { - // we only use quality of one endpoint here. should average - *dp_base_pileup->mutable_qualities() += combined_quality(alignment.quality()[read_offset], - alignment.mapping_quality()); - } - dp_base_pileup->set_num_bases(dp_base_pileup->num_bases() + 1); - } - open_del = make_pair((Mapping*)NULL, -1); - last_del = make_pair((Mapping*)NULL, -1); - } - - last_match = make_pair(&mapping, node_offset); - } - // move right along read, and left/right depending on strand on reference - node_offset += delta; - ++read_offset; - } - } - // ***** INSERT ***** - else if (edit.from_length() < edit.to_length()) { - if (pass_filter(alignment, read_offset, edit.to_length(), mismatch_counts)) { - make_insert(seq, map_reverse); - assert(edit.from_length() == 0); - // we define insert (like sam) as insertion between current and next - // position (on forward node coordinates). this means an insertion before - // offset 0 is invalid! - int64_t insert_offset = map_reverse ? 
node_offset : node_offset - 1; - if (insert_offset >= 0 && - // make sure we have a match before and after the insert to take it seriously - next_edit != NULL && last_match.first != NULL && - next_edit->from_length() == next_edit->to_length()) { - // Don't go outside the node - assert(insert_offset < _graph->get_node(pileup.node_id())->sequence().size()); - BasePileup* base_pileup = get_create_base_pileup(pileup, insert_offset); - if (base_pileup->num_bases() < _max_depth) { - // reference_base if empty - if (base_pileup->num_bases() == 0) { - base_pileup->set_ref_base(node.sequence()[insert_offset]); - } else { - assert(base_pileup->ref_base() == node.sequence()[insert_offset]); - } - // add insertion string to bases field - base_pileup->mutable_bases()->append(seq); - if (!alignment.quality().empty()) { - *base_pileup->mutable_qualities() += combined_quality(alignment.quality()[read_offset], - alignment.mapping_quality()); - } - // pileup size increases by 1 - base_pileup->set_num_bases(base_pileup->num_bases() + 1); - } - } - else { - // need to check with aligner to make sure this doesn't happen, ie - // inserts would hang off the end of previous node instead of start - // of this node - /* - stringstream ss; - ss << "Warning: pileup does not support insertions before 0th base in node." - << " Offending edit: " << pb2json(edit) << endl; - #pragma omp critical(cerr) - cerr << ss.str(); - */ - } - } - // move right along read (and stay put on reference) - read_offset += edit.to_length(); - } - // ***** DELETE ***** - else { - if (pass_filter(alignment, read_offset, 1, mismatch_counts)) { - assert(edit.to_length() == 0); - assert(edit.sequence().empty()); - - // deltion will get written in the "Match" section - // note: deletions will only get written if there's a match on either side - // so deletions at beginning/end of read ignored in pileup - if (open_del.first == NULL && last_match.first != NULL) { - open_del = make_pair(&mapping, node_offset); - } - // open_del : first base deleted by deleltion - // last_del : most recent base deleted by deletion - // last_match : most recent base in a match - // (most recent is in order we are scanning here) - - // a deletion will be an edge between two matches. - // but in the pileup, it will be stored in either open_del or last_del - // (which ever has lower coordinate). - } - int64_t delta = map_reverse ? -edit.from_length() : edit.from_length(); - - // stay put on read, move left/right depending on strand on reference - node_offset += delta; - - last_del = make_pair(&mapping, map_reverse ? node_offset + 1 : node_offset - 1); - } -} - -void Pileups::count_mismatches(VG& graph, const Path& path, - vector& mismatches, - bool skipIndels) -{ - mismatches.clear(); - int64_t read_offset = 0; - for (int i = 0; i < path.mapping_size(); ++i) { - const Mapping& mapping = path.mapping(i); - if (graph.has_node(mapping.position().node_id())) { - const Node* node = graph.get_node(mapping.position().node_id()); - int64_t node_offset = mapping.position().offset(); - // utilize forward-relative node offset (old way), which - // is not consistent with current protobuf. conversion here. - if (mapping.position().is_reverse()) { - node_offset = node->sequence().length() - 1 - node_offset; - } - - for (int j = 0; j < mapping.edit_size(); ++j) { - const Edit& edit = mapping.edit(j); - // process all pileups in edit. 
- // update the offsets as we go - string seq = edit.sequence(); - bool is_reverse = mapping.position().is_reverse(); - if (is_reverse) { - seq = reverse_complement(seq); - } - - // ***** MATCH ***** - if (edit.from_length() == edit.to_length()) { - int64_t delta = is_reverse ? -1 : 1; - for (int64_t i = 0; i < edit.from_length(); ++i) { - if (!edit.sequence().empty() && - !base_equal(seq[i], node->sequence()[node_offset], false)) { - mismatches.push_back(1); - } - else { - mismatches.push_back(0); - } - // move right along read, and left/right depending on strand on reference - node_offset += delta; - ++read_offset; - } - } - // ***** INSERT ***** - else if (edit.from_length() < edit.to_length()) { - if (skipIndels == false) { - mismatches.push_back(1); - for (int x = 1; x < edit.to_length(); ++x) { - mismatches.push_back(0); - } - } - // move right along read (and stay put on reference) - read_offset += edit.to_length(); - } - // ***** DELETE ***** - else { - if (skipIndels == false) { - // since we're working in read coordinates, we count - // a single mismatch right before the delete. - if (mismatches.size() > 0) { - mismatches[mismatches.size() - 1] = 1; - } - } - int64_t delta = is_reverse ? -edit.from_length() : edit.from_length(); - // stay put on read, move left/right depending on strand on reference - node_offset += delta; - } - } - } else { - // node not in graph: count 0 mismatches for each absent position - for (int j = 0; j < mapping.edit_size(); ++j) { - read_offset += mapping.edit(j).to_length(); - for (int k = 0; k < mapping.edit(j).to_length(); ++k) { - mismatches.push_back(0); - } - } - } - } - assert(skipIndels || read_offset == mismatches.size()); - // too lazy to do full count inline. sum up here - int count = 0; - for (int i = 0; i < mismatches.size(); ++i) { - count += mismatches[i]; - mismatches[i] = count; - } -} - -bool Pileups::pass_filter(const Alignment& alignment, int64_t read_offset, - int64_t length, const vector& mismatches) const -{ - bool min_quality_fail = false; - bool max_mismatch_fail = false; - // loop is becaues insertions are considered as one block - // in this case entire block fails if single base fails - for (int64_t cur_offset = read_offset; cur_offset < read_offset + length; ++cur_offset) { - if (!alignment.quality().empty()) { - if (combined_quality(alignment.quality()[cur_offset], alignment.mapping_quality()) < _min_quality) { - min_quality_fail = true; - break; - } - } - if (_window_size > 0) { - // counts in left window - int64_t left_point = max((int64_t)0, cur_offset - _window_size / 2 - 1); - int64_t right_point = max((int64_t)0, cur_offset - 1); - int64_t count = mismatches[right_point] - mismatches[left_point]; - // coutns in right window - left_point = cur_offset; - right_point = min(cur_offset + _window_size / 2, (int64_t)mismatches.size() - 1); - count += mismatches[right_point] - mismatches[left_point]; - if (count > _max_mismatches) { - max_mismatch_fail = true; - break; - } - } - } - if (max_mismatch_fail) { - _max_mismatch_count += length; - } - if (min_quality_fail) { - _min_quality_count += length; - } - _bases_count += length; - return !max_mismatch_fail && !min_quality_fail; -} - -Pileups& Pileups::merge(Pileups& other) { - for (auto& p : other._node_pileups) { - insert_node_pileup(p.second); - } - other._node_pileups.clear(); - for (auto& p : other._edge_pileups) { - insert_edge_pileup(p.second); - } - other._edge_pileups.clear(); - _min_quality_count += other._min_quality_count; - other._min_quality_count = 0; - 
_max_mismatch_count += other._max_mismatch_count; - other._max_mismatch_count = 0; - _bases_count += other._bases_count; - other._bases_count = 0; - return *this; -} - -BasePileup& Pileups::merge_base_pileups(BasePileup& p1, BasePileup& p2) { - assert(p1.num_bases() == 0 || p2.num_bases() == 0 || - p1.ref_base() == p2.ref_base()); - if (p1.num_bases() == 0) { - p1.set_ref_base(p2.ref_base()); - } - int merge_size = min(p2.num_bases(), _max_depth - p1.num_bases()); - p1.set_num_bases(p1.num_bases() + merge_size); - if (merge_size == p2.num_bases()) { - p1.mutable_bases()->append(p2.bases()); - p1.mutable_qualities()->append(p2.qualities()); - } else if (merge_size > 0) { - vector > offsets; - parse_base_offsets(p2, offsets); - int merge_length = offsets[merge_size].first; - p1.mutable_bases()->append(p2.bases().substr(0, merge_length)); - if (!p2.qualities().empty()) { - p1.mutable_qualities()->append(p2.qualities().substr(0, merge_size)); - } - } - p2.set_num_bases(0); - p2.clear_bases(); - p2.clear_qualities(); - return p1; -} - -NodePileup& Pileups::merge_node_pileups(NodePileup& p1, NodePileup& p2) { - assert(p1.node_id() == p2.node_id()); - // Don't go outside the node - assert(p1.base_pileup_size() <= _graph->get_node(p1.node_id())->sequence().size()); - assert(p2.base_pileup_size() <= _graph->get_node(p2.node_id())->sequence().size()); - for (int i = 0; i < p2.base_pileup_size(); ++i) { - BasePileup* bp1 = get_create_base_pileup(p1, i); - BasePileup* bp2 = get_base_pileup(p2, i); - merge_base_pileups(*bp1, *bp2); - } - p2.clear_base_pileup(); - return p1; -} - -EdgePileup& Pileups::merge_edge_pileups(EdgePileup& p1, EdgePileup& p2) { - assert(p1.edge().from() == p2.edge().from()); - assert(p1.edge().to() == p2.edge().to()); - assert(p1.edge().from_start() == p2.edge().from_start()); - assert(p1.edge().to_end() == p2.edge().to_end()); - int merge_size = min(p2.num_reads(), _max_depth - p1.num_reads()); - p1.set_num_reads(p1.num_reads() + merge_size); - int forward_merge_size = p2.num_forward_reads() * - ((double)merge_size / (double)p2.num_reads()); - p1.set_num_forward_reads(p1.num_forward_reads() + forward_merge_size); - if (merge_size == p2.num_reads()) { - p1.mutable_qualities()->append(p2.qualities()); - } else if (!p2.qualities().empty()) { - p1.mutable_qualities()->append(p2.qualities().substr(0, merge_size)); - } - p2.set_num_reads(0); - p2.set_num_forward_reads(0); - p2.clear_qualities(); - return p1; -} - -void Pileups::parse_base_offsets(const BasePileup& bp, - vector >& offsets) { - offsets.clear(); - - const string& quals = bp.qualities(); - const string& bases = bp.bases(); - char ref_base = ::toupper(bp.ref_base()); - // we can use i to index the quality for the ith row of pileup, but - // need base_offset to get position of appropriate token in bases string - int64_t base_offset = 0; - for (int i = 0; i < bp.num_bases(); ++i) { - // insert - if (bases[base_offset] == '+') { - offsets.push_back(make_pair(base_offset, i < quals.length() ? i : -1)); - int64_t lf = base_offset + 1; - int64_t rf = lf; - while (rf < bases.length() && bases[rf] >= '0' && bases[rf] <= '9') { - ++rf; - } - stringstream ss(bases.substr(lf, rf - lf + 1)); - int64_t indel_len; - ss >> indel_len; - // ex: +5aaaaa. rf = lf = 1. indel_len = 5 -> increment 2+0+5=7 - base_offset += 1 + rf - lf + indel_len; - // delete - } else if (bases[base_offset] == '-') { - offsets.push_back(make_pair(base_offset, i < quals.length() ? 
i : -1)); - int64_t lf = base_offset + 1; - // eat up six semicolons - for (int64_t sc_count = 0; sc_count < 6; ++lf) { - if (bases[lf] == ';') { - ++sc_count; - } - } - // and last number - for (; bases[lf] >= '0' && bases[lf] <= '9'; ++lf); - base_offset = lf; - } - // match / snp - else { - offsets.push_back(make_pair(base_offset, i < quals.length() ? i : -1)); - ++base_offset; - } - } - assert(base_offset == bases.length()); -} - -// transform case of every character in string -void Pileups::casify(string& seq, bool is_reverse) { - if (is_reverse) { - transform(seq.begin(), seq.end(), seq.begin(), ::tolower); - } else { - transform(seq.begin(), seq.end(), seq.begin(), ::toupper); - } -} - -// make the sam pileup style token -void Pileups::make_match(string& seq, int64_t from_length, bool is_reverse) { - if (seq.length() == 0) { - seq = string(from_length, is_reverse ? ',' : '.'); - } else { - casify(seq, is_reverse); - } -} - -void Pileups::make_insert(string& seq, bool is_reverse) { - casify(seq, is_reverse); - stringstream ss; - ss << "+" << seq.length() << seq; - seq = ss.str(); -} - -void Pileups::make_delete(string& seq, bool is_reverse, const pair& last_match, - const Mapping& mapping, int64_t node_offset){ - int64_t from_id = last_match.first->position().node_id(); - int64_t from_offset = last_match.second; - bool from_start = last_match.first->position().is_reverse(); - int64_t to_id = mapping.position().node_id(); - int64_t to_offset = node_offset; - bool to_end = mapping.position().is_reverse(); - - // canonical order - if (make_pair(make_pair(from_id, from_offset), from_start) > - make_pair(make_pair(to_id, to_offset), to_end)) { - std::swap(from_id, to_id); - std::swap(from_offset, to_offset); - std::swap(from_start, to_end); - from_start = !from_start; - to_end = !to_end; - } - - make_delete(seq, is_reverse, from_id, from_offset, from_start, to_id, to_offset, to_end); -} - -void Pileups::make_delete(string& seq, bool is_reverse, - int64_t from_id, int64_t from_offset, bool from_start, - int64_t to_id, int64_t to_offset, bool to_end) { - // format : -is_reverse;from_id;from_offset;from_start;to_id;do_offset;to_end - stringstream ss; - ss << "-" << is_reverse << ";" << from_id << ";" << from_offset << ";" << from_start << ";" - << to_id << ";" << to_offset << ";" << to_end; - seq = ss.str(); -} - -void Pileups::parse_insert(const string& tok, int64_t& len, string& seq, bool& is_reverse) { - assert(tok[0] == '+'); - int64_t i = 1; - for (; tok[i] >= '0' && tok[i] <= '9'; ++i); - stringstream ss; - ss << tok.substr(1, i - 1); - ss >> len; - seq = tok.substr(i, tok.length() - i); - is_reverse = ::islower(seq[0]); -} - -void Pileups::parse_delete(const string& tok, bool& is_reverse, - int64_t& from_id, int64_t& from_offset, bool& from_start, - int64_t& to_id, int64_t& to_offset, bool& to_end) { - assert(tok[0] == '-'); - vector toks; - regex sc_re(";"); - std::copy(sregex_token_iterator(tok.begin(), tok.end(), sc_re, -1), - sregex_token_iterator(), back_inserter(toks)); - - assert(toks.size() == 7); - is_reverse = std::stoi(toks[0]) != 0; - - from_id = std::stoi(toks[1]); - from_offset = std::stoi(toks[2]); - from_start = std::stoi(toks[3]) != 0; - - to_id = std::stoi(toks[4]); - to_offset = std::stoi(toks[5]); - to_end = std::stoi(toks[6]) != 0; -} - -bool Pileups::base_equal(char c1, char c2, bool is_reverse) { - char t1 = ::toupper(c1); - char t2 = ::toupper(c2); - return is_reverse ? 
t1 == reverse_complement(t2) : t1 == t2; -} - -char Pileups::extract_match(const BasePileup& bp, int64_t offset) { - char v = bp.bases()[offset]; - assert(v != '+' && v != '-'); - if (v == ',' || v == '.') { - return ::toupper(bp.ref_base()); - } else if (::islower(v)) { - return reverse_complement(::toupper(v)); - } - return v; -} - -// get arbitrary value from offset on forward strand -string Pileups::extract(const BasePileup& bp, int64_t offset) { - const string& bases = bp.bases(); - if (bases[offset] != '+' && bases[offset] != '-') { - return string(1, extract_match(bp, offset)); - } - else if (bases[offset] == '+') { - string len_str; - for (int64_t i = offset + 1; bases[i] >= '0' && bases[i] <= '9'; ++i) { - len_str += bases[i]; - } - int64_t len = atoi(len_str.c_str()); - // forward strand, return as is - if (::isupper(bases[offset + 1 + len_str.length()])) { - return bases.substr(offset, 1 + len_str.length() + len); - } - // reverse strand, flip the dna bases part and return upper case - else { - string dna = bases.substr(offset + 1 + len_str.length(), len); - casify(dna, false); - return string(1, bases[offset]) + len_str + reverse_complement(dna); - } - } - else { - assert(bases[offset] == '-'); - // todo : consolidate deletion parsing code better than this - int64_t sc = 0; - int64_t i = offset; - for (; sc < 6; ++i) { - if (bases[i] == ';') { - ++sc; - } - } - return bases.substr(offset, i - offset + 1); - } -} - -} diff --git a/src/pileup.hpp b/src/pileup.hpp deleted file mode 100644 index ea556394fc0..00000000000 --- a/src/pileup.hpp +++ /dev/null @@ -1,283 +0,0 @@ -#ifndef VG_PILEUP_HPP_INCLUDED -#define VG_PILEUP_HPP_INCLUDED - -#include -#include -#include -#include "vg.pb.h" -#include "vg.hpp" -#include "hash_map.hpp" -#include "utility.hpp" - -namespace vg { - -using namespace std; - -/// This is a collection of protobuf NodePileup records that are indexed -/// on their position, as well as EdgePileup records. -/// Pileups can be merged and streamed, and computed -/// from Alignments. The pileup records themselves are essentially -/// protobuf versions of lines in Samtools pileup format, with deletions -/// represented using a graph-based notation. 
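For readers unfamiliar with the token encoding used throughout the removed pileup code above: matches are stored as `.` (forward) or `,` (reverse), SNPs as the base itself (lower case on the reverse strand), insertions as `+<length><sequence>`, and deletions as a `-`-prefixed, semicolon-separated record of the edge the deletion spans. A minimal sketch of decoding an insertion token, mirroring the `parse_insert` logic above (the helper name and the example token are illustrative, not part of the original code):

```
#include <cctype>
#include <cstdint>
#include <string>

// Decode an insertion token such as "+5aaaaa": a '+', the insert length,
// then the inserted sequence (lower case indicates the reverse strand).
static void decode_insert(const std::string& tok, int64_t& len,
                          std::string& seq, bool& is_reverse) {
    size_t i = 1;                                    // skip the leading '+'
    while (i < tok.size() && ::isdigit(static_cast<unsigned char>(tok[i]))) ++i;
    len = std::stoll(tok.substr(1, i - 1));          // "5" -> 5
    seq = tok.substr(i);                             // "aaaaa"
    is_reverse = !seq.empty() && ::islower(static_cast<unsigned char>(seq[0]));
}
// decode_insert("+5aaaaa", ...) yields len = 5, seq = "aaaaa", is_reverse = true.
```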
-class Pileups { -public: - - Pileups(VG* graph, int min_quality = 0, int max_mismatches = 1, int window_size = 0, - int max_depth = 1000, bool use_mapq = false) : - _graph(graph), - _min_quality(min_quality), - _max_mismatches(max_mismatches), - _window_size(window_size), - _max_depth(max_depth), - _min_quality_count(0), - _max_mismatch_count(0), - _bases_count(0), - _use_mapq(use_mapq) -{} - - /// copy constructor - Pileups(const Pileups& other) { - if (this != &other) { - _graph = other._graph; - for (auto& p : other._node_pileups) { - insert_node_pileup(new NodePileup(*p.second)); - } - _min_quality = other._min_quality; - _max_mismatches = other._max_mismatches; - _window_size = other._window_size; - _max_depth = other._max_depth; - _min_quality_count = other._min_quality_count; - _max_mismatch_count = other._max_mismatch_count; - _bases_count = other._bases_count; - _use_mapq = other._use_mapq; - } - } - - /// move constructor - Pileups(Pileups&& other) noexcept { - _graph = other._graph; - _node_pileups = other._node_pileups; - other._node_pileups.clear(); - _min_quality = other._min_quality; - _max_mismatches = other._max_mismatches; - _window_size = other._window_size; - _max_depth = other._max_depth; - _min_quality_count = other._min_quality_count; - _max_mismatch_count = other._max_mismatch_count; - _bases_count = other._bases_count; - _use_mapq = other._use_mapq; - } - - /// copy assignment operator - Pileups& operator=(const Pileups& other) { - Pileups tmp(other); - *this = move(tmp); - return *this; - } - - /// move assignment operator - Pileups& operator=(Pileups&& other) noexcept { - _graph = other._graph; - swap(_node_pileups, other._node_pileups); - other._node_pileups.clear(); - _min_quality = other._min_quality; - _max_mismatches = other._max_mismatches; - _window_size = other._window_size; - _max_depth = other._max_depth; - _min_quality_count = other._min_quality_count; - _max_mismatch_count = other._max_mismatch_count; - _bases_count = other._bases_count; - _use_mapq = other._use_mapq; - return *this; - } - - /// delete contents of table - ~Pileups() { - clear(); - } - void clear(); - - // XXXX these should be hash_map but it won't compile unless they're explicitly defined - typedef spp::sparse_hash_map > NodePileupHash; - typedef spp::sparse_hash_map, EdgePileup*, wang_hash > > EdgePileupHash; - - VG* _graph; - - /// This maps from Position to Pileup. - NodePileupHash _node_pileups; - EdgePileupHash _edge_pileups; - - /// Ignore bases with quality less than this - int _min_quality; - /// max mismatches within window_size - int _max_mismatches; - /// number of bases to scan in each direction for mismatches - int _window_size; - /// prevent giant protobufs - int _max_depth; - /// toggle whether we incorporate Alignment.mapping_quality - bool _use_mapq; - /// Keep count of bases filtered by quality - mutable uint64_t _min_quality_count; - /// keep count of bases filtered by mismatches - mutable uint64_t _max_mismatch_count; - /// overall count for perspective on above - mutable uint64_t _bases_count; - - /// write to JSON - void to_json(ostream& out); - /// read from protobuf - void load(istream& in); - /// write to protobuf, with EOF marker - void write(ostream& out, size_t buffer_size = 5); - - /// apply function to each pileup in table - void for_each_node_pileup(const function& lambda); - - /// search hash table for node id - NodePileup* get_node_pileup(int64_t node_id) { - auto p = _node_pileups.find(node_id); - return p != _node_pileups.end() ? 
p->second : NULL; - } - - /// get a pileup. if it's null, create a new one and insert it. - NodePileup* get_create_node_pileup(const Node* node) { - NodePileup* p = get_node_pileup(node->id()); - if (p == NULL) { - p = new NodePileup(); - p->set_node_id(node->id()); - for (int i = 0; i < node->sequence().length(); ++i) { - BasePileup* b = p->add_base_pileup(); - b->set_num_bases(0); - b->set_ref_base((int)node->sequence()[i]); - } - _node_pileups[node->id()] = p; - } - return p; - } - - void for_each_edge_pileup(const function& lambda); - - /// search hash table for edge id - EdgePileup* get_edge_pileup(pair sides); - - /// get a pileup. if it's null, create a new one and insert it. - EdgePileup* get_create_edge_pileup(pair sides); - - void extend(Pileup& pileup); - - /// insert a pileup into the table. it will be deleted by ~Pileups()!!! - /// return true if new pileup inserted, false if merged into existing one - bool insert_node_pileup(NodePileup* pileup); - bool insert_edge_pileup(EdgePileup* edge_pileup); - - /// create / update all pileups from a single alignment - void compute_from_alignment(Alignment& alignment); - - /// create / update all pileups from an edit (called by above). - /// query stores the current position (and nothing else). - void compute_from_edit(NodePileup& pileup, int64_t& node_offset, int64_t& read_offset, - const Node& node, const Alignment& alignment, - const Mapping& mapping, const Edit& edit, - const Edit* next_edit, - const vector& mismatch_counts, - pair& last_match, - pair& last_del, - pair& open_del); - - /// do one pass to count all mismatches in read, so we can do - /// mismatch filter efficiently in 2nd path. - /// mismatches[i] stores number of mismatches in range (0, i) - static void count_mismatches(VG& graph, const Path& path, vector& mismatches, - bool skipIndels = false); - - /// check base quality as well as miss match filter - bool pass_filter(const Alignment& alignment, int64_t read_offset, - int64_t length, - const vector& mismatches) const; - - /// move all entries in other object into this one. - /// if two positions collide, they are merged. - /// other will be left empty. this is returned - Pileups& merge(Pileups& other); - - /// merge p2 into p1 and return 1. p2 is left an empty husk - BasePileup& merge_base_pileups(BasePileup& p1, BasePileup& p2); - - /// merge p2 into p1 and return 1. p2 is lef an empty husk - NodePileup& merge_node_pileups(NodePileup& p1, NodePileup& p2); - - /// merge p2 into p1 and return 1. 
p2 is lef an empty husk - EdgePileup& merge_edge_pileups(EdgePileup& p1, EdgePileup& p2); - - /// create combine map quality (optionally) with base quality - char combined_quality(char base_quality, int map_quality) const { - if (!_use_mapq) { - return base_quality; - } else { - // assume independence: P[Correct] = P[Correct Base] * P[Correct Map] - // --> P[Error] = 1 - (1 - P[Base Error]) * (1 - P[Map Error]) - // (using same code as gentoyper:) - double p_err = logprob_invert(logprob_invert(phred_to_logprob((int)base_quality)) + - logprob_invert(phred_to_logprob(map_quality))); - int qual = logprob_to_phred(p_err); - return (char)min(qual, (int)numeric_limits::max()); - } - } - - /// get ith BasePileup record - static BasePileup* get_base_pileup(NodePileup& np, int64_t offset) { - assert(offset < np.base_pileup_size() && offset >= 0); - return np.mutable_base_pileup(offset); - } - static const BasePileup* get_base_pileup(const NodePileup& np, int64_t offset) { - assert(offset < np.base_pileup_size() && offset >= 0); - return &np.base_pileup(offset); - } - - /// get ith BasePileup record, create if doesn't exist - static BasePileup* get_create_base_pileup(NodePileup& np, int64_t offset) { - for (int64_t i = np.base_pileup_size(); i <= offset; ++i) { - np.add_base_pileup(); - } - return get_base_pileup(np, offset); - } - - /// the bases string in BasePileup doesn't allow random access. This function - /// will parse out all the offsets of snps, insertions, and deletions - /// into one array, each offset is a pair of indexes in the bases and qualities arrays - static void parse_base_offsets(const BasePileup& bp, - vector >& offsets); - - /// transform case of every character in string - static void casify(string& seq, bool is_reverse); - - /// make the sam pileup style token - static void make_match(string& seq, int64_t from_length, bool is_reverse); - static void make_insert(string& seq, bool is_reverse); - static void make_delete(string& seq, bool is_reverse, - const pair& last_match, - const Mapping& mapping, int64_t node_offset); - static void make_delete(string& seq, bool is_reverse, - int64_t from_id, int64_t from_offset, bool from_start, - int64_t to_id, int64_t to_offset, bool to_end); - - static void parse_insert(const string& tok, int64_t& len, string& seq, bool& is_reverse); - static void parse_delete(const string& tok, bool& is_reverse, - int64_t& from_id, int64_t& from_offset, bool& from_start, - int64_t& to_id, int64_t& to_offset, bool& to_end); - - static bool base_equal(char c1, char c2, bool is_reverse); - - /// get a pileup value on forward strand - static char extract_match(const BasePileup& bp, int64_t offset); - - /// get arbitrary value from offset on forward strand - static string extract(const BasePileup& bp, int64_t offset); -}; - - - -} - -#endif diff --git a/src/pileup_augmenter.cpp b/src/pileup_augmenter.cpp deleted file mode 100644 index 1fd787ba7d2..00000000000 --- a/src/pileup_augmenter.cpp +++ /dev/null @@ -1,1183 +0,0 @@ -#include -#include -#include "json2pb.h" -#include "pileup_augmenter.hpp" -#include "stream.hpp" - -using namespace std; - -namespace vg { - -const double PileupAugmenter::Log_zero = (double)-1e100; - -const char PileupAugmenter::Default_default_quality = 30; -const int PileupAugmenter::Default_min_aug_support = 2; - -PileupAugmenter::PileupAugmenter(VG* graph, - int default_quality, - int min_aug_support): - _graph(graph), - _default_quality(default_quality), - _min_aug_support(min_aug_support) { - assert(_min_aug_support > 0); - _max_id 
= _graph->max_node_id(); - _node_divider._max_id = &_max_id; - _augmented_graph.base_graph = graph; -} - -// delete contents of table -PileupAugmenter::~PileupAugmenter() { - clear(); -} - -void PileupAugmenter::clear() { - _node_calls.clear(); - _node_supports.clear(); - _insert_calls.clear(); - _insert_supports.clear(); - _augmented_graph.clear(); - _node_divider.clear(); - _visited_nodes.clear(); - _called_edges.clear(); - _augmented_edges.clear(); - _inserted_nodes.clear(); -} - -void PileupAugmenter::write_augmented_graph(ostream& out, bool json) { - if (json) { - _augmented_graph.graph.paths.to_graph(_augmented_graph.graph.graph); - out << pb2json(_augmented_graph.graph.graph); - } else { - _augmented_graph.graph.serialize_to_ostream(out); - } -} - -void PileupAugmenter::call_node_pileup(const NodePileup& pileup) { - - _node = _graph->get_node(pileup.node_id()); - assert(_node != NULL); - assert(_node->sequence().length() == pileup.base_pileup_size()); - - _node_calls.clear(); - _insert_calls.clear(); - string def_char = "-"; - _node_calls.assign(_node->sequence().length(), Genotype(def_char, def_char)); - _insert_calls.assign(_node->sequence().length(), Genotype(def_char, def_char)); - _node_supports.clear(); - _node_supports.assign(_node->sequence().length(), make_pair( - StrandSupport(), StrandSupport())); - _insert_supports.clear(); - _insert_supports.assign(_node->sequence().length(), make_pair( - StrandSupport(), StrandSupport())); - - // process each base in pileup individually - #pragma omp parallel for - for (int i = 0; i < pileup.base_pileup_size(); ++i) { - int num_inserts = 0; - for (auto b : pileup.base_pileup(i).bases()) { - if (b == '+') { - ++num_inserts; - } - } - int pileup_depth = max(num_inserts, pileup.base_pileup(i).num_bases() - num_inserts); - if (pileup_depth >= 1) { - call_base_pileup(pileup, i, false); - call_base_pileup(pileup, i, true); - } - } - - // add nodes and edges created when making calls to the output graph - // (_side_map gets updated) - create_node_calls(pileup); - - _visited_nodes.insert(_node->id()); -} - -void PileupAugmenter::call_edge_pileup(const EdgePileup& pileup) { - if (pileup.num_reads() >= 1) { - - double qual_sum = 0; - - for (int i = 0; i < pileup.num_reads(); ++i) { - char qual = !pileup.qualities().empty() ? pileup.qualities().at(i) : _default_quality; - qual_sum += (double)qual; - } - - Edge edge = pileup.edge(); // gcc not happy about passing directly - _called_edges[NodeSide::pair_from_edge(edge)] = StrandSupport( - pileup.num_forward_reads(), - pileup.num_reads() - pileup.num_forward_reads(), - qual_sum); - } -} - -void PileupAugmenter::update_augmented_graph() { - - // Add nodes we don't think necessarily exist. - function add_node = [&](Node* node) { - if (_visited_nodes.find(node->id()) == _visited_nodes.end()) { - Node* call_node = _augmented_graph.graph.create_node(node->sequence(), node->id()); - _node_divider.add_fragment(node, 0, call_node, NodeDivider::EntryCat::Ref, - vector()); - } - }; - _graph->for_each_node(add_node); - - // map every edge in the original graph to equivalent sides - // in the call graph. if both sides exist, make an edge in the call graph - function map_edge = [&](Edge* edge) { - pair sides = NodeSide::pair_from_edge(edge); - // skip uncalled edges if not writing augmented graph - auto called_it = _called_edges.find(sides); - bool called = called_it != _called_edges.end(); - - StrandSupport support = called ? 
called_it->second : StrandSupport(); - assert(support.fs >= 0 && support.rs >= 0); - - Node* side1 = _graph->get_node(sides.first.node); - Node* side2 = _graph->get_node(sides.second.node); - // find up to two nodes matching side1 in the call graph - int from_offset = !sides.first.is_end ? 0 : side1->sequence().length() - 1; - int to_offset = sides.second.is_end ? side2->sequence().length() - 1 : 0; - char cat = called ? 'R' : 'U'; - create_augmented_edge(side1, from_offset, !sides.first.is_end, true, - side2, to_offset, !sides.second.is_end, true, cat, - support); - }; - - function process_augmented_edges = [&](bool pass1) { - for (auto& i : _augmented_edges) { - auto& sides = i.first; - char cat = i.second; - NodeOffSide os1 = sides.first; - Node* node1; - bool aug1; - if (_graph->has_node(os1.first.node)) { - node1 = _graph->get_node(os1.first.node); - aug1 = true; - } else { - // snp or isnert node -- need to get from call grpah - // note : that we should never break these as they aren't in - // the divider structure (will be caught down the road) - node1 = _augmented_graph.graph.get_node(os1.first.node); - aug1 = false; - } - int from_offset = os1.second; - bool left1 = !os1.first.is_end; - NodeOffSide os2 = sides.second; - Node* node2; - bool aug2; - if (_graph->has_node(os2.first.node)) { - node2 = _graph->get_node(os2.first.node); - aug2 = true; - } else { - // snp or insert node -- need to get from call graph - node2 = _augmented_graph.graph.get_node(os2.first.node); - aug2 = false; - } - // only need to pass support for here insertions, other cases handled elsewhere - StrandSupport support(-1, -1); - if (cat == 'I') { - auto ins_it = _inserted_nodes.find(os1.first.node); - if (ins_it != _inserted_nodes.end()) { - support = ins_it->second.sup; - } else { - ins_it = _inserted_nodes.find(os2.first.node); - assert(ins_it != _inserted_nodes.end()); - support = ins_it->second.sup; - } - assert(support.fs >= 0 && support.rs >= 0); - } - int to_offset = os2.second; - bool left2 = !os2.first.is_end; - // todo: clean this up - if (!pass1) { - create_augmented_edge(node1, from_offset, left1, aug1, node2, to_offset, left2, aug2, cat, support); - } else { - _node_divider.break_end(node1, &_augmented_graph.graph, from_offset, left1); - _node_divider.break_end(node2, &_augmented_graph.graph, to_offset, left2); - } - } - }; - - // two passes here is a hack to make sure break_end is called on all edge ends - // before processing any of them. - process_augmented_edges(true); - _graph->for_each_edge(map_edge); - process_augmented_edges(false); - - // Annotate all the nodes in the divider structure in the AugmentedGraph - annotate_augmented_nodes(); - // add on the inserted nodes - for (auto i : _inserted_nodes) { - auto& n = i.second; - annotate_augmented_node(n.node, 'I', n.sup, n.orig_id, n.orig_offset); - } - annotate_non_augmented_nodes(); -} - -void PileupAugmenter::map_path(const Path& path, list& aug_path, bool expect_edits) { - int64_t last_rank = -1; - int64_t last_call_rank = 0; - size_t running_len = 0; - size_t path_len = 0; - for (int i = 0; i < path.mapping_size(); ++i) { - const Mapping& mapping = path.mapping(i); - int64_t rank = mapping.rank() == 0 ? 
i+1 : mapping.rank(); - size_t len = mapping_from_length(mapping); - // force length so that insertions come back through map_node() - // (they will be corrected in apply_edits) - if (len == 0) { - assert(expect_edits); - len = 1; - } - - int64_t node_id = mapping.position().node_id(); - Node* node = _graph->get_node(node_id); - - int start = mapping.position().offset(); - if (mapping.position().is_reverse()) { - start = node->sequence().length() - 1 - start; - } - int end = mapping.position().is_reverse() ? start - len + 1 : start + len - 1; - - // this is a projection onto the augmented graph for the entire "from length" - // of the mapping - list aug_mappings = _node_divider.map_node(node_id, start, len, - mapping.position().is_reverse()); - - // undo insertion length hack above - if (len == 1 && mapping_from_length(mapping) == 0) { - assert(aug_mappings.size() == 1 && aug_mappings.front().edit_size() == 1); - aug_mappings.front().mutable_edit(0)->set_from_length(0); - } - - // now we apply edits post-hoc to the new mappings - if (expect_edits) { - apply_mapping_edits(mapping, aug_mappings); - } - else { - assert(mapping.edit_size() == 1 - && mapping_from_length(mapping) == mapping_to_length(mapping)); - } - - // add our new mappings to the augmented path - for (auto& cm : aug_mappings) { - running_len += mapping_from_length(cm); - cm.set_rank(++last_call_rank); - aug_path.push_back(mapping_t(cm)); - } - path_len += len; - last_rank = rank; - } - assert(running_len == path_len); - verify_path(path, aug_path); -} - -void PileupAugmenter::apply_mapping_edits(const Mapping& base_mapping, list& aug_mappings) { - // current place in aug_mappings list - auto aug_mapping_it = aug_mappings.begin(); - // from length of current mapping - size_t aug_mapping_len = mapping_from_length(*aug_mapping_it); - // amount of from length we've covered in the current augmented mapping - size_t aug_mapping_used = 0; - - // make new edits in a list then add to protobuf (simpler for now) - list aug_mapping_edits; - - // sanity checking - size_t total_aug_from_length = 0; - size_t total_aug_to_length = 0; - - for (size_t base_edit_idx = 0; base_edit_idx < base_mapping.edit_size(); ++base_edit_idx) { - const Edit& base_edit = base_mapping.edit(base_edit_idx); - const Edit* next_base_edit = base_edit_idx < base_mapping.edit_size() - 1 ? 
- &base_mapping.edit(base_edit_idx + 1) : NULL; - // walk along our current base edit, making as many new augmented edits as required - // the max(1, from_length) below is a hack for insertions - for (size_t be_covered = 0; be_covered < max(1, base_edit.from_length());) { - Edit aug_edit; - size_t edit_len = min((size_t)base_edit.from_length() - be_covered, - aug_mapping_len - aug_mapping_used); - - aug_edit.set_from_length(edit_len); - if (base_edit.to_length() == base_edit.from_length()) { - // match : we need to cut the to length - aug_edit.set_to_length(edit_len); - if (!base_edit.sequence().empty()) { - aug_edit.set_sequence(base_edit.sequence().substr(be_covered, edit_len)); - } - } else { - // indel : we can leave the to length - aug_edit.set_to_length(base_edit.to_length()); - // insertion sequence - if (!base_edit.sequence().empty()) { - aug_edit.set_sequence(base_edit.sequence()); - } - } - - total_aug_from_length += aug_edit.from_length(); - total_aug_to_length += aug_edit.to_length(); - - aug_mapping_edits.push_back(aug_edit); - - // advance in base edit - be_covered += max(edit_len, (size_t)1); - - // advance in augmented edit (and mapping if necessary) - aug_mapping_used += edit_len; - - assert(aug_mapping_used <= aug_mapping_len); - if (aug_mapping_used == aug_mapping_len && - // we don't advance to the next mapping if we've got an insertion (0 from length) next - (be_covered != base_edit.from_length() || !next_base_edit || - next_base_edit->from_length() > 0)) { - // appy to protobuf - assert(aug_mapping_it != aug_mappings.end()); - aug_mapping_it->clear_edit(); - for (auto& aug_edit : aug_mapping_edits) { - *aug_mapping_it->add_edit() = aug_edit; - } - aug_mapping_edits.clear(); - // advance to next input mapping - ++aug_mapping_it; - aug_mapping_len = aug_mapping_it != aug_mappings.end() ? 
mapping_from_length(*aug_mapping_it) : 0; - aug_mapping_used = 0; - } - } - } - - assert(total_aug_to_length == mapping_to_length(base_mapping)); - assert(total_aug_from_length == mapping_from_length(base_mapping)); -} - -void PileupAugmenter::map_paths() { - - // We don't remove any nodes, so paths always stay connected - function lambda = [&](const Path& path) { - list& call_path = _augmented_graph.graph.paths.create_path(path.name()); - map_path(path, call_path, false); - }; - _graph->paths.for_each(lambda); - - // make sure paths are saved - _augmented_graph.graph.paths.rebuild_node_mapping(); - _augmented_graph.graph.paths.rebuild_mapping_aux(); - _augmented_graph.graph.paths.to_graph(_augmented_graph.graph.graph); -} - -void PileupAugmenter::verify_path(const Path& in_path, const list& call_path) { - function lambda = [](VG* graph, const mapping_t& mapping) { - const Node* node = graph->get_node(mapping.node_id()); - return mapping_sequence(mapping.to_mapping(), *node); - }; - - string in_string; - for (int i = 0; i < in_path.mapping_size(); ++i) { - in_string += lambda(_graph, in_path.mapping(i)); - } - string call_string; - for (auto& m : call_path) { - call_string += lambda(&_augmented_graph.graph, m); - } - - assert(in_string == call_string); - -} - -void PileupAugmenter::create_augmented_edge(Node* node1, int from_offset, bool left_side1, bool aug1, - Node* node2, int to_offset, bool left_side2, bool aug2, char cat, - StrandSupport support) { - NodeDivider::Entry call_sides1; - NodeDivider::Entry call_sides2; - - if (aug1) { - call_sides1 = _node_divider.break_end(node1, &_augmented_graph.graph, from_offset, - left_side1); - } else { - call_sides1 = NodeDivider::Entry(node1, vector(1, support)); - } - if (aug2) { - call_sides2 = _node_divider.break_end(node2, &_augmented_graph.graph, to_offset, - left_side2); - } else { - call_sides2 = NodeDivider::Entry(node2, vector(1, support)); - } - - // make up to 9 edges connecting them in the call graph - for (int i = 0; i < (int)NodeDivider::EntryCat::Last; ++i) { - for (int j = 0; j < (int)NodeDivider::EntryCat::Last; ++j) { - if (call_sides1[i] != NULL && call_sides2[j] != NULL) { - // always make links between alts and reference - // (be more strict on deletion edges, only linking two reference) - bool link_edge = ((i == (int)NodeDivider::EntryCat::Ref && - j == (int)NodeDivider::EntryCat::Ref) || - ((i == (int)NodeDivider::EntryCat::Ref || - j == (int)NodeDivider::EntryCat::Ref) && - cat != 'L')); - - NodeSide side1(call_sides1[i]->id(), !left_side1); - NodeSide side2(call_sides2[j]->id(), !left_side2); - if (!_augmented_graph.graph.has_edge(side1, side2)) { - StrandSupport edge_support = support >= StrandSupport() ? support : - min(avgSup(call_sides1.sup(i)), avgSup(call_sides2.sup(j))); - - NodeOffSide no1(NodeSide(node1->id(), !left_side1), from_offset); - NodeOffSide no2(NodeSide(node2->id(), !left_side2), to_offset); - // take augmented deletion edge support from the pileup - if (cat == 'L') { - edge_support = _deletion_supports[minmax(no1, no2)]; - } - // hack to decrease support for an edge that spans an insertion, by subtracting - // that insertion's copy number. 
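The "support lost to an insertion" adjustment applied a little further down relies on the clamped subtraction that `StrandSupport` defines in the header later in this diff (each component is floored at zero), so bridging across an insertion can only reduce an edge's support, never make it negative. A toy illustration using made-up read counts and the document's own `StrandSupport` type:

```
// Hypothetical counts: the bridge edge is covered by 10 forward / 8 reverse
// reads, and the insertion it spans is itself supported by 6 forward / 2 reverse.
StrandSupport bridge(10, 8);
StrandSupport insertion(6, 2);
StrandSupport adjusted = bridge - insertion;  // component-wise, floored at zero
// adjusted.fs == 4, adjusted.rs == 6; a larger insertion support would clamp to 0.
```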
- auto is_it = _insertion_supports.find(minmax(no1, no2)); - if (is_it != _insertion_supports.end()) { - edge_support = edge_support - is_it->second; - } - - if (link_edge || edge_support.total() >= _min_aug_support) { - - Edge* edge = _augmented_graph.graph.create_edge(call_sides1[i], call_sides2[j], - left_side1, !left_side2); - - // TODO: can edges be annotated more than once with - // different cats? if so, last one will prevail. should - // check if this can impact vcf converter... - annotate_augmented_edge(edge, cat, edge_support); - } - } - } - } - } -} - -void PileupAugmenter::call_base_pileup(const NodePileup& np, int64_t offset, bool insertion) { - const BasePileup& bp = np.base_pileup(offset); - - // parse the pilueup structure - vector > base_offsets; - Pileups::parse_base_offsets(bp, base_offsets); - - // compute top two most frequent bases and their counts - string top_base; - int top_count; - int top_rev_count; - string second_base; - int second_count; - int second_rev_count; - int total_count; - compute_top_frequencies(bp, base_offsets, top_base, top_count, top_rev_count, - second_base, second_count, second_rev_count, total_count, - insertion); - - // note first and second base will be upper case too - string ref_base = string(1, ::toupper(bp.ref_base())); - - // get references to node-level members we want to update - Genotype& base_call = insertion ? _insert_calls[offset] : _node_calls[offset]; - pair& support = insertion ? _insert_supports[offset] : _node_supports[offset]; - - if (top_count >= _min_aug_support || (top_base == ref_base && top_count > 0)) { - base_call.first = top_base != ref_base ? top_base : "."; - support.first.fs = top_count - top_rev_count; - support.first.rs = top_rev_count; - support.first.qual = total_base_quality(bp, base_offsets, top_base); - } - if (second_count >= _min_aug_support || (second_base == ref_base && second_count > 0)) { - base_call.second = second_base != ref_base ? second_base : "."; - support.second.fs = second_count - second_rev_count; - support.second.rs = second_rev_count; - support.second.qual = total_base_quality(bp, base_offsets, second_base); - } -} - -void PileupAugmenter::compute_top_frequencies(const BasePileup& bp, - const vector >& base_offsets, - string& top_base, int& top_count, int& top_rev_count, - string& second_base, int& second_count, int& second_rev_count, - int& total_count, bool inserts) { - - // histogram of pileup entries (base, indel) - unordered_map hist; - // same thing but just reverse strand (used for strand bias filter) - unordered_map rev_hist; - - total_count = 0; - const string& bases = bp.bases(); - string ref_base = string(1, ::toupper(bp.ref_base())); - - // compute histogram from pileup - for (auto i : base_offsets) { - string val = Pileups::extract(bp, i.first); - - if ((inserts && val[0] != '+') || (!inserts && val[0] == '+')) { - // toggle inserts - continue; - } - - // We want to know if this pileup supports an N - if (is_all_n(val)) { - // N is not a real base, so we should never augment with it. - continue; - } - - ++total_count; - - // val will always be uppcase / forward strand. 
we check - // the original pileup to see if reversed - bool reverse = bases[i.first] == ',' || - (bases[i.first] == '+' && ::islower(bases[i.first + val.length() - 1])) || - (bases[i.first] != '-' && ::islower(bases[i.first])); - if (bases[i.first] == '-') { - string tok = Pileups::extract(bp, i.first); - bool is_reverse, from_start, to_end; - int64_t from_id, from_offset, to_id, to_offset; - Pileups::parse_delete(tok, is_reverse, from_id, from_offset, from_start, to_id, to_offset, to_end); - reverse = is_reverse; - // reset reverse to forward - if (is_reverse) { - Pileups::make_delete(val, false, from_id, from_offset, from_start, to_id, to_offset, to_end); - } - } - - if (hist.find(val) == hist.end()) { - hist[val] = 0; - rev_hist[val] = 0; - } - ++hist[val]; - - if (reverse) { - ++rev_hist[val]; - } - } - - // tie-breaker heuristic: - // reference > transition > transversion > delete > insert > N - function base_priority = [&ref_base](const string& base) { - size_t n = base.length(); - if(n == 0) { - // base == '' -> Uncalled: 0 Points - return 0; - } - else - { - char cbase = base[0]; - - if(n == 1) - { - switch(cbase) { - case '.':// Ref: 6 Points - return 6; - break; - case '-': // Uncalled: 0 Points - return 0; - break; - - // Transition: 5 points. Transversion: 4 points - case 'A': - case 't': - return cbase == 'G' ? 5 : 4; - break; - - case 'C': - case 'g': - return cbase == 'T' ? 5 : 4; - break; - - case 'G': - case 'c': - return cbase == 'A' ? 5 : 4; - break; - - case 'T': - case 'a': - return cbase == 'C' ? 5 : 4; - break; - } - } - - // need to happen in any other case - i.e. also for n > 1 - switch(cbase) { - case '-': - return 3; // Deletion: 3 Points - break; - case '+': - return 2; // Insertion: 2 Points - break; - } - } - return 1; - }; - - // compare to pileup entries, to see which has greater count, use tie breaker logic - // if count is the same - function base_greater = [&base_priority] ( - const string& base1, int count1, const string& base2, int count2) { - if (count1 == count2) { - int p1 = base_priority(base1); - int p2 = base_priority(base2); - if (p1 == p2) { - return base1 > base2; - } else { - return p1 > p2; - } - } - return count1 > count2; - }; - - // find highest occurring string - top_base.clear(); - top_count = 0; - for (auto i : hist) { - if (base_greater(i.first, i.second, top_base, top_count)) { - top_base = i.first; - top_count = i.second; - } - } - - // find second highest occurring string - // todo: do it in same pass as above - second_base.clear(); - second_count = 0; - for (auto i : hist) { - if (i.first != top_base && - base_greater(i.first, i.second, second_base, second_count)) { - second_base = i.first; - second_count = i.second; - } - } - assert(top_base == "" || top_base != second_base); - top_rev_count = rev_hist[top_base]; - second_rev_count = rev_hist[second_base]; -} - -double PileupAugmenter::total_base_quality(const BasePileup& bp, - const vector >& base_offsets, - const string& val) { - double qual_sum = 0; - - const string& bases = bp.bases(); - const string& quals = bp.qualities(); - - for (int i = 0; i < base_offsets.size(); ++i) { - string base = Pileups::extract(bp, base_offsets[i].first); - - // make sure deletes always compared without is_reverse flag - if (base.length() > 1 && base[0] == '-') { - bool is_reverse, from_start, to_end; - int64_t from_id, from_offset, to_id, to_offset; - Pileups::parse_delete(base, is_reverse, from_id, from_offset, from_start, to_id, to_offset, to_end); - // reset reverse to forward - if 
(is_reverse) { - Pileups::make_delete(base, false, from_id, from_offset, from_start, to_id, to_offset, to_end); - } - } - - if (base == val) { - char qual = base_offsets[i].second >= 0 ? quals[base_offsets[i].second] : _default_quality; - qual_sum += (double)qual; - } - } - - return qual_sum; -} - -// please refactor me! -void PileupAugmenter::create_node_calls(const NodePileup& np) { - - int n = _node->sequence().length(); - const string& seq = _node->sequence(); - int cur = 0; - int cat = call_cat(_node_calls[cur]); - - // scan calls, merging contiguous reference calls. only consider - // ref / snp / inserts on first pass. - // scan contiguous chunks of a node with same call - // (note: snps will always be 1-base -- never merged) - for (int next = 1; next <= n; ++next) { - int next_cat = next == n ? -1 : call_cat(_node_calls[next]); - - // for anything but case where we merge consec. ref/refs - if (cat == 2 || cat != next_cat || - _insert_calls[next-1].first[0] == '+' || _insert_calls[next-1].second[0] == '+') { - - if (cat == 0 || cat == 1) { - // add reference - vector sup; - if (_node_calls[cur].first == ".") { - for (int i = cur; i < next; ++i) { - sup.push_back(_node_supports[i].first); - } - } - if (_node_calls[cur].second == ".") { - assert (_node_calls[cur].first != "."); - for (int i = cur; i < next; ++i) { - sup.push_back(_node_supports[i].second); - } - } - string new_seq = seq.substr(cur, next - cur); - Node* node = _augmented_graph.graph.create_node(new_seq, ++_max_id); - _node_divider.add_fragment(_node, cur, node, NodeDivider::EntryCat::Ref, sup); - // bridge to node - NodeOffSide no1(NodeSide(_node->id(), true), cur-1); - NodeOffSide no2(NodeSide(_node->id(), false), cur); - _augmented_edges[make_pair(no1, no2)] = 'R'; - // bridge from node - no1 = NodeOffSide(NodeSide(_node->id(), true), next-1); - no2 = NodeOffSide(NodeSide(_node->id(), false), next); - _augmented_edges[make_pair(no1, no2)] = 'R'; - } - else { - // some mix of reference and alts - assert(next == cur + 1); - - function call_het = - [&](string& call1, StrandSupport support1, string& call2, NodeDivider::EntryCat altCat) { - - if (call1 == "." || (altCat == NodeDivider::EntryCat::Alt1 && call2 != ".")) { - // reference base - StrandSupport sup = call1 == "." ? support1 : StrandSupport(); - assert(call2 != "."); // should be handled above - string new_seq = seq.substr(cur, 1); - Node* node = _augmented_graph.graph.create_node(new_seq, ++_max_id); - _node_divider.add_fragment(_node, cur, node, NodeDivider::EntryCat::Ref, - vector(1, sup)); - // bridge to node - NodeOffSide no1(NodeSide(_node->id(), true), cur-1); - NodeOffSide no2(NodeSide(_node->id(), false), cur); - _augmented_edges[make_pair(no1, no2)] = 'R'; - // bridge from node - no1 = NodeOffSide(NodeSide(_node->id(), true), next-1); - no2 = NodeOffSide(NodeSide(_node->id(), false), next); - _augmented_edges[make_pair(no1, no2)] = 'R'; - } - if (call1 != "." 
&& call1[0] != '-' && call1[0] != '+' && ( - // we only want to process a homozygous snp once: - call1 != call2 || altCat == NodeDivider::EntryCat::Alt1)) { - StrandSupport sup = support1; - // snp base - string new_seq = call1; - Node* node = _augmented_graph.graph.create_node(new_seq, ++_max_id); - _node_divider.add_fragment(_node, cur, node, altCat, - vector(1, sup)); - // bridge to node - NodeOffSide no1(NodeSide(_node->id(), true), cur-1); - NodeOffSide no2(NodeSide(_node->id(), false), cur); - _augmented_edges[make_pair(no1, no2)] = 'S'; - // bridge from node - no1 = NodeOffSide(NodeSide(_node->id(), true), next-1); - no2 = NodeOffSide(NodeSide(_node->id(), false), next); - _augmented_edges[make_pair(no1, no2)] = 'S'; - } - else if (call1 != "." && call1[0] == '-' && call1.length() > 1 && ( - // we only want to process homozygous delete once - call1 != call2 || altCat == NodeDivider::EntryCat::Alt1)) { - // delete - int64_t del_len; - bool from_start; - int64_t from_id; - int64_t from_offset; - int64_t to_id; - int64_t to_offset; - bool to_end; - bool reverse; - Pileups::parse_delete(call1, reverse, from_id, from_offset, from_start, to_id, to_offset, to_end); - NodeOffSide s1(NodeSide(from_id, !from_start), from_offset); - NodeOffSide s2(NodeSide(to_id, to_end), to_offset); - Node* node1 = _graph->get_node(from_id); - assert(from_offset >=0 && from_offset < node1->sequence().length()); - Node* node2 = _graph->get_node(to_id); - assert(to_offset >=0 && to_offset < node2->sequence().length()); - - // we're just going to update the divider here, since all - // edges get done at the end - _augmented_edges[make_pair(s1, s2)] = 'L'; - // keep track of its support - _deletion_supports[minmax(s1, s2)] = support1; - - // also need to bridge any fragments created above - if ((from_start && from_offset > 0) || - (!from_start && from_offset < node1->sequence().length() - 1)) { - NodeOffSide no1(NodeSide(from_id, !from_start), from_offset); - NodeOffSide no2(NodeSide(from_id, from_start), - (from_start ? from_offset - 1 : from_offset + 1)); - if (_augmented_edges.find(make_pair(no1, no2)) == _augmented_edges.end()) { - _augmented_edges[make_pair(no1, no2)] = 'R'; - } - } - if ((!to_end && to_offset > 0) || - (to_end && to_offset < node2->sequence().length() - 1)) { - NodeOffSide no1(NodeSide(to_id, to_end), to_offset); - NodeOffSide no2(NodeSide(to_id, !to_end), !to_end ? to_offset - 1 : to_offset + 1); - if (_augmented_edges.find(make_pair(no1, no2)) == _augmented_edges.end()) { - _augmented_edges[make_pair(no1, no2)] = 'R'; - } - - } - } - }; - - // apply same logic to both calls, updating opposite arrays - call_het(_node_calls[cur].first, _node_supports[cur].first, - _node_calls[cur].second, NodeDivider::EntryCat::Alt1); - call_het(_node_calls[cur].second, _node_supports[cur].second, - _node_calls[cur].first, NodeDivider::EntryCat::Alt2); - } - - // inserts done separate at end since they take start between cur and next - function call_inserts = - [&](string& ins_call1, StrandSupport ins_support1, string& ins_call2, StrandSupport ins_support2, - NodeDivider::EntryCat altCat) { - if (ins_call1[0] == '+' && ( - // we only want to process homozygous insert once - ins_call1 != ins_call2 || altCat == NodeDivider::EntryCat::Alt1)) { - int64_t ins_len; - string ins_seq; - bool ins_rev; - Pileups::parse_insert(ins_call1, ins_len, ins_seq, ins_rev); - // todo: check reverse? 
- Node* node = _augmented_graph.graph.create_node(ins_seq, ++_max_id); - StrandSupport sup = ins_support1; - InsertionRecord ins_rec = {node, sup, _node->id(), next-1}; - _inserted_nodes[node->id()] = ins_rec; - - // bridge to insert - NodeOffSide no1(NodeSide(_node->id(), true), next-1); - NodeOffSide no2(NodeSide(node->id(), false), 0); - _augmented_edges[make_pair(no1, no2)] = 'I'; - // bridge from insert - if (next < _node->sequence().length()) { - NodeOffSide no3 = NodeOffSide(NodeSide(node->id(), true), node->sequence().length() - 1); - NodeOffSide no4 = NodeOffSide(NodeSide(_node->id(), false), next); - _augmented_edges[make_pair(no3, no4)] = 'I'; - // bridge across insert - _augmented_edges[make_pair(no1, no4)] = 'R'; - // remember support "lost" to insertion so we - // can subtract it from the bridge later on - if (_insertion_supports.count(minmax(no1, no4))) { - _insertion_supports[minmax(no1, no4)] += sup; - } else { - _insertion_supports[minmax(no1, no4)] = sup; - } - } else { - // we have to link all outgoing edges to our insert if - // we're at end of node (unlike snps, the fragment structure doesn't - // handle these cases) - vector> next_nodes = _graph->edges_end(_node->id()); - NodeOffSide no3 = NodeOffSide(NodeSide(node->id(), true), node->sequence().length() - 1); - for (auto nn : next_nodes) { - int64_t offset4 = !nn.second ? 0 : _graph->get_node(nn.first)->sequence().length() - 1; - NodeOffSide no4 = NodeOffSide(NodeSide(nn.first, nn.second), offset4); - _augmented_edges[make_pair(no3, no4)] = 'I'; - // bridge across insert - _augmented_edges[make_pair(no1, no4)] = 'R'; - // remember support "lost" to insertion so we - // can subtract it from the bridge later on - if (_insertion_supports.count(minmax(no1, no4))) { - _insertion_supports[minmax(no1, no4)] += sup; - } else { - _insertion_supports[minmax(no1, no4)] = sup; - } - } - } - } - }; - - call_inserts(_insert_calls[next-1].first, _insert_supports[next-1].first, - _insert_calls[next-1].second, _insert_supports[next-1].second, - NodeDivider::EntryCat::Alt1); - call_inserts(_insert_calls[next-1].second, _insert_supports[next-1].second, - _insert_calls[next-1].first, _insert_supports[next-1].first, - NodeDivider::EntryCat::Alt2); - - // shift right - cur = next; - cat = next_cat; - } - } -} - -void PileupAugmenter::annotate_augmented_node(Node* node, char call, StrandSupport support, int64_t orig_id, int orig_offset) -{ - _augmented_graph.node_supports[node].set_forward(support.fs); - _augmented_graph.node_supports[node].set_reverse(support.rs); - _augmented_graph.node_supports[node].set_quality(support.qual); - - if (orig_id != 0 && call != 'S' && call != 'I') { - // Add translations for preserved parts - Translation trans; - auto* new_mapping = trans.mutable_to()->add_mapping(); - new_mapping->mutable_position()->set_node_id(node->id()); - auto* new_edit = new_mapping->add_edit(); - new_edit->set_from_length(node->sequence().size()); - new_edit->set_to_length(node->sequence().size()); - auto* old_mapping = trans.mutable_from()->add_mapping(); - old_mapping->mutable_position()->set_node_id(orig_id); - old_mapping->mutable_position()->set_offset(orig_offset); - auto* old_edit = old_mapping->add_edit(); - old_edit->set_from_length(node->sequence().size()); - old_edit->set_to_length(node->sequence().size()); - - _augmented_graph.translator.translations.push_back(trans); - } -} - -void PileupAugmenter::annotate_augmented_edge(Edge* edge, char call, StrandSupport support) -{ - 
_augmented_graph.edge_supports[edge].set_forward(support.fs); - _augmented_graph.edge_supports[edge].set_reverse(support.rs); - _augmented_graph.edge_supports[edge].set_quality(support.qual); -} - -void PileupAugmenter::annotate_augmented_nodes() -{ - for (auto& i : _node_divider.index) { - int64_t orig_node_id = i.first; - for (auto& j : i.second) { - int64_t orig_node_offset = j.first; - NodeDivider::Entry& entry = j.second; - char call = entry.sup_ref.empty() || maxSup(entry.sup_ref) == StrandSupport() ? 'U' : 'R'; - annotate_augmented_node(entry.ref, call, maxSup(entry.sup_ref), orig_node_id, orig_node_offset); - if (entry.alt1 != NULL) { - annotate_augmented_node(entry.alt1, 'S', maxSup(entry.sup_alt1), orig_node_id, orig_node_offset); - } - if (entry.alt2 != NULL) { - annotate_augmented_node(entry.alt2, 'S', maxSup(entry.sup_alt2), orig_node_id, orig_node_offset); - } - } - } -} - -void PileupAugmenter::annotate_non_augmented_nodes() { - _graph->for_each_node([&](Node* node) { - if (!_node_divider.index.count(node->id())) { - Translation trans; - auto* new_mapping = trans.mutable_to()->add_mapping(); - new_mapping->mutable_position()->set_node_id(node->id()); - auto* new_edit = new_mapping->add_edit(); - new_edit->set_from_length(node->sequence().size()); - new_edit->set_to_length(node->sequence().size()); - auto* old_mapping = trans.mutable_from()->add_mapping(); - old_mapping->mutable_position()->set_node_id(node->id()); - old_mapping->mutable_position()->set_offset(0); - auto* old_edit = old_mapping->add_edit(); - old_edit->set_from_length(node->sequence().size()); - old_edit->set_to_length(node->sequence().size()); - _augmented_graph.translator.translations.push_back(trans); - } - }); -} - -void NodeDivider::add_fragment(const Node* orig_node, int offset, Node* fragment, - EntryCat cat, vector sup) { - - NodeHash::iterator i = index.find(orig_node->id()); - if (i == index.end()) { - i = index.insert(make_pair(orig_node->id(), NodeMap())).first; - } - - NodeMap& node_map = i->second; - NodeMap::iterator j = node_map.find(offset); - - if (j != node_map.end()) { - assert(j->second[cat] == NULL); - j->second[cat] = fragment; - j->second.sup(cat) = sup; - } else { - Entry ins_triple; - ins_triple[cat] = fragment; - ins_triple.sup(cat) = sup; - j = node_map.insert(make_pair(offset, ins_triple)).first; - } - // sanity checks to make sure we don't introduce an overlap - if (offset == 0) { - assert(j == node_map.begin()); - } - if (offset + fragment->sequence().length() == orig_node->sequence().length()) { - assert(j == --node_map.end()); - } else if (j != --node_map.end()) { - NodeMap::iterator next = j; - ++next; - assert(offset + fragment->sequence().length() <= next->first); - } -} - -NodeDivider::Entry NodeDivider::break_end(const Node* orig_node, VG* graph, int offset, bool left_side) { - NodeHash::iterator i = index.find(orig_node->id()); - if (i == index.end()) { - return Entry(); - } - NodeMap& node_map = i->second; - NodeMap::iterator j = node_map.upper_bound(offset); - if (j == node_map.begin()) { - return Entry(); - } - - --j; - int sub_offset = j->first; - - function>(Node*, EntryCat, vector& )> lambda = - [&](Node* fragment, EntryCat cat, vector& sup) { - if (offset < sub_offset || offset >= sub_offset + fragment->sequence().length()) { - return make_pair((Node*)NULL, vector()); - } - - // if our cut point is already the exact left or right side of the node, then - // we don't have anything to do than return it. 
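To make the cut arithmetic in `break_end` concrete, consider a fragment that was added at offset 5 of the original node and is 10 bases long, so it covers original offsets 5 through 14. A worked example of breaking at offset 8 (comments only; the numbers are illustrative and not from the original code):

```
// Fragment registered at sub_offset = 5, length 10 (original offsets 5..14).
//
// break_end(..., offset = 8, left_side = true):
//   new_len = offset - sub_offset = 3
//   -> the existing fragment keeps offsets 5..7 (its first 3 bases),
//      a new node is created for offsets 8..14 and registered at 5 + 3 = 8,
//      and the attachment point is the left side of that new node.
//
// break_end(..., offset = 8, left_side = false):
//   new_len = offset - sub_offset + 1 = 4
//   -> the existing fragment keeps offsets 5..8, the new node covers 9..14,
//      and the attachment point is the right side of the shortened fragment.
```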
- if (offset == sub_offset && left_side == true) { - return make_pair(fragment, sup); - } - if (offset == sub_offset + fragment->sequence().length() - 1 && left_side == false) { - return make_pair(fragment, sup); - } - - // otherwise, we're somewhere in the middle, and have to subdivide the node - // first, shorten the exsisting node - int new_len = left_side ? offset - sub_offset : offset - sub_offset + 1; - assert(new_len > 0 && new_len != fragment->sequence().length()); - string frag_seq = fragment->sequence(); - *fragment->mutable_sequence() = frag_seq.substr(0, new_len); - - // then make a new node for the right part - Node* new_node = graph->create_node(frag_seq.substr(new_len, frag_seq.length() - new_len), ++(*_max_id)); - - // now divide up the support, starting with the right bit - vector new_sup; - if (!sup.empty()) { - new_sup = vector(sup.begin() + new_len, sup.end()); - // then cut the input (left bit) in place - sup.resize(new_len); - } - - // update the data structure with the new node - add_fragment(orig_node, sub_offset + new_len, new_node, cat, new_sup); - - return make_pair(new_node, new_sup); - }; - - vector& sup_ref = j->second.sup_ref; - Node* fragment_ref = j->second.ref; - auto new_node_info = fragment_ref != NULL ? lambda(fragment_ref, Ref, sup_ref) : - make_pair((Node*)NULL, vector()); - - vector& sup_alt1 = j->second.sup_alt1; - Node* fragment_alt1 = j->second.alt1; - auto new_node_alt1_info = fragment_alt1 != NULL ? lambda(fragment_alt1, Alt1, sup_alt1) : - make_pair((Node*)NULL, vector()); - - vector& sup_alt2 = j->second.sup_alt2; - Node* fragment_alt2 = j->second.alt2; - auto new_node_alt2_info = fragment_alt2 != NULL ? lambda(fragment_alt2, Alt2, sup_alt2) : - make_pair((Node*)NULL, vector()); - - Entry ret = left_side ? Entry(new_node_info.first, new_node_info.second, - new_node_alt1_info.first, new_node_alt1_info.second, - new_node_alt2_info.first, new_node_alt2_info.second) : - Entry(fragment_ref, sup_ref, fragment_alt1, sup_alt1, fragment_alt2, sup_alt2); - - return ret; -} - -// this function only works if node is completely covered in divider structure, -list NodeDivider::map_node(int64_t node_id, int64_t start_offset, int64_t length, bool reverse){ - NodeHash::iterator i = index.find(node_id); - assert(i != index.end()); - NodeMap& node_map = i->second; - assert(!node_map.empty()); - list out_mappings; - int cur_len = 0; - if (!reverse) { - for (auto i : node_map) { - Node* call_node = i.second.ref; - if (i.first + call_node->sequence().length() > start_offset && cur_len < length) { - assert(call_node != NULL); - Mapping mapping; - mapping.mutable_position()->set_node_id(call_node->id()); - if (start_offset > i.first && out_mappings.empty()) { - mapping.mutable_position()->set_offset(start_offset - i.first); - } else { - mapping.mutable_position()->set_offset(0); - } - int map_len = call_node->sequence().length() - mapping.position().offset(); - if (map_len + cur_len > length) { - map_len = length - cur_len; - } - assert(map_len > 0); - Edit* edit = mapping.add_edit(); - edit->set_from_length(map_len); - edit->set_to_length(map_len); - cur_len += map_len; - out_mappings.push_back(mapping); - } - } - } else { - // should fold into above when on less cold meds. 
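The forward-strand loop of `map_node` above effectively slices an interval of the original node across the reference fragments it was divided into. A self-contained sketch of that slicing, using plain (offset, length) pairs instead of graph nodes; all names and numbers here are illustrative:

```
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

int main() {
    // Original node divided into two reference fragments:
    // one at offset 0 of length 4, one at offset 4 of length 6.
    std::vector<std::pair<int64_t, int64_t>> fragments = {{0, 4}, {4, 6}};
    int64_t start_offset = 2, length = 5, cur_len = 0;
    for (auto& frag : fragments) {
        if (frag.first + frag.second > start_offset && cur_len < length) {
            // Only the first emitted piece can start mid-fragment.
            int64_t off = (start_offset > frag.first && cur_len == 0)
                              ? start_offset - frag.first : 0;
            int64_t map_len = std::min(frag.second - off, length - cur_len);
            std::cout << "fragment@" << frag.first << " offset " << off
                      << " length " << map_len << "\n";
            cur_len += map_len;
        }
    }
    // Prints: fragment@0 offset 2 length 2, then fragment@4 offset 0 length 3.
}
```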
- for (NodeMap::reverse_iterator i = node_map.rbegin(); i != node_map.rend(); ++i) - { - Node* call_node = i->second.ref; - if (i->first <= start_offset && cur_len < length) { - Mapping mapping; - mapping.mutable_position()->set_is_reverse(true); - mapping.mutable_position()->set_node_id(call_node->id()); - if (start_offset >= i->first && start_offset < i->first + call_node->sequence().length() - 1) { - assert(out_mappings.empty()); - mapping.mutable_position()->set_offset(start_offset - i->first); - } else { - mapping.mutable_position()->set_offset(call_node->sequence().length() - 1); - } - int map_len = mapping.position().offset() + 1; - if (map_len + cur_len > length) { - map_len = length - cur_len; - } - // switch up to new-style offset (todo: revise whole function) - mapping.mutable_position()->set_offset(call_node->sequence().length() - 1 - - mapping.position().offset()); - assert(map_len <= call_node->sequence().length()); - assert(mapping.position().offset() >= 0 && - mapping.position().offset() < call_node->sequence().length()); - Edit* edit = mapping.add_edit(); - edit->set_from_length(map_len); - edit->set_to_length(map_len); - cur_len += map_len; - out_mappings.push_back(mapping); - } - } - } - - assert(cur_len == length); - return out_mappings; -} - -void NodeDivider::clear() { - index.clear(); -} - -ostream& operator<<(ostream& os, const NodeDivider::NodeMap& nm) { - for (auto& x : nm) { - os << x.first << "[" << (x.second.ref ? x.second.ref->id() : -1) << ", " - << (x.second.alt1 ? x.second.alt1->id() : -1) << ", " - << (x.second.alt2 ? x.second.alt2->id() : -1) << "]" << endl; - } - return os; -} - -ostream& operator<<(ostream& os, NodeDivider::Entry entry) { - for (int i = 0; i < NodeDivider::EntryCat::Last; ++i) { - if (entry[i] != NULL) { - os << pb2json(*entry[i]); - } - else { - os << "NULL"; - } - os << ", "; - } - - return os; -} - -ostream& operator<<(ostream& os, const PileupAugmenter::NodeOffSide& no) { - os << "NOS(" << no.first.node << ":" << no.second << ",left=" << !no.first.is_end << ")"; - return os; -} - -} diff --git a/src/pileup_augmenter.hpp b/src/pileup_augmenter.hpp deleted file mode 100644 index cf325b45db2..00000000000 --- a/src/pileup_augmenter.hpp +++ /dev/null @@ -1,326 +0,0 @@ -// Augments a graph using a pileup (made with vg pileup) - -#ifndef VG_CALLER_HPP_INCLUDED -#define VG_CALLER_HPP_INCLUDED - -#include -#include -#include -#include -#include -#include -#include -#include "vg.pb.h" -#include "vg.hpp" -#include "hash_map.hpp" -#include "utility.hpp" -#include "pileup.hpp" -#include "path_index.hpp" -#include "genotypekit.hpp" -#include "option.hpp" - -namespace vg { - -using namespace std; - -// container for storing pairs of support for calls (value for each strand) -struct StrandSupport { - int fs; // forward support - int rs; // reverse support - double qual; // phred score (derived from sum log p-err over all observations) - StrandSupport(int f = 0, int r = 0, double q = 0) : - fs(f), rs(r), qual(q) {} - bool operator<(const StrandSupport& other) const { - if ((fs + rs) == (other.fs + other.rs)) { - // more strand bias taken as less support - return abs(fs - rs) > abs(other.fs - rs); - } - return fs + rs < other.fs + other.rs; - } - bool operator>=(const StrandSupport& other) const { - return !(*this < other); - } - bool operator==(const StrandSupport& other) const { - return fs == other.fs && rs == other.rs && qual == other.qual; - } - // min out at 0 - StrandSupport operator-(const StrandSupport& other) const { - return 
StrandSupport(max(0, fs - other.fs), max(0, rs - other.rs), - max(0., qual - other.qual)); - } - StrandSupport& operator+=(const StrandSupport& other) { - fs += other.fs; - rs += other.rs; - qual += other.qual; - return *this; - } - int total() { return fs + rs; } -}; - -inline StrandSupport minSup(vector& s) { - if (s.empty()) { - return StrandSupport(); - } - return *min_element(s.begin(), s.end()); -} -inline StrandSupport maxSup(vector& s) { - if (s.empty()) { - return StrandSupport(); - } - return *max_element(s.begin(), s.end()); -} -inline StrandSupport avgSup(vector& s) { - StrandSupport ret; - if (!s.empty()) { - for (auto sup : s) { - ret.fs += sup.fs; - ret.rs += sup.rs; - ret.qual += sup.qual; - } - ret.fs /= s.size(); - ret.rs /= s.size(); - ret.qual /= s.size(); - } - return ret; -} -inline StrandSupport totalSup(vector& s) { - StrandSupport ret; - if (!s.empty()) { - for (auto sup : s) { - ret.fs += sup.fs; - ret.rs += sup.rs; - ret.qual += sup.qual; - } - } - return ret; -} - -inline ostream& operator<<(ostream& os, const StrandSupport& sup) { - return os << sup.fs << ", " << sup.rs << ", " << sup.qual; -} - -// We need to break apart nodes but remember where they came from to update edges. -// Wrap all this up in this class. For a position in the input graph, we can have -// up to three nodes in the augmented graph (Ref, Alt1, Alt2), so we map to node -// triplets (Entry struct below). Note we never *call* all three nodes due to -// diploid assumption, but the augmented graph stores everything. -struct NodeDivider { - // up to three fragments per position in augmented graph (basically a Node 3-tuple, - // avoiding aweful C++ tuple syntax) - enum EntryCat {Ref = 0, Alt1, Alt2, Last}; - struct Entry { Entry(Node* r = 0, vector sup_r = vector(), - Node* a1 = 0, vector sup_a1 = vector(), - Node* a2 = 0, vector sup_a2 = vector()) : ref(r), alt1(a1), alt2(a2), - sup_ref(sup_r), sup_alt1(sup_a1), sup_alt2(sup_a2){} - Node* ref; Node* alt1; Node* alt2; - vector sup_ref; - vector sup_alt1; - vector sup_alt2; - Node*& operator[](int i) { - assert(i >= 0 && i <= 2); - return i == EntryCat::Ref ? ref : (i == EntryCat::Alt1 ? alt1 : alt2); - } - vector& sup(int i) { - assert(i >= 0 && i <= 2); - return i == EntryCat::Ref ? sup_ref : (i == EntryCat::Alt1 ? 
sup_alt1 : sup_alt2); - } - }; - // offset in original graph node -> up to 3 nodes in call graph - typedef map NodeMap; - // Node id in original graph to map above - typedef hash_map NodeHash; - NodeHash index; - int64_t* _max_id; - // map given node to offset i of node with id in original graph - // this function can never handle overlaps (and should only be called before break_end) - void add_fragment(const Node* orig_node, int offset, Node* subnode, EntryCat cat, vector sup); - // break node if necessary so that we can attach edge at specified side - // this function wil return NULL if there's no node covering the given location - Entry break_end(const Node* orig_node, VG* graph, int offset, bool left_side); - // assuming input node is fully covered, list of nodes that correspond to it in call graph - // if node not in structure at all, just return input (assumption uncalled nodes kept as is) - list map_node(int64_t node_id, int64_t start_offset, int64_t length, bool reverse); - // erase everything (but don't free any Node pointers, they belong to the graph) - void clear(); -}; -ostream& operator<<(ostream& os, const NodeDivider::NodeMap& nm); -ostream& operator<<(ostream& os, NodeDivider::Entry entry); - -/** - * Super simple graph augmentor/caller. - * Idea: Idependently process Pileup records, using simple model to make calls that - * take into account read errors with diploid assumption. Edges and node positions - * are called independently for now. - * Outputs either a sample graph (only called nodes and edges) or augmented graph - * (include uncalled nodes and edges too). - */ -class PileupAugmenter { -public: - - // log of zero - static const double Log_zero; - // use this score when pileup is missing quality - static const char Default_default_quality; - // don't augment graph without minimum support - static const int Default_min_aug_support; - - PileupAugmenter(VG* graph, - int default_quality = Default_default_quality, - int min_aug_support = Default_min_aug_support); - - ~PileupAugmenter(); - void clear(); - - // input graph - VG* _graph; - // Output augmented graph with annotations - SupportAugmentedGraph _augmented_graph; - - // buffer for base calls for each position in the node - // . = reference - // - = missing - typedef pair Genotype; - vector _node_calls; - vector > _node_supports; - // separate structure for isnertion calls since they - // don't really have reference coordinates (instead happen just to - // right of offset). - vector _insert_calls; - vector > _insert_supports; - // buffer for current node; - const Node* _node; - // max id in call_graph - int64_t _max_id; - // link called nodes back to the original graph. needed - // to figure out where edges go - NodeDivider _node_divider; - unordered_set _visited_nodes; - unordered_map, StrandSupport> _called_edges; // map to support - // deletes can don't necessarily need to be in incident to node ends - // so we throw in an offset into the mix. 
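For example (values made up): a deletion that leaves the end side of node 5 after base 9 and re-enters node 7 at base 0 is keyed by the pair of attachment points sketched below, using the `NodeOffSide` typedef that follows. This is how `_deletion_supports` and `_augmented_edges` can index edges whose endpoints land mid-node rather than at node ends:

```
// Hypothetical attachment points of a deletion edge; node ids and offsets are
// illustrative. NodeSide's (node id, is_end) constructor is used as elsewhere
// in this file.
PileupAugmenter::NodeOffSide from(NodeSide(5, true), 9);   // end side of node 5, offset 9
PileupAugmenter::NodeOffSide to(NodeSide(7, false), 0);    // start side of node 7, offset 0
// Lookups key on minmax(from, to) so either orientation reaches the same entry.
```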
- typedef pair NodeOffSide; - // map a call category to an edge - typedef unordered_map, char> EdgeHash; - EdgeHash _augmented_edges; - // keep track of inserted nodes for tsv output - struct InsertionRecord { - Node* node; - StrandSupport sup; - int64_t orig_id; - int orig_offset; - }; - typedef unordered_map InsertionHash; - InsertionHash _inserted_nodes; - // hack for better estimating support for edges that go around - // insertions (between the adjacent ref nodes) - typedef unordered_map, StrandSupport> EdgeSupHash; - EdgeSupHash _insertion_supports; - - // need to keep track of support for augmented deletions - // todo: generalize augmented edge support - EdgeSupHash _deletion_supports; - - // maximum number of nodes to call before writing out output stream - int _buffer_size; - // if we don't have a mapping quality for a read position, use this - char _default_quality; - // minimum support to augment graph - int _min_aug_support; - - // write the augmented graph - void write_augmented_graph(ostream& out, bool json); - - // call every position in the node pileup - void call_node_pileup(const NodePileup& pileup); - - // call an edge. remembering it in a table for the whole graph - void call_edge_pileup(const EdgePileup& pileup); - - // fill in edges in the augmented graph (those that are incident to 2 call - // nodes) and add uncalled nodes (optionally) - void update_augmented_graph(); - - // map a path (can have edits, ie from Alignment) from base graph to augmented graph - // aug_path parameter is empty path that will be written to - void map_path(const Path& base_path, list& aug_path, bool expect_edits); - - // Apply edits from base_mapping to corresponding augmented mappings that share same - // from interval, but don't yet have edits (called by map_path); - void apply_mapping_edits(const Mapping& base_mapping, list& aug_mappings); - - // TODO: - // method to normalize mapped paths back onto the augmented graph. ie check each - // non-match edit to see if it can be turned into a match on the augmented graph. - - // map paths from input graph into called (augmented) graph - void map_paths(); - // make sure mapped paths generate same strings as input paths - void verify_path(const Path& in_path, const list& call_path); - - // call position at given base - // if insertion flag set to true, call insertion between base and next base - void call_base_pileup(const NodePileup& np, int64_t offset, bool insertions); - - // Find the top-two bases in a pileup, along with their counts - // Last param toggles whether we consider only inserts or everything else - // (do not compare all at once since inserts do not have reference coordinates) - void compute_top_frequencies(const BasePileup& bp, - const vector >& base_offsets, - string& top_base, int& top_count, int& top_rev_count, - string& second_base, int& second_count, int& second_rev_count, - int& total_count, bool inserts); - - // Sum up the qualities of a given symbol in a pileup - double total_base_quality(const BasePileup& pb, - const vector >& base_offsets, - const string& val); - - // write graph structure corresponding to all the calls for the current - // node. - void create_node_calls(const NodePileup& np); - - void create_augmented_edge(Node* node1, int from_offset, bool left_side1, bool aug1, - Node* node2, int to_offset, bool left_side2, bool aug2, char cat, - StrandSupport support); - - // Annotate nodes and edges in the augmented graph with call info. 
- void annotate_augmented_node(Node* node, char call, StrandSupport support, int64_t orig_id, int orig_offset); - void annotate_augmented_edge(Edge* edge, char call, StrandSupport support); - void annotate_augmented_nodes(); - - // Add nodes that are passed through as-is (ie not augmented at all) to the translation table - void annotate_non_augmented_nodes(); - - // log function that tries to avoid 0s - static double safe_log(double v) { - return v == 0. ? Log_zero : ::log10(v); - } - - // call missing - static bool missing_call(const Genotype& g) { - return g.first == "-" && g.second == "-"; - } - - // call is reference - static bool ref_call(const Genotype& g) { - return g.first == "." && (g.second == "." || g.second == "-"); - } - - // classify call as 0: missing 1: reference 2: snp - // (holdover from before indels) - static int call_cat(const Genotype&g) { - if (missing_call(g)) { - return 0; - } else if (ref_call(g)) { - return 1; - } - return 2; - } -}; - -ostream& operator<<(ostream& os, const PileupAugmenter::NodeOffSide& no); - - -} - -#endif diff --git a/src/position.cpp b/src/position.cpp index 69875c774eb..ffb80082c70 100644 --- a/src/position.cpp +++ b/src/position.cpp @@ -6,8 +6,8 @@ pos_t make_pos_t(const Position& pos) { return make_tuple(pos.node_id(), pos.is_reverse(), pos.offset()); } -pos_t make_pos_t(id_t id, bool is_rev, off_t off) { - return make_tuple(id, is_rev, off); +pos_t make_pos_t(const position_t& pos) { + return make_tuple(pos.node_id(), pos.is_reverse(), pos.offset()); } pos_t make_pos_t(gcsa::node_type node) { @@ -22,7 +22,7 @@ Position make_position(const pos_t& pos) { return p; } -Position make_position(id_t id, bool is_rev, off_t off) { +Position make_position(id_t id, bool is_rev, offset_t off) { Position p; p.set_node_id(id); p.set_is_reverse(is_rev); @@ -38,43 +38,6 @@ Position make_position(gcsa::node_type node) { return p; } -bool is_empty(const pos_t& pos) { - return id(pos) == 0; -} - -id_t id(const pos_t& pos) { - return get<0>(pos); -} - -bool is_rev(const pos_t& pos) { - return get<1>(pos); -} - -off_t offset(const pos_t& pos) { - return get<2>(pos); -} - -id_t& get_id(pos_t& pos) { - return get<0>(pos); -} - -bool& get_is_rev(pos_t& pos) { - return get<1>(pos); -} - -off_t& get_offset(pos_t& pos) { - return get<2>(pos); -} - -pos_t reverse(const pos_t& pos, size_t node_length) { - pos_t rev = pos; - // swap the offset onto the other strand - get_offset(rev) = node_length - offset(rev); - // invert the position - get_is_rev(rev) = !is_rev(rev); - return rev; -} - Position reverse(const Position& pos, size_t node_length) { auto p = pos; p.set_offset(node_length - pos.offset()); @@ -82,12 +45,8 @@ Position reverse(const Position& pos, size_t node_length) { return p; } -ostream& operator<<(ostream& out, const pos_t& pos) { - return out << id(pos) << (is_rev(pos) ? "-" : "+") << offset(pos); -} - -pair min_oriented_distances(const map > >& path_offsets1, - const map > >& path_offsets2) { +pair min_oriented_distances(const unordered_map > >& path_offsets1, + const unordered_map > >& path_offsets2) { int64_t distance_same = std::numeric_limits::max(); int64_t distance_diff = std::numeric_limits::max(); for (auto& path : path_offsets1) { @@ -111,4 +70,15 @@ pair min_oriented_distances(const map #include "types.hpp" #include "utility.hpp" -#include "json2pb.h" +#include "vg/io/json2pb.h" #include -#include +#include "handle.hpp" /** \file * Functions for working with Positions and `pos_t`s. 
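Editorial aside (not part of the patch): the `reverse(const Position&, size_t)` implementation retained above keeps the offset semantics of a point between bases, mapping `offset` to `node_length - offset` and flipping the strand. Below is a minimal, self-contained sketch of that arithmetic; `ExamplePos` and `reverse_example` are hypothetical stand-ins for illustration only, not vg's Protobuf `Position` or its API.

```
// Sketch only: mirrors the offset arithmetic shown in the position.cpp diff above.
#include <cassert>
#include <cstddef>
#include <cstdint>

// Hypothetical stand-in for a Position, just for this example.
struct ExamplePos { int64_t node_id; bool is_reverse; int64_t offset; };

ExamplePos reverse_example(const ExamplePos& pos, size_t node_length) {
    // Offset becomes node_length - offset; orientation flips.
    return {pos.node_id, !pos.is_reverse, (int64_t)node_length - pos.offset};
}

int main() {
    // On a 10 bp node, forward offset 3 is the point between bases 2 and 3 (0-based).
    ExamplePos fwd{17, false, 3};
    ExamplePos rev = reverse_example(fwd, 10);
    assert(rev.offset == 7);      // same point between bases, viewed from the other strand
    assert(rev.offset - 1 == 6);  // subtracting 1 addresses the same base on the reverse strand
    return 0;
}
```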
@@ -16,42 +16,84 @@ namespace vg { using namespace std; -/// Return true if a pos_t is unset. -bool is_empty(const pos_t& pos); -/// Extract the id of the node a pos_t is on. -id_t id(const pos_t& pos); -/// Return true if a pos_t is on the reverse strand of its node. -bool is_rev(const pos_t& pos); -/// Get the offset from a pos_t. -off_t offset(const pos_t& pos); -/// Get a reference to the Node ID of a pos_t. -id_t& get_id(pos_t& pos); -/// Get a reference to the reverse flag of a pos_t. -bool& get_is_rev(pos_t& pos); -/// Get a reference to the offset field of a pos_t. -off_t& get_offset(pos_t& pos); -/// Reverse a pos_t and get a pos_t at the same base, going the other direction. -pos_t reverse(const pos_t& pos, size_t node_length); -/// Reverse a Position and get a Position at the same base, going the orther direction. + + +// duplicative with pos_t, but this will make it so much easier +// to refactor -- can always eliminate later +class position_t { +public: + position_t() = default; + position_t(const position_t&) = default; + position_t(position_t&&) = default; + ~position_t() = default; + position_t& operator=(const position_t&) = default; + position_t& operator=(position_t&&) = default; + inline int64_t node_id() const; + inline void set_node_id(int64_t i); + inline int64_t offset() const; + inline void set_offset(int64_t o); + inline bool is_reverse() const; + inline void set_is_reverse(bool r); + inline bool operator==(const position_t& other) const; + inline bool operator!=(const position_t& other) const; +private: + int64_t _node_id; + int64_t _offset; + bool _is_reverse; +}; + +/// Reverse a Position and get a Position at the same **point between bases**, going the other direction. +/// To get a Position to the same *base*, subtract 1 from the resulting offset. Position reverse(const Position& pos, size_t node_length); -/// Print a pos_t to a stream. -ostream& operator<<(ostream& out, const pos_t& pos); /// Convert a Position to a (much smaller) pos_t. pos_t make_pos_t(const Position& pos); -/// Create a pos_t from a Node ID, an orientation flag, and an offset. -pos_t make_pos_t(id_t id, bool is_rev, off_t off); +pos_t make_pos_t(const position_t& pos); /// Create a pos_t from a gcsa node pos_t make_pos_t(gcsa::node_type node); /// Convert a pos_t to a Position. Position make_position(const pos_t& pos); -/// Create a Position from a Node ID, an orientation flag, and an offset. -Position make_position(id_t id, bool is_rev, off_t off); +/// Create a Position from a Node ID, an orientation flag, and an offset along that strand of the node. 
+Position make_position(id_t id, bool is_rev, offset_t off); /// Make a Position from a gcsa node Position make_position(gcsa::node_type node); /// Find the min distance in the path offsets where the path orientation is the same and different -pair min_oriented_distances(const map > >& path_offsets1, - const map > >& path_offsets2); +pair min_oriented_distances(const unordered_map > >& path_offsets1, + const unordered_map > >& path_offsets2); + +string debug_string(const position_t& pos); +void from_proto_position(const Position& from, position_t& to); +/* + * position_t + */ +inline int64_t position_t::node_id() const { + return _node_id; +} +inline void position_t::set_node_id(int64_t i) { + _node_id = i; +} +inline int64_t position_t::offset() const { + return _offset; +} +inline void position_t::set_offset(int64_t o) { + _offset = o; +} +inline bool position_t::is_reverse() const { + return _is_reverse; +} +inline void position_t::set_is_reverse(bool r) { + _is_reverse = r; +} + +inline bool position_t::operator==(const position_t& other) const { + return (_node_id == other._node_id + && _is_reverse == other._is_reverse + && _offset == other._offset); +} + +inline bool position_t::operator!=(const position_t& other) const { + return !(*this == other); +} } #endif diff --git a/src/preflight.cpp b/src/preflight.cpp index 94c5e6ab8a9..fde71e4b8e1 100644 --- a/src/preflight.cpp +++ b/src/preflight.cpp @@ -2,7 +2,10 @@ #include #include + +#ifdef __x86_64__ #include +#endif namespace vg { @@ -10,6 +13,7 @@ using namespace std; void preflight_check() { +#ifdef __x86_64__ // We assume we are on x86_64 on POSIX (and not Windows). // We use the method of dlib's dlib/simd/simd_check.h @@ -27,6 +31,8 @@ void preflight_check() { << "Please use a system with SSE4.2 support." << endl; exit(1); } +#endif + // If not on x86_64, we are probably on ARM and using fake SSE anyway. } diff --git a/src/preflight.hpp b/src/preflight.hpp index 6f45d62ead1..7db872eab0d 100644 --- a/src/preflight.hpp +++ b/src/preflight.hpp @@ -6,14 +6,24 @@ * Mostly exists to check for SSE4.2 support which is still not universal. */ +// Get standard library to identify itself by including a no-op header +#include + namespace vg { -using namespace std; +/// Define a macro to tell things to be built for every X86_64 architecture, if possible. +/// This *doesn't* work on Mac with GNU GCC and Apple libc++, because functions +/// for x86-64 can't use std::endl, so we exclude that combination. +#if defined(__x86_64__) && (!defined(__GNUC__) || !defined(_LIBCPP_VERSION) || !defined(__APPLE__)) + #define VG_PREFLIGHT_EVERYWHERE __attribute__((__target__("arch=x86-64"))) +#else + #define VG_PREFLIGHT_EVERYWHERE +#endif /// Run a preflight check to make sure that the system is usable for this build of vg. /// Aborts with a helpful message if this is not the case. /// We make sure to build it for a lowest-common-denominator architecture. 
-void preflight_check() __attribute__((__target__("arch=x86-64"))); +void preflight_check() VG_PREFLIGHT_EVERYWHERE; } diff --git a/src/prune.cpp b/src/prune.cpp deleted file mode 100644 index a5170f9ad97..00000000000 --- a/src/prune.cpp +++ /dev/null @@ -1,100 +0,0 @@ -#include "prune.hpp" - -namespace vg { - -vector find_edges_to_prune(const HandleGraph& graph, size_t k, size_t edge_max) { - // for each position on the forward and reverse of the graph - //unordered_set edges_to_prune; - vector > edges_to_prune; - edges_to_prune.resize(get_thread_count()); - graph.for_each_handle([&](const handle_t& h) { - // for the forward and reverse of this handle - // walk k bases from the end, so that any kmer starting on the node will be represented in the tree we build - for (auto handle_is_rev : { false, true }) { - //cerr << "###########################################" << endl; - handle_t handle = handle_is_rev ? graph.flip(h) : h; - list walks; - // for each position in the node, set up a kmer with that start position and the node end or kmer length as the end position - // determine next positions - id_t handle_id = graph.get_id(handle); - size_t handle_length = graph.get_length(handle); - string handle_seq = graph.get_sequence(handle); - for (size_t i = 0; i < handle_length; ++i) { - pos_t begin = make_pos_t(handle_id, handle_is_rev, handle_length); - pos_t end = make_pos_t(handle_id, handle_is_rev, min(handle_length, i+k)); - walk_t walk = walk_t(offset(end)-offset(begin), begin, end, handle, 0); - if (walk.length < k) { - // are we branching over more than one edge? - size_t next_count = 0; - graph.follow_edges(walk.curr, false, [&](const handle_t& next) { ++next_count; }); - graph.follow_edges(walk.curr, false, [&](const handle_t& next) { - if (next_count > 1 && edge_max == walk.forks) { // our next step takes us over the max - int tid = omp_get_thread_num(); - edges_to_prune[tid].push_back(graph.edge_handle(walk.curr, next)); - } else { - walks.push_back(walk); - auto& todo = walks.back(); - todo.curr = next; - if (next_count > 1) { - ++todo.forks; - } - } - }); - } else { - walks.push_back(walk); - } - } - // now expand the kmers until they reach k - while (!walks.empty()) { - // first we check which ones have reached length k in the current handle; for each of these we run lambda and remove them from our list - auto walks_end = walks.end(); - for (list::iterator q = walks.begin(); q != walks_end; ++q) { - auto& walk = *q; - // did we reach our target length? 
- if (walk.length >= k) { - q = walks.erase(q); - } else { - id_t curr_id = graph.get_id(walk.curr); - size_t curr_length = graph.get_length(walk.curr); - bool curr_is_rev = graph.get_is_reverse(walk.curr); - size_t take = min(curr_length, k-walk.length); - walk.end = make_pos_t(curr_id, curr_is_rev, take); - walk.length += take; - if (walk.length < k) { - // if not, we need to expand through the node then follow on - size_t next_count = 0; - graph.follow_edges(walk.curr, false, [&](const handle_t& next) { ++next_count; }); - graph.follow_edges(walk.curr, false, [&](const handle_t& next) { - if (next_count > 1 && edge_max == walk.forks) { // our next step takes us over the max - int tid = omp_get_thread_num(); - edges_to_prune[tid].push_back(graph.edge_handle(walk.curr, next)); - } else { - walks.push_back(walk); - auto& todo = walks.back(); - todo.curr = next; - if (next_count > 1) { - ++todo.forks; - } - } - }); - q = walks.erase(q); - } else { - // nothing, we'll remove it next time around - } - } - } - } - } - }, true); - uint64_t total_edges = 0; - for (auto& v : edges_to_prune) total_edges += v.size(); - vector merged; merged.reserve(total_edges); - for (auto& v : edges_to_prune) { - merged.insert(merged.end(), v.begin(), v.end()); - } - // duplicates are assumed to be dealt with externally - return merged; -} - - -} diff --git a/src/prune.hpp b/src/prune.hpp deleted file mode 100644 index e2d914cadbd..00000000000 --- a/src/prune.hpp +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef VG_PRUNE_HPP_INCLUDED -#define VG_PRUNE_HPP_INCLUDED - -#include "vg.pb.h" -#include -#include "json2pb.h" -#include "handle.hpp" -#include "position.hpp" - -/** \file - * Functions for working with `kmers_t`'s in HandleGraphs. - */ - -namespace vg { - -using namespace std; - -/// Record a <=k-length walk in the context of a graph. -struct walk_t { - walk_t(uint16_t l, - const pos_t& b, - const pos_t& e, - const handle_t& c, - uint16_t f) - : length(l), begin(b), end(e), curr(c), forks(f) { }; - /// our start position - pos_t begin; - pos_t end; /// one past the (current) end of the kmer - handle_t curr; /// the next handle we extend into - uint16_t forks; /// how many branching edge crossings we took to get here - uint16_t length; /// how far we've been -}; - -/// Iterate over all the walks up to length k, adding edges which -vector find_edges_to_prune(const HandleGraph& graph, size_t k, size_t edge_max); - -} - -#endif diff --git a/src/qual_adj_xdrop_aligner.cpp b/src/qual_adj_xdrop_aligner.cpp new file mode 100644 index 00000000000..64105e941a5 --- /dev/null +++ b/src/qual_adj_xdrop_aligner.cpp @@ -0,0 +1,138 @@ +/** + * \file qual_adj_xdrop_aliigner.cpp: contains implementation of QualAdjXdropAligner + */ +#include "dozeu_interface.hpp" + +// Configure dozeu: +// We want the full length bonus included +#ifndef DZ_FULL_LENGTH_BONUS +#define DZ_FULL_LENGTH_BONUS +#endif +// We want the quality adjusted versions of functions +#ifndef DZ_QUAL_ADJ +#define DZ_QUAL_ADJ +#endif +// We require these particular values for this enum because we index arrays with it. 
+enum { MISMATCH = 1, MATCH = 2, INS = 3, DEL = 4 }; +// Set dozeu's CIGAR codes to match our enum +#ifndef DZ_CIGAR_OP +#define DZ_CIGAR_OP 0x04030201 +#endif + +#include + +using namespace vg; + +QualAdjXdropAligner::QualAdjXdropAligner(const QualAdjXdropAligner& other) +{ + *this = other; +} + +QualAdjXdropAligner& QualAdjXdropAligner::operator=(const QualAdjXdropAligner& other) +{ + if (this != &other) { + + if (dz) { + dz_destroy(dz); + } + + // TODO: a bit of an arcane step + // we need to pull out the 0-padded quality adjusted matrices from dz into a contiguous array + int8_t* qual_adj_matrix = (int8_t*) malloc(DZ_QUAL_MATRIX_SIZE * sizeof(int8_t)); + for (size_t i = 0; i < DZ_QUAL_MATRIX_SIZE; ++i) { + qual_adj_matrix[i] = dz_qual_matrix(other.dz)[(i / 16) * 32 + (i % 16)]; + } + + dz = dz_qual_adj_init(other.dz->matrix, + qual_adj_matrix, + *((const uint16_t*) &other.dz->giv), + *((const uint16_t*) &other.dz->gev)); + + free(qual_adj_matrix); + } + return *this; +} + +QualAdjXdropAligner::QualAdjXdropAligner(QualAdjXdropAligner&& other) +{ + *this = other; +} + +QualAdjXdropAligner& QualAdjXdropAligner::operator=(QualAdjXdropAligner&& other) +{ + if (this != &other) { + if (dz) { + dz_destroy(dz); + } + dz = other.dz; + other.dz = nullptr; + } + + return *this; +} + +QualAdjXdropAligner::QualAdjXdropAligner(const int8_t* _score_matrix, + const int8_t* _qual_adj_score_matrix, + int8_t _gap_open, int8_t _gap_extension) +{ + // xdrop aligner uses the parameterization where both gap open and gap extend + // are added when opening a gap + assert(_gap_open - _gap_extension >= 0); + assert(_gap_extension > 0); + + // convert the 5x5 matrices into a 4x4 like dozeu wants, and also transpose + // the matrix so that the read error probabilities are where dozeu expects + uint32_t max_qual = 255; + int8_t* qual_adj_scores_4x4 = (int8_t*) malloc(16 * (max_qual + 1)); + for (int q = 0; q <= max_qual; ++q) { + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + qual_adj_scores_4x4[q * 16 + i * 4 + j] = _qual_adj_score_matrix[q * 25 + j * 5 + i]; + } + } + } + + dz = dz_qual_adj_init(_score_matrix, qual_adj_scores_4x4, _gap_open - _gap_extension, + _gap_extension); + + free(qual_adj_scores_4x4); +} + +QualAdjXdropAligner::~QualAdjXdropAligner(void) +{ + dz_destroy(dz); +} + +dz_query_s* QualAdjXdropAligner::pack_query_forward(const char* seq, const uint8_t* qual, + int8_t full_length_bonus, size_t len) { + return dz_qual_adj_pack_query_forward(dz, seq, qual, full_length_bonus, len); +} + +dz_query_s* QualAdjXdropAligner::pack_query_reverse(const char* seq, const uint8_t* qual, + int8_t full_length_bonus, size_t len) { + return dz_qual_adj_pack_query_reverse(dz, seq, qual, full_length_bonus, len); +} + +const dz_forefront_s* QualAdjXdropAligner::scan(const dz_query_s* query, const dz_forefront_s** forefronts, + size_t n_forefronts, const char* ref, int32_t rlen, + uint32_t rid, uint16_t xt) { + return dz_qual_adj_scan(dz, query, forefronts, n_forefronts, ref, rlen, rid, xt); +} + +const dz_forefront_s* QualAdjXdropAligner::extend(const dz_query_s* query, const dz_forefront_s** forefronts, + size_t n_forefronts, const char* ref, int32_t rlen, + uint32_t rid, uint16_t xt) { + return dz_qual_adj_extend(dz, query, forefronts, n_forefronts, ref, rlen, rid, xt); +} + +dz_alignment_s* QualAdjXdropAligner::trace(const dz_forefront_s* forefront) { + return dz_qual_adj_trace(dz, forefront); +} + +void QualAdjXdropAligner::flush() { + dz_qual_adj_flush(dz); +} + +/** + * end of xdrop_aligner.cpp + 
*/ diff --git a/src/rare_variant_simplifier.cpp b/src/rare_variant_simplifier.cpp new file mode 100644 index 00000000000..6aec7e43152 --- /dev/null +++ b/src/rare_variant_simplifier.cpp @@ -0,0 +1,183 @@ +#include "rare_variant_simplifier.hpp" + +namespace vg { + +using namespace std; + +RareVariantSimplifier::RareVariantSimplifier(MutablePathDeletableHandleGraph& graph, VcfBuffer& variant_source) : Progressive(), graph(graph), variant_source(variant_source) { + // Nothing to do! +} + +void RareVariantSimplifier::simplify() { + // This holds the IDs of all the nodes we want to keep around + unordered_set to_keep; + + graph.for_each_path_handle([&](const path_handle_t& path) { + // For each path + + if (!Paths::is_alt(graph.get_path_name(path))) { + // If it isn't an alt path, we want to trace it + + // For each occurrence from start to end + // Put the ID of the node we are visiting in the to-keep set + for (handle_t handle : graph.scan_path(path)) { + to_keep.insert(graph.get_id(handle)); + } + } + }); + + variant_source.fill_buffer(); + while(variant_source.get() != nullptr) { + // For each variant + auto* variant = variant_source.get(); + + // TODO: We will use getInfoValueFloat from vcflib, but that API + // doesn't have support for sniffing the existence of fields, so we + // have to grab stuff manually too. + + const map >& info = variant->info; + + // Count the AF, AC, and AN values + size_t af_count = info.count("AF") ? info.at("AF").size() : 0; + size_t ac_count = info.count("AC") ? info.at("AC").size() : 0; + size_t an_count = info.count("AN") ? info.at("AN").size() : 0; + + // Default to keeping this variant + bool keep = true; + + if (keep && min_frequency_to_keep != 0) { + // A frequency condition is imposed, so we have to compute frequency + + // Determine the total frequency of alt alleles of this variant. + // Sum that AF values if they exist, and sum the AC and AN values and divide otherwise. + double frequency = 0; + + if (af_count == 0 && (ac_count == 0 || an_count == 0)) { + // Bail out if any variant doesn't have a well-defined frequency + cerr << "error[vg::RareVariantSimplifier]: variant at " << variant->sequenceName << ":" + << variant->position << " is missing sufficient AF, AC, and/or AN INFO tags to compute frequency" << endl; + exit(1); + } + + if (af_count >= ac_count && af_count >= an_count) { + // AF is the best tag to use + for (size_t i = 0; i < af_count; i++) { + frequency += variant->getInfoValueFloat("AF", i); + } + } else { + // We have to use AC and AN + + if (ac_count != an_count) { + cerr << "error[vg::RareVariantSimplifier]: variant at " << variant->sequenceName << ":" + << variant->position << " has " << ac_count << " AC values but " << an_count << " AN values. " + << "Can't compute frequency!" << endl; + exit(1); + } + + size_t ac_total = 0; + size_t an_total = 0; + + for (size_t i = 0; i < ac_count; i++) { + // Sum up the AC and AN values. + // TODO: vcflib has no way to get an int except as a float. + ac_total += (size_t) variant->getInfoValueFloat("AC", i); + an_total += (size_t) variant->getInfoValueFloat("AN", i); + } + + if (an_total == 0) { + // There are no calls so we can't compute a frequency + cerr << "error[vg::RareVariantSimplifier]: variant at " << variant->sequenceName << ":" + << variant->position << " has total AN of 0." + << "Can't compute frequency!" 
<< endl; + exit(1); + } + + // Compute the frequency + frequency = (double) ac_total / (double) an_total; + } + + // Keep only if frequency is sufficiently high + keep = keep && (frequency >= min_frequency_to_keep); + } + + if (keep && min_count_to_keep != 0) { + // A count condition is imposed, so we need to check the raw count. + + if (ac_count == 0) { + // If we have no AC info we can't do this threshold + cerr << "error[vg::RareVariantSimplifier]: variant at " << variant->sequenceName << ":" + << variant->position << " has no AC INFO tag; can't apply count threshold." << endl; + exit(1); + } + + // Sum up the AC tags. + // TODO: if using both conditions we do this loop twice! + size_t ac_total = 0; + for (size_t i = 0; i < ac_count; i++) { + ac_total += (size_t) variant->getInfoValueFloat("AC", i); + } + + // Keep the variant if it has a sufficient count + keep = keep && (ac_total >= min_count_to_keep); + } + + // Now we know if we are keeping the variant or not + + // Work out the variant's alt path names, to either trace or destroy + vector variant_alt_paths; + + // TODO: this should be factored out of here, construction, and GBWT generation into some central place. + string var_name = make_variant_id(*variant); + variant_alt_paths.push_back("_alt_" + var_name + "_0"); + for (size_t alt_index = 1; alt_index < variant->alleles.size(); alt_index++) { + variant_alt_paths.push_back("_alt_" + var_name + "_" + to_string(alt_index)); + } + + if (keep) { + // If it is sufficiently common, mark all its alt path nodes as to-keep + for (auto& path_name : variant_alt_paths) { + // For each alt path + if (!graph.has_path(path_name)) { + // Skip those that do not exist + continue; + } + + // For each occurrence from start to end + // Put the ID of the node we are visiting in the to-keep set + for (handle_t handle : graph.scan_path(graph.get_path_handle(path_name))) { + to_keep.insert(graph.get_id(handle)); + } + } + } else { + // Otherwise delete all its alt paths and also its ref path + for (auto& path_name : variant_alt_paths) { + if (!graph.has_path(path_name)) { + // This path doesn't exist at all, so skip it + continue; + } + + // For each alt path + path_handle_t path = graph.get_path_handle(path_name); + + // Destroy it + graph.destroy_path(path); + + } + + // The nodes will get destroyed if nothing else sufficiently frequent visited them. + } + + // Load the next variant + variant_source.handle_buffer(); + variant_source.fill_buffer(); + } + + graph.for_each_handle([&](const handle_t& handle) { + // After going through all the variants, delete all nodes that aren't to-keep + if (!to_keep.count(graph.get_id(handle))) { + graph.destroy_handle(handle); + } + }); +} + +} diff --git a/src/rare_variant_simplifier.hpp b/src/rare_variant_simplifier.hpp new file mode 100644 index 00000000000..57cf9834c44 --- /dev/null +++ b/src/rare_variant_simplifier.hpp @@ -0,0 +1,50 @@ +#ifndef VG_RARE_VARIANT_SIMPLIFIER_HPP_INCLUDED +#define VG_RARE_VARIANT_SIMPLIFIER_HPP_INCLUDED + + +#include "progressive.hpp" +#include "vg.hpp" +#include "vcf_buffer.hpp" + + +/** \file + * Provides a class for simplifying graphs by removing rare variants. + */ + +namespace vg { + +using namespace std; + +/** + * A class that can be used to simplify a graph, by removing rare variants' alt + * and ref paths and their exclusively-used nodes. + */ +class RareVariantSimplifier : public Progressive { + +public: + /// Make a simplifier that simplifies the given graph in place, using + /// variants read using the given buffer. 
+ RareVariantSimplifier(MutablePathDeletableHandleGraph& graph, VcfBuffer& variant_source); + + /// Simplify the graph. + void simplify(); + + /// Keep variants at this total alt allele frequency or higher. + double min_frequency_to_keep = 0; + + /// Keep variants with this total alt allele count or higher. + /// AND'd with the frequency condition. + size_t min_count_to_keep = 0; + +protected: + + /// Holds a reference to the graph we're simplifying + MutablePathDeletableHandleGraph& graph; + + /// Holds a reference to the variant buffer we are getting avriants from. + VcfBuffer& variant_source; +}; + +} + +#endif diff --git a/src/readfilter.cpp b/src/readfilter.cpp index fb6ee2a9f74..34d6dffc842 100644 --- a/src/readfilter.cpp +++ b/src/readfilter.cpp @@ -1,858 +1,48 @@ #include "readfilter.hpp" -#include "IntervalTree.h" -#include "stream.hpp" - -#include -#include - -#include namespace vg { using namespace std; - -bool ReadFilter::trim_ambiguous_ends(xg::XG* index, Alignment& alignment, int k) { - assert(index != nullptr); - - // Define a way to get node length, for flipping alignments - function get_node_length = [&index](id_t node) { - return index->node_length(node); - }; - - // Because we need to flip the alignment, make sure it is new-style and - // doesn't have any Mappings with no Edits. - for(size_t i = 0; i < alignment.path().mapping_size(); i++) { - if(alignment.path().mapping(i).edit_size() == 0) { - // Complain! - throw runtime_error("Found mapping wit no edits in " + pb2json(alignment)); - } - } - - // TODO: we're going to flip the alignment twice! This is a waste of time! - // Define some kind of oriented view or something, or just two duplicated - // trimming functions, so we can just trim once without flipping. - - // Trim the end - bool end_changed = trim_ambiguous_end(index, alignment, k); - // Flip and trim the start - - Alignment flipped = reverse_complement_alignment(alignment, get_node_length); - - if(trim_ambiguous_end(index, flipped, k)) { - // The start needed trimming - - // Flip the trimmed flipped alignment back - alignment = reverse_complement_alignment(flipped, get_node_length); - // We definitely changed something - return true; - } - - // We maybe changed something - return end_changed; - +using namespace vg::io; + +ostream& operator<<(ostream& os, const Counts& counts) { + os << "Total Filtered: " << counts.counts[Counts::FilterName::filtered] << " / " + << counts.counts[Counts::FilterName::read] << endl + << "Read Name Filter: " << counts.counts[Counts::FilterName::wrong_name] << endl + << "Subsequence Filter: " << counts.counts[Counts::FilterName::subsequence] << endl + << "Proper Pair Filter: " << counts.counts[Counts::FilterName::proper_pair] << endl + << "Unmapped Filter: " << counts.counts[Counts::FilterName::unmapped] << endl + << "refpos Contig Filter: " << counts.counts[Counts::FilterName::wrong_refpos] << endl + << "Feature Filter: " << counts.counts[Counts::FilterName::excluded_feature] << endl + << "Min Identity Filter: " << counts.counts[Counts::FilterName::min_score] << endl + << "Min Secondary Identity Filter: " << counts.counts[Counts::FilterName::min_sec_score] << endl + << "Max Overhang Filter: " << counts.counts[Counts::FilterName::max_overhang] << endl + << "Min End Match Filter: " << counts.counts[Counts::FilterName::min_end_matches] << endl + << "Split Read Filter: " << counts.counts[Counts::FilterName::split] << endl + << "Repeat Ends Filter: " << counts.counts[Counts::FilterName::repeat] << endl + << "All Defrayed Filter: " << 
counts.counts[Counts::FilterName::defray_all] << endl + << "Min Quality Filter: " << counts.counts[Counts::FilterName::min_mapq] << endl + << "Min Base Quality Filter: " << counts.counts[Counts::FilterName::min_base_qual] << endl + << "Random Filter: " << counts.counts[Counts::FilterName::random] << endl + << endl; + return os; } -bool ReadFilter::trim_ambiguous_end(xg::XG* index, Alignment& alignment, int k) { - // What mapping in the alignment is the leftmost one starting in the last k - // bases? (Except when that would be the first mapping, we use the second.) - // Start out with it set to the past-the-end value. - size_t trim_start_mapping = alignment.path().mapping_size(); - - // How many real non-softclip bases have we seen reading in from the end of - // the read? - size_t real_base_count = 0; - // How many softclip bases have we seen in from the end of the read? - size_t softclip_base_count = 0; - for(size_t i = alignment.path().mapping_size() - 1; i != -1 && i != 0; i--) { - // Scan in from the end of the read. - - auto* mapping = alignment.mutable_path()->mutable_mapping(i); - - // We should always have edits in our mappings. - assert(mapping->edit_size() > 0); - - for(int j = mapping->edit_size() - 1; j != -1; j--) { - // Visit every edit in the mapping - auto& edit = mapping->edit(j); - - - if(real_base_count == 0 && edit.from_length() == 0) { - // This is a trailing insert. Put it as a softclip - softclip_base_count += edit.to_length(); - } else { - // This is some other kind of thing. Record it as real bases. - real_base_count += edit.to_length(); - } - } - - if(real_base_count <= k) { - // This mapping starts fewer than k non-softclipped alignment - // bases from the end of the read. - trim_start_mapping = i; - } else { - // This mapping starts more than k in from the end. So the - // previous one, if we had one, must be the right one. - break; - } - } - - if(trim_start_mapping == alignment.path().mapping_size()) { - // No mapping was found that starts within the last k non-softclipped - // bases. So there's nothing to do. - return false; - } - - if(real_base_count == 0) { - // We have an anchoring mapping, but all the mappings we could trim are - // softclips, so there's no point. TODO: will we ever get softclips - // placed as the only thing on a node? - return false; - } - - // Which is the last assumed-non-ambiguous mapping from which we can anchor - // our search? - size_t root_mapping = trim_start_mapping - 1; - - // What's the sequence, including that root node, that we are looking for? - // We need the sequence of the nodes, rather than the read's sequence, - // because you can still be ambiguous even if you have a SNP on top of the - // ambiguous thing. - - // We need to ignore all the offsets and from_lengths, except for the from - // length on the last node to let us know if we end early. It's sort of - // nonsense to have offsets and non-full from_lengths on internal mappings, - // and everything is easiest if we use the full length sequence of the root - // node. 
- stringstream target_sequence_stream; - for(size_t i = root_mapping; i < alignment.path().mapping_size(); i++) { - // Collect the appropriately oriented from sequence from each mapping - auto& mapping = alignment.path().mapping(i); - string sequence = index->node_sequence(mapping.position().node_id()); - if(mapping.position().is_reverse()) { - // Have it in the right orientation - sequence = reverse_complement(sequence); - } - - if(i == root_mapping) { - // Use the full length of the node and ignore any offset - target_sequence_stream << sequence; - } else { - // Use the offset plus the total from_length of all the - // edits (in case we're the last node and ending early). We made - // sure all non-root nodes had edits earlier. - - size_t from_length = mapping.position().offset(); - for(size_t j = 0; j < mapping.edit_size(); j++) { - from_length += mapping.edit(j).from_length(); - } - - // Put in the sequence that the mapping visits - target_sequence_stream << sequence.substr(0, from_length); - } - } - string target_sequence = target_sequence_stream.str(); - -#ifdef debug - #pragma omp critical(cerr) - cerr << "Need to look for " << target_sequence << " right of mapping " << root_mapping << endl; -#endif - - // We're not going to recurse hundreds of nodes deep, so we can use the real - // stack and a real recursive function. - - // Do the DFS into the given node, after already having matched the given - // number of bases of the target sequence. See if you can match any more - // bases of the target sequence. - - // Return the total number of leaves in all subtrees that match the full - // target sequence, and the depth in bases of the shallowest point at which - // multiple subtrees with full lenght matches are unified. - - // We keep a maximum number of visited nodes here, just to prevent recursion - // from going on forever in worst-case-type graphs - size_t dfs_visit_count = 0; - function(id_t, bool, size_t)> do_dfs = - [&](id_t node_id, bool is_reverse, size_t matched) -> pair { - - ++dfs_visit_count; - - // Grab the node sequence and match more of the target sequence. - string node_sequence = index->node_sequence(node_id); - if(is_reverse) { - node_sequence = reverse_complement(node_sequence); - } - - -#ifdef debug - #pragma omp critical(cerr) - cerr << "Node " << node_id << " " << (is_reverse ? "rev" : "fwd") << ": " - << node_sequence << " at offset " << matched << " in " << target_sequence << endl; -#endif - - // Now count up the new matches between this node and the target sequence. - size_t new_matches; - for( - // Start with no matches - new_matches = 0; - // Keep going as long as we're inside both strings and haven't had a mismatch - new_matches < node_sequence.size() && - matched + new_matches < target_sequence.size() && - node_sequence[new_matches] == target_sequence[matched + new_matches]; - // Count up all the matches we find - new_matches++ - ); - - if(matched + new_matches == target_sequence.size()) { - // We found a tail end of a complete match of the target sequence - // on this node. - -#ifdef debug - #pragma omp critical(cerr) - cerr << "Node " << node_id << " is a matching leaf" << endl; -#endif - - // Return one match and unification at full length (i.e. nothing can - // be discarded). - return make_pair(1, target_sequence.size()); - } - - if(new_matches < node_sequence.size()) { - // We didn't make it all the way through this node, nor did we - // finish the target sequence; there's a mismatch between the node - // and the target sequence. 
- -#ifdef debug - #pragma omp critical(cerr) - cerr << "Node " << node_id << " has a mismatch" << endl; -#endif - - // If we mismatch, return 0 matches and unification at full length. - return make_pair(0, target_sequence.size()); - } - - // If we get through the whole node sequence without mismatching or - // running out of target sequence, keep going. - -#ifdef debug - #pragma omp critical(cerr) - cerr << "Node " << node_id << " has " << new_matches << " internal new matches" << endl; -#endif - - // Get all the edges we can take off of the right side of this oriented - // node. - auto edges = is_reverse ? index->edges_on_start(node_id) : index->edges_on_end(node_id); - -#ifdef debug - #pragma omp critical(cerr) - cerr << "Recurse into " << edges.size() << " children" << endl; -#endif - - // We're going to call all the children and collect the results, and - // then aggregate them. It might be slightly faster to aggregate while - // calling, but that might be less clear. - vector> child_results; - - for(auto& edge : edges) { - // check the user-supplied visit count before recursing any more - if (dfs_visit_count < defray_count) { - if(edge.from() == node_id && edge.from_start() == is_reverse) { - // The end we are leaving matches this edge's from, so we can - // just go to its to end and recurse on it. - child_results.push_back(do_dfs(edge.to(), edge.to_end(), matched + node_sequence.size())); - } else if(edge.to() == node_id && edge.to_end() == !is_reverse) { - // The end we are leaving matches this edge's to, so we can just - // recurse on its from end. - child_results.push_back(do_dfs(edge.from(), !edge.from_start(), matched + node_sequence.size())); - } else { - // XG is feeding us nonsense up with which we should not put. - throw runtime_error("Edge " + pb2json(edge) + " does not attach to " + - to_string(node_id) + (is_reverse ? " start" : " end")); - } - } -#ifdef debug - else { - #pragma omp critical(cerr) - cerr << "Aborting read filter DFS at node " << node_id << " after " << dfs_visit_count << " visited" << endl; - } -#endif - - } - - // Sum up the total leaf matches, which will be our leaf match count. - size_t total_leaf_matches = 0; - // If we don't find multiple children with leaf matches, report - // unification at the min unification depth of any subtree (and there - // will only be one that isn't at full length). - size_t children_with_leaf_matches = 0; - size_t unification_depth = target_sequence.size(); - - for(auto& result : child_results) { - total_leaf_matches += result.first; - if(result.first > 0) { - children_with_leaf_matches++; - } - unification_depth = min(unification_depth, result.second); - } - if(children_with_leaf_matches > 1) { - // If multiple children have nonzero leaf match counts, report - // unification at the end of this node. - unification_depth = matched + node_sequence.size(); - } - - return make_pair(total_leaf_matches, unification_depth); - }; - - // Search from the root mapping's node looking right in its orientation in - // the mapping - auto result = do_dfs(alignment.path().mapping(root_mapping).position().node_id(), - alignment.path().mapping(root_mapping).position().is_reverse(), 0); - -#ifdef debug - #pragma omp critical(cerr) - cerr << "Found " << result.first << " matching leaves with closest unification at " << result.second << endl; -#endif - - // We keep this much of the target sequence. - size_t target_sequence_to_keep = result.second; - - if(target_sequence_to_keep == target_sequence.size()) { - // Nothing to trim! 
- return false; - } - - // Figure out how many mappings we need to keep from the root in order to - // get that much sequence; we know it is either full length or at a mapping - // boundary. We handle the root special because it's always full length and - // we have to cut after its end. - size_t kept_sequence_accounted_for = index->node_length(alignment.path().mapping(root_mapping).position().node_id()); - size_t first_mapping_to_drop; - for(first_mapping_to_drop = root_mapping + 1; - first_mapping_to_drop < alignment.path().mapping_size(); - first_mapping_to_drop++) { - // Consider starting dropping at each mapping after the root. - if(kept_sequence_accounted_for == target_sequence_to_keep) { - // OK this mapping really is the first one to drop. - break; - } else { - // Keep going. Account for the sequence from this mapping. - auto& mapping = alignment.path().mapping(first_mapping_to_drop); - - // We know it's not the root mapping, and it can't be the non-full- - // length end mapping (because we would have kept the full length - // target sequence and not had to cut). So assume full node is used. - kept_sequence_accounted_for += index->node_length(mapping.position().node_id()); - } - } - - // OK we know the first mapping to drop. We need to work out the to_size, - // including all softclips, from there to the end, so we know how much to - // trim off of the sequence and quality. - size_t to_length = 0; - for(size_t i = first_mapping_to_drop; i < alignment.path().mapping_size(); i++) { - // Go through all the mappings - auto& mapping = alignment.path().mapping(i); - for(size_t j = 0; j < mapping.edit_size(); j++) { - // Add up the to_length of all the edits - to_length += mapping.edit(j).to_length(); - } - } - - // Trim sequence - alignment.set_sequence(alignment.sequence().substr(0, alignment.sequence().size() - to_length)); - - // Trim quality - if(!alignment.quality().empty()) { - // If we have a quality, it always ought to have been the same length as the sequence. - assert(alignment.quality().size() > to_length); - alignment.set_quality(alignment.quality().substr(0, alignment.quality().size() - to_length)); - } - - // Now we can discard the extra mappings - size_t to_delete = alignment.path().mapping_size() - first_mapping_to_drop; - alignment.mutable_path()->mutable_mapping()->DeleteSubrange(first_mapping_to_drop, to_delete); - - // Now the alignment is fixed! - return true; +// for some reason these and only these methods become duplicate symbols if they're included +// in the header? i don't really understand why +template<> +bool ReadFilter::is_mapped(const Alignment& alignment) const { + return alignment.path().mapping_size() != 0; } -// quick and dirty filter to see if removing reads that can slip around -// and still map perfectly helps vg call. 
returns true if at either -// end of read sequence, at least k bases are repetitive, checking repeats -// of up to size 2k -bool ReadFilter::has_repeat(Alignment& aln, int k) { - if (k == 0) { - return false; - } - const string& s = aln.sequence(); - for (int i = 1; i <= 2 * k; ++i) { - int covered = 0; - bool ffound = true; - bool bfound = true; - for (int j = 1; (ffound || bfound) && (j + 1) * i < s.length(); ++j) { - ffound = ffound && s.substr(0, i) == s.substr(j * i, i); - bfound = bfound && s.substr(s.length() - i, i) == s.substr(s.length() - i - j * i, i); - if (ffound || bfound) { - covered += i; - } - } - if (covered >= k) { +template<> +bool ReadFilter::is_mapped(const MultipathAlignment& mp_alignment) const { + for (size_t i = 0; i < mp_alignment.subpath_size(); ++i) { + if (mp_alignment.subpath(i).path().mapping_size() != 0) { return true; } } return false; } -bool ReadFilter::is_split(xg::XG* index, Alignment& alignment) { - if(index == nullptr) { - // Can't tell if the read is split. - throw runtime_error("XG index required to check for split reads"); - } - - - for(size_t i = 0; i + 1 < alignment.path().mapping_size(); i++) { - // For each mapping and the one after it - auto& pos1 = alignment.path().mapping(i).position(); - auto& pos2 = alignment.path().mapping(i + 1).position(); - - id_t from_id = pos1.node_id(); - bool from_start = pos1.is_reverse(); - id_t to_id = pos2.node_id(); - bool to_end = pos2.is_reverse(); - - // Can we find the same articulation of the edge as the alignment uses - bool found = index->has_edge(from_id, from_start, to_id, to_end); - - if(!found) { - // Check the other articulation of the edge - std::swap(from_id, to_id); - std::swap(from_start, to_end); - from_start = !from_start; - to_end = !to_end; - - - found = index->has_edge(from_id, from_start, to_id, to_end); - } - - if(!found) { - // We found a skip! - if(verbose) { - cerr << "Warning: read " << alignment.name() << " has an unknown edge " - << from_id << " " << from_start << " " << to_id << " " << to_end - << ". Removing!" << endl; - } - return true; - } - } - - // No wandering jumps between nodes found - return false; -} - - -bool ReadFilter::sample_read(const Alignment& aln) { - // Decide if the alignment is paired. - // It is paired if fragment_next or fragment_prev point to something. - bool is_paired = (!aln.fragment_prev().name().empty() || aln.fragment_prev().path().mapping_size() != 0 || - !aln.fragment_next().name().empty() || aln.fragment_next().path().mapping_size() != 0); - - // Compute the QNAME that samtools would use - string qname; - if (is_paired) { - // Strip pair end identifiers like _1 or /2 that vg uses at the end of the name. - qname = regex_replace(aln.name(), regex("[/_][12]$"), ""); - } else { - // Any _1 in the name is part of the actual read name. - qname = aln.name(); - } - - // Now treat it as samtools would. - // See https://github.com/samtools/samtools/blob/60138c42cf04c5c473dc151f3b9ca7530286fb1b/sam_view.c#L101-L104 - - // Hash that with __ac_X31_hash_string from htslib and XOR against the seed mask - auto masked_hash = __ac_X31_hash_string(qname.c_str()) ^ downsample_seed_mask; - - // Hash that again with __ac_Wang_hash from htslib, apparently to mix the bits. 
- uint32_t mixed_hash = __ac_Wang_hash(masked_hash); - - // Take the low 24 bits and compute a double from 0 to 1 - const int32_t LOW_24_BITS = 0xffffff; - double sample = ((double)(mixed_hash & LOW_24_BITS)) / (LOW_24_BITS + 1); - - // If the result is >= the portion to downsample to, discard the read. - // Otherwise, keep it. - return (sample < downsample_probability); -} - - -int ReadFilter::filter(istream* alignment_stream, xg::XG* xindex) { - - // name helper for output - function chunk_name = [this](int num) -> string { - stringstream ss; - ss << outbase << "-" << num << ".gam"; - return ss.str(); - }; - - // index regions by their inclusive ranges - vector > interval_list; - vector regions; - // use strings instead of ofstreams because worried about too many handles - vector chunk_names; - vector chunk_new; // flag if write or append - - // parse a bed, for now this is only way to do regions. note - // this operation converts from 0-based BED to 1-based inclusive VCF - if (!regions_file.empty()) { - if (outbase.empty()) { - cerr << "-B option required with -R" << endl; - return 1; - } - parse_bed_regions(regions_file, regions); - if (regions.empty()) { - cerr << "No regions read from BED file, doing whole graph" << endl; - } - } - - if(defray_length > 0 && xindex == nullptr) { - cerr << "xg index required for end de-fraying" << endl; - return 1; - } - - if (regions.empty()) { - // empty region, do everything - // we handle empty intervals as special case when looking up, otherwise, - // need to insert giant interval here. - chunk_names.push_back(outbase.empty() ? "-" : chunk_name(0)); - } else { - // otherwise, need to extract regions with xg - if (xindex == nullptr) { - cerr << "xg index required for -R option" << endl; - return 1; - } - - // fill in the map using xg index - // relies entirely on the assumption that are path chunks - // are perfectly spanned by an id range - for (int i = 0; i < regions.size(); ++i) { - Graph graph; - int rank = xindex->path_rank(regions[i].seq); - int64_t path_size = rank == 0 ? 0 : xindex->path_length(regions[i].seq); - - if (regions[i].start >= path_size) { - cerr << "Unable to find region in index: " << regions[i].seq << ":" << regions[i].start - << "-" << regions[i].end << endl; - } else { - // clip region on end of path - regions[i].end = min(path_size - 1, regions[i].end); - // do path node query - xindex->get_path_range(regions[i].seq, regions[i].start, regions[i].end, graph); - if (context_size > 0) { - xindex->expand_context(graph, context_size); - } - } - // find node range of graph, without bothering to build vg indices.. - int64_t min_id = numeric_limits::max(); - int64_t max_id = 0; - for (int j = 0; j < graph.node_size(); ++j) { - min_id = min(min_id, (int64_t)graph.node(j).id()); - max_id = max(max_id, (int64_t)graph.node(j).id()); - } - // map the chunk id to a name - chunk_names.push_back(chunk_name(i)); - - // map the node range to the chunk id. - if (graph.node_size() > 0) { - interval_list.push_back(Interval(min_id, max_id, i)); - assert(chunk_names.size() == i + 1); - } - } - } - - // index chunk regions - IntervalTree region_map(interval_list); - - // which chunk(s) does a gam belong to? 
- function&)> get_chunks = [®ion_map, ®ions](Alignment& aln, vector& chunks) { - // speed up case where no chunking - if (regions.empty()) { - chunks.push_back(0); - } else { - int64_t min_aln_id = numeric_limits::max(); - int64_t max_aln_id = -1; - for (int i = 0; i < aln.path().mapping_size(); ++i) { - const Mapping& mapping = aln.path().mapping(i); - min_aln_id = min(min_aln_id, (int64_t)mapping.position().node_id()); - max_aln_id = max(max_aln_id, (int64_t)mapping.position().node_id()); - } - vector > found_ranges; - region_map.findOverlapping(min_aln_id, max_aln_id, found_ranges); - for (auto& interval : found_ranges) { - chunks.push_back(interval.value); - } - } - }; - - // buffered output (one buffer per chunk), one set of chunks per thread - // buffer[THREAD][CHUNK] = vector - vector > > buffer(threads); - for (int i = 0; i < buffer.size(); ++i) { - buffer[i].resize(chunk_names.size()); - } - - static const int buffer_size = 1000; // we let this be off by 1 - - // remember if write or append - vector chunk_append(chunk_names.size(), append_regions); - - // flush a buffer specified by cur_buffer to target in chunk_names, and clear it. - // if end is true, write an EOF marker - function flush_buffer = [&buffer, &chunk_names, &chunk_append](int tid, int cur_buffer, bool end) { - ofstream outfile; - auto& outbuf = chunk_names[cur_buffer] == "-" ? cout : outfile; - if (chunk_names[cur_buffer] != "-") { - outfile.open(chunk_names[cur_buffer], chunk_append[cur_buffer] ? ios::app : ios_base::out); - chunk_append[cur_buffer] = true; - } - function write_buffer = [&buffer, &tid, &cur_buffer](size_t i) -> Alignment& { - return buffer[tid][cur_buffer][i]; - }; - stream::write(outbuf, buffer[tid][cur_buffer].size(), write_buffer); - if (end) { - stream::finish(outbuf); - } - buffer[tid][cur_buffer].clear(); - }; - - // add alignment to all appropriate buffers, flushing as necessary - function&)> update_buffers = [ - &buffer, ®ion_map, &get_chunks, &flush_buffer](int tid, Alignment& aln, - const vector& aln_chunks) { - for (auto chunk : aln_chunks) { - buffer[tid][chunk].push_back(aln); - if (buffer[tid][chunk].size() >= buffer_size) { - // flush buffer (could get fancier and allow parallel writes to different - // files, but unlikely to be worth effort as we're mostly trying to - // speed up defray and not write IO) -#pragma omp critical (ReadFilter_flush_buffer) - { - flush_buffer(tid, chunk, false); - } - } - } - }; - - // keep counts of what's filtered to report (in verbose mode) - vector counts_vec(threads); - - // we assume that every primary alignment has 0 or 1 secondary alignment - // immediately following in the stream - function lambda = [&](Alignment& aln) { - int tid = omp_get_thread_num(); - Counts& counts = counts_vec[tid]; - double score = (double)aln.score(); - double denom = aln.sequence().length(); - // toggle substitution score - if (sub_score == true) { - // hack in ident to replace old counting logic. - score = aln.identity() * aln.sequence().length(); - assert(score <= denom); - } else if (rescore == true) { - // We need to recalculate the score with the base aligner always - const static Aligner unadjusted; - BaseAligner* aligner = (BaseAligner*)&unadjusted; - - // Rescore and assign the score - aln.set_score(aligner->score_ungapped_alignment(aln)); - // Also use the score - score = aln.score(); - } - - // toggle absolute or fractional score - if (frac_score == true) { - if (denom > 0.) 
{ - score /= denom; - } - else { - assert(score == 0.); - } - } - // compute overhang - int overhang = 0; - if (aln.path().mapping_size() > 0) { - const auto& left_mapping = aln.path().mapping(0); - if (left_mapping.edit_size() > 0) { - overhang = left_mapping.edit(0).to_length() - left_mapping.edit(0).from_length(); - } - const auto& right_mapping = aln.path().mapping(aln.path().mapping_size() - 1); - if (right_mapping.edit_size() > 0) { - const auto& edit = right_mapping.edit(right_mapping.edit_size() - 1); - overhang = max(overhang, edit.to_length() - edit.from_length()); - } - } else { - overhang = aln.sequence().length(); - } - // compute end matches. - int end_matches = 0; - // from the left - for (int i = 0; i < aln.path().mapping_size() && end_matches < min_end_matches; ++i) { - for (int j = 0; j < aln.path().mapping(i).edit_size() && end_matches < min_end_matches; ++j) { - const Edit& edit = aln.path().mapping(i).edit(j); - if (edit.from_length() == edit.to_length() && edit.sequence().empty()) { - end_matches += edit.to_length(); - } else { - i = aln.path().mapping_size(); - break; - } - } - } - if (end_matches >= min_end_matches) { - end_matches = 0; - // from the right - for (int i = aln.path().mapping_size() - 1; i >= 0 && end_matches < min_end_matches; --i) { - for (int j = aln.path().mapping(i).edit_size() - 1; j >= 0 && end_matches < min_end_matches; --j) { - const Edit& edit = aln.path().mapping(i).edit(j); - if (edit.from_length() == edit.to_length() && edit.sequence().empty()) { - end_matches += edit.to_length(); - } else { - i = -1; - break; - } - } - } - } - - // offset in count tuples - int co = aln.is_secondary() ? 1 : 0; - - ++counts.read[co]; - bool keep = true; - // filter (current) alignment - if (!name_prefix.empty() && !std::equal(name_prefix.begin(), name_prefix.end(), aln.name().begin())) { - // There's a prefix and a mismatch against it - ++counts.wrong_name[co]; - keep = false; - } - if ((keep || verbose) && !excluded_refpos_contigs.empty() && aln.refpos_size() != 0) { - // We have refpos exclusion filters and a refpos is set. - // We need to bang every refpos anme against every filter. 
- - bool found_match = false; - for (auto& expression : excluded_refpos_contigs) { - for (auto& refpos : aln.refpos()) { - if (regex_search(refpos.name(), expression)) { - // We don't want this read because of this match - found_match = true; - break; - } - } - if (found_match) { - break; - } - } - - if (found_match) { - ++counts.wrong_refpos[co]; - keep = false; - } - } - if ((keep || verbose) && ((aln.is_secondary() && score < min_secondary) || - (!aln.is_secondary() && score < min_primary))) { - ++counts.min_score[co]; - keep = false; - } - if ((keep || verbose) && overhang > max_overhang) { - ++counts.max_overhang[co]; - keep = false; - } - if ((keep || verbose) && end_matches < min_end_matches) { - ++counts.min_end_matches[co]; - keep = false; - } - if ((keep || verbose) && aln.mapping_quality() < min_mapq) { - ++counts.min_mapq[co]; - keep = false; - } - - // do region check before heavier filters - vector aln_chunks; - if (keep || verbose) { - get_chunks(aln, aln_chunks); - if (aln_chunks.empty()) { - keep = false; - } - } - - if ((keep || verbose) && drop_split && is_split(xindex, aln)) { - ++counts.split[co]; - keep = false; - } - if ((keep || verbose) && has_repeat(aln, repeat_size)) { - ++counts.repeat[co]; - keep = false; - } - if ((keep || verbose) && defray_length && trim_ambiguous_ends(xindex, aln, defray_length)) { - ++counts.defray[co]; - // We keep these, because the alignments get modified. - } - if ((keep || verbose) && downsample_probability != 1.0 && !sample_read(aln)) { - ++counts.random[co]; - keep = false; - } - if (!keep) { - ++counts.filtered[co]; - } - - // add to write buffer - if (keep) { - update_buffers(tid, aln, aln_chunks); - } - }; - stream::for_each_parallel(*alignment_stream, lambda); - - for (int tid = 0; tid < buffer.size(); ++tid) { - for (int chunk = 0; chunk < buffer[tid].size(); ++chunk) { - // Give every chunk, even those going to standard out or with no buffered reads, an EOF marker. - // This also makes sure empty chunks exist. 
- flush_buffer(tid, chunk, true); - } - } - - if (verbose) { - Counts& counts = counts_vec[0]; - for (int i = 1; i < counts_vec.size(); ++i) { - counts += counts_vec[i]; - } - size_t tot_reads = counts.read[0] + counts.read[1]; - size_t tot_filtered = counts.filtered[0] + counts.filtered[1]; - cerr << "Total Filtered (primary): " << counts.filtered[0] << " / " - << counts.read[0] << endl - << "Total Filtered (secondary): " << counts.filtered[1] << " / " - << counts.read[1] << endl - << "Read Name Filter (primary): " << counts.wrong_name[0] << endl - << "Read Name Filter (secondary): " << counts.wrong_name[1] << endl - << "refpos Contig Filter (primary): " << counts.wrong_refpos[0] << endl - << "refpos Contig Filter (secondary): " << counts.wrong_refpos[1] << endl - << "Min Identity Filter (primary): " << counts.min_score[0] << endl - << "Min Identity Filter (secondary): " << counts.min_score[1] << endl - << "Max Overhang Filter (primary): " << counts.max_overhang[0] << endl - << "Max Overhang Filter (secondary): " << counts.max_overhang[1] << endl - << "Min End Match Filter (primary): " << counts.min_end_matches[0] << endl - << "Min End Match Filter (secondary): " << counts.min_end_matches[1] << endl - << "Split Read Filter (primary): " << counts.split[0] << endl - << "Split Read Filter (secondary): " << counts.split[1] << endl - << "Repeat Ends Filter (primary): " << counts.repeat[0] << endl - << "Repeat Ends Filter (secondary): " << counts.repeat[1] << endl - << "Min Quality Filter (primary): " << counts.min_mapq[0] << endl - << "Min Quality Filter (secondary): " << counts.min_mapq[1] << endl - << "Random Filter (primary): " << counts.random[0] << endl - << "Random Filter (secondary): " << counts.random[1] << endl - - - << endl; - } - - return 0; - -} - } diff --git a/src/readfilter.hpp b/src/readfilter.hpp index b893ff7a48f..454983a4644 100644 --- a/src/readfilter.hpp +++ b/src/readfilter.hpp @@ -6,9 +6,19 @@ #include #include #include +#include +#include + #include "vg.hpp" -#include "xg.hpp" -#include "vg.pb.h" +#include "handle.hpp" +#include "IntervalTree.h" +#include "annotation.hpp" +#include "multipath_alignment_emitter.hpp" +#include +#include +#include + +#include /** \file * Provides a way to filter and transform reads, implementing the bulk of the @@ -19,95 +29,92 @@ namespace vg{ using namespace std; +struct Counts; + +template class ReadFilter{ public: // Filtering parameters - /// Read name must have this prefix - string name_prefix; + + /// Actually take the complement of the filter + bool complement_filter = false; + /// Read name must have one of these prefixes, if any are present. + /// TODO: This should be a trie but I don't have one handy. + /// Must be sorted for vaguely efficient search. + vector name_prefixes; /// Read must not have a refpos set with a contig name containing a match to any of these vector excluded_refpos_contigs; - double min_secondary = 0.; - double min_primary = 0.; - // Should we rescore each alignment with default parameters and no e.g. - // haplotype info? + /// Read must contain at least one of these strings as a subsequence + vector subsequences; + /// If a read has one of the features in this set as annotations, the read + /// is filtered out. + unordered_set excluded_features; + double min_secondary = numeric_limits::lowest(); + double min_primary = numeric_limits::lowest(); + /// Should we rescore each alignment with default parameters and no e.g. + /// haplotype info? 
bool rescore = false; bool frac_score = false; bool sub_score = false; - int max_overhang = 99999; - int min_end_matches = 0; - int context_size = 0; + int max_overhang = numeric_limits::max() / 2; + int min_end_matches = numeric_limits::min() / 2; bool verbose = false; - double min_mapq = 0.; + double min_mapq = numeric_limits::lowest(); int repeat_size = 0; - // How far in from the end should we look for ambiguous end alignment to - // clip off? - int defray_length = 0; - // Limit defray recursion to visit this many nodes - int defray_count = 99999; - // Should we drop split reads that follow edges not in the graph? + /// Should we drop split reads that follow edges not in the graph? bool drop_split = false; - // We can also pseudorandomly drop reads. What's the probability that we keep a read? + + /// We can also pseudorandomly drop reads. What's the probability that we keep a read? double downsample_probability = 1.0; - // Samtools-compatible internal seed mask, for deciding which read pairs to keep. - // To be generated with rand() after srand() from the user-visible seed. - uint32_t downsample_seed_mask = 0; - // default to 1 thread (as opposed to all) - int threads = 1; - - // Keep some basic counts for when verbose mode is enabled - struct Counts { - vector read; - vector filtered; - vector wrong_name; - vector wrong_refpos; - vector min_score; - vector max_overhang; - vector min_end_matches; - vector min_mapq; - vector split; - vector repeat; - vector defray; - vector random; - Counts() : read(2, 0), filtered(2, 0), wrong_name(2, 0), wrong_refpos(2, 0), - min_score(2, 0), max_overhang(2, 0), min_end_matches(2, 0), - min_mapq(2, 0), split(2, 0), repeat(2, 0), defray(2, 0), random(2, 0) {} - Counts& operator+=(const Counts& other) { - for (int i = 0; i < 2; ++i) { - read[i] += other.read[i]; - filtered[i] += other.filtered[i]; - wrong_name[i] += other.wrong_name[i]; - wrong_refpos[i] += other.wrong_refpos[i]; - min_score[i] += other.min_score[i]; - max_overhang[i] += other.max_overhang[i]; - min_end_matches[i] += other.min_end_matches[i]; - min_mapq[i] += other.min_mapq[i]; - split[i] += other.split[i]; - repeat[i] += other.repeat[i]; - defray[i] += other.defray[i]; - random[i] += other.random[i]; - } - return *this; - } - }; + /// Samtools-compatible internal seed mask, for deciding which read pairs to keep. + /// To be generated with rand() after srand() from the user-visible seed. + uint32_t downsample_seed_mask = 0; - // Extra filename things we need for chunking. TODO: refactor that somehow - // to maybe be a different class? - string regions_file; - string outbase; - bool append_regions = false; + /// How far in from the end should we look for ambiguous end alignment to + /// clip off? + int defray_length = 0; + /// Limit defray recursion to visit this many nodes + int defray_count = 99999; + + /// Filter to proper pairs + bool only_proper_pairs = true; + + /// Filter to only mapped reads + bool only_mapped = true; + + /// Number of threads from omp + int threads = -1; + /// GAM output buffer size + int buffer_size = 512; + /// Sometimes we only want a report, and not a filtered gam. toggling off output + /// speeds things up considerably. 
+ bool write_output = true; + /// A HandleGraph is required for some filters (Note: ReadFilter doesn't own/free this) + const HandleGraph* graph = nullptr; + /// Interleaved input + bool interleaved = false; + /// When outputting paired reads, fail the pair only if both (all) reads + /// fail (true) instead of if either (any) read fails (false) + bool filter_on_all = false; + + // minimum base quality as PHRED score + int min_base_quality = numeric_limits::min() / 2; + // minimum fraction of bases in reads that must have quality at least + double min_base_quality_fraction = numeric_limits::lowest(); + + /** + * Run all the filters on an alignment. The alignment may get modified in-place by the defray filter + */ + Counts filter_alignment(Read& aln); /** * Filter the alignments available from the given stream, placing them on * standard output or in the appropriate file. Returns 0 on success, exit * code to use on error. * - * If an XG index is required, use the specified one. If one is required and - * not provided, the function will complain and return nonzero. - * - * TODO: Refactor to be less CLI-aware and more modular-y. */ - int filter(istream* alignment_stream, xg::XG* xindex = nullptr); + int filter(istream* alignment_stream); /** * Look at either end of the given alignment, up to k bases in from the end. @@ -122,31 +129,36 @@ class ReadFilter{ * * MUST NOT be called with a null index. */ - bool trim_ambiguous_ends(xg::XG* index, Alignment& alignment, int k); + bool trim_ambiguous_ends(Read& read, int k) const; private: /** -  * quick and dirty filter to see if removing reads that can slip around + * quick and dirty filter to see if removing reads that can slip around * and still map perfectly helps vg call. returns true if at either * end of read sequence, at least k bases are repetitive, checking repeats * of up to size 2k */ - bool has_repeat(Alignment& aln, int k); + bool has_repeat(const Read& read, int k) const; + + /** + * Check if the alignment includes any aligned bases + */ + bool is_mapped(const Read& read) const; /** * Trim only the end of the given alignment, leaving the start alone. Two * calls of this implement trim_ambiguous_ends above. */ - bool trim_ambiguous_end(xg::XG* index, Alignment& alignment, int k); + bool trim_ambiguous_end(Alignment& alignment, int k) const; /** - * Return false if the read only follows edges in the xg index, and true if + * Return false if the read only follows edges in the graph, and true if * the read is split (or just incorrect) and takes edges not in the index. * - * Throws an error if no XG index is specified. + * Throws an error if no graph is specified. */ - bool is_split(xg::XG* index, Alignment& alignment); + bool is_split(const Read& read) const; /** * Based on the read name and paired-ness, compute the SAM-style QNAME and @@ -155,9 +167,1159 @@ class ReadFilter{ * kept. Returns true if the read should stay, and false if it should be * removed. Always accepts or rejects paired reads together. */ - bool sample_read(const Alignment& read); + bool sample_read(const Read& read) const; + + /** + * Convert a multipath alignment to a single path + */ + Alignment to_alignment(const MultipathAlignment& multipath_aln) const; + + /** + * Get the score indicated by the params + */ + double get_score(const Read& read) const; + + /** + * Does the read name have one of the indicated prefixes? + */ + bool matches_name(const Read& read) const; + + /** + * Does the read match one of the excluded refpos contigs? 
+ */ + bool has_excluded_refpos(const Read& read) const; + + /** + * Is the read annotated with any of the excluded features + */ + bool has_excuded_feature(const Read& read) const; + + /** + * Is the read a secondary alignments? + */ + bool is_secondary(const Read& read) const; + + /** + * How long are the read overhangs? + */ + int get_overhang(const Read& read) const; + + /** + * Internal helper for get_overhang + */ + int alignment_overhang(const Alignment& aln) const; + + /** + * What is the shortest run of end matches on either end? + */ + int get_end_matches(const Read& read) const; + + /** + * internal helper for get_end_matches + */ + int alignment_end_matches(const Alignment& aln) const; + + /** + * What is the read's mapping quality + */ + int get_mapq(const Read& read) const; + + /** + * What fraction of the base qualities are at least as large as the min base quality + */ + double get_min_base_qual_fraction(const Read& read) const; + + /** + * Is the read paired? + */ + bool get_is_paired(const Read& read) const; + + /** + * Is the read in a proper-mapped pair? + */ + bool is_proper_pair(const Read& read) const; + + /** + * Does the read contain at least one of the indicated sequences + */ + bool contains_subsequence(const Read& read) const; + + /** + * Write the read to stdout + */ + void emit(Read& read); + + /** + * Write a read pair to stdout + */ + void emit(Read& read1, Read& read2); + + /// The twp specializations have different writing infrastructure + unique_ptr aln_emitter; + unique_ptr mp_aln_emitter; + + /// Helper function for filter + void filter_internal(istream* in); }; + +// Keep some basic counts for when verbose mode is enabled +struct Counts { + // note: "last" must be kept as the final value in this enum + enum FilterName { read = 0, wrong_name, wrong_refpos, excluded_feature, min_score, min_sec_score, max_overhang, + min_end_matches, min_mapq, split, repeat, defray, defray_all, random, min_base_qual, subsequence, filtered, + proper_pair, unmapped, last}; + vector counts; + Counts () : counts(FilterName::last, 0) {} + Counts& operator+=(const Counts& other) { + for (int i = 0; i < FilterName::last; ++i) { + counts[i] += other.counts[i]; + } + return *this; + } + /// If any read was filtered, count the other read as filtered + Counts& set_paired_any() { + for (int i = 0; i < FilterName::last; ++i) { + counts[i] = counts[i] == 1 ? 2 : counts[i]; + } + return *this; + } + /// If not all reads were filtered, count filtered ones as unfiltered. + Counts& set_paired_all() { + // We know that the pair as a whole was filtered out if counts[FilterName::filtered] == 2, and that it was kept otherwise. + if (counts[FilterName::filtered] != 2) { + // The read pair was not discarded, so clear out all the fail counts (including FilterName::filtered). + for (int i = 0; i < FilterName::last; ++i) { + counts[i] = 0; + } + } + // Otherwise the read pair was discarded, so leave all the + // counts alone. There is at least one fail on each side to be + // responsible for it (and if not verbose, only one fail on + // each side). 
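// --- Example (sketch): how paired reads are kept or dropped ----------------
// The set_paired_any() / set_paired_all() logic here, together with the
// complement_filter check applied in filter_internal() further down, reduces
// to the decision table below. end1_fails / end2_fails say whether each end
// tripped any filter. This helper is illustrative only and is not used by the
// class.
inline bool example_keep_pair(bool end1_fails, bool end2_fails,
                              bool filter_on_all, bool complement_filter) {
    bool pair_fails = filter_on_all ? (end1_fails && end2_fails)   // both ends must fail
                                    : (end1_fails || end2_fails);  // either end failing is enough
    bool keep = !pair_fails;
    // complement_filter inverts the selection: emit exactly the reads the
    // filters would normally discard.
    return keep != complement_filter;
}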
+ return *this; + } + void reset() { + std::fill(counts.begin(), counts.end(), 0); + } + bool keep() { + return counts[FilterName::filtered] == 0; + } +}; +ostream& operator<<(ostream& os, const Counts& counts); + + +/** + * Template implementations + */ + +template +void ReadFilter::filter_internal(istream* in) { + + // keep counts of what's filtered to report (in verbose mode) + vector counts_vec(threads); + + function lambda = [&](Read& read) { +#ifdef debug + cerr << "Encountered read named \"" << read.name() << "\" with " << read.sequence().size() + << " bp sequence and " << read.quality().size() << " quality values" << endl; +#endif + Counts read_counts = filter_alignment(read); + counts_vec[omp_get_thread_num()] += read_counts; + if ((read_counts.keep() != complement_filter) && write_output) { + emit(read); + } + }; + + function pair_lambda = [&](Read& read1, Read& read2) { + Counts read_counts = filter_alignment(read1); + read_counts += filter_alignment(read2); + if (filter_on_all) { + // Unless both reads were filtered out (total filtered count == 2), keep the read. + read_counts.set_paired_all(); + } else { + // Either read failing is sufficient to scuttle the pair. + // So if we filter out one end for any reason, we filter out the other as well. + read_counts.set_paired_any(); + } + counts_vec[omp_get_thread_num()] += read_counts; + if ((read_counts.keep() != complement_filter) && write_output) { + emit(read1, read2); + } + }; + + if (interleaved) { + vg::io::for_each_interleaved_pair_parallel(*in, pair_lambda); + } else { + vg::io::for_each_parallel(*in, lambda); + } + + if (verbose) { + Counts& counts = counts_vec[0]; + for (int i = 1; i < counts_vec.size(); ++i) { + counts += counts_vec[i]; + } + cerr << counts; + } +} + +template<> +inline int ReadFilter::filter(istream* alignment_stream) { + + if(defray_length > 0 && graph == nullptr) { + cerr << "HandleGraph (e.g. XG) required for end de-fraying" << endl; + return 1; + } + + if (write_output) { + // Keep an AlignmentEmitter to multiplex output from multiple threads. + aln_emitter = get_non_hts_alignment_emitter("-", "GAM", map(), get_thread_count()); + } + + filter_internal(alignment_stream); + + return 0; +} + +template<> +inline int ReadFilter::filter(istream* alignment_stream) { + + if (defray_length > 0) { + cerr << "Cannot defray multipath alignments" << endl; + return 1; + } + if (!excluded_refpos_contigs.empty()) { + cerr << "Cannot filter multipath alignments by ref pos" << endl; + return 1; + } + + if (write_output) { + // Keep an AlignmentEmitter to multiplex output from multiple threads. 
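// --- Example (sketch): configuring and running a ReadFilter ----------------
// Illustrative only: shows how a caller (for example the vg filter subcommand)
// might fill in the public parameters and run the filter on a GAM stream. The
// values and the helper name run_example_filter are made up for illustration;
// name_prefixes must be kept sorted for the prefix search used by matches_name().
inline int run_example_filter(istream& gam_in) {
    ReadFilter<Alignment> f;
    f.name_prefixes = {"sample1_", "sample2_"};   // keep reads whose names start with these (sorted)
    f.min_mapq = 30;                              // drop reads with mapping quality below 30
    f.downsample_probability = 0.5;               // pseudorandomly keep about half of the read pairs
    f.verbose = true;                             // print per-filter counts to stderr when done
    return f.filter(&gam_in);                     // reads GAM from gam_in, writes filtered GAM to stdout
}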
+ mp_aln_emitter = unique_ptr(new MultipathAlignmentEmitter("-", get_thread_count())); + } + + filter_internal(alignment_stream); + + return 0; +} + +template<> +inline void ReadFilter::emit(Alignment& aln) { + aln_emitter->emit_single(std::move(aln)); +} + +template<> +inline void ReadFilter::emit(Alignment& aln1, Alignment& aln2) { + aln_emitter->emit_pair(std::move(aln1), std::move(aln2)); +} + +template<> +inline void ReadFilter::emit(MultipathAlignment& mp_aln) { + vector emit_vec(1); + from_proto_multipath_alignment(mp_aln, emit_vec.front()); + mp_aln_emitter->emit_singles(mp_aln.name(), std::move(emit_vec)); +} + +template<> +inline void ReadFilter::emit(MultipathAlignment& mp_aln1, MultipathAlignment& mp_aln2) { + vector> emit_vec(1); + from_proto_multipath_alignment(mp_aln1, emit_vec.front().first); + from_proto_multipath_alignment(mp_aln2, emit_vec.front().second); + mp_aln_emitter->emit_pairs(mp_aln1.name(), mp_aln2.name(), std::move(emit_vec)); +} + +template +Counts ReadFilter::filter_alignment(Read& read) { + Counts counts; + + ++counts.counts[Counts::FilterName::read]; + bool keep = true; + // filter (current) alignment + if (!name_prefixes.empty()) { + if (!matches_name(read)) { + // There are prefixes and we don't match any, so drop the read. + ++counts.counts[Counts::FilterName::wrong_name]; + keep = false; + } + } + if ((keep || verbose) && !subsequences.empty()) { + if (!contains_subsequence(read)) { + // There are subsequences and we don't match any, so drop the read. + ++counts.counts[Counts::FilterName::subsequence]; + keep = false; + } + } + if ((keep || verbose) && only_proper_pairs) { + if (!is_proper_pair(read)) { + ++counts.counts[Counts::FilterName::proper_pair]; + keep = false; + } + } + if ((keep || verbose) && !excluded_refpos_contigs.empty()) { + if (has_excluded_refpos(read)) { + ++counts.counts[Counts::FilterName::wrong_refpos]; + keep = false; + } + } + if ((keep || verbose) && !excluded_features.empty()) { + if (has_excuded_feature(read)) { + ++counts.counts[Counts::FilterName::excluded_feature]; + keep = false; + } + } + double score = get_score(read); + bool secondary = is_secondary(read); + if ((keep || verbose) && !secondary && score < min_primary) { + ++counts.counts[Counts::FilterName::min_score]; + keep = false; + } + if ((keep || verbose) && secondary && score < min_secondary) { + ++counts.counts[Counts::FilterName::min_sec_score]; + keep = false; + } + if ((keep || verbose) && max_overhang > 0) { + if (get_overhang(read) > max_overhang) { + ++counts.counts[Counts::FilterName::max_overhang]; + keep = false; + } + } + if ((keep || verbose) && min_end_matches > 0) { + if (get_end_matches(read) < min_end_matches) { + ++counts.counts[Counts::FilterName::min_end_matches]; + keep = false; + } + } + if ((keep || verbose) && min_mapq > 0) { + if (get_mapq(read) < min_mapq) { + ++counts.counts[Counts::FilterName::min_mapq]; + keep = false; + } + } + if ((keep || verbose) && min_base_quality > 0 && min_base_quality_fraction > 0.0) { + if (get_min_base_qual_fraction(read) < min_base_quality_fraction) { + ++counts.counts[Counts::FilterName::min_base_qual]; + keep = false; + } + } + if ((keep || verbose) && drop_split) { + if (is_split(read)) { + ++counts.counts[Counts::FilterName::split]; + keep = false; + } + } + if ((keep || verbose) && repeat_size > 0) { + if (has_repeat(read, repeat_size)) { + ++counts.counts[Counts::FilterName::repeat]; + keep = false; + } + } + if ((keep || verbose) && defray_length) { + ++counts.counts[Counts::FilterName::defray]; + 
if (trim_ambiguous_ends(read, defray_length)) { + // We keep these, because the alignments get modified. + // Unless the *entire* read gets trimmed + if (read.sequence().empty()) { + keep = false; + ++counts.counts[Counts::FilterName::defray_all]; + } + } + } + if ((keep || verbose) && only_mapped) { + if (!is_mapped(read)) { + ++counts.counts[Counts::FilterName::unmapped]; + keep = false; + } + } + if ((keep || verbose) && downsample_probability != 1.0) { + if (!sample_read(read)) { + ++counts.counts[Counts::FilterName::random]; + keep = false; + } + } + + if (!keep) { + ++counts.counts[Counts::FilterName::filtered]; + } + + return counts; +} + +template +Alignment ReadFilter::to_alignment(const MultipathAlignment& multipath_aln) const { + multipath_alignment_t mp_aln; + from_proto_multipath_alignment(multipath_aln, mp_aln); + Alignment aln; + optimal_alignment(mp_aln, aln); + return aln; +} + +template<> +inline double ReadFilter::get_score(const Alignment& aln) const { + double score = (double)aln.score(); + double denom = aln.sequence().length(); + // toggle substitution score + if (sub_score == true) { + // hack in ident to replace old counting logic. + score = aln.identity() * aln.sequence().length(); + assert(score <= denom); + } else if (rescore == true) { + // We need to recalculate the score with the base aligner always + const static Aligner unadjusted; + GSSWAligner* aligner = (GSSWAligner*)&unadjusted; + // Also use the score + score = aligner->score_contiguous_alignment(aln); + } + + // toggle absolute or fractional score + if (frac_score) { + if (denom > 0.) { + score /= denom; + } + else { + assert(score == 0.); + } + } + return score; +} + +template<> +inline double ReadFilter::get_score(const MultipathAlignment& read) const { + multipath_alignment_t mp_aln; + from_proto_multipath_alignment(read, mp_aln); + double score; + if (!sub_score && !rescore) { + score = optimal_alignment_score(mp_aln); + } + else { + Alignment aln; + optimal_alignment(mp_aln, aln); + if (sub_score) { + score = identity(aln.path()); + } + else { + const static Aligner unadjusted; + GSSWAligner* aligner = (GSSWAligner*)&unadjusted; + score = aligner->score_contiguous_alignment(aln); + } + } + + if (frac_score && read.sequence().size()) { + score /= read.sequence().size(); + } + return score; +} + +template +bool ReadFilter::matches_name(const Read& aln) const { + bool keep = true; + // filter (current) alignment + if (!name_prefixes.empty()) { + // Make sure we match at least one name prefix + + bool found = false; + + // Do a binary search for the closest prefix and see if all of any prefix exists. + // We assume the prefixes are sorted. 
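// --- Example (sketch): what the score toggles do ---------------------------
// A side-effect-free restatement of the single-path get_score() above,
// ignoring the rescore option. With sub_score the score becomes the number of
// matching bases (identity * length); with frac_score it is normalized by the
// read length, so min_primary / min_secondary can be given as fractions.
// Illustrative helper, not used by the class.
inline double example_effective_score(double raw_score, double identity,
                                      size_t read_length,
                                      bool sub_score, bool frac_score) {
    double denom = read_length;
    double score = sub_score ? identity * denom : raw_score;
    if (frac_score && denom > 0.0) {
        score /= denom;   // e.g. a perfect match scores 1.0 when sub_score is also set
    }
    return score;
}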
+ size_t left_bound = 0; + size_t left_match = 0; + while (left_match < name_prefixes[left_bound].size() && + left_match < aln.name().size() && + name_prefixes[left_bound][left_match] == aln.name()[left_match]) { + // Scan all the matches at the start + left_match++; + } + + size_t right_bound = name_prefixes.size() - 1; + size_t right_match = 0; + while (right_match < name_prefixes[right_bound].size() && + right_match < aln.name().size() && + name_prefixes[right_bound][right_match] == aln.name()[right_match]) { + // Scan all the matches at the end + right_match++; + } + + if (left_match == name_prefixes[left_bound].size() || right_match == name_prefixes[right_bound].size()) { + // We found a match already + found = true; + } else { + while (left_bound + 1 < right_bound) { + // Until we run out of unexamined prefixes, do binary search + size_t center = (left_bound + right_bound) / 2; + // No need to re-check any common prefix + size_t center_match = min(left_match, right_match); + + while (center_match < name_prefixes[center].size() && + center_match < aln.name().size() && + name_prefixes[center][center_match] == aln.name()[center_match]) { + // Scan all the matches here + center_match++; + } + + if (center_match == name_prefixes[center].size()) { + // We found a hit! + found = true; + break; + } + + if (center_match == aln.name().size() || + name_prefixes[center][center_match] > aln.name()[center_match]) { + // The match, if it exists, must be before us + right_bound = center; + right_match = center_match; + } + else { + // The match, if it exists, must be after us. + left_bound = center; + left_match = center_match; + } + } + } + + if (!found) { + // There are prefixes and we don't match any, so drop the read. + keep = false; + } + } + return keep; +} + +template<> +inline bool ReadFilter::has_excluded_refpos(const MultipathAlignment& read) const { + // TODO: multipath alignments don't record refpos + return false; +} + +template<> +inline bool ReadFilter::has_excluded_refpos(const Alignment& aln) const { + bool found_match = false; + if (!excluded_refpos_contigs.empty() && aln.refpos_size() != 0) { + // We have refpos exclusion filters and a refpos is set. + // We need to bang every refpos anme against every filter. + + for (auto& expression : excluded_refpos_contigs) { + for (auto& refpos : aln.refpos()) { + if (regex_search(refpos.name(), expression)) { + // We don't want this read because of this match + found_match = true; + break; + } + } + if (found_match) { + break; + } + } + } + return found_match; +} + +template +bool ReadFilter::has_excuded_feature(const Read& read) const { + bool found_match = false; + vector features(get_annotation>(read, "features")); + + for (auto& feature : features) { + if (excluded_features.count(feature)) { + // If the read has any banned features, fail it. 
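// --- Example (sketch): the read-name prefix check, stated simply -----------
// matches_name() above implements this test with a single binary-search-style
// scan over the sorted prefix list; this linear version has the same
// semantics and may be easier to follow. Illustrative only.
inline bool example_matches_any_prefix(const vector<string>& sorted_prefixes,
                                       const string& name) {
    for (const string& prefix : sorted_prefixes) {
        if (name.size() >= prefix.size() &&
            std::equal(prefix.begin(), prefix.end(), name.begin())) {
            return true;   // the name starts with this prefix
        }
    }
    return false;
}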
+ found_match = true; + break; + } + } + return found_match; +} + +template<> +inline bool ReadFilter::is_secondary(const MultipathAlignment& mp_aln) const { + return get_annotation(mp_aln, "secondary"); +} + +template<> +inline bool ReadFilter::is_secondary(const Alignment& aln) const { + return aln.is_secondary(); +} + +template +int ReadFilter::alignment_overhang(const Alignment& aln) const { + int overhang = 0; + if (aln.path().mapping_size() > 0) { + const auto& left_mapping = aln.path().mapping(0); + if (left_mapping.edit_size() > 0) { + overhang = left_mapping.edit(0).to_length() - left_mapping.edit(0).from_length(); + } + const auto& right_mapping = aln.path().mapping(aln.path().mapping_size() - 1); + if (right_mapping.edit_size() > 0) { + const auto& edit = right_mapping.edit(right_mapping.edit_size() - 1); + overhang = max(overhang, edit.to_length() - edit.from_length()); + } + } + else { + overhang = aln.sequence().size(); + } + return overhang; +} + +template<> +inline int ReadFilter::get_overhang(const Alignment& aln) const { + return alignment_overhang(aln); +} + +template<> +inline int ReadFilter::get_overhang(const MultipathAlignment& mp_aln) const { + return alignment_overhang(to_alignment(mp_aln)); +} + +template +int ReadFilter::alignment_end_matches(const Alignment& aln) const { + // compute end matches. + int left_end_matches = 0; + // from the left + for (int i = 0; i < aln.path().mapping_size() && left_end_matches < min_end_matches; ++i) { + for (int j = 0; j < aln.path().mapping(i).edit_size() && left_end_matches < min_end_matches; ++j) { + const Edit& edit = aln.path().mapping(i).edit(j); + if (edit.from_length() == edit.to_length() && edit.sequence().empty()) { + left_end_matches += edit.to_length(); + } else { + i = aln.path().mapping_size(); + break; + } + } + } + int right_end_matches = 0; + // from the right + for (int i = aln.path().mapping_size() - 1; i >= 0 && right_end_matches < min_end_matches; --i) { + for (int j = aln.path().mapping(i).edit_size() - 1; j >= 0 && right_end_matches < min_end_matches; --j) { + const Edit& edit = aln.path().mapping(i).edit(j); + if (edit.from_length() == edit.to_length() && edit.sequence().empty()) { + right_end_matches += edit.to_length(); + } else { + i = -1; + break; + } + } + } + return min(left_end_matches, right_end_matches); +} + +template<> +inline int ReadFilter::get_end_matches(const MultipathAlignment& read) const { + return alignment_end_matches(to_alignment(read)); +} + +template<> +inline int ReadFilter::get_end_matches(const Alignment& read) const { + return alignment_end_matches(read); +} + +template +int ReadFilter::get_mapq(const Read& read) const { + return read.mapping_quality(); +} + +template +double ReadFilter::get_min_base_qual_fraction(const Read& read) const { + int mq_count = 0; + const string& base_qualities = read.quality(); + for (int i = 0; i < base_qualities.length(); ++i) { + if (short(base_qualities[i]) >= min_base_quality) { + ++mq_count; + } + } + return (double)mq_count / (double)base_qualities.size() < min_base_quality_fraction; +} + +template<> +inline bool ReadFilter::is_split(const Alignment& alignment) const { + if(graph == nullptr) { + // Can't tell if the read is split. + throw runtime_error("HandleGraph (e.g. 
XG) required to check for split reads"); + } + + handle_t prev; + for(size_t i = 0; i + 1 < alignment.path().mapping_size(); i++) { + if (i == 0) { + const auto& pos = alignment.path().mapping(i).position(); + prev = graph->get_handle(pos.node_id(), pos.is_reverse()); + } + const auto& pos = alignment.path().mapping(i + 1).position(); + handle_t here = graph->get_handle(pos.node_id(), pos.is_reverse()); + + // Can we find the same articulation of the edge as the alignment uses + + if(!graph->has_edge(prev, here)) { + // We found a skip! + if(verbose) { + cerr << "Warning: read " << alignment.name() << " has an unknown edge " + << graph->get_id(prev) << (graph->get_is_reverse(prev) ? "-" : "+") << " -> " << graph->get_id(here) << (graph->get_is_reverse(here) ? "-" : "+") + << ". Removing!" << endl; + } + return true; + } + + prev = here; + } + + // No wandering jumps between nodes found + return false; +} + +template<> +inline bool ReadFilter::is_split(const MultipathAlignment& mp_aln) const { + bool found_connection = false; + for (const auto& subpath : mp_aln.subpath()) { + if (subpath.connection_size() != 0) { + found_connection = true; + break; + } + } + return found_connection; +} + +// quick and dirty filter to see if removing reads that can slip around +// and still map perfectly helps vg call. returns true if at either +// end of read sequence, at least k bases are repetitive, checking repeats +// of up to size 2k +template +bool ReadFilter::has_repeat(const Read& read, int k) const { + if (k == 0) { + return false; + } + const string& s = read.sequence(); + for (int i = 1; i <= 2 * k; ++i) { + int covered = 0; + bool ffound = true; + bool bfound = true; + for (int j = 1; (ffound || bfound) && (j + 1) * i < s.length(); ++j) { + ffound = ffound && s.substr(0, i) == s.substr(j * i, i); + bfound = bfound && s.substr(s.length() - i, i) == s.substr(s.length() - i - j * i, i); + if (ffound || bfound) { + covered += i; + } + } + if (covered >= k) { + return true; + } + } + return false; +} + +template<> +inline bool ReadFilter::trim_ambiguous_ends(Alignment& alignment, int k) const { + assert(graph != nullptr); + + // Define a way to get node length, for flipping alignments + function get_node_length = [&](id_t node) { + return graph->get_length(graph->get_handle(node)); + }; + + // Because we need to flip the alignment, make sure it is new-style and + // doesn't have any Mappings with no Edits. + for(size_t i = 0; i < alignment.path().mapping_size(); i++) { + if(alignment.path().mapping(i).edit_size() == 0) { + // Complain! + throw runtime_error("Found mapping with no edits in " + pb2json(alignment)); + } + } + + // TODO: we're going to flip the alignment twice! This is a waste of time! + // Define some kind of oriented view or something, or just two duplicated + // trimming functions, so we can just trim once without flipping. + + // Trim the end + bool end_changed = trim_ambiguous_end(alignment, k); + // Flip and trim the start + + Alignment flipped = reverse_complement_alignment(alignment, get_node_length); + + if(trim_ambiguous_end(flipped, k)) { + // The start needed trimming + + // Flip the trimmed flipped alignment back + alignment = reverse_complement_alignment(flipped, get_node_length); + // We definitely changed something + return true; + } + + // We maybe changed something + return end_changed; +} + +template<> +inline bool ReadFilter::trim_ambiguous_ends(MultipathAlignment& aln, int k) const { + // TODO: apply this filter to mp alns? 
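// --- Example (sketch): what counts as a repetitive end ---------------------
// has_repeat() above asks, roughly, whether for some period p up to 2k at
// least k bases at one end of the read are extra copies of the terminal p
// bases. This illustrative helper counts the repeated bases after the first
// copy at the start of a sequence for one period, e.g.
// example_leading_repeat_bases("ATATATATGG", 2) == 6 (three extra "AT" copies).
inline size_t example_leading_repeat_bases(const string& s, size_t period) {
    if (period == 0 || s.size() < period) {
        return 0;
    }
    size_t repeated = 0;
    while (repeated + 2 * period <= s.size() &&
           s.compare(repeated + period, period, s, 0, period) == 0) {
        repeated += period;
    }
    return repeated;
}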
+ return false; +} + +template +bool ReadFilter::trim_ambiguous_end(Alignment& alignment, int k) const { + // What mapping in the alignment is the leftmost one starting in the last k + // bases? (Except when that would be the first mapping, we use the second.) + // Start out with it set to the past-the-end value. + size_t trim_start_mapping = alignment.path().mapping_size(); + + // How many real non-softclip bases have we seen reading in from the end of + // the read? + size_t real_base_count = 0; + // How many softclip bases have we seen in from the end of the read? + size_t softclip_base_count = 0; + for(size_t i = alignment.path().mapping_size() - 1; i != -1 && i != 0; i--) { + // Scan in from the end of the read. + + auto* mapping = alignment.mutable_path()->mutable_mapping(i); + + // We should always have edits in our mappings. + assert(mapping->edit_size() > 0); + + for(int j = mapping->edit_size() - 1; j != -1; j--) { + // Visit every edit in the mapping + auto& edit = mapping->edit(j); + + + if(real_base_count == 0 && edit.from_length() == 0) { + // This is a trailing insert. Put it as a softclip + softclip_base_count += edit.to_length(); + } else { + // This is some other kind of thing. Record it as real bases. + real_base_count += edit.to_length(); + } + } + + if(real_base_count <= k) { + // This mapping starts fewer than k non-softclipped alignment + // bases from the end of the read. + trim_start_mapping = i; + } else { + // This mapping starts more than k in from the end. So the + // previous one, if we had one, must be the right one. + break; + } + } + + if(trim_start_mapping == alignment.path().mapping_size()) { + // No mapping was found that starts within the last k non-softclipped + // bases. So there's nothing to do. + return false; + } + + if(real_base_count == 0) { + // We have an anchoring mapping, but all the mappings we could trim are + // softclips, so there's no point. TODO: will we ever get softclips + // placed as the only thing on a node? + return false; + } + + // Which is the last assumed-non-ambiguous mapping from which we can anchor + // our search? + size_t root_mapping = trim_start_mapping - 1; + + // What's the sequence, including that root node, that we are looking for? + // We need the sequence of the nodes, rather than the read's sequence, + // because you can still be ambiguous even if you have a SNP on top of the + // ambiguous thing. + + // We need to ignore all the offsets and from_lengths, except for the from + // length on the last node to let us know if we end early. It's sort of + // nonsense to have offsets and non-full from_lengths on internal mappings, + // and everything is easiest if we use the full length sequence of the root + // node. + stringstream target_sequence_stream; + for(size_t i = root_mapping; i < alignment.path().mapping_size(); i++) { + // Collect the appropriately oriented from sequence from each mapping + auto& mapping = alignment.path().mapping(i); + handle_t handle = graph->get_handle(mapping.position().node_id(), + mapping.position().is_reverse()); + string sequence = graph->get_sequence(handle); + + if(i == root_mapping) { + // Use the full length of the node and ignore any offset + target_sequence_stream << sequence; + } else { + // Use the offset plus the total from_length of all the + // edits (in case we're the last node and ending early). We made + // sure all non-root nodes had edits earlier. 
+ + size_t from_length = mapping.position().offset(); + for(size_t j = 0; j < mapping.edit_size(); j++) { + from_length += mapping.edit(j).from_length(); + } + + // Put in the sequence that the mapping visits + target_sequence_stream << sequence.substr(0, from_length); + } + } + string target_sequence = target_sequence_stream.str(); + +#ifdef debug +#pragma omp critical(cerr) + cerr << "Need to look for " << target_sequence << " right of mapping " << root_mapping << endl; +#endif + + // We're not going to recurse hundreds of nodes deep, so we can use the real + // stack and a real recursive function. + + // Do the DFS into the given node, after already having matched the given + // number of bases of the target sequence. See if you can match any more + // bases of the target sequence. + + // Return the total number of leaves in all subtrees that match the full + // target sequence, and the depth in bases of the shallowest point at which + // multiple subtrees with full lenght matches are unified. + + // We keep a maximum number of visited nodes here, just to prevent recursion + // from going on forever in worst-case-type graphs + size_t dfs_visit_count = 0; + function(const handle_t&, size_t)> do_dfs = + [&](const handle_t& handle, size_t matched) -> pair { + + ++dfs_visit_count; + + // Grab the node sequence and match more of the target sequence. + string node_sequence = graph->get_sequence(handle); + +#ifdef debug +#pragma omp critical(cerr) + cerr << "Node " << graph->get_id(handle) << " " << (graph->get_is_reverse(handle) ? "rev" : "fwd") << ": " + << node_sequence << " at offset " << matched << " in " << target_sequence << endl; +#endif + + // Now count up the new matches between this node and the target sequence. + size_t new_matches; + for( + // Start with no matches + new_matches = 0; + // Keep going as long as we're inside both strings and haven't had a mismatch + new_matches < node_sequence.size() && + matched + new_matches < target_sequence.size() && + node_sequence[new_matches] == target_sequence[matched + new_matches]; + // Count up all the matches we find + new_matches++ + ); + + if(matched + new_matches == target_sequence.size()) { + // We found a tail end of a complete match of the target sequence + // on this node. + +#ifdef debug +#pragma omp critical(cerr) + cerr << "Node " << node_id << " is a matching leaf" << endl; +#endif + + // Return one match and unification at full length (i.e. nothing can + // be discarded). + return make_pair(1, target_sequence.size()); + } + + if(new_matches < node_sequence.size()) { + // We didn't make it all the way through this node, nor did we + // finish the target sequence; there's a mismatch between the node + // and the target sequence. + +#ifdef debug +#pragma omp critical(cerr) + cerr << "Node " << node_id << " has a mismatch" << endl; +#endif + + // If we mismatch, return 0 matches and unification at full length. + return make_pair(0, target_sequence.size()); + } + + // If we get through the whole node sequence without mismatching or + // running out of target sequence, keep going. + +#ifdef debug +#pragma omp critical(cerr) + cerr << "Node " << graph->get_id(handle) << " has " << new_matches << " internal new matches" << endl; +#endif + + // We're going to call all the children and collect the results, and + // then aggregate them. It might be slightly faster to aggregate while + // calling, but that might be less clear. 
+ vector> child_results; + + graph->follow_edges(handle, false, [&](const handle_t& next) { + if (dfs_visit_count < defray_count) { + child_results.push_back(do_dfs(next, matched + node_sequence.size())); + return true; + } + else { +#ifdef debug +#pragma omp critical(cerr) + cerr << "Aborting read filter DFS at node " << graph->get_id(next) << " after " << dfs_visit_count << " visited" << endl; +#endif + return false; + } + }); + + // Sum up the total leaf matches, which will be our leaf match count. + size_t total_leaf_matches = 0; + // If we don't find multiple children with leaf matches, report + // unification at the min unification depth of any subtree (and there + // will only be one that isn't at full length). + size_t children_with_leaf_matches = 0; + size_t unification_depth = target_sequence.size(); + + for(auto& result : child_results) { + total_leaf_matches += result.first; + if(result.first > 0) { + children_with_leaf_matches++; + } + unification_depth = min(unification_depth, result.second); + } + if(children_with_leaf_matches > 1) { + // If multiple children have nonzero leaf match counts, report + // unification at the end of this node. + unification_depth = matched + node_sequence.size(); + } + + return make_pair(total_leaf_matches, unification_depth); + }; + + // Search from the root mapping's node looking right in its orientation in + // the mapping + const auto& root_pos = alignment.path().mapping(root_mapping).position(); + auto result = do_dfs(graph->get_handle(root_pos.node_id(), root_pos.is_reverse()), 0); + +#ifdef debug +#pragma omp critical(cerr) + cerr << "Found " << result.first << " matching leaves with closest unification at " << result.second << endl; +#endif + + // We keep this much of the target sequence. + size_t target_sequence_to_keep = result.second; + + if(target_sequence_to_keep == target_sequence.size()) { + // Nothing to trim! + return false; + } + + // Figure out how many mappings we need to keep from the root in order to + // get that much sequence; we know it is either full length or at a mapping + // boundary. We handle the root special because it's always full length and + // we have to cut after its end. + size_t kept_sequence_accounted_for = graph->get_length(graph->get_handle(alignment.path().mapping(root_mapping).position().node_id())); + size_t first_mapping_to_drop; + for(first_mapping_to_drop = root_mapping + 1; + first_mapping_to_drop < alignment.path().mapping_size(); + first_mapping_to_drop++) { + // Consider starting dropping at each mapping after the root. + if(kept_sequence_accounted_for == target_sequence_to_keep) { + // OK this mapping really is the first one to drop. + break; + } else { + // Keep going. Account for the sequence from this mapping. + auto& mapping = alignment.path().mapping(first_mapping_to_drop); + + // We know it's not the root mapping, and it can't be the non-full- + // length end mapping (because we would have kept the full length + // target sequence and not had to cut). So assume full node is used. + kept_sequence_accounted_for += graph->get_length(graph->get_handle(mapping.position().node_id())); + } + } + + // OK we know the first mapping to drop. We need to work out the to_size, + // including all softclips, from there to the end, so we know how much to + // trim off of the sequence and quality. 
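// --- Example (sketch): the defray DFS on a toy graph -----------------------
// A self-contained restatement of the search above on a simplified graph
// representation (ToyGraph and example_defray_dfs are made up for
// illustration; vg walks a HandleGraph and caps the number of visited nodes
// with defray_count). Starting from the anchor node it counts how many
// distinct paths spell the whole target sequence, and reports the shallowest
// depth, in bases, at which more than one matching path diverges; the read
// tail beyond that depth is ambiguous and gets trimmed.
struct ToyGraph {
    vector<string> seq;          // node id -> node sequence
    vector<vector<int>> next;    // node id -> successor node ids
};
inline pair<size_t, size_t> example_defray_dfs(const ToyGraph& g, int node,
                                               const string& target, size_t matched) {
    const string& s = g.seq[node];
    size_t m = 0;
    while (m < s.size() && matched + m < target.size() && s[m] == target[matched + m]) {
        ++m;
    }
    if (matched + m == target.size()) {
        return make_pair(size_t(1), target.size());   // a full-length match ends on this node
    }
    if (m < s.size()) {
        return make_pair(size_t(0), target.size());   // mismatch: nothing to report below here
    }
    size_t total_matches = 0, children_with_matches = 0, unification = target.size();
    for (int child : g.next[node]) {
        pair<size_t, size_t> result = example_defray_dfs(g, child, target, matched + s.size());
        total_matches += result.first;
        if (result.first > 0) {
            ++children_with_matches;
        }
        unification = min(unification, result.second);
    }
    if (children_with_matches > 1) {
        unification = matched + s.size();             // matching paths diverge right after this node
    }
    return make_pair(total_matches, unification);
}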
+ size_t to_length_to_remove = 0; + for(size_t i = first_mapping_to_drop; i < alignment.path().mapping_size(); i++) { + // Go through all the mappings + auto& mapping = alignment.path().mapping(i); + for(size_t j = 0; j < mapping.edit_size(); j++) { + // Add up the to_length of all the edits + to_length_to_remove += mapping.edit(j).to_length(); + } + } + +#ifdef debug + cerr << "Want to trim " << alignment.sequence().size() << " bp sequence and " << alignment.quality().size() + << " quality values to remove " << to_length_to_remove << endl; +#endif + + // Make sure we have at least enough to trim. + // Note that we allow the entire alignment to be trimmed away! + assert(alignment.sequence().size() >= to_length_to_remove); + assert(alignment.quality().empty() || alignment.quality().size() >= to_length_to_remove); + // And that we made sence before trimming + assert(alignment.quality().empty() || alignment.quality().size() == alignment.sequence().size()); + + // Trim sequence + alignment.set_sequence(alignment.sequence().substr(0, alignment.sequence().size() - to_length_to_remove)); + + // Trim quality + if(!alignment.quality().empty()) { + alignment.set_quality(alignment.quality().substr(0, alignment.quality().size() - to_length_to_remove)); + } + + // Now we can discard the extra mappings + size_t to_delete = alignment.path().mapping_size() - first_mapping_to_drop; + alignment.mutable_path()->mutable_mapping()->DeleteSubrange(first_mapping_to_drop, to_delete); + + // Now the alignment is fixed! + return true; +} + +template<> +inline bool ReadFilter::get_is_paired(const Alignment& aln) const { + return (!aln.fragment_prev().name().empty() || aln.fragment_prev().path().mapping_size() != 0 || + !aln.fragment_next().name().empty() || aln.fragment_next().path().mapping_size() != 0); +} + +template<> +inline bool ReadFilter::get_is_paired(const MultipathAlignment& mp_aln) const { + return !mp_aln.paired_read_name().empty(); +} + +template +bool ReadFilter::is_proper_pair(const Read& read) const { + if (!has_annotation(read, "proper_pair")) { + return false; + } + return get_annotation(read, "proper_pair"); +} + +template +bool ReadFilter::contains_subsequence(const Read& read) const { + bool found = false; + for (const string& seq : subsequences) { + if (read.sequence().find(seq) != string::npos) { + found = true; + break; + } + } + return found; +} + +template +bool ReadFilter::sample_read(const Read& read) const { + // Decide if the alignment is paired. + // It is paired if fragment_next or fragment_prev point to something. + bool is_paired = get_is_paired(read); + + // Compute the QNAME that samtools would use + string qname; + if (is_paired) { + // Strip pair end identifiers like _1 or /2 that vg uses at the end of the name. + qname = regex_replace(read.name(), regex("[/_][12]$"), ""); + } else { + // Any _1 in the name is part of the actual read name. + qname = read.name(); + } + + // Now treat it as samtools would. + // See https://github.com/samtools/samtools/blob/60138c42cf04c5c473dc151f3b9ca7530286fb1b/sam_view.c#L101-L104 + + // Hash that with __ac_X31_hash_string from htslib and XOR against the seed mask + auto masked_hash = __ac_X31_hash_string(qname.c_str()) ^ downsample_seed_mask; + + // Hash that again with __ac_Wang_hash from htslib, apparently to mix the bits. 
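// --- Example (sketch): why downsampling keeps or drops whole pairs ---------
// sample_read() makes its decision purely from the SAM-style QNAME and the
// seed mask, so both ends of a pair (which share a QNAME once /1, _2 etc. are
// stripped) always get the same verdict, and the same GAM downsamples the
// same way on every run. Illustrative helper that reuses the same khash
// functions the code above relies on; it is not used by the class.
inline bool example_keep_read(const string& qname, uint32_t seed_mask, double keep_probability) {
    uint32_t mixed = __ac_Wang_hash(__ac_X31_hash_string(qname.c_str()) ^ seed_mask);
    const int32_t LOW_24_BITS = 0xffffff;
    double sample = (double)(mixed & LOW_24_BITS) / (LOW_24_BITS + 1);   // roughly uniform in [0, 1)
    return sample < keep_probability;
}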
+ uint32_t mixed_hash = __ac_Wang_hash(masked_hash); + + // Take the low 24 bits and compute a double from 0 to 1 + const int32_t LOW_24_BITS = 0xffffff; + double sample = ((double)(mixed_hash & LOW_24_BITS)) / (LOW_24_BITS + 1); + + // If the result is >= the portion to downsample to, discard the read. + // Otherwise, keep it. + return (sample < downsample_probability); +} + } #endif diff --git a/src/recombinator.cpp b/src/recombinator.cpp new file mode 100644 index 00000000000..cddce6c21ea --- /dev/null +++ b/src/recombinator.cpp @@ -0,0 +1,1558 @@ +#include "recombinator.hpp" + +#include "kff.hpp" +#include "statistics.hpp" +#include "algorithms/component.hpp" + +#include +#include +#include + +namespace vg { + +//------------------------------------------------------------------------------ + +// Numerical class constants. + +constexpr std::uint32_t Haplotypes::Header::MAGIC_NUMBER; +constexpr std::uint32_t Haplotypes::Header::VERSION; +constexpr std::uint32_t Haplotypes::Header::MIN_VERSION; +constexpr std::uint64_t Haplotypes::Header::DEFAULT_K; + +constexpr size_t HaplotypePartitioner::SUBCHAIN_LENGTH; +constexpr size_t HaplotypePartitioner::APPROXIMATE_JOBS; + +constexpr size_t Recombinator::NUM_HAPLOTYPES; +constexpr size_t Recombinator::COVERAGE; +constexpr size_t Recombinator::KFF_BLOCK_SIZE; +constexpr double Recombinator::PRESENT_DISCOUNT; +constexpr double Recombinator::HET_ADJUSTMENT; +constexpr double Recombinator::ABSENT_SCORE; + +//------------------------------------------------------------------------------ + +// Returns a GBWTGraph handle as a string (id, orientation). +std::string to_string(handle_t handle) { + gbwt::node_type node = gbwtgraph::GBWTGraph::handle_to_node(handle); + return std::string("(") + std::to_string(gbwt::Node::id(node)) + std::string(", ") + std::to_string(gbwt::Node::is_reverse(node)) + std::string(")"); +} + +//------------------------------------------------------------------------------ + +hash_map::iterator +find_kmer(hash_map& counts, Haplotypes::Subchain::kmer_type kmer, size_t k) { + Haplotypes::Subchain::kmer_type rc = minimizer_reverse_complement(kmer, k); + auto forward = counts.find(kmer); + auto reverse = counts.find(rc); + return (forward != counts.end() ? forward : reverse); +} + +hash_map Haplotypes::kmer_counts(const std::string& kff_file, Verbosity verbosity) const { + double start = gbwt::readTimer(); + if (verbosity >= verbosity_basic) { + std::cerr << "Reading kmer counts" << std::endl; + } + + // Open and validate the kmer count file. + ParallelKFFReader reader(kff_file); + + // Populate the map with the kmers we are interested in. + double checkpoint = gbwt::readTimer(); + hash_map result; + result.reserve(this->header.total_kmers); + for (size_t chain_id = 0; chain_id < this->chains.size(); chain_id++) { + const TopLevelChain& chain = this->chains[chain_id]; + for (size_t subchain_id = 0; subchain_id < chain.subchains.size(); subchain_id++) { + const Subchain& subchain = chain.subchains[subchain_id]; + for (size_t kmer_id = 0; kmer_id < subchain.kmers.size(); kmer_id++) { + result[subchain.kmers[kmer_id].first] = 0; + } + } + } + if (verbosity >= verbosity_detailed) { + double seconds = gbwt::readTimer() - checkpoint; + std::cerr << "Initialized the hash map with " << result.size() << " kmers in " << seconds << " seconds" << std::endl; + } + + // Read the KFF file and add the counts using multiple threads. 
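// --- Example (sketch): counting a kmer in either orientation ---------------
// find_kmer() above accepts a count for a kmer whether the KFF file stores it
// in forward or reverse-complement orientation. This string-based helper shows
// the same idea without the 2-bit packed encoding vg actually uses
// (minimizer_reverse_complement); the names are illustrative only.
inline std::string example_reverse_complement(const std::string& kmer) {
    std::string rc(kmer.rbegin(), kmer.rend());
    for (char& c : rc) {
        switch (c) {
            case 'A': c = 'T'; break;
            case 'C': c = 'G'; break;
            case 'G': c = 'C'; break;
            case 'T': c = 'A'; break;
            default: break;   // leave ambiguous bases alone
        }
    }
    return rc;
}

// Look up a kmer count under either orientation.
inline size_t example_kmer_count(const std::unordered_map<std::string, size_t>& counts,
                                 const std::string& kmer) {
    auto it = counts.find(kmer);
    if (it == counts.end()) {
        it = counts.find(example_reverse_complement(kmer));
    }
    return it == counts.end() ? 0 : it->second;
}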
+ checkpoint = gbwt::readTimer(); + size_t kmer_count = 0; + #pragma omp parallel + { + #pragma omp task + { + while (true) { + std::vector> block = reader.read(Recombinator::KFF_BLOCK_SIZE); + if (block.empty()) { + break; + } + std::vector::iterator, size_t>> buffer; + for (auto kmer : block) { + auto iter = find_kmer(result, kmer.first, this->k()); + if (iter != result.end()) { + buffer.push_back({ iter, kmer.second }); + } + } + #pragma omp critical + { + for (auto to_update : buffer) { + to_update.first->second += to_update.second; + } + kmer_count += block.size(); + } + } + } + } + if (verbosity >= verbosity_detailed) { + double seconds = gbwt::readTimer() - checkpoint; + std::cerr << "Read " << kmer_count << " kmers in " << seconds << " seconds" << std::endl; + } + + if (verbosity >= verbosity_basic) { + double seconds = gbwt::readTimer() - start; + std::cerr << "Read the kmer counts in " << seconds << " seconds" << std::endl; + } + return result; +} + +//------------------------------------------------------------------------------ + +std::string Haplotypes::Subchain::to_string() const { + std::string result; + switch (this->type) { + case normal: + result.append("normal"); + break; + case prefix: + result.append("prefix"); + break; + case suffix: + result.append("suffix"); + break; + case full_haplotype: + result.append("full"); + break; + default: + result.append("invalid"); + break; + } + + result.append(" from "); + result.append(to_string_gbwtgraph(this->start)); + result.append(" to "); + result.append(to_string_gbwtgraph(this->end)); + + return result; +} + +void Haplotypes::Subchain::simple_sds_serialize(std::ostream& out) const { + sdsl::simple_sds::serialize_value(this->type, out); + sdsl::simple_sds::serialize_value(this->start, out); + sdsl::simple_sds::serialize_value(this->end, out); + sdsl::simple_sds::serialize_vector(this->kmers, out); + sdsl::simple_sds::serialize_vector(this->sequences, out); + this->kmers_present.simple_sds_serialize(out); +} + +void Haplotypes::Subchain::simple_sds_load(std::istream& in) { + std::uint64_t temp = sdsl::simple_sds::load_value(in); + switch (temp) { + case normal: // Fall through. + case prefix: // Fall through. + case suffix: // Fall through. 
+ case full_haplotype: + this->type = static_cast(temp); + break; + default: + throw sdsl::simple_sds::InvalidData("Invalid subchain type: " + std::to_string(temp)); + } + + this->start = sdsl::simple_sds::load_value(in); + this->end = sdsl::simple_sds::load_value(in); + bool should_have_start = (this->type == normal || this->type == suffix); + bool should_have_end = (this->type == normal || this->type == prefix); + if ((this->start != gbwt::ENDMARKER) != should_have_start) { + throw sdsl::simple_sds::InvalidData("Subchain start node " + std::to_string(this->start) + " does not match type " + std::to_string(temp)); + } + if ((this->end != gbwt::ENDMARKER) != should_have_end) { + throw sdsl::simple_sds::InvalidData("Subchain end node " + std::to_string(this->end) + " does not match type" + std::to_string(temp)); + } + + this->kmers = sdsl::simple_sds::load_vector>(in); + this->sequences = sdsl::simple_sds::load_vector(in); + this->kmers_present.simple_sds_load(in); + if (kmers_present.size() != kmers.size() * sequences.size()) { + throw sdsl::simple_sds::InvalidData("Invalid length for the kmer presence bitvector in subchain from " + + std::to_string(this->start) + " to " + std::to_string(this->end)); + } +} + +size_t Haplotypes::Subchain::simple_sds_size() const { + size_t result = sdsl::simple_sds::value_size() + 2 * sdsl::simple_sds::value_size(); + result += sdsl::simple_sds::vector_size(this->kmers); + result += sdsl::simple_sds::vector_size(this->sequences); + result += this->kmers_present.simple_sds_size(); + return result; +} + +void Haplotypes::TopLevelChain::simple_sds_serialize(std::ostream& out) const { + sdsl::simple_sds::serialize_value(this->offset, out); + sdsl::simple_sds::serialize_value(this->job_id, out); + sdsl::simple_sds::serialize_string(this->contig_name, out); + sdsl::simple_sds::serialize_value(this->subchains.size(), out); + for (auto& subchain : this->subchains) { + subchain.simple_sds_serialize(out); + } +} + +void Haplotypes::TopLevelChain::simple_sds_load(std::istream& in) { + this->offset = sdsl::simple_sds::load_value(in); + this->job_id = sdsl::simple_sds::load_value(in); + this->contig_name = sdsl::simple_sds::load_string(in); + size_t subchain_count = sdsl::simple_sds::load_value(in); + this->subchains.resize(subchain_count); + for (size_t i = 0; i < subchain_count; i++) { + this->subchains[i].simple_sds_load(in); + } +} + +void Haplotypes::TopLevelChain::load_old(std::istream& in) { + this->offset = sdsl::simple_sds::load_value(in); + this->job_id = sdsl::simple_sds::load_value(in); + this->contig_name = "chain_" + std::to_string(this->offset); + size_t subchain_count = sdsl::simple_sds::load_value(in); + this->subchains.resize(subchain_count); + for (size_t i = 0; i < subchain_count; i++) { + this->subchains[i].simple_sds_load(in); + } +} + +size_t Haplotypes::TopLevelChain::simple_sds_size() const { + size_t result = 3 * sdsl::simple_sds::value_size(); + result += sdsl::simple_sds::string_size(this->contig_name); + for (auto& subchain : this->subchains) { + result += subchain.simple_sds_size(); + } + return result; +} + +void Haplotypes::simple_sds_serialize(std::ostream& out) const { + sdsl::simple_sds::serialize_value
    (this->header, out); + sdsl::simple_sds::serialize_vector(this->jobs_for_cached_paths, out); + for (auto& chain : this->chains) { + chain.simple_sds_serialize(out); + } +} + +void Haplotypes::simple_sds_load(std::istream& in) { + this->header = sdsl::simple_sds::load_value
    (in); + if (this->header.magic_number != Header::MAGIC_NUMBER) { + throw sdsl::simple_sds::InvalidData("Haplotypes::simple_sds_load(): Expected magic number " + std::to_string(Header::MAGIC_NUMBER) + + ", got " + std::to_string(this->header.magic_number)); + } + if (this->header.version < Header::MIN_VERSION || this->header.version > Header::VERSION) { + std::string msg = "Haplotypes::simple_sds_load(): Expected version " + std::to_string(Header::MIN_VERSION) + + " to " + std::to_string(Header::VERSION) + ", got version " + std::to_string(this->header.version); + throw sdsl::simple_sds::InvalidData(msg); + } + + this->jobs_for_cached_paths = sdsl::simple_sds::load_vector(in); + + this->chains.resize(this->header.top_level_chains); + for (auto& chain : this->chains) { + if (this->header.version == Header::VERSION) { + chain.simple_sds_load(in); + } else { + chain.load_old(in); + } + } + + // Update to the current version. + this->header.version = Header::VERSION; +} + +size_t Haplotypes::simple_sds_size() const { + size_t result = sdsl::simple_sds::value_size
    (); + result += sdsl::simple_sds::vector_size(this->jobs_for_cached_paths); + for (auto& chain : this->chains) { + result += chain.simple_sds_size(); + } + return result; +} + +//------------------------------------------------------------------------------ + +HaplotypePartitioner::HaplotypePartitioner(const gbwtgraph::GBZ& gbz, + const gbwt::FastLocate& r_index, + const SnarlDistanceIndex& distance_index, + const minimizer_index_type& minimizer_index, + Verbosity verbosity) : + gbz(gbz), r_index(r_index), distance_index(distance_index), minimizer_index(minimizer_index), + verbosity(verbosity) +{ +} + +//------------------------------------------------------------------------------ + +Haplotypes HaplotypePartitioner::partition_haplotypes(const Parameters& parameters) const { + + // Sanity checks. + if (parameters.subchain_length == 0) { + std::string msg = "HaplotypePartitioner::partition_haplotypes(): subchain length cannot be 0"; + throw std::runtime_error(msg); + } + if (parameters.approximate_jobs == 0) { + std::string msg = "HaplotypePartitioner::partition_haplotypes(): number of jobs cannot be 0"; + throw std::runtime_error(msg); + } + + Haplotypes result; + result.header.k = this->minimizer_index.k(); + + // Determine top-level chains. + double start = gbwt::readTimer(); + if (this->verbosity >= Haplotypes::verbosity_basic) { + std::cerr << "Determining construction jobs" << std::endl; + } + size_t total_chains = 0; + this->distance_index.for_each_child(this->distance_index.get_root(), [&](const handlegraph::net_handle_t&) { + total_chains++; + }); + size_t size_bound = this->gbz.graph.get_node_count() / parameters.approximate_jobs; + gbwtgraph::ConstructionJobs jobs = gbwtgraph::gbwt_construction_jobs(this->gbz.graph, size_bound); + if (jobs.components != total_chains) { + // TODO: Could we instead identify the components with multiple top-level chains + // and skip them? + std::string msg = "HaplotypePartitioner::partition_haplotypes(): there are " + + std::to_string(total_chains) + " top-level chains and " + std::to_string(jobs.components) + + " weakly connected components; haplotype sampling cannot be used with this graph"; + throw std::runtime_error(msg); + } + result.header.top_level_chains = jobs.components; + result.header.construction_jobs = jobs.size(); + result.chains.resize(result.components()); + for (size_t chain_id = 0; chain_id < result.components(); chain_id++) { + result.chains[chain_id].offset = chain_id; + result.chains[chain_id].job_id = result.jobs(); // Not assigned to any job yet. + result.chains[chain_id].contig_name = "chain_" + std::to_string(chain_id); // Placeholder name. + } + + // Assign chains to jobs and determine contig names. + auto chains_by_job = gbwtgraph::partition_chains(this->distance_index, this->gbz.graph, jobs); + for (size_t job_id = 0; job_id < result.jobs(); job_id++) { + for (auto& chain : chains_by_job[job_id]) { + result.chains[chain.offset].job_id = job_id; + auto da = this->r_index.decompressDA(gbwtgraph::GBWTGraph::handle_to_node(chain.handle)); + for (gbwt::size_type seq_id : da) { + gbwt::size_type path_id = gbwt::Path::id(seq_id); + auto iter = this->gbz.graph.id_to_path.find(path_id); + if (iter != this->gbz.graph.id_to_path.end()) { + // This is a generic / reference path. Use its contig name for the chain. 
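// Illustrative sketch (not part of the patch): a minimal round trip through the
// Haplotypes serialization interface defined above. The file name and the lack
// of error handling are assumptions made for the example.
//
//   #include <fstream>
//
//   void save_and_reload(const Haplotypes& haplotypes) {
//       std::ofstream out("graph.hapl", std::ios_base::binary);
//       haplotypes.simple_sds_serialize(out);
//       out.close();
//
//       Haplotypes reloaded;
//       std::ifstream in("graph.hapl", std::ios_base::binary);
//       // Throws sdsl::simple_sds::InvalidData if the magic number, version, or
//       // subchain sanity checks above fail.
//       reloaded.simple_sds_load(in);
//   }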
+ gbwt::PathName path_name = this->gbz.index.metadata.path(path_id); + result.chains[chain.offset].contig_name = this->gbz.index.metadata.contig(path_name.contig); + if (this->verbosity >= Haplotypes::verbosity_debug) { + std::cerr << "Using contig name " << result.chains[chain.offset].contig_name << " for chain " << chain.offset << std::endl; + } + break; + } + } + } + } + + // Assign named and reference paths to jobs. + result.jobs_for_cached_paths.reserve(this->gbz.graph.named_paths.size()); + for (size_t i = 0; i < this->gbz.graph.named_paths.size(); i++) { + const gbwtgraph::NamedPath& path = this->gbz.graph.named_paths[i]; + if (path.from == gbwt::invalid_edge()) { + // Skip empty paths. + result.jobs_for_cached_paths.push_back(result.jobs()); + continue; + } + nid_t node_id = gbwt::Node::id(path.from.first); + auto iter = jobs.node_to_job.find(node_id); + if (iter == jobs.node_to_job.end()) { + std::string msg = "HaplotypePartitioner::partition_haplotypes(): cannot assign node " + std::to_string(node_id) + " to a job"; + throw std::runtime_error(msg); + } + result.jobs_for_cached_paths.push_back(iter->second); + } + + jobs = gbwtgraph::ConstructionJobs(); // Save memory. + if (this->verbosity >= Haplotypes::verbosity_basic) { + double seconds = gbwt::readTimer() - start; + std::cerr << "Partitioned " << result.components() << " components into " << result.jobs() << " jobs in " << seconds << " seconds" << std::endl; + } + + // Determine the subchains and sequences for each top-level chain. + if (verbosity >= Haplotypes::verbosity_basic) { + std::cerr << "Running " << omp_get_max_threads() << " jobs in parallel" << std::endl; + } + start = gbwt::readTimer(); + #pragma omp parallel for schedule(dynamic, 1) + for (size_t job = 0; job < chains_by_job.size(); job++) { + const std::vector& chains = chains_by_job[job]; + size_t total_subchains = 0, total_kmers = 0; + for (auto& chain : chains) { + try { + this->build_subchains(chain, result.chains[chain.offset], parameters); + } catch (const std::runtime_error& e) { + std::cerr << "error: [job " << job << "]: " << e.what() << std::endl; + std::exit(EXIT_FAILURE); + } + total_subchains += result.chains[chain.offset].subchains.size(); + for (auto& subchain : result.chains[chain.offset].subchains) { + total_kmers += subchain.kmers.size(); + } + } + #pragma omp critical + { + result.header.total_subchains += total_subchains; + result.header.total_kmers += total_kmers; + if (this->verbosity >= Haplotypes::verbosity_detailed) { + std::cerr << "Finished job " << job << " with " << chains.size() << " chains, " << total_subchains << " subchains, and " << total_kmers << " kmers" << std::endl; + } + } + } + if (verbosity >= Haplotypes::verbosity_basic) { + double seconds = gbwt::readTimer() - start; + std::cerr << "Finished the jobs in " << seconds << " seconds" << std::endl; + } + + return result; +} + +//------------------------------------------------------------------------------ + +size_t HaplotypePartitioner::get_distance(handle_t from, handle_t to) const { + return this->distance_index.minimum_distance( + this->gbz.graph.get_id(from), this->gbz.graph.get_is_reverse(from), this->gbz.graph.get_length(from) - 1, + this->gbz.graph.get_id(to), this->gbz.graph.get_is_reverse(to), 0, + false, &this->gbz.graph + ); +} + +std::vector +HaplotypePartitioner::get_subchains(const gbwtgraph::TopLevelChain& chain, const Parameters& parameters) const { + std::vector result; + + // First pass: take all snarls as subchains. 
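// Illustrative sketch (not part of the patch): the parallel job pattern used
// above (and again in Recombinator::generate_haplotypes()), where each OpenMP
// thread processes whole jobs and only the final aggregation is serialized.
// The types and job contents here are stand-ins for the example.
//
//   #include <cstddef>
//   #include <vector>
//
//   struct Totals { std::size_t subchains = 0; std::size_t kmers = 0; };
//
//   Totals run_jobs(const std::vector<std::vector<int>>& jobs) {
//       Totals totals;
//       #pragma omp parallel for schedule(dynamic, 1)
//       for (std::size_t job = 0; job < jobs.size(); job++) {
//           Totals local;
//           for (int item : jobs[job]) { local.subchains++; local.kmers += item; }
//           #pragma omp critical
//           {
//               totals.subchains += local.subchains;
//               totals.kmers += local.kmers;
//           }
//       }
//       return totals;
//   }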
+ std::vector snarls; + handle_t snarl_start = empty_gbwtgraph_handle(); + bool has_start = false; + bool was_snarl = false; + net_handle_t curr = this->distance_index.get_bound(chain.chain, false, true); + net_handle_t chain_end = this->distance_index.get_bound(chain.chain, true, false); + while (curr != chain_end) { + if (this->distance_index.is_node(curr)) { + handle_t handle = this->distance_index.get_handle(curr, &this->gbz.graph); + if (was_snarl) { + if (!has_start) { + // If the chain starts with a snarl, we take it as a prefix. + snarls.push_back({ Haplotypes::Subchain::prefix, empty_gbwtgraph_handle(), handle }); + } else { + size_t distance = this->get_distance(snarl_start, handle); + if (distance < std::numeric_limits::max()) { + // Normal snarl with two boundary nodes. + snarls.push_back({ Haplotypes::Subchain::normal, snarl_start, handle }); + } else { + // The snarl is not connected, so we break it into two. + snarls.push_back({ Haplotypes::Subchain::suffix, snarl_start, empty_gbwtgraph_handle() }); + snarls.push_back({ Haplotypes::Subchain::prefix, empty_gbwtgraph_handle(), handle }); + } + } + } + snarl_start = handle; + has_start = true; + was_snarl = false; + } else if (this->distance_index.is_snarl(curr)) { + was_snarl = true; + } + net_handle_t next; + size_t successors = 0; + this->distance_index.follow_net_edges(curr, &this->gbz.graph, false, [&](const net_handle_t& child) { + successors++; + next = child; + }); + if (successors != 1) { + throw std::runtime_error("HaplotypePartitioner::get_subchains(): chain " + std::to_string(chain.offset) + " has " + std::to_string(successors) + " successors for a child"); + } + curr = next; + } + if (was_snarl && has_start) { + // If the chain ends with a snarl, we take it as a suffix. + snarls.push_back({ Haplotypes::Subchain::suffix, snarl_start, empty_gbwtgraph_handle() }); + } + + // Second pass: Combine snarls into subchains. + size_t head = 0; + while (head < snarls.size()) { + if (snarls[head].type != Haplotypes::Subchain::normal) { + // Prefixes and suffixes should be rare, so we can simply use them directly. 
+ result.push_back(snarls[head]); + head++; + continue; + } + size_t tail = head; + while (tail + 1 < snarls.size()) { + if (snarls[tail + 1].type != Haplotypes::Subchain::normal) { + break; + } + size_t candidate = this->get_distance(snarls[head].start, snarls[tail + 1].end); + if (candidate <= parameters.subchain_length) { + tail++; + } else { + break; + } + } + result.push_back({ Haplotypes::Subchain::normal, snarls[head].start, snarls[tail].end }); + head = tail + 1; + } + + return result; +} + +//------------------------------------------------------------------------------ + +std::vector HaplotypePartitioner::get_sequence_visits(handle_t handle) const { + std::vector sa = this->r_index.decompressSA(gbwtgraph::GBWTGraph::handle_to_node(handle)); + std::vector result; + result.reserve(sa.size()); + for (size_t i = 0; i < sa.size(); i++) { + result.push_back({ sa[i], i }); + } + std::sort(result.begin(), result.end(), [&](sequence_type a, sequence_type b) -> bool { + gbwt::size_type a_id = r_index.seqId(a.first); + gbwt::size_type a_offset = r_index.seqOffset(a.first); + gbwt::size_type b_id = r_index.seqId(b.first); + gbwt::size_type b_offset = r_index.seqOffset(b.first); + return ((a_id < b_id) || ((a_id == b_id) && (a_offset > b_offset))); + }); + return result; +} + +void sa_to_da(std::vector& sequences, const gbwt::FastLocate& r_index) { + for (auto& sequence : sequences) { + sequence.first = r_index.seqId(sequence.first); + } +} + +std::vector HaplotypePartitioner::get_sequences(handle_t handle) const { + auto result = this->get_sequence_visits(handle); + sa_to_da(result, this->r_index); + return result; +} + +std::vector HaplotypePartitioner::get_sequences(Subchain subchain) const { + if (subchain.type == Haplotypes::Subchain::prefix) { + return this->get_sequences(subchain.end); + } + if (subchain.type == Haplotypes::Subchain::suffix) { + return this->get_sequences(subchain.start); + } + auto from = this->get_sequence_visits(subchain.start); + auto to = this->get_sequence_visits(subchain.end); + + auto from_iter = from.begin(); + auto to_iter = to.begin(); + std::vector result; + while (from_iter != from.end() && to_iter != to.end()) { + gbwt::size_type from_id = this->r_index.seqId(from_iter->first); + gbwt::size_type to_id = this->r_index.seqId(to_iter->first); + if (from_id == to_id) { + // If a haplotype crosses the subchain multiple times, we take the last entry before + // each exit. + gbwt::size_type to_offset = this->r_index.seqOffset(to_iter->first); + if (this->r_index.seqOffset(from_iter->first) >= to_offset) { + auto peek = from_iter +1; + while (peek != from.end() && this->r_index.seqId(peek->first) == from_id && this->r_index.seqOffset(peek->first) >= to_offset) { + from_iter = peek; + ++peek; + } + result.push_back(*from_iter); + ++from_iter; ++to_iter; + } else { + ++to_iter; + } + } else if (from_id < to_id) { + ++from_iter; + } else if (from_id > to_id) { + ++to_iter; + } + } + + sa_to_da(result, this->r_index); + return result; +} + +//------------------------------------------------------------------------------ + +// Generate a haplotype over the closed range from `pos` to `end`. +// Take at most start_max and end_max characters from the initial and the final +// node, respectively +// Returns an empty haplotype if there is only one node. +// Set `end = empty_gbwtgraph_handle()` to continue until the end without a final node. 
+std::string generate_haplotype(gbwt::edge_type pos, handle_t end, size_t start_max, size_t end_max, const gbwtgraph::GBWTGraph& graph) { + std::string haplotype; + if (pos == gbwt::invalid_edge() || pos.first == gbwt::ENDMARKER) { + return haplotype; + } + + // Handle the initial node. + handle_t curr = gbwtgraph::GBWTGraph::node_to_handle(pos.first); + if (curr == end) { + return haplotype; + } + gbwtgraph::view_type view = graph.get_sequence_view(curr); + size_t offset = (view.second > start_max ? view.second - start_max : 0); + haplotype.append(view.first + offset, view.second - offset); + + while (true) { + pos = graph.index->LF(pos); + if (pos.first == gbwt::ENDMARKER) { + break; + } + curr = gbwtgraph::GBWTGraph::node_to_handle(pos.first); + view = graph.get_sequence_view(curr); + if (curr == end) { + haplotype.append(view.first, std::min(view.second, end_max)); + break; + } else { + haplotype.append(view.first, view.second); + } + } + + return haplotype; +} + +// Return the sorted set of kmers that are minimizers in the sequence and have a single +// occurrence in the graph. +std::vector take_unique_minimizers(const std::string& sequence, const HaplotypePartitioner::minimizer_index_type& minimizer_index) { + std::vector result; + auto minimizers = minimizer_index.minimizers(sequence); + result.reserve(minimizers.size()); + for (auto& minimizer : minimizers) { + if (minimizer_index.count(minimizer) == 1) { + result.push_back(minimizer.key.get_key()); + } + } + gbwt::removeDuplicates(result, false); + return result; +} + +std::vector HaplotypePartitioner::unique_minimizers(gbwt::size_type sequence_id) const { + gbwt::edge_type pos = this->gbz.index.start(sequence_id); + size_t limit = std::numeric_limits::max(); + std::string haplotype = generate_haplotype(pos, empty_gbwtgraph_handle(), limit, limit, this->gbz.graph); + return take_unique_minimizers(haplotype, this->minimizer_index); +} + +std::vector HaplotypePartitioner::unique_minimizers(sequence_type sequence, Subchain subchain) const { + gbwt::edge_type pos; + size_t start_max = std::numeric_limits::max(), end_max = this->minimizer_index.k() - 1; + if (subchain.has_start()) { + pos = gbwt::edge_type(gbwtgraph::GBWTGraph::handle_to_node(subchain.start), sequence.second); + start_max = this->minimizer_index.k() - 1; + } else { + pos = this->gbz.index.start(sequence.first); + } + std::string haplotype = generate_haplotype(pos, subchain.end, start_max, end_max, this->gbz.graph); + return take_unique_minimizers(haplotype, this->minimizer_index); +} + +//------------------------------------------------------------------------------ + +// TODO: Return the number of skipped non-informative kmers. +/* + Take a set of sequences defined by the sorted set of kmers that are present. + Output a sorted vector of all kmers and the concatenated bitvectors that mark + the presence of kmers in the sequences. The output vectors are assumed to be + empty. +*/ +void present_kmers(const std::vector>& sequences, + std::vector>& all_kmers, + sdsl::bit_vector& kmers_present) { + + // Build a map of distinct kmers. For each kmer, record the largest sequence + // id containing the kmer and the number of sequences containing it. 
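// Worked example (not part of the patch): with the default k = 29, the subchain
// overload of unique_minimizers() above passes end_max = 28 to
// generate_haplotype(), and also start_max = 28 when the subchain has a start
// node. At most 28 bases of a shared boundary node therefore end up in the
// generated sequence, so every 29-mer in it must extend past the boundary node.
// This is why kmers contained entirely in the shared initial/final nodes never
// enter a subchain's kmer set (see the note in recombinator.hpp below).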
+ std::map> present; + for (size_t sequence_id = 0; sequence_id < sequences.size(); sequence_id++) { + auto& sequence = sequences[sequence_id]; + for (auto kmer : sequence) { + auto iter = present.find(kmer); + if (iter != present.end()) { + if (iter->second.first < sequence_id) { + iter->second.first = sequence_id; + iter->second.second++; + } + } else { + present[kmer] = std::pair(sequence_id, 1); + } + } + } + + // Now take those kmers that occur in some but not in all sequences. + // Use the first field for storing the offset of the kmer in the vector. + all_kmers.reserve(present.size()); + size_t offset = 0; + for (auto iter = present.begin(); iter != present.end(); ++iter) { + if (iter->second.second < sequences.size()) { + all_kmers.push_back({ iter->first, iter->second.second }); + iter->second.first = offset; + offset++; + } + } + + // Transform the sequences into kmer presence bitvectors. + kmers_present = sdsl::bit_vector(sequences.size() * all_kmers.size()); + for (size_t i = 0; i < sequences.size(); i++) { + size_t start = i * all_kmers.size(); + for (auto kmer : sequences[i]) { + auto iter = present.find(kmer); + if (iter->second.second < sequences.size()) { + kmers_present[start + iter->second.first] = 1; + } + } + } +} + +void HaplotypePartitioner::build_subchains(const gbwtgraph::TopLevelChain& chain, Haplotypes::TopLevelChain& output, const Parameters& parameters) const { + std::vector subchains = this->get_subchains(chain, parameters); + for (const Subchain& subchain : subchains) { + std::vector>> to_process; + auto sequences = this->get_sequences(subchain); + if (sequences.empty()) { + // There are no haplotypes crossing the subchain, so we break it into + // a suffix and a prefix. + to_process.push_back({ { Haplotypes::Subchain::suffix, subchain.start, empty_gbwtgraph_handle() }, this->get_sequences(subchain.start) }); + to_process.push_back({ { Haplotypes::Subchain::prefix, empty_gbwtgraph_handle(), subchain.end }, this->get_sequences(subchain.end) }); + } else { + to_process.push_back({ subchain, std::move(sequences) }); + } + for (auto iter = to_process.begin(); iter != to_process.end(); ++iter) { + output.subchains.push_back({ + iter->first.type, + gbwtgraph::GBWTGraph::handle_to_node(iter->first.start), gbwtgraph::GBWTGraph::handle_to_node(iter->first.end), + {}, {}, sdsl::bit_vector() + }); + Haplotypes::Subchain& subchain = output.subchains.back(); + std::vector> kmers_by_sequence; + kmers_by_sequence.reserve(iter->second.size()); + for (sequence_type sequence : iter->second) { + kmers_by_sequence.emplace_back(this->unique_minimizers(sequence, iter->first)); + } + present_kmers(kmers_by_sequence, subchain.kmers, subchain.kmers_present); + subchain.sequences = std::move(iter->second); + } + } + + // Take entire sequences if we could not generate any haplotypes. + // Note that the kmer sets should be empty, as the sequences should + // be identical. 
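// Illustrative sketch (not part of the patch): the bitvector built by
// present_kmers() is laid out row by row, one row of all_kmers.size() bits per
// sequence, which matches the length check in Subchain::simple_sds_load().
// The helper name below is hypothetical.
//
//   // Returns true if sequence `seq_offset` of the subchain contains kmer `kmer_id`.
//   bool sequence_has_kmer(const Haplotypes::Subchain& subchain,
//                          size_t seq_offset, size_t kmer_id) {
//       return subchain.kmers_present[seq_offset * subchain.kmers.size() + kmer_id];
//   }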
+ if (subchains.empty()) { + output.subchains.push_back({ + Haplotypes::Subchain::full_haplotype, + gbwt::ENDMARKER, gbwt::ENDMARKER, + {}, {}, sdsl::bit_vector() + }); + Haplotypes::Subchain& subchain = output.subchains.back(); + gbwt::node_type node = gbwtgraph::GBWTGraph::handle_to_node(chain.handle); + auto sequences = this->r_index.decompressDA(node); + std::vector> kmers_by_sequence; + kmers_by_sequence.reserve(sequences.size()); + for (auto seq_id : sequences) { + kmers_by_sequence.emplace_back(this->unique_minimizers(seq_id)); + } + present_kmers(kmers_by_sequence, subchain.kmers, subchain.kmers_present); + subchain.sequences.reserve(sequences.size()); + for (size_t i = 0; i < sequences.size(); i++) { + subchain.sequences.push_back({ sequences[i], 0 }); + } + } +} + +//------------------------------------------------------------------------------ + +/* + * A haplotype beging generated as a GBWT path. + * + * GBWT metadata will be set as following: + * + * * Sample name is "recombination". + * * Contig name is "chain_X", where X is the chain identifier. + * * Haplotype identifier is set during construction. + * * Fragment identifier is set as necessary. + */ +struct RecombinatorHaplotype { + typedef Recombinator::sequence_type sequence_type; + + // Contig name in GBWT metadata. + const std::string& contig_name; + + // Haplotype identifier in GBWT metadata. + size_t id; + + // Fragment identifier in GBWT metadata. + // If no original haplotype crosses a subchain, a new fragment will + // start after the subchain. + size_t fragment; + + // GBWT sequence indentifier in the previous subchain, or + // `gbwt::invalid_sequence()` if there was no such sequence. + gbwt::size_type sequence_id; + + // GBWT position at the end of the latest `extend()` call. + // `gbwt::invalid_edge()` otherwise. + gbwt::edge_type position; + + // The path being generated. + gbwt::vector_type path; + + /* + * Extends the haplotype over the given subchain by using the given + * original haplotype. + * + * This assumes that the original haplotype crosses the subchain. + * + * If `extend()` has been called for this fragment, there must be a + * unary path connecting the subchains, which will be used in the + * generated haplotype. + * + * If `extend()` has not been called, the generated haplotype will + * take the prefix of the original original haplotype until the start + * of the subchain. + */ + void extend(sequence_type sequence, const Haplotypes::Subchain& subchain, const Recombinator& recombinator, gbwt::GBWTBuilder& builder); + + // Takes an existing haplotype from the GBWT index and inserts it into + /// the builder. This is intended for fragments that do not contain + /// subchains crossed by the original haplotypes. The call will fail if + /// `extend()` has been called. + void take(gbwt::size_type sequence_id, const Recombinator& recombinator, gbwt::GBWTBuilder& builder); + + // Extends the original haplotype from the latest `extend()` call until + // the end, inserts it into the builder, and starts a new fragment. + // The call will fail if `extend()` has not been called for this + // fragment. + void finish(const Recombinator& recombinator, gbwt::GBWTBuilder& builder); + +private: + // Extends the haplotype over a unary path from a previous subchain. + void connect(gbwt::node_type until, const gbwtgraph::GBWTGraph& graph); + + // Takes a prefix of a sequence. 
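// Illustrative sketch (not part of the patch): the intended call sequence for
// RecombinatorHaplotype over one top-level chain, as described in the comment
// above. It assumes every subchain is a normal subchain crossed by the chosen
// sequence; the real selection and matching logic lives in
// Recombinator::generate_haplotypes() further below.
//
//   void build_one_haplotype(const Haplotypes::TopLevelChain& chain,
//                            const Recombinator& recombinator,
//                            gbwt::GBWTBuilder& builder) {
//       RecombinatorHaplotype haplotype {
//           chain.contig_name, 1, 0, gbwt::invalid_sequence(), gbwt::invalid_edge(), {}
//       };
//       for (const Haplotypes::Subchain& subchain : chain.subchains) {
//           // Assumption for the sketch: always take the first sequence crossing the subchain.
//           haplotype.extend(subchain.sequences.front(), subchain, recombinator, builder);
//       }
//       // Extend to the end of the chain, insert the fragment, and start a new one.
//       haplotype.finish(recombinator, builder);
//   }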
+ void prefix(gbwt::size_type sequence_id, gbwt::node_type until, const gbwt::GBWT& index); + + // Extends the haplotype from the previous subchain until the end. + void suffix(const gbwt::GBWT& index); + + // Inserts the current fragment into the builder. + void insert(gbwt::GBWTBuilder& builder); +}; + +void RecombinatorHaplotype::extend(sequence_type sequence, const Haplotypes::Subchain& subchain, const Recombinator& recombinator, gbwt::GBWTBuilder& builder) { + if (subchain.type == Haplotypes::Subchain::full_haplotype) { + throw std::runtime_error("Haplotype::extend(): cannot extend a full haplotype"); + } + + if (subchain.type == Haplotypes::Subchain::prefix) { + if (!this->path.empty()) { + throw std::runtime_error("Haplotype::extend(): got a prefix subchain after the start of a fragment"); + } + this->prefix(sequence.first, subchain.end, recombinator.gbz.index); + return; + } + + // Suffixes and normal subchains have a start node, so we must reach it first. + if (!this->path.empty()) { + this->connect(subchain.start, recombinator.gbz.graph); + } else { + this->prefix(sequence.first, subchain.start, recombinator.gbz.index); + } + + gbwt::edge_type curr(subchain.start, sequence.second); + if (!recombinator.gbz.index.contains(curr)) { + throw std::runtime_error("Haplotype::extend(): the GBWT index does not contain position (" + std::to_string(curr.first) + ", " + std::to_string(curr.second) + ")"); + } + + if (subchain.type == Haplotypes::Subchain::suffix) { + this->position = curr; + this->finish(recombinator, builder); + return; + } + + // This is a normal subchain. + while (curr.first != subchain.end) { + curr = recombinator.gbz.index.LF(curr); + if (curr.first == gbwt::ENDMARKER) { + throw std::runtime_error("Haplotype::extend(): the sequence did not reach the end of the subchain at GBWT node " + std::to_string(subchain.end)); + } + this->path.push_back(curr.first); + } + this->sequence_id = sequence.first; + this->position = curr; +} + +void RecombinatorHaplotype::take(gbwt::size_type sequence_id, const Recombinator& recombinator, gbwt::GBWTBuilder& builder) { + if (!this->path.empty()) { + throw std::runtime_error("Haplotype::take(): the current fragment is not empty"); + } + if (sequence_id >= recombinator.gbz.index.sequences()) { + throw std::runtime_error("Haplotype::take(): the GBWT index does not contain sequence " + std::to_string(sequence_id)); + } + this->path = recombinator.gbz.index.extract(sequence_id); + this->insert(builder); + this->fragment++; + this->sequence_id = gbwt::invalid_sequence(); + this->position = gbwt::invalid_edge(); + this->path.clear(); +} + +void RecombinatorHaplotype::finish(const Recombinator& recombinator, gbwt::GBWTBuilder& builder) { + if (this->position == gbwt::invalid_edge()) { + throw std::runtime_error("Haplotype::finish(): there is no current position"); + } + this->suffix(recombinator.gbz.index); + this->insert(builder); + this->fragment++; + this->sequence_id = gbwt::invalid_sequence(); + this->position = gbwt::invalid_edge(); + this->path.clear(); +} + +void RecombinatorHaplotype::connect(gbwt::node_type until, const gbwtgraph::GBWTGraph& graph) { + handle_t curr = gbwtgraph::GBWTGraph::node_to_handle(this->position.first); + handle_t end = gbwtgraph::GBWTGraph::node_to_handle(until); + this->position = gbwt::invalid_edge(); + hash_set visited; + while (curr != end) { + if (visited.find(curr) != visited.end()) { + throw std::runtime_error("Haplotype::connect(): the path contains a cycle"); + } + visited.insert(curr); + handle_t 
successor = empty_gbwtgraph_handle(); + size_t successors = 0; + graph.follow_edges(curr, false, [&](const handle_t& next) { + successor = next; + successors++; + }); + if (successors != 1) { + throw std::runtime_error("Haplotype::connect(): the path is not unary"); + } + this->path.push_back(gbwtgraph::GBWTGraph::handle_to_node(successor)); + curr = successor; + } +} + +void RecombinatorHaplotype::prefix(gbwt::size_type sequence_id, gbwt::node_type until, const gbwt::GBWT& index) { + this->position = gbwt::invalid_edge(); + if (sequence_id >= index.sequences()) { + throw std::runtime_error("Haplotype::prefix(): invalid GBWT sequence id " + std::to_string(sequence_id)); + } + this->sequence_id = sequence_id; + for (gbwt::edge_type curr = index.start(sequence_id); curr.first != gbwt::ENDMARKER; curr = index.LF(curr)) { + this->path.push_back(curr.first); + if (curr.first == until) { + this->position = curr; + return; + } + } + throw std::runtime_error("Haplotype::prefix(): GBWT sequence " + std::to_string(sequence_id) + " did not reach GBWT node " + std::to_string(until)); +} + +void RecombinatorHaplotype::suffix(const gbwt::GBWT& index) { + for (gbwt::edge_type curr = index.LF(this->position); curr.first != gbwt::ENDMARKER; curr = index.LF(curr)) { + this->path.push_back(curr.first); + } +} + +void RecombinatorHaplotype::insert(gbwt::GBWTBuilder& builder) { + std::string sample_name = "recombination"; + gbwt::size_type sample_id = builder.index.metadata.sample(sample_name); + if (sample_id >= builder.index.metadata.samples()) { + builder.index.metadata.addSamples({ sample_name }); + } + + gbwt::size_type contig_id = builder.index.metadata.contig(this->contig_name); + if (contig_id >= builder.index.metadata.contigs()) { + builder.index.metadata.addContigs({ this->contig_name }); + } + + builder.index.metadata.addPath(sample_id, contig_id, this->id, this->fragment); + builder.insert(this->path, true); +} + +//------------------------------------------------------------------------------ + +void Recombinator::Statistics::combine(const Statistics& another) { + this->chains += another.chains; + this->subchains += another.subchains; + this->fragments += another.fragments; + this->full_haplotypes += another.full_haplotypes; + this->haplotypes = std::max(this->haplotypes, another.haplotypes); + this->connections += another.connections; + this->ref_paths += another.ref_paths; + this->kmers += another.kmers; + this->score += another.score; +} + +std::ostream& Recombinator::Statistics::print(std::ostream& out) const { + out << this->haplotypes << " haplotypes for " << this->chains << " chains (" + << this->full_haplotypes << " full, " << this->subchains << " subchains, " << this->fragments << " fragments)"; + if (this->subchains > 0) { + double connection_rate = static_cast(this->connections) / (this->subchains * this->haplotypes); + out << "; connection rate " << connection_rate; + } + if (this->ref_paths > 0) { + out << "; included " << this->ref_paths << " reference paths"; + } + if (this->kmers > 0) { + double average_score = this->score / (this->kmers * this->haplotypes); + out << "; used " << this->kmers << " kmers with average score " << average_score; + } + return out; +} + +//------------------------------------------------------------------------------ + +Recombinator::Recombinator(const gbwtgraph::GBZ& gbz, Verbosity verbosity) : + gbz(gbz), verbosity(verbosity) +{ +} + +//------------------------------------------------------------------------------ + +void add_path(const gbwt::GBWT& 
source, gbwt::size_type path_id, gbwt::GBWTBuilder& builder) { + // We know that sufficient metadata exists, because this is a cached path. + gbwt::PathName path_name = source.metadata.path(path_id); + + std::string sample_name = source.metadata.sample(path_name.sample); + path_name.sample = builder.index.metadata.sample(sample_name); + if (path_name.sample >= builder.index.metadata.samples()) { + builder.index.metadata.addSamples({ sample_name }); + } + + std::string contig_name = source.metadata.contig(path_name.contig); + path_name.contig = builder.index.metadata.contig(contig_name); + if (path_name.contig >= builder.index.metadata.contigs()) { + builder.index.metadata.addContigs({ contig_name }); + } + + builder.index.metadata.addPath(path_name); + + gbwt::vector_type path = source.extract(gbwt::Path::encode(path_id, false)); + builder.insert(path, true); +} + +//------------------------------------------------------------------------------ + +void recombinator_sanity_checks(const Recombinator::Parameters& parameters) { + if (parameters.num_haplotypes == 0) { + std::string msg = "recombinator_sanity_checks(): number of haplotypes cannot be 0"; + throw std::runtime_error(msg); + } + if (parameters.diploid_sampling && parameters.num_haplotypes < 2) { + std::string msg = "recombinator_sanity_checks(): diploid sampling requires at least 2 haplotypes"; + throw std::runtime_error(msg); + } + if (parameters.present_discount < 0.0 || parameters.present_discount > 1.0) { + std::string msg = "recombinator_sanity_checks(): present discount must be between 0.0 and 1.0"; + throw std::runtime_error(msg); + } + if (parameters.het_adjustment < 0.0) { + std::string msg = "recombinator_sanity_checks(): het adjustment must be non-negative"; + throw std::runtime_error(msg); + } + if (parameters.absent_score < 0.0) { + std::string msg = "recombinator_sanity_checks(): absent score must be non-negative"; + throw std::runtime_error(msg); + } +} + +double get_or_estimate_coverage( + const hash_map& counts, + const Recombinator::Parameters& parameters, + Haplotypes::Verbosity verbosity) { + if (parameters.coverage > 0) { + return parameters.coverage; + } + + double start = gbwt::readTimer(); + if (verbosity >= Haplotypes::verbosity_basic) { + std::cerr << "Estimating kmer coverage" << std::endl; + } + std::map count_to_frequency; + for (auto iter = counts.begin(); iter != counts.end(); ++iter) { + // We are only interested in kmers with multiple occurrences, as unique + // kmers are likely sequencing errors. + if (iter->second > 1) { + count_to_frequency[iter->second]++; + } + } + + // Use mode as the initial estimate for coverage. + auto statistics = summary_statistics(count_to_frequency); + double coverage = statistics.mode; + bool reliable = true; + if (verbosity >= Haplotypes::verbosity_detailed) { + std::cerr << "Coverage: median " << statistics.median + << ", mean " << statistics.mean + << ", stdev " << statistics.stdev + << ", mode " << statistics.mode; + } + + // If mode < median, try to find a secondary peak at ~2x mode and use + // it if it is good enough. 
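// Worked example (not part of the patch; the histogram values are made up):
// suppose the filtered count histogram has its mode at 16 with frequency 9000
// and a median of 30. Since mode < median, the code below scans counts 27
// through 36 (1.7x to 2.3x the mode) for a secondary peak. If count 31 is the
// best in that window with frequency 5200, then 31 >= median and 5200 >= 9000 / 2,
// so 31 is used as the coverage estimate; otherwise the initial estimate 16 is
// kept and reported as unreliable.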
+ if (statistics.mode < statistics.median) { + size_t low = 1.7 * statistics.mode, high = 2.3 * statistics.mode; + size_t peak = count_to_frequency[coverage]; + size_t best = low, secondary = count_to_frequency[low]; + for (size_t i = low + 1; i <= high; i++) { + if (count_to_frequency[i] > secondary) { + best = i; secondary = count_to_frequency[i]; + } + } + if (verbosity >= Haplotypes::verbosity_detailed) { + std::cerr << "; secondary peak at " << best; + } + if (best >= size_t(statistics.median) && secondary >= peak / 2) { + coverage = best; + } else { + reliable = false; + } + } + + if (verbosity >= Haplotypes::verbosity_detailed) { + std::cerr << "; using " << coverage << std::endl; + } + if (!reliable) { + std::cerr << "warning: Kmer coverage estimate is unreliable" << std::endl; + } + + if (verbosity >= Haplotypes::verbosity_basic) { + double seconds = gbwt::readTimer() - start; + std::cerr << "Estimated kmer coverage in " << seconds << " seconds" << std::endl; + } + return coverage; +} + +gbwt::GBWT Recombinator::generate_haplotypes(const Haplotypes& haplotypes, const std::string& kff_file, const Parameters& parameters) const { + + // Sanity checks (may throw). + recombinator_sanity_checks(parameters); + + // Get kmer counts (may throw) and determine coverage. + hash_map counts = haplotypes.kmer_counts(kff_file, this->verbosity); + double coverage = get_or_estimate_coverage(counts, parameters, this->verbosity); + + double start = gbwt::readTimer(); + if (this->verbosity >= Haplotypes::verbosity_basic) { + std::cerr << "Building GBWT" << std::endl; + } + + // Determine construction jobs. + std::vector> jobs(haplotypes.jobs()); + for (auto& chain : haplotypes.chains) { + if (chain.job_id < haplotypes.jobs()) { + jobs[chain.job_id].push_back(chain.offset); + } + } + + // Figure out GBWT path ids for reference paths in each job. + std::vector> reference_paths(haplotypes.jobs()); + if (parameters.include_reference) { + for (size_t i = 0; i < this->gbz.graph.named_paths.size(); i++) { + size_t job_id = haplotypes.jobs_for_cached_paths[i]; + if (job_id < haplotypes.jobs()) { + reference_paths[job_id].push_back(this->gbz.graph.named_paths[i].id); + } + } + } + + // Build partial indexes. + double checkpoint = gbwt::readTimer(); + if (this->verbosity >= Haplotypes::verbosity_basic) { + std::cerr << "Running " << omp_get_max_threads() << " GBWT construction jobs in parallel" << std::endl; + } + std::vector indexes(jobs.size()); + Statistics statistics; + #pragma omp parallel for schedule(dynamic, 1) + for (size_t job = 0; job < jobs.size(); job++) { + gbwt::GBWTBuilder builder(sdsl::bits::length(this->gbz.index.sigma() - 1), parameters.buffer_size); + builder.index.addMetadata(); + Statistics job_statistics; + // Add haplotypes for each chain. + for (auto chain_id : jobs[job]) { + try { + Statistics chain_statistics = this->generate_haplotypes(haplotypes.chains[chain_id], counts, builder, parameters, coverage); + job_statistics.combine(chain_statistics); + } catch (const std::runtime_error& e) { + std::cerr << "error: [job " << job << "]: " << e.what() << std::endl; + std::exit(EXIT_FAILURE); + } + } + // Add named and reference paths. 
+ for (auto path_id : reference_paths[job]) { + add_path(this->gbz.index, path_id, builder); + job_statistics.ref_paths++; + } + builder.finish(); + indexes[job] = builder.index; + #pragma omp critical + { + if (this->verbosity >= Haplotypes::verbosity_detailed) { + std::cerr << "Job " << job << ": "; job_statistics.print(std::cerr) << std::endl; + } + statistics.combine(job_statistics); + } + } + if (this->verbosity >= Haplotypes::verbosity_basic) { + double seconds = gbwt::readTimer() - checkpoint; + std::cerr << "Total: "; statistics.print(std::cerr) << std::endl; + std::cerr << "Finished the jobs in " << seconds << " seconds" << std::endl; + } + + // Merge the partial indexes. + checkpoint = gbwt::readTimer(); + if (this->verbosity >= Haplotypes::verbosity_basic) { + std::cerr << "Merging the partial indexes" << std::endl; + } + gbwt::GBWT merged(indexes); + if (parameters.include_reference) { + // If we included reference paths, set the same samples as references in the output GBWT. + std::string reference_samples = this->gbz.index.tags.get(gbwtgraph::REFERENCE_SAMPLE_LIST_GBWT_TAG); + if (!reference_samples.empty()) { + merged.tags.set(gbwtgraph::REFERENCE_SAMPLE_LIST_GBWT_TAG, reference_samples); + } + } + if (this->verbosity >= Haplotypes::verbosity_basic) { + double seconds = gbwt::readTimer() - checkpoint; + std::cerr << "Merged the indexes in " << seconds << " seconds" << std::endl; + } + + if (this->verbosity >= Haplotypes::verbosity_basic) { + double seconds = gbwt::readTimer() - start; + std::cerr << "Built the GBWT in " << seconds << " seconds" << std::endl; + } + return merged; +} + +//------------------------------------------------------------------------------ + +// Classify the kmers in the subchain according to the kmer counts and the +// coverage. Return a vector of kmer types and their initial scores. Update the +// statistics with the number of non-frequent kmers if necessary. +std::vector> classify_kmers( + const Haplotypes::Subchain& subchain, + const hash_map& kmer_counts, + double coverage, + Recombinator::Statistics* statistics, + const Recombinator::Parameters& parameters +) { + // TODO: What are the proper thresholds? + double absent_threshold = coverage * 0.1; + double heterozygous_threshold = coverage / std::log(4.0); + double homozygous_threshold = coverage * 2.5; + + // TODO: -log prob may be the right score once we have enough haplotypes, but + // right now +1 works better, because we don't have haplotypes with the right + // combination of rare kmers. + // Determine the type of each kmer in the sample and the score for the kmer. + // A haplotype with the kmer gets +1.0 * score, while a haplotype without it + // gets -1.0 * score. + std::vector> kmer_types; + size_t selected_kmers = 0; + for (size_t kmer_id = 0; kmer_id < subchain.kmers.size(); kmer_id++) { + double count = kmer_counts.at(subchain.kmers[kmer_id].first); + if (count < absent_threshold) { + kmer_types.push_back({ Recombinator::absent, -1.0 * parameters.absent_score }); + selected_kmers++; + } else if (count < heterozygous_threshold) { + kmer_types.push_back({ Recombinator::heterozygous, 0.0 }); + selected_kmers++; + } else if (count < homozygous_threshold) { + kmer_types.push_back({ Recombinator::present, 1.0 }); + selected_kmers++; + } else { + kmer_types.push_back({ Recombinator::frequent, 0.0 }); + } + } + if (statistics != nullptr) { + statistics->kmers += selected_kmers; + } + + return kmer_types; +} + +// Select the best pair of haplotypes from the candidates. 
Each haplotype gets +// +1 for getting a kmer right and -1 for getting it wrong. +std::vector> select_diploid( + const Haplotypes::Subchain& subchain, + const std::vector>& candidates, + const std::vector>& kmer_types +) { + std::int64_t best_score = std::numeric_limits::min(); + size_t best_left = 0, best_right = 1; + + for (size_t left = 0; left < candidates.size(); left++) { + size_t left_offset = candidates[left].first * subchain.kmers.size(); + for (size_t right = left + 1; right < candidates.size(); right++) { + std::int64_t score = 0; + size_t right_offset = candidates[right].first * subchain.kmers.size(); + for (size_t kmer_id = 0; kmer_id < subchain.kmers.size(); kmer_id++) { + int64_t found = subchain.kmers_present[left_offset + kmer_id] + subchain.kmers_present[right_offset + kmer_id]; + switch (kmer_types[kmer_id].first) { + case Recombinator::absent: + score += 1 - found; // +1 for 0, 0 for 1, -1 for 2 + break; + case Recombinator::heterozygous: + score += (found == 1 ? 1 : 0); + break; + case Recombinator::present: + score += found - 1; // -1 for 0, 0 for 1, +1 for 2 + break; + default: + break; + } + } + if (score > best_score) { + best_score = score; + best_left = left; + best_right = right; + } + } + } + + return { candidates[best_left], candidates[best_right] }; +} + +// Returns (sequence offset in the subchain, score). +// Updates statistics with the number of kmers used if provided. +// Updates the local haplotypes with scores and ranks in each round of selection +// if provided. +std::vector> select_haplotypes( + const Haplotypes::Subchain& subchain, + const hash_map& kmer_counts, + double coverage, + Recombinator::Statistics* statistics, + std::vector* local_haplotypes, + const Recombinator::Parameters& parameters +) { + // Classify the kmers. + std::vector> kmer_types = classify_kmers(subchain, kmer_counts, coverage, statistics, parameters); + + // Select the haplotypes greedily. + std::vector> selected_haplotypes; + std::vector> remaining_haplotypes; + for (size_t seq_offset = 0; seq_offset < subchain.sequences.size(); seq_offset++) { + remaining_haplotypes.push_back( { seq_offset, 0.0 }); + } + while (selected_haplotypes.size() < parameters.num_haplotypes && !remaining_haplotypes.empty()) { + // Score the remaining haplotypes. + for (size_t i = 0; i < remaining_haplotypes.size(); i++) { + size_t offset = remaining_haplotypes[i].first * subchain.kmers.size(); + double score = 0.0; + for (size_t kmer_id = 0; kmer_id < subchain.kmers.size(); kmer_id++) { + double multiplier = -1.0 + 2.0 * subchain.kmers_present[offset + kmer_id]; + score += multiplier * kmer_types[kmer_id].second; + } + remaining_haplotypes[i].second = score; + } + + // Report ranks and scores for each remaining haplotype. + if (local_haplotypes != nullptr) { + auto copy = remaining_haplotypes; + std::sort( + copy.begin(), copy.end(), + [](std::pair a, std::pair b) -> bool { + return (a.second > b.second); + } + ); + for (size_t i = 0; i < copy.size(); i++) { + size_t sequence_id = copy[i].first; + double score = copy[i].second; + (*local_haplotypes)[sequence_id].scores.emplace_back(i, score); + } + } + + // Select the highest-scoring haplotype. 
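// Worked example (not part of the patch; the coverage and counts are made up):
// with estimated coverage 30, classify_kmers() above uses the thresholds
// absent < 3.0, heterozygous < 30 / ln(4) ~ 21.6, present < 75.0, and frequent
// otherwise. A kmer with count 2 starts with score -absent_score = -0.8, count
// 40 gives +1.0, and counts 15 or 100 give 0.0. In the scoring loop above, a
// haplotype containing a kmer then receives +score and a haplotype lacking it
// receives -score, via multiplier = -1.0 + 2.0 * kmers_present[...].
//
//   // Hypothetical helper mirroring the classification above.
//   #include <cmath>
//   double initial_score(double count, double coverage, double absent_score) {
//       if (count < coverage * 0.1) { return -absent_score; }   // absent
//       if (count < coverage / std::log(4.0)) { return 0.0; }   // heterozygous
//       if (count < coverage * 2.5) { return 1.0; }             // present
//       return 0.0;                                             // frequent
//   }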
+ size_t selected = 0; + for (size_t i = 1; i < remaining_haplotypes.size(); i++) { + if (remaining_haplotypes[i].second > remaining_haplotypes[selected].second) { + selected = i; + } + } + selected_haplotypes.push_back(remaining_haplotypes[selected]); + remaining_haplotypes.erase(remaining_haplotypes.begin() + selected); + + // Adjust kmer scores based on the selected haplotype. + size_t offset = selected_haplotypes.back().first * subchain.kmers.size(); + for (size_t kmer_id = 0; kmer_id < subchain.kmers.size(); kmer_id++) { + switch (kmer_types[kmer_id].first) { + case Recombinator::heterozygous: + kmer_types[kmer_id].second += (subchain.kmers_present[offset + kmer_id] ? -1.0 : 1.0) * parameters.het_adjustment; + break; + case Recombinator::present: + if (subchain.kmers_present[offset + kmer_id]) { + kmer_types[kmer_id].second *= parameters.present_discount; + } + break; + default: + break; + } + } + } + + // If we did not have enough haplotypes in the subchain, repeat them as necessary. + size_t original_selected = selected_haplotypes.size(); + for (size_t i = original_selected; i < parameters.num_haplotypes; i++) { + auto next = selected_haplotypes[i % original_selected]; + selected_haplotypes.push_back(next); + } + + // Do diploid sampling if necessary. + if (parameters.diploid_sampling) { + return select_diploid(subchain, selected_haplotypes, kmer_types); + } + + return selected_haplotypes; +} + +Recombinator::Statistics Recombinator::generate_haplotypes(const Haplotypes::TopLevelChain& chain, + const hash_map& kmer_counts, + gbwt::GBWTBuilder& builder, + const Parameters& parameters, + double coverage +) const { + size_t final_haplotypes = (parameters.diploid_sampling ? 2 : parameters.num_haplotypes); + std::vector haplotypes; + for (size_t i = 0; i < final_haplotypes; i++) { + haplotypes.push_back({ chain.contig_name, i + 1, 0, gbwt::invalid_sequence(), gbwt::invalid_edge(), {} }); + } + + Statistics statistics; + statistics.chains = 1; statistics.haplotypes = haplotypes.size(); + if (chain.subchains.size() == 1 && chain.subchains.front().type == Haplotypes::Subchain::full_haplotype) { + // Full haplotypes should all be identical, because there are no snarls. + // Therefore we do not need kmers. + auto& subchain = chain.subchains.front(); + for (size_t haplotype = 0; haplotype < haplotypes.size(); haplotype++) { + assert(!subchain.sequences.empty()); + size_t seq = haplotype % subchain.sequences.size(); + haplotypes[haplotype].take(subchain.sequences[seq].first, *this, builder); + } + statistics.full_haplotypes = 1; + } else { + bool have_haplotypes = false; + for (auto& subchain : chain.subchains) { + if (subchain.type == Haplotypes::Subchain::full_haplotype) { + throw std::runtime_error("Recombinator::generate_haplotypes(): nontrivial chain " + std::to_string(chain.offset) + " contains a subchain with full haplotypes"); + } + assert(!subchain.sequences.empty()); + + // Select the haplotypes greedily. + std::vector> selected_haplotypes = select_haplotypes( + subchain, kmer_counts, coverage, &statistics, nullptr, parameters + ); + + // Try to match the existing haplotypes with the selected sequences based on + // GBWT sequence id. + // TODO: There are often many equally good haplotypes, and because we choose + // a different subset of them in each subchain, we often cannot connect the + // same haplotype from subchain to subchain, even when it would be one of the + // top ones in each of them. 
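// Worked example (not part of the patch): with the default parameters
// present_discount = 0.9 and het_adjustment = 0.05, a present kmer carried by
// the haplotype just selected drops from score 1.0 to 0.9, and to 0.81 if a
// second selected haplotype also carries it, while a heterozygous kmer shifts
// by -0.05 when the selected haplotype carries it and by +0.05 when it does
// not. Later rounds are therefore nudged toward haplotypes that cover kmers
// not yet represented in the selected set.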
+ std::vector haplotype_to_selected(haplotypes.size(), haplotypes.size()); + sdsl::bit_vector selected_in_use(haplotypes.size(), 0); + for (size_t haplotype = 0; haplotype < haplotypes.size(); haplotype++) { + for (size_t selected = 0; selected < haplotypes.size(); selected++) { + if (subchain.sequences[selected_haplotypes[selected].first].first == haplotypes[haplotype].sequence_id && !selected_in_use[selected]) { + haplotype_to_selected[haplotype] = selected; + selected_in_use[selected] = 1; + statistics.connections++; + break; + } + } + } + for (size_t haplotype = 0, selected = 0; haplotype < haplotypes.size(); haplotype++) { + if (haplotype_to_selected[haplotype] < haplotypes.size()) { + continue; + } + while (selected < haplotypes.size() && selected_in_use[selected]) { + selected++; + } + assert(selected < haplotypes.size()); + haplotype_to_selected[haplotype] = selected; + selected_in_use[selected] = 1; + selected++; + } + + // Finally extend the haplotypes with the selected and matched sequences. + for (size_t haplotype = 0; haplotype < haplotypes.size(); haplotype++) { + size_t selected = haplotype_to_selected[haplotype]; + size_t seq_offset = selected_haplotypes[selected].first; + statistics.score += selected_haplotypes[selected].second; + haplotypes[haplotype].extend(subchain.sequences[seq_offset], subchain, *this, builder); + } + have_haplotypes = subchain.has_end(); + statistics.subchains++; + } + if (have_haplotypes) { + for (size_t haplotype = 0; haplotype < haplotypes.size(); haplotype++) { + haplotypes[haplotype].finish(*this, builder); + } + } + statistics.fragments = haplotypes.front().fragment; + } + + return statistics; +} + +//------------------------------------------------------------------------------ + +std::vector Recombinator::extract_sequences( + const Haplotypes& haplotypes, const std::string& kff_file, + size_t chain_id, size_t subchain_id, const Parameters& parameters +) const { + // Sanity checks. + if (chain_id >= haplotypes.chains.size()) { + std::string msg = "Recombinator::extract_sequences(): invalid chain id " + std::to_string(chain_id); + throw std::runtime_error(msg); + } + if (subchain_id >= haplotypes.chains[chain_id].subchains.size()) { + std::string msg = "Recombinator::extract_sequences(): invalid subchain id " + std::to_string(subchain_id) + + " in chain " + std::to_string(chain_id); + throw std::runtime_error(msg); + } + recombinator_sanity_checks(parameters); + + // Extract the haplotypes. + const Haplotypes::Subchain& subchain = haplotypes.chains[chain_id].subchains[subchain_id]; + std::vector result(subchain.sequences.size()); + for (size_t i = 0; i < subchain.sequences.size(); i++) { + size_t path_id = gbwt::Path::id(subchain.sequences[i].first); + path_handle_t path_handle = this->gbz.graph.path_to_handle(path_id); + result[i].name = this->gbz.graph.get_path_name(path_handle); + + gbwt::edge_type pos; + if (subchain.has_start()) { + pos = gbwt::edge_type(subchain.start, subchain.sequences[i].second); + } else { + pos = this->gbz.index.start(subchain.sequences[i].first); + } + handle_t until = gbwtgraph::GBWTGraph::node_to_handle(subchain.end); + size_t limit = std::numeric_limits::max(); + result[i].sequence = generate_haplotype(pos, until, limit, limit, this->gbz.graph); + } + + // Get kmer counts (may throw) and determine coverage. + hash_map counts = haplotypes.kmer_counts(kff_file, this->verbosity); + double coverage = get_or_estimate_coverage(counts, parameters, this->verbosity); + + // Fill in the scores. 
+ select_haplotypes(subchain, counts, coverage, nullptr, &result, parameters); + + return result; +} + +//------------------------------------------------------------------------------ + +} // namespace vg diff --git a/src/recombinator.hpp b/src/recombinator.hpp new file mode 100644 index 00000000000..d9ad6e781a7 --- /dev/null +++ b/src/recombinator.hpp @@ -0,0 +1,528 @@ +#ifndef VG_RECOMBINATOR_HPP_INCLUDED +#define VG_RECOMBINATOR_HPP_INCLUDED + +/** \file + * Tools for generating synthetic haplotypes as recombinations of existing + * haplotypes. + */ + +#include "gbwt_helper.hpp" +#include "gbwtgraph_helper.hpp" +#include "hash_map.hpp" +#include "snarl_distance_index.hpp" + +#include + +#include + +namespace vg { + +//------------------------------------------------------------------------------ + +/** + * A representation of the haplotypes in a graph. + * + * The graph is partitioned into top-level chains, which are further partitioned + * into subchains. Each subchain contains a set of kmers and a collection of + * sequences. Each sequence is defined by a bitvector marking the kmers that are + * present. + * + * At the moment, the kmers are minimizers with a single occurrence in the graph. + * The requirement is that each kmer is specific to a single subchain and does + * not occur anywhere else in either orientation. (If no haplotype crosses a + * snarl, that snarl is broken into a suffix and a prefix, and those subchains + * may share kmers.) + * + * NOTE: This assumes that the top-level chains are linear, not cyclical. + * + * Versions: + * + * * Version 2: Top-level chains include a contig name. Compatible with version 1. + * + * * Version 1: Initial version. + */ +class Haplotypes { +public: + /// The amount of progress information that should be printed to stderr. + enum Verbosity : size_t { + /// No progress information. + verbosity_silent = 0, + + /// Basic information. + verbosity_basic = 1, + + /// Basic information and detailed statistics. + verbosity_detailed = 2, + + /// Basic information, detailed statistics, and debug information. + verbosity_debug = 3 + }; + + /// Header of the serialized file. + struct Header { + constexpr static std::uint32_t MAGIC_NUMBER = 0x4C504148; // "HAPL" + constexpr static std::uint32_t VERSION = 2; + constexpr static std::uint32_t MIN_VERSION = 1; + constexpr static std::uint64_t DEFAULT_K = 29; + + /// A magic number that identifies the file. + std::uint32_t magic_number = MAGIC_NUMBER; + + /// Version of the file. + std::uint32_t version = VERSION; + + /// Number of top-level chains in the graph. + std::uint64_t top_level_chains = 0; + + /// Number of GBWT construction jobs for the chains. + std::uint64_t construction_jobs = 0; + + /// Total number of subchains in all chains. + std::uint64_t total_subchains = 0; + + /// Total number of kmers in all subchains. + std::uint64_t total_kmers = 0; + + /// Length of the kmers. + std::uint64_t k = DEFAULT_K; + }; + + /// A GBWT sequence as (sequence identifier, offset in a node). + typedef std::pair sequence_type; + + /// Representation of a subchain. + struct Subchain { + /// Subchain types. + enum subchain_t : std::uint64_t { + /// Normal subchain with two boundary nodes. + normal = 0, + + /// A prefix with only an end node. + prefix = 1, + + /// A suffix with only a start node. + suffix = 2, + + /// A full haplotype with no boundary nodes. + full_haplotype = 3 + }; + + /// An encoded kmer. + typedef gbwtgraph::Key64::value_type kmer_type; + + /// The type of this subchain. 
+ subchain_t type; + + /// Boundary nodes, or `gbwt::ENDMARKER` if not present. + gbwt::node_type start, end; + + /// A vector of distinct kmers. For each kmer, list the kmer itself and the number + /// of haplotypes it appears in. + std::vector> kmers; + + // TODO: This could be smaller + /// Sequences as (GBWT sequence id, offset in the relevant node). + std::vector sequences; + + // TODO: This needs to be compressed for larger datasets. + sdsl::bit_vector kmers_present; + + /// Returns the start node as a GBWTGraph handle. + handle_t start_handle() const { return gbwtgraph::GBWTGraph::node_to_handle(this->start); } + + /// Returns the end node as a GBWTGraph handle. + handle_t end_handle() const { return gbwtgraph::GBWTGraph::node_to_handle(this->end); } + + /// Returns `true` if the subchain has a start node. + bool has_start() const { return (this->type == normal || this->type == suffix); } + + /// Returns `true` if the subchain has an end node. + bool has_end() const { return (this->type == normal || this->type == prefix); } + + /// Returns a string representation of the type and the boundary nodes. + std::string to_string() const; + + /// Serializes the object to a stream in the simple-sds format. + void simple_sds_serialize(std::ostream& out) const; + + /// Loads the object from a stream in the simple-sds format. + void simple_sds_load(std::istream& in); + + /// Returns the size of the object in elements. + size_t simple_sds_size() const; + }; + + /// Representation of a top-level chain. + struct TopLevelChain { + /// Offset in the child list of the root snarl. + size_t offset; + + /// GBWT construction job for this chain. + size_t job_id; + + /// Contig name corresponding to the chain. + std::string contig_name; + + /// Subchains in the order they appear in. + std::vector subchains; + + /// Serializes the object to a stream in the simple-sds format. + void simple_sds_serialize(std::ostream& out) const; + + /// Loads the object from a stream in the simple-sds format. + void simple_sds_load(std::istream& in); + + /// Loads the old version without a contig name. + void load_old(std::istream& in); + + /// Returns the size of the object in elements. + size_t simple_sds_size() const; + }; + + /// Returns the number of weakly connected components. + size_t components() const { return this->header.top_level_chains; } + + /// Returns the number of GBWT construction jobs. + size_t jobs() const { return this->header.construction_jobs; } + + /// Returns the length of the kmers. + size_t k() const { return this->header.k; } + + Header header; + + // Job ids for each cached path in the GBWTGraph, or `jobs()` if the path is empty. + std::vector jobs_for_cached_paths; + + std::vector chains; + + /** + * Returns a mapping from kmers to their counts in the given KFF file. + * The counts include both the kmer and the reverse complement. + * + * Reads the KFF file using OpenMP threads. Exits with `std::exit()` if + * the file cannot be opened and throws `std::runtime_error` if the kmer + * counts cannot be used. + */ + hash_map kmer_counts(const std::string& kff_file, Verbosity verbosity) const; + + /// Serializes the object to a stream in the simple-sds format. + void simple_sds_serialize(std::ostream& out) const; + + /// Loads the object from a stream in the simple-sds format. + void simple_sds_load(std::istream& in); + + /// Returns the size of the object in elements. 
+ size_t simple_sds_size() const; +}; + +//------------------------------------------------------------------------------ + +/** + * A tool for transforming the haplotypes in a GBWT index into a `Haplotypes` + * representation. Requires a GBZ graph, an r-index, a distance index, and a + * minimizer index. + */ +class HaplotypePartitioner { +public: + /// Target length of a subchain. + constexpr static size_t SUBCHAIN_LENGTH = 10000; + + /// Approximate number of construction jobs to be created. + constexpr static size_t APPROXIMATE_JOBS = 32; + + /// The amount of progress information that should be printed to stderr. + typedef Haplotypes::Verbosity Verbosity; + + /// A GBWT sequence as (sequence identifier, offset in a node). + typedef Haplotypes::sequence_type sequence_type; + + /// An encoded kmer. + typedef Haplotypes::Subchain::kmer_type kmer_type; + + /// Minimizer index without payloads. + typedef gbwtgraph::MinimizerIndex minimizer_index_type; + + /** + * A subchain is a substring of a top-level chain defined by at most two + * boundary nodes. + * + * Normal subchains have two boundary nodes, which are assumed to be the + * start node of a snarl and the end node of a possibly different snarl. + * There are assumed to be haplotypes crossing the subchain. Prefixes and + * suffixes lack one of the boundary nodes, while full haplotypes lack + * both. + * + * When a top-level chain is partitioned into subchains, the boundary nodes + * may either overlap or be connected by unary paths. If a snarl is not + * connected, it may be presented as a suffix and a prefix. + */ + struct Subchain { + /// The type of this subchain. + Haplotypes::Subchain::subchain_t type; + + /// Start node. + handle_t start; + + /// End node. + handle_t end; + + /// Returns `true` if the subchain has a start node. + bool has_start() const { return (this->type == Haplotypes::Subchain::normal || this->type == Haplotypes::Subchain::suffix); } + + /// Returns `true` if the subchain has an end node. + bool has_end() const { return (this->type == Haplotypes::Subchain::normal || this->type == Haplotypes::Subchain::prefix); } + }; + + /// Creates a new `HaplotypePartitioner` using the given indexes. + HaplotypePartitioner(const gbwtgraph::GBZ& gbz, + const gbwt::FastLocate& r_index, + const SnarlDistanceIndex& distance_index, + const minimizer_index_type& minimizer_index, + Verbosity verbosity); + + /// Parameters for `partition_haplotypes()`. + struct Parameters { + /// Target length for subchains (in bp). + size_t subchain_length = SUBCHAIN_LENGTH; + + /// Generate approximately this many jobs. + size_t approximate_jobs = APPROXIMATE_JOBS; + }; + + /** + * Creates a `Haplotypes` representation of the haplotypes in the GBWT index. + * + * Top-level chains (weakly connected components in the graph) are assigned to + * a number of jobs that can be later used as GBWT construction jobs. Multiple + * jobs are run in parallel using OpenMP threads. + * + * Each top-level chain is partitioned into subchains that consist of one or + * more snarls. Multiple snarls are combined into the same subchain if the + * minimum distance over the subchain is at most the target length and there + * are GBWT haplotypes that cross the subchain. If there are no snarls in a + * top-level chain, it is represented as a single subchain without boundary + * nodes. + * + * Haplotypes crossing each subchain are represented using minimizers with a + * single occurrence in the graph. 
+ * + * Throws `std::runtime_error` on error in single-threaded parts and exits + * with `std::exit(EXIT_FAILURE)` in multi-threaded parts. + */ + Haplotypes partition_haplotypes(const Parameters& parameters) const; + + const gbwtgraph::GBZ& gbz; + const gbwt::FastLocate& r_index; + const SnarlDistanceIndex& distance_index; + const minimizer_index_type& minimizer_index; + + Verbosity verbosity; + +private: + // Return the minimum distance from the last base of `from` to the first base of `to`. + size_t get_distance(handle_t from, handle_t to) const; + + // Partition the top-level chain into subchains. + std::vector get_subchains(const gbwtgraph::TopLevelChain& chain, const Parameters& parameters) const; + + // Return (SA[i], i) for all GBWT sequences visiting a handle, sorted by sequence id + // and the number of the visit. + std::vector get_sequence_visits(handle_t handle) const; + + // Return (DA[i], i) for all GBWT sequences visiting a handle, sorted by sequence id. + std::vector get_sequences(handle_t handle) const; + + // Get all GBWT sequences crossing the subchain. The sequences will be at + // start for normal subchains and suffixes and at end for prefixes. + std::vector get_sequences(Subchain subchain) const; + + // Return the sorted set of kmers that are minimizers in the sequence and have + // a single occurrence in the graph. + std::vector unique_minimizers(gbwt::size_type sequence_id) const; + + // Count the number of minimizers in the sequence over the subchain with a single + // occurrence in the graph. Return the sorted set of kmers that are minimizers in + // the sequence over the subchain and have a single occurrence in the graph. + // + // To avoid using kmers shared between all haplotypes in the subchain, and + // potentially with neighboring subchains, this does not include kmers contained + // entirely in the shared initial/final nodes. + std::vector unique_minimizers(sequence_type sequence, Subchain subchain) const; + + // Build subchains for a specific top-level chain. + void build_subchains(const gbwtgraph::TopLevelChain& chain, Haplotypes::TopLevelChain& output, const Parameters& parameters) const; +}; + +//------------------------------------------------------------------------------ + +/** + * A class that creates synthetic haplotypes from a `Haplotypes` representation of + * local haplotypes. + */ +class Recombinator { +public: + /// Number of haplotypes to be generated. + constexpr static size_t NUM_HAPLOTYPES = 4; + + /// Expected kmer coverage. Use 0 to estimate from kmer counts. + constexpr static size_t COVERAGE = 0; + + /// Block size (in kmers) for reading KFF files. + constexpr static size_t KFF_BLOCK_SIZE = 1000000; + + /// Multiplier to the score of a present kmer every time a haplotype with that + /// kmer is selected. + constexpr static double PRESENT_DISCOUNT = 0.9; + + /// Adjustment to the score of a heterozygous kmer every time a haplotype with + /// (-) or without (+) that kmer is selected. + constexpr static double HET_ADJUSTMENT = 0.05; + + /// Score for getting an absent kmer right/wrong. This should be less than 1, if + /// we assume that having the right variants in the graph is more important than + /// keeping wrong variants out. + constexpr static double ABSENT_SCORE = 0.8; + + /// The amount of progress information that should be printed to stderr. + typedef Haplotypes::Verbosity Verbosity; + + /// A GBWT sequence as (sequence identifier, offset in a node). 
+ typedef Haplotypes::sequence_type sequence_type; + + /// Statistics on the generated haplotypes. + struct Statistics { + /// Number of top-level chains. + size_t chains = 0; + + /// Number of subchains. + size_t subchains = 0; + + /// Number of fragments. + size_t fragments = 0; + + /// Number of top-level chains where full haplotypes were taken. + size_t full_haplotypes = 0; + + /// Number of haplotypes generated. + size_t haplotypes = 0; + + /// Number of times a haplotype was extended from a subchain to the next subchain. + size_t connections = 0; + + /// Number of reference paths included. + size_t ref_paths = 0; + + /// Number of kmers selected. + size_t kmers = 0; + + /// Total score for selected sequences. + double score = 0.0; + + /// Combines the statistics into this object. + void combine(const Statistics& another); + + /// Prints the statistics and returns the output stream. + std::ostream& print(std::ostream& out) const; + }; + + /// Creates a new `Recombinator`. + Recombinator(const gbwtgraph::GBZ& gbz, Verbosity verbosity); + + /// Parameters for `generate_haplotypes()`. + struct Parameters { + /// Number of haplotypes to be generated. + size_t num_haplotypes = NUM_HAPLOTYPES; + + /// Kmer coverage. Use 0 to estimate from kmer counts. + size_t coverage = COVERAGE; + + /// Buffer size (in nodes) for GBWT construction. + gbwt::size_type buffer_size = gbwt::DynamicGBWT::INSERT_BATCH_SIZE; + + /// Multiplicative factor for discounting the scores for present kmers after + /// selecting a haplotype with that kmer. + double present_discount = PRESENT_DISCOUNT; + + /// Additive term for adjusting the scores for heterozygous kmers after + /// each haplotype to encourage even sampling of haplotypes with and without + /// that kmer. + double het_adjustment = HET_ADJUSTMENT; + + /// Score for absent kmers. This should be less than 1 if we assume that + /// having the right variants in the graph is more important than keeping + /// the wrong variants out. + double absent_score = ABSENT_SCORE; + + /// After selecting the initial `num_haplotypes` haplotypes, choose the + /// highest-scoring pair out of them. + bool diploid_sampling = false; + + /// Include named and reference paths. + bool include_reference = false; + }; + + /** + * Generates haplotypes based on the given `Haplotypes` representation and + * the kmer counts in the given KFF file. + * + * Runs multiple GBWT construction jobs in parallel using OpenMP threads and + * generates the specified number of haplotypes in each top-level chain + * (component). + * + * Each generated haplotype has a single source haplotype in each subchain. + * The subchains are connected by unary paths. Suffix / prefix subchains in + * the middle of a chain create fragment breaks. If the chain starts without + * a prefix (ends without a suffix), the haplotype chosen for the first (last) + * subchain is used from the start (continued until the end). + * + * Throws `std::runtime_error` on error in single-threaded parts and exits + * with `std::exit(EXIT_FAILURE)` in multi-threaded parts. + */ + gbwt::GBWT generate_haplotypes(const Haplotypes& haplotypes, const std::string& kff_file, const Parameters& parameters) const; + + /// A local haplotype sequence within a single subchain. + struct LocalHaplotype { + /// Name of the haplotype. + std::string name; + + /// Sequence in forward orientation. + std::string sequence; + + /// (rank, score) in each round of haplotype selection this haplotype + /// participates in. 
+ std::vector> scores; + }; + + /// Kmer classification. + enum kmer_presence { absent, heterozygous, present, frequent }; + + /** + * Extracts the local haplotypes in the given subchain. In addition to the + * haplotype sequence, this also reports the name of the corresponding path + * as well as (rank, score) for the haplotype in each round of haplotype + * selection. The number of rounds is `parameters.num_haplotypes`, but if + * the haplotype is selected earlier, it will not get further scores. + * + * Throws `std::runtime_error` on error. + */ + std::vector extract_sequences( + const Haplotypes& haplotypes, const std::string& kff_file, + size_t chain_id, size_t subchain_id, const Parameters& parameters + ) const; + + const gbwtgraph::GBZ& gbz; + Verbosity verbosity; + +private: + // Generate haplotypes for the given chain. + Statistics generate_haplotypes(const Haplotypes::TopLevelChain& chain, + const hash_map& kmer_counts, + gbwt::GBWTBuilder& builder, + const Parameters& parameters, double coverage) const; +}; + +//------------------------------------------------------------------------------ + +} // namespace vg + +#endif // VG_RECOMBINATOR_HPP_INCLUDED diff --git a/src/region.cpp b/src/region.cpp index 250fc9eb876..2d899704733 100644 --- a/src/region.cpp +++ b/src/region.cpp @@ -8,25 +8,26 @@ namespace vg { void parse_region(const string& target, string& name, int64_t& start, int64_t& end) { start = -1; end = -1; - size_t foundFirstColon = target.find(":"); + size_t foundLastColon = target.rfind(":"); // we only have a single string, use the whole sequence as the target - if (foundFirstColon == string::npos) { + if (foundLastColon == string::npos) { name = target; } else { - name = target.substr(0, foundFirstColon); - size_t foundRangeDash = target.find("-", foundFirstColon); + name = target.substr(0, foundLastColon); + size_t foundRangeDash = target.find("-", foundLastColon); if (foundRangeDash == string::npos) { - start = atoi(target.substr(foundFirstColon + 1).c_str()); + start = atoi(target.substr(foundLastColon + 1).c_str()); end = start; } else { - start = atoi(target.substr(foundFirstColon + 1, foundRangeDash - foundRangeDash - 1).c_str()); + start = atoi(target.substr(foundLastColon + 1, foundRangeDash - foundRangeDash - 1).c_str()); end = atoi(target.substr(foundRangeDash + 1).c_str()); } } } void parse_bed_regions(const string& bed_path, - vector& out_regions) { + vector& out_regions, + vector* out_names) { out_regions.clear(); ifstream bedstream(bed_path); if (!bedstream) { @@ -36,6 +37,7 @@ void parse_bed_regions(const string& bed_path, string row; string sbuf; string ebuf; + string nbuf; for (int line = 1; getline(bedstream, row); ++line) { Region region; if (row.size() < 2 || row[0] == '#') { @@ -44,7 +46,8 @@ void parse_bed_regions(const string& bed_path, istringstream ss(row); if (!getline(ss, region.seq, '\t') || !getline(ss, sbuf, '\t') || - !getline(ss, ebuf, '\t')) { + !getline(ss, ebuf, '\t') || + (out_names != nullptr && !getline(ss, nbuf, '\t'))) { cerr << "Error parsing bed line " << line << ": " << row << endl; } else { region.start = std::stoi(sbuf); @@ -55,6 +58,10 @@ void parse_bed_regions(const string& bed_path, region.end -= 1; out_regions.push_back(region); + + if (out_names != nullptr) { + out_names->push_back(nbuf); + } } } } diff --git a/src/region.hpp b/src/region.hpp index 6111a8f73dd..a7052eba651 100644 --- a/src/region.hpp +++ b/src/region.hpp @@ -4,7 +4,6 @@ #include #include #include -#include "xg.hpp" namespace vg { @@ -15,8 +14,8 @@ using
namespace std; // Generally regions parsed form user input will be 1-based. struct Region { string seq; - int64_t start; - int64_t end; + int64_t start = -1; + int64_t end = -1; }; // Parse a genomic contig[:start-end] region. Outputs -1 for missing start or end. @@ -37,7 +36,8 @@ inline void parse_region(string& region, // So bedline "chr1 5 10" will return start=5 stop=9 void parse_bed_regions( const string& bed_path, - vector& out_regions); + vector& out_regions, + vector* out_names = nullptr); } diff --git a/src/region_expander.cpp b/src/region_expander.cpp new file mode 100644 index 00000000000..e1f2885de26 --- /dev/null +++ b/src/region_expander.cpp @@ -0,0 +1,162 @@ +#include "region_expander.hpp" + +namespace vg { + + RegionExpander::RegionExpander(const PathPositionHandleGraph* graph, const SnarlManager* snarl_manager) : + graph(graph), snarl_manager(snarl_manager) + { + // Nothing to do + } + + map, pair> RegionExpander::expanded_subgraph(const GFFRecord& gff_record) { + + map, pair> return_val; + + vector> interval_subpath; + + assert(gff_record.start != -1 && gff_record.end != -1 && gff_record.start <= gff_record.end); + + if (!graph->has_path(gff_record.sequence_id)) { + cerr << "error [RegionExpander] cannot expand genomic interval, graph does not contain path with name: " << gff_record.sequence_id << endl; + exit(1); + } + + path_handle_t path_handle = graph->get_path_handle(gff_record.sequence_id); + + // walk along the path for the interval and add the corresponding nodes to the subgraph + + step_handle_t step = graph->get_step_at_position(path_handle, gff_record.start); + handle_t handle = graph->get_handle_of_step(step); + id_t node_id = graph->get_id(handle); + bool is_rev = graph->get_is_reverse(handle); + size_t node_length = graph->get_length(handle); + + size_t at_pos = graph->get_position_of_step(step); + + interval_subpath.emplace_back(node_id, is_rev); + return_val[make_pair(node_id, is_rev)] = pair(gff_record.start - at_pos, + node_length); + at_pos += node_length; + + while (at_pos <= gff_record.end) { + step = graph->get_next_step(step); + handle = graph->get_handle_of_step(step); + node_id = graph->get_id(handle); + is_rev = graph->get_is_reverse(handle); + node_length = graph->get_length(handle); + + interval_subpath.emplace_back(node_id, is_rev); + return_val[make_pair(node_id, is_rev)] = pair(0, node_length); + at_pos += node_length; + } + + return_val[make_pair(node_id, is_rev)].second = gff_record.end - (at_pos - node_length) + 1; + + // walk along the path and identify snarls that have both ends on the path + + unordered_set entered_snarls; + unordered_set completed_snarls; + + for (size_t i = 0; i + 1 < interval_subpath.size(); i++) { + const Snarl* snarl_out = snarl_manager->into_which_snarl(interval_subpath[i].first, + !interval_subpath[i].second); + + if (snarl_out) { + if (entered_snarls.count(snarl_out)) { + completed_snarls.insert(snarl_out); + } + } + + const Snarl* snarl_in = snarl_manager->into_which_snarl(interval_subpath[i].first, + interval_subpath[i].second); + + if (snarl_in) { + entered_snarls.insert(snarl_in); + } + } + + // traverse the subgraph in each snarl and add it to the annotation + for (const Snarl* snarl : completed_snarls) { + // orient the snarl to match the orientation of the annotation + handle_t oriented_start, oriented_end; + if (return_val.count(pair(snarl->start().node_id(), + snarl->start().backward()))) { + + oriented_start = graph->get_handle(snarl->start().node_id(), + snarl->start().backward()); + + oriented_end = 
graph->get_handle(snarl->end().node_id(), + snarl->end().backward()); + + } + else { + + oriented_start = graph->get_handle(snarl->end().node_id(), + !snarl->end().backward()); + + oriented_end = graph->get_handle(snarl->start().node_id(), + !snarl->start().backward()); + } + + // mark all the exits and the entry point as untraversable + unordered_set stacked{oriented_start, oriented_end, graph->flip(oriented_start)}; + vector stack{oriented_start}; + + // make an index for jumping over the inside of child snarls + unordered_map child_snarl_skips; + for (const Snarl* child : snarl_manager->children_of(snarl)) { + handle_t start = graph->get_handle(child->start().node_id(), + child->start().backward()); + handle_t end = graph->get_handle(child->end().node_id(), + child->end().backward()); + + child_snarl_skips[start] = end; + child_snarl_skips[graph->flip(end)] = graph->flip(start); + } + + // traverse the subgraph and add it to the return value + while (!stack.empty()) { + handle_t handle = stack.back(); + stack.pop_back(); + + pair trav = make_pair(graph->get_id(handle), + graph->get_is_reverse(handle)); + + if (!return_val.count(trav)) { + return_val[trav] = pair(0, graph->get_length(handle)); + } + + if (child_snarl_skips.count(handle)) { + // skip over the internals of the child snarl we're pointing into + handle_t next = child_snarl_skips[handle]; + if (!stacked.count(next)) { + stack.push_back(next); + stacked.insert(next); + } + } + else { + // traverse edges + graph->follow_edges(handle, false, [&](const handle_t& next) { + if (!stacked.count(next)) { + stack.push_back(next); + stacked.insert(next); + } + }); + } + } + } + + if (gff_record.strand_is_rev) { + // we did all our queries with respect to the forward strand, flip it back to the reverse + map, pair> reversed_map; + for (const auto& record : return_val) { + uint64_t node_length = graph->get_length(graph->get_handle(record.first.first)); + reversed_map[make_pair(record.first.first, !record.first.second)] = make_pair(node_length - record.second.second, + node_length - record.second.first); + } + return_val = move(reversed_map); + } + + return return_val; + } +} diff --git a/src/region_expander.hpp b/src/region_expander.hpp new file mode 100644 index 00000000000..54c2e8a1962 --- /dev/null +++ b/src/region_expander.hpp @@ -0,0 +1,27 @@ +#ifndef VG_REGION_EXPANDER_HPP_INCLUDED +#define VG_REGION_EXPANDER_HPP_INCLUDED + +#include "handle.hpp" +#include "snarls.hpp" +#include "gff_reader.hpp" + +namespace vg { + + class RegionExpander { + + public: + RegionExpander(const PathPositionHandleGraph* graph, const SnarlManager* snarl_manager); + ~RegionExpander() = default; + + map, pair> expanded_subgraph(const GFFRecord& gff_record); + + private: + + const PathPositionHandleGraph* graph = nullptr; + const SnarlManager* snarl_manager = nullptr; + + }; + +} + +#endif diff --git a/src/reverse_graph.cpp b/src/reverse_graph.cpp new file mode 100644 index 00000000000..6eac9cf5db4 --- /dev/null +++ b/src/reverse_graph.cpp @@ -0,0 +1,82 @@ +/** + * \file reverse_graph.cpp: contains the implementation of ReverseGraph + */ + + +#include "reverse_graph.hpp" + + +namespace vg { + +using namespace std; + + ReverseGraph::ReverseGraph(const HandleGraph* forward_graph, bool complement) : + forward_graph(forward_graph), complement(complement) { + // nothing to do + } + + bool ReverseGraph::has_node(id_t node_id) const { + return forward_graph->has_node(node_id); + } + + handle_t ReverseGraph::get_handle(const id_t& node_id, bool is_reverse) const { + 
return forward_graph->get_handle(node_id, is_reverse); + } + + id_t ReverseGraph::get_id(const handle_t& handle) const { + return forward_graph->get_id(handle); + } + + bool ReverseGraph::get_is_reverse(const handle_t& handle) const { + return forward_graph->get_is_reverse(handle); + } + + handle_t ReverseGraph::flip(const handle_t& handle) const { + return forward_graph->flip(handle); + } + + size_t ReverseGraph::get_length(const handle_t& handle) const { + return forward_graph->get_length(handle); + } + + string ReverseGraph::get_sequence(const handle_t& handle) const { + // reverse, possibly complement, the sequence + string sequence = forward_graph->get_sequence(handle); + if (complement) { + reverse_complement_in_place(sequence); + } + else { + reverse(sequence.begin(), sequence.end()); + } + return sequence; + } + + bool ReverseGraph::follow_edges_impl(const handle_t& handle, bool go_left, + const function& iteratee) const { + // the left and right side have been switched, so reverse the direction + return forward_graph->follow_edges(handle, !go_left, iteratee); + } + + bool ReverseGraph::for_each_handle_impl(const function& iteratee, + bool parallel) const { + // since the handles are the same we can just execute this + return forward_graph->for_each_handle(iteratee, parallel); + } + + size_t ReverseGraph::get_node_count() const { + return forward_graph->get_node_count(); + } + + id_t ReverseGraph::min_node_id() const { + return forward_graph->min_node_id(); + } + + id_t ReverseGraph::max_node_id() const { + return forward_graph->max_node_id(); + } + + handle_t ReverseGraph::get_underlying_handle(const handle_t& handle) const { + return flip(handle); + } +} + diff --git a/src/reverse_graph.hpp b/src/reverse_graph.hpp new file mode 100644 index 00000000000..3767426388a --- /dev/null +++ b/src/reverse_graph.hpp @@ -0,0 +1,111 @@ +#ifndef VG_REVERSE_GRAPH_HPP_INCLUDED +#define VG_REVERSE_GRAPH_HPP_INCLUDED + +/** \file + * reverse_graph.hpp: defines a handle graph implementation that reverses the sequences + * of some other graph + */ + +#include "handle.hpp" +#include "utility.hpp" + +namespace vg { + +using namespace std; + + /** + * A HandleGraph implementation that wraps some other handle graph and reverses + * and optionally complements the sequences. + */ + class ReverseGraph : public ExpandingOverlayGraph { + public: + + /// Initialize as the reverse version of another graph, optionally also + /// complementing + ReverseGraph(const HandleGraph* forward_graph, bool complement); + + /// Default constructor -- not actually functional + ReverseGraph() = default; + + /// Default destructor + ~ReverseGraph() = default; + + ////////////////////////// + /// HandleGraph interface + ////////////////////////// + + // Method to check if a node exists by ID + virtual bool has_node(id_t node_id) const; + + /// Look up the handle for the node with the given ID in the given orientation + virtual handle_t get_handle(const id_t& node_id, bool is_reverse = false) const; + + /// Get the ID from a handle + virtual id_t get_id(const handle_t& handle) const; + + /// Get the orientation of a handle + virtual bool get_is_reverse(const handle_t& handle) const; + + /// Invert the orientation of a handle (potentially without getting its ID) + virtual handle_t flip(const handle_t& handle) const; + + /// Get the length of a node + virtual size_t get_length(const handle_t& handle) const; + + /// Get the sequence of a node, presented in the handle's local forward + /// orientation. 
+ virtual string get_sequence(const handle_t& handle) const; + + /// Loop over all the handles to next/previous (right/left) nodes. Passes + /// them to a callback which returns false to stop iterating and true to + /// continue. Returns true if we finished and false if we stopped early. + virtual bool follow_edges_impl(const handle_t& handle, bool go_left, const function& iteratee) const; + + /// Loop over all the nodes in the graph in their local forward + /// orientations, in their internal stored order. Stop if the iteratee + /// returns false. Can be told to run in parallel, in which case stopping + /// after a false return value is on a best-effort basis and iteration + /// order is not defined. + virtual bool for_each_handle_impl(const function& iteratee, bool parallel = false) const; + + /// Return the number of nodes in the graph + virtual size_t get_node_count() const; + + /// Return the smallest ID in the graph, or some smaller number if the + /// smallest ID is unavailable. Return value is unspecified if the graph is empty. + virtual id_t min_node_id() const; + + /// Return the largest ID in the graph, or some larger number if the + /// largest ID is unavailable. Return value is unspecified if the graph is empty. + virtual id_t max_node_id() const; + + //////////////////////////////////////////////////////////////////////////// + /// (Future) Overlay Interface + //////////////////////////////////////////////////////////////////////////// + + /// Convert a backing graph handle to our handle to the same node + inline handle_t from_backing(const handle_t& backing_handle) const { + return backing_handle; + } + + protected: + + /////////////////////////////////// + /// ExpandingOverlayGraph interface + /////////////////////////////////// + + /** + * Returns the handle in the underlying graph that corresponds to a handle in the + * overlay + */ + virtual handle_t get_underlying_handle(const handle_t& handle) const; + + /// The forward version of the graph we're making backwards + const HandleGraph* forward_graph = nullptr; + + /// Complement the sequences? + bool complement = false; + }; +} + +#endif diff --git a/src/sampler.cpp b/src/sampler.cpp index 69a9f60c865..5936d2fa533 100644 --- a/src/sampler.cpp +++ b/src/sampler.cpp @@ -1,59 +1,137 @@ #include "sampler.hpp" +#include "path.hpp" +#include "utility.hpp" +#include "position.hpp" +#include "alignment.hpp" +#include "algorithms/path_string.hpp" +#include "algorithms/subgraph.hpp" +#include "algorithms/alignment_path_offsets.hpp" +#include "algorithms/next_pos_chars.hpp" + +#include + //#define debug_ngs_sim +using namespace vg::io; + namespace vg { -/// Make a path sampling distribution based on relative lengths -void Sampler::set_source_paths(const vector& source_paths) { - this->source_paths = source_paths; - if (!source_paths.empty()) { - vector path_lengths; - for (auto& source_path : source_paths) { - path_lengths.push_back(xgidx->path_length(source_path)); - } - path_sampler = vg::discrete_distribution<>(path_lengths.begin(), path_lengths.end()); +void AbstractReadSampler::annotate_with_path_positions(Alignment& aln) { + // We need to annotate the alignment with the right kind of path positions + // that we are configured for. 
+ if (multi_position_annotations) { + algorithms::annotate_with_node_path_positions(graph, aln, 0, annotation_path_filter.get()); } else { + algorithms::annotate_with_initial_path_positions(graph, aln, 0, annotation_path_filter.get()); + } +} + +void Sampler::set_source_paths(const vector& source_paths, + const vector& source_path_ploidies, + const vector>& transcript_expressions, + const vector>& haplotype_transcripts) { + if (!source_paths.empty() && !transcript_expressions.empty()) { + cerr << "error:[Sampler] cannot sample from list of paths and from list of transcripts simultaneously" << endl; + exit(1); + } + else if (!haplotype_transcripts.empty() && transcript_expressions.empty()) { + cerr << "error:[Sampler] cannot sample from haplotype transcripts without an expression profile" << endl; + exit(1); + } + if (!source_path_ploidies.empty() && source_path_ploidies.size() != source_paths.size()) { + cerr << "error:[Sampler] cannot sample from list of paths with the wrong number of ploidy weights (" + << source_path_ploidies.size() << " vs. " << source_paths.size() << ")" << endl; + exit(1); + } + else if (!transcript_expressions.empty()) { + this->source_paths.clear(); + vector expression_values; + if (haplotype_transcripts.empty()) { + for (const pair& transcript_expression : transcript_expressions) { + this->source_paths.push_back(transcript_expression.first); + size_t tx_len = graph.get_path_length(graph.get_path_handle(transcript_expression.first)); + expression_values.push_back(transcript_expression.second * tx_len); + } + } + else { + unordered_map> haplotypes_of_transcript; + for (size_t i = 0; i < haplotype_transcripts.size(); ++i) { + haplotypes_of_transcript[get<1>(haplotype_transcripts[i])].push_back(i); + } + for (const pair& transcript_expression : transcript_expressions) { + size_t total_haplotypes = 0; + for (size_t i : haplotypes_of_transcript[transcript_expression.first]) { + total_haplotypes += get<2>(haplotype_transcripts[i]); + } + for (size_t i : haplotypes_of_transcript[transcript_expression.first]) { + double haplotype_expression = (transcript_expression.second * get<2>(haplotype_transcripts[i])) / total_haplotypes; + size_t hp_tx_len = graph.get_path_length(graph.get_path_handle(get<0>(haplotype_transcripts[i]))); + expression_values.push_back(haplotype_expression * hp_tx_len); + this->source_paths.push_back(get<0>(haplotype_transcripts[i])); + } + } + } + path_sampler = vg::discrete_distribution<>(expression_values.begin(), expression_values.end()); + } + else if (!source_paths.empty()) { + this->source_paths = source_paths; + vector path_weights; + path_weights.reserve(source_paths.size()); + for (size_t i = 0; i < source_paths.size(); i++) { + // For each source path + auto& source_path = source_paths[i]; + // Grab an applicable ploidy weight, or assume 1 + double ploidy = i >= source_path_ploidies.size() ? 1.0 : source_path_ploidies[i]; + + // Add each path, weighted by ploidy and length, to the distribution for sampling paths + path_weights.push_back(ploidy * graph.get_path_length(graph.get_path_handle(source_path))); + } + path_sampler = vg::discrete_distribution<>(path_weights.begin(), path_weights.end()); + } + else { path_sampler = vg::discrete_distribution<>(); } } + /// We have a helper function to convert path positions and orientations to /// pos_t values. 
-pos_t position_at(xg::XG* xgidx, const string& path_name, const size_t& path_offset, bool is_reverse) { - Mapping path_mapping = xgidx->mapping_at_path_position(path_name, path_offset); - id_t id = xgidx->node_at_path_position(path_name, path_offset); +pos_t position_at(PathPositionHandleGraph* graph_ptr, const string& path_name, const size_t& path_offset, bool is_reverse) { + path_handle_t path_handle = graph_ptr->get_path_handle(path_name); + step_handle_t step = graph_ptr->get_step_at_position(path_handle, path_offset); + handle_t handle = graph_ptr->get_handle_of_step(step); // Work out where in that mapping we should be. - size_t node_offset = path_offset - (xgidx->node_start_at_path_position(path_name, path_offset)); + size_t node_offset = path_offset - graph_ptr->get_position_of_step(step); if (is_reverse) { // Flip the node offset around to be from the end and not the start - node_offset = xgidx->node_length(id) - node_offset - 1; + node_offset = graph_ptr->get_length(handle) - node_offset - 1; } // Make a pos_t for where we are, on the appropriate strand - pos_t pos = make_pos_t(path_mapping.position().node_id(), path_mapping.position().is_reverse() != is_reverse, node_offset); + pos_t pos = make_pos_t(graph_ptr->get_id(handle), graph_ptr->get_is_reverse(handle) != is_reverse, node_offset); return pos; } pos_t Sampler::position(void) { // We sample from the entire graph sequence, 1-based. - vg::uniform_int_distribution xdist(1, xgidx->seq_length); + vg::uniform_int_distribution xdist(1, total_seq_length); size_t offset = xdist(rng); - id_t id = xgidx->node_at_seq_pos(offset); + id_t id = dynamic_cast(&graph)->node_at_vector_offset(offset); vg::uniform_int_distribution flip(0, 1); bool rev = forward_only ? false : flip(rng); // 1-0 base conversion - size_t node_offset = offset - xgidx->node_start(id) - 1; + size_t node_offset = offset - dynamic_cast(&graph)->node_vector_offset(id) - 1; // Ignore flipping the node offset because we're uniform over both strands return make_pos_t(id, rev, node_offset); } string Sampler::sequence(size_t length) { pos_t pos = position(); - cerr << pos << endl; + string seq; while (seq.size() < length) { auto nextc = next_pos_chars(pos); @@ -280,23 +358,12 @@ Alignment Sampler::mutate(const Alignment& aln, mutaln.set_sequence(alignment_seq(mutaln)); mutaln.set_name(aln.name()); mutaln.clear_refpos(); - xg_annotate_with_initial_path_positions(mutaln, true, false, xgidx); + annotate_with_path_positions(mutaln); return mutaln; } string Sampler::alignment_seq(const Alignment& aln) { - // get the graph corresponding to the alignment path - Graph sub; - for (int i = 0; i < aln.path().mapping_size(); ++ i) { - auto& m = aln.path().mapping(i); - if (m.has_position() && m.position().node_id()) { - auto id = aln.path().mapping(i).position().node_id(); - xgidx->get_id_range(id, id, sub); - } - } - xgidx->expand_context(sub, 2, false); - VG g; g.extend(sub); - return g.path_string(aln.path()); + return algorithms::path_string(graph, aln.path()); } vector Sampler::alignment_pair(size_t read_length, size_t fragment_length, double fragment_std_dev, double base_error, double indel_error) { @@ -329,6 +396,9 @@ vector Sampler::alignment_pair(size_t read_length, size_t fragment_le (function) ([&](int64_t id) { return (int64_t)node_length(id); })); + // And annotate with true positions + annotate_with_path_positions(aln1); + annotate_with_path_positions(aln2); return fragments; } @@ -345,7 +415,9 @@ Alignment Sampler::alignment(size_t length) { Alignment 
Sampler::alignment_to_path(const string& source_path, size_t length) { // Pick a starting point along the path and an orientation - vg::uniform_int_distribution xdist(0, xgidx->path_length(source_path) - 1); + path_handle_t path_handle = graph.get_path_handle(source_path); + uint64_t path_length = graph.get_path_length(path_handle); + vg::uniform_int_distribution xdist(0, path_length - 1); size_t path_offset = xdist(rng); vg::uniform_int_distribution flip(0, 1); bool rev = forward_only ? false : flip(rng); @@ -359,7 +431,7 @@ Alignment Sampler::alignment_to_path(const string& source_path, size_t length) { while (seq.size() < length) { // Make a pos_t for where we are, on the appropriate strand - pos_t pos = position_at(xgidx, source_path, path_offset, rev); + pos_t pos = position_at(&graph, source_path, path_offset, rev); // Add that character to the sequence seq.push_back(pos_char(pos)); @@ -379,7 +451,7 @@ Alignment Sampler::alignment_to_path(const string& source_path, size_t length) { } path_offset--; } else { - if (path_offset == xgidx->path_length(source_path) - 1) { + if (path_offset == path_length - 1) { // Out of path! break; } @@ -405,7 +477,7 @@ Alignment Sampler::alignment_to_path(const string& source_path, size_t length) { // And set its identity aln.set_identity(identity(aln.path())); aln.clear_refpos(); - xg_annotate_with_initial_path_positions(aln, true, false, xgidx); + annotate_with_path_positions(aln); return aln; } @@ -457,19 +529,18 @@ Alignment Sampler::alignment_to_graph(size_t length) { } // And set its identity aln.set_identity(identity(aln.path())); - xg_annotate_with_initial_path_positions(aln, true, false, xgidx); + annotate_with_path_positions(aln); return aln; } Alignment Sampler::alignment_with_error(size_t length, double base_error, double indel_error) { - size_t maxiter = 100; Alignment aln; size_t iter = 0; if (base_error > 0 || indel_error > 0) { // sample a longer-than necessary alignment, then trim - while (iter++ < maxiter) { + while (iter++ < max_tries) { aln = mutate( alignment(length + 2 * ((double) length * indel_error)), base_error, indel_error); @@ -484,7 +555,7 @@ Alignment Sampler::alignment_with_error(size_t length, } } else { size_t iter = 0; - while (iter++ < maxiter) { + while (iter++ < max_tries) { aln = alignment(length); if (aln.sequence().size() == length && !(no_Ns && aln.sequence().find('N') != string::npos)) { @@ -492,28 +563,28 @@ Alignment Sampler::alignment_with_error(size_t length, } } } - if (iter == maxiter) { - cerr << "[vg::Sampler] Warning: could not generate alignment of sufficient length. " - << "Graph may be too small, or indel rate too high." << endl; + if (iter == max_tries) { + cerr << "[vg::Sampler] Warning: could not generate alignment of sufficient length in " + << max_tries << " tries. Graph may be too small, or indel rate too high." 
<< endl; } aln.set_identity(identity(aln.path())); // Check the alignment to make sure we didn't mess it up assert(is_valid(aln)); - xg_annotate_with_initial_path_positions(aln, true, false, xgidx); + annotate_with_path_positions(aln); return aln; } size_t Sampler::node_length(id_t id) { - return xg_cached_node_length(id, xgidx, node_cache); + return graph.get_length(graph.get_handle(id)); } char Sampler::pos_char(pos_t pos) { - return xg_cached_pos_char(pos, xgidx, node_cache); + return graph.get_base(graph.get_handle(id(pos), is_rev(pos)), offset(pos)); } map Sampler::next_pos_chars(pos_t pos) { - return xg_cached_next_pos_chars(pos, xgidx, node_cache, edge_cache); + return algorithms::next_pos_chars(graph, pos); } bool Sampler::is_valid(const Alignment& aln) { @@ -529,7 +600,7 @@ bool Sampler::is_valid(const Alignment& aln) { auto accounted_bases = observed_from + mapping.position().offset(); // How many bases need to be accounted for? - auto expected_bases = xgidx->node_length(mapping.position().node_id()); + auto expected_bases = graph.get_length(graph.get_handle(mapping.position().node_id())); if (accounted_bases != expected_bases) { cerr << "[vg::Sampler] Warning: alignment mapping " << i << " accounts for " @@ -548,46 +619,58 @@ bool Sampler::is_valid(const Alignment& aln) { const string NGSSimulator::alphabet = "ACGT"; -NGSSimulator::NGSSimulator(xg::XG& xg_index, +NGSSimulator::NGSSimulator(PathPositionHandleGraph& graph, const string& ngs_fastq_file, + const string& ngs_paired_fastq_file, bool interleaved_fastq, - const vector& source_paths, + const vector& source_paths_input, + const vector& source_path_ploidies, + const vector>& transcript_expressions, + const vector>& haplotype_transcripts, double substition_polymorphism_rate, double indel_polymorphism_rate, double indel_error_proportion, - double insert_length_mean, - double insert_length_stdev, + double fragment_length_mean, + double fragment_length_stdev, double error_multiplier, bool retry_on_Ns, - size_t seed) : - xg_index(xg_index) - , node_cache(100) - , edge_cache(100) + bool sample_unsheared_paths, + uint64_t manual_seed) : + AbstractReadSampler(graph) , sub_poly_rate(substition_polymorphism_rate) , indel_poly_rate(indel_polymorphism_rate) , indel_error_prop(indel_error_proportion) - , insert_mean(insert_length_mean) - , insert_sd(insert_length_stdev) + , fragment_mean(fragment_length_mean) + , fragment_sd(fragment_length_stdev) , retry_on_Ns(retry_on_Ns) - , prng(seed ? seed : random_device()()) , strand_sampler(0, 1) , background_sampler(0, alphabet.size() - 1) , mut_sampler(0, alphabet.size() - 2) , prob_sampler(0.0, 1.0) - , insert_sampler(insert_length_mean, insert_length_stdev) - , seed(seed) - , source_paths(source_paths) - , joint_initial_distr(seed - 1) + , seed(manual_seed) + , source_paths(source_paths_input) + , joint_initial_distr(manual_seed ? 
1760681024122689423ull * manual_seed + 1107607255714504485ull : random_device()()) + , sample_unsheared_paths(sample_unsheared_paths) { - if (source_paths.empty()) { - start_pos_samplers.emplace_back(1, xg_index.seq_length); - } else { - vector path_sizes; - for (const auto& source_path : source_paths) { - path_sizes.push_back(xg_index.path_length(source_path)); - start_pos_samplers.emplace_back(0, path_sizes.back() - 1); - } - path_sampler = vg::discrete_distribution<>(path_sizes.begin(), path_sizes.end()); + if (!ngs_paired_fastq_file.empty() && interleaved_fastq) { + cerr << "error:[NGSSimulator] cannot indicate interleaved FASTQ and paired FASTQs simultaneously" << endl; + exit(1); + } + + if (!source_paths.empty() && !transcript_expressions.empty()) { + cerr << "error:[NGSSimulator] cannot simultaneously limit sampling to paths and match an expression profile" << endl; + exit(1); + } + + if (!source_path_ploidies.empty() && source_path_ploidies.size() != source_paths.size()) { + cerr << "error:[NGSSimulator] cannot sample from list of paths with the wrong number of ploidy weights (" + << source_path_ploidies.size() << " vs. " << source_paths.size() << ")" << endl; + exit(1); + } + + if (!haplotype_transcripts.empty() && transcript_expressions.empty()) { + cerr << "error:[NGSSimulator] cannot sample from haplotype transcripts without an expression profile" << endl; + exit(1); } if (substition_polymorphism_rate < 0.0 || substition_polymorphism_rate > 1.0 @@ -602,25 +685,141 @@ NGSSimulator::NGSSimulator(xg::XG& xg_index, exit(1); } - if (insert_length_mean <= 0.0) { - cerr << "error:[NGSSimulator] Mean insert length must be positive" << endl; + if (fragment_length_mean <= 0.0) { + cerr << "error:[NGSSimulator] Mean fragment length must be positive" << endl; exit(1); } - if (insert_length_stdev < 0.0) { - cerr << "error:[NGSSimulator] Insert length standard deviation must be positive" << endl; + if (fragment_length_stdev < 0.0) { + cerr << "error:[NGSSimulator] Fragment length standard deviation must be positive" << endl; exit(1); } - if (insert_length_mean < 5.0 * insert_length_stdev) { - cerr << "warning:[NGSSimulator] Recommended that insert length mean (" << insert_length_mean - << ") > 5 * insert length standard deviation (" << insert_length_stdev << ")" << endl; + + + if (source_paths_input.empty() && transcript_expressions.empty()) { + // we are sampling from all positions + graph.for_each_handle([&](const handle_t& handle) { + total_seq_length += graph.get_length(handle); + }); + start_pos_samplers.emplace_back(1, total_seq_length); + } + else if (!source_paths_input.empty()) { + // we are sampling from a given set of source paths + // TODO: Deduplicate with Sampler's code that does almost exactly this. 
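+        // Note on the weighting below: when a fragment length distribution is in use,
+        // each path gets weight ploidy * (path length - mean of the fragment length
+        // distribution truncated to [1, path length]), i.e. roughly the number of
+        // usable fragment start positions; with sample_unsheared_paths, all non-empty
+        // paths are weighted uniformly instead.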
+ vector path_weights; + path_weights.reserve(source_paths.size()); + for (size_t i = 0; i < source_paths.size(); i++) { + // For each source path + auto& source_path = source_paths[i]; + + size_t length = graph.get_path_length(graph.get_path_handle(source_path)); + + // Always use accurate length for sampling start pos, even with sample_unsheared_paths + start_pos_samplers.emplace_back(0, length - 1); + + if (length == 0) { + path_weights.push_back(0.0); + } + else if (sample_unsheared_paths) { + // sample uniformly between paths + path_weights.push_back(1.0); + } + else { + // Sample paths proportional to effective length and ploidy + double eff_path_len; + if (fragment_mean != numeric_limits::max()) { + double trunc_mean = vg::truncated_normal_distribution<>(fragment_mean, fragment_sd, 1.0, length).mean(); + eff_path_len = length - trunc_mean; + } + else { + eff_path_len = length; + } + + // Grab an applicable ploidy weight, or assume 1 if not set or if using sample_unsheared_paths + double ploidy = i >= source_path_ploidies.size() ? 1.0 : source_path_ploidies[i]; + + // Add each path, weighted by ploidy and length, to the distribution for sampling paths + path_weights.push_back(ploidy * eff_path_len); + } + } + path_sampler = vg::discrete_distribution<>(path_weights.begin(), path_weights.end()); + } + else { + // we are sampling according to an expression profile + vector expression_values; + + if (haplotype_transcripts.empty()) { + // no transcript name file provided, path names should match transcript names in the + // expression file + for (const pair& transcript_expression : transcript_expressions) { + size_t tx_len = graph.get_path_length(graph.get_path_handle(transcript_expression.first)); + if (tx_len == 0) { + continue; + } + source_paths.push_back(transcript_expression.first); + start_pos_samplers.emplace_back(0, tx_len - 1); + if (sample_unsheared_paths) { + expression_values.push_back(transcript_expression.second); + } + else { + double eff_tx_len; + if (fragment_mean != numeric_limits::max()) { + double trunc_mean = vg::truncated_normal_distribution<>(fragment_mean, fragment_sd, 1.0, tx_len).mean(); + eff_tx_len = tx_len - trunc_mean; + } + else { + eff_tx_len = tx_len; + } + expression_values.push_back(transcript_expression.second * eff_tx_len); + } + } + } + else { + // map the transcript names to the haplotype transcript names + unordered_map> haplotypes_of_transcript; + for (size_t i = 0; i < haplotype_transcripts.size(); ++i) { + haplotypes_of_transcript[get<1>(haplotype_transcripts[i])].push_back(i); + } + for (const pair& transcript_expression : transcript_expressions) { + // split the expression up among the haplotype transcripts according to their count + size_t total_haplotypes = 0; + for (size_t i : haplotypes_of_transcript[transcript_expression.first]) { + total_haplotypes += get<2>(haplotype_transcripts[i]); + } + for (size_t i : haplotypes_of_transcript[transcript_expression.first]) { + size_t hp_tx_len = graph.get_path_length(graph.get_path_handle(get<0>(haplotype_transcripts[i]))); + if (hp_tx_len == 0) { + continue; + } + source_paths.push_back(get<0>(haplotype_transcripts[i])); + start_pos_samplers.emplace_back(0, hp_tx_len - 1); + double haplotype_expression = (transcript_expression.second * get<2>(haplotype_transcripts[i])) / total_haplotypes; + if (sample_unsheared_paths) { + expression_values.push_back(haplotype_expression); + } + else { + double eff_hp_tx_len; + if (fragment_mean != numeric_limits::max()) { + double trunc_mean = 
vg::truncated_normal_distribution<>(fragment_mean, fragment_sd, 1.0, hp_tx_len).mean(); + eff_hp_tx_len = hp_tx_len - trunc_mean; + } + else { + eff_hp_tx_len = hp_tx_len; + } + expression_values.push_back(haplotype_expression * eff_hp_tx_len); + } + } + } + } + + path_sampler = vg::discrete_distribution<>(expression_values.begin(), expression_values.end()); } // memoize phred conversions phred_prob.resize(256); for (int i = 1; i < phred_prob.size(); i++) { - phred_prob[i] = error_multiplier * phred_to_prob(i); + phred_prob[i] = error_multiplier * phred_to_prob((uint8_t)i); } for (size_t i = 0; i < alphabet.size(); i++) { @@ -633,6 +832,7 @@ NGSSimulator::NGSSimulator(xg::XG& xg_index, } } + // record read lengths and the empirical distribution of base qualities unordered_map length_count; if (interleaved_fastq) { fastq_paired_interleaved_for_each(ngs_fastq_file, [&](const Alignment& aln_1, const Alignment& aln_2) { @@ -641,6 +841,13 @@ NGSSimulator::NGSSimulator(xg::XG& xg_index, record_read_pair_quality(aln_1, aln_2); }); } + else if (!ngs_paired_fastq_file.empty()) { + fastq_paired_two_files_for_each(ngs_fastq_file, ngs_paired_fastq_file, [&](const Alignment& aln_1, const Alignment& aln_2) { + length_count[aln_1.quality().size()]++; + length_count[aln_2.quality().size()]++; + record_read_pair_quality(aln_1, aln_2); + }); + } else { fastq_unpaired_for_each(ngs_fastq_file, [&](const Alignment& aln) { length_count[aln.quality().size()]++; @@ -648,6 +855,7 @@ NGSSimulator::NGSSimulator(xg::XG& xg_index, }); } + // auto-detect the read length size_t modal_length = 0; size_t modal_length_count = 0; size_t total_reads = 0; @@ -659,18 +867,20 @@ NGSSimulator::NGSSimulator(xg::XG& xg_index, total_reads += length_record.second; } - if (((double) modal_length_count) / total_reads < 0.5) { + if (((double) modal_length_count) / total_reads < 0.5 && !sample_unsheared_paths) { cerr << "warning:[NGSSimulator] Auto-detected read length of " << modal_length << " encompasses less than half of training reads, NGSSimulator is optimized for training data in which most reads are the same length" << endl; } - if (modal_length > insert_length_mean - 2.0 * insert_length_stdev) { - cerr << "warning:[NGSSimulator] Auto-detected read length of " << modal_length << " is long compared to mean insert length " << insert_length_mean << " and standard deviation " << insert_length_stdev << ", sampling may take additional time and statistical properties of insert length distribution may not reflect input parameters" << endl; + if (modal_length > fragment_length_mean - 2.0 * fragment_length_stdev && !sample_unsheared_paths) { + cerr << "warning:[NGSSimulator] Auto-detected read length of " << modal_length << " is long compared to mean fragment length " << fragment_length_mean << " and standard deviation " << fragment_length_stdev << ". Sampling may take additional time and the statistical properties of the fragment length distribution may not reflect input parameters." 
<< endl; } - while (transition_distrs_1.size() > modal_length) { + // shorten the quality string samplers until they are the modal length (this determines read length later) + // if we're sampling unsheared paths, take the whole read + while (transition_distrs_1.size() > modal_length && !sample_unsheared_paths) { transition_distrs_1.pop_back(); } - while (transition_distrs_2.size() > modal_length) { + while (transition_distrs_2.size() > modal_length && !sample_unsheared_paths) { transition_distrs_2.pop_back(); } @@ -681,15 +891,58 @@ NGSSimulator::NGSSimulator(xg::XG& xg_index, finalize(); + + uint64_t prng_seed = seed ? seed : random_device()(); + // engine with coding-time random coefficient to produce good seeds for each thread + // from one seed + linear_congruential_engine seed_perturbor(prng_seed); + // make a prng for each thread + for (int i = 0, n = get_thread_count(); i < n; ++i) { + prngs.emplace_back(seed_perturbor()); + } + #ifdef debug_ngs_sim cerr << "finished initializing simulator" << endl; #endif } +void NGSSimulator::connect_to_position_file(const string& filename) { + if (source_paths.empty()) { + cerr << "warning:[NGSSimulator] path position file will not be created because not simulating from paths" << endl; + return; + } + position_file.open(filename); + if (!position_file) { + cerr << "error:[NGSSimulator] failed to open position file: " << filename << endl; + exit(1); + } + position_file << "read\tpath\toffset\treverse" << endl; +} + +mt19937_64& NGSSimulator::prng() { + return prngs[omp_get_thread_num()]; +} + +void NGSSimulator::register_sampled_position(const Alignment& aln, const string& path_name, + size_t offset, bool is_reverse) { + if (position_file.is_open()) { + // we're recording positions + if (is_reverse) { + // get the position of the end instead of the start + offset -= path_from_length(aln.path()); + } + string line = aln.name() + '\t' + path_name + '\t' + to_string(offset) + '\t' + to_string(is_reverse) + '\n'; +#pragma omp critical + position_file << line; + } +} + Alignment NGSSimulator::sample_read() { - Alignment aln; + + aln.set_name(get_read_name()); + // sample a quality string based on the trained distribution pair> qual_and_masks = sample_read_quality(); @@ -706,19 +959,31 @@ Alignment NGSSimulator::sample_read() { aln.set_quality(qual_and_masks.first); + // We won't try indefinitely to find a place between Ns + size_t failures_due_to_n_bases = 0; + + // Sample our path (if dealing with source_paths) + size_t source_path_idx = sample_path(); + string source_path; + if (source_path_idx != numeric_limits::max()) { + source_path = source_paths[source_path_idx]; + } + + // This is our offset along the source path, if in use + int64_t sampled_offset; + // And our direction to go along the source path, if in use + bool sampled_is_reverse; + // attempt samples until we get one that succeeds without walking // off the end of the graph while (!aln.has_path()) { - // This is our offset along the source path, if in use - size_t offset; - // And our direction to go along the source path, if in use - bool is_reverse; - // And our position in the graph, which we use whether there's a source path or not. 
+ // Populate the sample positions pos_t pos; - // And our path (if dealing with source_paths) - string source_path; - // Populate them - sample_start_pos(offset, is_reverse, pos, source_path); + sample_start_pos(source_path_idx, (int64_t) qual_and_masks.first.size(), + sampled_offset, sampled_is_reverse, pos); + // copy the values so that we can change them without forgetting the start location + int64_t offset = sampled_offset; + bool is_reverse = sampled_is_reverse; // align the first end at this position on the source path or graph sample_read_internal(aln, offset, is_reverse, pos, source_path); @@ -727,6 +992,18 @@ Alignment NGSSimulator::sample_read() { if (aln.sequence().find('N') != string::npos) { aln.clear_path(); aln.clear_sequence(); + failures_due_to_n_bases++; + + if (failures_due_to_n_bases >= max_tries) { + // We have hit Ns too many times in a row and need to bail out or give up. + stringstream ss; + ss << "Failed to sample a " << std::to_string(aln.quality().size()) << " bp sequence without Ns"; + if (source_path_idx != numeric_limits::max()) { + ss << " from path " << source_path; + } + ss << " for our maximum of " << max_tries << " tries. Is there such a sequence available?"; + throw std::runtime_error(ss.str()); + } } } } @@ -734,13 +1011,19 @@ Alignment NGSSimulator::sample_read() { // mask out any of the sequence that we sampled to be an 'N' apply_N_mask(*aln.mutable_sequence(), qual_and_masks.second); - aln.set_name(get_read_name()); - xg_annotate_with_initial_path_positions(aln, true, false, &xg_index); + annotate_with_path_positions(aln); + + register_sampled_position(aln, source_path, sampled_offset + sampled_is_reverse, sampled_is_reverse); return aln; } pair NGSSimulator::sample_read_pair() { pair aln_pair; + + string name = get_read_name(); + aln_pair.first.set_name(name + "_1"); + aln_pair.second.set_name(name + "_2"); + pair>, pair>> qual_and_mask_pair = sample_read_quality_pair(); #ifdef debug_ngs_sim @@ -761,37 +1044,75 @@ pair NGSSimulator::sample_read_pair() { assert(qual_and_mask_pair.first.first.size() == qual_and_mask_pair.first.second.size()); assert(qual_and_mask_pair.second.first.size() == qual_and_mask_pair.second.second.size()); + // Sample our path (if dealing with source_paths) + size_t source_path_idx = sample_path(); + string source_path; + vg::truncated_normal_distribution<> fragment_sampler; + if (source_path_idx != numeric_limits::max()) { + source_path = source_paths[source_path_idx]; +#ifdef debug_ngs_sim + cerr << "sampling from path " << source_path << " with length " << graph.get_path_length(graph.get_path_handle(source_path)) << endl; +#endif + int64_t path_length = graph.get_path_length(graph.get_path_handle(source_path)); + fragment_sampler = vg::truncated_normal_distribution<>(fragment_mean, fragment_sd, 1.0, path_length); + } + else { + fragment_sampler = vg::truncated_normal_distribution<>(fragment_mean, fragment_sd, 1.0); + } + int64_t fragment_length = round(fragment_sampler(prng())); + + + // This is our offset along the source path, if in use + int64_t sampled_offset; + // And our direction to go along the source path, if in use + bool sampled_is_reverse; + + int64_t walked_offset; + +#ifdef debug_ngs_sim + cerr << "sampled fragment length " << fragment_length << endl; +#endif + + if (fragment_length < transition_distrs_1.size()) { + // the fragment is shorter than the sequencing length + qual_and_mask_pair.first.first.resize(fragment_length); + qual_and_mask_pair.first.second.resize(fragment_length); + 
qual_and_mask_pair.second.first.resize(fragment_length); + qual_and_mask_pair.second.second.resize(fragment_length); +#ifdef debug_ngs_sim + cerr << "truncating reads to fragment length" << endl; +#endif + } + aln_pair.first.set_quality(qual_and_mask_pair.first.first); aln_pair.second.set_quality(qual_and_mask_pair.second.first); + // reverse the quality string so that it acts like it's reading from the opposite end // when we walk forward from the beginning of the first read std::reverse(aln_pair.second.mutable_quality()->begin(), aln_pair.second.mutable_quality()->end()); - while (!aln_pair.first.has_path() || !aln_pair.second.has_path()) { - int64_t insert_length = (int64_t) round(insert_sampler(prng)); - if (insert_length < (int64_t) transition_distrs_1.size()) { - // don't make reads where the insert length is shorter than one end of the read - continue; - } - - // This is our offset along the source path, if in use - size_t offset; - // And our direction to go along the source path, if in use - bool is_reverse; - // And our position in the graph, which we use whether there's a source path or not. + + // Populate the sample positions pos_t pos; - // And our path (if dealing with source_paths) - string source_path; - // Populate them - sample_start_pos(offset, is_reverse, pos, source_path); + sample_start_pos(source_path_idx, fragment_length, sampled_offset, sampled_is_reverse, pos); + // copy them so we can modify without losing the start pos info + walked_offset = sampled_offset; + bool is_reverse = sampled_is_reverse; +#ifdef debug_ngs_sim + cerr << "sampled read 1 start pos " << pos << ", is reverse " << is_reverse << ", offset " << walked_offset << endl; +#endif + // align the first end at this position on the source path or graph - sample_read_internal(aln_pair.first, offset, is_reverse, pos, source_path); + sample_read_internal(aln_pair.first, walked_offset, is_reverse, pos, source_path); if (retry_on_Ns) { if (aln_pair.first.sequence().find('N') != string::npos) { +#ifdef debug_ngs_sim + cerr << "rejecting sample because of an N" << endl; +#endif aln_pair.first.clear_path(); aln_pair.first.clear_sequence(); } @@ -801,25 +1122,59 @@ pair NGSSimulator::sample_read_pair() { continue; } - // walk out the unsequenced part of the insert in the graph - int64_t remaining_length = insert_length - 2 * transition_distrs_1.size(); +#ifdef debug_ngs_sim + cerr << "after first read, walked offset is " << walked_offset << endl; +#endif + + // walk out the unsequenced part of the fragment in the graph + int64_t remaining_length = fragment_length - (aln_pair.first.quality().size() + + aln_pair.second.quality().size()); + +#ifdef debug_ngs_sim + cerr << "walking " << remaining_length << " to start of next read in pair" << endl; +#endif if (remaining_length >= 0) { // we need to move forward from the end of the first read - if (advance_by_distance(offset, is_reverse, pos, remaining_length, source_path)) { + if (advance_by_distance(walked_offset, is_reverse, pos, remaining_length, source_path)) { +#ifdef debug_ngs_sim + cerr << "rejecting sample because insert is off of path" << endl; +#endif // we hit the end of the graph trying to walk continue; } - } else { + } + else { // we need to walk backwards from the end of the first read - pos = walk_backwards(aln_pair.first.path(), -remaining_length); - // Make sure to update the offset along the path as well. If offset - // and is_reverse aren't being used (becasue we aren't in path - // mode), the result won't be used either. - offset += is_reverse ? 
-remaining_length : remaining_length; + if (walk_backwards(walked_offset, is_reverse, pos, -remaining_length, source_path, aln_pair.first.path())) { +#ifdef debug_ngs_sim + cerr << "rejecting because backwards walk is off path/graph" << endl; +#endif + continue; + } + +#ifdef debug_ngs_sim + cerr << "walked backwards, walk length " << -remaining_length << ", is rev " << is_reverse << ", walked offset " << walked_offset << endl; +#endif } - +#ifdef debug_ngs_sim + cerr << "after moving for the fragment length walked offset is " << walked_offset << endl; +#endif + // guard against running off the end of nodes + // XXX this should not be happening + // it seems to occur in some graphs due to the behavior of advance_by_distance + if (vg::offset(pos) >= graph.get_length(graph.get_handle(id(pos)))) { +#ifdef debug_ngs_sim + cerr << "rejecting sample because of invalid walked location" << endl; +#endif + continue; + } + // align the second end starting at the walked position - sample_read_internal(aln_pair.second, offset, is_reverse, pos, source_path); + sample_read_internal(aln_pair.second, walked_offset, is_reverse, pos, source_path); + +#ifdef debug_ngs_sim + cerr << "after second read, walked offset is " << walked_offset << endl; +#endif if (retry_on_Ns) { if (aln_pair.second.sequence().find('N') != string::npos) { @@ -830,39 +1185,63 @@ pair NGSSimulator::sample_read_pair() { } // unreverse the second read in the pair - aln_pair.second = reverse_complement_alignment(aln_pair.second, [&](id_t node_id) { - return xg_index.node_length(node_id); + reverse_complement_alignment_in_place(&aln_pair.second, [&](id_t node_id) { + return graph.get_length(graph.get_handle(node_id)); }); + for (auto aln : {&aln_pair.first, &aln_pair.second}) { + for (const auto& mapping : aln->path().mapping()) { + if (mapping.position().offset() < 0) { + cerr << "error: invalid alignment!" 
<< endl; + cerr << pb2json(*aln) << endl; + exit(1); + } + } + } + // mask out any of the sequence that we sampled to be an 'N' apply_N_mask(*aln_pair.first.mutable_sequence(), qual_and_mask_pair.first.second); apply_N_mask(*aln_pair.second.mutable_sequence(), qual_and_mask_pair.second.second); + + annotate_with_path_positions(aln_pair.first); + annotate_with_path_positions(aln_pair.second); + + // take back the final base that we sampled + register_sampled_position(aln_pair.first, source_path, sampled_offset + sampled_is_reverse, sampled_is_reverse); + register_sampled_position(aln_pair.second, source_path, walked_offset + sampled_is_reverse, !sampled_is_reverse); - string name = get_read_name(); - aln_pair.first.set_name(name + "_1"); - aln_pair.second.set_name(name + "_2"); - xg_annotate_with_initial_path_positions(aln_pair.first, true, false, &xg_index); - xg_annotate_with_initial_path_positions(aln_pair.second, true, false, &xg_index); return aln_pair; } -void NGSSimulator::sample_read_internal(Alignment& aln, size_t& offset, bool& is_reverse, pos_t& curr_pos, +void NGSSimulator::sample_read_internal(Alignment& aln, int64_t& offset, bool& is_reverse, pos_t& curr_pos, const string& source_path) { - + + // we will accept a read that cannot be extended to the full read length if we're simulating from + // a path that's too small or if we are sampling unsheared paths + bool accept_partial = sample_unsheared_paths; + if (!accept_partial && !source_path.empty()) { + accept_partial = graph.get_path_length(graph.get_path_handle(source_path)) < transition_distrs_1.size(); + } + // Make sure we are starting inside the node - auto first_node_length = xg_cached_node_length(id(curr_pos), &xg_index, node_cache); + // XXX this is broken + auto first_node_length = graph.get_length(graph.get_handle(id(curr_pos))); + if (vg::offset(curr_pos) >= first_node_length) { + cerr << "something wrong " << vg::offset(curr_pos) << " " << first_node_length << endl; + cerr << vg::id(curr_pos) << ":" << vg::is_rev(curr_pos) << ":" << vg::offset(curr_pos) << endl; + } assert(vg::offset(curr_pos) < first_node_length); aln.clear_path(); aln.clear_sequence(); - char graph_char = xg_cached_pos_char(curr_pos, &xg_index, node_cache); + char graph_char = graph.get_base(graph.get_handle(id(curr_pos), is_rev(curr_pos)), vg::offset(curr_pos)); bool hit_end = false; // walk a path and generate a read sequence at the same time while (aln.sequence().size() < aln.quality().size() && !hit_end) { // sample insertion in the true graph path - while (aln.sequence().size() < aln.quality().size() && prob_sampler(prng) < indel_poly_rate * 0.5) { + while (aln.sequence().size() < aln.quality().size() && prob_sampler(prng()) < indel_poly_rate * 0.5) { // TODO: no allowance for indel errors on inserted sequence #ifdef debug_ngs_sim @@ -872,15 +1251,18 @@ void NGSSimulator::sample_read_internal(Alignment& aln, size_t& offset, bool& is apply_insertion(aln, curr_pos); } if (aln.sequence().size() >= aln.quality().size() || hit_end) { +#ifdef debug_ngs_sim + cerr << "break 1: ending sample with seq len " << aln.sequence().size() << ", qual len " << aln.quality().size() << ", hit end? 
" << hit_end << endl; +#endif break; } // sample errors - double err_sample = prob_sampler(prng); + double err_sample = prob_sampler(prng()); double err_prob = phred_prob[aln.quality()[aln.sequence().size()]]; while (err_sample < err_prob * indel_error_prop && !hit_end) { // indel errors - if (prob_sampler(prng) < 0.5) { + if (prob_sampler(prng()) < 0.5) { #ifdef debug_ngs_sim cerr << "insertion error at read idx " << aln.sequence().size() << ", graph pos " << curr_pos << endl; #endif @@ -888,6 +1270,9 @@ void NGSSimulator::sample_read_internal(Alignment& aln, size_t& offset, bool& is apply_insertion(aln, curr_pos); if (aln.sequence().size() >= aln.quality().size() || hit_end) { +#ifdef debug_ngs_sim + cerr << "break 2: ending sample with seq len " << aln.sequence().size() << ", qual len " << aln.quality().size() << ", hit end? " << hit_end << endl; +#endif break; } } @@ -900,17 +1285,20 @@ void NGSSimulator::sample_read_internal(Alignment& aln, size_t& offset, bool& is hit_end = advance(offset, is_reverse, curr_pos, graph_char, source_path); } - err_sample = prob_sampler(prng); + err_sample = prob_sampler(prng()); err_prob = phred_prob[aln.quality()[aln.sequence().size()]]; } if (aln.sequence().size() >= aln.quality().size() || hit_end) { +#ifdef debug_ngs_sim + cerr << "break 3: ending sample with seq len " << aln.sequence().size() << ", qual len " << aln.quality().size() << ", hit end? " << hit_end << endl; +#endif break; } // get the true graph char, possibly with a substitution polymorphism char poly_graph_char = graph_char; - if (prob_sampler(prng) < sub_poly_rate) { - poly_graph_char = mutation_alphabets[poly_graph_char != 'N' ? poly_graph_char : alphabet[background_sampler(prng)]][mut_sampler(prng)]; + if (prob_sampler(prng()) < sub_poly_rate) { + poly_graph_char = mutation_alphabets[poly_graph_char != 'N' ? poly_graph_char : alphabet[background_sampler(prng())]][mut_sampler(prng())]; } // by default the read matches the true graph char @@ -919,7 +1307,7 @@ void NGSSimulator::sample_read_internal(Alignment& aln, size_t& offset, bool& is // sample substitution errors with the remaining err sample if (err_sample < err_prob) { // substitution error - read_char = mutation_alphabets[read_char != 'N' ? read_char : alphabet[background_sampler(prng)]][mut_sampler(prng)]; + read_char = mutation_alphabets[read_char != 'N' ? read_char : alphabet[background_sampler(prng())]][mut_sampler(prng())]; } #ifdef debug_ngs_sim @@ -931,11 +1319,14 @@ void NGSSimulator::sample_read_internal(Alignment& aln, size_t& offset, bool& is hit_end = advance(offset, is_reverse, curr_pos, graph_char, source_path); if (aln.sequence().size() >= aln.quality().size() || hit_end) { +#ifdef debug_ngs_sim + cerr << "break 4: ending sample with seq len " << aln.sequence().size() << ", qual len " << aln.quality().size() << ", hit end? 
" << hit_end << endl; +#endif break; } // sample deletions in the true graph path - while (prob_sampler(prng) < indel_poly_rate * 0.5 && !hit_end) { + while (prob_sampler(prng()) < indel_poly_rate * 0.5 && !hit_end) { #ifdef debug_ngs_sim cerr << "deletion polymorphism at read idx " << aln.sequence().size() << ", graph pos " << curr_pos << endl; #endif @@ -948,8 +1339,14 @@ void NGSSimulator::sample_read_internal(Alignment& aln, size_t& offset, bool& is // remove the sequence and path if we hit the end the graph before finishing // the alignment if (aln.sequence().size() != aln.quality().size()) { - aln.clear_path(); - aln.clear_sequence(); + if (accept_partial) { + // we simulated the whole path, so we don't use the final quality values + aln.mutable_quality()->resize(aln.sequence().size()); + } + else { + aln.clear_path(); + aln.clear_sequence(); + } } #ifdef debug_ngs_sim @@ -957,7 +1354,7 @@ void NGSSimulator::sample_read_internal(Alignment& aln, size_t& offset, bool& is #endif } -bool NGSSimulator::advance(size_t& offset, bool& is_reverse, pos_t& pos, char& graph_char, const string& source_path) { +bool NGSSimulator::advance(int64_t& offset, bool& is_reverse, pos_t& pos, char& graph_char, const string& source_path) { if (source_path.empty()) { return advance_on_graph(pos, graph_char); } else { @@ -968,16 +1365,13 @@ bool NGSSimulator::advance(size_t& offset, bool& is_reverse, pos_t& pos, char& g bool NGSSimulator::advance_on_graph(pos_t& pos, char& graph_char) { // choose a next position at random - map next_pos_chars = xg_cached_next_pos_chars(pos, - &xg_index, - node_cache, - edge_cache); + map next_pos_chars = algorithms::next_pos_chars(graph, pos); if (next_pos_chars.empty()) { return true; } vg::uniform_int_distribution pos_distr(0, next_pos_chars.size() - 1); - size_t next = pos_distr(prng); + size_t next = pos_distr(prng()); auto iter = next_pos_chars.begin(); for (size_t i = 0; i != next; i++) { iter++; @@ -988,34 +1382,34 @@ bool NGSSimulator::advance_on_graph(pos_t& pos, char& graph_char) { return false; } -bool NGSSimulator::advance_on_path(size_t& offset, bool& is_reverse, pos_t& pos, char& graph_char, const string& source_path) { - +bool NGSSimulator::advance_on_path(int64_t& offset, bool& is_reverse, pos_t& pos, char& graph_char, const string& source_path) { + int64_t path_length = graph.get_path_length(graph.get_path_handle(source_path)); if (is_reverse) { // Go left on the path - if (offset == 0) { + offset--; + if (offset < 0) { // We hit the end return true; } - offset--; } else { // Go right on the path - if (offset == xg_index.path_length(source_path) - 1) { + offset++; + if (offset == path_length) { // We hit the end return true; } - offset++; } // Set position according to position on path - pos = position_at(&xg_index, source_path, offset, is_reverse); + pos = position_at(&graph, source_path, offset, is_reverse); // And look up the character - graph_char = xg_cached_pos_char(pos, &xg_index, node_cache); + graph_char = graph.get_base(graph.get_handle(id(pos), is_rev(pos)), vg::offset(pos)); return false; } -bool NGSSimulator::advance_by_distance(size_t& offset, bool& is_reverse, pos_t& pos, size_t distance, +bool NGSSimulator::advance_by_distance(int64_t& offset, bool& is_reverse, pos_t& pos, int64_t distance, const string& source_path) { if (source_path.empty()) { return advance_on_graph_by_distance(pos, distance); @@ -1025,136 +1419,113 @@ bool NGSSimulator::advance_by_distance(size_t& offset, bool& is_reverse, pos_t& } -bool 
NGSSimulator::advance_on_graph_by_distance(pos_t& pos, size_t distance) { +bool NGSSimulator::advance_on_graph_by_distance(pos_t& pos, int64_t distance) { int64_t remaining = distance; - int64_t node_length = xg_index.node_length(id(pos)) - offset(pos); + handle_t handle = graph.get_handle(id(pos), is_rev(pos)); + int64_t node_length = graph.get_length(handle) - offset(pos); while (remaining >= node_length) { remaining -= node_length; - vector edges = is_rev(pos) ? xg_index.edges_on_start(id(pos)) : xg_index.edges_on_end(id(pos)); - if (edges.empty()) { + vector nexts; + graph.follow_edges(handle, false, [&](const handle_t& next) { + nexts.push_back(next); + }); + if (nexts.empty()) { return true; } - size_t choice = vg::uniform_int_distribution(0, edges.size() - 1)(prng); - Edge& edge = edges[choice]; - if (id(pos) == edge.from() && is_rev(pos) == edge.from_start()) { - get_id(pos) = edge.to(); - get_is_rev(pos) = edge.to_end(); - } - else { - get_id(pos) = edge.from(); - get_is_rev(pos) = !edge.from_start(); - } - get_offset(pos) = 0; - node_length = xg_index.node_length(id(pos)); + size_t choice = vg::uniform_int_distribution(0, nexts.size() - 1)(prng()); + handle = nexts[choice]; + node_length = graph.get_length(handle); } - + + get_id(pos) = graph.get_id(handle); + get_is_rev(pos) = graph.get_is_reverse(handle); get_offset(pos) += remaining; return false; } -bool NGSSimulator::advance_on_path_by_distance(size_t& offset, bool& is_reverse, pos_t& pos, size_t distance, +bool NGSSimulator::advance_on_path_by_distance(int64_t& offset, bool& is_reverse, pos_t& pos, int64_t distance, const string& source_path) { + int64_t path_length = graph.get_path_length(graph.get_path_handle(source_path)); if (is_reverse) { // Go left on the path - if (offset < distance) { - // We hit the end - return true; - } offset -= distance; } else { // Go right on the path - if (offset + distance >= xg_index.path_length(source_path)) { - // We hit the end - return true; - } offset += distance; } + if (offset < 0 || offset >= path_length) { +#ifdef debug_ngs_sim + cerr << "walked offset of " << offset << " after advancing " << distance << ", rev ? " << is_reverse << " is outside of path of length " << path_length << endl; +#endif + // We hit the end + return true; + } // Set position according to position on path - pos = position_at(&xg_index, source_path, offset, is_reverse); + pos = position_at(&graph, source_path, offset, is_reverse); return false; } -pos_t NGSSimulator::walk_backwards(const Path& path, size_t distance) { - // Starting at the past-the-end of the path, walk back to the given nonzero distance. - // Walking back the whole path length puts you at the start of the path. 
- - if (distance > path_to_length(path)) { - throw runtime_error("Cannot walk back " + to_string(distance) + " on path of length " + to_string(path_to_length(path))); +bool NGSSimulator::walk_backwards_along_alignment(const Path& path, int64_t distance, pos_t& pos) { + + // convert to a distance forward, which is easier to implement + int64_t remaining_to_walk = max(path_to_length(path) - distance, 0); + + for (size_t i = 0; i < path.mapping_size(); ++i) { + const auto& mapping = path.mapping(i); + int64_t walked_from_length = 0; + for (size_t j = 0; j < mapping.edit_size(); ++j) { + const auto& edit = mapping.edit(j); + if (edit.to_length() < remaining_to_walk) { + // we can continue to walk through this edit + remaining_to_walk -= edit.to_length(); + walked_from_length += edit.from_length(); + } + else { + // this edit overlaps the place we want to walk to + if (edit.to_length() == edit.from_length()) { + // this is a match/substitition, so we can walk the remaining distance + walked_from_length += remaining_to_walk; + remaining_to_walk = 0; + } + + nid_t node_id = mapping.position().node_id(); + bool rev = mapping.position().is_reverse(); + size_t offset = mapping.position().offset() + walked_from_length; + if (offset == graph.get_length(graph.get_handle(node_id))) { + // we're actually past-the-last on this node, which is not what we want + if (i + 1 < path.mapping_size()) { + // we can bump the position over to the next node + const auto& next_mapping = path.mapping(i + 1); + node_id = next_mapping.position().node_id(); + rev = next_mapping.position().is_reverse(); + offset = 0; + } + else { + // there's not really a "correct" option here, so we just adjust + // the offset by 1 and hope this doesn't lead to too much distortion + --offset; + } + } + pos = make_pos_t(node_id, rev, offset); + return false; + } + } } - assert(distance > 0); + return true; +} - // walk backwards until we find the mapping it's on - int64_t remaining = distance; - int64_t mapping_idx = path.mapping_size() - 1; - int64_t mapping_length = mapping_to_length(path.mapping(mapping_idx)); - while (remaining > mapping_length) { - remaining -= mapping_length; - mapping_idx--; - mapping_length = mapping_to_length(path.mapping(mapping_idx)); - } - // Now we know the position we want is inside this mapping. - const Mapping& mapping = path.mapping(mapping_idx); - const Position& mapping_pos = mapping.position(); - // walk forward from the beginning of the mapping, edit by edit, until we've passed where it is on the read - int64_t remaining_flipped = mapping_length - remaining; - int64_t edit_idx = 0; - int64_t prefix_from_length = 0; - int64_t prefix_to_length = 0; - while (prefix_to_length <= remaining_flipped) { - prefix_from_length += mapping.edit(edit_idx).from_length(); - prefix_to_length += mapping.edit(edit_idx).to_length(); - edit_idx++; - } - // Go back one edit to the edit that covers the position we want to be at - --edit_idx; - // use the mapping's position and the distance we traveled on the graph to get the offset of - // the beginning of this edit - int64_t offset = mapping_pos.offset() + prefix_from_length - mapping.edit(edit_idx).from_length(); - if (mapping.edit(edit_idx).from_length() == mapping.edit(edit_idx).to_length()) { - // if the edit is a match/mismatch, we can walk part of it - - // How many extra bases did this mapping have when we got past where we wanted to be? - auto extra_bases = prefix_to_length - remaining_flipped; - - // Take all the non-extra bases of the mapping. 
- offset += mapping.edit(edit_idx).to_length() - extra_bases; - } else { - // Otherwise it's an insert, so just land at the spot it is inserted at. - - // But we will have a problem if the insert is the last thing in its mapping - if (prefix_from_length == mapping_from_length(mapping)) { - // We are trying to put our starting position past the end of this - // mapping, because we are landing inside an insert that is the - // last thing in its mapping. - - // Note that this can happen at the end of the path, if the whole - // path ends in an insert. So we can't just go to the next - // position. - - // There's not really a quite correct thing to do here, but since - // we can't go right we have to go left. - assert(offset > 0); - offset--; - } +bool NGSSimulator::walk_backwards(int64_t& offset, bool& is_reverse, pos_t& pos, int64_t distance, + const string& source_path, const Path& path) { + if (source_path.empty()) { + return walk_backwards_along_alignment(path, distance, pos); } - - // Get the length of the node we landed on - auto node_length = xg_cached_node_length(mapping_pos.node_id(), &xg_index, node_cache); - // The position we pick should not be past the end of the node. - if (offset >= node_length) { - cerr << pb2json(path) << endl; - cerr << "Covering mapping: " << pb2json(mapping) << endl; - cerr << "Covering edit: " << pb2json(mapping.edit(edit_idx)) << endl; - throw runtime_error("Could not go back " + to_string(distance) + " in path of length " + - to_string(path_to_length(path)) + "; hit node " + to_string(mapping_pos.node_id()) + " length " + - to_string(node_length) + " end at offset " + to_string(offset)); + else { + return advance_on_path_by_distance(offset, is_reverse, pos, -distance, source_path); } - - return make_pos_t(mapping_pos.node_id(), mapping_pos.is_reverse(), offset); } void NGSSimulator::apply_aligned_base(Alignment& aln, const pos_t& pos, char graph_char, @@ -1267,7 +1638,7 @@ void NGSSimulator::apply_deletion(Alignment& aln, const pos_t& pos) { void NGSSimulator::apply_insertion(Alignment& aln, const pos_t& pos) { Path* path = aln.mutable_path(); - char insert_char = alphabet[background_sampler(prng)]; + char insert_char = alphabet[background_sampler(prng())]; aln.mutable_sequence()->push_back(insert_char); if (path->mapping_size() == 0) { @@ -1298,63 +1669,107 @@ void NGSSimulator::apply_insertion(Alignment& aln, const pos_t& pos) { } } -void NGSSimulator::sample_start_pos(size_t& offset, bool& is_reverse, pos_t& pos, string& source_path) { +size_t NGSSimulator::sample_path() { + if (source_paths.empty()) { + return numeric_limits::max(); + } + else { + size_t path_idx = path_sampler(prng()); + return path_idx; + } +} + +void NGSSimulator::sample_start_pos(const size_t& source_path_idx, const int64_t& fragment_length, + int64_t& offset, bool& is_reverse, pos_t& pos) { if (source_paths.empty()) { pos = sample_start_graph_pos(); offset = 0; is_reverse = false; - source_path = ""; - } else { - tie(offset, is_reverse, pos, source_path) = sample_start_path_pos(); + } + else { + tie(offset, is_reverse, pos) = sample_start_path_pos(source_path_idx, fragment_length); } } pos_t NGSSimulator::sample_start_graph_pos() { // The start pos sampler has been set up in graph space, 1-based assert(start_pos_samplers.size() == 1); - size_t idx = start_pos_samplers[0](prng); + size_t idx = start_pos_samplers[0](prng()); - id_t id = xg_index.node_at_seq_pos(idx); - bool rev = strand_sampler(prng); - size_t node_offset = idx - xg_index.node_start(id) - 1; + id_t id = 
dynamic_cast(graph).node_at_vector_offset(idx); + bool rev = strand_sampler(prng()); + size_t node_offset = idx - dynamic_cast(graph).node_vector_offset(id) - 1; return make_pos_t(id, rev, node_offset); } -tuple NGSSimulator::sample_start_path_pos() { - // choose a path - size_t source_path_idx = path_sampler(prng); - string source_path = source_paths[source_path_idx]; - // The start pos sampler hasd been set up in path space, 0-based - size_t offset = start_pos_samplers[source_path_idx](prng); - bool rev = strand_sampler(prng); - pos_t pos = position_at(&xg_index, source_path, offset, rev); +tuple NGSSimulator::sample_start_path_pos(const size_t& source_path_idx, + const int64_t& fragment_length) { - return make_tuple(offset, rev, pos, source_path); + int64_t path_length = graph.get_path_length(graph.get_path_handle(source_paths[source_path_idx])); + bool rev = strand_sampler(prng()); +#ifdef debug_ngs_sim + cerr << "sampling start position on path " << source_paths[source_path_idx] << ", strand " << rev << ", path length " << path_length << endl; +#endif + int64_t offset; + if (sample_unsheared_paths || path_length < transition_distrs_1.size() || + (fragment_length > 0 && fragment_length >= path_length)) { + if (rev) { + offset = path_length - 1; + } + else { + offset = 0; + } + } + else { + + // we'll not let it choose too unreasonable of a start position so that we can speed + // up the process of continuously resampling impractically close to the end of paths + bool feasible = false; + int64_t shortened_fragment_length = fragment_length - ceil(sqrt(fragment_length)); + do { + // The start pos sampler has been set up in path space, 0-based + offset = start_pos_samplers[source_path_idx](prng()); + if (rev) { + feasible = (offset - shortened_fragment_length >= -1); + } + else { + feasible = (offset + shortened_fragment_length <= path_length); + } + } while (!feasible); + } + pos_t pos = position_at(&graph, source_paths[source_path_idx], offset, rev); + + return make_tuple(offset, rev, pos); } string NGSSimulator::get_read_name() { stringstream sstrm; - sstrm << "seed_" << seed << "_fragment_" << sample_counter; - sample_counter++; + size_t num; +#pragma omp atomic capture + num = sample_counter++; + sstrm << "seed_" << seed << "_fragment_" << num; return sstrm.str(); } void NGSSimulator::record_read_quality(const Alignment& aln, bool read_2) { const string& quality = aln.quality(); const string& sequence = aln.sequence(); + assert(sequence.size() == quality.size()); auto& transition_distrs = read_2 ? transition_distrs_2 : transition_distrs_1; if (quality.empty()) { return; } while (transition_distrs.size() < quality.size()) { - transition_distrs.emplace_back(seed ? seed + transition_distrs.size() + 1 : random_device()()); + // coding-time random engine to perturb the seed for each position + linear_congruential_engine seed_perturbor(seed + transition_distrs.size() + read_2); + transition_distrs.emplace_back(seed ? 
seed_perturbor() : random_device()()); } // record the initial quality and N-mask transition_distrs[0].record_transition(pair(0, false), pair(quality[0], sequence[0] == 'N')); // record the subsequent quality and N-mask transitions - for (size_t i = 1; i < transition_distrs.size(); i++) { + for (size_t i = 1; i < quality.size(); i++) { transition_distrs[i].record_transition(pair(quality[i - 1], sequence[i - 1] == 'N'), pair(quality[i], sequence[i] == 'N')); } @@ -1419,74 +1834,11 @@ pair> NGSSimulator::sample_read_quality_internal(pair& n_mask) { - assert(sequence.size() == n_mask.size()); - for (size_t i = 0; i < n_mask.size(); i++) { + for (size_t i = 0; i < sequence.size(); i++) { if (n_mask[i]) { sequence[i] = 'N'; } } } - -template -NGSSimulator::MarkovDistribution::MarkovDistribution(size_t seed) : prng(seed) { - // nothing to do -} - -template -void NGSSimulator::MarkovDistribution::record_transition(From from, To to) { - if (!cond_distrs.count(from)) { - cond_distrs[from] = vector(value_at.size(), 0); - } - - if (!column_of.count(to)) { - column_of[to] = value_at.size(); - value_at.push_back(to); - for (pair>& cond_distr : cond_distrs) { - cond_distr.second.push_back(0); - } - } - - cond_distrs[from][column_of[to]]++; -} - -template -void NGSSimulator::MarkovDistribution::finalize() { - for (pair>& cond_distr : cond_distrs) { - for (size_t i = 1; i < cond_distr.second.size(); i++) { - cond_distr.second[i] += cond_distr.second[i - 1]; - } - - samplers[cond_distr.first] = vg::uniform_int_distribution(1, cond_distr.second.back()); - } -} - -template -To NGSSimulator::MarkovDistribution::sample_transition(From from) { - // return randomly if a transition has never been observed - if (!cond_distrs.count(from)) { - return value_at[vg::uniform_int_distribution(0, value_at.size() - 1)(prng)]; - } - - size_t sample_val = samplers[from](prng); - vector& cdf = cond_distrs[from]; - - if (sample_val <= cdf[0]) { - return value_at[0]; - } - - size_t low = 0; - size_t hi = cdf.size() - 1; - while (hi > low + 1) { - int64_t mid = (hi + low) / 2; - - if (sample_val <= cdf[mid]) { - hi = mid; - } - else { - low = mid; - } - } - return value_at[hi]; -} } diff --git a/src/sampler.hpp b/src/sampler.hpp index 56219c511f6..c36288f4de0 100644 --- a/src/sampler.hpp +++ b/src/sampler.hpp @@ -2,6 +2,7 @@ #define VG_SIMULATOR_HPP_INCLUDED #include +#include #include #include #include @@ -9,16 +10,10 @@ #include #include #include -#include "vg.hpp" -#include "xg.hpp" -#include "alignment.hpp" -#include "path.hpp" -#include "position.hpp" -#include "cached_position.hpp" -#include "xg_position.hpp" -#include "distributions.hpp" #include "lru_cache.h" -#include "json2pb.h" +#include "statistics.hpp" +#include "position.hpp" +#include "vg/io/json2pb.h" namespace vg { @@ -28,17 +23,66 @@ using namespace std; /// orientations, into pos_ts. Remember that pos_t counts offset from the start /// of the reoriented node, while here we count offset from the beginning of the /// forward version of the path. -pos_t position_at(xg::XG* xgidx, const string& path_name, const size_t& path_offset, bool is_reverse); +pos_t position_at(PathPositionHandleGraph* graph_ptr, const string& path_name, const size_t& path_offset, bool is_reverse); + +/** + * Interface for shared functionality for things that sample reads. + */ +class AbstractReadSampler { +public: + virtual ~AbstractReadSampler() = default; + + /// Make a new sampler using the given graph. 
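+    /// The graph must additionally be vectorizable; the constructor below checks
+    /// for this and throws std::logic_error if it is not.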
+ inline AbstractReadSampler(PathPositionHandleGraph& graph) : + graph(graph) { + // Graph must be vectorizable for our implementations to be able to + // sample positions. + if (!dynamic_cast(&graph)) { + throw std::logic_error("Graph is expected to be vectorizable!"); + } + } + + // TODO: Add a real sampling interface when one can be factored out! + + /////////// + // Control fields + /////////// + + /// If true, annotate alignments with multiple positions along reference + /// paths. If false, annotate them with minimum visited positions along + /// reference paths. + bool multi_position_annotations = false; + + /// What limit should we use for retry loops before giving up or failing? + size_t max_tries = 100; + + /// Set to a filter function that returns true if a given path in the graph + /// is allowed to be used as an annotation path. + std::unique_ptr> annotation_path_filter; + + // TODO: Move more common fields out here. Make Sampler store at least a + // default error rate, etc. for the common sampling interface. + +protected: + + /// The graph being simulated against. + PathPositionHandleGraph& graph; + + /// Annotate the given alignment with the appropriate type of path + /// positions. + void annotate_with_path_positions(Alignment& aln); +}; + + /** * Generate Alignments (with or without mutations, and in pairs or alone) from - * an XG index. + * an PathPositionHandleGraph index. */ -class Sampler { +class Sampler: public AbstractReadSampler { public: - xg::XG* xgidx; // We need this so we don't re-load the node for every character we visit in // it. LRUCache node_cache; @@ -53,26 +97,52 @@ class Sampler { // A vector which, if nonempty, gives the names of the paths to restrict simulated reads to. vector source_paths; vg::discrete_distribution<> path_sampler; // draw an index in source_paths - inline Sampler(xg::XG* x, + size_t total_seq_length = 0; + + /// Make a Sampler to sample from the given graph. + /// If sampling from particular paths, source_paths should contain their + /// names, and source_path_ploidies should either be empty or contain a + /// ploidy value for each source path. + inline Sampler(PathPositionHandleGraph* x, int seed = 0, bool forward_only = false, bool allow_Ns = false, - const vector& source_paths = {}) - : xgidx(x), + const vector& source_paths = {}, + const vector& source_path_ploidies = {}, + const vector>& transcript_expressions = {}, + const vector>& haplotype_transcripts = {}) + : AbstractReadSampler(*x), node_cache(100), edge_cache(100), forward_only(forward_only), no_Ns(!allow_Ns), nonce(0), source_paths(source_paths) { + // sum seq lengths + graph.for_each_handle([&](const handle_t& handle) { + total_seq_length += graph.get_length(handle); + }); if (!seed) { seed = time(NULL); } rng.seed(seed); - set_source_paths(source_paths); + set_source_paths(source_paths, source_path_ploidies, transcript_expressions, haplotype_transcripts); } + + // AbstractReadSampler interface + Alignment sample_read(); + pair sample_read_pair(); - void set_source_paths(const vector& source_paths); + /// Make a path sampling distribution based on relative lengths (weighted + /// by ploidy) or on transcript expressions. (At most one of source_paths and + /// expressions should be non-empty.) If providing a transcript expression + /// profile, can optionally provide a non-empty vector of haplotype + /// transcripts to translate between the embedded path names and the + /// transcript names in the expression profile. 
+ void set_source_paths(const vector& source_paths, + const vector& source_path_ploidies, + const vector>& transcript_expressions, + const vector>& haplotype_transcripts); pos_t position(void); string sequence(size_t length); @@ -95,6 +165,7 @@ class Sampler { double fragment_std_dev, double base_error, double indel_error); + size_t node_length(id_t id); char pos_char(pos_t pos); map next_pos_chars(pos_t pos); @@ -118,7 +189,7 @@ class Sampler { string alignment_seq(const Alignment& aln); - /// Return true if the alignment is semantically valid against the XG index + /// Return true if the alignment is semantically valid against the PathPositionHandleGraph index /// we wrap, and false otherwise. Checks from_lengths on mappings to make /// sure all node bases are accounted for. Won't accept alignments with /// internal jumps between graph locations or regions; all skipped bases @@ -131,24 +202,31 @@ class Sampler { * Class that simulates reads with alignments to a graph that mimic the error * profile of NGS sequencing data. */ -class NGSSimulator { +class NGSSimulator : public AbstractReadSampler { public: /// Initialize simulator. FASTQ file will be used to train an error distribution. /// Most reads in the FASTQ should be the same length. Polymorphism rates apply /// uniformly along a read, whereas errors are distributed as indicated by the learned /// distribution. The simulation can also be restricted to named paths in the graph. - NGSSimulator(xg::XG& xg_index, + /// Alternatively, it can match an expression profile. However, it cannot be simulateously + /// restricted to paths and to an expression profile. + NGSSimulator(PathPositionHandleGraph& graph, const string& ngs_fastq_file, + const string& ngs_paired_fastq_file = "", bool interleaved_fastq = false, const vector& source_paths = {}, + const vector& source_path_ploidies = {}, + const vector>& transcript_expressions = {}, + const vector>& haplotype_transcripts = {}, double substition_polymorphism_rate = 0.001, double indel_polymorphism_rate = 0.0002, double indel_error_proportion = 0.01, - double insert_length_mean = 1000.0, - double insert_length_stdev = 75.0, + double fragment_length_mean = 300.0, + double fragment_length_stdev = 50.0, double error_multiplier = 1.0, bool retry_on_Ns = true, - size_t seed = 0); + bool sample_unsheared_paths = false, + uint64_t seed = 0); /// Sample an individual read and alignment Alignment sample_read(); @@ -156,11 +234,14 @@ class NGSSimulator { /// Sample a pair of reads an alignments pair sample_read_pair(); + /// Open up a stream to output read positions to + void connect_to_position_file(const string& filename); + private: template class MarkovDistribution { public: - MarkovDistribution(size_t seed); + MarkovDistribution(uint64_t seed); /// record a transition from the input data void record_transition(From from, To to); @@ -171,7 +252,7 @@ class NGSSimulator { private: - default_random_engine prng; + mt19937_64 prng; unordered_map> samplers; unordered_map column_of; @@ -179,13 +260,15 @@ class NGSSimulator { unordered_map> cond_distrs; }; - + NGSSimulator(void) = delete; /// DNA alphabet static const string alphabet; /// Remainder of the alphabet after removing a given character unordered_map mutation_alphabets; + /// The total sequence length in our graph + size_t total_seq_length = 0; /// Add a quality string to the training data void record_read_quality(const Alignment& aln, bool read_2 = false); @@ -206,16 +289,27 @@ class NGSSimulator { /// the iteration and update of curr_pos) in 
path node. Otherwise, in whole /// graph mode, they are ignored and curr_pos is used to traverse the graph /// directly. - void sample_read_internal(Alignment& aln, size_t& offset, bool& is_reverse, pos_t& curr_pos, + void sample_read_internal(Alignment& aln, int64_t& offset, bool& is_reverse, pos_t& curr_pos, const string& source_path); + /// Return the index of a path if using source_paths or else numeric_limits::max() + size_t sample_path(); + + /// Ouput a sampled position to the path position file + void register_sampled_position(const Alignment& aln, const string& path_name, + size_t offset, bool is_reverse); + /// Sample an appropriate starting position according to the mode. Updates the arguments. - void sample_start_pos(size_t& offset, bool& is_reverse, pos_t& pos, string& source_path); + /// Providing a negative number for fragment length indicates no fragment length restrictions. + void sample_start_pos(const size_t& source_path_idx, const int64_t& fragment_length, + int64_t& offset, bool& is_reverse, pos_t& pos); /// Get a random position in the graph pos_t sample_start_graph_pos(); - /// Get a random position along the source path - tuple sample_start_path_pos(); + /// Get a random position along the source path. Enforce fragment length restrictions if argument + /// is positive. + tuple sample_start_path_pos(const size_t& source_path_idx, + const int64_t& fragment_length); /// Get an unclashing read name string get_read_name(); @@ -223,20 +317,20 @@ class NGSSimulator { /// Move forward one position in either the source path or the graph, /// depending on mode. Update the arguments. Return true if we can't because /// we hit a tip or false otherwise - bool advance(size_t& offset, bool& is_reverse, pos_t& pos, char& graph_char, const string& source_path); + bool advance(int64_t& offset, bool& is_reverse, pos_t& pos, char& graph_char, const string& source_path); /// Move forward a certain distance in either the source path or the graph, /// depending on mode. Update the arguments. 
Return true if we can't because /// we hit a tip or false otherwise - bool advance_by_distance(size_t& offset, bool& is_reverse, pos_t& pos, size_t distance, + bool advance_by_distance(int64_t& offset, bool& is_reverse, pos_t& pos, int64_t distance, const string& source_path); /// Move forward one position in the source path, return true if we can't /// because we hit a tip or false otherwise - bool advance_on_path(size_t& offset, bool& is_reverse, pos_t& pos, char& graph_char, + bool advance_on_path(int64_t& offset, bool& is_reverse, pos_t& pos, char& graph_char, const string& source_path); /// Move forward a certain distance in the source path, return true if we /// can't because we hit a tip or false otherwise - bool advance_on_path_by_distance(size_t& offset, bool& is_reverse, pos_t& pos, size_t distance, + bool advance_on_path_by_distance(int64_t& offset, bool& is_reverse, pos_t& pos, int64_t distance, const string& source_path); /// Move forward one position in the graph along a random path, return true if we can't @@ -244,13 +338,17 @@ class NGSSimulator { bool advance_on_graph(pos_t& pos, char& graph_char); /// Move forward a certain distance in the graph along a random path, return true if we /// can't because we hit a tip or false otherwise - bool advance_on_graph_by_distance(pos_t& pos, size_t distance); + bool advance_on_graph_by_distance(pos_t& pos, int64_t distance); /// Mask out bases with 'N's if the mask is true void apply_N_mask(string& sequence, const vector& n_mask); + + /// Walk backwards either along an alignment path or a source path, updates positions + bool walk_backwards(int64_t& offset, bool& is_reverse, pos_t& pos, int64_t distance, + const string& source_path, const Path& path); + /// Walk backwards along the alignment path + bool walk_backwards_along_alignment(const Path& path, int64_t distance, pos_t& pos); - /// Returns the position a given distance from the end of the path, walking backwards - pos_t walk_backwards(const Path& path, size_t distance); /// Add a deletion to the alignment void apply_deletion(Alignment& aln, const pos_t& pos); /// Add an insertion to the alignment @@ -258,7 +356,9 @@ class NGSSimulator { /// Add a match/mismatch to the alignment void apply_aligned_base(Alignment& aln, const pos_t& pos, char graph_char, char read_char); - /// Memo for Phred -> probability conversion + mt19937_64& prng(); + + /// Memo for pre-multiplied Phred -> probability conversion vector phred_prob; /// A Markov distribution for each read position indicating quality and whether the base is an 'N' @@ -268,39 +368,99 @@ class NGSSimulator { /// A distribution for the joint initial qualities of a read pair MarkovDistribution, pair, pair>> joint_initial_distr; - xg::XG& xg_index; - - LRUCache node_cache; - LRUCache > edge_cache; - - default_random_engine prng; + vector prngs; vg::discrete_distribution<> path_sampler; vector> start_pos_samplers; vg::uniform_int_distribution strand_sampler; vg::uniform_int_distribution background_sampler; vg::uniform_int_distribution mut_sampler; vg::uniform_real_distribution prob_sampler; - vg::normal_distribution insert_sampler; const double sub_poly_rate; const double indel_poly_rate; const double indel_error_prop; - const double insert_mean; - const double insert_sd; + const double fragment_mean; + const double fragment_sd; size_t sample_counter = 0; - size_t seed; + uint64_t seed; + /// Should we try again for a read without Ns of we get Ns? 
const bool retry_on_Ns; + const bool sample_unsheared_paths; /// Restrict reads to just these paths (path-only mode) if nonempty. vector source_paths; + + ofstream position_file; }; + + /** * A finite state Markov distribution that supports sampling */ +template +NGSSimulator::MarkovDistribution::MarkovDistribution(uint64_t seed) : prng(seed) { + // nothing to do +} +template +void NGSSimulator::MarkovDistribution::record_transition(From from, To to) { + if (!cond_distrs.count(from)) { + cond_distrs[from] = vector(value_at.size(), 0); + } + + if (!column_of.count(to)) { + column_of[to] = value_at.size(); + value_at.push_back(to); + for (pair>& cond_distr : cond_distrs) { + cond_distr.second.push_back(0); + } + } + + cond_distrs[from][column_of[to]]++; +} + +template +void NGSSimulator::MarkovDistribution::finalize() { + for (pair>& cond_distr : cond_distrs) { + for (size_t i = 1; i < cond_distr.second.size(); i++) { + cond_distr.second[i] += cond_distr.second[i - 1]; + } + + samplers[cond_distr.first] = vg::uniform_int_distribution(1, cond_distr.second.back()); + } +} + +template +To NGSSimulator::MarkovDistribution::sample_transition(From from) { + // return randomly if a transition has never been observed + if (!cond_distrs.count(from)) { + return value_at[vg::uniform_int_distribution(0, value_at.size() - 1)(prng)]; + } + + size_t sample_val = samplers[from](prng); + vector& cdf = cond_distrs[from]; + + if (sample_val <= cdf[0]) { + return value_at[0]; + } + + size_t low = 0; + size_t hi = cdf.size() - 1; + while (hi > low + 1) { + int64_t mid = (hi + low) / 2; + + if (sample_val <= cdf[mid]) { + hi = mid; + } + else { + low = mid; + } + } + return value_at[hi]; +} } diff --git a/src/scanner.cpp b/src/scanner.cpp new file mode 100644 index 00000000000..d78a5206da1 --- /dev/null +++ b/src/scanner.cpp @@ -0,0 +1,143 @@ +/** + * \file scanner.cpp + * Implementations for traversing Protpbuf object trees + */ + +#include "scanner.hpp" + +namespace vg { + +using namespace std; + +// Specializations have to be defined in dependency order to avoid complaints of specialization after instantiation. + +template<> +bool PositionIDScanner::scan(const Mapping& msg, const function& pos_iteratee, + const function& id_iteratee) { + + if (msg.position().node_id() != 0) { + // Just enumerate the position we have + return pos_iteratee(msg.position()); + } else { + // Skip this empty Position + return true; + } +} + +template<> +bool PositionIDScanner::scan(const Path& msg, const function& pos_iteratee, + const function& id_iteratee) { + + // If we don't see any real positions or IDs in this path, we have to emit a 0 node ID sentinel. 
+ bool path_is_empty = true; + + // We have to wrap the iteratees to do this + auto record_pos = [&](const Position& pos) -> bool { + path_is_empty = false; + return pos_iteratee(pos); + }; + auto record_id = [&](const id_t& id) -> bool { + path_is_empty = false; + return id_iteratee(id); + }; + + bool keep_going = true; + for (size_t i = 0; keep_going && i < msg.mapping_size(); i++) { + // Scan over all the mappings + keep_going &= PositionIDScanner::scan(msg.mapping(i), record_pos, record_id); + } + + if (keep_going && path_is_empty) { + // Visit the sentinel zero node ID + keep_going &= id_iteratee(0); + } + + return keep_going; +} + +template<> +bool PositionIDScanner::scan(const Alignment& msg, const function& pos_iteratee, + const function& id_iteratee) { + + // Visit the Path + return PositionIDScanner::scan(msg.path(), pos_iteratee, id_iteratee); +} + +template<> +bool PositionIDScanner::scan(const Node& msg, const function& pos_iteratee, + const function& id_iteratee) { + + if (msg.id() != 0) { + // Just announce the node's ID + return id_iteratee(msg.id()); + } + return true; + +} + +template<> +bool PositionIDScanner::scan(const Edge& msg, const function& pos_iteratee, + const function& id_iteratee) { + + bool keep_going = true; + + // Make sure to filter out zero node IDs + + if (msg.from() != 0) { + keep_going &= id_iteratee(msg.from()); + } + + if (keep_going && msg.to() != 0) { + keep_going &= id_iteratee(msg.to()); + } + + return keep_going; + +} + +template<> +bool PositionIDScanner::scan(const Graph& msg, const function& pos_iteratee, + const function& id_iteratee) { + + // If we don't see any real positions or IDs in this graph, we have to emit a 0 node ID sentinel. + bool graph_is_empty = true; + + // We have to wrap the iteratees to do this + auto record_pos = [&](const Position& pos) -> bool { + graph_is_empty = false; + return pos_iteratee(pos); + }; + auto record_id = [&](const id_t& id) -> bool { + graph_is_empty = false; + return id_iteratee(id); + }; + + // Note that it's OK if we catch a 0 node ID sentinel from a contained + // path. Then 0 has already been emitted and we don't need to emit it again + // for the graph as a whole, even if the graph has no nodes/edges. + + bool keep_going = true; + for (size_t i = 0; keep_going && i < msg.node_size(); i++) { + // Scan over all the nodes + keep_going &= PositionIDScanner::scan(msg.node(i), record_pos, record_id); + } + for (size_t i = 0; keep_going && i < msg.edge_size(); i++) { + // Scan over all the edges + keep_going &= PositionIDScanner::scan(msg.edge(i), record_pos, record_id); + } + for (size_t i = 0; keep_going && i < msg.path_size(); i++) { + // Scan over all the paths + keep_going &= PositionIDScanner::scan(msg.path(i), record_pos, record_id); + } + + if (keep_going && graph_is_empty) { + // Visit the sentinel zero node ID + keep_going &= id_iteratee(0); + } + + return keep_going; + +} + + +} diff --git a/src/scanner.hpp b/src/scanner.hpp new file mode 100644 index 00000000000..f2c43f51678 --- /dev/null +++ b/src/scanner.hpp @@ -0,0 +1,116 @@ +#ifndef VG_SCANNER_HPP_INCLUDED +#define VG_SCANNER_HPP_INCLUDED + +/** + * \file scanner.hpp + * Define some "scanners" that can traverse a tree of VG + * Protobuf objects and iterate over items found in the tree. + */ + +#include +#include "types.hpp" +#include + +namespace vg { + +using namespace std; + +/** + * We define a PositionIDScanner that scans a VG Protobuf message tree for whole Position objects and node IDs. 
+ * Each of the two is visited with its own iteratee function. + * May emit the same Position or id multiple times. + * Will never emit an empty Position. + * Will only emit the 0 node ID if a Graph or Path contains no nonzero node IDs. + */ +template +struct PositionIDScanner { + /// Scan over the Position objects and non-Position-wrapped node IDs in + /// this message and all its children. Returns false if an iteratee + /// returned false and asked to stop. + static bool scan(const Message& msg, const function& pos_iteratee, + const function& id_iteratee); +}; + +/** + * We define an IDScanner which scans over all node ID references in a tree of VG Protobuf objects. + * May emit the same ID multiple times. + * Will only emit the 0 node ID if a Graph or Path contains no nonzero node IDs. + */ +template +struct IDScanner { + /// Scan over the node IDs in this message and all its children. + /// Returns false if an iteratee returned false and asked to stop. + static bool scan(const Message& msg, const function& iteratee); +}; + +/** + * We define a PositionScanner which scans over all Position objects and node + * IDs, wrapped as Positions, in a tree of VG Protobuf objects. + * Will only emit the empty Position if a Graph or Path contains no nonzero node IDs. + */ +template +struct WrappingPositionScanner { + /// Scan over the node IDs in this message and all its children. + /// Returns false if an iteratee returned false and asked to stop. + static bool scan(const Message& msg, const function& iteratee); +}; + +///////////// +// Template Specializations +///////////// + +// Declare specializations of the above that we will implement + +template<> +bool PositionIDScanner::scan(const Mapping& msg, const function& pos_iteratee, + const function& id_iteratee); + +template<> +bool PositionIDScanner::scan(const Path& msg, const function& pos_iteratee, + const function& id_iteratee); + +template<> +bool PositionIDScanner::scan(const Alignment& msg, const function& pos_iteratee, + const function& id_iteratee); + +template<> +bool PositionIDScanner::scan(const Node& msg, const function& pos_iteratee, + const function& id_iteratee); + +template<> +bool PositionIDScanner::scan(const Edge& msg, const function& pos_iteratee, + const function& id_iteratee); + +template<> +bool PositionIDScanner::scan(const Graph& msg, const function& pos_iteratee, + const function& id_iteratee); + +///////////// +// Template Implementations +///////////// + + +// We implement everything in terms of the PositionIDScanner. 
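+// As a usage sketch (illustration only, not part of this header): collecting every
+// node ID touched by an Alignment could look something like
+//
+//     unordered_set<id_t> touched;
+//     IDScanner<Alignment>::scan(aln, [&](const id_t& node_id) {
+//         touched.insert(node_id);
+//         return true; // keep scanning
+//     });
+//
+// where returning false from the iteratee stops the scan early.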
+ +template +bool IDScanner::scan(const Message& msg, const function& iteratee) { + // Get the node ID form the position and iterate over that + return PositionIDScanner::scan(msg, [&](const Position& pos) { + return iteratee(pos.node_id()); + }, iteratee); +} + +template +bool WrappingPositionScanner::scan(const Message& msg, const function& iteratee) { + // Wrap the node ID and iterate over that + return PositionIDScanner::scan(msg, iteratee, [&](const id_t& id) { + Position wrapped; + wrapped.set_node_id(id); + return iteratee(wrapped); + }); +} + +} + + +#endif diff --git a/src/sequence_complexity.hpp b/src/sequence_complexity.hpp new file mode 100644 index 00000000000..0871a8d9c88 --- /dev/null +++ b/src/sequence_complexity.hpp @@ -0,0 +1,116 @@ +/** + * \file sequence_complexity.hpp + * + * Defines and implements an algorithm to identify low-complexity sequences + * + */ +#ifndef VG_SEQUENCE_COMPLEXITY_HPP_INCLUDED +#define VG_SEQUENCE_COMPLEXITY_HPP_INCLUDED + +#include +#include +#include + +namespace vg { + +using namespace std; + +//#define debug_seq_complexity + +/* + * Struct to compute the complexity of sequence at different orders + */ +template +struct SeqComplexity { + + SeqComplexity(string::const_iterator begin, string::const_iterator end); + SeqComplexity(const string& seq); + + // The approximate p-value of the n-th order correlation between + // nucleotides. Only valid for 1 <= order <= MaxOrder template param. + double p_value(int order) const; + + // The fraction of pairs that are repeats at this order. + double repetitiveness(int order) const; + +private: + + int len; + int matches[MaxOrder]; +}; + +/* + * Template implementations + */ + +template +SeqComplexity::SeqComplexity(const string& seq) : SeqComplexity(seq.begin(), seq.end()) { + +} + +template +SeqComplexity::SeqComplexity(string::const_iterator begin, string::const_iterator end) { + + len = end - begin; + + for (int i = 0; i < MaxOrder; ++i) { + matches[i] = 0; + } + + for (int i = 1; i < len; ++i) { + for (int j = max(0, i - MaxOrder); j < i; ++j) { + matches[i - j - 1] += (*(begin + j) == *(begin + i)); + } + } +#ifdef debug_seq_complexity + cerr << "match table for seq of length " << len << ":" << endl; + for (int i = 1; i < len; ++i) { + cerr << i << ": " << matches[i - 1] << endl; + } +#endif +} + +// TODO: have GC bias instead of uniform random? maybe not since repetition +// hurts alignment uncertainty even in biased sequencesc + +template +double SeqComplexity::p_value(int order) const { + if (order < len && order + 8 > len) { + // exact binomial CDF + // TODO: flip sum if is smaller? 
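+        // The loop below accumulates the lower tail of a Binomial(n, 1/4) distribution,
+        // where n = len - order is the number of compared pairs and matches[order - 1] is
+        // the observed number of matches. Starting from P(X = 0) = (3/4)^n, successive
+        // terms use the pmf ratio
+        //   P(X = i + 1) / P(X = i) = (1/3) * (n - i) / (i + 1),
+        // so the returned value 1 - accum is P(X >= observed matches), assuming each
+        // pair matches with probability 1/4 under a uniform-base null model.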
+ double x = 1.0; + double y = len - order; + double term = pow(0.75, y); + double accum = 0.0; + for (int i = 0, k = matches[order - 1]; i < k; ++i) { + accum += term; + // the ratio of successive terms in the binomial distr pmf + term *= 0.333333333333333333 * y / x; + x += 1.0; + y -= 1.0; + } + + return 1.0 - accum; + } + else if (order < len) { + // normal approximation to binomial + static const double root_pq = sqrt(0.25 * 0.75); + static const double root_1_2 = sqrt(0.5); + double z = (double(matches[order - 1]) - double(len - order) * 0.25) / (sqrt(len - order) * root_pq); + // the normal CDF + return 1.0 - 0.5 * erfc(-root_1_2 * z); + } + else { + return 1.0; + } +} + +template +double SeqComplexity::repetitiveness(int order) const { + return double(matches[order - 1]) / double(len - order); +} + + +} + +#endif diff --git a/src/simde b/src/simde new file mode 160000 index 00000000000..8cd136a43ba --- /dev/null +++ b/src/simde @@ -0,0 +1 @@ +Subproject commit 8cd136a43bae7ab9b82316179b9cef8887726778 diff --git a/src/small_bitset.hpp b/src/small_bitset.hpp new file mode 100644 index 00000000000..11e3fd691d2 --- /dev/null +++ b/src/small_bitset.hpp @@ -0,0 +1,161 @@ +#ifndef VG_SMALL_BITSET_INCLUDED +#define VG_SMALL_BITSET_INCLUDED + +#include + +#include + +namespace vg { + +/** + * A small bitset. We expect that the universe size is usually at most 64. + */ +class SmallBitset { + + public: + typedef std::uint64_t value_type; + + SmallBitset() : universe_size(0), data({ static_cast(0) }) {} + + explicit SmallBitset(size_t n) : universe_size(n) { + if (this->small()) { + this->data.value = 0; + } else { + size_t sz = this->data_size(); + this->data.pointer = new value_type[sz](); + } + } + + ~SmallBitset() { + this->clear(); + } + + SmallBitset(const SmallBitset& another) { + this->copy(another); + } + + SmallBitset(SmallBitset&& another) { + this->move(another); + } + + SmallBitset& operator=(const SmallBitset& another) { + if (&another != this) { + this->clear(); + this->copy(another); + } + return *this; + } + + SmallBitset& operator=(SmallBitset&& another) { + if (&another != this) { + this->clear(); + this->move(another); + } + return *this; + } + + size_t size() const { return this->universe_size; } + bool small() const { return (this->size() <= VALUE_BITS); } + size_t data_size() const { return (this->size() + VALUE_BITS - 1) / VALUE_BITS; } + + size_t count() const { + if (this->small()) { + return sdsl::bits::cnt(this->data.value); + } else { + size_t result = 0; + size_t sz = this->data_size(); + for (size_t i = 0; i < sz; i++) { + result += sdsl::bits::cnt(this->data.pointer[i]); + } + return result; + } + } + + void insert(size_t i) { + if (this->small()) { + this->data.value |= static_cast(1) << i; + } + else { + this->data.pointer[i >> VALUE_SHIFT] |= static_cast(1) << (i & VALUE_MASK); + } + } + + bool contains(size_t i) const { + if (this->small()) { + return (this->data.value & (static_cast(1) << i)); + } else { + return (this->data.pointer[i >> VALUE_SHIFT] & (static_cast(1) << (i & VALUE_MASK))); + } + } + + void operator|=(const SmallBitset& another) { + assert(this->size() == another.size()); + if (this->small()) { + this->data.value |= another.data.value; + } else { + size_t sz = this->data_size(); + for (size_t i = 0; i < sz; i++) { + this->data.pointer[i] |= another.data.pointer[i]; + } + } + } + + void operator&=(const SmallBitset& another) { + assert(this->size() == another.size()); + if (this->small()) { + this->data.value &= another.data.value; + } else 
{ + size_t sz = this->data_size(); + for (size_t i = 0; i < sz; i++) { + this->data.pointer[i] &= another.data.pointer[i]; + } + } + } + + private: + size_t universe_size; + union { + value_type value; + value_type* pointer; + } data; + + void clear() { + if (!this->small()) { + delete[] this->data.pointer; + } + this->universe_size = 0; + this->data.value = 0; + } + + void copy(const SmallBitset& another) { + this->universe_size = another.universe_size; + if (this->small()) { + this->data.value = another.data.value; + } else { + size_t sz = this->data_size(); + this->data.pointer = new value_type[sz]; + for (size_t i = 0; i < sz; i++) { + this->data.pointer[i] = another.data.pointer[i]; + } + } + } + + void move(SmallBitset& another) { + this->universe_size = another.universe_size; + if (this->small()) { + this->data.value = another.data.value; + } else { + this->data.pointer = another.data.pointer; + another.universe_size = 0; + another.data.value = 0; + } + } + + constexpr static size_t VALUE_SHIFT = 6; + constexpr static size_t VALUE_BITS = static_cast(1) << VALUE_SHIFT; + constexpr static size_t VALUE_MASK = VALUE_BITS - 1; +}; + +} + +#endif // VG_SMALL_BITSET_INCLUDED diff --git a/src/simplifier.cpp b/src/small_snarl_simplifier.cpp similarity index 87% rename from src/simplifier.cpp rename to src/small_snarl_simplifier.cpp index 5909d09c1f4..bbb9e435194 100644 --- a/src/simplifier.cpp +++ b/src/small_snarl_simplifier.cpp @@ -1,17 +1,19 @@ -#include "simplifier.hpp" +#include "small_snarl_simplifier.hpp" + +#include "cactus_snarl_finder.hpp" namespace vg { using namespace std; -Simplifier::Simplifier(VG& graph) : Progressive(), graph(graph), traversal_finder(graph) { +SmallSnarlSimplifier::SmallSnarlSimplifier(VG& graph) : Progressive(), graph(graph), traversal_finder(graph) { // create a SnarlManager using Cactus CactusSnarlFinder site_finder(graph); site_manager = site_finder.find_snarls(); } -pair Simplifier::simplify_once(size_t iteration) { +pair SmallSnarlSimplifier::simplify_once(size_t iteration) { // Set up the deleted node and edge counts pair to_return {0, 0}; @@ -20,7 +22,7 @@ pair Simplifier::simplify_once(size_t iteration) { if(!graph.is_valid(true, true, true, true)) { // Make sure the graph is valid and not missing nodes or edges - cerr << "error:[vg::Simplifier] Invalid graph on iteration " << iteration << endl; + cerr << "error:[vg::SmallSnarlSimplifier] Invalid graph on iteration " << iteration << endl; exit(1); } @@ -77,7 +79,7 @@ pair Simplifier::simplify_once(size_t iteration) { // We can't use the SnarlManager after we modify the graph, so we load the // contents of all the leaves we're going to modify first. - map, unordered_set>> leaf_contents; + map, unordered_set>> leaf_contents; // How big is each leaf in bp map leaf_sizes; @@ -94,11 +96,11 @@ pair Simplifier::simplify_once(size_t iteration) { leaf_contents[leaf] = site_manager.deep_contents(leaf, graph, false); // For each leaf, calculate its total size. 
- unordered_set& nodes = leaf_contents[leaf].first; + unordered_set& nodes = leaf_contents[leaf].first; size_t& total_size = leaf_sizes[leaf]; - for (Node* node : nodes) { + for (id_t node_id : nodes) { // For each node include it in the size figure - total_size += node->sequence().size(); + total_size += graph.get_length(graph.get_handle(node_id)); } if (total_size == 0) { @@ -122,8 +124,8 @@ pair Simplifier::simplify_once(size_t iteration) { // Look at all the leaves // Get the contents of the bubble, excluding the boundary nodes - unordered_set& nodes = leaf_contents[leaf].first; - unordered_set& edges = leaf_contents[leaf].second; + unordered_set& nodes = leaf_contents[leaf].first; + unordered_set& edges = leaf_contents[leaf].second; // For each leaf, grab its total size. size_t& total_size = leaf_sizes[leaf]; @@ -141,8 +143,8 @@ pair Simplifier::simplify_once(size_t iteration) { #ifdef debug cerr << "Found " << total_size << " bp leaf" << endl; - for (auto* node : nodes) { - cerr << "\t" << node->id() << ": " << node->sequence() << endl; + for (id_t node_id : nodes) { + cerr << "\t" << node << " = " << node_id << ": " << graph.get_sequence(graph.get_handle(node_id)) << endl; } #endif @@ -159,11 +161,20 @@ pair Simplifier::simplify_once(size_t iteration) { // Get the traversal out of the vector SnarlTraversal& traversal = traversals.front(); +#ifdef debug + cerr << "Chosen traversal has: " << traversal.visit_size() << " visits" << endl; +#endif + // Determine the length of the new traversal size_t new_site_length = 0; for (size_t i = 1; i < traversal.visit_size() - 1; i++) { // For every non-anchoring node const Visit& visit = traversal.visit(i); + +#ifdef debug + cerr << "Chosen traversal has: " << visit << endl; +#endif + // Total up the lengths of all the nodes that are newly visited. assert(visit.node_id()); new_site_length += graph.get_node(visit.node_id())->sequence().size(); @@ -230,7 +241,7 @@ pair Simplifier::simplify_once(size_t iteration) { if (here->node_id() == leaf->start().node_id() && here->is_reverse() != (leaf->start().backward() != backward)) { // We have encountered the start node with an incorrect orientation. - cerr << "warning:[vg simplify] Path " << path_name + cerr << "warning:[vg::SmallSnarlSimplifier] Path " << path_name << " doubles back through start of site " << to_node_traversal(leaf->start(), graph) << " - " << to_node_traversal(leaf->end(), graph) << "; skipping site!" << endl; @@ -281,7 +292,7 @@ pair Simplifier::simplify_once(size_t iteration) { if (here->node_id() == leaf->end().node_id() && here->is_reverse() != (leaf->end().backward() != backward)) { // We have encountered the end node with an incorrect orientation. - cerr << "warning:[vg simplify] Path " << path_name + cerr << "warning:[vg::SmallSnarlSimplifier] Path " << path_name << " doubles back through end of site " << to_node_traversal(leaf->start(), graph) << " - " << to_node_traversal(leaf->end(), graph) << "; dropping site!" << endl; @@ -301,7 +312,7 @@ pair Simplifier::simplify_once(size_t iteration) { if (found_hairpin) { // We found a hairpin, so we want to skip the site. - cerr << "warning:[vg simplify] Site " << to_node_traversal(leaf->start(), graph) << " - " << to_node_traversal(leaf->end(), graph) << " skipped due to hairpin path." << endl; + cerr << "warning:[vg::SmallSnarlSimplifier] Site " << to_node_traversal(leaf->start(), graph) << " - " << to_node_traversal(leaf->end(), graph) << " skipped due to hairpin path." 
<< endl; continue; } @@ -344,7 +355,7 @@ pair Simplifier::simplify_once(size_t iteration) { mapping_t* end_mapping = nullptr; #ifdef debug - cerr << "Scanning " << path_name << " from " << pb2json(*here) + cerr << "Scanning " << path_name << " from " << *here << " for " << to_node_traversal(leaf->end(), graph) << " orientation " << backward << endl; #endif @@ -352,7 +363,7 @@ pair Simplifier::simplify_once(size_t iteration) { // Until we hit the start/end of the path or the mapping we want #ifdef debug - cerr << "\tat " << pb2json(*here) << endl; + cerr << "\tat " << *here << endl; #endif if (here->node_id() == leaf->end().node_id() && @@ -376,7 +387,7 @@ pair Simplifier::simplify_once(size_t iteration) { if (here->node_id() == leaf->start().node_id() && here->is_reverse() != (leaf->start().backward() != backward)) { // We have encountered the start node with an incorrect orientation. - cerr << "warning:[vg simplify] Path " << path_name + cerr << "warning:[vg::SmallSnarlSimplifier] Path " << path_name << " doubles back through start of site " << to_node_traversal(leaf->start(), graph) << " - " << to_node_traversal(leaf->end(), graph) << "; dropping!" << endl; @@ -386,11 +397,16 @@ pair Simplifier::simplify_once(size_t iteration) { break; } - if (!nodes.count(graph.get_node(here->node_id()))) { + if (here->node_id() != leaf->start().node_id() && + here->node_id() != leaf->end().node_id() && + !nodes.count(here->node_id())) { + // We aren't the start, the end, or any internal contained node. + // That's an error! // We really should stay inside the site! - cerr << "error:[vg simplify] Path " << path_name + cerr << "error:[vg::SmallSnarlSimplifier] Path " << path_name << " somehow escapes site " << to_node_traversal(leaf->start(), graph) - << " - " << to_node_traversal(leaf->end(), graph) << endl; + << " - " << to_node_traversal(leaf->end(), graph) << " and reaches non-contained node " + << here->node_id() << " at address " << graph.get_node(here->node_id()) << endl; exit(1); } @@ -401,8 +417,12 @@ pair Simplifier::simplify_once(size_t iteration) { existing_mappings.push_back(here); } - // Scan left along ther path if we found the site start backwards, and right if we found it forwards. + // Scan left along the path if we found the site start backwards, and right if we found it forwards. mapping_t* next = backward ? graph.paths.traverse_left(here) : graph.paths.traverse_right(here); + +#ifdef debug + cerr << "Here is " << *here << " at " << here << " and next is " << *next << " at " << next << endl; +#endif if (next == nullptr) { // We hit the end of the path without finding the end of the site. @@ -421,7 +441,7 @@ pair Simplifier::simplify_once(size_t iteration) { // Make sure we have an edge so we can traverse this node and then the node we're going to. if(graph.get_edge(here_traversal, next_traversal) == nullptr) { - cerr << "error:[vg::Simplifier] No edge " << here_traversal << " to " << next_traversal << endl; + cerr << "error:[vg::SmallSnarlSimplifier] No edge " << here_traversal << " to " << next_traversal << endl; exit(1); } @@ -490,7 +510,9 @@ pair Simplifier::simplify_once(size_t iteration) { #endif // Actually update any BED features - features.on_path_edit(path_name, variable_start, old_site_length, new_site_length); + if (features != nullptr) { + features->on_path_edit(path_name, variable_start, old_site_length, new_site_length); + } // Where will we insert the new site traversal into the path? 
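Note on the guarded `features->on_path_edit(path_name, variable_start, old_site_length, new_site_length)` call above: it tells an optional FeatureSet that `old_site_length` bases starting at `variable_start` on `path_name` were replaced by `new_site_length` bases. The sketch below is only a guess at the bookkeeping such a listener performs on BED-like intervals (shift features downstream of the edit, stretch features that span it); it is not vg's FeatureSet implementation.

```
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// A BED-like feature on a path: 0-based, half-open [start, end).
struct Feature {
    std::string path;
    int64_t start;
    int64_t end;
    std::string name;
};

// Hypothetical listener: adjust feature coordinates on `path` after the region
// [edit_start, edit_start + old_length) was replaced by new_length bases.
// Features entirely downstream shift; features spanning the edit grow or
// shrink by the length difference. This models the bookkeeping only.
void on_path_edit(std::vector<Feature>& features, const std::string& path,
                  int64_t edit_start, int64_t old_length, int64_t new_length) {
    int64_t shift = new_length - old_length;
    int64_t edit_end = edit_start + old_length;
    for (Feature& f : features) {
        if (f.path != path) continue;
        if (f.start >= edit_end) {
            f.start += shift;        // entirely downstream: shift both ends
            f.end += shift;
        } else if (f.end > edit_start) {
            f.end += shift;          // spans the edit: only the end moves
        }                            // entirely upstream: untouched
    }
}

int main() {
    std::vector<Feature> features = {
        {"chr1", 100, 200, "upstream"},
        {"chr1", 450, 900, "spanning"},
        {"chr1", 1200, 1300, "downstream"},
    };
    // A 100 bp site starting at 500 was simplified down to 40 bp.
    on_path_edit(features, "chr1", 500, 100, 40);
    for (const Feature& f : features) {
        std::cout << f.name << "\t" << f.start << "\t" << f.end << std::endl;
    }
    // upstream stays 100-200, spanning becomes 450-840, downstream becomes 1140-1240.
}
```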
list::iterator insert_position; @@ -505,7 +527,7 @@ pair Simplifier::simplify_once(size_t iteration) { // mapping to the end of the site. #ifdef debug - cerr << path_name << ": Drop mapping " << pb2json(*mapping) << endl; + cerr << path_name << " forward: Drop mapping " << *mapping << endl; #endif insert_position = graph.paths.remove_mapping(mapping); @@ -524,11 +546,19 @@ pair Simplifier::simplify_once(size_t iteration) { assert(insert_position->node_id() == leaf->end().node_id()); } +#ifdef debug + cerr << "Chosen traversal has " << traversal.visit_size() << " visits" << endl; +#endif + // Loop through the internal visits in the canonical // traversal backwards along the path we are splicing. If // it's a forward path this is just right to left, but if // it's a reverse path it has to be left to right. - for (size_t i = 0; i < traversal.visit_size(); i++) { + for (size_t i = 1; i + 1 < traversal.visit_size(); i++) { + // Don't visit the first or last node on the traversal + // because they are the snarl start and end which we didn't + // remove. + // Find the visit we need next, as a function of which // way we need to insert this run of visits. Normally we // go through the visits right to left, but when we have @@ -549,7 +579,7 @@ pair Simplifier::simplify_once(size_t iteration) { new_mapping.length = node_seq_length; #ifdef debug - cerr << path_name << ": Add mapping " << pb2json(new_mapping) << endl; + cerr << path_name << " backward: Add mapping " << new_mapping << endl; #endif // Insert the mapping in the path, moving right to left @@ -604,7 +634,7 @@ pair Simplifier::simplify_once(size_t iteration) { if (here->node_id() == leaf->end().node_id() && here->is_reverse() != (leaf->end().backward() != backward)) { // We have encountered the end node with an incorrect orientation. - cerr << "warning:[vg simplify] Path " << path_name + cerr << "warning:[vg::SmallSnarlSimplifier] Path " << path_name << " doubles back through end of site " << to_node_traversal(leaf->start(), graph) << " - " << to_node_traversal(leaf->end(), graph) << "; dropping!" << endl; @@ -651,8 +681,12 @@ pair Simplifier::simplify_once(size_t iteration) { // For each node and the next node (which won't be the end) const Visit visit = traversal.visit(i); - const Visit next = traversal.visit(i); - + const Visit next = traversal.visit(i+1); + +#ifdef debug + cerr << "Follow edge from " << visit << " to " << next << endl; +#endif + // Find the edge between them NodeTraversal here(graph.get_node(visit.node_id()), visit.backward()); NodeTraversal next_traversal(graph.get_node(next.node_id()), next.backward()); @@ -662,22 +696,14 @@ pair Simplifier::simplify_once(size_t iteration) { // Remember we need it blessed_edges.insert(edge); } + + // The traversal also touches the boundary nodes, so don't do anything special for them. 
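Note on the pruning step in this function: only the nodes and edges touched by the one chosen traversal are "blessed", and everything else inside the site is destroyed; because the traversal now includes the boundary nodes, the diff drops the old special case for boundary-attached edges. Below is a minimal standalone model of that partition, using plain integer node ids and node-id pairs in place of the vg graph API.

```
#include <iostream>
#include <set>
#include <utility>
#include <vector>

using NodeId = long long;
using Edge = std::pair<NodeId, NodeId>;  // simplified: orientation ignored

int main() {
    // Contents of a leaf site (excluding its boundary nodes 1 and 6), as in the diff.
    std::set<NodeId> site_nodes = {2, 3, 4, 5};
    std::set<Edge> site_edges = {{1, 2}, {1, 3}, {2, 4}, {3, 4}, {4, 5}, {5, 6}};

    // The one traversal we keep, boundary to boundary: 1 -> 3 -> 4 -> 5 -> 6.
    std::vector<NodeId> traversal = {1, 3, 4, 5, 6};

    // "Bless" every node and edge the traversal touches. Because the traversal
    // includes the boundary nodes, the boundary-attached edges need no special
    // casing, which is why the diff removes that block.
    std::set<NodeId> blessed_nodes(traversal.begin(), traversal.end());
    std::set<Edge> blessed_edges;
    for (size_t i = 0; i + 1 < traversal.size(); i++) {
        blessed_edges.insert({traversal[i], traversal[i + 1]});
    }

    // Everything in the site that is not blessed gets destroyed.
    for (const Edge& e : site_edges) {
        if (!blessed_edges.count(e)) {
            std::cout << "destroy edge " << e.first << "->" << e.second << std::endl;
        }
    }
    for (NodeId n : site_nodes) {
        if (!blessed_nodes.count(n)) {
            std::cout << "destroy node " << n << std::endl;
        }
    }
    // Prints: destroy edge 1->2, destroy edge 2->4, destroy node 2.
}
```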
- // Also get the edges from the boundary nodes into the traversal - if (traversal.visit_size() > 0) { - NodeTraversal first_visit = to_node_traversal(traversal.visit(0), graph); - NodeTraversal last_visit = to_node_traversal(traversal.visit(traversal.visit_size() - 1), - graph); - blessed_edges.insert(graph.get_edge(to_node_traversal(leaf->start(), graph), first_visit)); - blessed_edges.insert(graph.get_edge(last_visit, to_node_traversal(leaf->end(), graph))); - } - else { - // This is a deletion traversal, so get the edge from the start to end of the site - blessed_edges.insert(graph.get_edge(to_node_traversal(leaf->start(), graph), - to_node_traversal(leaf->end(), graph))); - } - - for (auto* edge : edges) { + for (const edge_t& edge_handle : edges) { + Node* from_node = graph.get_node(graph.get_id(edge_handle.first)); + Node* to_node = graph.get_node(graph.get_id(edge_handle.second)); + Edge* edge = graph.get_edge(NodeTraversal(from_node, graph.get_is_reverse(edge_handle.first)), + NodeTraversal(to_node, graph.get_is_reverse(edge_handle.second))); if (!blessed_edges.count(edge)) { // Get rid of all the edges not needed for the one true traversal #ifdef debug @@ -700,7 +726,8 @@ pair Simplifier::simplify_once(size_t iteration) { blessed_nodes.insert(graph.get_node(visit.node_id())); } - for (auto* node : nodes) { + for (id_t node_id : nodes) { + Node* node = graph.get_node(node_id); // For every node in the site if (!blessed_nodes.count(node)) { // If we don't need it for the chosen path, destroy it @@ -725,7 +752,7 @@ pair Simplifier::simplify_once(size_t iteration) { } for (auto& path : paths_to_kill) { graph.paths.remove_path(path); - cerr << "warning:[vg simplify] Path " << path << " removed" << endl; + cerr << "warning:[vg::SmallSnarlSimplifier] Path " << path << " removed" << endl; } graph.destroy_node(node); @@ -748,7 +775,7 @@ pair Simplifier::simplify_once(size_t iteration) { } -void Simplifier::simplify() { +void SmallSnarlSimplifier::simplify() { for (size_t i = 0; i < max_iterations; i++) { // Try up to the max number of iterations auto deleted_elements = simplify_once(i); diff --git a/src/simplifier.hpp b/src/small_snarl_simplifier.hpp similarity index 76% rename from src/simplifier.hpp rename to src/small_snarl_simplifier.hpp index 3105b200dba..65463a61686 100644 --- a/src/simplifier.hpp +++ b/src/small_snarl_simplifier.hpp @@ -1,10 +1,10 @@ -#ifndef VG_SIMPLIFIER_HPP_INCLUDED -#define VG_SIMPLIFIER_HPP_INCLUDED +#ifndef VG_SMALL_SNARL_SIMPLIFIER_HPP_INCLUDED +#define VG_SMALL_SNARL_SIMPLIFIER_HPP_INCLUDED #include "progressive.hpp" #include "vg.hpp" -#include "vg.pb.h" +#include #include "traversal_finder.hpp" #include "utility.hpp" #include "path.hpp" @@ -26,11 +26,11 @@ using namespace std; * like features up to date. TODO: doesn't handle path start and end positions * within nodes. */ -class Simplifier : public Progressive { +class SmallSnarlSimplifier : public Progressive { public: /// Make a simplifier that simplifies the given graph in place. - Simplifier(VG& graph); + SmallSnarlSimplifier(VG& graph); /// Simplify the graph by one step. Returns the number of nodes deleted and /// the number of edges deleted. Can be passed an iteration for its progress @@ -53,9 +53,11 @@ class Simplifier : public Progressive { /// bubbles unsimplified? bool drop_hairpin_paths = false; - /// Stores the features in the graph, and gets updated as simplification - /// proceeds. The user should load the features in and pull them out. 
- FeatureSet features; + /// If the user points this to a FeatureSet, that FeatureSet will get its + /// features updated with changes to the graph as simplification proceeds. + /// The user should load the features in and pull them out. + /// TODO: Replace this with an on_path_edit event on this object that can be listened on. + FeatureSet* features = nullptr; protected: diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp new file mode 100644 index 00000000000..a6eebcbe93b --- /dev/null +++ b/src/snarl_caller.cpp @@ -0,0 +1,954 @@ +#include "snarl_caller.hpp" +#include "genotypekit.hpp" + +//#define debug + +namespace vg { + +SnarlCaller::~SnarlCaller() { +} + +function SnarlCaller::get_skip_allele_fn() const { + // default implementation says don't skip anything + return [](const SnarlTraversal&, int) { assert(false); return false; }; +} + +SupportBasedSnarlCaller::SupportBasedSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager, + TraversalSupportFinder& support_finder) : + graph(graph), + snarl_manager(snarl_manager), + support_finder(support_finder) { + +} + +SupportBasedSnarlCaller::~SupportBasedSnarlCaller() { + +} + +void SupportBasedSnarlCaller::update_vcf_info(const Snarl& snarl, + const vector& traversals, + const vector& genotype, + const unique_ptr& call_info, + const string& sample_name, + vcflib::Variant& variant) { + + +} + +TraversalSupportFinder& SupportBasedSnarlCaller::get_support_finder() const { + return support_finder; +} + +int SupportBasedSnarlCaller::get_min_total_support_for_call() const { + return min_total_support_for_call; +} + +void SupportBasedSnarlCaller::set_min_supports(double min_mad_for_call, double min_support_for_call, double min_site_support) { + if (min_mad_for_call >= 0) { + min_mad_for_filter = min_mad_for_call; + } + if (min_support_for_call >= 0) { + min_total_support_for_call = min_support_for_call; + } + if (min_site_support >= 0) { + min_site_depth = min_site_support; + } +} + +int SupportBasedSnarlCaller::get_best_support(const vector& supports, const vector& skips) { + int best_allele = -1; + for(size_t i = 0; i < supports.size(); i++) { + if(std::find(skips.begin(), skips.end(), i) == skips.end() && ( + best_allele == -1 || support_val(supports[best_allele]) <= support_val(supports[i]))) { + best_allele = i; + } + } + return best_allele; +} + +function SupportBasedSnarlCaller::get_skip_allele_fn() const { + // port over cutoff used in old support caller (there avg support used all the time, here + // we use the same toggles as when genotyping) + return [&](const SnarlTraversal& trav, int iteration) -> bool { + return support_val(support_finder.get_traversal_support(trav)) < pow(2, iteration) * min_alt_path_support; + }; +} + +RatioSupportSnarlCaller::RatioSupportSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager, + TraversalSupportFinder& support_finder) : + SupportBasedSnarlCaller(graph, snarl_manager, support_finder) { +} + +RatioSupportSnarlCaller::~RatioSupportSnarlCaller() { + +} + +void RatioSupportSnarlCaller::set_het_bias(double het_bias, double ref_het_bias) { + // want to move away from ugly hacks that treat the reference traversal differently, + // so keep all these set the same + if (het_bias >= 0) { + max_het_bias = het_bias; + max_ref_het_bias = het_bias; + max_indel_het_bias = het_bias; + } + if (ref_het_bias >= 0) { + max_ref_het_bias = ref_het_bias; + } +} + +pair, unique_ptr> RatioSupportSnarlCaller::genotype(const Snarl& snarl, + const vector& traversals, + int ref_trav_idx, + int 
ploidy, + const string& ref_path_name, + pair ref_range) { + +#ifdef debug + cerr << "Support calling site " << pb2json(snarl) << endl; +#endif + + // get the traversal sizes + vector traversal_sizes = support_finder.get_traversal_sizes(traversals); + + // get the supports of each traversal independently + vector supports = support_finder.get_traversal_set_support(traversals, {}, {}, {}, false, {}, {}, ref_trav_idx); + int best_allele = get_best_support(supports, {}); + +#ifdef debug + for (int i = 0; i < traversals.size(); ++i) { + cerr << "trav " << i << " size = " << traversal_sizes[i] << " support = " << support_val(supports[i]); + if (i == ref_trav_idx) { + cerr << " [Reference traversal]"; + } + cerr << endl; + } +#endif + + // we prune out traversals whose exclusive support (structure that is not shared with best traversal) + // doesn't meet a certain cutoff + vector secondary_exclusive_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, {}, {}, true, {}, {}, ref_trav_idx); + vector skips = {best_allele}; + for (int i = 0; i < secondary_exclusive_supports.size(); ++i) { + double bias = get_bias(traversal_sizes, i, best_allele, ref_trav_idx); +#ifdef debug + cerr << "trav " << i << " exclusive support " << support_val(secondary_exclusive_supports[i]) + << " * bias " << bias << " vs " << support_val(supports[best_allele]) << endl; +#endif + if (i != best_allele && support_val(secondary_exclusive_supports[i]) * bias <= support_val(supports[best_allele])) { + skips.push_back(i); + } + } + // get the supports of each traversal in light of best + vector secondary_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, {}, {}, false, {}, {}, ref_trav_idx); + int second_best_allele = get_best_support(secondary_supports, {skips}); + + // get the supports of each traversal in light of second best + // for special case where we may call two alts, with each having less support than ref + vector tertiary_supports; + int third_best_allele = -1; + if (second_best_allele != -1) { + // prune out traversals whose exclusive support relative to second best doesn't pass cut + vector tertiary_exclusive_supports = support_finder.get_traversal_set_support(traversals, {second_best_allele}, {}, {}, true, {}, {}, ref_trav_idx); + skips.push_back(best_allele); + skips.push_back(second_best_allele); + for (int i = 0; i < tertiary_exclusive_supports.size(); ++i) { + double bias = get_bias(traversal_sizes, i, second_best_allele, ref_trav_idx); + if (support_val(tertiary_exclusive_supports[i]) * bias <= support_val(supports[second_best_allele])) { + skips.push_back(i); + } + } + tertiary_supports = support_finder.get_traversal_set_support(traversals, {second_best_allele}, {}, {}, false, {}, {}, ref_trav_idx); + third_best_allele = get_best_support(tertiary_supports, skips); + } + + + // Now make a genotype call at this site, up to the allowed copy number + vector genotype; + + // How much support do we have for the top two alleles? + Support site_support = supports.at(best_allele); + if(second_best_allele != -1) { + site_support += supports.at(second_best_allele); + } + + // Pull out the different supports. Some of them may be the same. 
+ Support best_support = supports.at(best_allele); + Support second_best_support; // Defaults to 0 + if(second_best_allele != -1) { + second_best_support = supports.at(second_best_allele); + } + Support third_best_support; + if (third_best_allele != -1) { + third_best_support = supports.at(third_best_allele); + } + +#ifdef debug + cerr << "best allele=" << best_allele << ", best sup=" << best_support << " and " + << "2nd_best_allele=" << second_best_allele << ", 2nd best sup=" << second_best_support << " and " + << "3rd_best_allele=" << third_best_allele << ", 3rd best sup=" << third_best_support << endl; + + if (support_val(second_best_support) > 0) { + cerr << "Bias: (limit " << get_bias(traversal_sizes, best_allele, second_best_allele, ref_trav_idx) << "):" + << support_val(best_support)/support_val(second_best_support) << endl; + } + + cerr << get_bias(traversal_sizes, best_allele, second_best_allele, ref_trav_idx) * support_val(second_best_support) << " vs " + << support_val(best_support) << endl; + + cerr << total(second_best_support) << " vs " << min_total_support_for_call << endl; +#endif + + // Single ploidy case when doing recursive genotyping. Just return the best allele + if (ploidy == 1) { + return make_pair(vector(1, best_allele), unique_ptr()); + } + // Call 1/2 : REF-Alt1/Alt2 even if Alt2 has only third best support + else if (ploidy >= 2 && + third_best_allele > 0 && + best_allele == ref_trav_idx && + max_ma_bias * + support_val(third_best_support) >= support_val(best_support) && + total(second_best_support) > min_total_support_for_call && + total(third_best_support) > min_total_support_for_call) { + // There's a second best allele and third best allele, and it's not too biased to call, + // and both alleles exceed the minimum to call them present, and the + // second-best and third-best alleles have enough support that it won't torpedo the + // variant. + +#ifdef debug + cerr << "Call as second best/third best" << endl; +#endif + // Say both are present + genotype = {second_best_allele, third_best_allele}; + } + // Call 1/2 : REF-Alt1/Alt2 even if Alt2 has only third best support (but ref is second best) + else if (ploidy >= 2 && + third_best_allele > 0 && + second_best_allele == ref_trav_idx && + max_ma_bias * + support_val(third_best_support) >= support_val(second_best_support) && + total(best_support) > min_total_support_for_call && + total(third_best_support) > min_total_support_for_call) { + // There's a second best allele and third best allele, and it's not too biased to call, + // and both alleles exceed the minimum to call them present, and the + // second-best and third-best alleles have enough support that it won't torpedo the + // variant. + +#ifdef debug + cerr << "Call as second best/third best" << endl; +#endif + // Say both are present + genotype = {best_allele, third_best_allele}; + } + else if (ploidy >= 2 && + second_best_allele != -1 && + get_bias(traversal_sizes, best_allele, second_best_allele, ref_trav_idx) * + support_val(second_best_support) >= support_val(best_support) && + total(best_support) > min_total_support_for_call && + total(second_best_support) > min_total_support_for_call) { + // There's a second best allele, and it's not too biased to call, + // and both alleles exceed the minimum to call them present, and the + // second-best allele has enough support that it won't torpedo the + // variant. 
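Note on the cascade of branches in this genotype() function: for ploidy 2 it reduces to calling heterozygous when the bias-scaled second-best support reaches the best support and both alleles clear the minimum read count, otherwise calling the best allele homozygous if it clears the minimum, otherwise making no call. The standalone condensation below omits the REF/Alt1/Alt2 third-allele branches; plain doubles stand in for vg's Support, and the example thresholds mirror the defaults declared in the header (max_het_bias of 6, min_total_support_for_call of 2).

```
#include <iostream>
#include <vector>

// Condensed model of the ratio-based call (ploidy 2, no 1/2 calls).
// `best` / `second_best` are total read supports of the two top traversals,
// `bias_limit` plays the role of get_bias(), and `min_total_support` mirrors
// min_total_support_for_call.
std::vector<int> ratio_genotype(double best, double second_best,
                                int best_allele, int second_best_allele,
                                double bias_limit, double min_total_support) {
    bool have_second = second_best_allele >= 0;
    if (have_second &&
        bias_limit * second_best >= best &&
        best > min_total_support &&
        second_best > min_total_support) {
        return {best_allele, second_best_allele};   // heterozygous
    }
    if (best > min_total_support) {
        return {best_allele, best_allele};          // homozygous for the best allele
    }
    return {};                                      // not enough coverage: no call
}

int main() {
    // 14 reads support the ref traversal (allele 0), 5 support an alt (allele 1):
    // 6 * 5 >= 14 and both clear the minimum, so the call is heterozygous 0/1.
    for (int a : ratio_genotype(14, 5, 0, 1, 6, 2)) std::cout << a << " ";
    std::cout << std::endl;   // prints: 0 1

    // With only 1 supporting read on the alt the bias test fails: homozygous 0/0.
    for (int a : ratio_genotype(14, 1, 0, 1, 6, 2)) std::cout << a << " ";
    std::cout << std::endl;   // prints: 0 0
}
```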
+ +#ifdef debug + cerr << "Call as best/second best" << endl; +#endif + + // Say both are present + genotype = {best_allele, second_best_allele}; + + } else if (ploidy >= 2 && total(best_support) > min_total_support_for_call) { + // The second best allele isn't present or isn't good enough, + // but the best allele has enough coverage that we can just call + // two of it. + +#ifdef debug + cerr << "Call as best/best" << endl; +#endif + + // Say the best is present twice + genotype = {best_allele, best_allele}; + + } else if (ploidy >= 1 && total(best_support) > min_total_support_for_call) { + // We're only supposed to have one copy, and the best allele is good enough to call + +#ifdef debug + cerr << "Call as best" << endl; +#endif + genotype = {best_allele}; + } else { + // Either coverage is too low, or we aren't allowed any copies. + // We can't really call this as anything. + +#ifdef debug + cerr << "Do not call" << endl; +#endif + + } + + // Todo: specify call_info to use new interface, then fix up update_vcf_info to read it, + // and move common logic up to SupportBasedCaller if possible. + return make_pair(genotype, unique_ptr()); +} + +void RatioSupportSnarlCaller::update_vcf_info(const Snarl& snarl, + const vector& traversals, + const vector& genotype, + const unique_ptr& call_info, + const string& sample_name, + vcflib::Variant& variant) { + + assert(traversals.size() == variant.alleles.size()); + + set called_allele_set(genotype.begin(), genotype.end()); + vector shared_travs; + if (called_allele_set.size() > 1) { + shared_travs.push_back(genotype[0]); + } + // compute the support of our called alleles + vector allele_supports = support_finder.get_traversal_genotype_support(traversals, genotype, {}, 0); + + // Compute the total support for all the alts that will be appearing + Support total_support = std::accumulate(allele_supports.begin(), allele_supports.end(), Support()); + + // Set up the depth format field + variant.format.push_back("DP"); + // And allelic depth + variant.format.push_back("AD"); + // And the log likelihood from the assignment of reads among the + // present alleles + variant.format.push_back("XADL"); + // Also the alt allele depth + variant.format.push_back("XAAD"); + + // And total alt allele depth for the alt alleles + Support alt_support; + // Find the min total support of anything called + double min_site_support = called_allele_set.size() > 0 ? INFINITY : 0; + + if (!allele_supports.empty()) { //only add info if we made a call + for (int allele = 0; allele < traversals.size(); ++allele) { + bool is_called = called_allele_set.count(allele); + auto& support = allele_supports[allele]; + + // Set up allele-specific stats for the allele + variant.samples[sample_name]["AD"].push_back(std::to_string((int64_t)round(total(support)))); + + if (allele != 0) { + // It's not the primary reference allele + alt_support += support; + } + + // Min all the total supports from the alleles called as present + if (is_called) { + min_site_support = min(min_site_support, total(support)); + } + } + } + + + // Find the binomial bias between the called alleles, if multiple were called. + double ad_log_likelihood = INFINITY; + if (called_allele_set.size() == 2) { + int best_allele = genotype[0]; + int second_best_allele = genotype[1]; + // How many of the less common one do we have? 
+ size_t successes = round(total(allele_supports[second_best_allele])); + // Out of how many chances + size_t trials = successes + (size_t) round(total(allele_supports[best_allele])); + + assert(trials >= successes); + + // How weird is that? + ad_log_likelihood = binomial_cmf_ln(prob_to_logprob((real_t) 0.5), trials, successes); + + assert(!std::isnan(ad_log_likelihood)); + + variant.samples[sample_name]["XADL"].push_back(std::to_string(ad_log_likelihood)); + } else { + // No need to assign reads between two alleles + variant.samples[sample_name]["XADL"].push_back("."); + } + + // Set the variant's total depth + string depth_string = std::to_string((int64_t)round(total(total_support))); + variant.info["DP"].push_back(depth_string); // We only have one sample, so variant depth = sample depth + + // And for the sample + variant.samples[sample_name]["DP"].push_back(depth_string); + + // And its depth of non-0 alleles + variant.samples[sample_name]["XAAD"].push_back(std::to_string((int64_t)round(total(alt_support)))); + + // Set the total support of the min allele as the variant quality + variant.quality = min_site_support; + + // And store the minimum support just to be clear + variant.format.push_back("MAD"); + variant.samples[sample_name]["MAD"].push_back(std::to_string((int)(min_site_support))); + + // Now do the filters + variant.filter = "PASS"; + if (min_site_support < min_mad_for_filter) { + // Apply Min Allele Depth cutoff across all alleles (even ref) + variant.filter = "lowad"; + } else if (min_ad_log_likelihood_for_filter != 0 && + ad_log_likelihood < min_ad_log_likelihood_for_filter) { + // We have a het, but the assignment of reads between the two branches is just too weird + variant.filter = "lowxadl"; + } else if ((int64_t)round(total(total_support)) < min_site_depth) { + // we don't have enough support to want to make a call + variant.filter = "lowdepth"; + } +} + +void RatioSupportSnarlCaller::update_vcf_header(string& header) const { + header += "##INFO=\n"; + header += "##FORMAT=\n"; + header += "##FORMAT=\n"; + header += "##FORMAT=\n"; + // We need this field to stratify on for VCF comparison. The info is in SB but vcfeval can't pull it out + header += "##FORMAT=\n"; + header += "##FORMAT=\n"; + header += "##FILTER=\n"; + header += "##FILTER=\n"; + header += "##FILTER=\n"; +} + +double RatioSupportSnarlCaller::get_bias(const vector& traversal_sizes, int best_trav, + int second_best_trav, int ref_trav_idx) const { + bool is_indel = ((best_trav >= 0 && traversal_sizes[best_trav] != traversal_sizes[ref_trav_idx]) || + (second_best_trav >=0 && traversal_sizes[second_best_trav] != traversal_sizes[ref_trav_idx])); + + double bias_limit = 1; + + if (best_trav >= 0 && second_best_trav >=0) { + if (best_trav == ref_trav_idx) { + // Use ref bias limit + + // We decide closeness differently depending on whether best is ref + // or not. In practice, we use this to slightly penalize homozygous + // ref calls (by setting max_ref_het_bias higher than max_het_bias) + // and rather make a less supported alt call instead. This boost + // max sensitivity, and because everything is homozygous ref by + // default in VCF, any downstream filters will effectively reset + // these calls back to homozygous ref. TODO: This shouldn't apply + // when off the primary path! 
+ bias_limit = max_ref_het_bias; + } else if (is_indel) { + // This is an indel + // Use indel bias limit + bias_limit = max_indel_het_bias; + } else { + // Use normal het bias limit + bias_limit = max_het_bias; + } + } + return bias_limit; +} + + +PoissonSupportSnarlCaller::PoissonSupportSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager, + TraversalSupportFinder& support_finder, + const algorithms::BinnedDepthIndex& depth_index, + bool use_mapq) : + SupportBasedSnarlCaller(graph, snarl_manager, support_finder), + depth_index(depth_index), + use_mapq(use_mapq) { + +} + +PoissonSupportSnarlCaller::~PoissonSupportSnarlCaller() { + +} + +void PoissonSupportSnarlCaller::set_baseline_error(double small_variant_error, double large_variant_error) { + if (small_variant_error >= 0) { + baseline_error_small = small_variant_error; + } + if (large_variant_error >= 0) { + baseline_error_large = large_variant_error; + } +} + +void PoissonSupportSnarlCaller::set_insertion_bias(double insertion_threshold, double small_insertion_bias, double large_insertion_bias) { + this->insertion_threshold = insertion_threshold; + if (small_insertion_bias >= 0) { + insertion_bias_small = small_insertion_bias; + } + if (large_insertion_bias >= 0) { + insertion_bias_large = large_insertion_bias; + } +} + +pair, unique_ptr> PoissonSupportSnarlCaller::genotype(const Snarl& snarl, + const vector& traversals, + int ref_trav_idx, + int ploidy, + const string& ref_path_name, + pair ref_range) { + + +#ifdef debug + cerr << "Poisson Support calling site " << pb2json(snarl) + << " on path " << ref_path_name << ":" << ref_range.first << "-" << ref_range.second << endl; +#endif + + assert(ploidy == 2 || ploidy == 1); + + // get the traversal sizes + vector traversal_sizes = support_finder.get_traversal_sizes(traversals); + + // get the mapqs + vector traversal_mapqs; + if (use_mapq) { + // note: we are only looking at nodes for mapqs, not edges + traversal_mapqs = support_finder.get_traversal_mapqs(traversals); + } + + // get the supports of each traversal independently + int max_trav_size = -1; + vector supports = support_finder.get_traversal_set_support(traversals, {}, {}, {}, false, {}, {}, ref_trav_idx, &max_trav_size); + + int ref_trav_size = 0; + if (ref_trav_idx >= 0) { + const SnarlTraversal& ref_trav = traversals[ref_trav_idx]; + for (int64_t i = 1; i < (int64_t)ref_trav.visit_size() - 1; ++i) { + ref_trav_size += graph.get_length(graph.get_handle(ref_trav.visit(i).node_id())); + } + } + + // sort the traversals by support + vector ranked_traversals = rank_by_support(supports); + size_t max_trav = std::min(top_k, (size_t)ranked_traversals.size()); + size_t max_sec_trav = std::min(top_m, (size_t)ranked_traversals.size()); + // take the top-m traversals in order to check against the top traversal + set top_traversals(ranked_traversals.begin(), ranked_traversals.begin() + max_sec_trav); + + // the candidate genotypes and their supports. 
the numbers here are alleles as indexed in traversals[] + set> candidates; + // we always consider the reference allele + + // pre-filter out some alleles based on poor exclusive support + set skips; + + // consider each of the top 25 traversals as our top_traversal + for (int i = 0; i < max_trav; ++i) { + + int best_allele = ranked_traversals[i]; + + if (skips.count(best_allele)) { + continue; + } + if (support_val(supports[best_allele]) < min_total_support_for_call && candidates.size() >= max_trav) { + break; + } + + if (ploidy == 1) { + candidates.insert({best_allele}); + } else { + assert(ploidy == 2); + + // we prune out traversals whose exclusive support (structure that is not shared with best traversal) + // doesn't meet a certain cutoff + vector secondary_exclusive_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, {}, top_traversals, true, {}, {}, ref_trav_idx, &max_trav_size); + for (int j = 0; j < secondary_exclusive_supports.size(); ++j) { + if (j != best_allele && + support_val(secondary_exclusive_supports[j]) < min_total_support_for_call && + support_val(secondary_exclusive_supports[j]) < support_val(supports[j])) { + skips.insert(j); + } + } + + // get the supports of each traversal in light of best + vector secondary_supports = support_finder.get_traversal_set_support(traversals, {best_allele}, {}, top_traversals, false, {}, {}, ref_trav_idx, &max_trav_size); + vector ranked_secondary_traversals = rank_by_support(secondary_supports); + + // add the homozygous genotype for our best allele + candidates.insert({best_allele, best_allele}); + + // now look at the top-k second-best traversals + size_t sec_count = 0; + for (int j = 0; j < ranked_secondary_traversals.size() && sec_count < top_k; ++j) { + int second_best_allele = ranked_secondary_traversals[j]; + if (support_val(secondary_supports[second_best_allele]) < min_total_support_for_call && candidates.size() >= max_trav) { + break; + } + if (!skips.count(second_best_allele) && second_best_allele != best_allele) { + // canonical ordering for our set + candidates.insert({min(best_allele, second_best_allele), max(best_allele, second_best_allele)}); + // also make sure we have our homozygous genotype for the second best allele + candidates.insert({second_best_allele, second_best_allele}); + ++sec_count; + } + } + } + } + + // expected depth from our coverage + auto depth_info = algorithms::get_depth_from_index(depth_index, ref_path_name, ref_range.first, ref_range.second); + double exp_depth = depth_info.first; + assert(!isnan(exp_depth)); + // variance/std-err can be nan when binsize < 2. We just clamp it to 0 + double depth_err = depth_info.second ? !isnan(depth_info.second) : 0.; + + // genotype (log) likelihoods + double best_genotype_likelihood = -numeric_limits::max(); + double second_best_genotype_likelihood = -numeric_limits::max(); + double total_likelihood = 0; + vector best_genotype; + for (const auto& candidate : candidates) { + double gl = genotype_likelihood(candidate, traversals, top_traversals, traversal_sizes, traversal_mapqs, + ref_trav_idx, exp_depth, depth_err, max_trav_size, ref_trav_size); + if (gl > best_genotype_likelihood) { + second_best_genotype_likelihood = best_genotype_likelihood; + best_genotype_likelihood = gl; + best_genotype = candidate; + } else if (gl > second_best_genotype_likelihood) { + assert(gl <= best_genotype_likelihood); + second_best_genotype_likelihood = gl; + } + total_likelihood = total_likelihood == 0 ? 
gl : add_log(total_likelihood, gl); + } + + PoissonCallInfo* call_info = new PoissonCallInfo(); + + call_info->posterior = 0; + if (!candidates.empty()) { + // compute the posterior from our likelihoods using a uniform prior + call_info->posterior = best_genotype_likelihood - log(candidates.size()) - total_likelihood; + } + + // GQ computed as here https://gatk.broadinstitute.org/hc/en-us/articles/360035890451?id=11075 + // as difference between best and second best likelihoods + call_info->gq = 0; + if (!isnan(best_genotype_likelihood) && !isnan(second_best_genotype_likelihood)) { + call_info->gq = logprob_to_phred(second_best_genotype_likelihood) - logprob_to_phred(best_genotype_likelihood); + } + + call_info->expected_depth = exp_depth; + call_info->depth_err = depth_err; + call_info->max_trav_size = max_trav_size; + + +#ifdef debug + cerr << " best genotype: "; for (auto a : best_genotype) {cerr << a <<",";} cerr << " gl=" << best_genotype_likelihood << endl; +#endif + return make_pair(best_genotype, unique_ptr(call_info)); +} + +double PoissonSupportSnarlCaller::genotype_likelihood(const vector& genotype, + const vector& traversals, + const set& trav_subset, + const vector& traversal_sizes, + const vector& traversal_mapqs, + int ref_trav_idx, double exp_depth, double depth_err, + int max_trav_size, + int ref_trav_size) { + + assert(genotype.size() == 1 || genotype.size() == 2); + + // get the genotype support + vector genotype_supports = support_finder.get_traversal_genotype_support(traversals, genotype, trav_subset, ref_trav_idx, + &max_trav_size); + + // get the total support over the site + Support total_site_support = std::accumulate(genotype_supports.begin(), genotype_supports.end(), Support()); + + // get the length-normalized mapq for the alleles + double total_genotype_mapq = 0; + size_t total_genotype_length = 0; + if (use_mapq) { + for (int i = 0; i < genotype.size(); ++i) { + total_genotype_mapq += traversal_mapqs[genotype[i]] * traversal_sizes[genotype[i]]; + total_genotype_length += traversal_sizes[genotype[i]]; + } + } + + // get the total support of traversals *not* in the genotype + Support total_other_support; + // also get length-normalized mapq + double total_other_mapq = 0; + size_t total_other_length = 0; + set genotype_set(genotype.begin(), genotype.end()); + for (int i = 0; i < traversals.size(); ++i) { + if (!genotype_set.count(i)) { + total_other_support += genotype_supports[i]; + if (use_mapq) { + total_other_mapq += traversal_mapqs[i] * traversal_sizes[i]; + total_other_length += traversal_sizes[i]; + } + } + } + + // split the homozygous support into two + // from now on we'll treat it like two separate observations, each with half coverage + vector fixed_genotype_supports = genotype_supports; + if (std::equal(genotype.begin() + 1, genotype.end(), genotype.begin())) { + for (int i = 0; i < genotype_supports.size(); ++i) { + fixed_genotype_supports[i] = genotype_supports[i] / (double)genotype.size(); + } + } + + // how many reads would we expect to not map to our genotype due to error + // Note: The bin size is set quite a bit smaller than originally intended as it seems to + // help nearly nevery benchmark. But the small bin sizes means that depth_err, the + // error from the binned coverage, is way too high and including it only causes trouble. + // tldr: just use the baseline_mapping_error constant and forget about depth_err for now. 
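Note on how genotype_likelihood assembles its result: reads on the called alleles are expected at about exp_depth divided by the ploidy (discounted by the error rate), reads on every other traversal at error_rate times exp_depth, and a homozygous call is split into two half-support observations. Below is a standalone numeric sketch of that arithmetic, without the mapping-quality adjustment; the explicit log-PMF k*ln(lambda) - lambda - ln(k!) stands in for vg's poisson_prob_ln, which is assumed to compute the same quantity.

```
#include <cmath>
#include <iostream>
#include <vector>

// Natural-log Poisson PMF: ln P(k; lambda) = k*ln(lambda) - lambda - ln(k!).
double poisson_log_pmf(double k, double lambda) {
    return k * std::log(lambda) - lambda - std::lgamma(k + 1.0);
}

// Log-likelihood of a genotype given per-allele read supports, mirroring the
// arithmetic of the function above (minus the mapping-quality terms).
double genotype_log_likelihood(const std::vector<int>& genotype,           // allele indices, e.g. {0,1} or {1,1}
                               const std::vector<double>& allele_support,  // reads on each allele index
                               double other_support,                       // reads on uncalled traversals
                               double exp_depth, double error_rate) {
    size_t ploidy = genotype.size();
    double allele_lambda = (exp_depth / double(ploidy)) * (1.0 - error_rate);
    double other_lambda = error_rate * exp_depth;
    // A homozygous genotype is treated as two observations of half the support each.
    bool homozygous = ploidy == 2 && genotype[0] == genotype[1];

    double log_likelihood = poisson_log_pmf(std::round(other_support), other_lambda);
    for (int allele : genotype) {
        double support = allele_support[allele];
        if (homozygous) support /= 2.0;
        log_likelihood += poisson_log_pmf(std::round(support), allele_lambda);
    }
    return log_likelihood;
}

int main() {
    std::vector<double> supports = {16, 13};  // reads on allele 0 (ref) and allele 1 (alt)
    double exp_depth = 30, err = 0.005;
    // The balanced 0/1 call fits a ~15x-per-allele expectation far better than 0/0 or 1/1,
    // mostly because the unexplained reads are very unlikely under the small error lambda.
    std::cout << "0/1: " << genotype_log_likelihood({0, 1}, supports, 0, exp_depth, err) << std::endl;
    std::cout << "0/0: " << genotype_log_likelihood({0, 0}, supports, 13, exp_depth, err) << std::endl;
    std::cout << "1/1: " << genotype_log_likelihood({1, 1}, supports, 16, exp_depth, err) << std::endl;
}
```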
+ //double error_rate = std::min(0.05, depth_err + baseline_mapping_error); + + // we toggle the baseline error + size_t threshold = support_finder.get_average_traversal_support_switch_threshold(); + double error_rate = max_trav_size >= threshold ? baseline_error_large : baseline_error_small; + // and multiply by the insertion bias if the site looks like an insertion + if (ref_trav_idx >= 0 && max_trav_size >= insertion_threshold * ref_trav_size) { + error_rate *= (max_trav_size >= threshold ? insertion_bias_large : insertion_bias_small); + } + + // error rate for non-allele traversals + double other_error_rate = error_rate; + if (use_mapq && total_other_length > 0) { + other_error_rate += phred_to_prob(total_other_mapq / total_other_length); +#ifdef debug + cerr << "adding phred " << total_other_mapq << " / " << total_other_length << " to other error rate of " + << error_rate << " gives " << other_error_rate << endl; +#endif + + } + double other_poisson_lambda = other_error_rate * exp_depth; //support_val(total_site_support); + + // and our likelihood for the unmapped reads we see: + double other_log_likelihood = poisson_prob_ln(std::round(support_val(total_other_support)), other_poisson_lambda); + + double allele_error_rate = error_rate; + if (use_mapq && total_genotype_length > 0) { + allele_error_rate += phred_to_prob(total_genotype_mapq / total_genotype_length); +#ifdef debug + cerr << "adding phred " << total_genotype_mapq << " / " << total_genotype_length << " to allele error rate of " + << error_rate << " gives " << allele_error_rate << endl; +#endif + } + + // how many reads do we expect for an allele? we use the expected coverage and just + // divide it out by the size of the genotype. + double allele_poisson_lambda = (exp_depth / (double)genotype.size()) * (1. 
- allele_error_rate); + +#ifdef debug + cerr << "Computing prob of genotype: {"; + for (int i = 0; i < genotype.size(); ++i) { + cerr << genotype[i] << ","; + } + cerr << "}: tot_other_sup = " << total_other_support << " tot site sup = " << total_site_support + << " exp-depth = " << exp_depth << " depth-err = " << depth_err << " other-lambda = " << other_poisson_lambda + << " allele-lambda " << allele_poisson_lambda << " ref-idx " << ref_trav_idx << endl; +#endif + + // now we compute the likelihood of our genotype + double alleles_log_likelihood = 0; + for (int allele : genotype) { + const Support& allele_support = fixed_genotype_supports[allele]; + double allele_ll = poisson_prob_ln(std::round(support_val(allele_support)), allele_poisson_lambda); + alleles_log_likelihood += allele_ll; + +#ifdef debug + cerr << " a[" << allele <<"]=" << " sup=" << genotype_supports[allele] << " fix-sup=" << allele_support + << " prob " << allele_ll << endl; +#endif + } + +#ifdef debug + cerr << " allele-log-prob " << alleles_log_likelihood << " other-log-prob " << other_log_likelihood + << " total-prob " << (alleles_log_likelihood + other_log_likelihood) << endl; +#endif + + return alleles_log_likelihood + other_log_likelihood; +} + +void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, + const vector& traversals, + const vector& genotype, + const unique_ptr& call_info, + const string& sample_name, + vcflib::Variant& variant) { + + assert(traversals.size() == variant.alleles.size()); + + // get the traversal sizes + vector traversal_sizes = support_finder.get_traversal_sizes(traversals); + + // get the traversal mapqs + vector traversal_mapqs; + if (use_mapq) { + traversal_mapqs = support_finder.get_traversal_mapqs(traversals); + } + + // get the maximum size from the info + const SnarlCaller::CallInfo* s_call_info = call_info.get(); + const PoissonCallInfo* p_call_info = dynamic_cast(call_info.get()); + int max_trav_size = p_call_info->max_trav_size; + + int ref_trav_idx = 0; + int ref_trav_size = 0; + const SnarlTraversal& ref_trav = traversals[ref_trav_idx]; + for (int64_t i = 1; i < (int64_t)ref_trav.visit_size() - 1; ++i) { + ref_trav_size += graph.get_length(graph.get_handle(ref_trav.visit(i).node_id())); + } + + // get the genotype support + vector genotype_supports = support_finder.get_traversal_genotype_support(traversals, genotype, {}, 0, &max_trav_size); + + // Get the depth of the site + Support total_site_support = std::accumulate(genotype_supports.begin(), genotype_supports.end(), Support()); + double total_site_depth = support_val(total_site_support); + + // Set the variant's total depth + string depth_string = std::to_string((int64_t)round(total_site_depth)); + variant.format.push_back("DP"); + variant.info["DP"].push_back(depth_string); // We only have one sample, so variant depth = sample depth + + // And for the sample + variant.samples[sample_name]["DP"].push_back(depth_string); + + // get the allele depths + variant.format.push_back("AD"); + + set genotype_set(genotype.begin(), genotype.end()); + double min_site_support = genotype.size() > 0 ? 
INFINITY : 0; + + // update the allele depths + for (int i = 0; i < traversals.size(); ++i) { + Support allele_support = genotype_supports[i]; + variant.samples[sample_name]["AD"].push_back(std::to_string((int64_t)round(support_val(allele_support)))); + if (genotype_set.count(i)) { + // update the minimum support + min_site_support = min(min_site_support, total(genotype_supports[i])); + } + } + + // get the genotype likelihoods + vector gen_likelihoods; + double gen_likelihood; + variant.format.push_back("GL"); + + assert(!isnan(p_call_info->expected_depth)); + // variance/std-err can be nan when binsize < 2. We just clamp it to 0 + double depth_err = !isnan(p_call_info->depth_err) ? p_call_info->depth_err : 0.; + + double total_likelihood = 0.; + double ref_likelihood = 1.; + double alt_likelihood = 0.; + + if (genotype.size() == 2) { + // assume ploidy 2 + for (int i = 0; i < traversals.size(); ++i) { + for (int j = i; j < traversals.size(); ++j) { + double gl = genotype_likelihood({i, j}, traversals, {}, traversal_sizes, traversal_mapqs, + ref_trav_idx, p_call_info->expected_depth, depth_err, max_trav_size, ref_trav_size); + gen_likelihoods.push_back(gl); + if (vector({i, j}) == genotype || vector({j,i}) == genotype) { + gen_likelihood = gl; + } + if (i == 0 && j == 0) { + ref_likelihood = gl; + } else { + alt_likelihood = alt_likelihood == 0. ? gl : add_log(alt_likelihood, gl); + } + total_likelihood = total_likelihood == 0 ? gl : add_log(total_likelihood, gl); + // convert from natural log to log10 by dividing by ln(10) + variant.samples[sample_name]["GL"].push_back(std::to_string(gl / 2.30258)); + } + } + } else if (genotype.size() == 1) { + // assume ploidy 1 + // todo: generalize this iteration (as is, it is copy pased from above) + for (int i = 0; i < traversals.size(); ++i) { + double gl = genotype_likelihood({i}, traversals, {}, traversal_sizes, traversal_mapqs, + ref_trav_idx, p_call_info->expected_depth, depth_err, max_trav_size, ref_trav_size); + gen_likelihoods.push_back(gl); + if (vector({i}) == genotype) { + gen_likelihood = gl; + } + if (i == 0) { + ref_likelihood = gl; + } else { + alt_likelihood = alt_likelihood == 0. ? gl : add_log(alt_likelihood, gl); + } + total_likelihood = total_likelihood == 0 ? gl : add_log(total_likelihood, gl); + // convert from natural log to log10 by dividing by ln(10) + variant.samples[sample_name]["GL"].push_back(std::to_string(gl / 2.30258)); + } + } + + variant.format.push_back("GQ"); + variant.samples[sample_name]["GQ"].push_back(std::to_string(min((int)256, max((int)0, (int)p_call_info->gq)))); + + variant.format.push_back("GP"); + variant.samples[sample_name]["GP"].push_back(std::to_string(p_call_info->posterior)); + + variant.format.push_back("XD"); + variant.samples[sample_name]["XD"].push_back(std::to_string(p_call_info->expected_depth)); + + // The QUAL field is the probability that we have variation as a PHRED score (of wrongness) + // We derive this from the posterior probability of the reference genotype. + // But if it's a reference call, we take the total of all the alts + variant.quality = 0; + if (genotype.size() > 0) { + // our flat prior and p[traversal coverage] + double posterior = -log(gen_likelihoods.size()) - total_likelihood; + if (!all_of(genotype.begin(), genotype.end(), [&](int a) {return a == 0;})) { + posterior += ref_likelihood; + } else { + posterior += alt_likelihood; + } + variant.quality = logprob_to_phred(posterior); + } + + // Minmum allele depth. 
This historically has been our QUAL field for the sole reason + // that it was better than anything else at making ROC curves + variant.format.push_back("MAD"); + variant.samples[sample_name]["MAD"].push_back(std::to_string((int)(min_site_support))); + + // Now do the filters + // todo: fix and share with other caller + variant.filter = "PASS"; + if (min_site_support < min_mad_for_filter) { + // Apply Min Allele Depth cutoff across all alleles (even ref) + variant.filter = "lowad"; + } else if ((int64_t)round(total_site_depth) < min_site_depth) { + // we don't have enough support to want to make a call + variant.filter = "lowdepth"; + } +} + +void PoissonSupportSnarlCaller::update_vcf_header(string& header) const { + header += "##INFO=\n"; + header += "##FORMAT=\n"; + header += "##FORMAT=\n"; + header += "##FORMAT=\n"; + header += "##FORMAT=\n"; + header += "##FORMAT=\n"; + header += "##FORMAT=\n"; + header += "##FORMAT=\n"; + header += "##FILTER=\n"; + header += "##FILTER=\n"; +} + +vector PoissonSupportSnarlCaller::rank_by_support(const vector& supports) { + vector ranks(supports.size()); + for (int i = 0; i < supports.size(); ++i) { + ranks[i] = i; + } + std::sort(ranks.begin(), ranks.end(), [&](int a, int b) { + return support_val(supports[a]) > support_val(supports[b]); + }); + return ranks; +} + + +} diff --git a/src/snarl_caller.hpp b/src/snarl_caller.hpp new file mode 100644 index 00000000000..5ac2cd506e3 --- /dev/null +++ b/src/snarl_caller.hpp @@ -0,0 +1,300 @@ +#ifndef VG_SNARL_CALLER_HPP_INCLUDED +#define VG_SNARL_CALLER_HPP_INCLUDED + +#include +#include +#include +#include +#include +#include +#include +#include "handle.hpp" +#include "snarls.hpp" +#include "genotypekit.hpp" +#include "traversal_support.hpp" +#include "algorithms/coverage_depth.hpp" + +namespace vg { + +using namespace std; + + +/** + * SnarlCaller: Given a list of traversals through a site, come up with a genotype + * come up with a genotype + */ +class SnarlCaller { +public: + virtual ~SnarlCaller(); + + /// implementation-dependent metadata for calls that get paseed between genotype() + /// and update_vcf_info(). 
+ struct CallInfo { + virtual ~CallInfo() = default; + }; + + /// Get the genotype of a site + /// snarl : site + /// traversals : all traversals to consider + /// ref_trav_idx : index of reference path traversal in traversals (in case it needs special treatment) + /// ref_path : the reference path associated with the snarl + /// ref_range : the interval along the reference path (forward coordinates) spanned by snarl + virtual pair, unique_ptr> genotype(const Snarl& snarl, + const vector& traversals, + int ref_trav_idx, + int ploidy, + const string& ref_path_name, + pair ref_range) = 0; + + /// Update INFO and FORMAT fields of the called variant + virtual void update_vcf_info(const Snarl& snarl, + const vector& traversals, + const vector& genotype, + const unique_ptr& call_info, + const string& sample_name, + vcflib::Variant& variant) = 0; + + /// Define any header fields needed by the above + virtual void update_vcf_header(string& header) const = 0; + + /// Optional method used for pruning searches + virtual function get_skip_allele_fn() const; +}; + +/** + * Interface for a caller that relies on a TraversalSupportFinder + * and has a few very basic support-based cutoffs + * Not every exciting but is currently required for the LegacySupportCaller + * which needs this to interface with the RepresentativeTraversalFinder + */ +class SupportBasedSnarlCaller : public SnarlCaller { +public: + SupportBasedSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager, + TraversalSupportFinder& support_finder); + + virtual ~SupportBasedSnarlCaller(); + + virtual void update_vcf_info(const Snarl& snarl, + const vector& traversals, + const vector& genotype, + const unique_ptr& call_info, + const string& sample_name, + vcflib::Variant& variant); + + /// Set some of the parameters + void set_min_supports(double min_mad_for_call, double min_support_for_call, double min_site_support); + + /// Get the traversal support finder + TraversalSupportFinder& get_support_finder() const; + + /// Get the minimum total support for call + virtual int get_min_total_support_for_call() const; + + /// Use min_alt_path_support threshold as cutoff + virtual function get_skip_allele_fn() const; + +protected: + + /// Get the best support out of a list of supports, ignoring skips + static int get_best_support(const vector& supports, const vector& skips); + + /// Relic from old code + static double support_val(const Support& support) { return total(support); }; + + const PathHandleGraph& graph; + + SnarlManager& snarl_manager; + + /// Get support from traversals + TraversalSupportFinder& support_finder; + + /// What's the minimum integer number of reads that must support a call? We + /// don't necessarily want to call a SNP as het because we have a single + // supporting read, even if there are only 10 reads on the site. + int min_total_support_for_call = 2; + /// what's the minimum ref or alt allele depth to give a PASS in the filter + /// column? 
Also used as a min actual support for a second-best allele call + size_t min_mad_for_filter = 1; + /// what's the minimum total support (over all alleles) of the site to make + /// a call + size_t min_site_depth = 4; + /// used only for pruning alleles in the VCFTraversalFinder: minimum support + /// of an allele's alt-path for it to be considered in the brute-force enumeration + double min_alt_path_support = 0.5; +}; + + +/** + * Find the genotype of some traversals in a site using read support and + * a bias ratio to tell heterozygous from homozygous + */ +class RatioSupportSnarlCaller : public SupportBasedSnarlCaller { +public: + RatioSupportSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager, + TraversalSupportFinder& support_finder); + virtual ~RatioSupportSnarlCaller(); + + /// Set some of the parameters + void set_het_bias(double het_bias, double ref_het_bias = 0.); + + /// Get the genotype of a site + virtual pair, unique_ptr> genotype(const Snarl& snarl, + const vector& traversals, + int ref_trav_idx, + int ploidy, + const string& ref_path_name, + pair ref_range); + + /// Update INFO and FORMAT fields of the called variant + virtual void update_vcf_info(const Snarl& snarl, + const vector& traversals, + const vector& genotype, + const unique_ptr& call_info, + const string& sample_name, + vcflib::Variant& variant); + + /// Define any header fields needed by the above + virtual void update_vcf_header(string& header) const; + +protected: + + /// Get the bias used to for comparing two traversals + /// (It differrs heuristically depending whether they are alt/ref/het/hom/snp/indel + /// see tuning parameters below) + double get_bias(const vector& traversal_sizes, int best_trav, + int second_best_trav, int ref_trav_idx) const; + + /// get a map of the beginning of a node (in forward orientation) on a traversal + /// used for up-weighting large deletion edges in complex snarls with average support + unordered_map get_ref_offsets(const SnarlTraversal& ref_trav) const; + + /// Tuning + + /// What fraction of the reads supporting an alt are we willing to discount? + /// At 2, if twice the reads support one allele as the other, we'll call + /// homozygous instead of heterozygous. At infinity, every call will be + /// heterozygous if even one read supports each allele. + double max_het_bias = 6; + /// Like above, but applied to ref / alt ratio (instead of alt / ref) + double max_ref_het_bias = 6; + /// Like the max het bias, but applies to novel indels. + double max_indel_het_bias = 6; + /// Used for calling 1/2 calls. If both alts (times this bias) are greater than + /// the reference, the call is made. set to 0 to deactivate. + double max_ma_bias = 0; + /// what's the min log likelihood for allele depth assignments to PASS? + double min_ad_log_likelihood_for_filter = -9; +}; + +/** + * Find the genotype of some traversals in a site using read support + * and a Poisson model based on expected depth. 
Inspired, in part, + * by Paragraph, which uses a similar approach for genotyping break points + * + **/ +class PoissonSupportSnarlCaller : public SupportBasedSnarlCaller { +public: + PoissonSupportSnarlCaller(const PathHandleGraph& graph, SnarlManager& snarl_manager, + TraversalSupportFinder& support_finder, + const algorithms::BinnedDepthIndex& depth_index, + bool use_mapq); + virtual ~PoissonSupportSnarlCaller(); + + struct PoissonCallInfo : public SnarlCaller::CallInfo { + virtual ~PoissonCallInfo() = default; + double gq; + double posterior; + double expected_depth; + double depth_err; + int max_trav_size; + }; + + /// Set some parameters + void set_baseline_error(double small_variant_error, double large_variant_error); + /// These are multipliers applied to the errors if the site has an insertion + void set_insertion_bias(double insertion_threshold, double small_insertion_bias, double large_insertion_bias); + + /// Get the genotype of a site + virtual pair, unique_ptr> genotype(const Snarl& snarl, + const vector& traversals, + int ref_trav_idx, + int ploidy, + const string& ref_path_name, + pair ref_range); + + /// Update INFO and FORMAT fields of the called variant + virtual void update_vcf_info(const Snarl& snarl, + const vector& traversals, + const vector& genotype, + const unique_ptr& call_info, + const string& sample_name, + vcflib::Variant& variant); + + /// Define any header fields needed by the above + virtual void update_vcf_header(string& header) const; + +protected: + + /// Compute likelihood of genotype as product of poisson probabilities + /// P[allele1] * P[allle2] * P[uncalled alleles] + /// Homozygous alleles are split into two, with half support each + /// The (natural) logoarithm is returned + /// If trav_subset is not empty, traversals outside that set (and genotype) + /// will be ignored to save time + double genotype_likelihood(const vector& genotype, + const vector& traversals, + const set& trav_subset, + const vector& traversal_sizes, + const vector& traversal_mapqs, + int ref_trav_idx, double exp_depth, double depth_err, + int max_trav_size, int ref_trav_size); + + /// Rank supports + vector rank_by_support(const vector& supports); + + /// Error rates are different for small and large variants, which depend + /// more on base and mapping qualities respectively. The switch threshold + /// is in TraversalSupportFinder. Error stats from the Packer object + /// get added to these baselines when computing the scores. + + /// Baseline error rate for smaller variants + double baseline_error_small = 0.005; + /// Baseline error rate for larger variants + double baseline_error_large = 0.01; + /// multiply error by this much in pressence of insertion + /// (after some testing, this does not in fact seem to help much in practice. + /// best just to boost overall error above. hence not in CLI and off by default) + double insertion_bias_large = 1.; + double insertion_bias_small = 1.; + /// a site is an insertion if one (supported)allele is this many times bigger than another + /// unlike above, default comes from call_main.cpp (todo: straighten this out?) 
+ double insertion_threshold = 5.; + + /// Consider up to the top-k traversals (based on support) for genotyping + size_t top_k = 20; + /// Consider up to the tom-m secondary traversals (based on support) for each top traversal + /// (so at most top_k * top_m considered) + size_t top_m = 100; + + /// padding to apply wrt to longest traversal to snarl ranges when looking up binned depth + double depth_padding_factor = 1.; + + /// Map path name to of depth coverage from the packer + const algorithms::BinnedDepthIndex& depth_index; + + /// MAPQ information is available from the packer and we want to use it + bool use_mapq; + +}; + + +// debug helpers +inline string to_string(const HandleGraph& graph, handle_t handle) { + return std::to_string(graph.get_id(handle)) + ":" + std::to_string(graph.get_is_reverse(handle)); +} +inline string to_string(const HandleGraph& graph, edge_t edge) { + return to_string(graph, edge.first) + " -> " + to_string(graph, edge.second); +} + +} +#endif diff --git a/src/snarl_distance_index.cpp b/src/snarl_distance_index.cpp new file mode 100644 index 00000000000..c0a950838bc --- /dev/null +++ b/src/snarl_distance_index.cpp @@ -0,0 +1,1886 @@ +//#define debug_distance_indexing +//#define debug_snarl_traversal +//#define debug_distances +//#define debug_subgraph + +#include "snarl_distance_index.hpp" + +using namespace std; +using namespace handlegraph; +namespace vg { + +size_t minimum_distance(const SnarlDistanceIndex& distance_index, pos_t pos1, pos_t pos2, + bool unoriented_distance, const HandleGraph* graph) { + return distance_index.minimum_distance( get_id(pos1), get_is_rev(pos1), get_offset(pos1), + get_id(pos2), get_is_rev(pos2), get_offset(pos2), + unoriented_distance, graph, nullptr); +} +size_t maximum_distance(const SnarlDistanceIndex& distance_index, pos_t pos1, pos_t pos2) { + return distance_index.maximum_distance( get_id(pos1), get_is_rev(pos1), get_offset(pos1), + get_id(pos2), get_is_rev(pos2), get_offset(pos2)); +} + +void fill_in_distance_index(SnarlDistanceIndex* distance_index, const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit) { + distance_index->set_snarl_size_limit(size_limit); + + //Build the temporary distance index from the graph + SnarlDistanceIndex::TemporaryDistanceIndex temp_index = make_temporary_distance_index(graph, snarl_finder, size_limit); + + if (temp_index.use_oversized_snarls) { + cerr << "warning: distance index uses oversized snarls, which may make mapping slow" << endl; + cerr << "\ttry increasing --snarl-limit when building the distance index" << endl; + } + + //And fill in the permanent distance index + vector indexes; + indexes.emplace_back(&temp_index); + distance_index->get_snarl_tree_records(indexes, graph); +} +SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( + const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit) { + +#ifdef debug_distance_indexing + cerr << "Creating new distance index for nodes between " << graph->min_node_id() << " and " << graph->max_node_id() << endl; + +#endif + + SnarlDistanceIndex::TemporaryDistanceIndex temp_index; + + temp_index.min_node_id=graph->min_node_id(); + temp_index.max_node_id=graph->max_node_id(); + + //Construct the distance index using the snarl decomposition + //traverse_decomposition will visit all structures (including trivial snarls), calling + //each of the given functions for the start and ends of the snarls and chains + + 
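Note on the new fill_in_distance_index and minimum_distance entry points above: the intended flow is to build the temporary index from a snarl decomposition, convert it into the permanent SnarlDistanceIndex, and then answer positional queries. The fragment below is a hypothetical usage sketch, not part of this diff; it assumes make_pos_t from vg's position utilities, that SnarlDistanceIndex can be default-constructed before being filled in (implied by the set_snarl_size_limit call above), and uses an arbitrary placeholder snarl size limit of 5000.

```
#include "snarl_distance_index.hpp"

using namespace vg;
using namespace handlegraph;

// Hypothetical helper: index a graph once and ask for the minimum oriented
// distance between the starts of two nodes on their forward strands.
size_t node_start_distance(const HandleGraph& graph,
                           const HandleGraphSnarlFinder& snarl_finder,
                           nid_t id1, nid_t id2) {
    SnarlDistanceIndex distance_index;
    // Builds the temporary index from the snarl decomposition and converts it
    // into the permanent one, warning if any snarl exceeded the size limit.
    fill_in_distance_index(&distance_index, &graph, &snarl_finder, 5000);
    return minimum_distance(distance_index,
                            make_pos_t(id1, false, 0),
                            make_pos_t(id2, false, 0),
                            false,     // oriented distance
                            &graph);
}
```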
temp_index.temp_node_records.resize(temp_index.max_node_id-temp_index.min_node_id+1); + + + + //Stores unfinished records, as type of record and offset into appropriate vector + //(temp_node/snarl/chain_records) + vector> stack; + + //There may be components of the root that are connected to each other. Each connected component will + //get put into a (fake) root-level snarl, but we don't know what those components will be initially, + //since the decomposition just puts them in the same root snarl. This is used to group the root-level + //components into connected components that will later be used to make root snarls + structures::UnionFind root_snarl_component_uf (0); + + + /*Go through the decomposition top down and record the connectivity of the snarls and chains + * Distances will be added later*/ + + snarl_finder->traverse_decomposition( + [&](handle_t chain_start_handle) { + /*This gets called when a new chain is found, starting at the start handle going into chain + * For the first node in a chain, create a chain record and fill in the first node. + * Also add the first node record + */ +#ifdef debug_distance_indexing + cerr << " Starting new chain at " << graph->get_id(chain_start_handle) << (graph->get_is_reverse(chain_start_handle) ? " reverse" : " forward") << endl; + //We shouldn't have seen this node before + //assert(temp_index.temp_node_records[graph->get_id(chain_start_handle)-min_node_id].node_id == 0); +#endif + + //Fill in node in chain + stack.emplace_back(SnarlDistanceIndex::TEMP_CHAIN, temp_index.temp_chain_records.size()); + nid_t node_id = graph->get_id(chain_start_handle); + temp_index.temp_chain_records.emplace_back(); + auto& temp_chain = temp_index.temp_chain_records.back(); + temp_chain.start_node_id = node_id; + temp_chain.start_node_rev = graph->get_is_reverse(chain_start_handle); + temp_chain.children.emplace_back(SnarlDistanceIndex::TEMP_NODE, node_id); + + + //And the node record itself + auto& temp_node = temp_index.temp_node_records.at(node_id-temp_index.min_node_id); + temp_node.node_id = node_id; + temp_node.node_length = graph->get_length(chain_start_handle); + temp_node.reversed_in_parent = graph->get_is_reverse(chain_start_handle); + temp_node.parent = stack.back(); //The parent is this chain + + }, + [&](handle_t chain_end_handle) { + /*This gets called at the end of a chain, facing out + * Record the chain's end node. 
The node record itself would have been added as part of the snarl + * Also record the chain's parent here + */ + + //Done with this chain + pair chain_index = stack.back(); + stack.pop_back(); + +#ifdef debug_distance_indexing + assert(chain_index.first == SnarlDistanceIndex::TEMP_CHAIN); +#endif + SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryChainRecord& temp_chain_record = temp_index.temp_chain_records.at(chain_index.second); + nid_t node_id = graph->get_id(chain_end_handle); + + if (temp_chain_record.children.size() == 1 && node_id == temp_chain_record.start_node_id) { + //This is a trivial snarl + +#ifdef debug_distance_indexing + //Then this must be the last thing on the chain_records vector + assert(temp_index.temp_chain_records.size() == chain_index.second+1); +#endif + + //Get the node + SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord& temp_node_record = temp_index.temp_node_records.at(node_id - temp_index.min_node_id); + + temp_node_record.reversed_in_parent = false; + + //And give the chain's parent the node info + // + if (stack.empty()) { + temp_node_record.parent = make_pair(SnarlDistanceIndex::TEMP_ROOT, 0); + //If this was the last thing on the stack, then this was a root + + //Check to see if there is anything connected to the ends of the chain + vector reachable_nodes; + graph->follow_edges(graph->get_handle(node_id, false), + false, [&] (const handle_t& next) { + if (graph->get_id(next) != node_id) { + reachable_nodes.emplace_back(graph->get_id(next)); + } + }); + graph->follow_edges(graph->get_handle(node_id, true), + false, [&] (const handle_t& next) { + if (graph->get_id(next) != node_id) { + reachable_nodes.emplace_back(graph->get_id(next)); + } + }); + if (reachable_nodes.size()) { + //If we can reach anything leaving the chain (besides the chain itself), then it is part of a root snarl + //Note that if the chain's start and end node are the same, then it will always be a single component +#ifdef debug_distance_indexing + cerr << " This trivial chain is part of the root but connects with something else in the root"<::max()); +#endif + root_snarl_component_uf.union_groups(other_i, temp_node_record.root_snarl_index); +//#ifdef debug_distance_indexing +// cerr << " Union this trivial with " << temp_index.temp_chain_records[node_record.parent.second].start_node_id << " " << temp_index.temp_chain_records[node_record.parent.second].end_node_id << endl; +//#endif + } else { + new_component = false; + } + } + } else { + //If this chain isn't connected to anything else, then it is a single component of the root + temp_node_record.rank_in_parent = temp_index.components.size(); + temp_index.components.emplace_back(SnarlDistanceIndex::TEMP_NODE, node_id); + } + } else { + //The last thing on the stack is the parent of this chain, which must be a snarl + temp_node_record.parent = stack.back(); + auto& parent_snarl_record = temp_index.temp_snarl_records.at(temp_node_record.parent.second); + temp_node_record.rank_in_parent = parent_snarl_record.children.size() + 2; + parent_snarl_record.children.emplace_back(SnarlDistanceIndex::TEMP_NODE, node_id); + } + + + //Remove the chain record + temp_index.temp_chain_records.pop_back(); + temp_index.max_index_size += temp_node_record.get_max_record_length(); + + } else { + //Otherwise, it is an actual chain + + //Fill in node in chain + temp_chain_record.end_node_id = node_id; + temp_chain_record.end_node_rev = graph->get_is_reverse(chain_end_handle); + temp_chain_record.end_node_length = 
graph->get_length(chain_end_handle); + + if (stack.empty()) { + //If this was the last thing on the stack, then this was a root + + //Check to see if there is anything connected to the ends of the chain + vector reachable_nodes; + graph->follow_edges(graph->get_handle(temp_chain_record.start_node_id, !temp_chain_record.start_node_rev), + false, [&] (const handle_t& next) { + if (graph->get_id(next) != temp_chain_record.start_node_id && + graph->get_id(next) != temp_chain_record.end_node_id) { + reachable_nodes.emplace_back(graph->get_id(next)); + } + }); + graph->follow_edges(graph->get_handle(temp_chain_record.end_node_id, temp_chain_record.end_node_rev), + false, [&] (const handle_t& next) { + if (graph->get_id(next) != temp_chain_record.start_node_id && + graph->get_id(next) != temp_chain_record.end_node_id) { + reachable_nodes.emplace_back(graph->get_id(next)); + } + }); + if (reachable_nodes.size() && (temp_chain_record.is_trivial || temp_chain_record.start_node_id != temp_chain_record.end_node_id)) { + //If we can reach anything leaving the chain (besides the chain itself), then it is part of a root snarl + //Note that if the chain's start and end node are the same, then it will always be a single component +#ifdef debug_distance_indexing + cerr << " This chain is part of the root but connects with something else in the root"<::max()); +#endif + root_snarl_component_uf.union_groups(other_i, temp_chain_record.root_snarl_index); +#ifdef debug_distance_indexing + cerr << " Union this chain with " << temp_index.temp_chain_records[node_record.parent.second].start_node_id << " " << temp_index.temp_chain_records[node_record.parent.second].end_node_id << endl; +#endif + } else { + new_component = false; + } + } + } else { + //If this chain isn't connected to anything else, then it is a single component of the root + temp_chain_record.parent = make_pair(SnarlDistanceIndex::TEMP_ROOT, 0); + temp_chain_record.rank_in_parent = temp_index.components.size(); + temp_index.components.emplace_back(chain_index); + } + } else { + //The last thing on the stack is the parent of this chain, which must be a snarl + temp_chain_record.parent = stack.back(); + auto& parent_snarl_record = temp_index.temp_snarl_records.at(temp_chain_record.parent.second); + temp_chain_record.rank_in_parent = parent_snarl_record.children.size() + 2; + parent_snarl_record.children.emplace_back(chain_index); + } + + temp_index.max_index_size += temp_chain_record.get_max_record_length(); +#ifdef debug_distance_indexing + cerr << " Ending new " << (temp_chain_record.is_trivial ? "trivial " : "") << "chain " << temp_index.structure_start_end_as_string(chain_index) + << endl << " that is a child of " << temp_index.structure_start_end_as_string(temp_chain_record.parent) << endl; +#endif + } + }, + [&](handle_t snarl_start_handle) { + /*This gets called at the beginning of a new snarl facing in + * Create a new snarl record and fill in the start node. + * The node record would have been created as part of the chain, or as the end node + * of the previous snarl + */ + +#ifdef debug_distance_indexing + cerr << " Starting new snarl at " << graph->get_id(snarl_start_handle) << (graph->get_is_reverse(snarl_start_handle) ? 
" reverse" : " forward") << endl; + cerr << "with index " << temp_index.temp_snarl_records.size() << endl; +#endif + auto& parent = stack.back(); + stack.emplace_back(SnarlDistanceIndex::TEMP_SNARL, temp_index.temp_snarl_records.size()); + temp_index.temp_snarl_records.emplace_back(); + temp_index.temp_snarl_records.back().start_node_id = graph->get_id(snarl_start_handle); + temp_index.temp_snarl_records.back().start_node_rev = graph->get_is_reverse(snarl_start_handle); + temp_index.temp_snarl_records.back().start_node_length = graph->get_length(snarl_start_handle); + + }, + [&](handle_t snarl_end_handle){ + /*This gets called at the end of the snarl facing out + * Fill in the end node of the snarl, its parent, and record the snarl as a child of its + * parent chain + * Also create a node record + */ + pair snarl_index = stack.back(); + stack.pop_back(); +#ifdef debug_distance_indexing + assert(snarl_index.first == SnarlDistanceIndex::TEMP_SNARL); + assert(stack.back().first == SnarlDistanceIndex::TEMP_CHAIN); +#endif + SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.temp_snarl_records[snarl_index.second]; + nid_t node_id = graph->get_id(snarl_end_handle); + + //Record the end node in the snarl + temp_snarl_record.end_node_id = node_id; + temp_snarl_record.end_node_rev = graph->get_is_reverse(snarl_end_handle); + temp_snarl_record.end_node_length = graph->get_length(snarl_end_handle); + temp_snarl_record.node_count = temp_snarl_record.children.size(); + bool any_edges_in_snarl = false; + graph->follow_edges(graph->get_handle(temp_snarl_record.start_node_id, temp_snarl_record.start_node_rev), false, [&](const handle_t next_handle) { + if (graph->get_id(next_handle) != temp_snarl_record.end_node_id) { + any_edges_in_snarl = true; + } + }); + graph->follow_edges(graph->get_handle(temp_snarl_record.end_node_id, !temp_snarl_record.end_node_rev), false, [&](const handle_t next_handle) { + if (graph->get_id(next_handle) != temp_snarl_record.start_node_id) { + any_edges_in_snarl = true; + } + }); + + if (temp_snarl_record.children.size() == 0) { + //This is a trivial snarl + temp_snarl_record.is_trivial = true; + + //Add the end node to the chain +#ifdef debug_distance_indexing + assert(stack.back().first == SnarlDistanceIndex::TEMP_CHAIN); +#endif + temp_snarl_record.parent = stack.back(); + auto& temp_chain = temp_index.temp_chain_records.at(stack.back().second); + temp_chain.children.emplace_back(SnarlDistanceIndex::TEMP_NODE, node_id); + + //Remove the snarl record +#ifdef debug_distance_indexing + assert(temp_index.temp_snarl_records.size() == snarl_index.second+1); +#endif + temp_index.temp_snarl_records.pop_back(); + } else { + //This is the child of a chain +#ifdef debug_distance_indexing + assert(stack.back().first == SnarlDistanceIndex::TEMP_CHAIN); +#endif + temp_snarl_record.parent = stack.back(); + auto& temp_chain = temp_index.temp_chain_records.at(stack.back().second); + temp_chain.children.emplace_back(snarl_index); + temp_chain.children.emplace_back(SnarlDistanceIndex::TEMP_NODE, node_id); + + } + //Record the snarl as a child of its chain + //if (stack.empty()) { + // assert(false); + // //TODO: The snarl should always be the child of a chain + // //If this was the last thing on the stack, then this was a root + // //TODO: I'm not sure if this would get put into a chain or not + // temp_snarl_record.parent = make_pair(SnarlDistanceIndex::TEMP_ROOT, 0); + // temp_index.components.emplace_back(snarl_index); + //} + + //Record the 
node itself. This gets done for the start of the chain, and ends of snarls + SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord& temp_node_record = temp_index.temp_node_records.at(node_id-temp_index.min_node_id); + temp_node_record.node_id = node_id; + temp_node_record.node_length = graph->get_length(snarl_end_handle); + temp_node_record.reversed_in_parent = graph->get_is_reverse(snarl_end_handle); + temp_node_record.parent = stack.back(); + + + +#ifdef debug_distance_indexing + cerr << " Ending new snarl " << temp_index.structure_start_end_as_string(snarl_index) + << endl << " that is a child of " << temp_index.structure_start_end_as_string(temp_snarl_record.parent) << endl; +#endif + }); + + /* + * We finished going through everything that exists according to the snarl decomposition, but + * it's still missing tips, which will be discovered when filling in the snarl distances, + * and root-level snarls, which we'll add now by combining the chain components in root_snarl_components + * into snarls defined by root_snarl_component_uf + * The root-level snarl is a fake snarl that doesn't exist according to the snarl decomposition, + * but is an extra layer that groups together components of the root that are connected + */ + + vector> root_snarl_component_indexes = root_snarl_component_uf.all_groups(); + for (vector& root_snarl_indexes : root_snarl_component_indexes) { +#ifdef debug_distance_indexing + cerr << "Create a new root snarl from components" << endl; +#endif + //For each of the root snarls + temp_index.components.emplace_back(SnarlDistanceIndex::TEMP_SNARL, temp_index.temp_snarl_records.size()); + temp_index.temp_snarl_records.emplace_back(); + SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.temp_snarl_records.back(); + temp_snarl_record.is_root_snarl = true; + temp_snarl_record.parent = make_pair(SnarlDistanceIndex::TEMP_ROOT, 0); + + + for (size_t chain_i : root_snarl_indexes) { + //For each chain component of this root-level snarl + if (temp_index.root_snarl_components[chain_i].first == SnarlDistanceIndex::TEMP_CHAIN){ + SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryChainRecord& temp_chain_record = temp_index.temp_chain_records[temp_index.root_snarl_components[chain_i].second]; + temp_chain_record.parent = make_pair(SnarlDistanceIndex::TEMP_SNARL, temp_index.temp_snarl_records.size() - 1); + temp_chain_record.rank_in_parent = temp_snarl_record.children.size(); + temp_chain_record.reversed_in_parent = false; + + temp_snarl_record.children.emplace_back(temp_index.root_snarl_components[chain_i]); + } else { +#ifdef debug_distance_indexing + assert(temp_index.root_snarl_components[chain_i].first == SnarlDistanceIndex::TEMP_NODE); +#endif + SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord& temp_node_record = temp_index.temp_node_records[temp_index.root_snarl_components[chain_i].second - temp_index.min_node_id]; + temp_node_record.parent = make_pair(SnarlDistanceIndex::TEMP_SNARL, temp_index.temp_snarl_records.size() - 1); + temp_node_record.rank_in_parent = temp_snarl_record.children.size(); + temp_node_record.reversed_in_parent = false; + + temp_snarl_record.children.emplace_back(temp_index.root_snarl_components[chain_i]); + } + } + temp_snarl_record.node_count = temp_snarl_record.children.size(); + } + + /*Now go through the decomposition again to fill in the distances + * This traverses all chains in reverse order that we found them in, so bottom up + * Each chain and snarl already knows its 
parents and children, except for single nodes + * that are children of snarls. These nodes were not in chains will have their node + * records created here + */ + +#ifdef debug_distance_indexing + cerr << "Filling in the distances in snarls" << endl; +#endif + for (int i = temp_index.temp_chain_records.size()-1 ; i >= 0 ; i--) { + + SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryChainRecord& temp_chain_record = temp_index.temp_chain_records[i]; +#ifdef debug_distance_indexing + assert(!temp_chain_record.is_trivial); + cerr << " At " << (temp_chain_record.is_trivial ? " trivial " : "") << " chain " << temp_index.structure_start_end_as_string(make_pair(SnarlDistanceIndex::TEMP_CHAIN, i)) << endl; +#endif + + //Add the first values for the prefix sum and backwards loop vectors + temp_chain_record.prefix_sum.emplace_back(0); + temp_chain_record.max_prefix_sum.emplace_back(0); + temp_chain_record.backward_loops.emplace_back(std::numeric_limits::max()); + temp_chain_record.chain_components.emplace_back(0); + + + /*First, go through each of the snarls in the chain in the forward direction and + * fill in the distances in the snarl. Also fill in the prefix sum and backwards + * loop vectors here + */ + size_t curr_component = 0; //which component of the chain are we in + size_t last_node_length = 0; + for (size_t chain_child_i = 0 ; chain_child_i < temp_chain_record.children.size() ; chain_child_i++ ){ + const pair& chain_child_index = temp_chain_record.children[chain_child_i]; + //Go through each of the children in the chain, skipping nodes + //The snarl may be trivial, in which case don't fill in the distances +#ifdef debug_distance_indexing + cerr << " Looking at child " << temp_index.structure_start_end_as_string(chain_child_index) << " current max prefi xum " << temp_chain_record.max_prefix_sum.back() << endl; +#endif + + if (chain_child_index.first == SnarlDistanceIndex::TEMP_SNARL){ + //This is where all the work gets done. 
Need to go through the snarl and add + //all distances, then add distances to the chain that this is in + //The parent chain will be the last thing in the stack + SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = + temp_index.temp_snarl_records.at(chain_child_index.second); + + //Fill in this snarl's distances + populate_snarl_index(temp_index, chain_child_index, size_limit, graph); + + bool new_component = temp_snarl_record.min_length == std::numeric_limits::max(); + if (new_component){ + curr_component++; + } + + //And get the distance values for the end node of the snarl in the chain + if (new_component) { + //If this snarl wasn't start-end connected, then we start + //tracking the distance vectors here + + //Update the maximum distance + temp_index.max_distance = std::max(temp_index.max_distance, temp_chain_record.max_prefix_sum.back()); + + temp_chain_record.prefix_sum.emplace_back(0); + temp_chain_record.max_prefix_sum.emplace_back(0); + temp_chain_record.backward_loops.emplace_back(temp_snarl_record.distance_end_end); + //If the chain is disconnected, the max length is infinite + temp_chain_record.max_length = std::numeric_limits::max(); + } else { + temp_chain_record.prefix_sum.emplace_back(SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( + temp_chain_record.prefix_sum.back(), + temp_snarl_record.min_length), + temp_snarl_record.start_node_length)); + temp_chain_record.max_prefix_sum.emplace_back(SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( + temp_chain_record.max_prefix_sum.back(), + temp_snarl_record.max_length), + temp_snarl_record.start_node_length)); + temp_chain_record.backward_loops.emplace_back(std::min(temp_snarl_record.distance_end_end, + SnarlDistanceIndex::sum(temp_chain_record.backward_loops.back() + , 2 * (temp_snarl_record.start_node_length + temp_snarl_record.min_length)))); + temp_chain_record.max_length = SnarlDistanceIndex::sum(temp_chain_record.max_length, + temp_snarl_record.max_length); + } + temp_chain_record.chain_components.emplace_back(curr_component); + if (chain_child_i == temp_chain_record.children.size() - 2 && temp_snarl_record.min_length == std::numeric_limits::max()) { + temp_chain_record.loopable = false; + } + last_node_length = 0; + } else { + if (last_node_length != 0) { + //If this is a node and the last thing was also a node, + //then there was a trivial snarl + SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord& temp_node_record = + temp_index.temp_node_records.at(chain_child_index.second-temp_index.min_node_id); + + //Check if there is a loop in this node + //Snarls get counted as trivial if they contain no nodes but they might still have edges + size_t backward_loop = std::numeric_limits::max(); + + graph->follow_edges(graph->get_handle(temp_node_record.node_id, !temp_node_record.reversed_in_parent), false, [&](const handle_t next_handle) { + if (graph->get_id(next_handle) == temp_node_record.node_id) { + //If there is a loop going backwards (relative to the chain) back to the same node + backward_loop = 0; + } + }); + + temp_chain_record.prefix_sum.emplace_back(SnarlDistanceIndex::sum(temp_chain_record.prefix_sum.back(), last_node_length)); + temp_chain_record.max_prefix_sum.emplace_back(SnarlDistanceIndex::sum(temp_chain_record.max_prefix_sum.back(), last_node_length)); + temp_chain_record.backward_loops.emplace_back(std::min(backward_loop, + SnarlDistanceIndex::sum(temp_chain_record.backward_loops.back(), 2 * last_node_length))); + + if (chain_child_i == 
temp_chain_record.children.size()-1) { + //If this is the last node + temp_chain_record.loopable=false; + } + temp_chain_record.chain_components.emplace_back(curr_component); + } + last_node_length = temp_index.temp_node_records.at(chain_child_index.second - temp_index.min_node_id).node_length; + //And update the chains max length + temp_chain_record.max_length = SnarlDistanceIndex::sum(temp_chain_record.max_length, + last_node_length); + } + } //Finished walking through chain + if (temp_chain_record.start_node_id == temp_chain_record.end_node_id && temp_chain_record.chain_components.back() != 0) { + //If this is a looping, multicomponent chain, the start/end node could end up in separate chain components + //despite being the same node. + //Since the first component will always be 0, set the first node's component to be whatever the last + //component was + temp_chain_record.chain_components[0] = temp_chain_record.chain_components.back(); + + } + + //For a multicomponent chain, the actual minimum length will always be infinite, but since we sometimes need + //the length of the last component, save that here + temp_chain_record.min_length = !temp_chain_record.is_trivial && temp_chain_record.start_node_id == temp_chain_record.end_node_id + ? temp_chain_record.prefix_sum.back() + : SnarlDistanceIndex::sum(temp_chain_record.prefix_sum.back() , temp_chain_record.end_node_length); + +#ifdef debug_distance_indexing + assert(temp_chain_record.prefix_sum.size() == temp_chain_record.backward_loops.size()); + assert(temp_chain_record.prefix_sum.size() == temp_chain_record.chain_components.size()); +#endif + + + /*Now that we've gone through all the snarls in the chain, fill in the forward loop vector + * by going through the chain in the backwards direction + */ + temp_chain_record.forward_loops.resize(temp_chain_record.prefix_sum.size(), + std::numeric_limits::max()); + if (temp_chain_record.start_node_id == temp_chain_record.end_node_id && temp_chain_record.children.size() > 1) { + + //If this is a looping chain, then check the first snarl for a loop + if (temp_chain_record.children.at(1).first == SnarlDistanceIndex::TEMP_SNARL) { + SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.temp_snarl_records.at(temp_chain_record.children.at(1).second); + temp_chain_record.forward_loops[temp_chain_record.forward_loops.size()-1] = temp_snarl_record.distance_start_start; + } + } + + size_t node_i = temp_chain_record.prefix_sum.size() - 2; + // We start at the next to last node because we need to look at this record and the next one. 
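The chain records above keep running prefix sums so that, within one chain component, distances along the chain reduce to subtractions rather than walks. A minimal sketch of that bookkeeping with made-up lengths (not vg's actual record layout):

```
// Simplified illustration of chain prefix sums: store, for each node in a chain,
// the summed length of everything before it, so the distance between the starts
// of node i and node j (i < j, same component) is prefix_sum[j] - prefix_sum[i].
// Toy data only; not vg code.
#include <cassert>
#include <vector>

int main() {
    // node0 (len 3) -- snarl (min 5) -- node1 (len 2) -- snarl (min 0) -- node2 (len 7)
    std::vector<size_t> node_lengths = {3, 2, 7};
    std::vector<size_t> snarl_min_lengths = {5, 0};

    // Distance from the chain start to the start of each node.
    std::vector<size_t> prefix_sum = {0};
    for (size_t i = 0; i + 1 < node_lengths.size(); ++i) {
        prefix_sum.push_back(prefix_sum.back() + node_lengths[i] + snarl_min_lengths[i]);
    }

    // Minimum distance from the start of node 0 to the start of node 2.
    assert(prefix_sum[2] - prefix_sum[0] == 3 + 5 + 2 + 0);
    return 0;
}
```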
+ last_node_length = 0; + for (int j = (int)temp_chain_record.children.size() - 1 ; j >= 0 ; j--) { + auto& child = temp_chain_record.children.at(j); + if (child.first == SnarlDistanceIndex::TEMP_SNARL){ + SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.temp_snarl_records.at(child.second); + if (temp_chain_record.chain_components.at(node_i) != temp_chain_record.chain_components.at(node_i+1) && + temp_chain_record.chain_components.at(node_i+1) != 0){ + //If this is a new chain component, then add the loop distance from the snarl + //If the component of the next node is 0, then we're still in the same component since we're going backwards + temp_chain_record.forward_loops.at(node_i) = temp_snarl_record.distance_start_start; + } else { + temp_chain_record.forward_loops.at(node_i) = + std::min(SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( + temp_chain_record.forward_loops.at(node_i+1), + 2* temp_snarl_record.min_length), + 2*temp_snarl_record.end_node_length), + temp_snarl_record.distance_start_start); + } + node_i --; + last_node_length = 0; + } else { + if (last_node_length != 0) { + SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord& temp_node_record = + temp_index.temp_node_records.at(child.second-temp_index.min_node_id); + + + //Check if there is a loop in this node + //Snarls get counted as trivial if they contain no nodes but they might still have edges + size_t forward_loop = std::numeric_limits::max(); + graph->follow_edges(graph->get_handle(temp_node_record.node_id, temp_node_record.reversed_in_parent), false, [&](const handle_t next_handle) { + if (graph->get_id(next_handle) == temp_node_record.node_id) { + //If there is a loop going forward (relative to the chain) back to the same node + forward_loop = 0; + } + }); + temp_chain_record.forward_loops.at(node_i) = std::min( forward_loop, + SnarlDistanceIndex::sum(temp_chain_record.forward_loops.at(node_i+1) , + 2*last_node_length)); + node_i--; + } + last_node_length = temp_index.temp_node_records.at(child.second - temp_index.min_node_id).node_length; + } + } + + + //If this is a looping chain, check if the loop distances can be improved by going around the chain + + if (temp_chain_record.start_node_id == temp_chain_record.end_node_id && temp_chain_record.children.size() > 1) { + + + //Also check if the reverse loop values would be improved if we went around again + + if (temp_chain_record.backward_loops.back() < temp_chain_record.backward_loops.front()) { + temp_chain_record.backward_loops[0] = temp_chain_record.backward_loops.back(); + size_t node_i = 1; + size_t last_node_length = 0; + for (size_t i = 1 ; i < temp_chain_record.children.size()-1 ; i++ ) { + auto& child = temp_chain_record.children.at(i); + if (child.first == SnarlDistanceIndex::TEMP_SNARL) { + SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.temp_snarl_records.at(child.second); + size_t new_loop_distance = SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( + temp_chain_record.backward_loops.at(node_i-1), + 2*temp_snarl_record.min_length), + 2*temp_snarl_record.start_node_length); + if (temp_chain_record.chain_components.at(node_i)!= 0 || new_loop_distance >= temp_chain_record.backward_loops.at(node_i)) { + //If this is a new chain component or it doesn't improve, stop + break; + } else { + //otherwise record the better distance + temp_chain_record.backward_loops.at(node_i) = new_loop_distance; + + } + node_i++; + last_node_length = 0; + } else { + if 
(last_node_length != 0) { + size_t new_loop_distance = SnarlDistanceIndex::sum(temp_chain_record.backward_loops.at(node_i-1), + 2*last_node_length); + size_t old_loop_distance = temp_chain_record.backward_loops.at(node_i); + temp_chain_record.backward_loops.at(node_i) = std::min(old_loop_distance,new_loop_distance); + node_i++; + } + last_node_length = temp_index.temp_node_records.at(child.second - temp_index.min_node_id).node_length; + } + } + } + if (temp_chain_record.forward_loops.front() < temp_chain_record.forward_loops.back()) { + //If this is a looping chain and looping improves the forward loops, + //then we have to keep going around to update distance + + temp_chain_record.forward_loops.back() = temp_chain_record.forward_loops.front(); + size_t last_node_length = 0; + node_i = temp_chain_record.prefix_sum.size() - 2; + for (int j = (int)temp_chain_record.children.size() - 1 ; j >= 0 ; j--) { + auto& child = temp_chain_record.children.at(j); + if (child.first == SnarlDistanceIndex::TEMP_SNARL){ + SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.temp_snarl_records.at(child.second); + size_t new_distance = SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( + temp_chain_record.forward_loops.at(node_i+1), + 2* temp_snarl_record.min_length), + 2*temp_snarl_record.end_node_length); + if (temp_chain_record.chain_components.at(node_i) != temp_chain_record.chain_components.at(node_i+1) || + new_distance >= temp_chain_record.forward_loops.at(node_i)){ + //If this is a new component or the distance doesn't improve, stop looking + break; + } else { + //otherwise, update the distance + temp_chain_record.forward_loops.at(node_i) = new_distance; + } + node_i --; + last_node_length =0; + } else { + if (last_node_length != 0) { + size_t new_distance = SnarlDistanceIndex::sum(temp_chain_record.forward_loops.at(node_i+1) , 2* last_node_length); + size_t old_distance = temp_chain_record.forward_loops.at(node_i); + temp_chain_record.forward_loops.at(node_i) = std::min(old_distance, new_distance); + node_i--; + } + last_node_length = temp_index.temp_node_records.at(child.second - temp_index.min_node_id).node_length; + } + } + } + } + + temp_index.max_distance = std::max(temp_index.max_distance, temp_chain_record.max_prefix_sum.back()); + temp_index.max_distance = temp_chain_record.forward_loops.back() == std::numeric_limits::max() ? temp_index.max_distance : std::max(temp_index.max_distance, temp_chain_record.forward_loops.back()); + temp_index.max_distance = temp_chain_record.backward_loops.front() == std::numeric_limits::max() ? 
temp_index.max_distance : std::max(temp_index.max_distance, temp_chain_record.backward_loops.front()); + assert(temp_index.max_distance <= 2742664019); + + } + +#ifdef debug_distance_indexing + cerr << "Filling in the distances in root snarls and distances along chains" << endl; +#endif + for (pair& component_index : temp_index.components) { + if (component_index.first == SnarlDistanceIndex::TEMP_SNARL) { + SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.temp_snarl_records.at(component_index.second); + populate_snarl_index(temp_index, component_index, size_limit, graph); + temp_snarl_record.min_length = std::numeric_limits::max(); + } + } + temp_index.root_structure_count = temp_index.components.size(); +#ifdef debug_distance_indexing + assert(temp_index.components.size() == temp_index.root_structure_count); + cerr << "Finished temp index with " << temp_index.root_structure_count << " connected components" << endl; +#endif + return temp_index; +} + + + +/*Fill in the snarl index. + * The index will already know its boundaries and everything knows their relationships in the + * snarl tree. This needs to fill in the distances and the ranks of children in the snarl + * The rank of a child is arbitrary, except that the start node will always be 0 and the end node + * will always be the node count+1 (since node count doesn't count the boundary nodes) + */ +void populate_snarl_index( + SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, + pair snarl_index, size_t size_limit, + const HandleGraph* graph) { +#ifdef debug_distance_indexing + cerr << "Getting the distances for snarl " << temp_index.structure_start_end_as_string(snarl_index) << endl; + assert(snarl_index.first == SnarlDistanceIndex::TEMP_SNARL); +#endif + SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.temp_snarl_records.at(snarl_index.second); + temp_snarl_record.is_simple=true; + + + + + /*Helper function to find the ancestor of a node that is a child of this snarl */ + auto get_ancestor_of_node = [&](pair curr_index) { + + //This is a child that isn't a node, so it must be a chain + if (curr_index.second == temp_snarl_record.start_node_id || + curr_index.second == temp_snarl_record.end_node_id) { + return curr_index; + } + + //Otherwise, walk up until we hit the current snarl + pair parent_index = temp_index.temp_node_records.at(curr_index.second-temp_index.min_node_id).parent; + while (parent_index != snarl_index) { + curr_index=parent_index; + parent_index = parent_index.first == SnarlDistanceIndex::TEMP_SNARL ? temp_index.temp_snarl_records.at(parent_index.second).parent + : temp_index.temp_chain_records.at(parent_index.second).parent; +#ifdef debug_distance_indexing + assert(parent_index.first != SnarlDistanceIndex::TEMP_ROOT); +#endif + } + + return curr_index; + }; + + + /*Now go through each of the children and add distances from that child to everything reachable from it + * Start a dijkstra traversal from each node side in the snarl and record all distances + */ + + //Add the start and end nodes to the list of children so that we include them in the traversal + //TODO: Copying the list + vector> all_children = temp_snarl_record.children; + + //Reserve enough space to store all possible distances + temp_snarl_record.distances.reserve( (temp_snarl_record.node_count > size_limit || size_limit == 0) + ? 
temp_snarl_record.node_count * 2 + : temp_snarl_record.node_count * temp_snarl_record.node_count); + + if (size_limit != 0 && temp_snarl_record.node_count > size_limit) { + temp_index.use_oversized_snarls = true; + } + + if (!temp_snarl_record.is_root_snarl) { + + all_children.emplace_back(SnarlDistanceIndex::TEMP_NODE, temp_snarl_record.start_node_id); + all_children.emplace_back(SnarlDistanceIndex::TEMP_NODE, temp_snarl_record.end_node_id); + } + + while (!all_children.empty()) { + const pair start_index = std::move(all_children.back()); + all_children.pop_back(); + + bool is_internal_node = false; + + //Check if this node is a tip + if ((start_index.first == SnarlDistanceIndex::TEMP_NODE + && start_index.second != temp_snarl_record.start_node_id + && start_index.second != temp_snarl_record.end_node_id) + || + (start_index.first == SnarlDistanceIndex::TEMP_CHAIN && temp_index.temp_chain_records.at(start_index.second).is_trivial)) { + //If this is an internal node + is_internal_node = true; + nid_t node_id = start_index.first == SnarlDistanceIndex::TEMP_NODE ? start_index.second : temp_index.temp_chain_records.at(start_index.second).start_node_id; + size_t rank = start_index.first == SnarlDistanceIndex::TEMP_NODE ? temp_index.temp_node_records.at(start_index.second-temp_index.min_node_id).rank_in_parent + : temp_index.temp_chain_records.at(start_index.second).rank_in_parent; + + bool has_edges = false; + graph->follow_edges(graph->get_handle(node_id, false), false, [&](const handle_t next_handle) { + has_edges = true; + }); + if (!has_edges) { + temp_index.temp_node_records.at(node_id-temp_index.min_node_id).is_tip = true; + temp_snarl_record.tippy_child_ranks.insert(rank); + temp_snarl_record.is_simple=false; //It is a tip so this isn't simple snarl + } + has_edges = false; + graph->follow_edges(graph->get_handle(node_id, true), false, [&](const handle_t next_handle) { + has_edges = true; + }); + if (!has_edges) { + temp_index.temp_node_records.at(node_id-temp_index.min_node_id).is_tip = true; + temp_snarl_record.tippy_child_ranks.insert(rank); + temp_snarl_record.is_simple=false; //It is a tip so this isn't simple snarl + } + } else if (start_index.first == SnarlDistanceIndex::TEMP_CHAIN && !temp_index.temp_chain_records.at(start_index.second).is_trivial) { + //If this is an internal chain, then it isn't a simple snarl + temp_snarl_record.is_simple=false; + } + + bool start_is_tip = start_index.first == SnarlDistanceIndex::TEMP_NODE + ? temp_index.temp_node_records.at(start_index.second-temp_index.min_node_id).is_tip + : temp_index.temp_chain_records.at(start_index.second).is_tip; + + size_t start_rank = start_index.first == SnarlDistanceIndex::TEMP_NODE + ? temp_index.temp_node_records.at(start_index.second-temp_index.min_node_id).rank_in_parent + : temp_index.temp_chain_records.at(start_index.second).rank_in_parent; + + + if (start_index.first == SnarlDistanceIndex::TEMP_NODE && start_index.second == temp_snarl_record.start_node_id) { + start_rank = 0; + } else if (start_index.first == SnarlDistanceIndex::TEMP_NODE && start_index.second == temp_snarl_record.end_node_id) { + start_rank = 1; + } //TODO: + //else { + // assert(start_rank != 0 && start_rank != 1); + //} + + if ( (temp_snarl_record.node_count > size_limit || size_limit == 0) && (temp_snarl_record.is_root_snarl || (!start_is_tip && + !start_rank == 0 && ! 
start_rank == 1))) { + //If we don't care about internal distances, and we also are not at a boundary or tip + continue; + } + + //Start from either direction for all nodes, but only going in for start and end + vector directions; + if (start_index.first == SnarlDistanceIndex::TEMP_NODE && start_index.second == temp_snarl_record.start_node_id) { + directions.emplace_back(temp_snarl_record.start_node_rev); + } else if (start_index.first == SnarlDistanceIndex::TEMP_NODE && start_index.second == temp_snarl_record.end_node_id){ + directions.emplace_back(!temp_snarl_record.end_node_rev); + } else { + directions.emplace_back(true); + directions.emplace_back(false); + } + for (bool start_rev : directions) { + //Start a dijkstra traversal from start_index going in the direction indicated by start_rev + //Record the distances to each node (child of the snarl) found + size_t reachable_node_count = 0; //How many nodes can we reach from this node side? + +#ifdef debug_distance_indexing + cerr << " Starting from child " << temp_index.structure_start_end_as_string(start_index) + << " going " << (start_rev ? "rev" : "fd") << endl; +#endif + + //Define a NetgraphNode as the value for the priority queue: + // , direction> + using NetgraphNode = pair, bool>>; + auto cmp = [] (const NetgraphNode a, const NetgraphNode b) { + return a.first > b.first; + }; + + //The priority queue of the next nodes to visit, ordered by the distance + std::priority_queue, decltype(cmp)> queue(cmp); + //The nodes we've already visited + unordered_set, bool>> visited_nodes; + visited_nodes.reserve(temp_snarl_record.node_count * 2); + + //Start from the current start node + queue.push(make_pair(0, make_pair(start_index, start_rev))); + + while (!queue.empty()) { + + //Get the current node from the queue and pop it out of the queue + size_t current_distance = queue.top().first; + pair current_index = queue.top().second.first; + bool current_rev = queue.top().second.second; + if (visited_nodes.count(queue.top().second)) { + queue.pop(); + continue; + } + visited_nodes.emplace(queue.top().second); + queue.pop(); + + + //The handle that we need to follow to get the next reachable nodes + //If the current node is a node, then its just the node. Otherwise, it's the + //opposite side of the child chain + handle_t current_end_handle = current_index.first == SnarlDistanceIndex::TEMP_NODE ? + graph->get_handle(current_index.second, current_rev) : + (current_rev ? graph->get_handle(temp_index.temp_chain_records[current_index.second].start_node_id, + !temp_index.temp_chain_records[current_index.second].start_node_rev) + : graph->get_handle(temp_index.temp_chain_records[current_index.second].end_node_id, + temp_index.temp_chain_records[current_index.second].end_node_rev)); + +#ifdef debug_distance_indexing + cerr << " at child " << temp_index.structure_start_end_as_string(current_index) << " going " + << (current_rev ? "rev" : "fd") << " at actual node " << graph->get_id(current_end_handle) + << (graph->get_is_reverse(current_end_handle) ? "rev" : "fd") << endl; +#endif + graph->follow_edges(current_end_handle, false, [&](const handle_t next_handle) { + if (graph->get_id(current_end_handle) == graph->get_id(next_handle)){ + //If there are any loops then this isn't a simple snarl + temp_snarl_record.is_simple = false; + } + + reachable_node_count++; + //At each of the nodes reachable from the current one, fill in the distance from the start + //node to the next node (current_distance). 
If this handle isn't leaving the snarl, + //add the next nodes along with the distance to the end of the next node + auto& node_record = temp_index.temp_node_records.at(graph->get_id(next_handle)-temp_index.min_node_id); + + //The index of the snarl's child that next_handle represents + pair next_index = get_ancestor_of_node(make_pair(SnarlDistanceIndex::TEMP_NODE, graph->get_id(next_handle))); + + bool next_is_tip = start_index.first == SnarlDistanceIndex::TEMP_NODE + ? temp_index.temp_node_records.at(start_index.second-temp_index.min_node_id).is_tip + : temp_index.temp_chain_records.at(start_index.second).is_tip; + + //The rank and orientation of next in the snarl + size_t next_rank = next_index.first == SnarlDistanceIndex::TEMP_NODE + ? node_record.rank_in_parent + : temp_index.temp_chain_records[next_index.second].rank_in_parent; + if (next_index.first == SnarlDistanceIndex::TEMP_NODE && next_index.second == temp_snarl_record.start_node_id) { + next_rank = 0; + } else if (next_index.first == SnarlDistanceIndex::TEMP_NODE && next_index.second == temp_snarl_record.end_node_id) { + next_rank = 1; + } else { + //If the next thing wasn't a boundary node and this was an internal node, then it isn't a simple snarl + if (is_internal_node) { + temp_snarl_record.is_simple = false; + } + }//TODO: This won't be true of root snarls + //else { + // assert(next_rank != 0 && next_rank != 1); + //} + bool next_rev = next_index.first == SnarlDistanceIndex::TEMP_NODE || temp_index.temp_chain_records[next_index.second].is_trivial + ? graph->get_is_reverse(next_handle) + : graph->get_id(next_handle) == temp_index.temp_chain_records[next_index.second].end_node_id; + + /**Record the distance **/ + bool start_is_boundary = !temp_snarl_record.is_root_snarl && (start_rank == 0 || start_rank == 1); + bool next_is_boundary = !temp_snarl_record.is_root_snarl && (next_rank == 0 || next_rank == 1); + + if (size_limit != 0 && + (temp_snarl_record.node_count < size_limit || start_is_boundary || next_is_boundary)) { + //If the snarl is too big, then we don't record distances between internal nodes + //If we are looking at all distances or we are looking at boundaries + bool added_new_distance = false; + + //Set the distance + pair start = start_is_boundary + ? make_pair(start_rank, false) : make_pair(start_rank, !start_rev); + pair next = next_is_boundary + ? 
make_pair(next_rank, false) : make_pair(next_rank, next_rev); + if (start_is_boundary && next_is_boundary) { + //If it is between bounds of the snarl, then the snarl stores it + if (start_rank == 0 && next_rank == 0 && + temp_snarl_record.distance_start_start == std::numeric_limits::max()) { + temp_snarl_record.distance_start_start = current_distance; + added_new_distance = true; + } else if (start_rank == 1 && next_rank == 1 && + temp_snarl_record.distance_end_end == std::numeric_limits::max()) { + temp_snarl_record.distance_end_end = current_distance; + added_new_distance = true; + } else if (((start_rank == 0 && next_rank == 1) || (start_rank == 1 && next_rank == 0)) + && temp_snarl_record.min_length == std::numeric_limits::max()){ + temp_snarl_record.min_length = current_distance; + added_new_distance = true; + + } + } else if (start_is_boundary){ + //If start is a boundary node + if (next_index.first == SnarlDistanceIndex::TEMP_NODE) { + //Next is a node + auto& temp_node_record = temp_index.temp_node_records.at(next_index.second-temp_index.min_node_id); + if (start_rank == 0 && !next_rev && + temp_node_record.distance_left_start == std::numeric_limits::max()) { + temp_node_record.distance_left_start = current_distance; + added_new_distance = true; + } else if (start_rank == 0 && next_rev && + temp_node_record.distance_right_start == std::numeric_limits::max()) { + temp_node_record.distance_right_start = current_distance; + added_new_distance = true; + } else if (start_rank == 1 && !next_rev && + temp_node_record.distance_left_end == std::numeric_limits::max()) { + temp_node_record.distance_left_end = current_distance; + added_new_distance = true; + } else if (start_rank == 1 && next_rev && + temp_node_record.distance_right_end == std::numeric_limits::max()) { + temp_node_record.distance_right_end = current_distance; + added_new_distance = true; + } + } else { + //Next is a chain + auto& temp_chain_record = temp_index.temp_chain_records.at(next_index.second); + if (start_rank == 0 && !next_rev && + temp_chain_record.distance_left_start == std::numeric_limits::max()) { + temp_chain_record.distance_left_start = current_distance; + added_new_distance = true; + } else if (start_rank == 0 && next_rev && + temp_chain_record.distance_right_start == std::numeric_limits::max()) { + temp_chain_record.distance_right_start = current_distance; + added_new_distance = true; + } else if (start_rank == 1 && !next_rev && + temp_chain_record.distance_left_end == std::numeric_limits::max()) { + temp_chain_record.distance_left_end = current_distance; + added_new_distance = true; + } else if (start_rank == 1 && next_rev && + temp_chain_record.distance_right_end == std::numeric_limits::max()) { + temp_chain_record.distance_right_end = current_distance; + added_new_distance = true; + } + } + } else if (!next_is_boundary && !temp_snarl_record.distances.count(make_pair(start, next))) { + //Otherwise the snarl stores it in its distance + //If the distance isn't from an internal node to a bound and we haven't stored the distance yets + + temp_snarl_record.distances[make_pair(start, next)] = current_distance; + added_new_distance = true; +#ifdef debug_distance_indexing + cerr << " Adding distance between ranks " << start.first << " " << start.second << " and " << next.first << " " << next.second << ": " << current_distance << endl; +#endif + } + if (added_new_distance) { + temp_snarl_record.max_distance = std::max(temp_snarl_record.max_distance, current_distance); + } + } + + + /**Add the next node to the 
priority queue**/ + + if (visited_nodes.count(make_pair(next_index, next_rev)) == 0 && + graph->get_id(next_handle) != temp_snarl_record.start_node_id && + graph->get_id(next_handle) != temp_snarl_record.end_node_id + ) { + //If this isn't leaving the snarl, + //then add the next node to the queue, along with the distance to traverse it + size_t next_node_length = next_index.first == SnarlDistanceIndex::TEMP_NODE ? graph->get_length(next_handle) : + temp_index.temp_chain_records[next_index.second].min_length; + if (next_index.first == SnarlDistanceIndex::TEMP_CHAIN && + temp_index.temp_chain_records[next_index.second].chain_components.back() != 0) { + //If there are multiple components, then the chain is not start-end reachable so its length + //is actually infinite + next_node_length = std::numeric_limits::max(); + } + if (next_node_length != std::numeric_limits::max()) { + queue.push(make_pair(SnarlDistanceIndex::sum(current_distance, next_node_length), + make_pair(next_index, next_rev))); + } + } + if (next_index.first == SnarlDistanceIndex::TEMP_CHAIN) { + size_t loop_distance = next_rev ? temp_index.temp_chain_records[next_index.second].backward_loops.back() + : temp_index.temp_chain_records[next_index.second].forward_loops.front(); + if (loop_distance != std::numeric_limits::max() && + visited_nodes.count(make_pair(next_index, !next_rev)) == 0 && + graph->get_id(next_handle) != temp_snarl_record.start_node_id && + graph->get_id(next_handle) != temp_snarl_record.end_node_id + ) { + //If the next node can loop back on itself, then add the next node in the opposite direction + size_t next_node_len = loop_distance + 2 * graph->get_length(next_handle); + queue.push(make_pair(SnarlDistanceIndex::sum(current_distance, next_node_len), + make_pair(next_index, !next_rev))); + } + } +#ifdef debug_distance_indexing + cerr << " reached child " << temp_index.structure_start_end_as_string(next_index) << "going " + << (next_rev ? "rev" : "fd") << " with distance " << current_distance << " for ranks " << start_rank << " " << next_rank << endl; +#endif + }); + } + if (is_internal_node && reachable_node_count != 1) { + //If this is an internal node, then it must have only one edge for it to be a simple snarl + temp_snarl_record.is_simple = false; + } + } + + /** Check the minimum length of the snarl passing through this node **/ + if (start_rank != 0 && start_rank != 1) { + + size_t child_max_length = start_index.first == SnarlDistanceIndex::TEMP_NODE + ? temp_index.temp_node_records.at(start_index.second-temp_index.min_node_id).node_length + : temp_index.temp_chain_records.at(start_index.second).max_length; + //The distance through the whole snarl traversing this node forwards + //(This might actually be traversing it backwards but it doesn't really matter) + + size_t dist_start_left = start_index.first == SnarlDistanceIndex::TEMP_NODE + ? temp_index.temp_node_records.at(start_index.second-temp_index.min_node_id).distance_left_start + : temp_index.temp_chain_records.at(start_index.second).distance_left_start; + size_t dist_end_right = start_index.first == SnarlDistanceIndex::TEMP_NODE + ? temp_index.temp_node_records.at(start_index.second-temp_index.min_node_id).distance_right_end + : temp_index.temp_chain_records.at(start_index.second).distance_right_end; + size_t dist_start_right = start_index.first == SnarlDistanceIndex::TEMP_NODE + ? 
temp_index.temp_node_records.at(start_index.second-temp_index.min_node_id).distance_right_start + : temp_index.temp_chain_records.at(start_index.second).distance_right_start; + size_t dist_end_left = start_index.first == SnarlDistanceIndex::TEMP_NODE + ? temp_index.temp_node_records.at(start_index.second-temp_index.min_node_id).distance_left_end + : temp_index.temp_chain_records.at(start_index.second).distance_left_end; + + size_t snarl_length_fd = SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( + dist_start_left, dist_end_right),child_max_length); + //The same thing traversing this node backwards + size_t snarl_length_rev = SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( + dist_start_right, dist_end_left), child_max_length); + //The max that isn't infinite + size_t max_length = + snarl_length_rev == std::numeric_limits::max() + ? snarl_length_fd + : (snarl_length_fd == std::numeric_limits::max() + ? snarl_length_rev + : std::max(snarl_length_rev, snarl_length_fd)); + if (max_length != std::numeric_limits::max()) { + temp_snarl_record.max_length = std::max(temp_snarl_record.max_length, max_length); + } + if ( temp_snarl_record.is_simple && + ! ((dist_start_left == 0 && dist_end_right == 0 && dist_end_left == std::numeric_limits::max() && dist_start_right == std::numeric_limits::max() ) || + (dist_start_left == std::numeric_limits::max() && dist_end_right == std::numeric_limits::max() && dist_end_left == 0 && dist_start_right == 0 ))){ + //If the snarl is simple, double check that this node is actually simple: that it can only be traversed going + //across the nsarl + temp_snarl_record.is_simple = false; + } + } + } + + //If this is a simple snarl (one with only single nodes that connect to the start and end nodes), then + // we want to remember if the child nodes are reversed + if (temp_snarl_record.is_simple) { + for (size_t i = 0 ; i < temp_snarl_record.node_count ; i++) { + //Get the index of the child + const pair& child_index = temp_snarl_record.children[i]; + //Which is a node +#ifdef debug_distance_indexing + assert(child_index.first == SnarlDistanceIndex::TEMP_NODE); +#endif + + //And get the record + SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord& temp_node_record = + temp_index.temp_node_records[child_index.second-temp_index.min_node_id]; + size_t rank =temp_node_record.rank_in_parent; + + + + //Set the orientation of this node in the simple snarl + temp_node_record.reversed_in_parent = temp_node_record.distance_left_start == std::numeric_limits::max(); + + } + } + + //Now that the distances are filled in, predict the size of the snarl in the index + temp_index.max_index_size += temp_snarl_record.get_max_record_length(); + if (temp_snarl_record.is_simple) { + temp_index.max_index_size -= (temp_snarl_record.children.size() * SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord::get_max_record_length()); + } + + +} + + +//Given an alignment to a graph and a range, find the set of nodes in the +//graph for which the minimum distance from the position to any position +//in the node is within the given distance range +//If look_forward is true, then start from the start of the path forward, +//otherwise start from the end going backward +void subgraph_in_distance_range(const SnarlDistanceIndex& distance_index, const Path& path, const HandleGraph* super_graph, size_t min_distance, + size_t max_distance, std::unordered_set& subgraph, bool look_forward){ + + //The position we're starting from - either the start or end of the path + pos_t start_pos; + size_t 
node_len; + if (look_forward ){ + start_pos = initial_position(path); + node_len = super_graph->get_length(super_graph->get_handle(get_id(start_pos))); + } else { + start_pos = final_position(path); + node_len = super_graph->get_length(super_graph->get_handle(get_id(start_pos))); + start_pos = reverse_base_pos(start_pos, node_len); + } + pair traversal_start = std::make_pair(get_id(start_pos), get_is_rev(start_pos)); + +#ifdef debug_subgraph +cerr << endl << "Find subgraph in distance range " << min_distance << " to " << max_distance << endl; +cerr << "Start positon: "<< start_pos << endl; +#endif + //The distance from the position to the ends of the current node(/snarl/chain) + size_t current_distance_left = is_rev(start_pos) ? node_len - get_offset(start_pos) : std::numeric_limits::max() ; + size_t current_distance_right = is_rev(start_pos) ? std::numeric_limits::max() : node_len - get_offset(start_pos) ; + + //Graph node of the start and end of the current node(/snarl/chain) pointing out + net_handle_t current_net = distance_index.get_node_net_handle(get_id(start_pos)); + net_handle_t parent = distance_index.start_end_traversal_of(distance_index.get_parent(current_net)); + + //The id and orientation of nodes that are too close and should be avoided + hash_set> seen_nodes; + //Nodes that we want to start a search from - the distance is smaller or equal to than min_distance but + //we can't walk out any further along the snarl tree without exceeding it + //The distance is the distance from the start position to the beginning (or end if its backwards) of the node, + //including the position + vector> search_start_nodes; + + if (((current_distance_left != std::numeric_limits::max() && current_distance_left > min_distance) || + (current_distance_right != std::numeric_limits::max() && current_distance_right > min_distance)) || + (distance_index.is_trivial_chain(parent) + && distance_index.distance_in_parent(distance_index.get_parent(parent), parent, distance_index.flip(parent)) == 0 + && node_len*2 > min_distance)) { + //If the distance to either end of the node is within the range + //Or of there is a loop on the node ( a duplication of just the node) and the node length would put one loop in the distance range + + //Add this node to the subgraph + subgraph.emplace(get_id(start_pos)); + + handle_t start = is_rev(start_pos) ? distance_index.get_handle(distance_index.flip(current_net), super_graph) + : distance_index.get_handle(current_net, super_graph); + + //Add any node one step out from this one to search_start_nodes + super_graph->follow_edges(start, + false, [&](const handle_t& next_handle) { + search_start_nodes.emplace_back(next_handle, is_rev(start_pos) ? 
current_distance_left : current_distance_right); + }); + + //Search for reachable nodes + subgraph_in_distance_range_walk_graph(super_graph, min_distance, max_distance, subgraph, search_start_nodes, seen_nodes, traversal_start); + + return; + } + + + + while (!distance_index.is_root(parent)) { +#ifdef debug_subgraph + cerr << "At child " << distance_index.net_handle_as_string(current_net) << " with distances " << current_distance_left << " " << current_distance_right << endl; +#endif + + size_t max_parent_length = distance_index.maximum_length(parent); + + + //Distances to get to the ends of the parent + size_t distance_start_left = SnarlDistanceIndex::sum(current_distance_left, + distance_index.distance_to_parent_bound(parent, true, distance_index.flip(current_net))); + size_t distance_start_right = SnarlDistanceIndex::sum(current_distance_right, + distance_index.distance_to_parent_bound(parent, true, current_net)); + size_t distance_end_left = SnarlDistanceIndex::sum(current_distance_left, + distance_index.distance_to_parent_bound(parent, false, distance_index.flip(current_net))); + size_t distance_end_right = SnarlDistanceIndex::sum(current_distance_right, + distance_index.distance_to_parent_bound(parent, false, current_net)); + + if ((current_distance_right != std::numeric_limits::max() && current_distance_right >= min_distance) + || (current_distance_left != std::numeric_limits::max() && current_distance_left >= min_distance) + || (distance_start_right != std::numeric_limits::max() && distance_start_right>= min_distance) + || (distance_end_right != std::numeric_limits::max() && distance_end_right >= min_distance) + || (distance_start_left != std::numeric_limits::max() && distance_start_left >= min_distance) + || (distance_end_left != std::numeric_limits::max() && distance_end_left >= min_distance) + || (max_parent_length != std::numeric_limits::max() && max_parent_length >= min_distance)) { + //If the min distance will be exceeded within this parent, then start a search from the ends of this child + + if (distance_index.is_snarl(parent)) { + //If this is the child of a snarl, then just traverse from the end of the node +#ifdef debug_subgraph +cerr << "Start search in parent " << distance_index.net_handle_as_string(parent); +#endif + if (current_distance_left != std::numeric_limits::max() ){ + //If we can go left + net_handle_t bound = distance_index.is_node(current_net) ? distance_index.flip(current_net) + : distance_index.get_bound(current_net, false, false); + if (distance_index.is_sentinel(bound)) { + bound = distance_index.get_node_from_sentinel(bound); + } + handle_t current_node = distance_index.get_handle(bound, super_graph); + //Add everything immediately after the left bound of this node/chain + super_graph->follow_edges(distance_index.get_handle(bound, super_graph), + false, [&](const handle_t& next_handle) { + seen_nodes.erase(make_pair(super_graph->get_id(next_handle), super_graph->get_is_reverse(next_handle))); + search_start_nodes.emplace_back(next_handle,current_distance_left); + + }); + +#ifdef debug_subgraph + cerr << " going left from " << super_graph->get_id(current_node) << (super_graph->get_is_reverse(current_node) ? "rev " : "fd ") ; +#endif + } + if (current_distance_right != std::numeric_limits::max()) { + //If we can go right + net_handle_t bound = distance_index.is_node(current_net) ? 
current_net + : distance_index.get_bound(current_net, true, false); + if (distance_index.is_sentinel(bound)) { + bound = distance_index.get_node_from_sentinel(bound); + } + handle_t current_node = distance_index.get_handle(bound, super_graph); + + //Add everything immediately after the right bound of this node/chain + super_graph->follow_edges(distance_index.get_handle(bound, super_graph), + false, [&](const handle_t& next_handle) { + seen_nodes.erase(make_pair(super_graph->get_id(next_handle),super_graph->get_is_reverse(next_handle))); + search_start_nodes.emplace_back(next_handle, current_distance_right); + }); + +#ifdef debug_subgraph + cerr << " going right from " << super_graph->get_id(current_node) << (super_graph->get_is_reverse(current_node) ? "rev " : "fd "); +#endif + } +#ifdef debug_subgraph + cerr << endl; +#endif + } else { +#ifdef debug_subgraph +cerr << "Start search along parent chain " << distance_index.net_handle_as_string(parent); +#endif + //If this is the child of a chain, then traverse along the chain + if (current_distance_left != std::numeric_limits::max()) { + subgraph_in_distance_range_walk_across_chain (distance_index, super_graph, subgraph, + distance_index.flip(current_net), current_distance_left, search_start_nodes, seen_nodes, min_distance, max_distance, false); + } + if (current_distance_right != std::numeric_limits::max()) { + subgraph_in_distance_range_walk_across_chain (distance_index, super_graph, subgraph, + current_net, current_distance_right, search_start_nodes, seen_nodes, min_distance, max_distance, false); + } + } + subgraph_in_distance_range_walk_graph(super_graph, min_distance, max_distance, subgraph, search_start_nodes, seen_nodes, traversal_start); + return; + } else if (distance_index.is_snarl(parent)){ + //TODO: This might be overkill. 
It prevents us from adding nodes that shouldn't be in the subgraph, but might be too slow + //If we don't check the other direction, go through the loop and add everything whose distance is lower than the minimum + //to seen_nodes + vector> loop_handles_to_check; + handle_t start_out = distance_index.get_handle(distance_index.get_bound(parent, false, false), super_graph); + handle_t end_out = distance_index.get_handle(distance_index.get_bound(parent, true, false), super_graph); + if (current_distance_left != std::numeric_limits::max()) { + loop_handles_to_check.emplace_back(distance_index.get_handle(distance_index.get_bound(current_net, false, false), super_graph), current_distance_left); + } + if (current_distance_right != std::numeric_limits::max()) { + loop_handles_to_check.emplace_back(distance_index.get_handle(distance_index.get_bound(current_net, true, false), super_graph), current_distance_right); + } + while (!loop_handles_to_check.empty()) { + handle_t current_loop_handle = loop_handles_to_check.back().first; + size_t current_loop_distance = loop_handles_to_check.back().second; + loop_handles_to_check.pop_back(); + + //Add to seen_nodes + seen_nodes.emplace(super_graph->get_id(current_loop_handle), super_graph->get_is_reverse(current_loop_handle)); + + //Walk one step out from this node + super_graph->follow_edges(current_loop_handle, false, [&](const handle_t& next_handle) { + //If the next node is close enough and isn't exiting the snarl, then add it to stack + size_t new_distance = SnarlDistanceIndex::sum(current_loop_distance, super_graph->get_length(next_handle)); + if (new_distance < min_distance && next_handle != start_out && next_handle != end_out) { + loop_handles_to_check.emplace_back(next_handle, new_distance); + } + }); + } + } else if (distance_index.is_chain(parent)) { + //TODO: This is probably also overkill - walk a chain if there is a viable loop + size_t distance_loop_right = distance_index.distance_in_parent(parent, current_net, current_net, super_graph, max_distance); + size_t distance_loop_left = distance_index.distance_in_parent(parent, distance_index.flip(current_net), distance_index.flip(current_net), super_graph, max_distance); + if ((current_distance_left != std::numeric_limits::max() && distance_loop_left != std::numeric_limits::max()) || + (current_distance_right != std::numeric_limits::max() && distance_loop_right != std::numeric_limits::max())) { + //If there is a loop that we can take, then take it + if (current_distance_left != std::numeric_limits::max()) { + subgraph_in_distance_range_walk_across_chain (distance_index, super_graph, subgraph, + distance_index.flip(current_net), current_distance_left, search_start_nodes, seen_nodes, min_distance, max_distance, false); + } + if (current_distance_right != std::numeric_limits::max()) { + subgraph_in_distance_range_walk_across_chain (distance_index, super_graph, subgraph, + current_net, current_distance_right, search_start_nodes, seen_nodes, min_distance, max_distance, false); + } + subgraph_in_distance_range_walk_graph(super_graph, min_distance, max_distance, subgraph, search_start_nodes, seen_nodes, traversal_start); + return; + } + } + + //Remember the bounds of this child so we don't return to it + if (current_distance_left != std::numeric_limits::max() ){ + //If we can go left + net_handle_t bound = distance_index.is_node(current_net) ? 
distance_index.flip(current_net) + : distance_index.get_bound(current_net, false, false); + if (distance_index.is_sentinel(bound)) { + bound = distance_index.get_node_from_sentinel(bound); + } + handle_t current_node = distance_index.get_handle(bound, super_graph); + seen_nodes.emplace(super_graph->get_id(current_node), super_graph->get_is_reverse(current_node)); + } + if (current_distance_right != std::numeric_limits::max()) { + //If we can go right + net_handle_t bound = distance_index.is_node(current_net) ? current_net + : distance_index.get_bound(current_net, true, false); + if (distance_index.is_sentinel(bound)) { + bound = distance_index.get_node_from_sentinel(bound); + } + handle_t current_node = distance_index.get_handle(bound, super_graph); + seen_nodes.emplace(super_graph->get_id(current_node), super_graph->get_is_reverse(current_node)); + } + + current_distance_left = std::min(distance_start_left, distance_start_right); + current_distance_right = std::min(distance_end_left, distance_end_right); + + current_net = std::move(parent); + parent = distance_index.canonical(distance_index.get_parent(current_net)); + } + if (current_distance_left <= min_distance) { +#ifdef debug_subgraph + cerr << "Adding the end of a child of the root " << distance_index.net_handle_as_string(distance_index.get_bound(current_net, false, false)) << " with distance " << current_distance_left << endl; +#endif + + handle_t bound = distance_index.get_handle(distance_index.get_bound(current_net, false, false), super_graph); + search_start_nodes.emplace_back(bound, current_distance_left); + } + if (current_distance_right <= min_distance) { +#ifdef debug_subgraph + cerr << "Adding the end of a child of the root " << distance_index.net_handle_as_string(distance_index.get_bound(current_net, false, false)) << " with distance " << current_distance_right << endl; +#endif + handle_t bound = distance_index.get_handle(distance_index.get_bound(current_net, true, false), super_graph); + search_start_nodes.emplace_back(bound,current_distance_right); + } + subgraph_in_distance_range_walk_graph(super_graph, min_distance, max_distance, subgraph, search_start_nodes, seen_nodes, traversal_start); + + return; +} + + +///Helper for subgraph_in_distance_range +///Given starting handles in the super graph and the distances to each handle (including the start position and +//the first position in the handle), add all nodes within the distance range, excluding nodes in seen_nodes +void subgraph_in_distance_range_walk_graph(const HandleGraph* super_graph, size_t min_distance, size_t max_distance, + std::unordered_set& subgraph, vector>& start_nodes, + hash_set>& seen_nodes, const pair& traversal_start) { +#ifdef debug_subgraph + cerr << "Starting search from nodes " << endl; + for (auto& start_handle : start_nodes) { + cerr << "\t" << super_graph->get_id(start_handle.first) << " " << super_graph->get_is_reverse(start_handle.first) + << " with distance " << start_handle.second << endl; + } +#endif + + //Order based on the distance to the position (handle) + auto cmp = [] (const pair a, const pair b ) { + return a.second > b.second; + }; + priority_queue< pair, vector>, decltype(cmp)> next_handles (cmp); + for (auto& start_handle : start_nodes) { + next_handles.emplace(start_handle); + } + bool first_node = true; + + while (next_handles.size() > 0) { + //Traverse the graph, adding nodes if they are within the range + handle_t curr_handle=next_handles.top().first; + size_t curr_distance=next_handles.top().second; + next_handles.pop(); 
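+ //next_handles is a min-heap on distance (comparator keeps the smallest distance on top), so each handle is popped with its minimum distance from the start and the walk proceeds Dijkstra-style, with node lengths as the edge weights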
+#ifdef debug_subgraph + cerr << "At node " << super_graph->get_id(curr_handle) << " " << super_graph->get_is_reverse(curr_handle) << " with distance " << curr_distance << endl; +#endif + if (seen_nodes.count(make_pair(super_graph->get_id(curr_handle), super_graph->get_is_reverse(curr_handle))) == 0) { + seen_nodes.emplace(super_graph->get_id(curr_handle), super_graph->get_is_reverse(curr_handle)); + + size_t node_len = super_graph->get_length(curr_handle); + size_t curr_distance_end = SnarlDistanceIndex::sum(curr_distance, node_len)-1; + if ((curr_distance >= min_distance && curr_distance <= max_distance) || + (curr_distance_end >= min_distance && curr_distance_end <= max_distance) || + (curr_distance <= min_distance && curr_distance_end >= max_distance)) { +#ifdef debug_subgraph + cerr << "\tadding node " << super_graph->get_id(curr_handle) << " " << super_graph->get_is_reverse(curr_handle) << " with distance " + << curr_distance << " and node length " << node_len << endl; +#endif + subgraph.insert(super_graph->get_id(curr_handle)); + + } +#ifdef debug_subgraph + else { + cerr << "\tdisregarding node " << super_graph->get_id(curr_handle) << " " << super_graph->get_is_reverse(curr_handle) + << " with distance " << curr_distance << " and node length " << node_len << endl; + } +#endif + curr_distance = SnarlDistanceIndex::sum(node_len, curr_distance); + + //If the end of this node is still within the range, add the next nodes that are within + //Also check that the node we're currently at isn't the start node + if (SnarlDistanceIndex::minus(curr_distance,1) <= max_distance) { + super_graph->follow_edges(curr_handle, false, [&](const handle_t& next) { + nid_t next_id = super_graph->get_id(next); + if (seen_nodes.count(make_pair(next_id, super_graph->get_is_reverse(next))) == 0) { + next_handles.emplace(next, curr_distance); + } + return true; + }); + } + first_node = false; + } +#ifdef debug_subgraph + else { + cerr << "\tthe node was already seen" << endl; + } +#endif + + } + +#ifdef debug_subgraph + cerr << "Subgraph has nodes: "; + for (const nid_t& node : subgraph) { + cerr << node << ", "; + } + cerr << endl; +#endif + return; +} +//helper function to walk along a chain from the current node until the distance traversed +//exceeds the minimum limit. Add the node just before this happens to search_start_nodes +void subgraph_in_distance_range_walk_across_chain (const SnarlDistanceIndex& distance_index, const HandleGraph* super_graph, + std::unordered_set& subgraph, net_handle_t current_node, + size_t current_distance, vector>& search_start_nodes, hash_set>& seen_nodes, + const size_t& min_distance, const size_t& max_distance, bool checked_loop){ +#ifdef debug_subgraph + cerr << "Walk along parent chain " << distance_index.net_handle_as_string(distance_index.get_parent(current_node)) << " from " << distance_index.net_handle_as_string(current_node) << " with " << current_distance << endl; +#endif + if (distance_index.is_trivial_chain(distance_index.get_parent(current_node))){ + return; + } + bool finished_chain = false; + bool added_nodes = false; //Did we start a search? 
if not, add the last node in the chain + while (current_distance <= min_distance && !finished_chain) { + finished_chain = distance_index.follow_net_edges(current_node, super_graph, false, + [&](const net_handle_t& next) { + size_t next_length = distance_index.minimum_length(next); + //If the next child is a snarl, then the distance to loop in the snarl + if (distance_index.is_snarl(next)) { + net_handle_t bound_fd = distance_index.get_bound(next, distance_index.ends_at(next) == SnarlDistanceIndex::START, true); + size_t next_loop = distance_index.distance_in_parent(next, bound_fd, bound_fd, super_graph, max_distance); + if (!checked_loop && next_loop != std::numeric_limits::max()) { +#ifdef debug_subgraph + cerr << "\tsnarl loops so also check the other direction" << endl; +#endif + //If we haven't yet checked the chain in the other direction and this snarl allows us to loop + if ( SnarlDistanceIndex::sum(next_loop, current_distance) != std::numeric_limits::max() && + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(next_loop, + current_distance), + distance_index.node_length(current_node)) >= min_distance) { +#ifdef debug_subgraph + cerr << "\t\t add the current node" << endl; +#endif + //If the loop will put us over the edge, then start from the current node + super_graph->follow_edges(distance_index.get_handle(current_node, super_graph), false, [&](const handle_t& next_handle) { + search_start_nodes.emplace_back(next_handle,current_distance); + }); + return true; + } else { + //Otherwise, switch direction in the chain and walk along it again + subgraph_in_distance_range_walk_across_chain(distance_index, super_graph, subgraph, distance_index.flip(current_node), + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(current_distance, + next_loop), + distance_index.node_length(current_node)), + search_start_nodes, seen_nodes, min_distance, max_distance, true); + checked_loop = true; + } + } + if (next_loop != std::numeric_limits::max()){ + //TODO: This might be overkill. 
It prevents us from adding nodes that shouldn't be in the subgraph, but might be too slow + //If we don't check the other direction, go through the loop and add everything whose distance is lower than the minimum + //to seen_nodes + vector> loop_handles_to_check; + handle_t start_out = distance_index.get_handle(distance_index.get_bound(next, false, false), super_graph); + handle_t end_out = distance_index.get_handle(distance_index.get_bound(next, true, false), super_graph); + loop_handles_to_check.emplace_back(distance_index.get_handle(bound_fd, super_graph), current_distance); + while (!loop_handles_to_check.empty()) { + handle_t current_loop_handle = loop_handles_to_check.back().first; + size_t current_loop_distance = loop_handles_to_check.back().second; + loop_handles_to_check.pop_back(); + + //Add to seen_nodes + seen_nodes.emplace(super_graph->get_id(current_loop_handle), super_graph->get_is_reverse(current_loop_handle)); + + //Walk one step out from this node + super_graph->follow_edges(current_loop_handle, false, [&](const handle_t& next_handle) { + //If the next node is close enough and isn't exiting the snarl, then add it to stack + size_t new_distance = SnarlDistanceIndex::sum(current_loop_distance, super_graph->get_length(next_handle)); + if (new_distance < min_distance && next_handle != start_out && next_handle != end_out) { + loop_handles_to_check.emplace_back(next_handle, new_distance); + } + }); + } + + } + } + size_t next_max_length = distance_index.maximum_length(next); +#ifdef debug_subgraph + cerr << "\tnext node: " << distance_index.net_handle_as_string(next) << " with distance " << current_distance << " and min and max lengths " << next_length << " " << next_max_length << endl; +#endif + if (( SnarlDistanceIndex::sum(next_max_length, current_distance) != std::numeric_limits::max() && + SnarlDistanceIndex::sum(next_max_length, current_distance) >= min_distance)){ + if (distance_index.is_node(next)) { + size_t curr_distance_end = SnarlDistanceIndex::minus(SnarlDistanceIndex::sum(next_max_length, current_distance),1); + //If its a node that puts us over, add the node to the subgraph, then start the search from that node +#ifdef debug_subgraph + cerr << "\t\tAdding node from a chain " << distance_index.net_handle_as_string(next) << " with distance " << current_distance << endl; +#endif + if ((current_distance >= min_distance && current_distance <= max_distance) || + (curr_distance_end >= min_distance && curr_distance_end <= max_distance) || + (current_distance <= min_distance && curr_distance_end >= max_distance)) { + subgraph.emplace(distance_index.node_id(next)); + } + super_graph->follow_edges(distance_index.get_handle(next, super_graph), false, [&](const handle_t& next_handle) { + search_start_nodes.emplace_back(next_handle, SnarlDistanceIndex::sum(current_distance, next_length)); + seen_nodes.erase(make_pair(super_graph->get_id(next_handle), super_graph->get_is_reverse(next_handle))); + }); + } else { + //If it's a snarl, then we'll start from the last node +#ifdef debug_subgraph + cerr << "\t\tAdding node from a chain " << distance_index.net_handle_as_string(next) << " with distance " << current_distance << endl; +#endif + super_graph->follow_edges(distance_index.get_handle(current_node, super_graph), false, [&](const handle_t& next_handle) { + search_start_nodes.emplace_back(next_handle,current_distance); + seen_nodes.erase(make_pair(super_graph->get_id(next_handle), super_graph->get_is_reverse(next_handle))); + }); + } + //If we added something, stop traversing the 
chain + added_nodes = true; + return true; + } else if (distance_index.is_node(next)) { + seen_nodes.emplace(distance_index.node_id(next), distance_index.ends_at(next) == SnarlDistanceIndex::START); + } + current_node = next; + current_distance = SnarlDistanceIndex::sum(next_length, current_distance); + if (current_distance > max_distance) { + added_nodes = true; + return true; + } else { + return false; + } + }); + } + if (!added_nodes && current_distance <= max_distance) { + //If we haven't added anything and haven't exceeded the distance limit, then start from the end of the chain + handle_t bound = distance_index.get_handle(current_node, super_graph); + + super_graph->follow_edges(bound, false, [&](const handle_t& next_handle) { + search_start_nodes.emplace_back(next_handle,current_distance); + seen_nodes.erase(make_pair(super_graph->get_id(next_handle), super_graph->get_is_reverse(next_handle))); + }); + //seen_nodes.erase(make_pair(super_graph->get_id(bound), super_graph->get_is_reverse(bound))); + //search_start_nodes.emplace_back( bound, current_distance); + } +}; + + +void subgraph_containing_path_snarls(const SnarlDistanceIndex& distance_index, const HandleGraph* graph, const Path& path, std::unordered_set& subgraph) { + //Get the start and end of the path + pos_t start_pos = initial_position(path); + net_handle_t start_node = distance_index.get_node_net_handle(get_id(start_pos)); + subgraph.insert(get_id(start_pos)); + + pos_t end_pos = final_position(path); + net_handle_t end_node = distance_index.get_node_net_handle(get_id(end_pos)); + subgraph.insert(get_id(end_pos)); + + //Get the lowest common ancestor + pair lowest_ancestor_bool = distance_index.lowest_common_ancestor(start_node, end_node); + net_handle_t common_ancestor = lowest_ancestor_bool.first; + + + if (distance_index.is_snarl(common_ancestor) || common_ancestor == start_node) { + //If the lowest common ancestor is a snarl, just add the entire snarl + + add_descendants_to_subgraph(distance_index, common_ancestor, subgraph); + + } else if (distance_index.is_chain(common_ancestor)) { + + //Get the ancestors of the nodes that are children of the common ancestor + net_handle_t ancestor1 = distance_index.canonical(distance_index.get_parent(start_node)); + while (ancestor1 != common_ancestor) { + start_node = ancestor1; + ancestor1 = distance_index.canonical(distance_index.get_parent(start_node)); + } + net_handle_t ancestor2 = distance_index.canonical(distance_index.get_parent(end_node)); + while (ancestor2 != common_ancestor) { + end_node = ancestor2; + ancestor2 = distance_index.canonical(distance_index.get_parent(end_node)); + } +#ifdef debug_distance_indexing + assert(ancestor1 == ancestor2); +#endif + + + //Walk from one ancestor to the other and add everything in the chain + net_handle_t current_child = distance_index.canonical(distance_index.is_ordered_in_chain(start_node, end_node) ? start_node : end_node); + net_handle_t end_child = distance_index.canonical(distance_index.is_ordered_in_chain(start_node, end_node) ? 
end_node : start_node); + if (distance_index.is_reversed_in_parent(current_child)) { + current_child = distance_index.flip(current_child); + } + if (distance_index.is_reversed_in_parent(end_child)) { + end_child = distance_index.flip(end_child); + } + + add_descendants_to_subgraph(distance_index, current_child, subgraph); + while (current_child != end_child) { + distance_index.follow_net_edges(current_child, graph, false, [&](const net_handle_t& next) { + add_descendants_to_subgraph(distance_index, next, subgraph); + current_child = next; + + }); + } + + } + +} + + +//Recursively add all nodes in parent to the subgraph +void add_descendants_to_subgraph(const SnarlDistanceIndex& distance_index, const net_handle_t& parent, std::unordered_set& subgraph) { + if (distance_index.is_node(parent)) { + subgraph.insert(distance_index.node_id(parent)); + } else { + distance_index.for_each_child(parent, [&](const net_handle_t& child) { + add_descendants_to_subgraph(distance_index, child, subgraph); + }); + } +} + +/*Given a position, return distances that can be stored by a minimizer + * + * This stores: + + - (size_t) record offset of node + - (size_t) record offset of parent (or the grandparent if the node and parent have the same offset) + - (size_t) node record offset + - (size_t) length of the node + - (bool) is the node reversed in its parent + - (bool) is trivial chain + - (bool) is the parent a chain + - (bool) is the parent a root (the parent we saved is a root-snarl or root-level chain) + - (size_t) prefix sum value of the node (or prefix sum to the start of the parent snarl) + - (size_t) the chain component of the node + This is set if the node is in a nontrivial chain or in a simple snarl, in which case the component is + the chain component of the start and end nodes of the parent snarl + + If the node is on a chain, then all the values are what you'd expect, is_root is true if it is a root-level chain + If the node is in a trivial chain in a simple snarl, then the parent is the record offset of the chain, and the + prefix sum and chain component values are for the start of the simple snarl + If the node is a trivial chain in a non-simple snarl, then parent is the record offset of the parent snarl, + and the prefix sum and components are inf + + */ + + +MIPayloadValues get_minimizer_distances (const SnarlDistanceIndex& distance_index,pos_t pos) { + + net_handle_t node_handle = distance_index.get_node_net_handle(get_id(pos)); + net_handle_t parent_handle = distance_index.get_parent(node_handle); + + bool is_trivial_chain = distance_index.is_trivial_chain(parent_handle); + + if (is_trivial_chain) { + parent_handle = distance_index.get_parent(parent_handle); + } + + bool parent_is_root = distance_index.is_root(parent_handle); + bool parent_is_root_snarl = distance_index.is_root_snarl(parent_handle); + bool parent_is_simple_snarl = distance_index.is_simple_snarl(parent_handle); + + //The values that will be returned + size_t record_offset = distance_index.get_record_offset(node_handle); + size_t parent_record_offset; + size_t node_record_offset = distance_index.get_node_record_offset(node_handle); + size_t node_length = distance_index.minimum_length(node_handle); + bool is_reversed_in_parent; + bool parent_is_chain; + size_t prefix_sum; + size_t component; + + + if (parent_is_root && !parent_is_root_snarl) { + //If the node is a child of the root + parent_record_offset = 0; + is_reversed_in_parent = false; + parent_is_chain = false; + parent_is_root = true; + prefix_sum = 
std::numeric_limits::max(); + component = std::numeric_limits::max(); + } else if (parent_is_root_snarl) { + //The node is in a root snarl + parent_record_offset = distance_index.get_record_offset(parent_handle); + is_reversed_in_parent = false; + parent_is_chain = false; + parent_is_root = true; + prefix_sum = std::numeric_limits::max(); + component = std::numeric_limits::max(); + } else if (parent_is_simple_snarl) { + //If the node is a trivial chain in a simple snarl + //Since the actual parent was a trivial chain, the current parent_handle is the grandparent snarl + + //We actually store the greatgrandparent chain as the parent + parent_record_offset = distance_index.get_record_offset(distance_index.get_parent(parent_handle)); + is_reversed_in_parent = distance_index.is_reversed_in_parent(distance_index.get_parent(node_handle)); + is_trivial_chain = true; + parent_is_chain = true; + parent_is_root = false; + + //Remember the prefix sum value as being the distance to the start + //of the snarl - the prefix sum of the start node plus the length of the start node + //The chain component is also the same for both boundary nodes of the snarl, so remember that too + + //The start node of the simple snarl + net_handle_t snarl_start= distance_index.get_node_from_sentinel(distance_index.get_bound(parent_handle, false, false)); + prefix_sum = SnarlDistanceIndex::sum( + distance_index.get_prefix_sum_value(snarl_start), + distance_index.minimum_length(snarl_start)); + component = distance_index.get_chain_component(snarl_start); + } else if (is_trivial_chain) { + //If the node is a trivial chain in a non-simple snarl + //Since the actual parent was a trivial chain, the current parent_handle is the grandparent snarl + parent_record_offset = distance_index.get_record_offset(parent_handle); + is_reversed_in_parent = false; + parent_is_chain = false; + parent_is_root = false; + prefix_sum = std::numeric_limits::max(); + component = std::numeric_limits::max(); + } else { + //Otherwise the node is in a chain + parent_record_offset = distance_index.get_record_offset(parent_handle); + is_reversed_in_parent = distance_index.is_reversed_in_parent(node_handle); + parent_is_chain = true; + net_handle_t grandparent = distance_index.get_parent(parent_handle); + parent_is_root = distance_index.is_root(grandparent) && !distance_index.is_root_snarl(grandparent); + prefix_sum = distance_index.get_prefix_sum_value(node_handle); + component = distance_index.is_multicomponent_chain(parent_handle) ? 
distance_index.get_chain_component(node_handle) + : 0; + } + return { record_offset, + parent_record_offset, + node_record_offset, + node_length, + is_reversed_in_parent, + is_trivial_chain, + parent_is_chain, + parent_is_root, + prefix_sum, + component}; + +} + + + +constexpr gbwtgraph::Payload MIPayload::NO_CODE; +constexpr size_t MIPayload::NO_VALUE; +} + diff --git a/src/snarl_distance_index.hpp b/src/snarl_distance_index.hpp new file mode 100644 index 00000000000..33f3c6c8490 --- /dev/null +++ b/src/snarl_distance_index.hpp @@ -0,0 +1,354 @@ +#ifndef VG_SNARL_DISTANCE_HPP_INCLUDED +#define VG_SNARL_DISTANCE_HPP_INCLUDED + +#include +#include "snarls.hpp" +#include +#include "hash_map.hpp" +#include + + +namespace vg { + +using namespace sdsl; +using namespace handlegraph; +using namespace bdsg; + +//Minimum distance taking a pos instead of id/orientation/offset +size_t minimum_distance(const SnarlDistanceIndex& distance_index, pos_t pos1, pos_t pos2, + bool unoriented_distance = false, const HandleGraph* graph=nullptr); +//Maximum distance taking a pos instead of id/orientation/offset +size_t maximum_distance(const SnarlDistanceIndex& distance_index, pos_t pos1, pos_t pos2); + +//Fill in the index +//size_limit is a limit on the number of nodes in a snarl, after which the index won't store pairwise distances +void fill_in_distance_index(SnarlDistanceIndex* distance_index, const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit = 50000); + +//Fill in the temporary snarl record with distances +void populate_snarl_index(SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, + pair snarl_index, size_t size_limit, const HandleGraph* graph) ; + +SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index(const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit); + +//Define wang_hash for net_handle_t's so that we can use a hash_map +template<> struct wang_hash { +public: + inline size_t operator()(const net_handle_t& net_handle) const { + return wang_hash_64(as_integer(net_handle)); + } +}; + +//Given an alignment to a graph and a range, find the set of nodes in the +//graph for which the minimum distance from the position to any position +//in the node is within the given distance range +//If look_forward is true, then start from the start of the path forward, +//otherwise start from the end going backward +void subgraph_in_distance_range(const SnarlDistanceIndex& distance_index, const Path& path, const HandleGraph* super_graph, size_t min_distance, + size_t max_distance, std::unordered_set& subgraph, bool look_forward); +///Helper for subgraph_in_distance_range +///Given starting handles in the super graph and the distances to each handle (including the start position and +//the first position in the handle), add all nodes within the distance range, excluding nodes in seen_nodes +//traversal_start is the node that we started the search from, because we can traverse it a second time but +//we don't want to include a loop distance to any node after it +void subgraph_in_distance_range_walk_graph(const HandleGraph* super_graph, size_t min_distance, size_t max_distance, + std::unordered_set& subgraph, vector>& start_nodes, + hash_set>& seen_nodes, const pair& traversal_start); + +//Helper function for subgraph_in_distance_range +//Given a node that is a child of a chain and the distance to it, walk forward from the +//current child and add nodes to search_start_nodes +void subgraph_in_distance_range_walk_across_chain 
(const SnarlDistanceIndex& distance_index, + const HandleGraph* super_graph,std::unordered_set& subgraph, + net_handle_t current_node, size_t current_distance, + vector>& search_start_nodes, + hash_set>& seen_nodes, + const size_t& min_distane, const size_t& max_distance, bool checked_loop=false); + + +//Add nodes to the subgraph if they are near the path in the snarl tree +//Walks up the snarl tree from either end of the path, then takes everything +//in between them +void subgraph_containing_path_snarls(const SnarlDistanceIndex& distance_index, const HandleGraph* graph, const Path& path, std::unordered_set& subgraph); + + +//Helper function for subgraph_containing_path_snarls +//Add all the nodes in the parent to the subgraph +void add_descendants_to_subgraph(const SnarlDistanceIndex& distance_index, const net_handle_t& parent, std::unordered_set& subgraph); + + + +//The distance values that get stored in an MIPayload +struct MIPayloadValues{ + + //The record offset of the node + size_t record_offset; + + //The record offset of the parent + size_t parent_record_offset; + + //The node record offset of the node (eg, which node in a trivial snarl) + size_t node_record_offset; + + size_t node_length; + + //Is the node reversed in its parent + bool is_reversed; + + bool is_trivial_chain; + + bool parent_is_chain; + + bool parent_is_root; + + size_t prefix_sum; + + size_t chain_component; +}; + +/// +// The encoding of distances for positions in top-level chains +// We store this information in the minimizer index. +// +// This gets stored in two separate uint64_t's +// +// 32 bits | 32 +// record offset of node | record offset of parent +// +// 8 bits | 12 bit | 1 | 1 | 1 | 1 | 32 | 8 +// node record offset | node length | is_reversed | is trivial chain | parent is chain | parent is root | prefix sum | chain_component +// +// +// These values are en/de-coded from the raw values in the order above +// +// If no values are stored, then the two uint64_t's will both be inf +// bools are always stored, everything else is all 1's if it is not stored +// + +struct MIPayload { + typedef std::uint64_t code_type; + + + constexpr static gbwtgraph::Payload NO_CODE = gbwtgraph::Payload::default_payload(); + constexpr static std::size_t NO_VALUE = std::numeric_limits::max(); + + + //Static values for the offset from the right side of the uint64_t storing the values, the width of each value, and a bit mask for the value + const static size_t PARENT_RECORD_OFFSET = 0; + const static size_t PARENT_RECORD_WIDTH = 32; + const static code_type PARENT_RECORD_MASK = (static_cast(1) << PARENT_RECORD_WIDTH) - 1; + + const static size_t NODE_RECORD_OFFSET = 32; + const static size_t NODE_RECORD_WIDTH = 32; + const static code_type NODE_RECORD_MASK = (static_cast(1) << NODE_RECORD_WIDTH) - 1; + + + const static size_t CHAIN_COMPONENT_OFFSET = 0; + const static size_t CHAIN_COMPONENT_WIDTH = 8; + const static code_type CHAIN_COMPONENT_MASK = (static_cast(1) << CHAIN_COMPONENT_WIDTH) - 1; + + const static size_t PREFIX_SUM_OFFSET = 8; + const static size_t PREFIX_SUM_WIDTH = 32; + const static code_type PREFIX_SUM_MASK = (static_cast(1) << PREFIX_SUM_WIDTH) - 1; + + const static size_t PARENT_IS_ROOT_OFFSET = 40; + const static size_t PARENT_IS_CHAIN_OFFSET = 41; + const static size_t IS_TRIVIAL_CHAIN_OFFSET = 42; + const static size_t IS_REVERSED_OFFSET = 43; + + const static size_t NODE_LENGTH_OFFSET = 44; + const static size_t NODE_LENGTH_WIDTH = 12; + const static code_type NODE_LENGTH_MASK = (static_cast(1) << 
NODE_LENGTH_WIDTH) - 1; + + const static size_t NODE_RECORD_OFFSET_OFFSET = 56; + const static size_t NODE_RECORD_OFFSET_WIDTH = 8; + const static code_type NODE_RECORD_OFFSET_MASK = (static_cast(1) << NODE_RECORD_OFFSET_WIDTH) - 1; + + //Encode and decode from the following values: + //record offset of node, record offset of parent, node record offset, node length, is_reversed, parent is chain, prefix sum, chain_component + static gbwtgraph::Payload encode(MIPayloadValues info) { + + if ( info.record_offset > NODE_RECORD_MASK + || info.parent_record_offset > PARENT_RECORD_MASK + || info.node_record_offset > NODE_RECORD_OFFSET_MASK + || info.node_length > NODE_LENGTH_MASK + || info.prefix_sum > PREFIX_SUM_MASK + || info.chain_component > CHAIN_COMPONENT_MASK) { + //If there aren't enough bits to represent one of the values + return NO_CODE; + } + + code_type encoded1 = (static_cast(info.record_offset) << NODE_RECORD_OFFSET) + | (static_cast(info.parent_record_offset) << PARENT_RECORD_OFFSET); + + code_type encoded2 = (static_cast(info.node_record_offset) << NODE_RECORD_OFFSET_OFFSET) + | (static_cast(info.node_length) << NODE_LENGTH_OFFSET) + | (static_cast(info.is_reversed) << IS_REVERSED_OFFSET) + | (static_cast(info.is_trivial_chain) << IS_TRIVIAL_CHAIN_OFFSET) + | (static_cast(info.parent_is_chain) << PARENT_IS_CHAIN_OFFSET) + | (static_cast(info.parent_is_root) << PARENT_IS_ROOT_OFFSET) + | (static_cast(info.prefix_sum) << PREFIX_SUM_OFFSET) + | (static_cast(info.chain_component) << CHAIN_COMPONENT_OFFSET); + + return {encoded1, encoded2}; + + } + + //Set the values of a code. Mutate the given code + static void set_record_offset(gbwtgraph::Payload& code, size_t record_offset) { + //Set everything in node_record slot to 0's + code.first = code.first & ~(NODE_RECORD_MASK << NODE_RECORD_OFFSET); + //And | with the value to set it + code.first = code.first | (static_cast(record_offset) << NODE_RECORD_OFFSET); + } + static void set_parent_record_offset(gbwtgraph::Payload& code, size_t parent_record_offset) { + code.first = code.first & ~(PARENT_RECORD_MASK << PARENT_RECORD_OFFSET); + code.first = code.first | (static_cast(parent_record_offset) << PARENT_RECORD_OFFSET); + } + static void set_node_record_offset(gbwtgraph::Payload& code, size_t node_record_offset) { + code.second = code.second & ~(NODE_RECORD_OFFSET_MASK << NODE_RECORD_OFFSET_OFFSET); + code.second = code.second | (static_cast(node_record_offset) << NODE_RECORD_OFFSET_OFFSET); + } + static void set_node_length(gbwtgraph::Payload& code, size_t node_length) { + code.second = code.second & ~(NODE_LENGTH_MASK << NODE_LENGTH_OFFSET); + code.second = code.second | (static_cast(node_length) << NODE_LENGTH_OFFSET); + } + static void set_is_reversed(gbwtgraph::Payload& code, bool is_reversed) { + code.second = code.second & ~(static_cast(1) << IS_REVERSED_OFFSET); + code.second = code.second | (static_cast(is_reversed) << IS_REVERSED_OFFSET); + } + static void set_is_trivial_chain(gbwtgraph::Payload& code, bool is_trivial_chain) { + code.second = code.second & ~(static_cast(1) << IS_TRIVIAL_CHAIN_OFFSET); + code.second = code.second | (static_cast(is_trivial_chain) << IS_TRIVIAL_CHAIN_OFFSET); + } + static void set_parent_is_chain(gbwtgraph::Payload& code, bool parent_is_chain) { + code.second = code.second & ~(static_cast(1) << PARENT_IS_CHAIN_OFFSET); + code.second = code.second | (static_cast(parent_is_chain) << PARENT_IS_CHAIN_OFFSET); + } + static void set_parent_is_root(gbwtgraph::Payload& code, bool parent_is_root) { + 
code.second = code.second & ~(static_cast(1) << PARENT_IS_ROOT_OFFSET); + code.second = code.second | (static_cast(parent_is_root) << PARENT_IS_ROOT_OFFSET); + } + static void set_prefix_sum(gbwtgraph::Payload& code, size_t prefix_sum) { + code.second = code.second & ~(PREFIX_SUM_MASK << PREFIX_SUM_OFFSET); + code.second = code.second | (static_cast(prefix_sum) << PREFIX_SUM_OFFSET); + } + static void set_chain_component(gbwtgraph::Payload& code, size_t chain_component) { + code.second = code.second & ~(CHAIN_COMPONENT_MASK << CHAIN_COMPONENT_OFFSET); + code.second = code.second | (static_cast(chain_component) << CHAIN_COMPONENT_OFFSET); + } + + + //How do decode the code + static size_t record_offset(const gbwtgraph::Payload code) { + if (code == NO_CODE) { + return NO_VALUE; + } + return (size_t) (code.first >> NODE_RECORD_OFFSET & NODE_RECORD_MASK); + } + static size_t parent_record_offset(const gbwtgraph::Payload code) { + if (code == NO_CODE) { + return NO_VALUE; + } + return (size_t) (code.first >> PARENT_RECORD_OFFSET & PARENT_RECORD_MASK); + } + + static size_t node_record_offset(const gbwtgraph::Payload code) { + if (code == NO_CODE) { + return NO_VALUE; + } + return (size_t) (code.second >> NODE_RECORD_OFFSET_OFFSET & NODE_RECORD_OFFSET_MASK); + } + static size_t node_length(const gbwtgraph::Payload code) { + if (code == NO_CODE) { + return NO_VALUE; + } + return (size_t) (code.second >> NODE_LENGTH_OFFSET & NODE_LENGTH_MASK); + } + static bool is_reversed(const gbwtgraph::Payload code) { + if (code == NO_CODE) { + return false; + } + return (bool) (code.second >> IS_REVERSED_OFFSET & 1); + } + static bool is_trivial_chain (const gbwtgraph::Payload code) { + if (code == NO_CODE) { + return false; + } + return (bool) (code.second >> IS_TRIVIAL_CHAIN_OFFSET & 1); + } + static bool parent_is_chain(const gbwtgraph::Payload code) { + if (code == NO_CODE) { + return false; + } + return (bool) (code.second >> PARENT_IS_CHAIN_OFFSET & 1); + } + static bool parent_is_root (const gbwtgraph::Payload code) { + if (code == NO_CODE) { + return false; + } + return (bool) (code.second >> PARENT_IS_ROOT_OFFSET & 1); + } + static size_t prefix_sum (const gbwtgraph::Payload code) { + if (code == NO_CODE) { + return NO_VALUE; + } + return (size_t) (code.second >> PREFIX_SUM_OFFSET & PREFIX_SUM_MASK); + } + static size_t chain_component (const gbwtgraph::Payload code) { + if (code == NO_CODE) { + return NO_VALUE; + } + return (size_t) (code.second >> CHAIN_COMPONENT_OFFSET & CHAIN_COMPONENT_MASK); + } + + + + static MIPayloadValues decode(gbwtgraph::Payload code) { + if (code == NO_CODE) { + return {NO_VALUE, NO_VALUE, NO_VALUE, NO_VALUE, false, false, false, false, NO_VALUE, NO_VALUE}; + } else { + return { + record_offset(code), + parent_record_offset(code), + node_record_offset(code), + node_length(code), + is_reversed(code), + is_trivial_chain(code), + parent_is_chain(code), + parent_is_root(code), + prefix_sum(code), + chain_component(code)}; + + + } + } + +}; + +//Given a position, return distances that can be stored by a minimizer +// +//If the position is on a boundary node of a top level chain, then return true, and +//a unique identifier for the connected component that the node is on and +//the offset of the position in the root chain - the minimum distance from the beginning of the chain to +//the position +//The second bool will be false and the remaining size_t's will be 0 +// +//If the position is on a child node of a top-level simple bubble (bubble has no children and nodes connect 
only to boundaries) +//return false, 0, 0, true, and the rank of the bubble in its chain, the length of the start +//node of the snarl, the length of the end node (relative to a fd traversal of the chain), and +//the length of the node +// +//If the position is not on a root node (that is, a boundary node of a snarl in a root chain), returns +//false and MIPayload::NO_VALUE for all values +// + + +//Given a position, return the distances that can be stored by a minimizer +//record offset of node, record offset of parent, node record offset, node length, is_reversed, is_trivial_chain, parent is chain, prefix sum, chain_component +MIPayloadValues get_minimizer_distances (const SnarlDistanceIndex& distance_index, pos_t pos); + + + +} + +#endif diff --git a/src/snarl_seed_clusterer.cpp b/src/snarl_seed_clusterer.cpp new file mode 100644 index 00000000000..420cebf780f --- /dev/null +++ b/src/snarl_seed_clusterer.cpp @@ -0,0 +1,3767 @@ +#include "snarl_seed_clusterer.hpp" + +#include + +//#define DEBUG_CLUSTER +//#define debug_distances +namespace vg { + +SnarlDistanceIndexClusterer::SnarlDistanceIndexClusterer( const SnarlDistanceIndex& distance_index, const HandleGraph* graph) : + distance_index(distance_index), + graph(graph){ +}; +SnarlDistanceIndexClusterer::SnarlDistanceIndexClusterer( const SnarlDistanceIndex* distance_index, const HandleGraph* graph) : + distance_index(*distance_index), + graph(graph){ +}; +SnarlDistanceIndexClusterer::SnarlDistanceIndexClusterer( const SnarlDistanceIndex& distance_index) : + distance_index(distance_index), + graph(nullptr){ +}; +SnarlDistanceIndexClusterer::SnarlDistanceIndexClusterer( const SnarlDistanceIndex* distance_index) : + distance_index(*distance_index), + graph(nullptr){ +}; + +vector SnarlDistanceIndexClusterer::cluster_seeds (const vector& seeds, size_t read_distance_limit) const { + //Wrapper for single ended + + vector seed_caches(seeds.size()); + for (size_t i = 0 ; i < seeds.size() ; i++) { + seed_caches[i].pos = seeds[i].pos; + seed_caches[i].minimizer_cache = seeds[i].minimizer_cache; + } + vector*> all_seed_caches = {&seed_caches}; + + std::vector> all_clusters = + std::get<0>(cluster_seeds_internal(all_seed_caches, read_distance_limit, 0))[0].all_groups(); + + std::vector result; + result.reserve(all_clusters.size()); + for (auto& cluster : all_clusters) { + result.emplace_back(); + result.back().seeds = std::move(cluster); + } + //TODO: Sorting fixes determinism issues but seems unecessary + std::sort(result.begin(), result.end(), [&] (Cluster& cluster1, Cluster& cluster2) { + return cluster1.seeds.front() < cluster2.seeds.front(); + }); + + return result; +}; + +vector> SnarlDistanceIndexClusterer::cluster_seeds ( + const vector>& all_seeds, + size_t read_distance_limit, size_t fragment_distance_limit) const { + //Wrapper for paired end + + if (all_seeds.size() > 2) { + throw std::runtime_error("Clusterer: We can't handle more than paired end mapping"); + } + + //Make a vector of SeedCache that contains all the payloads + vector> all_seed_caches; + all_seed_caches.reserve(all_seeds.size()); + + for (size_t read_num = 0 ; read_num < all_seeds.size() ; read_num++) { + all_seed_caches.emplace_back(all_seeds[read_num].size()); + for (size_t i = 0 ; i < all_seeds[read_num].size() ; i++) { + all_seed_caches[read_num][i].pos = all_seeds[read_num][i].pos; + all_seed_caches[read_num][i].minimizer_cache = all_seeds[read_num][i].minimizer_cache; + } + } + vector*> seed_cache_pointers; + + seed_cache_pointers.reserve(all_seed_caches.size()); + 
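+ //Pass pointers to each read's seed caches so the clustering core can work over both reads without copying them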
+ for (vector& v : all_seed_caches) seed_cache_pointers.push_back(&v); + + //Actually cluster the seeds + auto union_finds = cluster_seeds_internal(seed_cache_pointers, read_distance_limit, fragment_distance_limit); + + vector* read_union_finds = &std::get<0>(union_finds); + structures::UnionFind* fragment_union_find = &std::get<1>(union_finds); + + std::vector> result (all_seeds.size()); + //Map the old group heads to new indices + size_t curr_index = 0; + size_t read_num_offset = 0; + hash_map old_to_new_cluster_index; + + for (size_t read_num = 0 ; read_num < read_union_finds->size() ; read_num++) { + vector> read_clusters = read_union_finds->at(read_num).all_groups(); + result[read_num].reserve(read_clusters.size()); + for (vector& cluster : read_clusters) { + result[read_num].emplace_back(); + Cluster& curr = result[read_num].back(); + curr.seeds = std::move(cluster); + } + //TODO: Sorting fixes determinism issues but seems unecessary + std::sort(result[read_num].begin(), result[read_num].end(), [&] (Cluster& cluster1, Cluster& cluster2) { + return cluster1.seeds.front() < cluster2.seeds.front(); + }); + } + for (size_t read_num = 0 ; read_num < result.size() ; read_num++) { + for (Cluster& cluster : result[read_num]) { + size_t fragment_index = read_num_offset + cluster.seeds[0]; + size_t fragment_cluster_head = fragment_union_find->find_group(fragment_index); + if (old_to_new_cluster_index.count(fragment_cluster_head) == 0) { + old_to_new_cluster_index.emplace(fragment_cluster_head, curr_index); + fragment_cluster_head = curr_index; + curr_index++; + } else { + fragment_cluster_head = old_to_new_cluster_index[fragment_cluster_head]; + } + cluster.fragment = fragment_cluster_head; + } + read_num_offset += all_seeds[read_num].size(); + } + + + return result; +} + + +tuple, structures::UnionFind> SnarlDistanceIndexClusterer::cluster_seeds_internal ( + vector*>& all_seeds, + size_t read_distance_limit, size_t fragment_distance_limit) const { + /* Given a vector of seeds and a limit, find a clustering of seeds where + * seeds that are closer than the limit cluster together. 
+ * Returns a vector of clusters + */ +#ifdef DEBUG_CLUSTER +cerr << endl << endl << endl << endl << "New cluster calculation:" << endl; +cerr << "\tread distance limit: " << read_distance_limit << " and fragment distance limit: " << fragment_distance_limit << endl; +#endif + if (fragment_distance_limit != 0 && + fragment_distance_limit < read_distance_limit) { + throw std::runtime_error("Fragment distance limit must be greater than read distance limit"); + } + + //For each level of the snarl tree, which chains at that level contain seeds + //Initially populated by get_nodes(), which adds chains whose nodes contain seeds + //Chains are added when the child snarls are found + //A ClusteringProblem will have pointers to the current and next level of the snarl tree + vector> chains_by_level; + chains_by_level.reserve(distance_index.get_max_tree_depth()+1); + + + + //This stores all the tree relationships and cluster information + //for a single level of the snarl tree as it is being processed + //It also keeps track of the parents of the current level + size_t seed_count = 0; + for (auto v : all_seeds) seed_count+= v->size(); + ClusteringProblem clustering_problem (&all_seeds, read_distance_limit, fragment_distance_limit, seed_count); + + + //Initialize chains_by_level with all the seeds on chains + //Also clusters seeds on nodes in the root or root snarls and adds them to the root snarls + get_nodes(clustering_problem, chains_by_level); + + //Initialize the tree state to the bottom level + clustering_problem.current_chains = &chains_by_level[chains_by_level.size() - 1]; + + for (int depth = chains_by_level.size() - 1 ; depth >= 0 ; depth --) { + // Go through each level of the tree, bottom up, and cluster that level. + // When we reach a level, we know all the children of the chains at that level + // Cluster each chain, assign chains to parent snarls + // Cluster each snarl, assign snarls to parent chains + // Reset the current level to point to the parent chains + +#ifdef DEBUG_CLUSTER +assert(clustering_problem.seed_count_prefix_sum[0] == 0); +for (size_t i = 1 ; i < clustering_problem.all_seeds->size() ; i++) { + assert (clustering_problem.seed_count_prefix_sum[i] + clustering_problem.all_seeds->at(i)->size() == clustering_problem.seed_count_prefix_sum[i+1]); +} +#endif + if (depth != 0) { + clustering_problem.parent_chains = &chains_by_level[depth-1]; + } + + + //Cluster all the chains at this depth + //Also records which chains are in snarls and the parents of these + //chains in clustering_problem.parent_chains + cluster_chain_level(clustering_problem, depth); + + //And cluster all the snarls, record the parents of these snarls + cluster_snarl_level(clustering_problem); + + + // Swap buffer over for the next level + clustering_problem.current_chains = clustering_problem.parent_chains; + } + //There may be some connectivity in the root, so also try to cluster in the root + cluster_root(clustering_problem); + + + +#ifdef DEBUG_CLUSTER + + cerr << "Found read clusters : " << endl; + for (size_t read_num = 0 ; read_num < clustering_problem.all_seeds->size() ; read_num++) { + cerr << "\t read num " << read_num << ": " ; + for (auto group : clustering_problem.read_union_find[read_num].all_groups()){ + cerr << "\t\t"; + for (size_t c : group) { + cerr << clustering_problem.all_seeds->at(read_num)->at(c).pos << " "; + } + cerr << endl; + } + cerr << endl; + } + vector ordered_seeds; + for (size_t i = 0 ; i < clustering_problem.all_seeds->size() ; i++) { + const auto v = 
clustering_problem.all_seeds->at(i); + for ( auto x : *v) { + ordered_seeds.push_back(x); + } + } + cerr << "Found fragment clusters : " << endl; + for (auto group : clustering_problem.fragment_union_find.all_groups()){ + cerr << "\t"; + for (size_t c : group) { + cerr << ordered_seeds[c].pos << " "; + } + cerr << endl; + } + +/* + //CHeck read clusters + for (size_t read_num = 0 ; read_num < clustering_problem.all_seeds->size() ; read_num++) { + auto all_groups = clustering_problem.read_union_find[read_num].all_groups(); + for (size_t g1 = 0 ; g1 < all_groups.size() ; g1 ++ ){ + auto group = all_groups[g1]; + structures::UnionFind uf(group.size(), false); + for (size_t i1 = 0 ; i1 < group.size() ; i1++) { + size_t c = group[i1]; + pos_t pos1 = clustering_problem.all_seeds->at(read_num)->at(c).pos; + pos_t rev1 = make_pos_t(get_id(pos1), !is_rev(pos1), distance_index.node_length(get_id(pos1)) - get_offset(pos1) - 1); + + for (size_t i2 = 0 ; i2 < i1 ; i2++) { + + size_t d = group[i2]; + + pos_t pos2 = clustering_problem.all_seeds->at(read_num)->at(d).pos; + pos_t rev2 = make_pos_t(get_id(pos2), !is_rev(pos2), distance_index.node_length(get_id(pos2))- get_offset(pos2) - 1); + size_t d1 = distance_index.min_distance(pos1, pos2); + size_t d2 = std::min(d1, distance_index.min_distance(pos1, rev2)); + size_t d3 = std::min(d2, distance_index.min_distance(rev1, rev2)); + size_t d4 = std::min(d3, distance_index.min_distance(rev1, pos2)); + if (d4 != -1 && d4 <= clustering_problem.read_distance_limit) { + + uf.union_groups(i1, i2); + } + } + for (size_t g2 = 0 ; g2 < all_groups.size() ; g2 ++) { + if (g2 != g1) { + auto group2 = all_groups[g2]; + for (size_t d : group2) { + pos_t pos2 = clustering_problem.all_seeds->at(read_num)->at(d).pos; + pos_t rev2 = make_pos_t(get_id(pos2), !is_rev(pos2), distance_index.node_length(get_id(pos2)) - get_offset(pos2) - 1); + + size_t d1 = distance_index.min_distance(pos1, pos2); + size_t d2 = std::min(d1, distance_index.min_distance(pos1, rev2)); + size_t d3 = std::min(d2, distance_index.min_distance(rev1, rev2)); + size_t d4 = std::min(d3, distance_index.min_distance(rev1, pos2)); + + assert (d4 == -1 || d4 > clustering_problem.read_distance_limit); + } + + } + } + } + if (uf.all_groups().size() != 1) { + cerr << "These should be separate clusters: " << endl; + for (auto uf_group : uf.all_groups()) { + for (size_t i : uf_group) { + size_t c = group[i]; + cerr << clustering_problem.all_seeds->at(read_num)->at(c).pos << ":" << clustering_problem.all_seeds->at(read_num)->at(c).component << ":" + << clustering_problem.all_seeds->at(read_num)->at(c).offset << ", "; + } + cerr << endl; + } + + } + assert (uf.all_groups().size() == 1); + } + } + */ + + +#endif + return make_tuple(std::move(clustering_problem.read_union_find), std::move(clustering_problem.fragment_union_find)); + +}; + + +//Go through all the seeds and assign them to their parent chains or roots +//If a node is in a chain, then assign it to its parent chain and add the parent +//chain to chains_by_level +//If a node is a child of the root or of a root snarl, then add cluster it and +//remember to cluster the root snarl +void SnarlDistanceIndexClusterer::get_nodes( ClusteringProblem& clustering_problem, vector>& chains_by_level) const { +#ifdef DEBUG_CLUSTER +cerr << "Add all seeds to nodes: " << endl; +#endif + + //This is to remember the nodes that we are going to cluster at the end of get_nodes + //these will be the nodes that are children of the root or root snarl. 
+ //All other seeds are added directly to their parent chains as children + vector nodes_to_cluster_now; + + + //Map the parent SnarlTreeNodeProblem to its depth so we don't use get_depth() as much + hash_map parent_to_depth; + parent_to_depth.reserve(clustering_problem.seed_count_prefix_sum.back()); + + + //All nodes we've already assigned + hash_set seen_nodes; + seen_nodes.reserve(clustering_problem.seed_count_prefix_sum.back()); + + for (size_t read_num = 0 ; read_num < clustering_problem.all_seeds->size() ; read_num++){ + vector* seeds = clustering_problem.all_seeds->at(read_num); + for (size_t i = 0; i < seeds->size(); i++) { + SeedCache& seed = seeds->at(i); + pos_t pos = seed.pos; + id_t id = get_id(pos); + + +#ifdef DEBUG_CLUSTER + cerr << "\t" << read_num << ":" << pos << ", "; +#endif + + + //We are going to add the seed to its parent. + //If the node is in the root, then cluster it after going through all seeds and forget about it + //If the parent is a proper chain, then add the seed directly to the parent chain + + + //Remember the new cache. We will replace it with the actual values for the seed as necessary + //cached values are: + //(0)record offset of node, (1)record offset of parent, (2)node record offset, (3)node length, (4)is_reversed, + // (5)is_trivial_chain, (6)parent is chain, (7)parent is root, (8)prefix sum, (9)chain_component + gbwtgraph::Payload old_cache = seed.minimizer_cache; + + //TODO: For now, we're either storing all values or none + bool has_cached_values = old_cache != MIPayload::NO_CODE; +#ifdef DEBUG_CLUSTER + if (has_cached_values) { + cerr << "Using cached values:" + << ", " << MIPayload::record_offset(old_cache) + << ", " << MIPayload::parent_record_offset(old_cache) + << ", " << MIPayload::node_record_offset(old_cache) + << ", " << MIPayload::node_length(old_cache) + << ", " << MIPayload::prefix_sum(old_cache) + << ", " << MIPayload::chain_component(old_cache) << endl; + } else { + cerr << "Not using cached values" << endl; + } +#endif + + + //Get the net_handle for the node the seed is on + net_handle_t node_net_handle = !has_cached_values ? 
distance_index.get_node_net_handle(id) + : distance_index.get_net_handle_from_values(MIPayload::record_offset(old_cache), + SnarlDistanceIndex::START_END, + SnarlDistanceIndex::NODE_HANDLE, + MIPayload::node_record_offset(old_cache)); + + + //Get the parent of the node + net_handle_t parent; + //If the grandparent is a root/root snarl, then make it the parent and the node a trivial chain + //because they will be clustered here and added to the root instead of being added to the + //snarl tree to be clustered + if (has_cached_values) { + if (MIPayload::is_trivial_chain(old_cache)) { + //If the node is a trivial chain, then the parent is just the node but recorded as a chain in the net handle + parent = distance_index.get_net_handle_from_values (distance_index.get_record_offset(node_net_handle), + SnarlDistanceIndex::START_END, + SnarlDistanceIndex::CHAIN_HANDLE, + MIPayload::node_record_offset(old_cache)); + if (MIPayload::parent_record_offset(old_cache) == 0) { + //If the parent offset stored in the cache is the root, then this is a trivial chain + //child of the root not in a root snarl, so remember the root as the parent and the + //trivial chain as the node + node_net_handle = parent; + parent = distance_index.get_root(); + } else if (MIPayload::parent_is_root(old_cache) && !MIPayload::parent_is_chain(old_cache)) { + //If the parent is a root snarl, then the node becomes the trivial chain + //and we get the parent root snarl from the cache + node_net_handle = parent; + parent = distance_index.get_net_handle_from_values(MIPayload::parent_record_offset(old_cache), + SnarlDistanceIndex::START_END, + SnarlDistanceIndex::ROOT_HANDLE); + } + } else if (MIPayload::parent_record_offset(old_cache) == 0) { + //The parent is just the root + parent = distance_index.get_root(); + } else if (MIPayload::parent_is_root(old_cache) && !MIPayload::parent_is_chain(old_cache)) { + //If the parent is a root snarl + parent = distance_index.get_net_handle_from_values(MIPayload::parent_record_offset(old_cache), + SnarlDistanceIndex::START_END, + SnarlDistanceIndex::ROOT_HANDLE); + } else { + //Otherwise the parent is an actual chain and we use the value from the cache + parent = distance_index.get_net_handle_from_values(MIPayload::parent_record_offset(old_cache), + SnarlDistanceIndex::START_END, + SnarlDistanceIndex::CHAIN_HANDLE); + } + } else { + parent = distance_index.start_end_traversal_of(distance_index.get_parent(node_net_handle)); + if (distance_index.is_trivial_chain(parent)){ + net_handle_t grandparent = distance_index.get_parent(parent); + if (distance_index.is_root(grandparent)){ + node_net_handle = parent; + parent = distance_index.start_end_traversal_of(grandparent); + } + } + } + +#ifdef DEBUG_CLUSTER +cerr << MIPayload::is_trivial_chain(old_cache) << " " << MIPayload::parent_is_chain(old_cache) << " " << MIPayload::parent_is_root(old_cache) << endl; +cerr << distance_index.net_handle_as_string(node_net_handle) << " parent: " << distance_index.net_handle_as_string(parent) << endl; + if (!distance_index.is_root(parent)) { + cerr << "Parent should be " << distance_index.net_handle_as_string(distance_index.start_end_traversal_of(distance_index.get_parent(node_net_handle))) << endl; + assert( distance_index.start_end_traversal_of(parent) == distance_index.start_end_traversal_of(distance_index.get_parent(node_net_handle))); + } +#endif + if (!distance_index.is_root(parent)) { + //If the parent is not the root and not a root snarl (it is a chain or trivial chain) + +#ifdef DEBUG_CLUSTER + cerr << 
"\tchild of a chain " << distance_index.net_handle_as_string(parent) << endl; +#endif + + //Add the seed to its parent + //Also update the minimizer_cache on the seed + + + + //Seed payload is: + //record offset of node, record offset of parent, node record offset, node length, is_reversed, is_trivial_chain, parent is chain, parent is root, prefix sum, chain_component + + bool is_trivial_chain = has_cached_values ? MIPayload::is_trivial_chain(old_cache) + : distance_index.is_trivial_chain(parent); + size_t prefix_sum = MIPayload::prefix_sum(old_cache); + size_t node_length = MIPayload::node_length(old_cache); + bool is_reversed_in_parent = MIPayload::is_reversed(old_cache); + + if (!has_cached_values) { + //If we didn't store information in the seed, then get it from the distance index + //and remember it in the seed's cache + + //prefix sum + prefix_sum = is_trivial_chain ? std::numeric_limits::max() + : distance_index.get_prefix_sum_value(node_net_handle); + MIPayload::set_prefix_sum(seed.minimizer_cache, prefix_sum); + + //component + MIPayload::set_chain_component(seed.minimizer_cache, + distance_index.is_multicomponent_chain(parent) + ? distance_index.get_chain_component(node_net_handle) + : 0); + + //node length + node_length = distance_index.minimum_length(node_net_handle); + MIPayload::set_node_length(seed.minimizer_cache, node_length); + + //is_reversed_in_parent + is_reversed_in_parent = is_trivial_chain ? distance_index.is_reversed_in_parent(parent) + : distance_index.is_reversed_in_parent(node_net_handle); + MIPayload::set_is_reversed(seed.minimizer_cache, is_reversed_in_parent); + + } +#ifdef DEBUG_CLUSTER + //assert(prefix_sum == (is_trivial_chain ? std::numeric_limits::max() + // : distance_index.get_prefix_sum_value(node_net_handle))); + assert(node_length == distance_index.minimum_length(node_net_handle)); + + assert(is_reversed_in_parent == (is_trivial_chain ? 
distance_index.is_reversed_in_parent(parent) + : distance_index.is_reversed_in_parent(node_net_handle))); +#endif + + //Add the parent chain or trivial chain + bool new_parent = false; + size_t depth; + if (MIPayload::is_trivial_chain(old_cache) && MIPayload::parent_is_chain(old_cache) && MIPayload::parent_is_root(old_cache)) { + //If the node is a trivial chain, and the parent we stored is a chain and root, + //then the node is in a simple snarl on the root-level chain + depth = 2; + } else if (MIPayload::parent_is_root(old_cache)) { + //If the parent is a root (or root-level chain) + depth = 1; + } else { + //Otherwise get it later from parent_node_cluster_offset_to_depth + depth = std::numeric_limits::max(); + } + new_parent = false; + if (clustering_problem.net_handle_to_node_problem_index.count(parent) == 0) { + //If we haven't seen the parent chain before, make a new SnarlTreeNodeProblem for it + new_parent = true; + if (is_trivial_chain ) { + clustering_problem.net_handle_to_node_problem_index.emplace(parent, clustering_problem.all_node_problems.size()); + clustering_problem.all_node_problems.emplace_back(parent, clustering_problem.all_seeds->size(), + clustering_problem.seed_count_prefix_sum.back(), + false, node_length, std::numeric_limits::max(), std::numeric_limits::max()); + clustering_problem.all_node_problems.back().is_trivial_chain = true; + } else { + //The parent is an actual chain + clustering_problem.net_handle_to_node_problem_index.emplace(parent, clustering_problem.all_node_problems.size()); + clustering_problem.all_node_problems.emplace_back(parent, clustering_problem.all_seeds->size(), + clustering_problem.seed_count_prefix_sum.back(), distance_index); + } + + //Get the depth from the parent if we didn't cache it + if (depth == std::numeric_limits::max()) { + depth = distance_index.get_depth(parent); + } + parent_to_depth.emplace(parent, depth); + new_parent = true; + } else { + //If we've seen the parent before, just find its index into all_node_problems and its depth + if (depth == std::numeric_limits::max()) { + depth = parent_to_depth[parent]; + } + } +#ifdef DEBUG_CLUSTER + assert(depth == distance_index.get_depth(parent)); +#endif + + + //If chains_by_level isn't big enough for this depth, resize it and reserve space at each level + if (depth+1 > chains_by_level.size()) { + size_t to_add = (depth+1) - chains_by_level.size(); + for (size_t i = 0 ; i < to_add ; i++) { + chains_by_level.emplace_back(); + chains_by_level.back().reserve(clustering_problem.seed_count_prefix_sum.back()); + } + } + + //Make sure the seed's distances are relative to the orientation in the parent + seed.distance_left = is_reversed_in_parent != is_rev(pos) ? node_length- get_offset(pos) + : get_offset(pos) + 1; + seed.distance_right = is_reversed_in_parent != is_rev(pos) ? 
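+ //distance_left/distance_right are the seed's offsets to the two ends of the node, swapped when the node is reversed in its parent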
get_offset(pos) + 1 + : node_length- get_offset(pos); + + //Add this seed to its parent cluster + SnarlTreeNodeProblem& parent_problem = clustering_problem.all_node_problems.at(clustering_problem.net_handle_to_node_problem_index.at(parent)); + parent_problem.children.emplace_back(); + parent_problem.children.back().net_handle = node_net_handle; + parent_problem.children.back().seed_indices = {read_num, i}; + parent_problem.children.back().is_seed = true; + parent_problem.children.back().has_chain_values = true; + parent_problem.children.back().chain_component = MIPayload::chain_component(seed.minimizer_cache); + parent_problem.children.back().prefix_sum = SnarlDistanceIndex::sum(seed.distance_left, + MIPayload::prefix_sum(seed.minimizer_cache)); + + + //And the parent to chains_by_level + if (new_parent) { + chains_by_level[depth].emplace_back(parent); + } + + + //If the parent is a trivial chain and not in the root, then we also stored the identity of the snarl, so add it here too + if (new_parent && has_cached_values) { + if (is_trivial_chain && !MIPayload::parent_is_root(old_cache)) { + bool grandparent_is_simple_snarl = MIPayload::parent_is_chain(old_cache); + parent_problem.has_parent_handle = true; + parent_problem.parent_net_handle = grandparent_is_simple_snarl + ? distance_index.get_net_handle_from_values(distance_index.get_record_offset(node_net_handle), + SnarlDistanceIndex::START_END, + SnarlDistanceIndex::SNARL_HANDLE, + 1) + : distance_index.get_net_handle_from_values(MIPayload::parent_record_offset(old_cache), + SnarlDistanceIndex::START_END, + SnarlDistanceIndex::SNARL_HANDLE); + + if (grandparent_is_simple_snarl) { + //If the grandparent is a simple snarl, then we also stored the identity of its parent chain, so add it here too + parent_problem.has_grandparent_handle = true; + parent_problem.grandparent_net_handle = distance_index.get_net_handle_from_values( + MIPayload::parent_record_offset(old_cache), + SnarlDistanceIndex::START_END, + SnarlDistanceIndex::CHAIN_HANDLE); + } + } else if (MIPayload::parent_is_root(old_cache) && MIPayload::parent_is_chain(old_cache) && !is_trivial_chain) { + //The parent chain is a child of the root + parent_problem.has_parent_handle = true; + parent_problem.parent_net_handle = distance_index.get_net_handle_from_values( + 0, SnarlDistanceIndex::START_END, SnarlDistanceIndex::ROOT_HANDLE); + } + } + + + } else { + //Otherwise, the parent is the root or a root snarl, and the node_net_handle is a node + + + //Get the values from the seed. Some may be infinite and need to be re-set + size_t node_length = has_cached_values ? MIPayload::node_length(old_cache) + : distance_index.minimum_length(node_net_handle); + bool is_reversed_in_parent = has_cached_values ? 
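+ //Take these from the cached payload when it was filled in, and from the distance index otherwise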
MIPayload::is_reversed(old_cache) + : distance_index.is_reversed_in_parent(node_net_handle); + + + //Create a new SnarlTreeNodeProblem for this node + bool new_node = false; + if (seen_nodes.count(id) == 0) { + new_node = true; + clustering_problem.net_handle_to_node_problem_index.emplace(node_net_handle, + clustering_problem.all_node_problems.size()); + clustering_problem.all_node_problems.emplace_back(node_net_handle, clustering_problem.all_seeds->size(), + clustering_problem.seed_count_prefix_sum.back(), + false, node_length, std::numeric_limits::max(), + std::numeric_limits::max()); + + //Remember the parent of this node, since it will be needed to remember the root snarl later + clustering_problem.all_node_problems.back().parent_net_handle = parent; + + seen_nodes.insert(id); + + } + + seed.distance_left = is_reversed_in_parent != is_rev(pos) ? node_length- get_offset(pos) : get_offset(pos) + 1; + seed.distance_right = is_reversed_in_parent != is_rev(pos) ? get_offset(pos) + 1 : node_length- get_offset(pos); + + SnarlTreeNodeProblem& node_problem = clustering_problem.all_node_problems.at(clustering_problem.net_handle_to_node_problem_index.at(node_net_handle)); + + node_problem.children.emplace_back(); + node_problem.children.back().net_handle = node_net_handle; + node_problem.children.back().seed_indices = {read_num, i}; + node_problem.children.back().is_seed = true; + node_problem.children.back().has_chain_values = true; + node_problem.children.back().chain_component = MIPayload::chain_component(seed.minimizer_cache); + node_problem.children.back().prefix_sum = SnarlDistanceIndex::sum(seed.distance_left, + MIPayload::prefix_sum(seed.minimizer_cache)); + + + + //Remember this seed as a child of the node + if (new_node) { + nodes_to_cluster_now.emplace_back(node_net_handle); + } + } + } + } + +#ifdef DEBUG_CLUSTER + cerr << endl; +#endif + + //Go through and cluster nodes that are children of the root or root snarls + for(const net_handle_t& node_net_handle : nodes_to_cluster_now) { + SnarlTreeNodeProblem& node_problem = clustering_problem.all_node_problems.at( + clustering_problem.net_handle_to_node_problem_index.at(node_net_handle)); + + //Cluster the node. 
+ //Its seeds were already added to its SnarlTreeNodeProblem as children when the seeds were assigned above
+ cluster_one_node(clustering_problem, &node_problem);
+
+ net_handle_t parent = node_problem.parent_net_handle;
+
+ if (distance_index.is_root_snarl(parent)) {
+ //If this is a root snarl, then remember it to cluster in the root
+ if (clustering_problem.net_handle_to_node_problem_index.count(parent) == 0) {
+ clustering_problem.net_handle_to_node_problem_index.emplace(parent,
+ clustering_problem.all_node_problems.size());
+ clustering_problem.all_node_problems.emplace_back(parent, clustering_problem.all_seeds->size(),
+ clustering_problem.seed_count_prefix_sum.back(), distance_index);
+ }
+ clustering_problem.root_children.emplace_back(parent, node_net_handle);
+ } else {
+ //Otherwise, just compare the single child's external connectivity
+ compare_and_combine_cluster_on_one_child(clustering_problem, &node_problem);
+ }
+
+ }
+
+ if (chains_by_level.empty()) {
+ chains_by_level.resize(1);
+ }
+}
+
+
+//Cluster all of the snarls in clustering_problem from the same depth
+//Assumes that all the children of the snarls have been clustered already; the snarls to cluster at this level are in clustering_problem.parent_snarls
+void SnarlDistanceIndexClusterer::cluster_snarl_level(ClusteringProblem& clustering_problem) const {
+
+ for (const net_handle_t& snarl_handle : clustering_problem.parent_snarls) {
+ //Go through each of the snarls at this level, cluster them,
+ //and find which chains they belong to, if any
+ SnarlTreeNodeProblem* snarl_problem = &clustering_problem.all_node_problems.at(
+ clustering_problem.net_handle_to_node_problem_index.at(snarl_handle));
+
+#ifdef DEBUG_CLUSTER
+ cerr << "Cluster one snarl " << distance_index.net_handle_as_string(snarl_problem->containing_net_handle) << endl;
+#endif
+
+ //Cluster the snarl
+ cluster_one_snarl(clustering_problem, snarl_problem);
+
+ /*Now add the snarl to its parent. Only do so if the clusters are close enough to the boundaries that it can be clustered*/
+
+ //Check the best distance of any seed to the ends of the snarl
+ //Is the distance small enough that we can cluster it with something else?
+ bool reachable_right = snarl_problem->fragment_best_right <=
+ (clustering_problem.fragment_distance_limit == 0 ? clustering_problem.read_distance_limit : clustering_problem.fragment_distance_limit);
+ bool reachable_left = snarl_problem->fragment_best_left <=
+ (clustering_problem.fragment_distance_limit == 0 ? clustering_problem.read_distance_limit : clustering_problem.fragment_distance_limit);
+
+ if (reachable_left || reachable_right) {
+
+ //Make a new SnarlTreeNodeProblem for the parent
+
+ net_handle_t snarl_parent = snarl_problem->has_parent_handle
+ ?
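+ //Use the parent handle already recorded on this problem if there is one; only ask the distance index when it was never cached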
snarl_problem->parent_net_handle + : distance_index.start_end_traversal_of(distance_index.get_parent(snarl_problem->containing_net_handle)); + bool new_parent = false; + if (clustering_problem.net_handle_to_node_problem_index.count(snarl_parent) == 0) { + new_parent = true; + clustering_problem.net_handle_to_node_problem_index.emplace(snarl_parent, + clustering_problem.all_node_problems.size()); + clustering_problem.all_node_problems.emplace_back(snarl_parent, clustering_problem.all_seeds->size(), + clustering_problem.seed_count_prefix_sum.back(), distance_index); + + //Because a new SnarlTreeNodeProblem got added, the snarl_problem pointer might have moved + SnarlTreeNodeProblem snarl_problem = clustering_problem.all_node_problems.at( + clustering_problem.net_handle_to_node_problem_index.at(snarl_handle)); + if (snarl_problem.has_grandparent_handle) { + SnarlTreeNodeProblem& parent_problem = clustering_problem.all_node_problems.at( + clustering_problem.net_handle_to_node_problem_index.at(snarl_parent)); + parent_problem.has_parent_handle = true; + parent_problem.parent_net_handle = snarl_problem.grandparent_net_handle; + } + } + SnarlTreeNodeProblem& parent_problem = clustering_problem.all_node_problems.at( + clustering_problem.net_handle_to_node_problem_index.at(snarl_parent)); + + //Add the snarl to its parent + if (distance_index.is_root(snarl_parent)) { + if(distance_index.is_root_snarl(snarl_parent)) { + //If the parent is a root snarl, then remember it to be compared in the root + clustering_problem.root_children.emplace_back(snarl_parent, snarl_handle); + } else { + //Otherwise, compare it to itself using external connectivity and forget about it since we're done + compare_and_combine_cluster_on_one_child(clustering_problem, + &clustering_problem.all_node_problems.at(clustering_problem.net_handle_to_node_problem_index.at(snarl_parent))); + } + } else { + //Add the snarl to its parent chain + parent_problem.children.emplace_back(); + parent_problem.children.back().net_handle = snarl_handle; + parent_problem.children.back().is_seed = false; + parent_problem.children.back().has_chain_values = false; + if (new_parent) { + //And the parent chain to the things to be clustered next + clustering_problem.parent_chains->emplace_back(snarl_parent); + } + } + } + +#ifdef DEBUG_CLUSTER + cerr << "\tRecording snarl " << distance_index.net_handle_as_string(snarl_handle) << " as a child of " + << distance_index.net_handle_as_string(distance_index.get_parent(snarl_handle)) << endl; +#endif + + } + clustering_problem.parent_snarls.clear(); +} + + +void SnarlDistanceIndexClusterer::cluster_chain_level(ClusteringProblem& clustering_problem, size_t depth) const { + + //Go through current_chains, which is a vector of chain, child pairs. Start by sorting by parent chain + if (clustering_problem.current_chains->empty()) { + return; + } + + + for (const net_handle_t& chain_handle : *(clustering_problem.current_chains)) { + + SnarlTreeNodeProblem* chain_problem = &clustering_problem.all_node_problems.at( + clustering_problem.net_handle_to_node_problem_index.at(chain_handle)); + + +#ifdef DEBUG_CLUSTER + cerr << "Cluster one chain " << distance_index.net_handle_as_string(chain_handle) << " with " << chain_problem->children.size() << " children" << endl; + for (auto& x : chain_problem->children) { + cerr << "\t" << distance_index.net_handle_as_string(x.net_handle) << endl; + } +#endif + + + net_handle_t parent = chain_problem->has_parent_handle + ? 
chain_problem->parent_net_handle + : distance_index.start_end_traversal_of(distance_index.get_parent(chain_handle)); + bool is_root = distance_index.is_root(parent); + bool is_root_snarl = is_root ? distance_index.is_root_snarl(parent) : false; + + //This is used to determine if we need to remember the distances to the ends of the chain, since + //for a top level chain it doesn't matter + bool is_top_level_chain = (depth == 1) && !is_root_snarl && + !distance_index.is_externally_start_start_connected(chain_handle) && + !distance_index.is_externally_start_end_connected(chain_handle) && + !distance_index.is_externally_end_end_connected(chain_handle) && + !distance_index.is_looping_chain(chain_handle); + + // Compute the clusters for the chain + cluster_one_chain(clustering_problem, chain_problem, is_top_level_chain); + + //Add the chain to its parent + if (is_root) { + //If the parent is the root, remember to cluster it + if (is_root_snarl) { + //If the parent is a root snarl, then remember it to cluster in the root + if (clustering_problem.net_handle_to_node_problem_index.count(parent) == 0) { + clustering_problem.net_handle_to_node_problem_index.emplace(parent, clustering_problem.all_node_problems.size()); + clustering_problem.all_node_problems.emplace_back(parent, clustering_problem.all_seeds->size(), + clustering_problem.seed_count_prefix_sum.back(), distance_index); + } + clustering_problem.root_children.emplace_back(parent, chain_handle); + } else if (!is_top_level_chain) { + //Otherwise, cluster it with itself using external connectivity only + //is_top_level_chain also includes external connectivity, so if it's true we don't need to check this + compare_and_combine_cluster_on_one_child(clustering_problem, chain_problem); + } + } else if (!is_top_level_chain) { + //If the parent is just a snarl + + //Remember the distances to the ends of the parent + + chain_problem->distance_start_left = + distance_index.distance_to_parent_bound(parent, true, distance_index.flip(chain_handle), + std::make_tuple(SnarlDistanceIndex::SNARL_HANDLE, + SnarlDistanceIndex::SNARL_HANDLE, + (chain_problem->is_trivial_chain ? SnarlDistanceIndex::NODE_HANDLE + : SnarlDistanceIndex::CHAIN_HANDLE), + SnarlDistanceIndex::CHAIN_HANDLE)); + + chain_problem->distance_start_right = + distance_index.distance_to_parent_bound(parent, true, chain_handle, + std::make_tuple(SnarlDistanceIndex::SNARL_HANDLE, + SnarlDistanceIndex::SNARL_HANDLE, + (chain_problem->is_trivial_chain ? SnarlDistanceIndex::NODE_HANDLE + : SnarlDistanceIndex::CHAIN_HANDLE), + SnarlDistanceIndex::CHAIN_HANDLE)); + + chain_problem->distance_end_left = + distance_index.distance_to_parent_bound(parent, false, distance_index.flip(chain_handle), + std::make_tuple(SnarlDistanceIndex::SNARL_HANDLE, + SnarlDistanceIndex::SNARL_HANDLE, + (chain_problem->is_trivial_chain ? SnarlDistanceIndex::NODE_HANDLE + : SnarlDistanceIndex::CHAIN_HANDLE), + SnarlDistanceIndex::CHAIN_HANDLE)); + + chain_problem->distance_end_right = + distance_index.distance_to_parent_bound(parent, false, chain_handle, + std::make_tuple(SnarlDistanceIndex::SNARL_HANDLE, + SnarlDistanceIndex::SNARL_HANDLE, + (chain_problem->is_trivial_chain ? 
SnarlDistanceIndex::NODE_HANDLE + : SnarlDistanceIndex::CHAIN_HANDLE), + SnarlDistanceIndex::CHAIN_HANDLE)); +#ifdef DEBUG_CLUSTER + cerr << "This child has distances to end : " << chain_problem->distance_start_left << " " << chain_problem->distance_start_right + << " " << chain_problem->distance_end_left << " " << chain_problem->distance_end_right << endl; +#endif + //And add it to its parent snarl + bool new_parent = false; + if (clustering_problem.net_handle_to_node_problem_index.count(parent) == 0) { + new_parent = true; + clustering_problem.net_handle_to_node_problem_index.emplace(parent, clustering_problem.all_node_problems.size()); + clustering_problem.all_node_problems.emplace_back(parent, clustering_problem.all_seeds->size(), + clustering_problem.seed_count_prefix_sum.back(), distance_index); + //Because a new SnarlTreeNodeProblem got added, the old chain_problem pointer might have moved + SnarlTreeNodeProblem& chain_problem = clustering_problem.all_node_problems.at( + clustering_problem.net_handle_to_node_problem_index.at(chain_handle)); + if (chain_problem.has_grandparent_handle) { + SnarlTreeNodeProblem& parent_problem = clustering_problem.all_node_problems.at( + clustering_problem.net_handle_to_node_problem_index.at(parent)); + parent_problem.has_parent_handle = true; + parent_problem.parent_net_handle = chain_problem.grandparent_net_handle; + } + } + SnarlTreeNodeProblem& parent_problem = clustering_problem.all_node_problems.at( + clustering_problem.net_handle_to_node_problem_index.at(parent)); + parent_problem.children.emplace_back(); + parent_problem.children.back().net_handle = chain_handle; + parent_problem.children.back().is_seed = false; + parent_problem.children.back().has_chain_values = false; + + + if (new_parent) { + clustering_problem.parent_snarls.emplace_back(parent); + } + + } + } +} + + +void SnarlDistanceIndexClusterer::cluster_one_node( + ClusteringProblem& clustering_problem, SnarlTreeNodeProblem* node_problem) const { +#ifdef DEBUG_CLUSTER + cerr << "Finding clusters on node " << distance_index.net_handle_as_string(node_problem->containing_net_handle) << endl; +#endif + + size_t node_length = node_problem->node_length; + + + //Sort the seeds on the node + std::sort(node_problem->children.begin(), node_problem->children.end(), + [&](const SnarlTreeNodeProblem::SnarlTreeChild& a, const SnarlTreeNodeProblem::SnarlTreeChild& b) { + return clustering_problem.all_seeds->at(a.seed_indices.first)->at(a.seed_indices.second).distance_left + < clustering_problem.all_seeds->at(b.seed_indices.first)->at(b.seed_indices.second).distance_left; + + }); + + cluster_seeds_on_linear_structure(clustering_problem, node_problem, node_length, false, false); + +#ifdef DEBUG_CLUSTER + + cerr << "\tFound read clusters on node " << distance_index.net_handle_as_string(node_problem->containing_net_handle) << endl; + + bool got_left = false; + bool got_right = false; + for (size_t read_num = 0 ; read_num < clustering_problem.all_seeds->size() ; read_num++) { + cerr << "\t for read num " << read_num << " best left: " << (read_num == 0 ? node_problem->read_best_left.first : node_problem->read_best_left.second) << " best right: " << (read_num == 0 ? 
node_problem->read_best_right.first : node_problem->read_best_right.second) << endl; + bool got_read_left=false; + bool got_read_right = false; + for (pair c : node_problem->read_cluster_heads) { + if (c.first == read_num) { + pair dists (clustering_problem.all_seeds->at(c.first)->at(c.second).distance_left, clustering_problem.all_seeds->at(c.first)->at(c.second).distance_right); + cerr << "\t\t" << c.first << ":"<at(c.first)->size() ; x++) { + if (clustering_problem.read_union_find[c.first].find_group(x) == c.second) { + cerr << clustering_problem.all_seeds->at(c.first)->at(x).pos << " "; + has_seeds = true; + } + } + assert(dists.first == std::numeric_limits::max() || dists.first >= (read_num == 0 ? node_problem->read_best_left.first : node_problem->read_best_left.second)); + ; assert(dists.second == std::numeric_limits::max() || dists.second >= (read_num == 0 ? node_problem->read_best_right.first : node_problem->read_best_right.second)); + assert(dists.first == std::numeric_limits::max() || dists.first >= node_problem->fragment_best_left); + assert(dists.second == std::numeric_limits::max() || dists.second >= node_problem->fragment_best_right); + if (dists.first == node_problem->fragment_best_left) {got_left = true;} + if (dists.second == node_problem->fragment_best_right) {got_right = true;} + if (dists.first == (read_num == 0 ? node_problem->read_best_left.first : node_problem->read_best_left.second)) {got_read_left = true;} + if (dists.second == (read_num == 0 ? node_problem->read_best_right.first : node_problem->read_best_right.second)) {got_read_right = true;} + cerr << endl; + assert(has_seeds); + } + } + //assert(got_read_left || (read_num == 0 ? node_problem->read_best_left.first : node_problem->read_best_left.second) == std::numeric_limits::max()); + //assert(got_read_right || (read_num == 0 ? node_problem->read_best_right.first : node_problem->read_best_right.second) == std::numeric_limits::max()); + } + //assert(got_left); + //assert(got_right); + for (pair group_id : node_problem->read_cluster_heads) { + assert (group_id.second == clustering_problem.read_union_find[group_id.first].find_group(group_id.second)); + } +#endif + return; + +}; + + +//Go through pairs of clusters of the two children and see which ones can be combined +//The first child may not have been seen before, so all of it's clusters may be added to the parent, then +//anything that was combined gets removed and only the cluster heads get added. 
+//For the second child, everything is already in the parent so remove ones that were combined then
+//add the head of the combined clusters
+//
+//If this is the first time we see the first child, then also update the best distances to the ends of the
+//parent for the parent clusters
+void SnarlDistanceIndexClusterer::compare_and_combine_cluster_on_child_structures(ClusteringProblem& clustering_problem, SnarlTreeNodeProblem* child_problem1,
+ SnarlTreeNodeProblem* child_problem2, SnarlTreeNodeProblem* parent_problem,
+ const vector<pair<size_t, size_t>>& child_distances, bool is_root, bool first_child) const {
+#ifdef DEBUG_CLUSTER
+ cerr << "\tCompare " << distance_index.net_handle_as_string(child_problem1->containing_net_handle)
+ << " and " << distance_index.net_handle_as_string(child_problem2->containing_net_handle)
+ << " which are children of " << distance_index.net_handle_as_string(parent_problem->containing_net_handle) << endl;
+#endif
+
+ net_handle_t& parent_handle = parent_problem->containing_net_handle;
+ net_handle_t& child_handle1 = child_problem1->containing_net_handle;
+ net_handle_t& child_handle2 = child_problem2->containing_net_handle;
+
+ //Get the distances between the two sides of the children in the parent
+ size_t distance_left_left = distance_index.distance_in_parent(parent_handle, distance_index.flip(child_handle1),
+ distance_index.flip(child_handle2), graph,
+ (clustering_problem.fragment_distance_limit == 0 ? clustering_problem.read_distance_limit
+ : clustering_problem.fragment_distance_limit));
+ size_t distance_left_right = distance_index.distance_in_parent(parent_handle, distance_index.flip(child_handle1),
+ child_handle2, graph,
+ (clustering_problem.fragment_distance_limit == 0 ? clustering_problem.read_distance_limit
+ : clustering_problem.fragment_distance_limit));
+ size_t distance_right_right = distance_index.distance_in_parent(parent_handle, child_handle1, child_handle2, graph,
+ (clustering_problem.fragment_distance_limit == 0 ? clustering_problem.read_distance_limit
+ : clustering_problem.fragment_distance_limit));
+ size_t distance_right_left = distance_index.distance_in_parent(parent_handle, child_handle1,
+ distance_index.flip(child_handle2), graph,
+ (clustering_problem.fragment_distance_limit == 0 ?
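+ //(when no fragment distance limit is set, the read distance limit bounds the search)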
clustering_problem.read_distance_limit + : clustering_problem.fragment_distance_limit)); + + //If it's the root and nothing can be clustered, return here + //If it's not the root, then we still need add the new clusters + if (is_root){ + if (distance_left_left == std::numeric_limits::max() && + distance_left_right == std::numeric_limits::max() && + distance_right_right == std::numeric_limits::max() && + distance_right_left == std::numeric_limits::max()) { + return; + } + } + + +#ifdef DEBUG_CLUSTER + cerr << "\t\tFound distances between the two children: " << distance_left_left << " " << distance_left_right << " " << distance_right_right << " " << distance_right_left << endl; + cerr << "\t\tBest left and right distances for the two children: " << child_problem1->fragment_best_left << " " << child_problem1->fragment_best_right << " and " << child_problem2->fragment_best_left << " " << child_problem2->fragment_best_right << endl; + cerr << "\t\tAnd distances from the ends of child1 to ends of parent: " << child_problem1->distance_start_left << " " + << child_problem1->distance_start_right << " " << child_problem1->distance_end_left << " " << child_problem1->distance_end_right << endl; +#endif + /* + * We're going to go through all clusters to see which can get combined. There will be up to four combined clusters (per read), + * one for each path between the two sides of the two nodes + * + * If a cluster from the first child can be combined with a cluster of the second by taking the left-right path, + * then any cluster of the second child with a right distance that is less than the distance limit - the best left distance + * of the first will be combined with the cluster with the best left distances. So in the end there will only be four combined + * clusters, one for each path + * This strategy ends up unioning a cluster with itself but it only goes through the clusters once so I think + * it's efficient + */ + + //The cluster heads that will be removed from the parent's read_cluster_heads + vector> to_erase; + + //Helper function that will compare two clusters + //Given the read num and seed_num of the cluster head, the distance to the other node side we're looking at, + //the distances to the ends of the parent for the cluster head, a reference + //to the current cluster head and distances of the potential combined cluster (pairpair<>> which will be updated if it gets combined), + //the relevant combined cluster head for the fragment + //Returns true if this cluster got combined + auto compare_and_combine_clusters = [&] (const size_t& read_num, const size_t& cluster_num, const size_t& distance_between_reads, + const size_t& distance_between_fragments, pair& old_distances, + ClusterHead& new_cluster_head_and_distances, size_t& new_cluster_head_fragment){ + if ((read_num == new_cluster_head_and_distances.read_num + && cluster_num == new_cluster_head_and_distances.cluster_num) || + ( distance_between_fragments == std::numeric_limits::max())) { + //If this is the same as the old cluster head, or the distances are infinite, + //then don't bother trying to compare + return false; + } + size_t distance_reads = SnarlDistanceIndex::minus(distance_between_reads, 1); + size_t distance_fragments = SnarlDistanceIndex::minus(distance_between_fragments, 1); + bool combined = false; + + if (distance_reads <= clustering_problem.read_distance_limit) { + //If this can be combined with the given combined cluster + if (new_cluster_head_and_distances.read_num == std::numeric_limits::max()){ + //new cluster 
head + new_cluster_head_and_distances.read_num =read_num; + new_cluster_head_and_distances.cluster_num = cluster_num; + new_cluster_head_and_distances.distance_left = old_distances.first; + new_cluster_head_and_distances.distance_right = old_distances.second; + } else { + //Combine with old cluster head + size_t new_cluster_head = clustering_problem.read_union_find.at(read_num).union_groups(cluster_num, new_cluster_head_and_distances.cluster_num); + + //Update distances + size_t new_best_left = std::min(old_distances.first, new_cluster_head_and_distances.distance_left); + size_t new_best_right = std::min(old_distances.second, new_cluster_head_and_distances.distance_right); + + //And remember new head and distances + new_cluster_head_and_distances.read_num =read_num; + new_cluster_head_and_distances.cluster_num = new_cluster_head; + new_cluster_head_and_distances.distance_left = new_best_left; + new_cluster_head_and_distances.distance_right = new_best_right; + + //Remember these distances because we might need to check them later + old_distances = make_pair(new_best_left, new_best_right); + clustering_problem.all_seeds->at(read_num)->at(cluster_num).distance_left = new_best_left; + clustering_problem.all_seeds->at(read_num)->at(cluster_num).distance_right = new_best_right; + } + //Remember to erase this cluster head + to_erase.emplace_back(read_num, cluster_num); + combined = true; + +#ifdef DEBUG_CLUSTER + cerr << "\t\t\tCombining read/cluster " << read_num << "/" << cluster_num << "... new cluster head:" << clustering_problem.all_seeds->at(read_num)->at(new_cluster_head_and_distances.cluster_num).pos << endl; + cerr << "\t\t\t\t Best distances for this cluster: " << old_distances.first << " and " << old_distances.second << endl; + cerr << "\t\t\t\t New best distances for combined cluster: " << new_cluster_head_and_distances.distance_left << " and " << new_cluster_head_and_distances.distance_right << endl; +#endif + } + if (clustering_problem.fragment_distance_limit != 0 && + distance_fragments <= clustering_problem.fragment_distance_limit ) { + //Just union the fragment + if (new_cluster_head_fragment == std::numeric_limits::max()) { + new_cluster_head_fragment =cluster_num+clustering_problem.seed_count_prefix_sum[read_num]; + } else { + new_cluster_head_fragment = clustering_problem.fragment_union_find.union_groups(cluster_num+clustering_problem.seed_count_prefix_sum[read_num], + new_cluster_head_fragment); + } +#ifdef DEBUG_CLUSTER + cerr << "\t\t\tCombining fragment" << endl; +#endif + } + return combined; + }; + /* + * Go through all clusters on the first child and see if they can be combined with clusters on the second child + */ + + //Did any cluster get combined? If it didn't, we don't need to go through the second node's clusters + bool combined_anything = false; + + + if (first_child || distance_left_left != std::numeric_limits::max() + || distance_left_right != std::numeric_limits::max() + || distance_right_left != std::numeric_limits::max() + || distance_right_right != std::numeric_limits::max()){ + //These will be the cluster heads and distances of everything combined by taking the indicated path + //one cluster head per read + //The default value will be ((inf, 0), (0,0)). 
Only the inf gets checked to see if it's a real value so I filled it in with 0 so I wouldn't have to type out inf + pair new_cluster_left_left_by_read; + pair new_cluster_left_right_by_read; + pair new_cluster_right_right_by_read; + pair new_cluster_right_left_by_read; + + //And the new cluster heads for the fragment + //These are the values of the cluster heads in the union finds, which include the values from read_index_offset + size_t new_cluster_left_left_fragment = std::numeric_limits::max(); + size_t new_cluster_left_right_fragment = std::numeric_limits::max(); + size_t new_cluster_right_right_fragment = std::numeric_limits::max(); + size_t new_cluster_right_left_fragment = std::numeric_limits::max(); + + + for (auto& child_cluster_head : child_problem1->read_cluster_heads) { + + bool combined = false; + size_t read_num = child_cluster_head.first; + ClusterHead& new_cluster_left_left = (read_num == 0 ? new_cluster_left_left_by_read.first : new_cluster_left_left_by_read.second); + ClusterHead& new_cluster_left_right = (read_num == 0 ? new_cluster_left_right_by_read.first : new_cluster_left_right_by_read.second); + ClusterHead& new_cluster_right_right = (read_num == 0 ? new_cluster_right_right_by_read.first : new_cluster_right_right_by_read.second); + ClusterHead& new_cluster_right_left = (read_num == 0 ? new_cluster_right_left_by_read.first : new_cluster_right_left_by_read.second); + size_t cluster_num = clustering_problem.read_union_find[read_num].find_group(child_cluster_head.second); + + //Distances to the ends of the child + pair distances = child_distances[child_cluster_head.second + clustering_problem.seed_count_prefix_sum[read_num]]; + + //Distances to the parent + size_t new_dist_left = std::min(SnarlDistanceIndex::sum(distances.first, child_problem1->distance_start_left), + SnarlDistanceIndex::sum(distances.second, child_problem1->distance_start_right)); + size_t new_dist_right= std::min(SnarlDistanceIndex::sum(distances.first, child_problem1->distance_end_left), + SnarlDistanceIndex::sum(distances.second, child_problem1->distance_end_right)); + + pair distances_to_parent = make_pair(new_dist_left, new_dist_right); + //If this is already in the parent, take the minimum of the parent distances + if (parent_problem->read_cluster_heads.count(make_pair(read_num, cluster_num)) > 0) { + distances_to_parent = make_pair( + std::min(new_dist_left, clustering_problem.all_seeds->at(read_num)->at(cluster_num).distance_left), + std::min(new_dist_right, clustering_problem.all_seeds->at(read_num)->at(cluster_num).distance_right)); + } + + + //Check if the left of 1 can connect with the left of 2 + combined = combined | compare_and_combine_clusters (read_num, cluster_num, + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(distances.first, distance_left_left), (read_num == 0 ? child_problem2->read_best_left.first : child_problem2->read_best_left.second)), + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(distances.first, distance_left_left), child_problem2->fragment_best_left), + distances_to_parent, new_cluster_left_left, + new_cluster_left_left_fragment); + + //Check if the left of 1 can connect with the right of 2 + combined = combined | compare_and_combine_clusters (read_num, cluster_num, + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(distances.first,distance_left_right), (read_num == 0 ? 
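+ //(read_best_left/read_best_right hold one value per read: .first for read 0, .second for read 1)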
child_problem2->read_best_right.first : child_problem2->read_best_right.second)), + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(distances.first,distance_left_right), child_problem2->fragment_best_right), + distances_to_parent, new_cluster_left_right, + new_cluster_left_right_fragment); + + //Check if the right of 1 can connect with the right of 2 + combined = combined | compare_and_combine_clusters (read_num, cluster_num, + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(distances.second,distance_right_right), (read_num == 0 ? child_problem2->read_best_right.first : child_problem2->read_best_right.second)), + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(distances.second,distance_right_right), child_problem2->fragment_best_right), + distances_to_parent, new_cluster_right_right, + new_cluster_right_right_fragment); + + //Check if the right of 1 can connect with the left of 2 + combined = combined | compare_and_combine_clusters (read_num, cluster_num, + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(distances.second,distance_right_left), (read_num == 0 ? child_problem2->read_best_left.first : child_problem2->read_best_left.second)), + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(distances.second,distance_right_left), child_problem2->fragment_best_left), + distances_to_parent, new_cluster_right_left, + new_cluster_right_left_fragment); + + //Is the distance small enough that we can cluster it with something else? + bool reachable_left = distances_to_parent.first <= + (clustering_problem.fragment_distance_limit == 0 ? clustering_problem.read_distance_limit : clustering_problem.fragment_distance_limit); + bool reachable_right = distances_to_parent.second <= + (clustering_problem.fragment_distance_limit == 0 ? clustering_problem.read_distance_limit : clustering_problem.fragment_distance_limit); + //If this cluster wasn't combined and hasn't been seen before and its reachable from other clusters, add it to the parent + if (first_child && !combined && (reachable_left || reachable_right)) { + parent_problem->read_cluster_heads.emplace(read_num, cluster_num); + clustering_problem.all_seeds->at(read_num)->at(cluster_num).distance_left = distances_to_parent.first; + clustering_problem.all_seeds->at(read_num)->at(cluster_num).distance_right = distances_to_parent.second; + } + combined_anything |= combined; + } + + if (combined_anything || new_cluster_left_left_fragment != std::numeric_limits::max() + || new_cluster_left_right_fragment != std::numeric_limits::max() + || new_cluster_right_left_fragment != std::numeric_limits::max() + || new_cluster_right_right_fragment != std::numeric_limits::max()) { + /* If anything got combined, then we have to go through the second child + * and see if any cluster can be combined with clusters on the first child + */ + for (auto& child_cluster_head : child_problem2->read_cluster_heads) { + + size_t read_num = child_cluster_head.first; + size_t cluster_num = clustering_problem.read_union_find[read_num].find_group(child_cluster_head.second); + + ClusterHead& new_cluster_left_left = (read_num == 0 ? new_cluster_left_left_by_read.first + : new_cluster_left_left_by_read.second); + ClusterHead& new_cluster_left_right = (read_num == 0 ? new_cluster_left_right_by_read.first + : new_cluster_left_right_by_read.second); + ClusterHead& new_cluster_right_right = (read_num == 0 ? new_cluster_right_right_by_read.first + : new_cluster_right_right_by_read.second); + ClusterHead& new_cluster_right_left = (read_num == 0 ? 
new_cluster_right_left_by_read.first + : new_cluster_right_left_by_read.second); + + pair distances = child_distances[child_cluster_head.second + + clustering_problem.seed_count_prefix_sum[read_num]]; + size_t new_dist_left = std::min(SnarlDistanceIndex::sum(distances.first,child_problem2->distance_start_left), + SnarlDistanceIndex::sum(distances.second,child_problem2->distance_start_right)); + size_t new_dist_right = std::min(SnarlDistanceIndex::sum(distances.first,child_problem2->distance_end_left), + SnarlDistanceIndex::sum(distances.second,child_problem2->distance_end_right)); + pair distances_to_parent = make_pair(new_dist_left, new_dist_right); + + if (parent_problem->read_cluster_heads.count(make_pair(read_num, cluster_num)) > 0) { + distances_to_parent = make_pair( + std::min(new_dist_left, clustering_problem.all_seeds->at(read_num)->at(cluster_num).distance_left), + std::min(new_dist_right, clustering_problem.all_seeds->at(read_num)->at(cluster_num).distance_right)); + } + + //Check if the left of 1 can connect with the left of 2 + compare_and_combine_clusters (read_num, cluster_num, + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(distances.first,distance_left_left), (read_num == 0 ? child_problem1->read_best_left.first : child_problem1->read_best_left.second)), + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(distances.first,distance_left_left),child_problem1->fragment_best_left), + distances_to_parent, new_cluster_left_left, new_cluster_left_left_fragment); + + //Check if the left of 1 can connect with the right of 2 + compare_and_combine_clusters (read_num, cluster_num, + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(distances.second,distance_left_right), (read_num == 0 ? child_problem1->read_best_left.first : child_problem1->read_best_left.second)), + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(distances.second,distance_left_right),child_problem1->fragment_best_left), + distances_to_parent, new_cluster_left_right, new_cluster_left_right_fragment); + + //Check if the right of 1 can connect with the right of 2 + compare_and_combine_clusters (read_num, cluster_num, + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(distances.second,distance_right_right), (read_num == 0 ? child_problem1->read_best_right.first : child_problem1->read_best_right.second)), + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(distances.second,distance_right_right),child_problem1->fragment_best_right), + distances_to_parent, new_cluster_right_right, new_cluster_right_right_fragment); + + //Check if the right of 1 can connect with the left of 2 + compare_and_combine_clusters (read_num, cluster_num, + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(distances.first,distance_right_left), (read_num == 0 ? child_problem1->read_best_right.first : child_problem1->read_best_right.second)), + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(distances.first,distance_right_left),child_problem1->fragment_best_right), + distances_to_parent, new_cluster_right_left, new_cluster_right_left_fragment); + } + + /*then remove all clusters that got erase, then add back in the cluster heads + */ + + //remove cluster heads that got combined with new ones + for (pair& cluster_head : to_erase) { + parent_problem->read_cluster_heads.erase(cluster_head); + } + + //And add back in the new cluster heads + for (size_t read_num = 0 ; read_num < clustering_problem.all_seeds->size() ; read_num++) { + ClusterHead& new_cluster_left_left = (read_num == 0 ? 
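+ //Select this read's running combined-cluster head for each of the four left/right connection paths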
new_cluster_left_left_by_read.first + : new_cluster_left_left_by_read.second); + ClusterHead& new_cluster_left_right = (read_num == 0 ? new_cluster_left_right_by_read.first + : new_cluster_left_right_by_read.second); + ClusterHead& new_cluster_right_right = (read_num == 0 ? new_cluster_right_right_by_read.first + : new_cluster_right_right_by_read.second); + ClusterHead& new_cluster_right_left = (read_num == 0 ? new_cluster_right_left_by_read.first + : new_cluster_right_left_by_read.second); + + //If the new cluster is clusterable, then add the new cluster_left_left + if (new_cluster_left_left.read_num != std::numeric_limits::max()){ + + //Check the old distances for this cluster head, because it may include the distance from another combined cluster + //head + pair old_distances = + parent_problem->read_cluster_heads.count(make_pair(new_cluster_left_left.read_num, new_cluster_left_left.cluster_num)) == 0 + ? make_pair(std::numeric_limits::max(), std::numeric_limits::max()) + : make_pair(clustering_problem.all_seeds->at(read_num)->at(new_cluster_left_left.cluster_num).distance_left, + clustering_problem.all_seeds->at(read_num)->at(new_cluster_left_left.cluster_num).distance_right); + //Is the distance small enough that we can cluster it with something else? + size_t best_left = std::min(new_cluster_left_left.distance_left, old_distances.first); + size_t best_right = std::min(new_cluster_left_left.distance_right, old_distances.second); + bool reachable_left = best_left <= + (clustering_problem.fragment_distance_limit == 0 ? clustering_problem.read_distance_limit : clustering_problem.fragment_distance_limit); + bool reachable_right = best_right <= + (clustering_problem.fragment_distance_limit == 0 ? clustering_problem.read_distance_limit : clustering_problem.fragment_distance_limit); + if ((reachable_left || reachable_right)) { + parent_problem->read_cluster_heads.emplace(new_cluster_left_left.read_num, new_cluster_left_left.cluster_num); + clustering_problem.all_seeds->at(read_num)->at(new_cluster_left_left.cluster_num).distance_left = best_left; + clustering_problem.all_seeds->at(read_num)->at(new_cluster_left_left.cluster_num).distance_right = best_right; + } else { + parent_problem->read_cluster_heads.erase(std::make_pair( new_cluster_left_left.read_num, new_cluster_left_left.cluster_num )); + } + } + //Add the new cluster_right_right + if (new_cluster_right_right.read_num != std::numeric_limits::max()){ + + + pair old_distances = parent_problem->read_cluster_heads.count(make_pair(new_cluster_right_right.read_num, new_cluster_right_right.cluster_num)) == 0 + ? make_pair(std::numeric_limits::max(), std::numeric_limits::max()) + : make_pair(clustering_problem.all_seeds->at(read_num)->at( new_cluster_right_right.cluster_num).distance_left, + clustering_problem.all_seeds->at(read_num)->at( new_cluster_right_right.cluster_num).distance_right); + size_t best_left = std::min(new_cluster_right_right.distance_left, + old_distances.first); + size_t best_right = std::min( new_cluster_right_right.distance_right, + old_distances.second); + + bool reachable_left = best_left <= + (clustering_problem.fragment_distance_limit == 0 ? clustering_problem.read_distance_limit : clustering_problem.fragment_distance_limit); + bool reachable_right = best_right <= + (clustering_problem.fragment_distance_limit == 0 ? 
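+ //Keep a combined cluster in the parent only if one of its ends can still reach something within the distance limit; otherwise drop it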
clustering_problem.read_distance_limit : clustering_problem.fragment_distance_limit); + + if ((reachable_left || reachable_right)){ + parent_problem->read_cluster_heads.emplace( new_cluster_right_right.read_num, new_cluster_right_right.cluster_num); + clustering_problem.all_seeds->at(read_num)->at(new_cluster_right_right.cluster_num).distance_left = best_left; + clustering_problem.all_seeds->at(read_num)->at(new_cluster_right_right.cluster_num).distance_right = best_right; + } else { + parent_problem->read_cluster_heads.erase(make_pair(new_cluster_right_right.read_num, new_cluster_right_right.cluster_num)); + } + } + + //Add the new cluster_left_right + if (new_cluster_left_right.read_num != std::numeric_limits::max()) { + pair old_distances = parent_problem->read_cluster_heads.count( + make_pair(new_cluster_left_right.read_num, new_cluster_left_right.cluster_num)) == 0 + ? make_pair(std::numeric_limits::max(), std::numeric_limits::max()) + : make_pair(clustering_problem.all_seeds->at(read_num)->at(new_cluster_left_right.cluster_num).distance_left, + clustering_problem.all_seeds->at(read_num)->at(new_cluster_left_right.cluster_num).distance_right); + + size_t best_left = std::min(new_cluster_left_right.distance_left, old_distances.first); + size_t best_right = std::min( new_cluster_left_right.distance_right, old_distances.second); + + bool reachable_left = best_left <= + (clustering_problem.fragment_distance_limit == 0 ? clustering_problem.read_distance_limit : clustering_problem.fragment_distance_limit); + bool reachable_right = best_right <= + (clustering_problem.fragment_distance_limit == 0 ? clustering_problem.read_distance_limit : clustering_problem.fragment_distance_limit); + + if ((reachable_left || reachable_right) ){ + + parent_problem->read_cluster_heads.emplace( new_cluster_left_right.read_num, new_cluster_left_right.cluster_num); + clustering_problem.all_seeds->at(read_num)->at( new_cluster_left_right.cluster_num).distance_left = best_left; + clustering_problem.all_seeds->at(read_num)->at(new_cluster_left_right.cluster_num).distance_right = best_right; + } else { + parent_problem->read_cluster_heads.erase(make_pair(new_cluster_left_right.read_num, new_cluster_left_right.cluster_num)); + } + } + //Add the new cluster_right_left + if ( new_cluster_right_left.read_num != std::numeric_limits::max()) { + + pair old_distances = parent_problem->read_cluster_heads.count(make_pair(new_cluster_right_left.read_num, new_cluster_right_left.cluster_num)) == 0 + ? make_pair(std::numeric_limits::max(), std::numeric_limits::max()) + : make_pair(clustering_problem.all_seeds->at(read_num)->at(new_cluster_right_left.cluster_num).distance_left, + clustering_problem.all_seeds->at(read_num)->at(new_cluster_right_left.cluster_num).distance_right); + size_t best_left = std::min(new_cluster_right_left.distance_left, old_distances.first); + size_t best_right = std::min(new_cluster_right_left.distance_right, old_distances.second); + + bool reachable_left = best_left <= + (clustering_problem.fragment_distance_limit == 0 ? clustering_problem.read_distance_limit : clustering_problem.fragment_distance_limit); + bool reachable_right = best_right <= + (clustering_problem.fragment_distance_limit == 0 ? 
clustering_problem.read_distance_limit : clustering_problem.fragment_distance_limit); + + if ((reachable_left || reachable_right)) { + parent_problem->read_cluster_heads.emplace(new_cluster_right_left.read_num, new_cluster_right_left.cluster_num); + clustering_problem.all_seeds->at(read_num)->at(new_cluster_right_left.cluster_num).distance_left = best_left; + clustering_problem.all_seeds->at(read_num)->at(new_cluster_right_left.cluster_num).distance_right = best_right; + } else { + parent_problem->read_cluster_heads.erase(make_pair(new_cluster_right_left.read_num, new_cluster_right_left.cluster_num)); + } + } + } + } + } + + + /*Update the parent's best left and right distances, only looking at the first child since we've already seen the second one + */ + if (first_child) { + //Update the parent's fragment best distances + parent_problem->fragment_best_left = std::min(parent_problem->fragment_best_left, + std::min(SnarlDistanceIndex::sum(child_problem1->distance_start_left, child_problem1->fragment_best_left), + SnarlDistanceIndex::sum(child_problem1->distance_start_right , child_problem1->fragment_best_right))); + parent_problem->fragment_best_right = std::min(parent_problem->fragment_best_right, + std::min(SnarlDistanceIndex::sum(child_problem1->distance_end_left , child_problem1->fragment_best_left), + SnarlDistanceIndex::sum(child_problem1->distance_end_right , child_problem1->fragment_best_right))); + + + //Update the best distances in the parent for each read num + for (size_t read_num = 0 ; read_num < clustering_problem.all_seeds->size() ; read_num ++) { + //Find the best distances to the ends of the parent from child1 + size_t best_start = std::min(SnarlDistanceIndex::sum(child_problem1->distance_start_left, (read_num == 0 ? child_problem1->read_best_left.first : child_problem1->read_best_left.second)), + SnarlDistanceIndex::sum(child_problem1->distance_start_right, (read_num == 0 ? child_problem1->read_best_right.first : child_problem1->read_best_right.second))); + size_t best_end = std::min(SnarlDistanceIndex::sum(child_problem1->distance_end_left, (read_num == 0 ? child_problem1->read_best_left.first : child_problem1->read_best_left.second)), + SnarlDistanceIndex::sum(child_problem1->distance_end_right, (read_num == 0 ? child_problem1->read_best_right.first : child_problem1->read_best_right.second))); + //And update the distances in the parent + if (read_num == 0) { + parent_problem->read_best_left.first = std::min(best_start, parent_problem->read_best_left.first); + parent_problem->read_best_right.first = std::min(best_end, parent_problem->read_best_right.first); + } else { + parent_problem->read_best_left.second = std::min(best_start, parent_problem->read_best_left.second); + parent_problem->read_best_right.second = std::min(best_end, parent_problem->read_best_right.second); + } + } + } + +//#ifdef DEBUG_CLUSTER +// cerr << "\tIntermediate clusters on " << distance_index.net_handle_as_string(parent_problem->containing_net_handle); +// cerr << " with best left and right values: " << parent_problem->fragment_best_left << " " +// << parent_problem->fragment_best_right << endl; +// bool got_left = false; +// bool got_right = false; +// for (size_t read_num = 0 ; read_num < clustering_problem.all_seeds->size() ; read_num++) { +// cerr << "\t\t\tfor read num " << read_num << " best left: " << (read_num == 0 ? parent_problem->read_best_left.first : parent_problem->read_best_lef.second) << " best right: " << (read_num == 0 ? 
parent_problem->read_best_right.first : parent_problem->read_best_right.second) << endl; +// for (pair, pair> c : parent_problem->read_cluster_heads) { +// if (c.first.first == read_num) { +// pair dists = c.second; +// cerr << "\t\t\t" << clustering_problem.all_seeds->at(c.first.first)->at(c.first.second).pos << " (" << c.first.first << ":"<at(c.first.first)->size() ; x++) { +// if (clustering_problem.read_union_find[c.first.first].find_group(x) == c.first.second) { +// cerr << clustering_problem.all_seeds->at(c.first.first)->at(x).pos << " "; +// } +// } +// } +// cerr << endl; +// } +// } +//#endif +} + +void SnarlDistanceIndexClusterer::compare_and_combine_cluster_on_one_child(ClusteringProblem& clustering_problem, SnarlTreeNodeProblem* child_problem) const { +#ifdef DEBUG_CLUSTER + cerr << "\tCompare " << distance_index.net_handle_as_string(child_problem->containing_net_handle) + << " to itself in the root" << endl; +#endif + + net_handle_t& handle = child_problem->containing_net_handle; + + + //Get the distances between the two sides of the child + size_t distance_left_left = distance_index.is_externally_start_start_connected(handle) ? 0 : std::numeric_limits::max(); + size_t distance_left_right = distance_index.is_externally_start_end_connected(handle) ? 0 : std::numeric_limits::max(); + size_t distance_right_right = distance_index.is_externally_end_end_connected(handle) ? 0 : std::numeric_limits::max(); + if (distance_left_left == std::numeric_limits::max() && + distance_left_right == std::numeric_limits::max() && + distance_right_right == std::numeric_limits::max()) { + //If there is no external connectivity + return; + } + +#ifdef DEBUG_CLUSTER + cerr << "\t\tFound distances between the two children: " << distance_left_left << " " << distance_left_right << " " << distance_right_right << endl; +#endif + /* + * We're going to go through all clusters to see which can get combined. There are at most three new clusters, + * because there are three possible paths between the two ends of the single node + */ + + //These will be the cluster heads of everything combined by taking the indicated path + //one cluster head per read + //pair< pair> + //The default value will be (inf, 0). 
Only the inf gets checked to see if it's a real value so I filled it in with 0 so I wouldn't have to type out inf + pair, pair> new_cluster_left_left_by_read = std::make_pair( + std::make_pair(std::numeric_limits::max(), 0), std::make_pair(std::numeric_limits::max(), 0)); + pair, pair> new_cluster_left_right_by_read = std::make_pair( + std::make_pair(std::numeric_limits::max(), 0), std::make_pair(std::numeric_limits::max(), 0)); + pair, pair> new_cluster_right_right_by_read = std::make_pair( + std::make_pair(std::numeric_limits::max(), 0), std::make_pair(std::numeric_limits::max(), 0)); + + //And the new cluster heads for the fragment + //These are the values of the cluster heads in the union finds, which include the values from read_index_offset + size_t new_cluster_left_left_fragment = std::numeric_limits::max(); + size_t new_cluster_left_right_fragment = std::numeric_limits::max(); + size_t new_cluster_right_right_fragment = std::numeric_limits::max(); + + //Helper function that will compare two clusters + //Given the read num and seed_num of the cluster head, the distance to the other node side we're looking at, + //the distances to the ends of the parent for the cluster head, a reference + //to the current cluster head and distances of the potential combined cluster (pairpair<>> which will be updated if it gets combined), + //the relevant combined cluster head for the fragment + //Returns true if this cluster got combined + auto compare_and_combine_clusters = [&] (size_t read_num, size_t cluster_num, size_t distance_between_reads, + size_t distance_between_fragments, + pair& new_cluster_head, size_t& new_cluster_head_fragment){ + + if (read_num == new_cluster_head.first && cluster_num == new_cluster_head.second) { + //If this is the same as the old cluster head, then don't bother trying to compare + return; + } + distance_between_reads = SnarlDistanceIndex::minus(distance_between_reads, 1); + distance_between_fragments = SnarlDistanceIndex::minus(distance_between_fragments, 1); + + if (distance_between_reads <= clustering_problem.read_distance_limit) { + //If this can be combined with the given combined cluster + if (new_cluster_head.first == std::numeric_limits::max()){ + //new cluster head + new_cluster_head = make_pair(read_num, cluster_num); + } else { + //Combine with old cluster head + new_cluster_head = make_pair(read_num, + clustering_problem.read_union_find.at(read_num).union_groups(cluster_num, new_cluster_head.second)); + } + //Remember to erase this cluster head + +#ifdef DEBUG_CLUSTER + cerr << "\t\t\tCombining read/cluster " << read_num << "/" << cluster_num << "... 
new cluster head:" << new_cluster_head.second << endl; +#endif + } + if (clustering_problem.fragment_distance_limit != 0 && + distance_between_fragments <= clustering_problem.fragment_distance_limit ) { + //Just union the fragment + if (new_cluster_head_fragment == std::numeric_limits::max()) { + new_cluster_head_fragment =cluster_num+clustering_problem.seed_count_prefix_sum[read_num]; + } else { + new_cluster_head_fragment = clustering_problem.fragment_union_find.union_groups(cluster_num+clustering_problem.seed_count_prefix_sum[read_num], + new_cluster_head_fragment); + } +#ifdef DEBUG_CLUSTER + cerr << "\t\t\tCombining fragment" << endl; +#endif + } + return; + }; + + /* + * Go through all clusters and see if they can be combined with anything + */ + for (auto& child_cluster_head : child_problem->read_cluster_heads) { + + size_t read_num = child_cluster_head.first; + size_t cluster_num = clustering_problem.read_union_find[read_num].find_group(child_cluster_head.second); + + pair& new_cluster_left_left = read_num == 0 ? new_cluster_left_left_by_read.first + : new_cluster_left_left_by_read.second; + pair& new_cluster_left_right = read_num == 0 ? new_cluster_left_right_by_read.first + : new_cluster_left_right_by_read.second; + pair& new_cluster_right_right = read_num == 0 ? new_cluster_right_right_by_read.first + : new_cluster_right_right_by_read.second; + + //Distances to the ends of the child + pair distances (clustering_problem.all_seeds->at(read_num)->at(child_cluster_head.second).distance_left, + clustering_problem.all_seeds->at(read_num)->at(child_cluster_head.second).distance_right); + + + //Check if this can be combined left-left + compare_and_combine_clusters (read_num, cluster_num, + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(distances.first,distance_left_left), (read_num == 0 ? child_problem->read_best_left.first : child_problem->read_best_left.second)), + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(distances.first,distance_left_left), child_problem->fragment_best_left), + new_cluster_left_left, new_cluster_left_left_fragment); + + //Check if the left of this can be combined with the right of anything else + compare_and_combine_clusters (read_num, cluster_num, + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(distances.first,distance_left_right), (read_num == 0 ? child_problem->read_best_right.first : child_problem->read_best_right.second)), + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(distances.first,distance_left_right), child_problem->fragment_best_right), + new_cluster_left_right, new_cluster_left_right_fragment); + + //Check if the right of this can be combined with the left of anything else + compare_and_combine_clusters (read_num, cluster_num, + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(distances.second,distance_left_right), (read_num == 0 ? child_problem->read_best_left.first : child_problem->read_best_left.second)), + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(distances.second,distance_left_right), child_problem->fragment_best_left), + new_cluster_left_right, new_cluster_left_right_fragment); + + //Check if this can be combined right-right + compare_and_combine_clusters (read_num, cluster_num, + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(distances.second,distance_right_right), (read_num == 0 ? 
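The helper above maintains two layers of clustering: a per-read union-find keyed by seed index within the read, and a fragment union-find over all seeds of all reads, indexed with the `seed_count_prefix_sum` offset. A toy sketch of that double bookkeeping, using a minimal union-find that only loosely imitates the real one (all values are invented):

```cpp
#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

// Minimal union-find standing in for the structure the clusterer uses.
struct UnionFind {
    std::vector<size_t> parent;
    explicit UnionFind(size_t n) : parent(n) { std::iota(parent.begin(), parent.end(), 0); }
    size_t find_group(size_t x) { return parent[x] == x ? x : parent[x] = find_group(parent[x]); }
    // Returns the head of the merged group, mirroring union_groups above.
    size_t union_groups(size_t a, size_t b) {
        a = find_group(a); b = find_group(b);
        if (a != b) parent[b] = a;
        return a;
    }
};

int main() {
    // Two reads with 3 and 2 seeds; fragment indices are offset by this
    // prefix sum, mirroring seed_count_prefix_sum in the real code.
    std::vector<size_t> seed_count_prefix_sum = {0, 3, 5};
    std::vector<UnionFind> read_union_find = {UnionFind(3), UnionFind(2)};
    UnionFind fragment_union_find(5);

    size_t read_distance_limit = 10, fragment_distance_limit = 50;

    // Hypothetical pair of cluster heads on the same read with a known distance.
    size_t read_num = 0, cluster_a = 0, cluster_b = 2, distance_between = 8;

    if (distance_between <= read_distance_limit) {
        // Close enough on the read: merge the read-level clusters.
        read_union_find[read_num].union_groups(cluster_a, cluster_b);
    }
    if (fragment_distance_limit != 0 && distance_between <= fragment_distance_limit) {
        // Fragment clustering works on the concatenated seed indexing.
        fragment_union_find.union_groups(cluster_a + seed_count_prefix_sum[read_num],
                                         cluster_b + seed_count_prefix_sum[read_num]);
    }
    std::cout << read_union_find[0].find_group(2) << " "
              << fragment_union_find.find_group(2) << std::endl;  // both report the merged head
    return 0;
}
```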
child_problem->read_best_right.first : child_problem->read_best_right.second)), + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(distances.second,distance_right_right), child_problem->fragment_best_right), + new_cluster_right_right, new_cluster_right_right_fragment); + } + +} + + +void SnarlDistanceIndexClusterer::cluster_one_snarl(ClusteringProblem& clustering_problem, SnarlTreeNodeProblem* snarl_problem) const { + //Get the clusters on this snarl, assumes that all of the snarls children have been clustered already. + + + snarl_problem->set_snarl_values(distance_index); + net_handle_t& snarl_handle = snarl_problem->containing_net_handle; + +#ifdef DEBUG_CLUSTER + cerr << "Finding clusters on snarl " << distance_index.net_handle_as_string(snarl_handle) << endl; +#endif + + + //If the snarl is a simple snarl, then there is no clustering to do because there is no path between + //the nodes. Otherwise, compare the children of the snarl + if (!distance_index.is_simple_snarl(snarl_handle)) { + //If this isn't a simple snarl + //Get the children of this snarl and their clusters + + //The old distances from clusters to the bounds of the children, since we will be updating the distances + //to represent distances to the parent + vector> child_distances (clustering_problem.seed_count_prefix_sum.back(), + make_pair(std::numeric_limits::max(), std::numeric_limits::max())); + + + for (size_t i = 0 ; i < snarl_problem->children.size() ; i++) { + //Go through each child node of the netgraph + + SnarlTreeNodeProblem& child_problem_i = clustering_problem.all_node_problems.at( + clustering_problem.net_handle_to_node_problem_index.at(snarl_problem->children[i].net_handle)); + + if (child_problem_i.fragment_best_left > (clustering_problem.fragment_distance_limit == 0 ? clustering_problem.read_distance_limit : clustering_problem.fragment_distance_limit) && + child_problem_i.fragment_best_right > (clustering_problem.fragment_distance_limit == 0 ? clustering_problem.read_distance_limit : clustering_problem.fragment_distance_limit)) { + //If everything is too far away to cluster, then skip it + continue; + } + + //This is true if this is the first time we see the outer loop's child. Used so we know if we need to calculate the distance to parents + bool first_child = true; + + //Remember the distances for this child since they will get overwritten + for (const pair& head : child_problem_i.read_cluster_heads) { + child_distances[head.second + clustering_problem.seed_count_prefix_sum[head.first]] = + make_pair(clustering_problem.all_seeds->at(head.first)->at(head.second).distance_left, + clustering_problem.all_seeds->at(head.first)->at(head.second).distance_right); + } + + for (size_t j = 0 ; j <=i ; j++){ + //Go through other child net graph nodes up to and including i + + //Get the other node and its clusters + SnarlTreeNodeProblem& child_problem_j = clustering_problem.all_node_problems.at( + clustering_problem.net_handle_to_node_problem_index.at(snarl_problem->children[j].net_handle)); + + if (child_problem_j.fragment_best_left > (clustering_problem.fragment_distance_limit == 0 ? clustering_problem.read_distance_limit : clustering_problem.fragment_distance_limit) && + child_problem_j.fragment_best_right > (clustering_problem.fragment_distance_limit == 0 ? 
clustering_problem.read_distance_limit : clustering_problem.fragment_distance_limit)) { + continue; + } + +#ifdef DEBUG_CLUSTER + cerr << "\tComparing two children of " << distance_index.net_handle_as_string(snarl_handle) << ": " + << distance_index.net_handle_as_string(child_problem_i.containing_net_handle) << " and " + << distance_index.net_handle_as_string(child_problem_j.containing_net_handle) << endl; + + + +#endif + + compare_and_combine_cluster_on_child_structures(clustering_problem, &child_problem_i, + &child_problem_j, snarl_problem, child_distances, false, first_child); + first_child = false; + } + } + } else { + //IF this is a simple snarl + + for (SnarlTreeNodeProblem::SnarlTreeChild& node_problem : snarl_problem->children) { + //Go through each child node of the netgraph and add its clusters to the snarl + SnarlTreeNodeProblem& child_problem = clustering_problem.all_node_problems.at( + clustering_problem.net_handle_to_node_problem_index.at(node_problem.net_handle)); + + //Add the cluster heads + for (auto& cluster_head : child_problem.read_cluster_heads) { + snarl_problem->read_cluster_heads.emplace(cluster_head); + } + + //Update the distances + //Because the orientation of the nodes was determined by the orientation of the chain, + //the orientation relative to the snarl is correct + for (size_t read_num = 0 ; read_num < clustering_problem.all_seeds->size() ; read_num++) { + if (read_num == 0) { + snarl_problem->read_best_left.first = std::min(snarl_problem->read_best_left.first, + child_problem.read_best_left.first); + snarl_problem->read_best_right.first = std::min(snarl_problem->read_best_right.first, + child_problem.read_best_right.first); + } else { + snarl_problem->read_best_left.second = std::min(snarl_problem->read_best_left.second, + child_problem.read_best_left.second); + snarl_problem->read_best_right.second = std::min(snarl_problem->read_best_right.second, + child_problem.read_best_right.second); + } + } + snarl_problem->fragment_best_left = std::min(snarl_problem->fragment_best_left, + child_problem.fragment_best_left); + snarl_problem->fragment_best_right = std::min(snarl_problem->fragment_best_right, + child_problem.fragment_best_right); + + + } + } + +#ifdef DEBUG_CLUSTER + cerr << "\tFound clusters on " << distance_index.net_handle_as_string(snarl_handle) << endl; + cerr << "\t with best left and right values: " << snarl_problem->fragment_best_left << " " + << snarl_problem->fragment_best_right << endl; + bool got_left = false; + bool got_right = false; + for (size_t read_num = 0 ; read_num < clustering_problem.all_seeds->size() ; read_num++) { + cerr << "\t\tfor read num " << read_num << " best left: " << (read_num == 0 ? snarl_problem->read_best_left.first : snarl_problem->read_best_left.second) + << " best right: " << (read_num == 0 ? 
snarl_problem->read_best_right.first : snarl_problem->read_best_right.second) << endl; + bool got_read_left=false; + bool got_read_right = false; + bool any_clusters = false; + for (pair c : snarl_problem->read_cluster_heads) { + if (c.first == read_num) { + any_clusters = true; + pair dists (clustering_problem.all_seeds->at(c.first)->at(c.second).distance_left, + clustering_problem.all_seeds->at(c.first)->at(c.second).distance_right); + cerr << "\t\t" << c.first << ":"<at(c.first)->size() ; x++) { + if (clustering_problem.read_union_find[c.first].find_group(x) == c.second) { + cerr << clustering_problem.all_seeds->at(c.first)->at(x).pos << " "; + has_seeds = true; + } + } + assert(dists.first == std::numeric_limits::max() || dists.first >= (read_num == 0 ? snarl_problem->read_best_left.first : snarl_problem->read_best_left.second)); + assert(dists.second == std::numeric_limits::max() || dists.second >= (read_num == 0 ? snarl_problem->read_best_right.first : snarl_problem->read_best_right.second)); + assert(dists.first == std::numeric_limits::max() || dists.first >= snarl_problem->fragment_best_left); + assert(dists.second == std::numeric_limits::max() || dists.second >= snarl_problem->fragment_best_right); + if (dists.first == snarl_problem->fragment_best_left) {got_left = true;} + if (dists.second == snarl_problem->fragment_best_right) {got_right = true;} + if (dists.first == (read_num == 0 ? snarl_problem->read_best_left.first : snarl_problem->read_best_left.second)) {got_read_left = true;} + if (dists.second == (read_num == 0 ? snarl_problem->read_best_right.first : snarl_problem->read_best_right.second)) {got_read_right = true;} + cerr << endl; + //assert(has_seeds); + } + } + //assert(!any_clusters ||got_read_left || (read_num == 0 ? snarl_problem->read_best_left.first : snarl_problem->read_best_left.second) == std::numeric_limits::max()); + //assert(!any_clusters ||got_read_right || (read_num == 0 ? snarl_problem->read_best_right.first : snarl_problem->read_best_right.second) == std::numeric_limits::max()); + } + //assert(got_left); + //assert(got_right); + + //for (pair, pair> group_id : snarl_problem->read_cluster_heads) { + // assert (group_id.first.second == clustering_problem.read_union_find[group_id.first.first].find_group(group_id.first.second)); + //} +#endif +}; + + + +void SnarlDistanceIndexClusterer::cluster_one_chain(ClusteringProblem& clustering_problem, + SnarlTreeNodeProblem* chain_problem, bool is_top_level_chain) const { +#ifdef DEBUG_CLUSTERS + assert(distance_index.is_chain(chain_problem->containing_net_handle)); + //if (only_seeds) { + // for (auto child : children_in_chain) { + // assert(!std::get<3>(child)); + // } + //} else { + // bool is_only_seeds = true; + // for (auto child : children_in_chain) { + // if (std::get<3>(child)) { + // is_only_seeds=false; + // } + // } + // assert(!is_only_seeds); + //} +#endif + + //First, sort the children of the chain + //If there is only one child, check if it's a seeed + bool only_seeds=chain_problem->children.size() == 1 ? 
distance_index.is_node(chain_problem->children.front().net_handle) + : true; + + std::sort(chain_problem->children.begin(), chain_problem->children.end(), + [&] (SnarlTreeNodeProblem::SnarlTreeChild& child1, SnarlTreeNodeProblem::SnarlTreeChild& child2) { + if (!child1.is_seed || !child2.is_seed) { + only_seeds = false; + } + if (!child1.is_seed && !child1.has_chain_values) { + //If child1 is a snarl and hasn't had its values set yet + child1.chain_component = clustering_problem.all_node_problems.at( + clustering_problem.net_handle_to_node_problem_index.at(child1.net_handle)).chain_component_start; + child1.prefix_sum = clustering_problem.all_node_problems.at( + clustering_problem.net_handle_to_node_problem_index.at(child1.net_handle)).prefix_sum_value; + } + if (!child2.is_seed && !child2.has_chain_values) { + //If child2 is a snarl and hasn't had its values set yet + child2.chain_component = clustering_problem.all_node_problems.at( + clustering_problem.net_handle_to_node_problem_index.at(child2.net_handle)).chain_component_start; + child2.prefix_sum = clustering_problem.all_node_problems.at( + clustering_problem.net_handle_to_node_problem_index.at(child2.net_handle)).prefix_sum_value; + } + if (child1.chain_component != child2.chain_component) { + return child1.chain_component < child2.chain_component; + } else if (child1.prefix_sum == child2.prefix_sum) { + return distance_index.is_ordered_in_chain(child1.net_handle, child2.net_handle); + } else { + return child1.prefix_sum < child2.prefix_sum; + } + }); + + net_handle_t& chain_handle = chain_problem->containing_net_handle; + + + if (!chain_problem->is_trivial_chain && ! is_top_level_chain) { + //If we need it, get the values from the distance index: + //is_looping_chain, node_length, the end boundary node, and the end component + //THese only get used if we need the distances to the ends of the chain + chain_problem->set_chain_values(distance_index); + } + + + if (only_seeds && !chain_problem->is_looping_chain && + (chain_problem->chain_component_end == 0 + || chain_problem->chain_component_end == std::numeric_limits::max())) { + //If there are only seeds in the chain (and the chain doesn't loop and isn't a multicomponent chain), + //then cluster by walking through the seeds + //This also does the work of clustering a trivial chain (which is just a node), which should be the same amount of work as using cluster_one_node + + cluster_seeds_on_linear_structure(clustering_problem, chain_problem, chain_problem->node_length, + !distance_index.is_trivial_chain(chain_handle), is_top_level_chain); + +#ifdef DEBUG_CLUSTER + cerr << "\tFound clusters on " << distance_index.net_handle_as_string(chain_handle) << endl; + cerr << "\t with best left and right values: " << chain_problem->fragment_best_left << " " + << chain_problem->fragment_best_right << endl; + bool got_left = false; + bool got_right = false; + for (size_t read_num = 0 ; read_num < clustering_problem.all_seeds->size() ; read_num++) { + cerr << "\t for read num " << read_num << " best left: " << (read_num == 0 ? chain_problem->read_best_left.first : chain_problem->read_best_left.second) << " best right: " << (read_num == 0 ? 
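The sort above orders the chain's children first by the chain component they lie in, then by prefix sum, falling back to the distance index's own chain ordering only on exact ties. A toy illustration of that ordering with a simplified child struct (the field names only loosely follow `SnarlTreeChild`, and the values are invented):

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// Simplified stand-in for SnarlTreeNodeProblem::SnarlTreeChild: just the two
// keys the comparator in cluster_one_chain sorts on.
struct Child {
    size_t chain_component;  // which component of a (possibly multicomponent) chain
    size_t prefix_sum;       // distance from the start of that component
    char   label;            // for printing only
};

int main() {
    std::vector<Child> children = {
        {1, 4, 'd'}, {0, 9, 'c'}, {0, 2, 'b'}, {0, 0, 'a'},
    };
    std::sort(children.begin(), children.end(), [](const Child& a, const Child& b) {
        // Earlier chain components come first; within a component, the child
        // closer to the component start comes first. The real comparator
        // breaks exact prefix-sum ties with distance_index.is_ordered_in_chain.
        if (a.chain_component != b.chain_component) {
            return a.chain_component < b.chain_component;
        }
        return a.prefix_sum < b.prefix_sum;
    });
    for (const Child& c : children) std::cout << c.label;  // prints "abcd"
    std::cout << std::endl;
    return 0;
}
```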
chain_problem->read_best_right.first : chain_problem->read_best_right.second) << endl; + bool got_read_left=false; + bool got_read_right = false; + bool any_clusters = false; + for (pair c : chain_problem->read_cluster_heads) { + if (c.first == read_num) { + any_clusters = true; + pair dists (clustering_problem.all_seeds->at(c.first)->at(c.second).distance_left, + clustering_problem.all_seeds->at(c.first)->at(c.second).distance_right); + cerr << "\t\t" << c.first << ":"<at(c.first)->size() ; x++) { + if (clustering_problem.read_union_find[c.first].find_group(x) == c.second) { + cerr << clustering_problem.all_seeds->at(c.first)->at(x).pos << " "; + has_seeds = true; + } + } + //assert(dists.first == std::numeric_limits::max() || dists.first >= (read_num == 0 ? chain_problem->read_best_left.first : chain_problem->read_best_left.second)); + //assert(dists.second == std::numeric_limits::max() || dists.second >= (read_num == 0 ? chain_problem->read_best_right.first : chain_problem->read_best_right.second)); + //assert(dists.first == std::numeric_limits::max() || dists.first >= chain_problem->fragment_best_left); + //assert(dists.second == std::numeric_limits::max() || dists.second >= chain_problem->fragment_best_right); + if (dists.first == chain_problem->fragment_best_left) {got_left = true;} + if (dists.second == chain_problem->fragment_best_right) {got_right = true;} + if (dists.first == (read_num == 0 ? chain_problem->read_best_left.first : chain_problem->read_best_left.second)) {got_read_left = true;} + if (dists.second == (read_num == 0 ? chain_problem->read_best_right.first : chain_problem->read_best_right.second)) {got_read_right = true;} + cerr << endl; + //assert(has_seeds); + } + } + //assert(!any_clusters ||got_read_left || (read_num == 0 ? chain_problem->read_best_left.first : chain_problem->read_best_left.second) == std::numeric_limits::max()); + //assert(!any_clusters ||got_read_right || (read_num == 0 ? chain_problem->read_best_right.first : chain_problem->read_best_right.second) == std::numeric_limits::max()); + } + //assert(got_left); + //assert(got_right); + + for (pair group_id : chain_problem->read_cluster_heads) { + //assert (group_id.first.second == clustering_problem.read_union_find[group_id.first.first].find_group(group_id.first.second)); + } +#endif + return; + + } + + +#ifdef DEBUG_CLUSTER + cerr << "Cluster chain " << distance_index.net_handle_as_string(chain_handle) << endl; + cerr << "\t chain has " << chain_problem->children.size() << " children" << endl; +#endif + + /*Go through the chain child by child + * + * As we walk through the chain, keep track of all clusters found up to the current child. 
+ * So after we saw a child, the chain knows all clusters with distances up to the right side of the child + * + * For each child, + * - check if the clusters in the child can be combined with each other by walking out then back in through the chain + * - For snarls, update distances to the ends of the child (taking into account the distances to loop around in the chain) + * - compare and combine with clusters of the chain that get build as we walk along it + * + * - after combining clusters of the current child, remove redundant cluster heads from the chain clusters + * + * As we walk along the chain, we maintain clusters of the chain up to the last node we saw (the later + * boundary node of a snarl relative to the chain) + * + * The clusters in the chain have left distances to the beginning of the chain and right distances + * to the last thing we saw + */ + + + //These are the things that we update as we walk through the chain: Information about the last child we saw + //Initialized to the first child in the chain + + //The last child we saw + SnarlTreeNodeProblem::SnarlTreeChild& last_child = chain_problem->children.front(); + + //And values we need to save from the last child + //If the last child is a snarl, get it from the SnarlTreeNodeProblem otherwise from the seed's cache + size_t last_prefix_sum = last_child.is_seed + ? clustering_problem.all_seeds->at(last_child.seed_indices.first)->at(last_child.seed_indices.second).distance_left + : clustering_problem.all_node_problems.at( + clustering_problem.net_handle_to_node_problem_index.at(last_child.net_handle)).chain_component_start; + size_t last_length = last_child.is_seed + ? MIPayload::node_length(clustering_problem.all_seeds->at(last_child.seed_indices.first)->at(last_child.seed_indices.second).minimizer_cache) + : clustering_problem.all_node_problems.at( + clustering_problem.net_handle_to_node_problem_index.at(last_child.net_handle)).node_length; + size_t last_chain_component_end = last_child.is_seed + ? 
MIPayload::chain_component(clustering_problem.all_seeds->at(last_child.seed_indices.first)->at(last_child.seed_indices.second).minimizer_cache) + : clustering_problem.all_node_problems.at( + clustering_problem.net_handle_to_node_problem_index.at(last_child.net_handle)).chain_component_start; + + //These are clusters that we don't want to consider as we walk through the chain but that + //we want to remember after we're done with the chain because the left distance is small + vector cluster_heads_to_add_again; + + //For remembering the best left distances of the chain, we only need to check for the smallest chain distance left + //for the children up to the first node + bool found_first_node = false; + pair found_first_node_by_read = std::make_pair(false, false); + + + for (size_t child_i = 0 ; child_i < chain_problem->children.size() ; child_i ++) { + /* + * Snarls and nodes are in the order that they are traversed in the chain + * For each child, compare all of the clusters on the child to clusters of the chain so far + * The clusters of the chain have right distances up to the end of the last child seen + */ + + SnarlTreeNodeProblem::SnarlTreeChild& child = chain_problem->children[child_i]; + + if (!child.is_seed){ + + //If this is a snarl, then cluster the children here + add_snarl_to_chain_problem(clustering_problem, chain_problem, last_child, last_prefix_sum, last_length, + last_chain_component_end, cluster_heads_to_add_again, found_first_node, found_first_node_by_read, + child, child_i == 0, child_i == chain_problem->children.size() - 1, is_top_level_chain); + } else { + + add_seed_to_chain_problem(clustering_problem, chain_problem, last_child, last_prefix_sum, last_length, + last_chain_component_end, cluster_heads_to_add_again, found_first_node, found_first_node_by_read, + child, child_i == 0, child_i == chain_problem->children.size() - 1, is_top_level_chain); + } + +#ifdef DEBUG_CLUSTER + cerr << "\tintermediate clusters on " << distance_index.net_handle_as_string(chain_handle) << " after child " << distance_index.net_handle_as_string(child.net_handle) << endl; + cerr << "\t with best left and right values: " << chain_problem->fragment_best_left << " " + << chain_problem->fragment_best_right << endl; + bool got_left = false; + bool got_right = false; + for (size_t read_num = 0 ; read_num < clustering_problem.all_seeds->size() ; read_num++) { + cerr << "\t for read num " << read_num << " best left: " << (read_num == 0 ? chain_problem->read_best_left.first : chain_problem->read_best_left.second) << " best right: " << (read_num == 0 ? 
chain_problem->read_best_right.first : chain_problem->read_best_right.second) << endl; + bool got_read_left=false; + bool got_read_right = false; + bool any_clusters = false; + for (pair c : chain_problem->read_cluster_heads) { + if (c.first == read_num) { + any_clusters = true; + pair dists (clustering_problem.all_seeds->at(c.first)->at(c.second).distance_left, + clustering_problem.all_seeds->at(c.first)->at(c.second).distance_right); + cerr << "\t\t" << c.first << ":"<at(c.first)->size() ; x++) { + if (clustering_problem.read_union_find[c.first].find_group(x) == c.second) { + cerr << clustering_problem.all_seeds->at(c.first)->at(x).pos << " "; + has_seeds = true; + } + } + cerr << endl; + } + } + } + vector ordered_seeds; + for (size_t i = 0 ; i < clustering_problem.all_seeds->size() ; i++) { + const auto v = clustering_problem.all_seeds->at(i); + for ( auto x : *v) { + ordered_seeds.push_back(x); + } + } + cerr << "Found intermediate fragment clusters : " << endl; + for (auto group : clustering_problem.fragment_union_find.all_groups()){ + cerr << "\t"; + for (size_t c : group) { + cerr << ordered_seeds[c].pos << " "; + } + cerr << endl; + } +#endif + } + //Add back clusters we skipped + for (auto& cluster_head : cluster_heads_to_add_again) { + chain_problem->read_cluster_heads.emplace(cluster_head.read_num, cluster_head.cluster_num); + clustering_problem.all_seeds->at(cluster_head.read_num)->at(cluster_head.cluster_num).distance_left = cluster_head.distance_left; + clustering_problem.all_seeds->at(cluster_head.read_num)->at(cluster_head.cluster_num).distance_right = cluster_head.distance_right; + chain_problem->fragment_best_left = std::min(chain_problem->fragment_best_left, cluster_head.distance_left); + if (cluster_head.read_num == 0) { + chain_problem->read_best_left.first = std::min(chain_problem->read_best_left.first, cluster_head.distance_left); + } else { + chain_problem->read_best_left.second = std::min(chain_problem->read_best_left.second, cluster_head.distance_left); + } + + } + + + //If the chain loops, then we also have to compare the first thing we saw to the last things + + if (chain_problem->is_looping_chain){ +#ifdef DEBUG_CLUSTER + cerr << "Check connectivity around a looping chain" << endl; + cerr << "\tFound clusters on " << distance_index.net_handle_as_string(chain_handle) << endl; + cerr << "\t with best left and right values: " << chain_problem->fragment_best_left << " " + << chain_problem->fragment_best_right << endl; + bool got_left = false; + bool got_right = false; + for (size_t read_num = 0 ; read_num < clustering_problem.all_seeds->size() ; read_num++) { + cerr << "\t for read num " << read_num << " best left: " << (read_num == 0 ? chain_problem->read_best_left.first : chain_problem->read_best_left.second) << " best right: " << (read_num == 0 ? 
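The `cluster_heads_to_add_again` vector filled during the walk holds chain clusters that drop out of reach on the right side but are still close to the chain's left end; the loop above restores them once the whole chain has been processed. A schematic of that bookkeeping, simplified relative to the real per-child logic inside `add_seed_to_chain_problem`/`add_snarl_to_chain_problem` (all numbers invented):

```cpp
#include <cstddef>
#include <iostream>
#include <limits>
#include <vector>

constexpr size_t INF = std::numeric_limits<size_t>::max();

// Simplified stand-in for the ClusterHead entries stored in cluster_heads_to_add_again.
struct ClusterHead {
    size_t read_num, cluster_num;
    size_t distance_left, distance_right;
};

int main() {
    size_t read_distance_limit = 10;

    // Chain clusters seen so far; right distances are measured to the last child.
    std::vector<ClusterHead> chain_clusters = {
        {0, 0, 3, 40},   // unreachable to the right, but close to the chain start
        {0, 1, 25, 2},   // still active on the right side
    };

    std::vector<ClusterHead> still_active;
    std::vector<ClusterHead> cluster_heads_to_add_again;
    for (const ClusterHead& head : chain_clusters) {
        if (head.distance_right <= read_distance_limit) {
            // Can still combine with later children: keep it in play.
            still_active.push_back(head);
        } else if (head.distance_left <= read_distance_limit) {
            // Too far to the right, but reachable from the chain's left end:
            // set the stale right distance aside and re-add it after the walk.
            ClusterHead saved = head;
            saved.distance_right = INF;
            cluster_heads_to_add_again.push_back(saved);
        }
    }
    std::cout << "active: " << still_active.size()
              << ", saved to add back after the chain: "
              << cluster_heads_to_add_again.size() << std::endl;
    return 0;
}
```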
chain_problem->read_best_right.first : chain_problem->read_best_right.second) << endl; + bool got_read_left=false; + bool got_read_right = false; + bool any_clusters = false; + for (pair c : chain_problem->read_cluster_heads) { + if (c.first == read_num) { + any_clusters = true; + pair dists (clustering_problem.all_seeds->at(c.first)->at(c.second).distance_left, + clustering_problem.all_seeds->at(c.first)->at(c.second).distance_right); + cerr << "\t\t" << c.first << ":"<at(c.first)->size() ; x++) { + if (clustering_problem.read_union_find[c.first].find_group(x) == c.second) { + cerr << clustering_problem.all_seeds->at(c.first)->at(x).pos << " "; + has_seeds = true; + } + } + //assert(dists.first == std::numeric_limits::max() || dists.first >= (read_num == 0 ? chain_problem->read_best_left.first : chain_problem->read_best_left.second)); + //assert(dists.second == std::numeric_limits::max() || dists.second >= (read_num == 0 ? chain_problem->read_best_right.first : chain_problem->read_best_right.second)); + //assert(dists.first == std::numeric_limits::max() || dists.first >= chain_problem->fragment_best_left); + //assert(dists.second == std::numeric_limits::max() || dists.second >= chain_problem->fragment_best_right); + if (dists.first == chain_problem->fragment_best_left) {got_left = true;} + if (dists.second == chain_problem->fragment_best_right) {got_right = true;} + if (dists.first == (read_num == 0 ? chain_problem->read_best_left.first : chain_problem->read_best_left.second)) {got_read_left = true;} + if (dists.second == (read_num == 0 ? chain_problem->read_best_right.first : chain_problem->read_best_right.second)) {got_read_right = true;} + cerr << endl; + //assert(has_seeds); + } + } + //assert(!any_clusters ||got_read_left || (read_num == 0 ? chain_problem->read_best_left.first : chain_problem->read_best_left.second) == std::numeric_limits::max()); + //assert(!any_clusters ||got_read_right || (read_num == 0 ? chain_problem->read_best_right.first : chain_problem->read_best_right.second) == std::numeric_limits::max()); + } + //assert(got_left); + //assert(got_right); + + for (pair group_id : chain_problem->read_cluster_heads) { + //assert (group_id.first.second == clustering_problem.read_union_find[group_id.first.first].find_group(group_id.first.second)); + } +#endif + pair combined_cluster_by_read = std::make_pair(std::numeric_limits::max(), std::numeric_limits::max()); + size_t combined_cluster_fragment = std::numeric_limits::max(); + for (auto& cluster_head : chain_problem->read_cluster_heads) { + size_t read_num = cluster_head.first; + size_t cluster_num=cluster_head.second; + size_t dist_left = clustering_problem.all_seeds->at(read_num)->at(cluster_num).distance_left; + size_t dist_right = clustering_problem.all_seeds->at(read_num)->at(cluster_num).distance_right; + + size_t distance_between_left_right =SnarlDistanceIndex::minus(SnarlDistanceIndex::sum(dist_left, (read_num == 0 ? chain_problem->read_best_right.first : chain_problem->read_best_right.second)), 1); + size_t distance_between_right_left = SnarlDistanceIndex::minus( SnarlDistanceIndex::sum(dist_right, (read_num == 0 ? chain_problem->read_best_left.first : chain_problem->read_best_left.second)), 1); + if (distance_between_left_right <= clustering_problem.read_distance_limit + || distance_between_right_left <= clustering_problem.read_distance_limit) { + //If we can combine the read + size_t& combined_cluster_num = (read_num == 0 ? 
combined_cluster_by_read.first : combined_cluster_by_read.second); + if (combined_cluster_num == std::numeric_limits::max()) { + combined_cluster_num = cluster_num; + } else { + combined_cluster_num = clustering_problem.read_union_find[read_num].union_groups( + combined_cluster_num, cluster_num); + } + } + size_t distance_between_left_right_fragment = SnarlDistanceIndex::minus(SnarlDistanceIndex::sum(dist_left, chain_problem->fragment_best_right) ,1); + size_t distance_between_right_left_fragment = SnarlDistanceIndex::minus(SnarlDistanceIndex::sum(dist_right, chain_problem->fragment_best_left) ,1); + if (clustering_problem.fragment_distance_limit != 0 && + (distance_between_left_right_fragment <= clustering_problem.fragment_distance_limit + || distance_between_right_left_fragment <= clustering_problem.fragment_distance_limit)) { + + if (combined_cluster_fragment != std::numeric_limits::max()) { + combined_cluster_fragment = clustering_problem.fragment_union_find.union_groups(combined_cluster_fragment, + cluster_num + clustering_problem.seed_count_prefix_sum[read_num]); + } else { + combined_cluster_fragment = cluster_num + clustering_problem.seed_count_prefix_sum[read_num]; + } + } + } + + } + +#ifdef DEBUG_CLUSTER + cerr << "\tFound clusters on " << distance_index.net_handle_as_string(chain_handle) << endl; + cerr << "\t with best left and right values: " << chain_problem->fragment_best_left << " " + << chain_problem->fragment_best_right << endl; + bool got_left = false; + bool got_right = false; + for (size_t read_num = 0 ; read_num < clustering_problem.all_seeds->size() ; read_num++) { + cerr << "\t for read num " << read_num << " best left: " << (read_num == 0 ? chain_problem->read_best_left.first : chain_problem->read_best_left.second) << " best right: " << (read_num == 0 ? chain_problem->read_best_right.first : chain_problem->read_best_right.second) << endl; + bool got_read_left=false; + bool got_read_right = false; + bool any_clusters = false; + for (pair c : chain_problem->read_cluster_heads) { + if (c.first == read_num) { + any_clusters = true; + pair dists (clustering_problem.all_seeds->at(c.first)->at(c.second).distance_left, + clustering_problem.all_seeds->at(c.first)->at(c.second).distance_right); + cerr << "\t\t" << c.first << ":"<at(c.first)->size() ; x++) { + if (clustering_problem.read_union_find[c.first].find_group(x) == c.second) { + cerr << clustering_problem.all_seeds->at(c.first)->at(x).pos << " "; + has_seeds = true; + } + } + //assert(dists.first == std::numeric_limits::max() || dists.first >= (read_num == 0 ? chain_problem->read_best_left.first : chain_problem->read_best_left.second)); + //assert(dists.second == std::numeric_limits::max() || dists.second >= (read_num == 0 ? chain_problem->read_best_right.first : chain_problem->read_best_right.second)); + //assert(dists.first == std::numeric_limits::max() || dists.first >= chain_problem->fragment_best_left); + //assert(dists.second == std::numeric_limits::max() || dists.second >= chain_problem->fragment_best_right); + if (dists.first == chain_problem->fragment_best_left) {got_left = true;} + if (dists.second == chain_problem->fragment_best_right) {got_right = true;} + if (dists.first == (read_num == 0 ? chain_problem->read_best_left.first : chain_problem->read_best_left.second)) {got_read_left = true;} + if (dists.second == (read_num == 0 ? 
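For a looping chain, the pass above additionally tries to join clusters across the chain's shared start/end boundary: a cluster's left distance plus the best right distance for that read (or for the fragment), with the usual −1 adjustment, is compared against the limit. A small numeric illustration of that check (the values and limit are invented; the real code goes through `SnarlDistanceIndex::sum`/`minus` so unreachable values propagate):

```cpp
#include <cstddef>
#include <iostream>

int main() {
    size_t read_distance_limit = 15;

    // Hypothetical cluster on a looping chain: its left distance to the chain
    // start is 4, and the best right distance of any cluster for this read is 7.
    size_t dist_left = 4;
    size_t read_best_right = 7;

    // Crossing the shared boundary of the loop joins a left distance to a
    // right distance, minus 1 before comparing against the limit.
    size_t distance_between_left_right = dist_left + read_best_right - 1;

    if (distance_between_left_right <= read_distance_limit) {
        std::cout << "combine across the loop boundary (distance "
                  << distance_between_left_right << ")" << std::endl;
    }
    return 0;
}
```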
chain_problem->read_best_right.first : chain_problem->read_best_right.second)) {got_read_right = true;} + cerr << endl; + //assert(has_seeds); + } + } + //assert(!any_clusters ||got_read_left || (read_num == 0 ? chain_problem->read_best_left.first : chain_problem->read_best_left.second) == std::numeric_limits::max()); + //assert(!any_clusters ||got_read_right || (read_num == 0 ? chain_problem->read_best_right.first : chain_problem->read_best_right.second) == std::numeric_limits::max()); + } + //assert(got_left); + //assert(got_right); + + for (pair group_id : chain_problem->read_cluster_heads) { + //assert (group_id.first.second == clustering_problem.read_union_find[group_id.first.first].find_group(group_id.first.second)); + } +#endif +} +void SnarlDistanceIndexClusterer::add_seed_to_chain_problem(ClusteringProblem& clustering_problem, SnarlTreeNodeProblem* chain_problem, + SnarlTreeNodeProblem::SnarlTreeChild& last_child, + size_t& last_prefix_sum, size_t& last_length, size_t& last_chain_component_end, + vector& cluster_heads_to_add_again, + bool& found_first_node, pair& found_first_node_by_read, + const SnarlTreeNodeProblem::SnarlTreeChild& current_child, bool is_first_child, + bool is_last_child, bool skip_distances_to_ends) const { + + size_t read_num = current_child.seed_indices.first; + size_t cluster_num = current_child.seed_indices.second; + net_handle_t& chain_handle = chain_problem->containing_net_handle; + SeedCache& current_child_seed = clustering_problem.all_seeds->at(read_num)->at(cluster_num); + /* + Get a bunch of distances from the current child that will be used to calculate distance + from the last child + */ + +#ifdef DEBUG_CLUSTER + cerr << "At child seed " << current_child_seed.pos << endl; +#endif + //The distance from the right side of the last child to the left side of this child + //(relative to the orientation of the chain + size_t distance_from_last_child_to_current_child = std::numeric_limits::max(); + if (!is_first_child) { + //If this isn't the first child we're looking at + if (last_child.net_handle == current_child.net_handle) { + //This can happen if the last thing was also a seed on the same node + distance_from_last_child_to_current_child = 0; + } else if ( last_chain_component_end == MIPayload::chain_component(current_child_seed.minimizer_cache)) { + //If this child is in the same component as the last one + if (last_length == std::numeric_limits::max()) { + //If the last length is infinite, then is must be a snarl that is not start-end reachable, so the distance + //from the last child is the same as the distance from the start of the chain (the start of this compnent) + distance_from_last_child_to_current_child = MIPayload::prefix_sum(current_child_seed.minimizer_cache); + } else { + size_t distance_from_chain_start_to_last_node = SnarlDistanceIndex::sum(last_prefix_sum,last_length); + + //Distance is the current node's prefix sum minus the distance from the start of the chain to the last node + distance_from_last_child_to_current_child = SnarlDistanceIndex::minus(MIPayload::prefix_sum(current_child_seed.minimizer_cache), + distance_from_chain_start_to_last_node); + } + } + } + + + + //The distance to add to get to the end of the chain. 
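The gap between the previous child and the current one is derived purely from chain prefix sums: the distance from the chain start to the far side of the last child is `last_prefix_sum + last_length`, and subtracting that from the current child's prefix sum gives the stretch of chain in between (provided both children sit in the same chain component). A worked sketch of that computation with made-up coordinates:

```cpp
#include <cassert>
#include <cstddef>

int main() {
    // Hypothetical chain layout, all in chain component 0:
    //   last child:    starts at prefix sum 10, length 5 -> ends at offset 15
    //   current child: starts at prefix sum 22
    size_t last_prefix_sum = 10, last_length = 5;
    size_t current_prefix_sum = 22;

    // Distance from the chain start to the right side of the last child.
    size_t distance_from_chain_start_to_last_node = last_prefix_sum + last_length;

    // The chain sequence separating the two children.
    size_t distance_from_last_child_to_current_child =
        current_prefix_sum - distance_from_chain_start_to_last_node;

    assert(distance_from_last_child_to_current_child == 7);
    return 0;
}
```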
Only matters if this is the last thing in the chain + //The distances will include the distance to the end of a trivial chain, + //so we can't rely on distance_in_parent to know when the distance should be 0 + + size_t distance_from_current_end_to_end_of_chain; + if (!is_last_child || skip_distances_to_ends) { + //If this isn't the last child in the chain, then we only want the distance to the end of the current child + + distance_from_current_end_to_end_of_chain = 0; + } else if (SnarlDistanceIndex::get_record_offset(current_child.net_handle) == SnarlDistanceIndex::get_record_offset(chain_problem->end_in)) { + //If this is the last node in the chain + if (chain_problem->chain_component_end != MIPayload::chain_component(current_child_seed.minimizer_cache)) { + //If they aren't in the same component + distance_from_current_end_to_end_of_chain = std::numeric_limits::max(); + } else { + distance_from_current_end_to_end_of_chain = 0; + } + } else if (chain_problem->chain_component_end != MIPayload::chain_component(current_child_seed.minimizer_cache)) { + //If they aren't in the same component + distance_from_current_end_to_end_of_chain = std::numeric_limits::max(); + } else { + + //Length of the chain - (prefix sum + node length of the current node) + distance_from_current_end_to_end_of_chain = SnarlDistanceIndex::minus(chain_problem->node_length, + SnarlDistanceIndex::sum(MIPayload::prefix_sum(current_child_seed.minimizer_cache), + MIPayload::node_length(current_child_seed.minimizer_cache))); + + } + +#ifdef DEBUG_CLUSTER + cerr << "\tDistance from last child to this one: " << distance_from_last_child_to_current_child << endl; + cerr << "\tDistance from start of chain to the left side of this one: " << (MIPayload::chain_component(current_child_seed.minimizer_cache) != 0 ? std::numeric_limits::max() : MIPayload::prefix_sum(current_child_seed.minimizer_cache)) << endl; + cerr << "\tDistance to get to the end of the chain: " << distance_from_current_end_to_end_of_chain << endl; +#endif + + + if (last_child.net_handle != current_child.net_handle && + SnarlDistanceIndex::sum(distance_from_last_child_to_current_child, chain_problem->fragment_best_right) + > (clustering_problem.fragment_distance_limit == 0 ? clustering_problem.read_distance_limit : clustering_problem.fragment_distance_limit)) { +#ifdef DEBUG_CLUSTER + cerr << "This child is too far away from the last one to cluster anything" << endl; +#endif + //If the distance from the last cluster is too far to cluster anything + if (!skip_distances_to_ends) { + for (auto& cluster_head : chain_problem->read_cluster_heads) { + //For each of the chain clusters, remember the ones that are still reachable from the left side of the chain + size_t dist_left = clustering_problem.all_seeds->at(cluster_head.first)->at(cluster_head.second).distance_left; + if (dist_left <= (clustering_problem.fragment_distance_limit == 0 ? 
clustering_problem.read_distance_limit : clustering_problem.fragment_distance_limit)) { + //If this cluster can be clustered outside of the chain, remember to add it back + cluster_heads_to_add_again.emplace_back(); + cluster_heads_to_add_again.back().read_num = cluster_head.first; + cluster_heads_to_add_again.back().cluster_num = cluster_head.second; + cluster_heads_to_add_again.back().distance_left = dist_left; + cluster_heads_to_add_again.back().distance_right = std::numeric_limits::max(); + } + } + } + + //Now clear the chain's list of clusters + chain_problem->read_cluster_heads.clear(); + + + //Update the distances stored in the seed to reach the ends of the chain + //The distance left and right of the seed are currently oriented relative to the chain + + //The current left distance is infinite if it is not in the first component of a multicomponent chain + if (MIPayload::chain_component(current_child_seed.minimizer_cache) != 0) { + //If this node isn't in the first component of the chain + current_child_seed.distance_left = std::numeric_limits::max(); + } else { + //Prefix sum + offset of the seed in the node + current_child_seed.distance_left = SnarlDistanceIndex::sum(current_child_seed.distance_left, + MIPayload::prefix_sum(current_child_seed.minimizer_cache)); + } + current_child_seed.distance_right = SnarlDistanceIndex::sum(current_child_seed.distance_right, + distance_from_current_end_to_end_of_chain); + + //Add the cluster to the chain + chain_problem->read_cluster_heads.emplace(read_num, cluster_num); + + //Update the best distances on the chain + if (!found_first_node) { + chain_problem->fragment_best_left = std::min(chain_problem->fragment_best_left, current_child_seed.distance_left); + } + if (! (read_num == 0 ? found_first_node_by_read.first : found_first_node_by_read.second)) { + (read_num == 0 ? chain_problem->read_best_left.first : chain_problem->read_best_left.second) = std::min((read_num == 0 ? chain_problem->read_best_left.first : chain_problem->read_best_left.second), current_child_seed.distance_left); + } + //Since this child is a seed on a node, it's right distance will be the best one for the chain so far + chain_problem->fragment_best_right = current_child_seed.distance_right; + (read_num == 0 ? 
chain_problem->read_best_right.first : chain_problem->read_best_right.second) = current_child_seed.distance_right; + + //Also update the best right distances to the end of this node for the other read + //Since it was too far away from this node, it will be too far away from anything else and it can just be infinite + for (size_t chain_read_num = 0 ; chain_read_num < clustering_problem.all_seeds->size() ; chain_read_num++) { + if (chain_read_num != read_num) { + if (chain_read_num == 0) { + chain_problem->read_best_right.first = std::numeric_limits::max(); + } else { + chain_problem->read_best_right.second = std::numeric_limits::max(); + } + } + } + + } else { + //Otherwise, check to see if anything on the current child can be combined with + //anything in the chain thus far + + //The distance from the right side of the last child to the right side of this child, which is + //the distance we need to update the chain clusters to the end of this child + //This isn't quite right for the first thing in the chain but it doesn't matter because it only + //gets added to chain clusters + //IF it gets calculated, then it's the distance from the last child to this node + the length + //of this node (the first value in the cache) + size_t distance_from_last_child_to_current_end = + distance_from_last_child_to_current_child == std::numeric_limits::max() + ? std::numeric_limits::max() : + (last_child.net_handle == current_child.net_handle ? 0 + : SnarlDistanceIndex::sum(distance_from_last_child_to_current_child, MIPayload::node_length(current_child_seed.minimizer_cache))); + + //The new distances from this child to the start of the chain and the end of this child (or the end of the chain if it's the last child) + //Left distance is the prefix sum (or inf if the node isn't in the first component of the chain) + offset of seed in node + //Right distance is the right offst of the seed in the node + the distance from the end of the node to the end of the chain + // (or 0 if it isn't the last thing in the chain) + pair new_distances = make_pair( + MIPayload::chain_component(current_child_seed.minimizer_cache) != 0 ? 
std::numeric_limits::max() + : SnarlDistanceIndex::sum(current_child_seed.distance_left, + MIPayload::prefix_sum(current_child_seed.minimizer_cache)), + SnarlDistanceIndex::sum(current_child_seed.distance_right, distance_from_current_end_to_end_of_chain)); + + + //Cluster heads to remove because they got combined with the current seed + vector> to_remove; + //And the new cluster containing the current seed, and possibly anything that gets combined with it + ClusterHead new_cluster = {read_num, cluster_num, new_distances.first, new_distances.second}; + + /**Go through the clusters on the chain up to this point and see if anything can + be combined with the clusters on the child + Also update the distances of the chain clusters to reach the end of this node + */ + for (auto& chain_cluster_head : chain_problem->read_cluster_heads) { + //Each has distances up to the previous node + + const size_t chain_cluster_read_num = chain_cluster_head.first; + const size_t chain_cluster_cluster_num = chain_cluster_head.second; + + //The distances of the chain cluster + pair chain_cluster_distances (clustering_problem.all_seeds->at(chain_cluster_read_num)->at(chain_cluster_cluster_num).distance_left, + clustering_problem.all_seeds->at(chain_cluster_read_num)->at(chain_cluster_cluster_num).distance_right); + + + //The distance between the current seed and the current chain cluster + size_t distance_between = SnarlDistanceIndex::minus( + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(chain_cluster_distances.second, + distance_from_last_child_to_current_child), + current_child_seed.distance_left), + 1); + if (!is_first_child && last_child.net_handle == current_child.net_handle) { + //If the last child was the same as this child (seeds on the same node), + //then the distances right are including the current node, so subtract + //the length of this node + distance_between -= MIPayload::node_length(current_child_seed.minimizer_cache); + } + +#ifdef DEBUG_CLUSTER + cerr << "\t\t Compare this seed " << read_num << ":" << cluster_num << " with distance between: " << distance_between << endl; +#endif + + + if (chain_cluster_read_num == read_num && distance_between <= clustering_problem.read_distance_limit) { +#ifdef DEBUG_CLUSTER + cerr << "\t\tCombine chain cluster " << read_num << ":" << chain_cluster_cluster_num << endl; +#endif + //Union the two clusters and remember the new cluster head + new_cluster.cluster_num = clustering_problem.read_union_find.at(read_num).union_groups(chain_cluster_cluster_num, cluster_num); + + //Find the best distances of the two. 
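The distance used to decide whether a chain cluster joins the current seed is assembled from three pieces: the chain cluster's right distance (to the end of the previous child), the gap along the chain to the current child, and the seed's left offset in its node, again minus 1. A small numeric sketch of the per-cluster decision (values and the limit are invented; the real code uses the saturating `SnarlDistanceIndex::sum`/`minus` so unreachable values stay unreachable):

```cpp
#include <cstddef>
#include <iostream>

int main() {
    size_t read_distance_limit = 12;

    // Hypothetical numbers: the chain cluster reaches to within 3 of the end of
    // the previous child, the chain gap to the current child is 6, and the seed
    // sits 2 into its node.
    size_t chain_cluster_right = 3;
    size_t distance_from_last_child_to_current_child = 6;
    size_t seed_left = 2;

    size_t distance_between =
        chain_cluster_right + distance_from_last_child_to_current_child + seed_left - 1;

    if (distance_between <= read_distance_limit) {
        std::cout << "union the chain cluster with the current seed's cluster (distance "
                  << distance_between << ")" << std::endl;
    } else {
        std::cout << "too far: drop the chain cluster, re-adding it later if its left "
                     "distance is still within the limit" << std::endl;
    }
    return 0;
}
```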
The best right distance will always be the current seed's distance + //And remember the new combined cluster head + new_cluster.distance_left = std::min(new_cluster.distance_left, chain_cluster_distances.first); + + to_remove.emplace_back(chain_cluster_read_num, chain_cluster_cluster_num); + + //Try to union the fragment + if (clustering_problem.fragment_distance_limit != 0 && distance_between <= clustering_problem.fragment_distance_limit) { + clustering_problem.fragment_union_find.union_groups(cluster_num + clustering_problem.seed_count_prefix_sum[read_num], + chain_cluster_cluster_num + clustering_problem.seed_count_prefix_sum[chain_cluster_read_num]); + } + + } else if (clustering_problem.fragment_distance_limit != 0 && distance_between <= clustering_problem.fragment_distance_limit) { + //If we can union the fragments, then union them and keep the cluster around, updating the right distance + + clustering_problem.fragment_union_find.union_groups(cluster_num + clustering_problem.seed_count_prefix_sum[read_num], + chain_cluster_cluster_num + clustering_problem.seed_count_prefix_sum[chain_cluster_read_num]); + + clustering_problem.all_seeds->at(chain_cluster_read_num)->at(chain_cluster_cluster_num).distance_right = + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(chain_cluster_distances.second, + distance_from_last_child_to_current_end), + distance_from_current_end_to_end_of_chain); + } else { + //If this chain cluster doesn't get combined, then it is too far away to combine with anything later in the chain, + //so we remove it but remember to add it again if the left distance is small enough + + if (chain_cluster_distances.first <= + (clustering_problem.fragment_distance_limit == 0 ? clustering_problem.read_distance_limit : clustering_problem.fragment_distance_limit) + && !skip_distances_to_ends) { + //If the current chain cluster can still be reached from the left + clustering_problem.all_seeds->at(chain_cluster_read_num)->at(chain_cluster_cluster_num).distance_right = std::numeric_limits::max(); + cluster_heads_to_add_again.emplace_back(chain_cluster_read_num, chain_cluster_cluster_num, + chain_cluster_distances.first, std::numeric_limits::max()); + } + to_remove.emplace_back(chain_cluster_read_num, chain_cluster_cluster_num); + } + + } + + //Remove all chain clusters that got combined with the current seed + for (pair& cluster_head : to_remove) { + chain_problem->read_cluster_heads.erase(cluster_head); + } + + //Add the cluster of the current seed which may or may not have been combined + chain_problem->read_cluster_heads.emplace(new_cluster.read_num, new_cluster.cluster_num); + clustering_problem.all_seeds->at(new_cluster.read_num)->at(new_cluster.cluster_num).distance_left = new_cluster.distance_left; + clustering_problem.all_seeds->at(new_cluster.read_num)->at(new_cluster.cluster_num).distance_right = new_cluster.distance_right; + + + //Update the best distances + //Only update the left distances if we haven't seen a node in the chain yet + if (!found_first_node) { + chain_problem->fragment_best_left = std::min(chain_problem->fragment_best_left, new_distances.first); + } + + //If we haven't found the first node for this read + if (!(read_num == 0 ? found_first_node_by_read.first : found_first_node_by_read.second)){ + //Update the best left distance + (read_num == 0 ? chain_problem->read_best_left.first : chain_problem->read_best_left.second) = std::min((read_num == 0 ? 
chain_problem->read_best_left.first : chain_problem->read_best_left.second), new_distances.first); + } + + //Since this is a node, the best right distance will be this distance + chain_problem->fragment_best_right = new_distances.second; + (read_num == 0 ? chain_problem->read_best_right.first : chain_problem->read_best_right.second) = new_distances.second; + + //Also update the best right distances to the end of this node for clusters of the other read + for (size_t chain_read_num = 0 ; chain_read_num < clustering_problem.all_seeds->size() ; chain_read_num++) { + if (chain_read_num != read_num) { + if (chain_read_num == 0) { + chain_problem->read_best_right.first = SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( + chain_problem->read_best_right.first, + distance_from_last_child_to_current_end), + distance_from_current_end_to_end_of_chain); + } else { + chain_problem->read_best_right.second = SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( + chain_problem->read_best_right.second, + distance_from_last_child_to_current_end), + distance_from_current_end_to_end_of_chain); + } + } + } + } + + found_first_node = true; + if (read_num == 0) { + found_first_node_by_read.first = true; + } else { + found_first_node_by_read.second = true; + } + + + //Update the last node we saw to this one + last_child = current_child; + last_prefix_sum = MIPayload::prefix_sum(current_child_seed.minimizer_cache); + last_length = MIPayload::node_length(current_child_seed.minimizer_cache); + last_chain_component_end = MIPayload::chain_component(current_child_seed.minimizer_cache); + +} + +void SnarlDistanceIndexClusterer::add_snarl_to_chain_problem(ClusteringProblem& clustering_problem, SnarlTreeNodeProblem* chain_problem, + SnarlTreeNodeProblem::SnarlTreeChild& last_child, + size_t& last_prefix_sum, size_t& last_length, size_t& last_chain_component_end, + vector& cluster_heads_to_add_again, + bool& found_first_node, pair& found_first_node_by_read, + const SnarlTreeNodeProblem::SnarlTreeChild& current_child, bool is_first_child, + bool is_last_child, bool skip_distances_to_ends) const { + + /*Define a helper function to update the distances in a child using the loop distances + * in the chain + */ + auto update_distances_on_same_child = [&] (SnarlTreeNodeProblem& child_problem) { + //Distance to go forward (relative to the child) in the chain and back + if (child_problem.loop_left == std::numeric_limits::max() && child_problem.loop_right == std::numeric_limits::max()) { + return; + } + + + //Combined clusters in case we can combine anything + vector, pair>> combined_left (clustering_problem.all_seeds->size(), + make_pair(make_pair(std::numeric_limits::max(), 0), make_pair(0,0))); + vector, pair>> combined_right (clustering_problem.all_seeds->size(), + make_pair(make_pair(std::numeric_limits::max(), 0), make_pair(0,0))); + size_t combined_fragment_left = std::numeric_limits::max(); + size_t combined_fragment_right = std::numeric_limits::max(); + vector> to_erase; + + for (auto& child_cluster_head : child_problem.read_cluster_heads) { + //Go through each of the clusters on this child + size_t read_num = child_cluster_head.first; + size_t cluster_num = child_cluster_head.second; + size_t old_left = clustering_problem.all_seeds->at(read_num)->at(cluster_num).distance_left; + size_t old_right = clustering_problem.all_seeds->at(read_num)->at(cluster_num).distance_right; + //Get the new best distances for the cluster considering chain loops + size_t updated_left = std::min(old_left, 
SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(old_right, child_problem.loop_right), child_problem.node_length)); + size_t updated_right = std::min(old_right, SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(old_left, child_problem.loop_left), child_problem.node_length)); + + + + if (updated_left < old_left || updated_right < old_right ) { + //Update the distances + clustering_problem.all_seeds->at(read_num)->at(cluster_num).distance_left = updated_left; + clustering_problem.all_seeds->at(read_num)->at(cluster_num).distance_right = updated_right; + + child_problem.fragment_best_left = std::min(child_problem.fragment_best_left, + updated_left); + child_problem.fragment_best_right = std::min(child_problem.fragment_best_right, + updated_right); + (read_num == 0 ? child_problem.read_best_left.first : child_problem.read_best_left.second) = std::min((read_num == 0 ? child_problem.read_best_left.first : child_problem.read_best_left.second),updated_left); + (read_num == 0 ? child_problem.read_best_right.first : child_problem.read_best_right.second) = std::min((read_num == 0 ? child_problem.read_best_right.first : child_problem.read_best_right.second), + updated_right); + } + + //Now see if we can combine this cluster with anything else + //The distance between this cluster and anything else taking the left loop + size_t distance_between_left = SnarlDistanceIndex::minus( + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(updated_left, + child_problem.loop_left), + (read_num == 0 ? child_problem.read_best_left.first : child_problem.read_best_left.second)), + 1); + size_t distance_between_right = SnarlDistanceIndex::minus( + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(updated_right, + child_problem.loop_right), + (read_num == 0 ? child_problem.read_best_right.first : child_problem.read_best_right.second)), + 1); + size_t distance_between_left_fragment = SnarlDistanceIndex::minus( + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(updated_left, + child_problem.loop_left), + child_problem.fragment_best_left), + 1); + size_t distance_between_right_fragment = SnarlDistanceIndex::minus( + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(updated_right, + child_problem.loop_right), + child_problem.fragment_best_right), + 1); + pair cluster_head = make_pair(read_num, cluster_num); + if (distance_between_left <= clustering_problem.read_distance_limit) { + //Combine it left + to_erase.emplace_back(cluster_head); + if (combined_left[read_num].first.first == std::numeric_limits::max()){ + combined_left[read_num] = make_pair(cluster_head, + make_pair(updated_left, updated_right)); + } else { + to_erase.emplace_back(combined_left[read_num].first); + combined_left[read_num] = make_pair( + make_pair(read_num, + clustering_problem.read_union_find.at(read_num).union_groups(cluster_num, + combined_left[read_num].first.second)), + make_pair(std::min(updated_left, combined_left[read_num].second.first), + std::min(updated_right, combined_left[read_num].second.second))); + + } + } + if (distance_between_right <= clustering_problem.read_distance_limit) { + //Combine it right + to_erase.emplace_back(cluster_head); + if (combined_right[read_num].first.first == std::numeric_limits::max()){ + combined_right[read_num] = make_pair(cluster_head, make_pair(updated_left, updated_right)); + } else { + to_erase.emplace_back(combined_right[read_num].first); + combined_right[read_num] =make_pair( + make_pair(read_num, + clustering_problem.read_union_find.at(read_num).union_groups(cluster_num, + combined_right[read_num].first.second)), 
+ make_pair(std::min(updated_left, combined_right[read_num].second.first), + std::min(updated_right, combined_right[read_num].second.second))); + + } + } + if (clustering_problem.fragment_distance_limit != 0 && + distance_between_left_fragment <= clustering_problem.fragment_distance_limit) { + //Combine the fragment + if (combined_fragment_left != std::numeric_limits::max()) { + combined_fragment_left = clustering_problem.fragment_union_find.union_groups(combined_fragment_left, + cluster_num + clustering_problem.seed_count_prefix_sum[read_num]); + } else { + combined_fragment_left = cluster_num + clustering_problem.seed_count_prefix_sum[read_num]; + } + } + if (clustering_problem.fragment_distance_limit != 0 && + distance_between_right_fragment <= clustering_problem.fragment_distance_limit) { + //Combine the fragment + if (combined_fragment_right != std::numeric_limits::max()) { + combined_fragment_right = clustering_problem.fragment_union_find.union_groups(combined_fragment_right, + cluster_num + clustering_problem.seed_count_prefix_sum[read_num]); + } else { + combined_fragment_right = cluster_num + clustering_problem.seed_count_prefix_sum[read_num]; + } + } + } + for (pair& cluster_head : to_erase) { + child_problem.read_cluster_heads.erase(cluster_head); + } + //Add new clusters that were combined + for (pair,pair>& cluster : combined_left) { + if (cluster.first.first != std::numeric_limits::max()){ + child_problem.read_cluster_heads.emplace(cluster.first); + clustering_problem.all_seeds->at(cluster.first.first)->at(cluster.first.second).distance_left = cluster.second.first; + clustering_problem.all_seeds->at(cluster.first.first)->at(cluster.first.second).distance_right = cluster.second.second; + } + } + for (pair, pair>& cluster : combined_right) { + if (cluster.first.first != std::numeric_limits::max()){ + child_problem.read_cluster_heads.emplace(cluster.first); + clustering_problem.all_seeds->at(cluster.first.first)->at(cluster.first.second).distance_left = cluster.second.first; + clustering_problem.all_seeds->at(cluster.first.first)->at(cluster.first.second).distance_right = cluster.second.second; + } + } + }; + + + net_handle_t& chain_handle = chain_problem->containing_net_handle; + SnarlTreeNodeProblem& child_problem = clustering_problem.all_node_problems.at( + clustering_problem.net_handle_to_node_problem_index.at(current_child.net_handle)); + + //Skip this child if its seeds are all too far away + bool skip_snarl = false; + if (child_problem.fragment_best_left > (clustering_problem.fragment_distance_limit == 0 ? clustering_problem.read_distance_limit : clustering_problem.fragment_distance_limit) && + child_problem.fragment_best_right > (clustering_problem.fragment_distance_limit == 0 ? 
clustering_problem.read_distance_limit : clustering_problem.fragment_distance_limit)) { + skip_snarl = true; + } else { + //See if this clusters of this child can be combined with each other + //Also updates the minimum distances to include loops in the chain + //This only matters for snarls, since any path would have to pass through a node anyway + update_distances_on_same_child(child_problem); + } +#ifdef DEBUG_CLUSTER + cerr << "At child " << distance_index.net_handle_as_string(current_child.net_handle) << endl; +#endif + + /* + Get a bunch of distances from the current child that will be used to calculate distance + from the last child + */ + + + //The distance from the right side of the last child to the left side of this child + //(relative to the orientation of the chain + //If this is a looping chain, then find the distance normally. Otherwise use the prefix sums + size_t distance_from_last_child_to_current_child = std::numeric_limits::max(); + if (!is_first_child) { + //If this isn't the first child we're looking at + if ( last_chain_component_end == child_problem.chain_component_start) { + //If this child is in the same component as the last one + if (last_length == std::numeric_limits::max() && last_chain_component_end ) { + //If the last length is infinite, then is must be a snarl that is not start-end reachable, so the distance + //from the last child is the same as the distance from the start of the chain (the start of this compnent) + distance_from_last_child_to_current_child = child_problem.prefix_sum_value; + } else { + size_t distance_from_chain_start_to_last_node = SnarlDistanceIndex::sum(last_prefix_sum,last_length); + distance_from_last_child_to_current_child = SnarlDistanceIndex::minus(child_problem.prefix_sum_value, + distance_from_chain_start_to_last_node); + } + } + } + + + //The distance from the right side of the last child to the right side of this child, which is + //the distance we need to update the chain clusters to the end of this child + //This isn't quite right for the first thing in the chain but it doesn't matter because it only + //gets added to chain clusters + //If it gets calculated, it is the distance from the last child to the start of this child snarl + the length of the child snarl + size_t distance_from_last_child_to_current_end = + distance_from_last_child_to_current_child == std::numeric_limits::max() + ? std::numeric_limits::max() : + (last_child.net_handle == current_child.net_handle ? 0 + : SnarlDistanceIndex::sum(distance_from_last_child_to_current_child, + child_problem.node_length)); + + //The distance to add to get to the end of the chain. 
Only matters if this is the last thing in the chain + //The distances will include the distance to the end of a trivial chain, + //so we can't rely on distance_in_parent to know when the distance should be 0 + + size_t distance_from_current_end_to_end_of_chain; + if (!is_last_child || skip_distances_to_ends) { + //If this isn't the last child in the chain, then we only want the distance to the end of the current child + + distance_from_current_end_to_end_of_chain = 0; + } else if (SnarlDistanceIndex::get_record_offset(current_child.net_handle) == SnarlDistanceIndex::get_record_offset(chain_problem->end_in)) { + //If this is the last node in the chain + if (chain_problem->chain_component_end != child_problem.chain_component_end) { + //If they aren't in the same component + distance_from_current_end_to_end_of_chain = std::numeric_limits::max(); + } else { + distance_from_current_end_to_end_of_chain = 0; + } + } else if (chain_problem->is_looping_chain) { + //TODO: I think I should be able to do this without the distance index but none of our graphs so far have loops + // so I'm not going to bother + //If it's a looping chain then use the distance index + distance_from_current_end_to_end_of_chain = distance_index.distance_in_parent(chain_handle, chain_problem->end_in, + current_child.net_handle); + } else if (child_problem.node_length == std::numeric_limits::max() ) { + //If the node length is infinite, then it is a snarl that isn't start-end connected, so the start + //and end of the snarl are in different components of the chain. Since it reached here, the end + //node of the snarl is in the same component as the end of the chain, so the distance to the + //end of the chain is just the length of the last component of the chain, which is + //chain_problem.node_length + distance_from_current_end_to_end_of_chain = chain_problem->node_length; + + } else { + distance_from_current_end_to_end_of_chain = SnarlDistanceIndex::minus(chain_problem->node_length, + SnarlDistanceIndex::sum(child_problem.prefix_sum_value, child_problem.node_length)); + + } + +#ifdef DEBUG_CLUSTER +cerr << "\tDistance from last child to this one: " << distance_from_last_child_to_current_child << endl; +cerr << "\tDistance from start of chain to the left side of this one: " << (child_problem.chain_component_start != 0 + ? 
std::numeric_limits::max() : child_problem.prefix_sum_value) << endl; +cerr << "\tDistance from the last child to the right side of this one: " << distance_from_last_child_to_current_end << endl; +cerr << "\tDistance to get to the end of the chain: " << distance_from_current_end_to_end_of_chain << endl; +#endif + + //Clusters to remove from the chain because they got combined + vector> to_erase; + + //And new clusters to add that didn't get combined + vector, pair>> to_add; + + //There is at most one new cluster per read + pair new_cluster_by_read; + //And one new fragment cluster + size_t new_cluster_head_fragment = std::numeric_limits::max(); + + bool child_is_reversed = child_problem.is_reversed_in_parent; + + //Remember the current best chain distances, and reset them to inf since we need to update them + size_t old_best_right = std::move(chain_problem->fragment_best_right); + chain_problem->fragment_best_right = std::numeric_limits::max(); + pair old_best_right_by_read = std::move(chain_problem->read_best_right); + chain_problem->read_best_right = std::make_pair(std::numeric_limits::max(), std::numeric_limits::max()); + + + if (last_child.net_handle != current_child.net_handle && + SnarlDistanceIndex::sum(distance_from_last_child_to_current_child, old_best_right) + > (clustering_problem.fragment_distance_limit == 0 ? clustering_problem.read_distance_limit : clustering_problem.fragment_distance_limit)) { +#ifdef DEBUG_CLUSTER + cerr << "This child is too far away from the last one to cluster anything" << endl; +#endif + if (!skip_distances_to_ends) { + //If we care about the distances to the ends, then remember which clusters might be reachable from the + //left side of the chain + for (auto& cluster_head : chain_problem->read_cluster_heads) { + //For each of the chain clusters + pair dists (clustering_problem.all_seeds->at(cluster_head.first)->at(cluster_head.second).distance_left, + clustering_problem.all_seeds->at(cluster_head.first)->at(cluster_head.second).distance_right); + if (dists.first <= (clustering_problem.fragment_distance_limit == 0 ? clustering_problem.read_distance_limit : clustering_problem.fragment_distance_limit)) { + //If this cluster can be clustered outside of the chain, remember to add it back + cluster_heads_to_add_again.emplace_back(cluster_head.first, cluster_head.second, + dists.first, std::numeric_limits::max()); + } + } + } + + //Now clear the chain's list of clusters + chain_problem->read_cluster_heads.clear(); + + //If the current child snarl has combinable clusters, add them to the chain + if (!skip_snarl) { + for (auto& cluster_head : child_problem.read_cluster_heads) { + //Add the clusters from this child to the chain + size_t read_num = cluster_head.first; + pair dists (clustering_problem.all_seeds->at(read_num)->at(cluster_head.second).distance_left, + clustering_problem.all_seeds->at(read_num)->at(cluster_head.second).distance_right); + size_t dist_left = child_problem.is_reversed_in_parent ? dists.second : dists.first; + size_t dist_right = child_problem.is_reversed_in_parent ? dists.first : dists.second; + + //Distances to the start of the chain, and the end of this node + //If this is the last thing in the chain, then the distance to the end of the chain + //If the snarl is isn't in the first component of the chain, then the left distance is infinite + pair new_distances = make_pair( + child_problem.chain_component_start != 0 ? 
std::numeric_limits::max() + : SnarlDistanceIndex::sum(dist_left, child_problem.prefix_sum_value), + SnarlDistanceIndex::sum(dist_right, distance_from_current_end_to_end_of_chain)); + + //Add this to the chain + chain_problem->read_cluster_heads.emplace(cluster_head); + clustering_problem.all_seeds->at(cluster_head.first)->at(cluster_head.second).distance_left = new_distances.first; + clustering_problem.all_seeds->at(cluster_head.first)->at(cluster_head.second).distance_right = new_distances.second; + //And update the best distances + if (!found_first_node) { + chain_problem->fragment_best_left = std::min(chain_problem->fragment_best_left, new_distances.first); + } + if (! (read_num == 0 ? found_first_node_by_read.first : found_first_node_by_read.second)) { + (read_num == 0 ? chain_problem->read_best_left.first : chain_problem->read_best_left.second) = + std::min((read_num == 0 ? chain_problem->read_best_left.first : chain_problem->read_best_left.second), new_distances.first); + } + chain_problem->fragment_best_right = std::min(chain_problem->fragment_best_right, new_distances.second); + (read_num == 0 ? chain_problem->read_best_right.first : chain_problem->read_best_right.second) = + std::min((read_num == 0 ? chain_problem->read_best_right.first : chain_problem->read_best_right.second), new_distances.second); + } + } + + + } else if (!skip_snarl) { + //Otherwise, check to see if anything on the current child can be combined with + //anything in the chain thus far + + + /**First, go through the clusters of the current child and see what can be combined + */ + + for (auto& child_cluster_head : child_problem.read_cluster_heads) { + //Go through all clusters of the current child and see if they can be combined with anything on the chain + const size_t read_num = child_cluster_head.first; + const size_t cluster_num = child_cluster_head.second; + ClusterHead& new_cluster = read_num == 0 ? new_cluster_by_read.first : new_cluster_by_read.second; + pair dists (clustering_problem.all_seeds->at(read_num)->at(cluster_num).distance_left, + clustering_problem.all_seeds->at(read_num)->at(cluster_num).distance_right); + const size_t distance_left = child_is_reversed ? dists.second : dists.first; + const size_t distance_right = child_is_reversed ? dists.first : dists.second; + //Distance between this cluster and a cluster on the same read from the previous child + size_t distance_between = SnarlDistanceIndex::minus( + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(distance_left, + distance_from_last_child_to_current_child), + (read_num == 0 ? old_best_right_by_read.first : old_best_right_by_read.second)), + 1); + //Distance between this cluster and any cluster on the previous child + size_t fragment_distance_between = SnarlDistanceIndex::minus( + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(distance_left, + distance_from_last_child_to_current_child), + old_best_right), + 1); + + //The new distances from this child to the start of the chain and the end of this child + pair new_distances = make_pair( + child_problem.chain_component_start != 0 ? 
std::numeric_limits::max() + : SnarlDistanceIndex::sum(distance_left, child_problem.prefix_sum_value), + SnarlDistanceIndex::sum(distance_right, distance_from_current_end_to_end_of_chain)); + + if (distance_between <= clustering_problem.read_distance_limit) { +#ifdef DEBUG_CLUSTER + cerr << "\t\tCombine child cluster " << read_num << ":" << cluster_num << endl; +#endif + //If this cluster can be merged with anything on the chain + if (new_cluster.read_num == std::numeric_limits::max()){ + //If nothing is in the combined cluster yet, this is the new combined cluster + new_cluster.read_num =child_cluster_head.first; + new_cluster.cluster_num =child_cluster_head.second; + new_cluster.distance_left = new_distances.first; + new_cluster.distance_right = new_distances.second; + } else { + //Otherwise, remember to forget about the old cluster + //Union the two clusters + new_cluster.cluster_num = clustering_problem.read_union_find.at(read_num).union_groups(cluster_num, new_cluster.cluster_num); + //And find the best distances of the two + new_cluster.distance_left = std::min(new_cluster.distance_left, new_distances.first); + new_cluster.distance_right = std::min(new_cluster.distance_right, new_distances.second); + } + } else { + //If it didn't get combined, remember to add it at the end + size_t distance_limit = clustering_problem.fragment_distance_limit == 0 + ? clustering_problem.read_distance_limit + : clustering_problem.fragment_distance_limit; + if (new_distances.first <= distance_limit || new_distances.second <= distance_limit){ + //But only if the distances are small enough + to_add.emplace_back(make_pair(read_num, cluster_num), new_distances); + } + } + //If we can combine the fragments + if (clustering_problem.fragment_distance_limit != 0 && fragment_distance_between <= clustering_problem.fragment_distance_limit){ + if (new_cluster_head_fragment != std::numeric_limits::max()) { + new_cluster_head_fragment = clustering_problem.fragment_union_find.union_groups(new_cluster_head_fragment, + cluster_num + clustering_problem.seed_count_prefix_sum[read_num]); + } else { + new_cluster_head_fragment = cluster_num + clustering_problem.seed_count_prefix_sum[read_num]; + } + } + + + //Update the best distances + if (!found_first_node) { + chain_problem->fragment_best_left = std::min(chain_problem->fragment_best_left, new_distances.first); + } + if (!(read_num == 0 ? found_first_node_by_read.first : found_first_node_by_read.second)) { + (read_num == 0 ? chain_problem->read_best_left.first : chain_problem->read_best_left.second) = std::min((read_num == 0 ? chain_problem->read_best_left.first : chain_problem->read_best_left.second), new_distances.first); + } + chain_problem->fragment_best_right = std::min(chain_problem->fragment_best_right, new_distances.second); + (read_num == 0 ? chain_problem->read_best_right.first : chain_problem->read_best_right.second) = std::min((read_num == 0 ? chain_problem->read_best_right.first : chain_problem->read_best_right.second), new_distances.second); + } + + + + /**Next, go through the clusters on the chain up to this point and see if anything can + be combined with the clusters on the child + */ + for (auto& chain_cluster_head : chain_problem->read_cluster_heads) { + //Each has distances up to the previous node + + const size_t read_num = chain_cluster_head.first; + const size_t cluster_num = chain_cluster_head.second; + + ClusterHead& new_cluster = read_num == 0 ? 
new_cluster_by_read.first : new_cluster_by_read.second; + + //The distances for the chain cluster + pair chain_cluster_distances (clustering_problem.all_seeds->at(read_num)->at(cluster_num).distance_left, + clustering_problem.all_seeds->at(read_num)->at(cluster_num).distance_right); + + //Best distance to the left side (relative to the chain) of the current child + const size_t current_distance_left = (read_num == 0 ? child_problem.read_best_left.first : child_problem.read_best_left.second); + const size_t current_fragment_distance_left = child_problem.fragment_best_left; + + size_t distance_between = SnarlDistanceIndex::minus( + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(chain_cluster_distances.second, + distance_from_last_child_to_current_child), + current_distance_left), + 1); + size_t distance_between_fragment = SnarlDistanceIndex::minus( + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(chain_cluster_distances.second, + distance_from_last_child_to_current_child), + current_fragment_distance_left), + 1); + pair new_distances = make_pair( + chain_cluster_distances.first, + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(chain_cluster_distances.second, + distance_from_last_child_to_current_end), + distance_from_current_end_to_end_of_chain)); + + + if (distance_between <= clustering_problem.read_distance_limit) { +#ifdef DEBUG_CLUSTER + cerr << "\t\tCombine chain cluster " << read_num << ":" << cluster_num << endl; +#endif + //If we can union the reads + if ( new_cluster.read_num == std::numeric_limits::max()) { + new_cluster.read_num =chain_cluster_head.first; + new_cluster.cluster_num =chain_cluster_head.second; + new_cluster.distance_left = new_distances.first; + new_cluster.distance_right = new_distances.second; + } else { + //Union the two clusters + new_cluster.cluster_num = clustering_problem.read_union_find.at(read_num).union_groups(cluster_num, + new_cluster.cluster_num); + //Find the best distances of the two + new_cluster.distance_left = std::min( new_cluster.distance_left, new_distances.first); + new_cluster.distance_right = std::min(new_cluster.distance_right, new_distances.second); + + } + //Remember to erase the combined cluster. The new cluster head will be added later + to_erase.emplace_back(read_num, cluster_num); + } else { + to_add.emplace_back(make_pair(read_num, cluster_num), new_distances); + } + if (clustering_problem.fragment_distance_limit != 0 && distance_between_fragment <= clustering_problem.fragment_distance_limit) { + //If we can union the fragments + if (new_cluster_head_fragment != std::numeric_limits::max()) { + new_cluster_head_fragment = clustering_problem.fragment_union_find.union_groups(new_cluster_head_fragment, + cluster_num + clustering_problem.seed_count_prefix_sum[read_num]); + } else { + new_cluster_head_fragment = cluster_num + clustering_problem.seed_count_prefix_sum[read_num]; + } + } + chain_problem->fragment_best_right = std::min(chain_problem->fragment_best_right, new_distances.second); + (read_num == 0 ? chain_problem->read_best_right.first : chain_problem->read_best_right.second) = std::min((read_num == 0 ? 
chain_problem->read_best_right.first : chain_problem->read_best_right.second), new_distances.second); + } + + //Remove clusters that got combined + for (pair& cluster_head : to_erase) { + chain_problem->read_cluster_heads.erase(cluster_head); + } + //Add new clusters that weren't combined + for (pair, pair>& cluster : to_add) { + chain_problem->read_cluster_heads.emplace(cluster.first); + clustering_problem.all_seeds->at(cluster.first.first)->at(cluster.first.second).distance_left = cluster.second.first; + clustering_problem.all_seeds->at(cluster.first.first)->at(cluster.first.second).distance_right = cluster.second.second; + } + //Add new clusters that were combined + if (new_cluster_by_read.first.read_num != std::numeric_limits::max()){ + chain_problem->read_cluster_heads.emplace(new_cluster_by_read.first.read_num, new_cluster_by_read.first.cluster_num); + clustering_problem.all_seeds->at(new_cluster_by_read.first.read_num)->at(new_cluster_by_read.first.cluster_num).distance_left = new_cluster_by_read.first.distance_left; + clustering_problem.all_seeds->at(new_cluster_by_read.first.read_num)->at(new_cluster_by_read.first.cluster_num).distance_right = new_cluster_by_read.first.distance_right; + } + if (new_cluster_by_read.second.read_num != std::numeric_limits::max()){ + chain_problem->read_cluster_heads.emplace(new_cluster_by_read.second.read_num, new_cluster_by_read.second.cluster_num); + clustering_problem.all_seeds->at(new_cluster_by_read.second.read_num)->at(new_cluster_by_read.second.cluster_num).distance_left = new_cluster_by_read.second.distance_left; + clustering_problem.all_seeds->at(new_cluster_by_read.second.read_num)->at(new_cluster_by_read.second.cluster_num).distance_right = new_cluster_by_read.second.distance_right; + } + } else { +#ifdef DEBUG_CLUSTER + cerr << "The snarl was too big to combine with anything, so go through the current clusters of the chain and add distances" << endl; + cerr << distance_from_last_child_to_current_end << " and " << distance_from_current_end_to_end_of_chain << " to the distances right" << endl; +#endif + //If this was a snarl that we skipped because the distances were too big, then we need to update the distances + chain_problem->fragment_best_right = SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(old_best_right, + distance_from_last_child_to_current_end), + distance_from_current_end_to_end_of_chain); + chain_problem->read_best_right.first = SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(old_best_right_by_read.first, + distance_from_last_child_to_current_end), + distance_from_current_end_to_end_of_chain); + chain_problem->read_best_right.second = SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(old_best_right_by_read.second, + distance_from_last_child_to_current_end), + distance_from_current_end_to_end_of_chain); + for (pair cluster_head : chain_problem->read_cluster_heads) { + clustering_problem.all_seeds->at(cluster_head.first)->at(cluster_head.second).distance_right = SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( + clustering_problem.all_seeds->at(cluster_head.first)->at(cluster_head.second).distance_right, + distance_from_last_child_to_current_end), + distance_from_current_end_to_end_of_chain); + } + } + + + //Update the last node we saw to this one + last_child = current_child; + last_prefix_sum = child_problem.prefix_sum_value; + last_length = child_problem.node_length; //The length of this snarl + last_chain_component_end = child_problem.chain_component_end;//The component of the end node of this snarl +} + +//Cluster the root 
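The chain-combining logic above repeatedly forms candidate distances of the shape `minus(sum(sum(distance_left, connecting_distance), best_right), 1)`, with unreachable distances encoded as `std::numeric_limits<size_t>::max()`. The following is a minimal, self-contained sketch of the saturating arithmetic those expressions assume; `sum` and `minus` here are illustrative stand-ins, not the actual SnarlDistanceIndex implementations.

```cpp
// Sketch only: stand-in saturating arithmetic, assuming (as the checks in this file suggest)
// that "unreachable" is std::numeric_limits<size_t>::max() and must propagate through sums.
#include <cassert>
#include <cstddef>
#include <limits>

static const size_t UNREACHABLE = std::numeric_limits<size_t>::max();

// Saturating add: anything plus "unreachable" stays unreachable.
size_t sum(size_t a, size_t b) {
    return (a == UNREACHABLE || b == UNREACHABLE) ? UNREACHABLE : a + b;
}

// Saturating subtract: subtracting from "unreachable" stays unreachable.
size_t minus(size_t a, size_t b) {
    return a == UNREACHABLE ? UNREACHABLE : a - b;
}

int main() {
    // Two cluster ends 3 bp and 4 bp away from either side of a 2 bp connection:
    // the candidate distance is minus(sum(sum(3, 2), 4), 1), mirroring distance_between above.
    assert(minus(sum(sum(3, 2), 4), 1) == 8);
    // If either side is unreachable, the candidate stays unreachable and nothing is combined.
    assert(minus(sum(sum(3, UNREACHABLE), 4), 1) == UNREACHABLE);
    return 0;
}
```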
+//all children of the root will be in clustering_problem.root_children +//This is basically cluster_one_snarl except the snarl is the root, which has no boundary nodes +void SnarlDistanceIndexClusterer::cluster_root(ClusteringProblem& clustering_problem) const { +#ifdef DEBUG_CLUSTER + cerr << "Finding clusters on the root with " << clustering_problem.root_children.size() << " children" << endl; +#endif + if (clustering_problem.root_children.size() == 0) { + return; + } + + //Keep track of all clusters on the root + SnarlTreeNodeProblem root_problem(distance_index.get_root(), clustering_problem.all_seeds->size(), + clustering_problem.seed_count_prefix_sum.back(), distance_index); + + //Remember old distances + vector> child_distances (clustering_problem.seed_count_prefix_sum.back(), + make_pair(std::numeric_limits::max(), std::numeric_limits::max())); + + + //Sort the root children by parent, the order of the children doesn't matter + //Order of the parents doesn't matter either, it's just to get them together + std::sort(clustering_problem.root_children.begin(), clustering_problem.root_children.end(), + [&](const auto& a, const auto& b) { + return a.first < b.first; + }); + + //Go through the list of parent child pairs. Once we reach a new parent, cluster all children found up to this point + net_handle_t current_parent = clustering_problem.root_children.front().first; + vector children; + for (size_t root_child_i = 0 ; root_child_i < clustering_problem.root_children.size() ; root_child_i++) { + pair& parent_to_child = clustering_problem.root_children[root_child_i]; + net_handle_t& parent = parent_to_child.first; + + if (current_parent == parent || root_child_i == 0) { + children.emplace_back(parent_to_child.second); + } + if (current_parent != parent || root_child_i == clustering_problem.root_children.size()-1) { +#ifdef DEBUG_CLUSTER + cerr << "Clustering root snarl " << distance_index.net_handle_as_string(parent) << " with " << children.size() << " chidlren" << endl; +#endif + + if (children.size() > 0) { + + for (size_t i = 0; i < children.size() ; i++) { + //Go through each child node of the netgraph + + SnarlTreeNodeProblem* child_problem_i = &clustering_problem.all_node_problems.at( + clustering_problem.net_handle_to_node_problem_index.at(children[i])); + for (const pair& head : child_problem_i->read_cluster_heads) { + child_distances[head.second + clustering_problem.seed_count_prefix_sum[head.first]] = + make_pair(clustering_problem.all_seeds->at(head.first)->at(head.second).distance_left, + clustering_problem.all_seeds->at(head.first)->at(head.second).distance_right); + } + + for (size_t j = 0 ; j <= i ; j++){ + //Go through other child net graph nodes up to and including i + + //Get the other node and its clusters + SnarlTreeNodeProblem* child_problem_j = &clustering_problem.all_node_problems.at( + clustering_problem.net_handle_to_node_problem_index.at(children[j])); + + + + compare_and_combine_cluster_on_child_structures(clustering_problem, child_problem_i, + child_problem_j, &root_problem, child_distances, true, false); + + } + } + } + current_parent = parent; + children.clear(); + children.emplace_back(parent_to_child.second); + } + + } +#ifdef DEBUG_CLUSTER + cerr << "\tFound clusters on the root" << endl; + for (size_t read_num = 0 ; read_num < clustering_problem.all_seeds->size() ; read_num++) { + cerr << "\t for read num " << read_num << endl; + for (pair c : root_problem.read_cluster_heads) { + if (c.first == read_num) { + cerr << "\t\t" << c.first << 
":"<at(c.first)->size() ; x++) { + if (clustering_problem.read_union_find[c.first].find_group(x) == c.second) { + cerr << clustering_problem.all_seeds->at(c.first)->at(x).pos << " "; + } + } + cerr << endl; + } + } + } + + for (pair group_id : root_problem.read_cluster_heads) { + assert (group_id.second == clustering_problem.read_union_find[group_id.first].find_group(group_id.second)); + } +#endif +} + + +//Cluster all the seeds on a node or chain of only seeds +//Seeds are assumed to be sorted +//Since the seeds can be linearly arranged, they can be clustered just by walking along an ordered list of seeds +void SnarlDistanceIndexClusterer::cluster_seeds_on_linear_structure(ClusteringProblem& clustering_problem, SnarlTreeNodeProblem* node_problem, + size_t structure_length, bool include_prefix_sum, bool skip_distances_to_ends) const{ + if (node_problem->children.size() == 0) { + return; + } +#ifdef DEBUG_CLUSTER + cerr << "Cluster " << node_problem->children.size() << " seeds on a single structure " << distance_index.net_handle_as_string(node_problem->containing_net_handle) << endl; + cerr << "\t with node length " << structure_length << endl; +#endif + + if (clustering_problem.read_distance_limit >= structure_length) { + //If the limit is greater than the node length, then all the + //seeds on this node must be in the same cluster + + //The cluster heads of the new cluster + size_t fragment_group_id = std::numeric_limits::max(); + pair group_ids (std::numeric_limits::max(), std::numeric_limits::max()); + + for (auto& child : node_problem->children) { + //Go through all seeds in the range + + size_t read_num = child.seed_indices.first; + size_t seed_i = child.seed_indices.second; + + //And the distances for this seed + size_t dist_left = clustering_problem.all_seeds->at(read_num)->at(seed_i).distance_left; + if (include_prefix_sum) { + dist_left = SnarlDistanceIndex::sum(dist_left, + MIPayload::prefix_sum( clustering_problem.all_seeds->at(read_num)->at(seed_i).minimizer_cache)); + } + //Since we only stored the proper distance left for seeds on chains + size_t dist_right = structure_length - dist_left + 1; + + //Find the new best distances for anything in this cluster + //Since we're traversing the seeds in order, the best left and right will be the first and last + //ones we see for each read + if ((read_num == 0 ? node_problem->read_best_left.first : node_problem->read_best_left.second) == std::numeric_limits::max()) { + //Only update the best left if it hasn't been set yet + (read_num == 0 ? node_problem->read_best_left.first : node_problem->read_best_left.second) = dist_left; + } + //Best right will always be the most recent thing seen + (read_num == 0 ? node_problem->read_best_right.first : node_problem->read_best_right.second) = dist_right; + node_problem->fragment_best_right = dist_right; + + + //Put this seed in the cluster for the node + if ((read_num == 0 ? group_ids.first : group_ids.second) == std::numeric_limits::max()) { + (read_num == 0 ? group_ids.first : group_ids.second) = seed_i; + } else { + (read_num == 0 ? group_ids.first : group_ids.second) = clustering_problem.read_union_find[read_num].union_groups((read_num == 0 ? 
group_ids.first : group_ids.second), seed_i); + } + if (clustering_problem.fragment_distance_limit != 0 ) { + if (fragment_group_id == std::numeric_limits::max() ) { + fragment_group_id = fragment_group_id = seed_i + clustering_problem.seed_count_prefix_sum[read_num]; + } else { + fragment_group_id = clustering_problem.fragment_union_find.union_groups( + fragment_group_id, seed_i + clustering_problem.seed_count_prefix_sum[read_num]); + } + } + } + if (!skip_distances_to_ends) { + + const SeedCache& first_seed = clustering_problem.all_seeds->at(node_problem->children.front().seed_indices.first)->at(node_problem->children.front().seed_indices.second); + node_problem->fragment_best_left = SnarlDistanceIndex::sum(first_seed.distance_left, + include_prefix_sum ? MIPayload::prefix_sum(first_seed.minimizer_cache) : 0); + + //Record the new cluster + for (size_t read_num = 0 ; read_num < clustering_problem.all_seeds->size() ; read_num++ ) { + if ((read_num == 0 ? group_ids.first : group_ids.second) != std::numeric_limits::max()) { + size_t group_id = (read_num == 0 ? group_ids.first : group_ids.second); + node_problem->read_cluster_heads.emplace(read_num, group_id); + clustering_problem.all_seeds->at(read_num)->at(group_id).distance_left = (read_num == 0 ? node_problem->read_best_left.first : node_problem->read_best_left.second); + clustering_problem.all_seeds->at(read_num)->at(group_id).distance_right = (read_num == 0 ? node_problem->read_best_right.first : node_problem->read_best_right.second); + } + + } + } + +#ifdef DEBUG_CLUSTER + cerr << "\t" << distance_index.net_handle_as_string(node_problem->containing_net_handle) << " is shorter than the distance limit so just one cluster" << endl; + +#endif + return; + } + + //The seeds may form multiple clusters on the node + //Walk through a sorted list of seeds and split into clusters + + //Offset of the first seed for the cluster we're building + //One for each read + pair read_first_offset (std::numeric_limits::max(), std::numeric_limits::max()); + //Offset of the latest seed for the cluster we're building + pair read_last_offset (std::numeric_limits::max(), std::numeric_limits::max()); + //And the same for the fragment + size_t fragment_last_offset = std::numeric_limits::max(); + size_t fragment_last_cluster = std::numeric_limits::max(); + + pair read_last_cluster (std::numeric_limits::max(), std::numeric_limits::max()); + + + for (auto& child : node_problem->children) { + //Go through all seeds in the range + + size_t read_num = child.seed_indices.first; + size_t seed_num = child.seed_indices.second; + + //And the distances for this seed + size_t offset = clustering_problem.all_seeds->at(read_num)->at(seed_num).distance_left; + if (include_prefix_sum) { + offset = SnarlDistanceIndex::sum(offset, + MIPayload::prefix_sum( clustering_problem.all_seeds->at(read_num)->at(seed_num).minimizer_cache)); + } + + //First and last offset and last cluster head for this read + size_t& first_offset = read_num == 0 ? read_first_offset.first : read_first_offset.second; + size_t& last_offset = read_num == 0 ? read_last_offset.first : read_last_offset.second; + size_t& last_cluster = read_num == 0 ? read_last_cluster.first : read_last_cluster.second; + + + if (first_offset == std::numeric_limits::max()) { + //If this is the first seed we've seen of this read + first_offset = offset; + (read_num == 0 ? 
node_problem->read_best_left.first : node_problem->read_best_left.second) = offset; + } + + if (last_offset != std::numeric_limits::max() && + offset - last_offset <= clustering_problem.read_distance_limit) { + //If this seed is in the same read cluster as the previous one, + //union them + + last_cluster = clustering_problem.read_union_find[read_num].union_groups(seed_num, last_cluster); + last_offset = offset; + + if (clustering_problem.fragment_distance_limit != 0) { + //If we are also clustering paired end reads by fragment distance, + //cluster these together + fragment_last_cluster = clustering_problem.fragment_union_find.union_groups(seed_num+clustering_problem.seed_count_prefix_sum[read_num], fragment_last_cluster); + } + } else { + //This becomes a new read cluster + if (!skip_distances_to_ends && last_cluster != std::numeric_limits::max()) { + //Record the previous cluster + node_problem->read_cluster_heads.emplace(read_num, last_cluster); + clustering_problem.all_seeds->at(read_num)->at(last_cluster).distance_left = first_offset; + clustering_problem.all_seeds->at(read_num)->at(last_cluster).distance_right = structure_length - last_offset + 1; + } + last_cluster = seed_num; + first_offset = offset; + last_offset = offset; + if (clustering_problem.fragment_distance_limit != 0) { + if (fragment_last_offset != std::numeric_limits::max() && + offset - fragment_last_offset <= clustering_problem.fragment_distance_limit) { + //If this is a new read cluster but the same fragment cluster + fragment_last_cluster = clustering_problem.fragment_union_find.union_groups(seed_num+clustering_problem.seed_count_prefix_sum[read_num], fragment_last_cluster); + + } else { + //If this is a new fragment cluster as well + fragment_last_cluster = seed_num+clustering_problem.seed_count_prefix_sum[read_num]; + } + } + } + fragment_last_offset = offset; + } + if (!skip_distances_to_ends) { + //If we want to remember, record the best distances to the ends of this structure + for (size_t i = 0 ; i < clustering_problem.all_seeds->size() ; i++) { + if ((i == 0 ? read_last_cluster.first : read_last_cluster.second) != std::numeric_limits::max()) { + node_problem->read_cluster_heads.emplace(i, (i == 0 ? read_last_cluster.first : read_last_cluster.second)); + clustering_problem.all_seeds->at(i)->at((i == 0 ? read_last_cluster.first : read_last_cluster.second)).distance_left = (i == 0 ? read_first_offset.first : read_first_offset.second); + clustering_problem.all_seeds->at(i)->at((i == 0 ? read_last_cluster.first : read_last_cluster.second)).distance_right = structure_length-(i == 0 ? read_last_offset.first : read_last_offset.second)+1; + + if (i == 0) { + node_problem->read_best_right.first = structure_length-(i == 0 ? read_last_offset.first : read_last_offset.second) +1; + } else { + node_problem->read_best_right.second = structure_length-(i == 0 ? read_last_offset.first : read_last_offset.second)+1; + } + } + } + + //Get the best left and right values of the node from the first and last seeds + const SeedCache& first_seed = clustering_problem.all_seeds->at(node_problem->children.front().seed_indices.first)->at(node_problem->children.front().seed_indices.second); + node_problem->fragment_best_left = SnarlDistanceIndex::sum(first_seed.distance_left, + include_prefix_sum ? 
MIPayload::prefix_sum(first_seed.minimizer_cache) : 0); + + node_problem->fragment_best_right = structure_length-fragment_last_offset+1; + } + return; +} + +size_t SnarlDistanceIndexClusterer::distance_between_seeds(const Seed& seed1, const Seed& seed2, bool stop_at_lowest_common_ancestor) const { + + /*Helper function to walk up the snarl tree + * Given a net handle, its parent, and the distances to the start and end of the handle, + * update the distances to reach the ends of the parent and update the handle and its parent + * If the parent is a chain, then the new distances include the boundary nodes of the chain. + * If it is a snarl, it does not*/ + auto update_distances = [&](net_handle_t& net, net_handle_t& parent, size_t& dist_start, size_t& dist_end) { +#ifdef debug_distances + cerr << " Updating distance from node " << distance_index.net_handle_as_string(net) << " at parent " << distance_index.net_handle_as_string(parent) << " from " << dist_start << " " << dist_end << endl; +#endif + + if (distance_index.is_trivial_chain(parent)) { + //Don't update distances for the trivial chain + return; + } else if (distance_index.is_simple_snarl(parent)) { + //If it's a simple snarl just check if they should be reversed + if (distance_index.is_reversed_in_parent (net)) { + size_t tmp = dist_start; + dist_start = dist_end; + dist_end = tmp; + } + return; + } + + net_handle_t start_bound = distance_index.get_bound(parent, false, true); + net_handle_t end_bound = distance_index.get_bound(parent, true, true); + + //The lengths of the start and end nodes of net + //This is only needed if net is a snarl, since the boundary nodes are not technically part of the snarl + size_t start_length = distance_index.is_chain(parent) ? distance_index.node_length(start_bound) : 0; + size_t end_length = distance_index.is_chain(parent) ? distance_index.node_length(end_bound) : 0; + + //Get the distances from the bounds of the parent to the node we're looking at + size_t distance_start_start = start_bound == net ? 0 + : SnarlDistanceIndex::sum(start_length, distance_index.distance_in_parent(parent, start_bound, distance_index.flip(net), graph)); + size_t distance_start_end = start_bound == distance_index.flip(net) ? 0 + : SnarlDistanceIndex::sum(start_length, distance_index.distance_in_parent(parent, start_bound, net, graph)); + size_t distance_end_start = end_bound == net ? 0 + : SnarlDistanceIndex::sum(end_length, distance_index.distance_in_parent(parent, end_bound, distance_index.flip(net), graph)); + size_t distance_end_end = end_bound == distance_index.flip(net) ? 
0 + : SnarlDistanceIndex::sum(end_length, distance_index.distance_in_parent(parent, end_bound, net, graph)); + + size_t distance_start = dist_start; + size_t distance_end = dist_end; + + dist_start = std::min(SnarlDistanceIndex::sum(distance_start_start, distance_start), + SnarlDistanceIndex::sum(distance_start_end , distance_end)); + dist_end = std::min(SnarlDistanceIndex::sum(distance_end_start , distance_start), + SnarlDistanceIndex::sum(distance_end_end , distance_end)); +#ifdef debug_distances + cerr << " ...new distances to start and end: " << dist_start << " " << dist_end << endl; +#endif + return; + }; + + /* + * Get net handles for the two nodes and the distances from each position to the ends of the handles + */ + pos_t pos1 = seed1.pos; + pos_t pos2 = seed2.pos; + gbwtgraph::Payload payload1 = seed1.minimizer_cache; + gbwtgraph::Payload payload2 = seed2.minimizer_cache; + + bool has_cached_values1 = payload1 != MIPayload::NO_CODE; + bool has_cached_values2 = payload2 != MIPayload::NO_CODE; + net_handle_t net1 = has_cached_values1 ? distance_index.get_net_handle_from_values(MIPayload::record_offset(payload1), + SnarlDistanceIndex::START_END, + SnarlDistanceIndex::NODE_HANDLE, + MIPayload::node_record_offset(payload1)) + : distance_index.get_node_net_handle(get_id(pos1)); + net_handle_t net2 = has_cached_values2 ? distance_index.get_net_handle_from_values(MIPayload::record_offset(payload2), + SnarlDistanceIndex::START_END, + SnarlDistanceIndex::NODE_HANDLE, + MIPayload::node_record_offset(payload2)) + : distance_index.get_node_net_handle(get_id(pos2)); + + size_t minimum_distance = std::numeric_limits::max(); + if (net1 == net2) { + //If the two positions are on the same node, get the distance between them + size_t node_length = has_cached_values1 ? MIPayload::node_length(payload1) + : distance_index.node_length(net1); + size_t distance_to_start1 = is_rev(pos1) ? node_length - get_offset(pos1) : get_offset(pos1) + 1; + size_t distance_to_end1 = is_rev(pos1) ? get_offset(pos1) + 1 : node_length - get_offset(pos1); + size_t distance_to_start2 = is_rev(pos2) ? node_length - get_offset(pos2) : get_offset(pos2) + 1; + size_t distance_to_end2 = is_rev(pos2) ? get_offset(pos2) + 1 : node_length - get_offset(pos2); + + if (distance_to_start1 < distance_to_start2) { + //IF 1 comes before 2 + minimum_distance = SnarlDistanceIndex::minus(SnarlDistanceIndex::sum(distance_to_end1 , distance_to_start2), node_length); + } else { + minimum_distance = SnarlDistanceIndex::minus(SnarlDistanceIndex::sum(distance_to_end2 , distance_to_start1), node_length); + } + if (stop_at_lowest_common_ancestor) { + //If we only care about the lowest common ancestor, then return + return SnarlDistanceIndex::minus(minimum_distance, 1); + } + + } + + /* + * Since we want to use the minimizer payload, go up one level of the snarl tree here, before using the + * distance index. 
+ * Find the parent and the distances to the ends of the parent using the payload + */ + + //Get the parents of the nodes + net_handle_t parent1; + //If the grandparent is a root/root snarl, then make it the parent and the node a trivial chain + //because they will be clustered here and added to the root instead of being added to the + //snarl tree to be clustered + if (has_cached_values1) { + if (MIPayload::is_trivial_chain(payload1)) { + //If the node is a trivial chain, then the parent is just the node but recorded as a chain in the net handle + parent1 = distance_index.get_net_handle_from_values (distance_index.get_record_offset(net1), + SnarlDistanceIndex::START_END, + SnarlDistanceIndex::CHAIN_HANDLE, + MIPayload::node_record_offset(payload1)); + if (MIPayload::parent_record_offset(payload1) == 0) { + //If the parent offset stored in the cache is the root, then this is a trivial chain + //child of the root not in a root snarl, so remember the root as the parent and the + //trivial chain as th enode + net1 = parent1; + parent1 = distance_index.get_root(); + } else if (MIPayload::parent_is_root(payload1) && !MIPayload::parent_is_chain(payload1)) { + //If the parent is a root snarl, then the node becomes the trivial chain + //and we get the parent root snarl from the cache + net1 = parent1; + parent1 = distance_index.get_net_handle_from_values(MIPayload::parent_record_offset(payload1), + SnarlDistanceIndex::START_END, + SnarlDistanceIndex::ROOT_HANDLE); + } + } else if (MIPayload::parent_record_offset(payload1) == 0) { + //The parent is just the root + parent1 = distance_index.get_root(); + } else if (MIPayload::parent_is_root(payload1) && !MIPayload::parent_is_chain(payload1)) { + //If the parent is a root snarl + parent1 = distance_index.get_net_handle_from_values(MIPayload::parent_record_offset(payload1), + SnarlDistanceIndex::START_END, + SnarlDistanceIndex::ROOT_HANDLE); + } else { + //Otherwise the parent is an actual chain and we use the value from the cache + parent1 = distance_index.get_net_handle_from_values(MIPayload::parent_record_offset(payload1), + SnarlDistanceIndex::START_END, + SnarlDistanceIndex::CHAIN_HANDLE); + } + } else { + parent1 = distance_index.start_end_traversal_of(distance_index.get_parent(net1)); + if (distance_index.is_trivial_chain(parent1)){ + net_handle_t grandparent = distance_index.get_parent(parent1); + if (distance_index.is_root(grandparent)){ + net1 = parent1; + parent1 = distance_index.start_end_traversal_of(grandparent); + } + } + } + + net_handle_t parent2; + //If the grandparent is a root/root snarl, then make it the parent and the node a trivial chain + //because they will be clustered here and added to the root instead of being added to the + //snarl tree to be clustered + if (has_cached_values2) { + if (MIPayload::is_trivial_chain(payload2)) { + //If the node is a trivial chain, then the parent is just the node but recorded as a chain in the net handle + parent2 = distance_index.get_net_handle_from_values (distance_index.get_record_offset(net2), + SnarlDistanceIndex::START_END, + SnarlDistanceIndex::CHAIN_HANDLE, + MIPayload::node_record_offset(payload2)); + if (MIPayload::parent_record_offset(payload2) == 0) { + //If the parent offset stored in the cache is the root, then this is a trivial chain + //child of the root not in a root snarl, so remember the root as the parent and the + //trivial chain as th enode + net2 = parent2; + parent2 = distance_index.get_root(); + } else if (MIPayload::parent_is_root(payload2) && 
!MIPayload::parent_is_chain(payload2)) { + //If the parent is a root snarl, then the node becomes the trivial chain + //and we get the parent root snarl from the cache + net2 = parent2; + parent2 = distance_index.get_net_handle_from_values(MIPayload::parent_record_offset(payload2), + SnarlDistanceIndex::START_END, + SnarlDistanceIndex::ROOT_HANDLE); + } + } else if (MIPayload::parent_record_offset(payload2) == 0) { + //The parent is just the root + parent2 = distance_index.get_root(); + } else if (MIPayload::parent_is_root(payload2) && !MIPayload::parent_is_chain(payload2)) { + //If the parent is a root snarl + parent2 = distance_index.get_net_handle_from_values(MIPayload::parent_record_offset(payload2), + SnarlDistanceIndex::START_END, + SnarlDistanceIndex::ROOT_HANDLE); + } else { + //Otherwise the parent is an actual chain and we use the value from the cache + parent2 = distance_index.get_net_handle_from_values(MIPayload::parent_record_offset(payload2), + SnarlDistanceIndex::START_END, + SnarlDistanceIndex::CHAIN_HANDLE); + } + } else { + parent2 = distance_index.start_end_traversal_of(distance_index.get_parent(net2)); + if (distance_index.is_trivial_chain(parent2)){ + net_handle_t grandparent = distance_index.get_parent(parent2); + if (distance_index.is_root(grandparent)){ + net2 = parent2; + parent2 = distance_index.start_end_traversal_of(grandparent); + } + } + } + + + +#ifdef debug_distances + cerr << "Found parents " << distance_index.net_handle_as_string(parent1) << " and " << distance_index.net_handle_as_string(parent2) << endl; +#endif + + pair lowest_ancestor = distance_index.lowest_common_ancestor(parent1, parent2); + //The lowest common ancestor of the two positions + net_handle_t common_ancestor = distance_index.start_end_traversal_of(lowest_ancestor.first); + +#ifdef debug_distances + cerr << "Found the lowest common ancestor " << distance_index.net_handle_as_string(common_ancestor) << endl; +#endif + + //These are the distances to the ends of the node, including the position + size_t node_length1 = has_cached_values1 ? MIPayload::node_length(payload1) + : distance_index.minimum_length(net1); + size_t node_length2 = has_cached_values2 ? MIPayload::node_length(payload2) + : distance_index.minimum_length(net2); + size_t distance_to_start1 = is_rev(pos1) ? node_length1 - get_offset(pos1) : get_offset(pos1) + 1; + size_t distance_to_end1 = is_rev(pos1) ? get_offset(pos1) + 1 : node_length1 - get_offset(pos1); + size_t distance_to_start2 = is_rev(pos2) ? node_length2 - get_offset(pos2) : get_offset(pos2) + 1; + size_t distance_to_end2 = is_rev(pos2) ? 
get_offset(pos2) + 1 : node_length2 - get_offset(pos2); + +#ifdef debug_distances + cerr << "Reached node " << distance_index.net_handle_as_string(net1) << " for position 1" << endl; + cerr << " with distances to ends " << distance_to_start1 << " and " << distance_to_end1 << endl; + cerr << "Reached node " << distance_index.net_handle_as_string(net2) << " for position 2" << endl; + cerr << " with distances to ends " << distance_to_start2 << " and " << distance_to_end2 << endl; +#endif + /* get the distance from the ends of the nodes to the ends of the parent, and update the nodes to their parent*/ + + if (distance_index.start_end_traversal_of(parent1) == distance_index.start_end_traversal_of(parent2)) { + //If the parents are the same, then just find the distance between the nodes and return + //Find the minimum distance between the two children (net1 and net2) + if ( has_cached_values1 && MIPayload::parent_is_chain(payload1)) { + if (MIPayload::prefix_sum(payload1) < MIPayload::prefix_sum(payload2)) { + //If seed1 comes before seed2 + size_t distance_between = SnarlDistanceIndex::minus( SnarlDistanceIndex::minus(MIPayload::prefix_sum(payload2), + MIPayload::prefix_sum(payload1)), + MIPayload::node_length(payload1)); + minimum_distance = SnarlDistanceIndex::sum(distance_between, + SnarlDistanceIndex::sum(MIPayload::is_reversed(payload1) ? distance_to_start1 : distance_to_end1, + MIPayload::is_reversed(payload2) ? distance_to_end2 : distance_to_start2)); + } else { + size_t distance_between = SnarlDistanceIndex::minus( SnarlDistanceIndex::minus(MIPayload::prefix_sum(payload1), + MIPayload::prefix_sum(payload2)), + MIPayload::node_length(payload2)); + minimum_distance = SnarlDistanceIndex::sum(distance_between, + SnarlDistanceIndex::sum(MIPayload::is_reversed(payload2) ? distance_to_start2 : distance_to_end2, + MIPayload::is_reversed(payload1) ? distance_to_end1 : distance_to_start1)); + } + } else { + //Otherwise, the parent is a snarl and the distances are found with the index + size_t distance_start_start = distance_index.distance_in_parent(parent1, distance_index.flip(net1), distance_index.flip(net2), graph); + size_t distance_start_end = distance_index.distance_in_parent(parent1, distance_index.flip(net1), net2, graph); + size_t distance_end_start = distance_index.distance_in_parent(parent1, net1, distance_index.flip(net2), graph); + size_t distance_end_end = distance_index.distance_in_parent(parent1, net1, net2, graph); + + //And add those to the distances we've found to get the minimum distance between the positions + minimum_distance = std::min(SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(distance_start_start , distance_to_start1), distance_to_start2), + std::min(SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(distance_start_end , distance_to_start1), distance_to_end2), + std::min(SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(distance_end_start , distance_to_end1), distance_to_start2), + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(distance_end_end , distance_to_end1), distance_to_end2)))); + } + if (stop_at_lowest_common_ancestor) { + return minimum_distance == std::numeric_limits::max() ? 
std::numeric_limits::max() + : minimum_distance - 1; + } + } + + //Otherwise, find the distances to the ends of the parents, update them, and continue + //only if the parent isn't the common ancestor + if (parent1 != common_ancestor && !distance_index.is_root(parent1)) { + if (has_cached_values1 && MIPayload::parent_is_chain(payload1) && !MIPayload::is_trivial_chain(payload1)) { + size_t distance_to_chain_start = MIPayload::prefix_sum(payload1); + size_t distance_to_chain_end = SnarlDistanceIndex::minus(SnarlDistanceIndex::minus(distance_index.minimum_length(parent1), + MIPayload::prefix_sum(payload1)), MIPayload::node_length(payload1)); + size_t old_distance_to_start = distance_to_start1; + size_t old_distance_to_end = distance_to_end1; +#ifdef debug_distances + cerr << "\tUsing cache to update to ends of chain1 using distances " << distance_to_chain_start << " and " << distance_to_chain_end << endl; +#endif + + distance_to_start1 = SnarlDistanceIndex::sum(distance_to_chain_start, + MIPayload::is_reversed(payload1) ? old_distance_to_end : old_distance_to_start); + distance_to_end1 = SnarlDistanceIndex::sum(distance_to_chain_end, + MIPayload::is_reversed(payload1) ? old_distance_to_start : old_distance_to_end); + } else { + update_distances(net1, parent1, distance_to_start1, distance_to_end1); + } + net1 = std::move(parent1); + } + if (parent2 != common_ancestor && !distance_index.is_root(parent2)) { + if (has_cached_values2 && MIPayload::parent_is_chain(payload2) && !MIPayload::is_trivial_chain(payload2)) { + size_t distance_to_chain_start = MIPayload::prefix_sum(payload2); + size_t distance_to_chain_end = SnarlDistanceIndex::minus(SnarlDistanceIndex::minus(distance_index.minimum_length(parent2), + MIPayload::prefix_sum(payload2)), MIPayload::node_length(payload2)); + size_t old_distance_to_start = distance_to_start2; + size_t old_distance_to_end = distance_to_end2; +#ifdef debug_distances + cerr << "\tUsing cache to update to ends of chain2 using distances " << distance_to_chain_start << " and " << distance_to_chain_end << endl; +#endif + + distance_to_start2 = SnarlDistanceIndex::sum(distance_to_chain_start, + MIPayload::is_reversed(payload2) ? old_distance_to_end : old_distance_to_start); + distance_to_end2 = SnarlDistanceIndex::sum(distance_to_chain_end, + MIPayload::is_reversed(payload2) ? 
old_distance_to_start : old_distance_to_end); + + } else { + update_distances(net2, parent2, distance_to_start2, distance_to_end2); + } + net2 = std::move(parent2); + } + + + +#ifdef debug_distances + cerr << "Updated to parents" << endl; + cerr << "Reached node " << distance_index.net_handle_as_string(net1) << " for position 1" << endl; + cerr << " with distances to ends " << distance_to_start1 << " and " << distance_to_end1 << endl; + cerr << "Reached node " << distance_index.net_handle_as_string(net2) << " for position 2" << endl; + cerr << " with distances to ends " << distance_to_start2 << " and " << distance_to_end2 << endl; +#endif + + + + if (!lowest_ancestor.second) { + //If these are not in the same connected component +#ifdef debug_distances + cerr << "These are in different connected components" << endl; +#endif + return std::numeric_limits::max(); + } + + /* + * Walk up the snarl tree until net1 and net2 are children of the lowest common ancestor + * Keep track of the distances to the ends of the net handles as we go + */ + + if (distance_index.start_end_traversal_of(net1) == distance_index.start_end_traversal_of(net2)){ + if (SnarlDistanceIndex::sum(distance_to_end1 , distance_to_start2) > distance_index.minimum_length(net1) && + SnarlDistanceIndex::sum(distance_to_end1 , distance_to_start2) != std::numeric_limits::max()) { + //If the positions are on the same node and are pointing towards each other, then + //check the distance between them in the node + minimum_distance = SnarlDistanceIndex::minus(SnarlDistanceIndex::sum(distance_to_end1 , distance_to_start2), + distance_index.minimum_length(net1)); + } + if (SnarlDistanceIndex::sum(distance_to_start1 , distance_to_end2) > distance_index.minimum_length(net1) && + SnarlDistanceIndex::sum(distance_to_start1 , distance_to_end2) != std::numeric_limits::max()) { + minimum_distance = std::min(SnarlDistanceIndex::minus(SnarlDistanceIndex::sum(distance_to_start1 , distance_to_end2), + distance_index.minimum_length(net1)), + minimum_distance); + } + if (!stop_at_lowest_common_ancestor) { + common_ancestor = distance_index.start_end_traversal_of(distance_index.get_parent(net1)); + } + + + } else { + + //Get the distance from position 1 up to the ends of a child of the common ancestor +#ifdef debug_distances + cerr << "Reaching the children of the lowest common ancestor for first position..." << endl; +#endif + while (distance_index.start_end_traversal_of(distance_index.get_parent(net1)) != common_ancestor && !distance_index.is_root(distance_index.get_parent(net1))) { + net_handle_t parent = distance_index.start_end_traversal_of(distance_index.get_parent(net1)); + update_distances(net1, parent, distance_to_start1, distance_to_end1); + net1 = parent; + } +#ifdef debug_distances + cerr << "Reached node " << distance_index.net_handle_as_string(net1) << " for position 1" << endl; + cerr << " with distances to ends " << distance_to_start1 << " and " << distance_to_end1 << endl; + cerr << "Reaching the children of the lowest common ancestor for position 2..." 
<< endl; +#endif + //And the same for position 2 + while (distance_index.start_end_traversal_of(distance_index.get_parent(net2)) != distance_index.start_end_traversal_of(common_ancestor) && !distance_index.is_root(distance_index.get_parent(net2))) { + net_handle_t parent = distance_index.start_end_traversal_of(distance_index.get_parent(net2)); + update_distances(net2, parent, distance_to_start2, distance_to_end2); + net2 = parent; + } +#ifdef debug_distances + cerr << "Reached node " << distance_index.net_handle_as_string(net2) << " for position 2" << endl; + cerr << " with distances to ends " << distance_to_start2 << " and " << distance_to_end2 << endl; +#endif + } + if (stop_at_lowest_common_ancestor) { + + return minimum_distance == std::numeric_limits::max() ? std::numeric_limits::max() : minimum_distance-1; + } + + /* + * common_ancestor is now the lowest common ancestor of both net handles, and + * net1 and net2 are both children of common_ancestor + * Walk up to the root and check for distances between the positions within each + * ancestor + */ + + while (!distance_index.is_root(net1)){ +#ifdef debug_distances + cerr << "At common ancestor " << distance_index.net_handle_as_string(common_ancestor) << endl; + cerr << " with distances for child 1 (" << distance_index.net_handle_as_string(net1) << "): " << distance_to_start1 << " " << distance_to_end1 << endl; + cerr << " child 2 (" << distance_index.net_handle_as_string(net2) << "): " << distance_to_start2 << " " << distance_to_end2 << endl; +#endif + + //Find the minimum distance between the two children (net1 and net2) + size_t distance_start_start = distance_index.distance_in_parent(common_ancestor, distance_index.flip(net1), distance_index.flip(net2), graph); + size_t distance_start_end = distance_index.distance_in_parent(common_ancestor, distance_index.flip(net1), net2, graph); + size_t distance_end_start = distance_index.distance_in_parent(common_ancestor, net1, distance_index.flip(net2), graph); + size_t distance_end_end = distance_index.distance_in_parent(common_ancestor, net1, net2, graph); + + //And add those to the distances we've found to get the minimum distance between the positions + minimum_distance = std::min(minimum_distance, + std::min(SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(distance_start_start , distance_to_start1), distance_to_start2), + std::min(SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(distance_start_end , distance_to_start1), distance_to_end2), + std::min(SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(distance_end_start , distance_to_end1), distance_to_start2), + SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(distance_end_end , distance_to_end1), distance_to_end2))))); + +#ifdef debug_distances + cerr << " Found distances between nodes: " << distance_start_start << " " << distance_start_end << " " << distance_end_start << " " << distance_end_end << endl; + cerr << " best distance is " << minimum_distance << endl; +#endif + if (!distance_index.is_root(common_ancestor)) { + //Update the distances to reach the ends of the common ancestor + update_distances(net1, common_ancestor, distance_to_start1, distance_to_end1); + update_distances(net2, common_ancestor, distance_to_start2, distance_to_end2); + + //Update which net handles we're looking at + net1 = common_ancestor; + net2 = common_ancestor; + common_ancestor = distance_index.start_end_traversal_of(distance_index.get_parent(common_ancestor)); + } else { + //Just update this one to break out of the loop + net1 = common_ancestor; + } + } + + 
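Throughout this function the running distances include both seed positions themselves, which is why the value returned just below subtracts one. A small worked example of the same-node case handled near the top of the function; the node length and offsets are invented purely for illustration.

```cpp
// Illustrative only: a hypothetical 10 bp node with two forward-strand seeds at offsets 2 and 7,
// mirroring the same-node arithmetic in distance_between_seeds.
#include <cassert>
#include <cstddef>

int main() {
    size_t node_length = 10, offset1 = 2, offset2 = 7;
    size_t distance_to_end1   = node_length - offset1; // 8, includes the base at offset1
    size_t distance_to_start2 = offset2 + 1;           // 8, includes the base at offset2
    // The inclusive distance counts both seed bases, so the node length is subtracted once...
    size_t inclusive = distance_to_end1 + distance_to_start2 - node_length; // 6
    // ...and the reported distance drops one of the two endpoints, giving the expected 7 - 2 = 5.
    assert(inclusive - 1 == offset2 - offset1);
    return 0;
}
```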
//minimum distance currently includes both positions + return minimum_distance == std::numeric_limits::max() ? std::numeric_limits::max() : minimum_distance-1; +} + + +} + diff --git a/src/snarl_seed_clusterer.hpp b/src/snarl_seed_clusterer.hpp new file mode 100644 index 00000000000..23d49f2ae8a --- /dev/null +++ b/src/snarl_seed_clusterer.hpp @@ -0,0 +1,484 @@ +#ifndef VG_SNARL_SEED_CLUSTERER_HPP_INCLUDED +#define VG_SNARL_SEED_CLUSTERER_HPP_INCLUDED + +#include "snarls.hpp" +#include "snarl_distance_index.hpp" +#include "hash_map.hpp" +#include "small_bitset.hpp" +#include + + +namespace vg{ + + +/** + * SnarlDistanceIndexClusterer is used for clustering seeds (positions on the graph) + * A "cluster" is a partition of seeds that is based on the minimum distance between them in the graph + * Consider a graph where each seed is a node and two seeds are connected if the minimum distance + * between them is smaller than a given distance limit. Each connected component of this graph is a cluster + * + * The clustering algorithm is based on the snarl tree + * Clusters are formed on nodes of the snarl tree, which represent nodes/snarls/chains + * Each node/snarl/chain represents a subgraph of the variation graph + * A clustered snarl tree node contains all seeds that occur on its subgraph, and the seeds have been partitioned into clusters + * Each cluster knows the shortest distance from any seed it contains to both ends of the snarl tree node containing it + * Clustering is done progressively by walking up the snarl tree and forming clusters on each snarl tree node (only visiting nodes that have seeds on them) + * At each snarl tree node, assume that its children have already been clustered. + * The clusters of the children are compared to each other, and any pair that are close enough + * are combined to produce clusters on the parent + * The distances from each cluster to the ends of the parent are updated + * + * The algorithm starts by assigning each seed to its node on the snarl tree + * Since nodes are all on chains, this fills in all the children of chains that are nodes + * It then walks up the snarl tree, level by level, and clusters each snarl tree node that contains seeds + * At a given level, first cluster each chain in the level. After clustering a chain, assign it + * to its parent snarl. Then, go through each of the snarls that have just been given children, and + * cluster the snarls. Each snarl then gets assigned to its parent chain + * This completes one level of the snarl tree. Each chain in the next level has just been populated by the snarls + * from this level, and already knew about its nodes from the first step, so it is ready to be clustered + * + * Every time the clusterer is run, a ClusteringProblem is made to store information about the state of the clusterer + * The ClusteringProblem keeps track of which level of the snarl tree is currently being clustered, and + * keeps track of the children of the current and next level of the snarl tree. + * Each snarl tree node that contains seeds is represented by a SnarlTreeNodeProblem. + * The SnarlTreeNodeProblem represents the problem of clustering one snarl tree node. + * It knows the identities of its children and keeps track of its cluster heads + * + * + * + */ +class SnarlDistanceIndexClusterer { + + + + public: + + /// Seed information used in Giraffe. + struct Seed { + pos_t pos; + size_t source; // Source minimizer. 
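The class comment above defines a cluster as a connected component of an implicit graph whose vertices are seeds and whose edges join seed pairs closer than the distance limit. Here is a self-contained illustration of that definition with a tiny union-find, where one-dimensional positions and `|a - b|` stand in for the minimum graph distance that the real clusterer reads off the snarl tree; the `UnionFind` here is a toy substitute for `structures::UnionFind`.

```
#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

// Toy union-find standing in for structures::UnionFind.
struct UnionFind {
    std::vector<size_t> parent;
    explicit UnionFind(size_t n) : parent(n) { std::iota(parent.begin(), parent.end(), 0); }
    size_t find(size_t x) { return parent[x] == x ? x : parent[x] = find(parent[x]); }
    void unite(size_t a, size_t b) { parent[find(a)] = find(b); }
};

int main() {
    // Pretend these are seed positions and that |a - b| is the minimum graph distance.
    std::vector<size_t> seed_pos = {5, 8, 50, 53, 54, 200};
    size_t distance_limit = 10;

    UnionFind uf(seed_pos.size());
    for (size_t i = 0; i < seed_pos.size(); i++) {
        for (size_t j = i + 1; j < seed_pos.size(); j++) {
            size_t d = seed_pos[i] > seed_pos[j] ? seed_pos[i] - seed_pos[j]
                                                 : seed_pos[j] - seed_pos[i];
            if (d < distance_limit) {
                // Close enough: same cluster.
                uf.unite(i, j);
            }
        }
    }

    // Expect seeds 0 and 1 together, 2 through 4 together, and 5 on its own.
    for (size_t i = 0; i < seed_pos.size(); i++) {
        std::cout << "seed " << i << " -> cluster head " << uf.find(i) << std::endl;
    }
    return 0;
}
```

The real algorithm avoids this all-pairs comparison: as described above, it only compares clusters that meet at the same snarl tree node while walking up the tree level by level.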
+        gbwtgraph::Payload minimizer_cache = MIPayload::NO_CODE; //minimizer payload
+    };
+
+    /// Seed information used for clustering
+    // Corresponds to one seed and stores the minimizer payload and distance information
+    // that gets updated during clustering
+    // TODO: This will copy information from the seed; since we need per-seed information anyway
+    // and some of it needs to be mutable, it's simpler than keeping around two collections of Seeds
+    struct SeedCache{
+
+        pos_t pos;
+
+        //TODO: This gets copied because it needs to be mutable
+        //Cached values from the minimizer
+        //Use MIPayload::node_record_offset(minimizer_cache), etc to get values
+        gbwtgraph::Payload minimizer_cache;
+
+        //The distances to the left and right of whichever cluster this seed represents
+        //This gets updated as clustering proceeds
+        //For a seed in a chain, distance_left is the distance to the left of the chain, and distance_right
+        //is the distance to the right side of the node, relative to the chain
+        size_t distance_left = std::numeric_limits<size_t>::max();
+        size_t distance_right = std::numeric_limits<size_t>::max();
+        size_t chain_component = std::numeric_limits<size_t>::max();
+
+    };
+
+    /// Cluster information used in Giraffe.
+    struct Cluster {
+        std::vector<size_t> seeds; // Seed ids.
+        size_t fragment; // Fragment id.
+        double score; // Sum of scores of distinct source minimizers of the seeds.
+        double coverage; // Fraction of read covered by the seeds.
+        SmallBitset present; // Minimizers that are present in the cluster.
+    };
+
+    SnarlDistanceIndexClusterer(const SnarlDistanceIndex& distance_index, const HandleGraph* graph);
+    SnarlDistanceIndexClusterer(const SnarlDistanceIndex* distance_index, const HandleGraph* graph);
+    SnarlDistanceIndexClusterer(const SnarlDistanceIndex& distance_index);
+    SnarlDistanceIndexClusterer(const SnarlDistanceIndex* distance_index);
+
+
+    /* Given a vector of seeds and a distance limit,
+     * cluster the seeds such that two seeds whose minimum distance
+     * between them (including both of the positions) is less than
+     * the distance limit are in the same cluster.
+     * This produces a vector of clusters.
+     */
+    vector<Cluster> cluster_seeds(const vector<Seed>& seeds, size_t read_distance_limit) const;
+
+    /* The same thing, but for paired-end reads.
+     * Given seeds from multiple reads of a fragment, cluster each read
+     * by the read distance limit and all seeds by the fragment distance limit.
+     * fragment_distance_limit must be greater than read_distance_limit.
+     * Returns a vector of clusters for each read, where each cluster also has an assignment
+     * to a fragment cluster.
+     * Requires that there are only two reads per fragment (all_seeds.size() == 2, meaning paired-end reads);
+     * this requirement is just because std::pair is used to represent the two reads, but it could be
+     * changed to a vector if we ever have to map more than two reads per fragment.
+     */
+
+    vector<vector<Cluster>> cluster_seeds(
+        const vector<vector<Seed>>& all_seeds,
+        size_t read_distance_limit, size_t fragment_distance_limit=0) const;
+
+
+    /**
+     * Find the minimum distance between two seeds.
This will use the minimizer payload when possible + */ + size_t distance_between_seeds(const Seed& seed1, const Seed& seed2, + bool stop_at_lowest_common_ancestor) const; + + private: + + + //Actual clustering function that takes a vector of pointers to seeds + //fragment_distance_limit defaults to 0, meaning that we don't cluster by fragment + tuple, structures::UnionFind> cluster_seeds_internal ( + vector*>& all_seeds, + size_t read_distance_limit, size_t fragment_distance_limit=0) const; + + const SnarlDistanceIndex& distance_index; + const HandleGraph* graph; + + + /* + * This struct is used to store the clustering information about one + * snarl tree node (node/snarl/chain) + * + * It knows the cluster heads of the clusters on the node + * and the minimum distance from any seed in each cluster to the ends of the node + * If the node is a snarl, then the distances stored are to the boundary nodes but + * don't include the lengths of the boundary nodes; if the node is a node or chain, + * then the distances include the boundary nodes + * + * Relevant children of the snarl tree node are stored as SnarlTreeChild's, which + * may represent a seed (if the parent is a chain) or another snarl tree node + * The list of children is unsorted and must be sorted before clustering a chain + * + * This also stores additional information about the snarl tree node from the distance index + * including the distance from the ends of the node to the ends of the parent + */ + struct SnarlTreeNodeProblem { + + //set of the indices of heads of clusters (group ids in the + //union find) + //pair of + hash_set> read_cluster_heads; + + //Struct to store one child, which may be a seed, node, snarl, or chain + struct SnarlTreeChild { + //If the net_handle is a node, then the child is a seed, otherwise the handle + //is used to find the problem + net_handle_t net_handle; + pair seed_indices; + + //The values used to sort the children of a chain + //Storing it here is faster than looking it up each time + size_t chain_component; + size_t prefix_sum; + //Is this child a seed + //This is redundant with net_handle because any net_handle_t that is a node will really be a seed, + //but it's faster than looking it up in the distance index + bool is_seed; + //Have chain_component and prefix_sum been set? 
+ //For a seed, it gets set when the child is made, otherwise the first time this + //child is seen when sorting + bool has_chain_values; + }; + //The children of this snarl tree node + //Initially unsorted, sort before clustering for chains + vector children; + + //The shortest distance from any seed in any cluster to the + //left/right end of the snarl tree node that contains these + //clusters + pair read_best_left = make_pair(std::numeric_limits::max(), std::numeric_limits::max()); + pair read_best_right = make_pair(std::numeric_limits::max(), std::numeric_limits::max()); + size_t fragment_best_left = std::numeric_limits::max(); + size_t fragment_best_right = std::numeric_limits::max(); + + //Distance from the start of the parent to the left of this node, etc + size_t distance_start_left = std::numeric_limits::max(); + size_t distance_start_right = std::numeric_limits::max(); + size_t distance_end_left = std::numeric_limits::max(); + size_t distance_end_right = std::numeric_limits::max(); + + //The snarl tree node that the clusters are on + net_handle_t containing_net_handle; + + //The parent and grandparent of containing_net_handle, which might or might not be set + //This is just to store information from the minimizer cache + net_handle_t parent_net_handle; + net_handle_t grandparent_net_handle; + + //The boundary node of containing_net_handle, for a snarl or chain + //if it is a snarl, then this is the actual node, not the sentinel + net_handle_t end_in; + + //Minimum length of a node or snarl + //If it is a chain, then it is distance_index.chain_minimum_length(), which is + //the expected length for a normal chain, and the length of the + //last component for a multicomponent chain + size_t node_length = std::numeric_limits::max(); + size_t prefix_sum_value = std::numeric_limits::max(); //of node or first node in snarl + size_t chain_component_start = 0; //of node or start of snarl + size_t chain_component_end = 0; //of node or end of snarl + + size_t loop_left = std::numeric_limits::max(); + size_t loop_right = std::numeric_limits::max(); + + //These are sometimes set if the value was in the cache + bool has_parent_handle = false; + bool has_grandparent_handle = false; + + //Only set this for nodes or snarls in chains + bool is_reversed_in_parent = false; + + bool is_trivial_chain = false; + bool is_looping_chain = false; + + + + + //Constructor + //read_count is the number of reads in a fragment (2 for paired end) + SnarlTreeNodeProblem( net_handle_t net, size_t read_count, size_t seed_count, const SnarlDistanceIndex& distance_index) : + containing_net_handle(std::move(net)), + fragment_best_left(std::numeric_limits::max()), fragment_best_right(std::numeric_limits::max()){ + read_cluster_heads.reserve(seed_count); + } + //Constructor for a node or trivial chain, used to remember information from the cache + SnarlTreeNodeProblem( net_handle_t net, size_t read_count, size_t seed_count, bool is_reversed_in_parent, size_t node_length, size_t prefix_sum, size_t component) : + containing_net_handle(net), + is_reversed_in_parent(is_reversed_in_parent), + node_length(node_length), + prefix_sum_value(prefix_sum), + chain_component_start(component), + chain_component_end(component), + fragment_best_left(std::numeric_limits::max()), fragment_best_right(std::numeric_limits::max()){ + read_cluster_heads.reserve(seed_count); + } + + //Set the values needed to cluster a chain + void set_chain_values(const SnarlDistanceIndex& distance_index) { + is_looping_chain = 
distance_index.is_looping_chain(containing_net_handle); + node_length = distance_index.chain_minimum_length(containing_net_handle); + end_in = distance_index.get_bound(containing_net_handle, true, true); + chain_component_end = distance_index.get_chain_component(end_in, true); + } + + //Set the values needed to cluster a snarl + void set_snarl_values(const SnarlDistanceIndex& distance_index) { + node_length = distance_index.minimum_length(containing_net_handle); + net_handle_t start_in = distance_index.get_node_from_sentinel(distance_index.get_bound(containing_net_handle, false, true)); + end_in = distance_index.get_node_from_sentinel(distance_index.get_bound(containing_net_handle, true, true)); + chain_component_start = distance_index.get_chain_component(start_in); + chain_component_end = distance_index.get_chain_component(end_in); + prefix_sum_value = SnarlDistanceIndex::sum( + distance_index.get_prefix_sum_value(start_in), + distance_index.minimum_length(start_in)); + loop_right = SnarlDistanceIndex::sum(distance_index.get_forward_loop_value(end_in), + 2*distance_index.minimum_length(end_in)); + //Distance to go backward in the chain and back + loop_left = SnarlDistanceIndex::sum(distance_index.get_reverse_loop_value(start_in), + 2*distance_index.minimum_length(start_in)); + + + } + + }; + + //These will be the cluster heads and distances for a cluster + struct ClusterHead { + size_t read_num = std::numeric_limits::max(); + size_t cluster_num = 0; + size_t distance_left = 0; + size_t distance_right = 0; + + inline ClusterHead() {} + inline ClusterHead(const size_t& read_num, const size_t& cluster_num, + const size_t& distance_left, const size_t& distance_right) : + read_num(read_num), cluster_num(cluster_num), + distance_left(distance_left), distance_right(distance_right) {} + }; + + + /* Hold all the tree relationships, seed locations, and cluster info + * for the current level of the snarl tree and the parent level + * As clustering occurs at the current level, the parent level + * is updated to know about its children + * + * One "level" is the chains at that level, and their parent snarls. + * Clustering one level means clustering the chains and then clustering the + * parent snarls. The parent snarls then get assigned to their parent chains, + * and ClusteringProblem gets reset for the next level (parent chains) + */ + struct ClusteringProblem { + + //Vector of all the seeds for each read + vector*>* all_seeds; + + //prefix sum vector of the number of seeds per read + //Used to get the index of a seed for the fragment clusters + //Also use this so that data structures that store information per seed can be single + //vectors, instead of a vector of vectors following the structure of all_seeds + //since it uses less memory allocation to use a single vector + vector seed_count_prefix_sum; + + //The distance limits. + //If the minimum distance between two seeds is less than this, + //they get put in the same cluster + size_t read_distance_limit; + size_t fragment_distance_limit; + + + //////////Data structures to hold clustering information + + //Structure to hold the clustering of the seeds + vector read_union_find; + //The indices of seeds in the union find are the indices if you appended each of + //the vectors of seeds for the fragment (i.e. 
if a seed in the second read is + //at index x in the second vector of seeds, then its index in fragment_union_find + //is x + the length of the first vector of seeds) + structures::UnionFind fragment_union_find; + + + + //////////Data structures to hold snarl tree relationships + //The snarls and chains get updated as we move up the snarl tree + + //Maps each net_handle_t to an index to its node problem, in all_node_problems + hash_map net_handle_to_node_problem_index; + //This stores all the snarl tree nodes and their clustering scratch work + vector all_node_problems; + + //All chains for the current level of the snarl tree and gets updated as the algorithm + //moves up the snarl tree. At one iteration, the algorithm will go through each chain + //in chain to children and cluster the chain using clusters on the children + vector* current_chains; + + + //Same as current_chains but for the level of the snarl + //tree above the current one + //This gets updated as the current level is processed - the snarls from this level + //are added as children to parent_chain_to_children. + //After processing one level, this becomes the next chain_to_children + vector* parent_chains; + + //All snarls for the current level of the snarl tree + //(chains from chain_to_children get added to their parent snarls, snarls get added to parent_snarls + //then all snarls in snarl_to_children are clustered and added to parent_chain_to_children) + vector parent_snarls; + + + //This holds all the child problems of the root + //Each pair is the parent and the child. This will be sorted by parent before + //clustering + vector> root_children; + + + ///////////////////////////////////////////////////////// + + //Constructor takes in a pointer to the seeds, the distance limits, and + //the total number of seeds in all_seeds + ClusteringProblem (vector*>* all_seeds, + size_t read_distance_limit, size_t fragment_distance_limit, size_t seed_count) : + all_seeds(all_seeds), + read_distance_limit(read_distance_limit), + fragment_distance_limit(fragment_distance_limit), + fragment_union_find (seed_count, false), + seed_count_prefix_sum(1,0){ + + for (size_t i = 0 ; i < all_seeds->size() ; i++) { + size_t size = all_seeds->at(i)->size(); + size_t offset = seed_count_prefix_sum.back() + size; + seed_count_prefix_sum.push_back(offset); + read_union_find.emplace_back(size, false); + + } + + net_handle_to_node_problem_index.reserve(5*seed_count); + all_node_problems.reserve(5*seed_count); + root_children.reserve(seed_count); + } + }; + + //Go through all the seeds and assign them to their parent chains or roots + //If a node is in a chain, then assign it to its parent chain and add the parent + //chain to chain_to_children_by_level + //If a node is a child of the root or of a root snarl, then add cluster it and + //remember to cluster the root snarl + void get_nodes( ClusteringProblem& clustering_problem, + vector>& chains_by_level) const; + + + //Cluster all the snarls at the current level + void cluster_snarl_level(ClusteringProblem& clustering_problem) const; + + //Cluster all the chains at the current level + //also assigns each chain to its parent and saves the distances to the ends of the parent + //for each chain + void cluster_chain_level(ClusteringProblem& clustering_problem, size_t depth) const; + + //Cluster the seeds on the specified node + //The seeds are unsorted + void cluster_one_node(ClusteringProblem& clustering_problem, SnarlTreeNodeProblem* node_problem) const; + + //Cluster the seeds in a snarl + void 
cluster_one_snarl(ClusteringProblem& clustering_problem, SnarlTreeNodeProblem* snarl_problem) const; + + //Cluster the seeds in a chain + //The children are unsorted, they get sorted before clustering + //If the children of the chain are only seeds on nodes, then cluster as if it is a node + void cluster_one_chain(ClusteringProblem& clustering_problem, SnarlTreeNodeProblem* chain_problem, + bool is_top_level_chain) const; + + //Helper function for adding the next seed to the chain clusters + void add_seed_to_chain_problem(ClusteringProblem& clustering_problem, SnarlTreeNodeProblem* chain_problem, + SnarlTreeNodeProblem::SnarlTreeChild& last_child, + size_t& last_prefix_sum, size_t& last_length, size_t& last_chain_component_end, + vector& cluster_heads_to_add_again, + bool& found_first_node, pair& found_first_node_by_read, + const SnarlTreeNodeProblem::SnarlTreeChild& current_child, bool is_first_child, bool is_last_child, + bool skip_distances_to_ends) const; + + //Helper function for adding the next snarl to the chain clusters + void add_snarl_to_chain_problem(ClusteringProblem& clustering_problem, SnarlTreeNodeProblem* chain_problem, + SnarlTreeNodeProblem::SnarlTreeChild& last_child, + size_t& last_prefix_sum, size_t& last_length, size_t& last_chain_component_end, + vector& cluster_heads_to_add_again, + bool& found_first_node, pair& found_first_node_by_read, + const SnarlTreeNodeProblem::SnarlTreeChild& current_child, bool is_first_child, bool is_last_child, + bool skip_distances_to_ends) const; + + //Cluster in the root - everything in clustering_problem.root_children + void cluster_root(ClusteringProblem& clustering_problem) const; + + //Cluster a list of seeds (SeedIndexes) that are on a single linear structure (node or chain) + //Requires that the list of seeds are sorted relative to their position on the structure + //This can be called on a chain if there are no nested seeds on the chain + //left offset is the distance from the left side of the structure + //if include_prefix_sum, then this is being called on a chain and the prefix sum must be added to the + //distance_left of the seeds + void cluster_seeds_on_linear_structure(ClusteringProblem& clustering_problem, + SnarlTreeNodeProblem* problem, size_t structure_length, bool include_prefix_sum, + bool skip_distances_to_ends) const; + + //Compare two children of the parent and combine their clusters, to create clusters in the parent + //child_distances contains the distances for cluster heads in the children, + //since the distances in the seeds will get updated to be the distances in the parent + //First child is true if this is the first time we see child_problem1. 
If first_child is true and this is + //a snarl, then we need to update the snarl's distances to its parents + void compare_and_combine_cluster_on_child_structures(ClusteringProblem& clustering_problem, SnarlTreeNodeProblem* child_problem1, + SnarlTreeNodeProblem* child_problem2, SnarlTreeNodeProblem* parent_problem, + const vector>& child_distances, bool is_root, bool first_child) const; + + //The same as above, but compare clusters on a single child + //This assumes that the child is the child of the root and not a root snarl + //so we just look at external distances + void compare_and_combine_cluster_on_one_child(ClusteringProblem& clustering_problem, SnarlTreeNodeProblem* child_problem) const; + +}; +} + +#endif diff --git a/src/snarls.cpp b/src/snarls.cpp index 9bd23685cc7..abaa5076815 100644 --- a/src/snarls.cpp +++ b/src/snarls.cpp @@ -1,321 +1,238 @@ -// -// snarls.cpp -// -// +/// +/// \file snarls.cpp +/// +/// -//#define debug +// #define debug + +#include #include "snarls.hpp" -#include "json2pb.h" -#include "algorithms/topological_sort.hpp" -#include "algorithms/is_acyclic.hpp" +#include "vg/io/json2pb.h" +#include "subgraph_overlay.hpp" namespace vg { -CactusSnarlFinder::CactusSnarlFinder(VG& graph) : - graph(graph) { - // Make sure the graph is sorted. - algorithms::sort(&graph); +SnarlManager SnarlFinder::find_snarls_parallel() { + // By default, just use a single thread, unless this finder has a parallel + // overriding implementation. + return find_snarls(); } -CactusSnarlFinder::CactusSnarlFinder(VG& graph, const string& hint_path) : - CactusSnarlFinder(graph) { - - // Save the hint path - hint_paths.insert(hint_path); - - // TODO: actually use it +HandleGraphSnarlFinder::HandleGraphSnarlFinder(const HandleGraph* graph) : graph(graph) { + // Nothing to do! } -SnarlManager CactusSnarlFinder::find_snarls() { - - if (graph.size() == 0) { - // No snarls here! - return SnarlManager(); - } - // convert to cactus - pair cac_pair = handle_graph_to_cactus(graph, hint_paths); - stCactusGraph* cactus_graph = cac_pair.first; - stList* telomeres = cac_pair.second; - - // get the snarl decomposition as a C struct - stSnarlDecomposition *snarls = stCactusGraph_getSnarlDecomposition(cactus_graph, telomeres); - - // Get a non-owning pointer to the list of chains (which are themselves lists of snarls). - stList* cactus_chains_list = snarls->topLevelChains; - - // And one to the list of top-level unary snarls - stList* cactus_unary_snarls_list = snarls->topLevelUnarySnarls; - - - // We'll fill this with all the snarls +SnarlManager HandleGraphSnarlFinder::find_snarls_unindexed() { + // Start with an empty SnarlManager SnarlManager snarl_manager; - // Fill the manager with all of the snarls, recursively. - recursively_emit_snarls(Visit(), Visit(), Visit(), Visit(), cactus_chains_list, cactus_unary_snarls_list, snarl_manager); - - // Free the decomposition - stSnarlDecomposition_destruct(snarls); - - // Free the telomeres - stList_destruct(telomeres); - - // free the cactus graph - stCactusGraph_destruct(cactus_graph); - - // Finish the SnarlManager - snarl_manager.finish(); - - // Return the completed SnarlManager - return snarl_manager; + // We need a stack with the information we need to translate the traversal + // into vg::Snarl and vg::Chain objects, so we can compute connectivity and + // snarl classification as we go up. + struct TranslationFrame { + // This will hold the unmanaged scratch snarl we pass to the manager. 
+ Snarl snarl; + // This will hold all the child snarls that need their parent information filled in before they can become managed. + // They are sorted by chain. + vector> child_chains; + // For creating the current chain for this frame, we need to know where the chain claimed to start. + // If the start = the end and the chain is inside a snarl, it's just a trivial chain (single node) and we drop it. + handle_t current_chain_start; + }; -} - -const Snarl* CactusSnarlFinder::recursively_emit_snarls(const Visit& start, const Visit& end, - const Visit& parent_start, const Visit& parent_end, - stList* chains_list, stList* unary_snarls_list, SnarlManager& destination) { - -#ifdef debug - cerr << "Explore snarl " << start << " -> " << end << endl; -#endif - - // This is the snarl we are filling in to add to the SnarlManger, or an - // empty snarl if we're a fake root snarl. - Snarl snarl; - - if (start.node_id() != 0 && end.node_id() != 0) { - // This is a real snarl - - // Set up the start and end - *snarl.mutable_start() = start; - *snarl.mutable_end() = end; - - if (parent_start.node_id() != 0 && parent_end.node_id() != 0) { - // We have a parent that isn't the fake root, so fill in its ends - *snarl.mutable_parent()->mutable_start() = parent_start; - *snarl.mutable_parent()->mutable_end() = parent_end; + // Stack that lets us connect snarls to their parents. + // Holds each snarl and the child snarls we have finished for it so far. + vector stack; + + traverse_decomposition([&](handle_t chain_start) { + // We got the start of a (possibly empty) chain. + if (!stack.empty()) { + // We're in a snarl, so we're a chain that we need for snarl connectivity/classification. + stack.back().current_chain_start = chain_start; + + // Allocate a place to store the snarls in the chain. + stack.back().child_chains.emplace_back(); } - } - - // This will hold the pointer to the copy of the snarl in the SnarlManager, - // or null if the snarl is a fake root and we don't add it. - const Snarl* managed = nullptr; - - // Before we can pass our snarl to the snarl manager, we need to look at all - // its children so we can get connectivity info. - - // We have a vector of the snarls made for the child snarls in each ordinary - // chain, plus trivial chains for the unary snarls. - vector child_chains; - -#ifdef debug - cerr << "Look at " << stList_length(chains_list) << " child chains" << endl; -#endif - - int chain_offset = 0; - for (int64_t i = 0; i < stList_length(chains_list); i++) { - // For each child chain - stList* cactus_chain = (stList*)stList_get(chains_list, i); - - // Make a new chain. - // We aren't going to pass it on to the snarl manager, because chains need to be recomputed for consistency. - // But we need it for computing the internal snarl connectivity. 
- child_chains.emplace_back(); - auto& chain = child_chains.back(); - -#ifdef debug - cerr << "Chain " << i << " has " << stList_length(cactus_chain) << " child snarls" << endl; -#endif - - for (int64_t j = 0; j < stList_length(cactus_chain); j++) { - // for each child snarl in the chain - stSnarl* child_snarl = (stSnarl*)stList_get(cactus_chain, j); - - // scrape the vg coordinate information out of the cactus ends where we stuck - // it during cactus construction - CactusSide* cac_child_side1 = (CactusSide*)stCactusEdgeEnd_getObject(child_snarl->edgeEnd1); - CactusSide* cac_child_side2 = (CactusSide*)stCactusEdgeEnd_getObject(child_snarl->edgeEnd2); - - // Convert from CactusSide (the interior endpoint of each node) to Visit (inward at start, outward at end) - Visit child_start; - child_start.set_node_id(cac_child_side1->node); - // Start is backward if the interior is not an end - child_start.set_backward(!cac_child_side1->is_end); - Visit child_end; - child_end.set_node_id(cac_child_side2->node); - // End is backward if the interior is an end - child_end.set_backward(cac_child_side2->is_end); - - // Recursively create a snarl for the child - const Snarl* converted_child = recursively_emit_snarls(child_start, child_end, start, end, - child_snarl->chains, child_snarl->unarySnarls, destination); - // Work out if it should be backward in the chain - bool backward_in_chain = false; - if (!chain.empty()) { - bool last_backward_in_chain = chain.back().second; - auto dangling_id = last_backward_in_chain ? chain.back().first->end().node_id() : chain.back().first->start().node_id(); - // We are backward if our end is shared with the previous snarl in the chain. - backward_in_chain = converted_child->end().node_id() == dangling_id; + }, [&](handle_t chain_end) { + // We got the end of a (possibly empty) chain. + if (!stack.empty() && stack.back().current_chain_start == chain_end) { + // We're an empty chain in an actual snarl. + // Get rid of our empty chain vector that got no snarls in it + assert(stack.back().child_chains.back().empty()); + stack.back().child_chains.pop_back(); + } + }, [&](handle_t snarl_start) { + // Stack up a snarl + stack.emplace_back(); + // And fill in its start + auto& snarl = stack.back().snarl; + snarl.mutable_start()->set_node_id(graph->get_id(snarl_start)); + snarl.mutable_start()->set_backward(graph->get_is_reverse(snarl_start)); + }, [&](handle_t snarl_end) { + // Fill in its end + auto& snarl = stack.back().snarl; + snarl.mutable_end()->set_node_id(graph->get_id(snarl_end)); + snarl.mutable_end()->set_backward(graph->get_is_reverse(snarl_end)); + + // We need to manage all our children and put them in Chain objects that net graphs can understand. + vector managed_child_chains; + + for (auto& child_chain : stack.back().child_chains) { + // For every child chain + + // Make a translated version + managed_child_chains.emplace_back(); + for (auto& child : child_chain) { + // For each child snarl, fill us in as the parent (before we have connectivity info filled in) + *child.mutable_parent() = snarl; + // And report it to the manager with the cross-reference to us filled in. + const Snarl* managed_child = snarl_manager.add_snarl(child); + // And save it in the child chain. + // We know it must be forward in the chain. + managed_child_chains.back().emplace_back(managed_child, false); } - - // And then add it to this chain. 
- chain.emplace_back(converted_child, backward_in_chain); } - } - -#ifdef debug - cerr << "Look at " << stList_length(unary_snarls_list) << " child unary snarls" << endl; -#endif - - for (int64_t i = 0; i < stList_length(unary_snarls_list); i++) { - // for each child unary snarl - stSnarl* child_snarl = (stSnarl*)stList_get(unary_snarls_list, i); - - // TODO: deduplicate this code - - // scrape the vg coordinate information out of the cactus ends where we stuck - // it during cactus construction - CactusSide* cac_child_side1 = (CactusSide*)stCactusEdgeEnd_getObject(child_snarl->edgeEnd1); - CactusSide* cac_child_side2 = (CactusSide*)stCactusEdgeEnd_getObject(child_snarl->edgeEnd2); - - // Convert from CactusSide (the interior endpoint of each node) to Visit (inward at start, outward at end) - Visit child_start; - child_start.set_node_id(cac_child_side1->node); - // Start is backward if the interior is not an end - child_start.set_backward(!cac_child_side1->is_end); - Visit child_end; - child_end.set_node_id(cac_child_side2->node); - // End is backward if the interior is an end - child_end.set_backward(cac_child_side2->is_end); - // Make a trivial chain - child_chains.emplace_back(); - auto& chain = child_chains.back(); - - // Recursively create a snarl for the child, and then add it to the trivial chain as forward - chain.emplace_back(recursively_emit_snarls(child_start, child_end, start, end, - child_snarl->chains, child_snarl->unarySnarls, destination), false); - } - - if (snarl.start().node_id() != 0 || snarl.end().node_id() != 0) { // This snarl is real, we care about type and connectivity. + // All its children are done. - // First determine connectivity - { - - // Make a net graph for the snarl that uses internal connectivity - NetGraph connectivity_net_graph(start, end, child_chains, &graph, true); - - // Evaluate connectivity - // A snarl is minimal, so we know out start and end will be normal nodes. - handle_t start_handle = connectivity_net_graph.get_handle(start.node_id(), start.backward()); - handle_t end_handle = connectivity_net_graph.get_handle(end.node_id(), end.backward()); - - // Start out by assuming we aren't connected - bool connected_start_start = false; - bool connected_end_end = false; - bool connected_start_end = false; - - // We do a couple of direcred walk searches to test connectivity. - list queue{start_handle}; - unordered_set queued{start_handle}; - auto handle_edge = [&](const handle_t& other) { + ///// + // Determine connectivity + ///// + + // Make a net graph for the snarl that uses internal connectivity + NetGraph connectivity_net_graph(snarl.start(), snarl.end(), managed_child_chains, graph, true); + + // Evaluate connectivity + // A snarl is minimal, so we know out start and end will be normal nodes. + handle_t start_handle = connectivity_net_graph.get_handle(snarl.start().node_id(), snarl.start().backward()); + handle_t end_handle = connectivity_net_graph.get_handle(snarl.end().node_id(), snarl.end().backward()); + + // Start out by assuming we aren't connected + bool connected_start_start = false; + bool connected_end_end = false; + bool connected_start_end = false; + + // We do a couple of direcred walk searches to test connectivity. 
+ list queue{start_handle}; + unordered_set queued{start_handle}; + auto handle_edge = [&](const handle_t& other) { #ifdef debug - cerr << "\tCan reach " << connectivity_net_graph.get_id(other) - << " " << connectivity_net_graph.get_is_reverse(other) << endl; + cerr << "\tCan reach " << connectivity_net_graph.get_id(other) + << " " << connectivity_net_graph.get_is_reverse(other) << endl; #endif - - // Whenever we see a new node orientation, queue it. - if (!queued.count(other)) { - queue.push_back(other); - queued.insert(other); - } - }; + // Whenever we see a new node orientation, queue it. + if (!queued.count(other)) { + queue.push_back(other); + queued.insert(other); + } + }; + #ifdef debug - cerr << "Looking for start-start turnarounds and through connections from " - << connectivity_net_graph.get_id(start_handle) << " " << - connectivity_net_graph.get_is_reverse(start_handle) << endl; + cerr << "Looking for start-start turnarounds and through connections from " + << connectivity_net_graph.get_id(start_handle) << " " << + connectivity_net_graph.get_is_reverse(start_handle) << endl; #endif + + while (!queue.empty()) { + handle_t here = queue.front(); + queue.pop_front(); - while (!queue.empty()) { - handle_t here = queue.front(); - queue.pop_front(); - - if (here == end_handle) { - // Start can reach the end - connected_start_end = true; - } - - if (here == connectivity_net_graph.flip(start_handle)) { - // Start can reach itself the other way around - connected_start_start = true; - } - - if (connected_start_end && connected_start_start) { - // No more searching needed - break; - } - - // Look at everything reachable on a proper rightward directed walk. - connectivity_net_graph.follow_edges(here, false, handle_edge); + if (here == end_handle) { + // Start can reach the end + connected_start_end = true; + } + + if (here == connectivity_net_graph.flip(start_handle)) { + // Start can reach itself the other way around + connected_start_start = true; } - auto end_inward = connectivity_net_graph.flip(end_handle); + if (connected_start_end && connected_start_start) { + // No more searching needed + break; + } + // Look at everything reachable on a proper rightward directed walk. + connectivity_net_graph.follow_edges(here, false, handle_edge); + } + + auto end_inward = connectivity_net_graph.flip(end_handle); + #ifdef debug - cerr << "Looking for end-end turnarounds from " << connectivity_net_graph.get_id(end_inward) - << " " << connectivity_net_graph.get_is_reverse(end_inward) << endl; + cerr << "Looking for end-end turnarounds from " << connectivity_net_graph.get_id(end_inward) + << " " << connectivity_net_graph.get_is_reverse(end_inward) << endl; #endif + + // Reset and search the other way from the end to see if it can find itself. + queue = {end_inward}; + queued = {end_inward}; + while (!queue.empty()) { + handle_t here = queue.front(); + queue.pop_front(); - // Reset and search the other way from the end to see if it can find itself. 
- queue = {end_inward}; - queued = {end_inward}; - while (!queue.empty()) { - handle_t here = queue.front(); - queue.pop_front(); - #ifdef debug - cerr << "Got to " << connectivity_net_graph.get_id(here) << " " - << connectivity_net_graph.get_is_reverse(here) << endl; + cerr << "Got to " << connectivity_net_graph.get_id(here) << " " + << connectivity_net_graph.get_is_reverse(here) << endl; #endif - - if (here == end_handle) { - // End can reach itself the other way around - connected_end_end = true; - break; - } - - // Look at everything reachable on a proper rightward directed walk. - connectivity_net_graph.follow_edges(here, false, handle_edge); + + if (here == end_handle) { + // End can reach itself the other way around + connected_end_end = true; + break; } - // Save the connectivity info. TODO: should the connectivity flags be - // calculated based on just the net graph, or based on actual connectivity - // within child snarls. - snarl.set_start_self_reachable(connected_start_start); - snarl.set_end_self_reachable(connected_end_end); - snarl.set_start_end_reachable(connected_start_end); + // Look at everything reachable on a proper rightward directed walk. + connectivity_net_graph.follow_edges(here, false, handle_edge); + } + + // Save the connectivity info. TODO: should the connectivity flags be + // calculated based on just the net graph, or based on actual connectivity + // within child snarls. + snarl.set_start_self_reachable(connected_start_start); + snarl.set_end_self_reachable(connected_end_end); + snarl.set_start_end_reachable(connected_start_end); #ifdef debug - cerr << "Connectivity: " << connected_start_start << " " << connected_end_end << " " << connected_start_end << endl; + cerr << "Connectivity: " << connected_start_start << " " << connected_end_end << " " << connected_start_end << endl; #endif - - - } + + ///// + // Determine tip presence + ///// - { - // Determine cyclicity/acyclicity + // Make a net graph that just pretends child snarls/chains are ordinary nodes + NetGraph flat_net_graph(snarl.start(), snarl.end(), managed_child_chains, graph); - // Make a net graph that just pretends child snarls/chains are ordinary nodes - NetGraph flat_net_graph(start, end, child_chains, &graph); - - // This definitely should be calculated based on the internal-connectivity-ignoring net graph. - snarl.set_directed_acyclic_net_graph(algorithms::is_directed_acyclic(&flat_net_graph)); + // Having internal tips in the net graph disqualifies a snarl from being an ultrabubble + auto tips = handlealgs::find_tips(&flat_net_graph); + +#ifdef debug + cerr << "Tips: " << endl; + for (auto& tip : tips) { + cerr << "\t" << flat_net_graph.get_id(tip) << (flat_net_graph.get_is_reverse(tip) ? '-' : '+') << endl; } +#endif + + // We should have at least the bounding nodes. + assert(tips.size() >= 2); + bool has_internal_tips = (tips.size() > 2); + + ///// + // Determine cyclicity/acyclicity + ///// + + // This definitely should be calculated based on the internal-connectivity-ignoring net graph. + snarl.set_directed_acyclic_net_graph(handlealgs::is_directed_acyclic(&flat_net_graph)); + + ///// + // Determine classification + ///// // Now we need to work out if the snarl can be a unary snarl or an ultrabubble or what. - if (start.node_id() == end.node_id()) { + if (snarl.start().node_id() == snarl.end().node_id()) { // Snarl has the same start and end (or no start or end, in which case we don't care). 
snarl.set_type(UNARY); #ifdef debug @@ -338,7 +255,7 @@ const Snarl* CactusSnarlFinder::recursively_emit_snarls(const Visit& start, cons } else { // See if we have all ultrabubble children bool all_ultrabubble_children = true; - for (auto& chain : child_chains) { + for (auto& chain : managed_child_chains) { for (auto& child : chain) { if (child.first->type() != ULTRABUBBLE) { all_ultrabubble_children = false; @@ -350,13 +267,18 @@ const Snarl* CactusSnarlFinder::recursively_emit_snarls(const Visit& start, cons } } - // Note that ultrabubbles *can* loop back on their start or end. - if (!all_ultrabubble_children) { // If we have non-ultrabubble children, we can't be an ultrabubble. snarl.set_type(UNCLASSIFIED); #ifdef debug cerr << "Snarl is UNCLASSIFIED because it has non-ultrabubble children" << endl; +#endif + } else if (has_internal_tips) { + // If we have internal tips, we can't be an ultrabubble + snarl.set_type(UNCLASSIFIED); + +#ifdef debug + cerr << "Snarl is UNCLASSIFIED because it contains internal tips" << endl; #endif } else if (!snarl.directed_acyclic_net_graph()) { // If all our children are ultrabubbles but we ourselves are cyclic, we can't be an ultrabubble @@ -375,13 +297,34 @@ const Snarl* CactusSnarlFinder::recursively_emit_snarls(const Visit& start, cons } } - // Now we know enough about the snarl to actually put it in the SnarlManager - managed = destination.add_snarl(snarl); + // Now we know all about our snarl, but we don't know about our parent. - } + if (stack.size() > 1) { + // We have a parent. Join it as a child, at the end of the current chain + assert(!stack[stack.size() - 2].child_chains.empty()); + stack[stack.size() - 2].child_chains.back().emplace_back(std::move(snarl)); + } else { + // Just manage ourselves now, because our parent can't manage us. + snarl_manager.add_snarl(snarl); + } + + // Leave the stack + stack.pop_back(); + }); - // Return a pointer to the managed snarl. - return managed; + // Give it back + return snarl_manager; +} + +SnarlManager HandleGraphSnarlFinder::find_snarls() { + // Find all the snarls + auto snarl_manager(find_snarls_unindexed()); + + // Index them + snarl_manager.finish(); + + // Return the finished SnarlManager + return snarl_manager; } bool start_backward(const Chain& chain) { @@ -551,18 +494,47 @@ ChainIterator chain_end_from(const Chain& chain, const Snarl* start_snarl, bool return ChainIterator(); } -// TODO: this is duplicative with the other constructor, but protobuf won't let me make -// a deserialization iterator to match its signature because its internal file streams -// disallow copy constructors -SnarlManager::SnarlManager(istream& in) { - // Add snarls to master list - for (stream::ProtobufIterator iter(in); iter.has_next(); iter.get_next()) { - // Add each snarl - add_snarl(*iter); +SnarlManager::SnarlManager(istream& in) : SnarlManager([&in](const function& consume_snarl) -> void { + // Find all the snarls in the input stream and use each of them in the callback-based constructor + for (vg::io::ProtobufIterator iter(in); iter.has_current(); iter.advance()) { + consume_snarl(*iter); } +}) { + // Nothing to do! 
+}
+
+SnarlManager::SnarlManager(const function<void(const function<void(Snarl&)>&)>& for_each_snarl) {
+    for_each_snarl([&](Snarl& snarl) {
+        // Add each snarl to us
+        add_snarl(snarl);
+    });
     // Record the tree structure and build the other indexes
     finish();
 }
+
+void SnarlManager::serialize(ostream& out) const {
+
+    vg::io::ProtobufEmitter<Snarl> emitter(out);
+    list<const Snarl*> stack;
+
+    for (const Snarl* root : top_level_snarls()) {
+        stack.push_back(root);
+
+        while (!stack.empty()) {
+            // Grab a snarl from the stack
+            const Snarl* snarl = stack.back();
+            stack.pop_back();
+
+            // Write out the snarl
+            emitter.write_copy(*snarl);
+
+            for (const Snarl* child_snarl : children_of(snarl)) {
+                // Stack up its children
+                stack.push_back(child_snarl);
+            }
+        }
+    }
+}
 const vector<const Snarl*>& SnarlManager::children_of(const Snarl* snarl) const {
     if (snarl == nullptr) {
@@ -584,6 +556,7 @@ const Snarl* SnarlManager::snarl_sharing_start(const Snarl* here) const {
     return next == here ? nullptr : next;
 }
+
 const Snarl* SnarlManager::snarl_sharing_end(const Snarl* here) const {
     // Look out the end and see what we come to
@@ -606,6 +579,16 @@ bool SnarlManager::chain_orientation_of(const Snarl* snarl) const {
     return false;
 }
+size_t SnarlManager::chain_rank_of(const Snarl* snarl) const {
+    const Chain* chain = chain_of(snarl);
+    if (chain != nullptr) {
+        // The index is a perfectly good rank.
+        return record(snarl)->parent_chain_index;
+    }
+    // If you're in a single-snarl chain you are at index 0.
+    return 0;
+}
+
 bool SnarlManager::in_nontrivial_chain(const Snarl* here) const {
     return chain_of(here)->size() > 1;
 }
@@ -668,15 +651,39 @@ bool SnarlManager::is_leaf(const Snarl* snarl) const {
 bool SnarlManager::is_root(const Snarl* snarl) const {
     return parent_of(snarl) == nullptr;
 }
+
+bool SnarlManager::is_trivial(const Snarl* snarl, const HandleGraph& graph) const {
+    // If it's an ultrabubble with no children and no contained nodes, it is a trivial snarl.
+ return snarl->type() == ULTRABUBBLE && + is_leaf(snarl) + && shallow_contents(snarl, graph, false).first.size() == 0; +} + +bool SnarlManager::all_children_trivial(const Snarl* snarl, const HandleGraph& graph) const { + for (auto& child : children_of(snarl)) { + if (!is_trivial(child, graph)) { + return false; + } + } + return true; +} const vector& SnarlManager::top_level_snarls() const { return roots; } void SnarlManager::for_each_top_level_snarl_parallel(const function& lambda) const { -#pragma omp parallel for - for (int i = 0; i < roots.size(); i++) { - lambda(roots[i]); + #pragma omp parallel + { + #pragma omp single + { + for (int i = 0; i < roots.size(); i++) { + #pragma omp task firstprivate(i) + { + lambda(roots[i]); + } + } + } } } @@ -719,6 +726,19 @@ void SnarlManager::for_each_snarl_parallel(const function& l for_each_top_level_snarl_parallel(process); } +void SnarlManager::for_each_top_level_chain(const function& lambda) const { + for (const Chain& chain : root_chains) { + lambda(&chain); + } +} + +void SnarlManager::for_each_top_level_chain_parallel(const function& lambda) const { +#pragma omp parallel for schedule(dynamic, 1) + for (size_t i = 0; i < root_chains.size(); ++i) { + lambda(&root_chains[i]); + } +} + void SnarlManager::for_each_chain(const function& lambda) const { // We define a function to run a bunch of chains in serial @@ -756,21 +776,72 @@ void SnarlManager::for_each_chain_parallel(const function& l }); } +void SnarlManager::for_each_snarl_unindexed(const function& lambda) const { + for (const SnarlRecord& snarl_record : snarls) { + lambda(unrecord(&snarl_record)); + } +} + +const Snarl* SnarlManager::discrete_uniform_sample(minstd_rand0& random_engine)const{ + // have to set the seed to the random engine in the unit tests , pass the random engine + + int number_of_snarls = num_snarls(); +#ifdef debug + cerr << "number_of_snarls "<< number_of_snarls <start().node_id() << " -> "<end().node_id() < distribution(0, number_of_snarls-1); + int random_num = distribution(random_engine); +#ifdef debug + cerr << "modifying snarl num " << random_num << endl; + if(unrecord(&snarls[random_num]) == nullptr){ + cerr << "unrecorded snarl is null" <start() << endl; + cerr << snarl->end() < snarls + int num_snarls = this->snarls.size(); + return num_snarls; + +} + void SnarlManager::flip(const Snarl* snarl) { // Get a non-const pointer to the SnarlRecord, which we own. // Allowed because we ourselves aren't const. 
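The rewritten `for_each_top_level_snarl_parallel` above switched from a static `#pragma omp parallel for` over the roots to a single generator thread that spawns one task per root, so top-level snarls of very different sizes can be load-balanced across the team. A minimal self-contained sketch of that parallel/single/task idiom follows; the workload is invented and the snippet assumes an OpenMP-enabled compiler (e.g. built with `-fopenmp`).

```
#include <cstdio>
#include <vector>
#include <omp.h>

// Stand-in for processing one top-level snarl; the work is deliberately uneven.
void process(size_t i, size_t weight) {
    volatile size_t x = 0;
    for (size_t j = 0; j < weight * 1000000; j++) {
        x += j;
    }
    std::printf("item %zu done on thread %d\n", i, omp_get_thread_num());
}

int main() {
    std::vector<size_t> weights = {1, 50, 2, 40, 3, 30, 4, 20};

    #pragma omp parallel
    {
        // One thread walks the list and creates the tasks...
        #pragma omp single
        {
            for (size_t i = 0; i < weights.size(); i++) {
                // ...and any idle thread in the team picks them up.
                #pragma omp task firstprivate(i)
                {
                    process(i, weights[i]);
                }
            }
        }
    }
    return 0;
}
```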
SnarlRecord* to_flip = (SnarlRecord*) record(snarl); + // Get the Snarl of it + Snarl* to_flip_snarl = unrecord(to_flip); // swap and reverse the start and end Visits - int64_t start_id = to_flip->start().node_id(); - bool start_orientation = to_flip->start().backward(); + int64_t start_id = to_flip_snarl->start().node_id(); + bool start_orientation = to_flip_snarl->start().backward(); - to_flip->mutable_start()->set_node_id(to_flip->end().node_id()); - to_flip->mutable_start()->set_backward(!to_flip->end().backward()); + to_flip_snarl->mutable_start()->set_node_id(to_flip_snarl->end().node_id()); + to_flip_snarl->mutable_start()->set_backward(!to_flip_snarl->end().backward()); - to_flip->mutable_end()->set_node_id(start_id); - to_flip->mutable_end()->set_backward(!start_orientation); + to_flip_snarl->mutable_end()->set_node_id(start_id); + to_flip_snarl->mutable_end()->set_backward(!start_orientation); if (to_flip->parent_chain != nullptr) { // Work out where we keep the orientation of this snarl in its parent chain @@ -810,8 +881,6 @@ void SnarlManager::flip(const Chain* chain) { } const Snarl* SnarlManager::add_snarl(const Snarl& new_snarl) { - // Don't let anyone add snarls if we are already finished. - assert(!finished); // Allocate a default SnarlRecord snarls.emplace_back(); @@ -820,6 +889,9 @@ const Snarl* SnarlManager::add_snarl(const Snarl& new_snarl) { // Hackily copy the snarl in *new_record = new_snarl; + + // Initialized snarl number for each record as deque is being filled + new_record->snarl_number = (size_t)snarls.size()-1; // TODO: Should this be a non-default SnarlRecord constructor? @@ -834,22 +906,15 @@ const Snarl* SnarlManager::add_snarl(const Snarl& new_snarl) { } void SnarlManager::finish() { - // We can only do this once - assert(!finished); - - // Mark ourselves finished so nobody can add more snarls. - // Do it before indexing so we can use out partly built indexes without hitting asserts. - finished = true; - // Build all the indexes from the snarls we were given build_indexes(); // Clean up the snarl and chain orientations so everything is predictably and intuitively oriented regularize(); + } const Snarl* SnarlManager::into_which_snarl(int64_t id, bool reverse) const { - assert(finished); return snarl_into.count(make_pair(id, reverse)) ? 
snarl_into.at(make_pair(id, reverse)) : nullptr; } @@ -859,7 +924,8 @@ const Snarl* SnarlManager::into_which_snarl(const Visit& visit) const { unordered_map, const Snarl*> SnarlManager::snarl_boundary_index() const { unordered_map, const Snarl*> index; - for (const Snarl& snarl : snarls) { + for (const SnarlRecord& snarl_record : snarls) { + const Snarl& snarl = *unrecord(&snarl_record); index[make_pair(snarl.start().node_id(), snarl.start().backward())] = &snarl; index[make_pair(snarl.end().node_id(), !snarl.end().backward())] = &snarl; } @@ -868,7 +934,8 @@ unordered_map, const Snarl*> SnarlManager::snarl_boundary_in unordered_map, const Snarl*> SnarlManager::snarl_end_index() const { unordered_map, const Snarl*> index; - for (const Snarl& snarl : snarls) { + for (const SnarlRecord& snarl_record : snarls) { + const Snarl& snarl = *unrecord(&snarl_record); index[make_pair(snarl.end().node_id(), !snarl.end().backward())] = &snarl; } return index; @@ -876,7 +943,8 @@ unordered_map, const Snarl*> SnarlManager::snarl_end_index() unordered_map, const Snarl*> SnarlManager::snarl_start_index() const { unordered_map, const Snarl*> index; - for (const Snarl& snarl : snarls) { + for (const SnarlRecord& snarl_record : snarls) { + const Snarl& snarl = *unrecord(&snarl_record); index[make_pair(snarl.start().node_id(), snarl.start().backward())] = &snarl; } return index; @@ -922,7 +990,7 @@ void SnarlManager::build_indexes() { parent->children.push_back(&snarl); // And that its parent is its parent - record(&snarl)->parent = parent; + rec.parent = unrecord(parent); } else { // record top level status @@ -931,7 +999,7 @@ void SnarlManager::build_indexes() { #endif roots.push_back(&snarl); - record(&snarl)->parent = nullptr; + rec.parent = nullptr; } } @@ -1133,115 +1201,82 @@ void SnarlManager::regularize() { } -pair, unordered_set > SnarlManager::shallow_contents(const Snarl* snarl, VG& graph, +pair, unordered_set > SnarlManager::shallow_contents(const Snarl* snarl, const HandleGraph& graph, bool include_boundary_nodes) const { + + pair, unordered_set > to_return; - pair, unordered_set > to_return; - - unordered_set already_stacked; + unordered_set already_stacked; // initialize stack for DFS traversal of site - vector stack; + vector stack; - Node* start_node = graph.get_node(snarl->start().node_id()); - Node* end_node = graph.get_node(snarl->end().node_id()); + handle_t start_node = graph.get_handle(snarl->start().node_id()); + handle_t end_node = graph.get_handle(snarl->end().node_id()); // mark the boundary nodes as already stacked so that paths will terminate on them - already_stacked.insert(start_node); - already_stacked.insert(end_node); + already_stacked.insert(graph.get_id(start_node)); + already_stacked.insert(graph.get_id(end_node)); // add boundary nodes as directed if (include_boundary_nodes) { - to_return.first.insert(start_node); - to_return.first.insert(end_node); + to_return.first.insert(graph.get_id(start_node)); + to_return.first.insert(graph.get_id(end_node)); } - - vector edges_of_node; - + // stack up the nodes one edge inside the snarl from the start - graph.edges_of_node(start_node, edges_of_node); - for (Edge* edge : edges_of_node) { - - // does the edge point into the snarl? 
- if (edge->from() == snarl->start().node_id() && edge->from_start() == snarl->start().backward()) { + graph.follow_edges(start_node, snarl->start().backward(), [&](const handle_t& node) { - Node* node = graph.get_node(edge->to()); - - if (!already_stacked.count(node)) { + if (!already_stacked.count(graph.get_id(node))) { stack.push_back(node); - already_stacked.insert(node); + already_stacked.insert(graph.get_id(node)); } - - to_return.second.insert(edge); - } - else if (edge->to() == snarl->start().node_id() && edge->to_end() != snarl->start().backward()) { - - Node* node = graph.get_node(edge->from()); - - if (!already_stacked.count(node)) { - stack.push_back(node); - already_stacked.insert(node); + if (snarl->start().backward()) { + to_return.second.insert(graph.edge_handle(node, start_node)); + } else { + to_return.second.insert(graph.edge_handle(start_node, node)); } - - to_return.second.insert(edge); - } - } - edges_of_node.clear(); - + }); + // stack up the nodes one edge inside the snarl from the end - graph.edges_of_node(end_node, edges_of_node); - for (Edge* edge : edges_of_node) { - // does the edge point into the snarl? - if (edge->from() == snarl->end().node_id() && edge->from_start() != snarl->end().backward()) { - - Node* node = graph.get_node(edge->to()); - - if (!already_stacked.count(node)) { + graph.follow_edges(end_node, !snarl->end().backward(), [&](const handle_t& node) { + + if (!already_stacked.count(graph.get_id(node))) { stack.push_back(node); - already_stacked.insert(node); + already_stacked.insert(graph.get_id(node)); } - - to_return.second.insert(edge); - } - else if (edge->to() == snarl->end().node_id() && edge->to_end() == snarl->end().backward()) { - - Node* node = graph.get_node(edge->from()); - - if (!already_stacked.count(node)) { - stack.push_back(node); - already_stacked.insert(node); + if (snarl->end().backward()) { + to_return.second.insert(graph.edge_handle(end_node, node)); + } else { + to_return.second.insert(graph.edge_handle(node, end_node)); } - - to_return.second.insert(edge); - } - } - edges_of_node.clear(); + }); // traverse the snarl with DFS, skipping over any child snarls // do not pay attention to valid walks since we also want to discover any tips while (stack.size()) { // pop the top node off the stack - Node* node = stack.back(); + handle_t node = stack.back(); stack.pop_back(); // record that this node is in the snarl - to_return.first.insert(node); + to_return.first.insert(graph.get_id(node)); - const Snarl* forward_snarl = into_which_snarl(node->id(), false); - const Snarl* backward_snarl = into_which_snarl(node->id(), true); + const Snarl* forward_snarl = into_which_snarl(graph.get_id(node), false); + const Snarl* backward_snarl = into_which_snarl(graph.get_id(node), true); if (forward_snarl) { // this node points into a snarl // What's on the other side of the snarl? - id_t other_id = forward_snarl->start().node_id() == node->id() ? forward_snarl->end().node_id() : forward_snarl->start().node_id(); + id_t other_id = forward_snarl->start().node_id() == graph.get_id(node) ? 
forward_snarl->end().node_id() : forward_snarl->start().node_id(); // stack up the node on the opposite side of the snarl // rather than traversing it - Node* opposite_node = graph.get_node(other_id); - if (!already_stacked.count(opposite_node)) { + handle_t opposite_node = graph.get_handle(other_id); + if (!already_stacked.count(other_id)) { stack.push_back(opposite_node); - already_stacked.insert(opposite_node); + already_stacked.insert(other_id); } } @@ -1249,167 +1284,132 @@ pair, unordered_set > SnarlManager::shallow_contents // the reverse of this node points into a snarl // What's on the other side of the snarl? - id_t other_id = backward_snarl->end().node_id() == node->id() ? backward_snarl->start().node_id(): backward_snarl->end().node_id(); + id_t other_id = backward_snarl->end().node_id() == graph.get_id(node) ? backward_snarl->start().node_id(): backward_snarl->end().node_id(); // stack up the node on the opposite side of the snarl // rather than traversing it - Node* opposite_node = graph.get_node(other_id); - if (!already_stacked.count(opposite_node)) { + handle_t opposite_node = graph.get_handle(other_id); + if (!already_stacked.count(other_id)) { stack.push_back(opposite_node); - already_stacked.insert(opposite_node); + already_stacked.insert(other_id); } } - graph.edges_of_node(node, edges_of_node); - - for (Edge* edge : edges_of_node) { - // which end of the edge is the current node? - if (edge->from() == node->id()) { + graph.follow_edges(node, false, [&](const handle_t& next_node) { + edge_t edge = graph.edge_handle(node, next_node); // does this edge point forward or backward? - if ((edge->from_start() && !backward_snarl) || - (!edge->from_start() && !forward_snarl)) { - - to_return.second.insert(edge); - Node* next_node = graph.get_node(edge->to()); + if ((graph.get_is_reverse(node) && !backward_snarl) || + (!graph.get_is_reverse(node) && !forward_snarl)) { + + to_return.second.insert(edge); - if (!already_stacked.count(next_node)) { + if (!already_stacked.count(graph.get_id(next_node))) { - stack.push_back(next_node); - already_stacked.insert(next_node); - } + stack.push_back(next_node); + already_stacked.insert(graph.get_id(next_node)); + } } - } - else { + }); + + graph.follow_edges(node, true, [&](const handle_t& prev_node) { + edge_t edge = graph.edge_handle(prev_node, node); // does this edge point forward or backward? 
- if ((edge->to_end() && !forward_snarl) || - (!edge->to_end() && !backward_snarl)) { - + if ((graph.get_is_reverse(node) && !forward_snarl) || + (!graph.get_is_reverse(node) && !backward_snarl)) { + to_return.second.insert(edge); - Node* next_node = graph.get_node(edge->from()); - if (!already_stacked.count(next_node)) { + if (!already_stacked.count(graph.get_id(prev_node))) { - stack.push_back(next_node); - already_stacked.insert(next_node); + stack.push_back(prev_node); + already_stacked.insert(graph.get_id(prev_node)); } } - } - } - - edges_of_node.clear(); + }); } return to_return; } -pair, unordered_set > SnarlManager::deep_contents(const Snarl* snarl, VG& graph, +pair, unordered_set > SnarlManager::deep_contents(const Snarl* snarl, const HandleGraph& graph, bool include_boundary_nodes) const { - pair, unordered_set > to_return; + pair, unordered_set > to_return; - unordered_set already_stacked; + unordered_set already_stacked; // initialize stack for DFS traversal of site - vector stack; - - Node* start_node = graph.get_node(snarl->start().node_id()); - Node* end_node = graph.get_node(snarl->end().node_id()); + vector stack; + + handle_t start_node = graph.get_handle(snarl->start().node_id()); + handle_t end_node = graph.get_handle(snarl->end().node_id()); // mark the boundary nodes as already stacked so that paths will terminate on them - already_stacked.insert(start_node); - already_stacked.insert(end_node); + already_stacked.insert(graph.get_id(start_node)); + already_stacked.insert(graph.get_id(end_node)); // add boundary nodes as directed if (include_boundary_nodes) { - to_return.first.insert(start_node); - to_return.first.insert(end_node); + to_return.first.insert(graph.get_id(start_node)); + to_return.first.insert(graph.get_id(end_node)); } - - vector edges_of_node; - + // stack up the nodes one edge inside the snarl from the start - graph.edges_of_node(start_node, edges_of_node); - for (Edge* edge : edges_of_node) { - // does the edge point into the snarl? - if (edge->from() == snarl->start().node_id() && edge->from_start() == snarl->start().backward()) { - - Node* node = graph.get_node(edge->to()); - - if (!already_stacked.count(node)) { + graph.follow_edges(start_node, snarl->start().backward(), [&](const handle_t& node) { + + if (!already_stacked.count(graph.get_id(node))) { stack.push_back(node); - already_stacked.insert(node); + already_stacked.insert(graph.get_id(node)); } - - to_return.second.insert(edge); - } - else if (edge->to() == snarl->start().node_id() && edge->to_end() != snarl->start().backward()) { - - Node* node = graph.get_node(edge->from()); - - if (!already_stacked.count(node)) { - stack.push_back(node); - already_stacked.insert(node); + if (snarl->start().backward()) { + to_return.second.insert(graph.edge_handle(node, start_node)); + } else { + to_return.second.insert(graph.edge_handle(start_node, node)); } - - to_return.second.insert(edge); - } - } - edges_of_node.clear(); - + }); + // stack up the nodes one edge inside the snarl from the end - graph.edges_of_node(end_node, edges_of_node); - for (Edge* edge : edges_of_node) { - // does the edge point into the snarl? 
- if (edge->from() == snarl->end().node_id() && edge->from_start() != snarl->end().backward()) { - - Node* node = graph.get_node(edge->to()); - - if (!already_stacked.count(node)) { + graph.follow_edges(end_node, !snarl->end().backward(), [&](const handle_t& node) { + + if (!already_stacked.count(graph.get_id(node))) { stack.push_back(node); - already_stacked.insert(node); + already_stacked.insert(graph.get_id(node)); } - - to_return.second.insert(edge); - } - else if (edge->to() == snarl->end().node_id() && edge->to_end() == snarl->end().backward()) { - - Node* node = graph.get_node(edge->from()); - - if (!already_stacked.count(node)) { - stack.push_back(node); - already_stacked.insert(node); + if (snarl->end().backward()) { + to_return.second.insert(graph.edge_handle(end_node, node)); + } else { + to_return.second.insert(graph.edge_handle(node, end_node)); } - - to_return.second.insert(edge); - } - } - edges_of_node.clear(); + }); // traverse the snarl with DFS, skipping over any child snarls // do not pay attention to valid walks since we also want to discover any tips while (stack.size()) { // pop the top node off the stack - Node* node = stack.back(); + handle_t node = stack.back(); stack.pop_back(); // record that this node is in the snarl - to_return.first.insert(node); - - graph.edges_of_node(node, edges_of_node); - - for (Edge* edge : edges_of_node) { + to_return.first.insert(graph.get_id(node)); + + graph.follow_edges(node, false, [&] (const handle_t& next_node) { + edge_t edge = graph.edge_handle(node, next_node); to_return.second.insert(edge); - // get the other end of the edge - Node* next_node = edge->from() == node->id() ? graph.get_node(edge->to()) : - graph.get_node(edge->from()); - if (!already_stacked.count(next_node)) { + if (!already_stacked.count(graph.get_id(next_node))) { stack.push_back(next_node); - already_stacked.insert(next_node); + already_stacked.insert(graph.get_id(next_node)); } - } - - edges_of_node.clear(); + }); + + graph.follow_edges(node, true, [&] (const handle_t& prev_node) { + edge_t edge = graph.edge_handle(prev_node, node); + to_return.second.insert(edge); + if (!already_stacked.count(graph.get_id(prev_node))) { + stack.push_back(prev_node); + already_stacked.insert(graph.get_id(prev_node)); + } + }); } return to_return; @@ -1436,7 +1436,7 @@ const Snarl* SnarlManager::manage(const Snarl& not_owned) const { return it->second; } -vector SnarlManager::visits_right(const Visit& visit, VG& graph, const Snarl* in_snarl) const { +vector SnarlManager::visits_right(const Visit& visit, const HandleGraph& graph, const Snarl* in_snarl) const { #ifdef debug cerr << "Look right from " << visit << endl; @@ -1477,9 +1477,10 @@ vector SnarlManager::visits_right(const Visit& visit, VG& graph, const Sn } } - - for (auto attached : graph.sides_of(right_side)) { + + graph.follow_edges(graph.get_handle(right_side.node), !right_side.is_end, [&](const handle_t& next_handle) { // For every NodeSide attached to the right side of this visit + NodeSide attached(graph.get_id(next_handle), right_side.is_end ? 
graph.get_is_reverse(next_handle) : !graph.get_is_reverse(next_handle)); #ifdef debug cerr << "\tFind NodeSide " << attached << endl; @@ -1537,13 +1538,13 @@ vector SnarlManager::visits_right(const Visit& visit, VG& graph, const Sn #endif } - } + }); return to_return; } -vector SnarlManager::visits_left(const Visit& visit, VG& graph, const Snarl* in_snarl) const { +vector SnarlManager::visits_left(const Visit& visit, const HandleGraph& graph, const Snarl* in_snarl) const { // Get everything right of the reversed visit vector to_return = visits_right(reverse(visit), graph, in_snarl); @@ -1557,13 +1558,19 @@ vector SnarlManager::visits_left(const Visit& visit, VG& graph, const Sna return to_return; } - + NetGraph::NetGraph(const Visit& start, const Visit& end, const HandleGraph* graph, bool use_internal_connectivity) : graph(graph), start(graph->get_handle(start.node_id(), start.backward())), end(graph->get_handle(end.node_id(), end.backward())), use_internal_connectivity(use_internal_connectivity) { // Nothing to do! + +#ifdef debug + cerr << "Creating net graph of " << graph->get_id(this->start) << (graph->get_is_reverse(this->start) ? "-" : "+") + << "->" << graph->get_id(this->end) << (graph->get_is_reverse(this->end) ? "-" : "+") << endl; +#endif + } NetGraph::NetGraph(const Visit& start, const Visit& end, @@ -1601,6 +1608,10 @@ void NetGraph::add_unary_child(const Snarl* unary) { // Save it as a unary snarl unary_boundaries.insert(snarl_bound); + +#ifdef debug + cerr << "\tAdd unary child snarl on " << graph->get_id(snarl_bound) << (graph->get_is_reverse(snarl_bound) ? "-" : "+") << endl; +#endif if (use_internal_connectivity) { // Save its connectivity @@ -1617,12 +1628,19 @@ void NetGraph::add_unary_child(const Snarl* unary) { void NetGraph::add_chain_child(const Chain& chain) { // For every chain, get its bounding handles in the base graph - handle_t chain_start_handle = graph->get_handle(get_start_of(chain)); - handle_t chain_end_handle = graph->get_handle(get_end_of(chain)); + auto start_visit = get_start_of(chain); + handle_t chain_start_handle = graph->get_handle(start_visit.node_id(), start_visit.backward()); + auto end_visit = get_end_of(chain); + handle_t chain_end_handle = graph->get_handle(end_visit.node_id(), end_visit.backward()); // Save the links that let us cross the chain. chain_ends_by_start[chain_start_handle] = chain_end_handle; chain_end_rewrites[graph->flip(chain_end_handle)] = graph->flip(chain_start_handle); + +#ifdef debug + cerr << "\tAdd child chain " << graph->get_id(chain_start_handle) << (graph->get_is_reverse(chain_start_handle) ? "-" : "+") + << " -> " << graph->get_id(chain_end_handle) << (graph->get_is_reverse(chain_end_handle) ? "-" : "+") << endl; +#endif if (use_internal_connectivity) { @@ -1694,6 +1712,10 @@ void NetGraph::add_chain_child(const Chain& chain) { connectivity[graph->get_id(chain_start_handle)] = make_tuple(false, false, true); } } + +bool NetGraph::has_node(id_t node_id) const { + return graph->has_node(node_id); +} handle_t NetGraph::get_handle(const id_t& node_id, bool is_reverse) const { // We never let anyone see any node IDs that aren't assigned to child snarls/chains or content nodes. 
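To make the follow_edges changes in the hunks below easier to review: the two maps filled in by add_chain_child above are what let the net graph collapse a whole child chain into one node. This is a reading aid restating what the patch itself does, using only identifiers that appear in it:

    // add_chain_child records, for a child chain bounded by chain_start_handle
    // (read inward) and chain_end_handle (read outward) in the backing graph:
    chain_ends_by_start[chain_start_handle] = chain_end_handle;                          // cross the chain start -> end
    chain_end_rewrites[graph->flip(chain_end_handle)] = graph->flip(chain_start_handle); // reading into the end warps back
    // So when a traversal lands on graph->flip(chain_end_handle), i.e. it is
    // about to read into the chain through its end node, follow_edges reports
    // graph->flip(chain_start_handle) instead, and the chain keeps being
    // represented by its start handle, here visited in reverse.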
@@ -1727,11 +1749,12 @@ string NetGraph::get_sequence(const handle_t& handle) const { throw runtime_error("Cannot expose sequences via NetGraph"); } -bool NetGraph::follow_edges(const handle_t& handle, bool go_left, const function& iteratee) const { +bool NetGraph::follow_edges_impl(const handle_t& handle, bool go_left, const function& iteratee) const { // Now we do the real work. #ifdef debug - cerr << "Look for edges on " << graph->get_id(handle) << " " << graph->get_is_reverse(handle) + cerr << "Look for edges in net graph of " << graph->get_id(start) << (graph->get_is_reverse(start) ? "-" : "+") + << "->" << graph->get_id(end) << (graph->get_is_reverse(end) ? "-" : "+") << " on " << graph->get_id(handle) << (graph->get_is_reverse(handle) ? "-" : "+") << " going " << (go_left ? "left" : "right") << endl; #endif @@ -1748,28 +1771,38 @@ bool NetGraph::follow_edges(const handle_t& handle, bool go_left, const function handle_t real_handle = other; if (chain_end_rewrites.count(other)) { // We're reading into the end of a chain. + +#ifdef debug + cerr << "\tRead into chain end; warp to start" << endl; +#endif + // Warp to the start. real_handle = chain_end_rewrites.at(other); } else if (chain_end_rewrites.count(graph->flip(other))) { // We're backing into the end of a chain. + +#ifdef debug + cerr << "\tBack into chain end; warp to start" << endl; +#endif + // Warp to the start. real_handle = graph->flip(chain_end_rewrites.at(graph->flip(other))); } #ifdef debug - cerr << "Found edge " << (go_left ? "from " : "to ") << graph->get_id(other) << " " << graph->get_is_reverse(other) << endl; + cerr << "\tFound edge " << (go_left ? "from " : "to ") << graph->get_id(other) << (graph->get_is_reverse(other) ? "-" : "+") << endl; #endif if (!seen.count(real_handle)) { seen.insert(real_handle); #ifdef debug - cerr << "Report as " << graph->get_id(real_handle) << " " << graph->get_is_reverse(real_handle) << endl; + cerr << "\t\tReport as " << graph->get_id(real_handle) << (graph->get_is_reverse(real_handle) ? "-" : "+") << endl; #endif return iteratee(real_handle); } else { #ifdef debug - cerr << "Edge has been seen" << endl; + cerr << "\t\tEdge has been seen" << endl; #endif return true; } @@ -1781,10 +1814,18 @@ bool NetGraph::follow_edges(const handle_t& handle, bool go_left, const function handle_t real_handle = other; if (chain_end_rewrites.count(other)) { // We're reading into the end of a chain. +#ifdef debug + cerr << "\tRead into chain end; warp to start" << endl; +#endif // Warp to the start. real_handle = chain_end_rewrites.at(other); } else if (chain_end_rewrites.count(graph->flip(other))) { // We're backing into the end of a chain. + +#ifdef debug + cerr << "\tBack into chain end; warp to start" << endl; +#endif + // Warp to the start. real_handle = graph->flip(chain_end_rewrites.at(graph->flip(other))); } @@ -1792,19 +1833,19 @@ bool NetGraph::follow_edges(const handle_t& handle, bool go_left, const function real_handle = graph->flip(real_handle); #ifdef debug - cerr << "Found edge " << (go_left ? "from " : "to ") << graph->get_id(other) << " " << graph->get_is_reverse(other) << endl; + cerr << "\tFound edge " << (go_left ? "from " : "to ") << graph->get_id(other) << (graph->get_is_reverse(other) ? 
"-" : "+") << endl; #endif if (!seen.count(real_handle)) { seen.insert(real_handle); #ifdef debug - cerr << "Report as " << graph->get_id(real_handle) << " " << graph->get_is_reverse(real_handle) << endl; + cerr << "\t\tReport as " << graph->get_id(real_handle) << (graph->get_is_reverse(real_handle) ? "-" : "+") << endl; #endif return iteratee(real_handle); } else { #ifdef debug - cerr << "Edge has been seen" << endl; + cerr << "\t\tEdge has been seen" << endl; #endif return true; } @@ -1819,7 +1860,7 @@ bool NetGraph::follow_edges(const handle_t& handle, bool go_left, const function //If start and end are the same, all edges are within the net graph #ifdef debug - cerr << "We are at the bound of the graph so don't say anything" << endl; + cerr << "\tWe are at the bound of the graph so don't say anything" << endl; #endif return true; } @@ -1828,7 +1869,7 @@ bool NetGraph::follow_edges(const handle_t& handle, bool go_left, const function // If we have an associated chain end for this start, we have to use chain connectivity to decide what to do. #ifdef debug - cerr << "We are a chain node" << endl; + cerr << "\tWe are a chain node" << endl; #endif bool connected_start_start; @@ -1837,14 +1878,14 @@ bool NetGraph::follow_edges(const handle_t& handle, bool go_left, const function tie(connected_start_start, connected_end_end, connected_start_end) = connectivity.at(graph->get_id(handle)); #ifdef debug - cerr << "Connectivity: " << connected_start_start << " " << connected_end_end << " " << connected_start_end << endl; + cerr << "\t\tConnectivity: " << connected_start_start << " " << connected_end_end << " " << connected_start_end << endl; #endif if (chain_ends_by_start.count(handle)) { // We visit the chain in its forward orientation #ifdef debug - cerr << "We are visiting the chain forward" << endl; + cerr << "\t\tWe are visiting the chain forward" << endl; #endif if (go_left) { @@ -1852,13 +1893,13 @@ bool NetGraph::follow_edges(const handle_t& handle, bool go_left, const function // So we care about end-end connectivity (how could we have left our end?) 
#ifdef debug - cerr << "We are going left from a forward chain" << endl; + cerr << "\t\t\tWe are going left from a forward chain" << endl; #endif if (connected_end_end) { #ifdef debug - cerr << "We can reverse and go back out the end" << endl; + cerr << "\t\t\t\tWe can reverse and go back out the end" << endl; #endif // Anything after us but in its reverse orientation could be our predecessor @@ -1872,7 +1913,7 @@ bool NetGraph::follow_edges(const handle_t& handle, bool go_left, const function if (connected_start_end) { #ifdef debug - cerr << "We can continue through and go out the start" << endl; + cerr << "\t\t\t\tWe can continue through and go out the start" << endl; #endif // Look left out of the start of the chain (which is the handle we really are on) @@ -1886,13 +1927,13 @@ bool NetGraph::follow_edges(const handle_t& handle, bool go_left, const function // We want our successors #ifdef debug - cerr << "We are going right from a forward chain" << endl; + cerr << "\t\t\tWe are going right from a forward chain" << endl; #endif if (connected_start_start) { #ifdef debug - cerr << "We can reverse and go back out the start" << endl; + cerr << "\t\t\t\tWe can reverse and go back out the start" << endl; #endif // Anything before us but in its reverse orientation could be our successor @@ -1906,7 +1947,7 @@ bool NetGraph::follow_edges(const handle_t& handle, bool go_left, const function if (connected_start_end) { #ifdef debug - cerr << "We can continue through and go out the end" << endl; + cerr << "\t\t\t\tWe can continue through and go out the end" << endl; #endif // Look right out of the end of the chain (which is the handle we really are on) @@ -1923,20 +1964,20 @@ bool NetGraph::follow_edges(const handle_t& handle, bool go_left, const function // Just flip the cases of above and reverse all the emitted orientations. 
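For readers following the connectivity branches above and below, a hedged worked example of the tuple they test; the node ID 7 and the specific tuple values are illustrative only, not taken from the patch:

    // connectivity maps a child chain's (or unary snarl's) representative node
    // ID to (connected_start_start, connected_end_end, connected_start_end).
    connectivity[7] = make_tuple(false, false, true);
    // Plain chain, only traversable start-to-end: visiting node 7 forward and
    // going right, only the "continue through and go out the end" branch runs.
    connectivity[7] = make_tuple(true, false, true);
    // A chain whose contents also connect its start back to its start: the
    // "reverse and go back out the start" branch runs as well, so predecessors
    // seen in their reverse orientation are additionally reported as successors.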
#ifdef debug - cerr << "We are visiting the chain in reverse" << endl; + cerr << "\t\tWe are visiting the chain in reverse" << endl; #endif if (go_left) { // We want predecessors of the reverse version (successors, but flipped) #ifdef debug - cerr << "We are going left from a reverse chain" << endl; + cerr << "\t\t\tWe are going left from a reverse chain" << endl; #endif if (connected_start_start) { #ifdef debug - cerr << "We can reverse and go back out the start" << endl; + cerr << "\t\t\t\tWe can reverse and go back out the start" << endl; #endif if (!graph->follow_edges(handle, false, flip_and_handle_edge)) { @@ -1948,7 +1989,7 @@ bool NetGraph::follow_edges(const handle_t& handle, bool go_left, const function if (connected_start_end) { #ifdef debug - cerr << "We can continue through and go out the end" << endl; + cerr << "\t\t\t\tWe can continue through and go out the end" << endl; #endif if (!graph->follow_edges(chain_ends_by_start.at(graph->flip(handle)), false, flip_and_handle_edge)) { @@ -1961,13 +2002,13 @@ bool NetGraph::follow_edges(const handle_t& handle, bool go_left, const function // We want successors of the reverse version (predecessors, but flipped) #ifdef debug - cerr << "We are going right from a reverse chain" << endl; + cerr << "\t\t\tWe are going right from a reverse chain" << endl; #endif if (connected_end_end) { #ifdef debug - cerr << "We can reverse and go back out the end" << endl; + cerr << "\t\t\t\tWe can reverse and go back out the end" << endl; #endif if (!graph->follow_edges(chain_ends_by_start.at(graph->flip(handle)), false, handle_edge)) { @@ -1979,7 +2020,7 @@ bool NetGraph::follow_edges(const handle_t& handle, bool go_left, const function if (connected_start_end) { #ifdef debug - cerr << "We can continue through and go out the start" << endl; + cerr << "\t\t\t\tWe can continue through and go out the start" << endl; #endif if (!graph->follow_edges(handle, false, handle_edge)) { @@ -1999,7 +2040,7 @@ bool NetGraph::follow_edges(const handle_t& handle, bool go_left, const function // We are dealign with a node representing a unary child snarl. #ifdef debug - cerr << "We are looking at a unary snarl" << endl; + cerr << "\tWe are looking at a unary snarl" << endl; #endif // We have to use chain connectivity to decide what to do. @@ -2069,16 +2110,21 @@ bool NetGraph::follow_edges(const handle_t& handle, bool go_left, const function } #ifdef debug - cerr << "We are an ordinary node" << endl; + cerr << "\tWe are an ordinary node" << endl; #endif // Otherwise, this is an ordinary snarl content node return graph->follow_edges(handle, go_left, handle_edge); } -void NetGraph::for_each_handle(const function& iteratee, bool parallel) const { +bool NetGraph::for_each_handle_impl(const function& iteratee, bool parallel) const { // Find all the handles by a traversal. +#ifdef debug + cerr << "Look for contents of net graph of " << graph->get_id(start) << (graph->get_is_reverse(start) ? "-" : "+") + << "->" << graph->get_id(end) << (graph->get_is_reverse(end) ? 
"-" : "+") << endl; +#endif + // We have to do the traversal on the underlying backing graph, because // the traversal functions we implemented on the graph we present will // maybe use internal child snarl connectivity, which can mean parts of @@ -2089,53 +2135,87 @@ void NetGraph::for_each_handle(const function& iteratee, list queue; unordered_set queued; + // We define a function to queue up nodes we could visit next + auto see_node = [&](const handle_t& other) { + // Whenever we see a new node, add it to the queue + auto found = queued.find(graph->get_id(other)); + if (found == queued.end()) { + +#ifdef debug + cerr << "\t\t\tFound new contained node " << graph->get_id(other) << (graph->get_is_reverse(other) ? "-" : "+") << endl; +#endif + + queue.push_back(other); + queued.emplace_hint(found, graph->get_id(other)); + } + }; + // Start at both the start and the end of the snarl. - queue.push_back(start); - queued.insert(graph->get_id(start)); - queue.push_back(end); - queued.insert(graph->get_id(end)); + see_node(start); + see_node(end); while (!queue.empty()) { handle_t here = queue.front(); queue.pop_front(); +#ifdef debug + cerr << "\tVisit node " << graph->get_id(here) << (graph->get_is_reverse(here) ? "-" : "+") << endl; +#endif + if (unary_boundaries.count(graph->flip(here))) { // This is a backward unary child snarl head, so we need to look at it the other way around. here = graph->flip(here); + +#ifdef debug + cerr << "\t\tReverse to match unary child boundary" << endl; +#endif + } else if (chain_ends_by_start.count(graph->flip(here))) { // This is a backward child chain head, so we need to look at it the other way around. here = graph->flip(here); + +#ifdef debug + cerr << "\t\tReverse to match child chain head" << endl; +#endif + } else if (chain_end_rewrites.count(graph->flip(here))) { // This is a backward child chain tail, so we need to look at it the other way around. here = graph->flip(here); + +#ifdef debug + cerr << "\t\tReverse to match child chain tail" << endl; +#endif + } if (!chain_end_rewrites.count(here)) { // This is not a chain end, so it's either a real contained node or a chain head. // We can emit it. + +#ifdef debug + cerr << "\t\tVisit forward version" << endl; +#endif if (graph->get_is_reverse(here)) { if (!iteratee(graph->flip(here))) { // Run the iteratee on the forward version, and stop if it wants to stop - break; + return false; } } else { if (!iteratee(here)) { // Run the iteratee, and stop if it wants to stop. - break; + return false; } } - } - - // We define a function to queue up nodes we could visit next - auto handle_edge = [&](const handle_t& other) { - // Whenever we see a new node, add it to the queue - if (!queued.count(graph->get_id(other))) { - queue.push_back(other); - queued.insert(graph->get_id(other)); - } - }; + } else { +#ifdef debug + cerr << "\t\tSkip chain end but see start at " << graph->get_id(chain_end_rewrites.at(here)) << (graph->get_is_reverse(chain_end_rewrites.at(here)) ? "-" : "+") << endl; +#endif + // If we reach a chain end, make sure to eventually visit the chain start. + // There might not be any other edges to it. + see_node(chain_end_rewrites.at(here)); + } // We already have flipped any backward heads or tails frontward. 
So // we don't need to check if the backward version of us is in @@ -2145,30 +2225,51 @@ void NetGraph::for_each_handle(const function& iteratee, start == end) && !unary_boundaries.count(here) && !chain_ends_by_start.count(here) && !chain_end_rewrites.count(here)) { + +#ifdef debug + cerr << "\t\tRight side faces into net graph" << endl; +#endif // We have normal graph to our right and not the exterior of this snarl or the interior of a child. - graph->follow_edges(here, false, handle_edge); + graph->follow_edges(here, false, see_node); } if ((start != end && here != start && here != graph->flip(end)) || start == end) { + +#ifdef debug + cerr << "\t\tLeft side faces into net graph" << endl; +#endif + // We have normal graph to our left. - graph->follow_edges(here, true, handle_edge); + graph->follow_edges(here, true, see_node); } if (chain_end_rewrites.count(here)) { + +#ifdef debug + cerr << "\t\tWe are chain end; look right off reverse start at " << graph->get_id(chain_end_rewrites.at(here)) << (graph->get_is_reverse(chain_end_rewrites.at(here)) ? "-" : "+") << endl; +#endif + // We need to look right off the reverse head of this child snarl. - graph->follow_edges(chain_end_rewrites.at(here), false, handle_edge); + graph->follow_edges(chain_end_rewrites.at(here), false, see_node); } if (chain_ends_by_start.count(here)) { + +#ifdef debug + cerr << "\t\tWe are chain start; look right off end at " << graph->get_id(chain_ends_by_start.at(here)) << (graph->get_is_reverse(chain_ends_by_start.at(here)) ? "-" : "+") << endl; +#endif + // We need to look right off the (reverse) tail of this child snarl. - graph->follow_edges(chain_ends_by_start.at(here), false, handle_edge); + graph->follow_edges(chain_ends_by_start.at(here), false, see_node); } } + + return true; } -size_t NetGraph::node_size() const { +size_t NetGraph::get_node_count() const { // TODO: this is inefficient! size_t size = 0; for_each_handle([&](const handle_t& ignored) { @@ -2176,6 +2277,24 @@ size_t NetGraph::node_size() const { }); return size; } + +id_t NetGraph::min_node_id() const { + // TODO: this is inefficient! + id_t winner = numeric_limits::max(); + for_each_handle([&](const handle_t& handle) { + winner = min(winner, this->get_id(handle)); + }); + return winner; +} + +id_t NetGraph::max_node_id() const { + // TODO: this is inefficient! 
+ id_t winner = numeric_limits::min(); + for_each_handle([&](const handle_t& handle) { + winner = max(winner, this->get_id(handle)); + }); + return winner; +} const handle_t& NetGraph::get_start() const { return start; @@ -2241,11 +2360,51 @@ handle_t NetGraph::get_handle_from_inward_backing_handle(const handle_t& backing throw runtime_error("Cannot assign backing handle to a child chain or unary snarl"); } } + +edge_t to_edge(const HandleGraph& graph, const Visit& v1, const Visit& v2) { + + id_t prev_id; + bool prev_back; + if (v1.node_id() != 0) { + prev_id = v1.node_id(); + prev_back = v1.backward(); + } else { + const Snarl& prev_snarl = v1.snarl(); + if (v1.backward()) { + prev_id = prev_snarl.start().node_id(); + prev_back = !prev_snarl.start().backward(); + } else { + prev_id = prev_snarl.end().node_id(); + prev_back = prev_snarl.end().backward(); + } + } + id_t cur_id; + bool cur_back; + if (v2.node_id() != 0) { + cur_id = v2.node_id(); + cur_back = v2.backward(); + } else { + const Snarl& cur_snarl = v2.snarl(); + if (v2.backward()) { + cur_id = cur_snarl.end().node_id(); + cur_back = !cur_snarl.end().backward(); + } else { + cur_id = cur_snarl.start().node_id(); + cur_back = cur_snarl.start().backward(); + } + } + + return graph.edge_handle(graph.get_handle(prev_id, prev_back), + graph.get_handle(cur_id, cur_back)); + +} + bool operator==(const Visit& a, const Visit& b) { // IDs and orientations have to match, and nobody has a snarl or the // snarls match. return a.node_id() == b.node_id() && + a.name() == b.name() && a.backward() == b.backward() && ((!a.has_snarl() && !b.has_snarl()) || a.snarl() == b.snarl()); @@ -2258,17 +2417,22 @@ bool operator!=(const Visit& a, const Visit& b) { bool operator<(const Visit& a, const Visit& b) { if (!a.has_snarl() && !b.has_snarl()) { // Compare everything but the snarl - return make_tuple(a.node_id(), a.backward()) < make_tuple(b.node_id(), b.backward()); + return make_tuple(a.node_id(), a.backward(), a.name()) < make_tuple(b.node_id(), b.backward(), b.name()); } else { // Compare including the snarl - return make_tuple(a.node_id(), a.snarl(), a.backward()) < make_tuple(b.node_id(), b.snarl(), b.backward()); + return make_tuple(a.node_id(), a.snarl(), a.backward(), a.name()) < make_tuple(b.node_id(), b.snarl(), b.backward(), b.name()); } } ostream& operator<<(ostream& out, const Visit& visit) { if (!visit.has_snarl()) { - // Use the node ID - out << visit.node_id(); + if (visit.name().empty()) { + // Use the node ID + out << visit.node_id(); + } else { + // Use the name + out << visit.name(); + } } else { // Use the snarl out << visit.snarl(); diff --git a/src/snarls.hpp b/src/snarls.hpp index 3db349b7544..f1aa99c2201 100644 --- a/src/snarls.hpp +++ b/src/snarls.hpp @@ -1,23 +1,25 @@ -// -// snarls.hpp -// -// Contains object to own Snarls and keep track of their tree relationships as well as utility -// functions that interact with snarls. -// +/// +/// \file snarls.hpp +/// +/// Contains object to own Snarls and keep track of their tree relationships as well as utility +/// functions that interact with snarls. 
+/// -#ifndef snarls_hpp -#define snarls_hpp +#ifndef VG_SNARLS_HPP_INCLUDED +#define VG_SNARLS_HPP_INCLUDED +#include #include #include #include #include #include #include -#include "stream.hpp" +#include +#include #include "vg.hpp" #include "handle.hpp" -#include "vg.pb.h" +#include #include "hash_map.hpp" #include "cactus.hpp" @@ -34,59 +36,76 @@ class SnarlManager; class SnarlFinder { public: virtual ~SnarlFinder() = default; - + /** - * Run a function on all root-level NestedSites in parallel. Site trees are - * passed by value so they have a clear place to live during parallel - * operations. + * Find all the snarls, and put them into a SnarlManager. */ virtual SnarlManager find_snarls() = 0; + + /** + * Find all the snarls of weakly connected components, optionally in + * parallel. If not implemented, defaults to the single-threaded + * implementation. + */ + virtual SnarlManager find_snarls_parallel(); }; /** - * Class for finding all snarls using the base-level Cactus snarl decomposition - * interface. + * Wrapper base class that can convert a bottom-up traversal of snarl + * boundaries into a full snarl finder. Mostly worries about snarl + * classification and connectivity information. */ -class CactusSnarlFinder : public SnarlFinder { - - /// Holds the vg graph we are looking for sites in. - VG& graph; - - /// Holds the names of reference path hints - unordered_set hint_paths; - - /// Create a snarl in the given SnarlManager with the given start and end, - /// containing the given child snarls in the list of chains of children and - /// the given list of unary children. Recursively creates snarls in the - /// SnarlManager for the children. Returns a pointer to the finished snarl - /// in the SnarlManager. Start and end may be empty visits, in which case no - /// snarl is created, all the child chains are added as root chains, and - /// null is returned. If parent_start and parent_end are empty Visits, no - /// parent() is added to the produced snarl. - const Snarl* recursively_emit_snarls(const Visit& start, const Visit& end, - const Visit& parent_start, const Visit& parent_end, - stList* chains_list, stList* unary_snarls_list, SnarlManager& destination); - -public: +class HandleGraphSnarlFinder : public SnarlFinder { +protected: /** - * Make a new CactusSnarlFinder to find snarls in the given graph. - * We can't filter trivial bubbles because that would break our chains. - * - * Optionally takes a hint path name. + * The graph we are finding snarls on. It must outlive us. */ - CactusSnarlFinder(VG& graph); + const HandleGraph* graph; /** - * Make a new CactusSnarlFinder with a single hinted path to base the - * decomposition on. + * Find all the snarls, and put them into a SnarlManager, but don't finish it. + * More snarls can be added later before it is finished. */ - CactusSnarlFinder(VG& graph, const string& hint_path); + virtual SnarlManager find_snarls_unindexed(); +public: + /** - * Find all the snarls with Cactus, and put them into a SnarlManager. + * Create a HandleGraphSnarlFinder to find snarls in the given graph. + */ + HandleGraphSnarlFinder(const HandleGraph* graph); + + virtual ~HandleGraphSnarlFinder() = default; + + /** + * Find all the snarls, and put them into a SnarlManager. */ virtual SnarlManager find_snarls(); + /** + * Visit all snarls and chains, including trivial snarls and single-node + * empty chains. + * + * Calls begin_chain and end_chain when entering and exiting chains in the + * traversal.
Within each chain, calls begin_snarl and end_snarl when + * entering and exiting each snarl, in order. The caller is intended to + * maintain its own stack to match up begin and end events. + * + * Each begin/end call receives the handle reading into/out of the snarl or + * chain. + * + * Both empty and cyclic chains have the in and out handles the same. + * They are distinguished by context; empty chains have no child snarls, + * while cyclic chains do. + * + * Roots the decomposition at a global snarl with no bounding nodes, for + * which begin_snarl is not called. So the first call will be begin_chain. + * + * Start handles are inward facing and end handles are outward facing. + * Snarls must be oriented forward in their chains. + */ + virtual void traverse_decomposition(const function& begin_chain, const function& end_chain, + const function& begin_snarl, const function& end_snarl) const = 0; }; /** @@ -177,12 +196,14 @@ ChainIterator chain_rend(const Chain& chain); ChainIterator chain_rcbegin(const Chain& chain); ChainIterator chain_rcend(const Chain& chain); -/// We also define a function for getting the ChainIterator (forward or -/// reverse complement) for a chain starting with a given snarl in the given -/// inward orientation +/// We also define a function for getting the ChainIterator (forward or reverse +/// complement) for a chain starting with a given snarl in the given inward +/// orientation. Only works for bounding snarls of the chain. ChainIterator chain_begin_from(const Chain& chain, const Snarl* start_snarl, bool snarl_orientation); -/// And the end iterator for the chain (forward or reverse complement) -/// viewed from a given snarl in the given inward orientation +/// And the end iterator for the chain (forward or reverse complement) viewed +/// from a given snarl in the given inward orientation. Only works for bounding +/// snarls of the chain, and should be the *same* bounding snarl as was used +/// for chain_begin_from. ChainIterator chain_end_from(const Chain& chain, const Snarl* start_snarl, bool snarl_orientation); /** @@ -226,7 +247,7 @@ class NetGraph : public HandleGraph { /// Make a new NetGraph for the given snarl in the given backing graph, /// using the given chains as child chains. Unary snarls are stored as - /// single-snarl chains just like other trivial chains. + /// trivial chains just like other trivial chains. template NetGraph(const Visit& start, const Visit& end, const ChainContainer& child_chains_mixed, @@ -240,7 +261,7 @@ class NetGraph : public HandleGraph { // This is a unary snarl wrapped in a chain add_unary_child(chain.front().first); } else { - // This is a real (but possibly trivial) chain + // This is a real (but possibly single-snarl) chain add_chain_child(chain); } } @@ -271,11 +292,12 @@ class NetGraph : public HandleGraph { const vector& child_unary_snarls, const HandleGraph* graph, bool use_internal_connectivity = false); - + + /// Method to check if a node exists by ID + virtual bool has_node(id_t node_id) const; + /// Look up the handle for the node with the given ID in the given orientation virtual handle_t get_handle(const id_t& node_id, bool is_reverse = false) const; - // Copy over the visit version which would otherwise be shadowed. - using HandleGraph::get_handle; /// Get the ID from a handle virtual id_t get_id(const handle_t& handle) const; @@ -296,20 +318,20 @@ class NetGraph : public HandleGraph { /// Loop over all the handles to next/previous (right/left) nodes.
Passes /// them to a callback which returns false to stop iterating and true to /// continue. Returns true if we finished and false if we stopped early. - virtual bool follow_edges(const handle_t& handle, bool go_left, const function& iteratee) const; - - // Copy over the template for nice calls - using HandleGraph::follow_edges; + virtual bool follow_edges_impl(const handle_t& handle, bool go_left, const function& iteratee) const; /// Loop over all the nodes in the graph in their local forward /// orientations, in their internal stored order. Stop if the iteratee returns false. - virtual void for_each_handle(const function& iteratee, bool parallel = false) const; - - // Copy over the template for nice calls - using HandleGraph::for_each_handle; + virtual bool for_each_handle_impl(const function& iteratee, bool parallel = false) const; /// Return the number of nodes in the graph - virtual size_t node_size() const; + virtual size_t get_node_count() const; + + /// Return the smallest ID used. + virtual id_t min_node_id() const; + + /// Return the largest ID used. + virtual id_t max_node_id() const; // We also have some extra functions @@ -393,6 +415,9 @@ class SnarlManager { /// Construct a SnarlManager for the snarls contained in an input stream SnarlManager(istream& in); + + /// Construct a SnarlManager from a function that calls a callback with each Snarl in turn + SnarlManager(const function&)>& for_each_snarl); /// Default constructor for an empty SnarlManager. Must call finish() once /// all snarls have been added with add_snarl(). @@ -404,10 +429,13 @@ class SnarlManager { /// Cannot be copied because of all the internal pointer indexes SnarlManager(const SnarlManager& other) = delete; SnarlManager& operator=(const SnarlManager& other) = delete; - + // copy the SnarlManager /// Can be moved SnarlManager(SnarlManager&& other) = default; SnarlManager& operator=(SnarlManager&& other) = default; + + // Can be serialized + void serialize(ostream& out) const; /////////////////////////////////////////////////////////////////////////// // Write API @@ -462,7 +490,17 @@ class SnarlManager { /// If the given Snarl is backward in its chain, return true. Otherwise, /// return false. bool chain_orientation_of(const Snarl* snarl) const; - + + /// Get the rank that the given snarl appears in in its chain. If two + /// snarls are in forward orientation in the chain, then leaving the end of + /// the lower rank snarl will eventually reach the start of the higher rank + /// snarl. If either or both snarls is backward, you leave/arrive at the + /// other bounding node instead. + /// + /// Sorting snarls by rank will let you visit them in chain order without + /// walking the whole chain. + size_t chain_rank_of(const Snarl* snarl) const; + /// Return true if a Snarl is part of a nontrivial chain of more than one /// snarl. Note that chain_of() still works for snarls in trivial chains. bool in_nontrivial_chain(const Snarl* here) const; @@ -487,27 +525,35 @@ class SnarlManager { /// Returns true if snarl has no parent and false otherwise bool is_root(const Snarl* snarl) const; - + + /// Returns true if the snarl is trivial (an ultrabubble with just the + /// start and end nodes) and false otherwise. + /// TODO: Implement without needing the vg graph, by adding a flag to trivial snarls. + bool is_trivial(const Snarl* snarl, const HandleGraph& graph) const; + + /// Returns true if the snarl lacks any nontrivial children. 
+ bool all_children_trivial(const Snarl* snarl, const HandleGraph& graph) const; + /// Returns a reference to a vector with the roots of the Snarl trees const vector& top_level_snarls() const; /// Returns the Nodes and Edges contained in this Snarl but not in any child Snarls (always includes the /// Nodes that form the boundaries of child Snarls, optionally includes this Snarl's own boundary Nodes) - pair, unordered_set > shallow_contents(const Snarl* snarl, VG& graph, + pair, unordered_set > shallow_contents(const Snarl* snarl, const HandleGraph& graph, bool include_boundary_nodes) const; /// Returns the Nodes and Edges contained in this Snarl, including those in child Snarls (optionally /// includes Snarl's own boundary Nodes) - pair, unordered_set > deep_contents(const Snarl* snarl, VG& graph, + pair, unordered_set > deep_contents(const Snarl* snarl, const HandleGraph& graph, bool include_boundary_nodes) const; /// Look left from the given visit in the given graph and gets all the /// attached Visits to nodes or snarls. - vector visits_left(const Visit& visit, VG& graph, const Snarl* in_snarl) const; + vector visits_left(const Visit& visit, const HandleGraph& graph, const Snarl* in_snarl) const; /// Look left from the given visit in the given graph and gets all the /// attached Visits to nodes or snarls. - vector visits_right(const Visit& visit, VG& graph, const Snarl* in_snarl) const; + vector visits_right(const Visit& visit, const HandleGraph& graph, const Snarl* in_snarl) const; /// Returns a map from all Snarl boundaries to the Snarl they point into. Note that this means that /// end boundaries will be reversed. @@ -531,16 +577,43 @@ class SnarlManager { /// Execute a function on all sites in parallel void for_each_snarl_parallel(const function& lambda) const; - + + /// Execute a function on all top level chains + void for_each_top_level_chain(const function& lambda) const; + + /// Execute a function on all top level chains in parallel + void for_each_top_level_chain_parallel(const function& lambda) const; + /// Ececute a function on all chains void for_each_chain(const function& lambda) const; /// Ececute a function on all chains in parallel void for_each_chain_parallel(const function& lambda) const; + + /// Iterate over snarls as they are stored in deque + void for_each_snarl_unindexed(const function& lambda) const; /// Given a Snarl that we don't own (like from a Visit), find the /// pointer to the managed copy of that Snarl. const Snarl* manage(const Snarl& not_owned) const; + + /// Sample snarls discrete uniformly + /// Returns a nullptr if no snarls are found + const Snarl* discrete_uniform_sample(minstd_rand0& random_engine)const; + + /// Count snarls in deque, a master list of snarls in graph + int num_snarls()const; + + ///Get the snarl number from the SnarlRecord* member with given snarl + inline size_t snarl_number(const Snarl* snarl) const{ + const SnarlRecord* record = SnarlManager::record(snarl); + return record->snarl_number; + } + //use the snarl number to access the Snarl* + inline const Snarl* translate_snarl_num(size_t snarl_num){ + return unrecord(&snarls.at(snarl_num)); + } + private: @@ -548,9 +621,11 @@ class SnarlManager { /// followed by indexing metadata, one after the other in memory. We can /// just cast a Snarl* to a pointer to one of these to get access to all /// the metadata. - struct SnarlRecord : public Snarl { - // Instead of relying on the first member being at offset 0, we inherit - // from the Protobuf type. 
+ struct alignas(alignof(Snarl)) SnarlRecord { + /// With recent Protobuf, we can't inherit from Protobuf generated + /// classes, so we rely on the first member here being at offset 0. + /// This is achieved by making sure SnarlRecord is aligned like Snarl. + Snarl snarl; /// This is a vector of pointers into the master snarl container at /// children. We know the pointers are to valid SnarlRecords. A @@ -568,7 +643,11 @@ class SnarlManager { Chain* parent_chain = nullptr; /// And this is what index we are at in the chain; size_t parent_chain_index = 0; - + + /// This holds the index of the SnarlRecord* in the deque + /// We are doing this because a deque is not contiguous and the index lookup using a SnarlRecord* isn't easily derivable + size_t snarl_number; + /// Allow assignment from a Snarl object, fluffing it up into a full SnarlRecord SnarlRecord& operator=(const Snarl& other) { // Just call the base assignment operator @@ -597,14 +676,10 @@ class SnarlManager { return (Snarl*) record; } + /// Master list of the snarls in the graph. /// Use a deque so pointers never get invalidated but we still have some locality. deque snarls; - - /// Have we finished adding snarls? This ought to be true for any - /// non-trivial read operations. Otherwise the parent/child/chain indexes - /// haven't been computed. - bool finished = false; /// Roots of snarl trees vector roots; @@ -694,6 +769,9 @@ inline Visit to_visit(id_t node_id, bool is_reverse); /// Make a Visit from a snarl to traverse inline Visit to_visit(const Snarl& snarl); + +/// Make a Visit from a handle in a HandleGraph. +inline Visit to_visit(const handlegraph::HandleGraph& graph, const handle_t& handle); /// Get the reversed version of a visit inline Visit reverse(const Visit& visit); @@ -707,11 +785,17 @@ inline Mapping to_mapping(const Visit& visit, std::function node_l /// Converts a Visit to a Mapping. Throws an exception if the Visit is of a Snarl instead /// of a Node. Uses a graph to get node length. 
-inline Mapping to_mapping(const Visit& visit, VG& vg); - +inline Mapping to_mapping(const Visit& visit, const HandleGraph& vg); + +/// Convert a snarl traversal into an alignment +inline Alignment to_alignment(const SnarlTraversal& trav, const HandleGraph& graph); + /// Copies the boundary Visits from one Snarl into another inline void transfer_boundary_info(const Snarl& from, Snarl& to); - + +/// Make an edge_t from a pair of visits +edge_t to_edge(const handlegraph::HandleGraph& graph, const Visit& v1, const Visit& v2); + // We need some Visit operators /** @@ -869,6 +953,10 @@ inline Visit to_visit(const Snarl& snarl) { *to_return.mutable_snarl()->mutable_end() = snarl.end(); return to_return; } + +inline Visit to_visit(const handlegraph::HandleGraph& graph, const handle_t& handle) { + return to_visit(graph.get_id(handle), graph.get_is_reverse(handle)); +} inline Visit reverse(const Visit& visit) { // Copy the visit @@ -905,17 +993,26 @@ inline Mapping to_mapping(const Visit& visit, std::function node_l return mapping; } -inline Mapping to_mapping(const Visit& visit, VG& graph) { +inline Mapping to_mapping(const Visit& visit, const HandleGraph& graph) { return to_mapping(visit, [&](id_t id) { - return graph.get_node(id)->sequence().size(); + return graph.get_length(graph.get_handle(id)); }); } + +inline Alignment to_alignment(const SnarlTraversal& trav, const HandleGraph& graph) { + Alignment aln; + Path* path = aln.mutable_path(); + for (int i = 0; i < trav.visit_size(); ++i) { + *path->add_mapping() = to_mapping(trav.visit(i), graph); + } + return aln; +} inline void transfer_boundary_info(const Snarl& from, Snarl& to) { *to.mutable_start() = from.start(); *to.mutable_end() = from.end(); } - + } // note: this hash funtion is not used internally because we want the internal indices to ignore any diff --git a/src/source_sink_overlay.cpp b/src/source_sink_overlay.cpp new file mode 100644 index 00000000000..b3c3ebe32a4 --- /dev/null +++ b/src/source_sink_overlay.cpp @@ -0,0 +1,298 @@ +#include "source_sink_overlay.hpp" + +#include + +//#define debug + +namespace vg { + +using namespace std; +using namespace handlegraph; + +SourceSinkOverlay::SourceSinkOverlay(const HandleGraph* backing, size_t length, id_t source_id, id_t sink_id, + bool break_disconnected) : node_length(length), backing(backing), source_id(source_id), sink_id(sink_id) { + + // Both IDs or neither must be specified. + assert((this->source_id == 0) == (this->sink_id == 0)); + + if (this->source_id == 0 || this->sink_id == 0) { + // We need to autodetect our source and sink IDs + id_t backing_max_id = backing->get_node_count() > 0 ? backing->max_node_id() : 0; + + this->source_id = backing_max_id + 1; + this->sink_id = this->source_id + 1; + } + +#ifdef debug + cerr << "Make overlay for kmer size " << length << " with source " << this->source_id << " and sink " << this->sink_id << endl; +#endif + + // We have to divide the graph into connected components and get ahold of the tips. 
+ vector, vector>> components = handlealgs::weakly_connected_components_with_tips(backing); + + for (auto& component : components) { + // Unpack each component + auto& component_ids = component.first; + auto& component_tips = component.second; + +#ifdef debug + cerr << "Weakly connected component of " << component_ids.size() << " has " << component_tips.size() << " tips:" << endl; + for (auto& tip : component_tips) { + cerr << "\t" << backing->get_id(tip) << " orientation " << backing->get_is_reverse(tip) << endl; + } +#endif + + // All the components need to be nonempty + assert(!component_ids.empty()); + + for (auto& handle : component_tips) { + // We need to cache the heads and tails as sets of handles, so we know to + // make edges to all of them when reading out of our synthetic source and + // sink nodes. + + if (backing->get_is_reverse(handle)) { + // It's a tail. Insert it forward as a tail. + backing_tails.insert(backing->flip(handle)); + } else { + // It's a head + backing_heads.insert(handle); + } + + } + + if (component_tips.empty() && break_disconnected) { + // If we're supposed to break open cycles, we also mix in an arbitrary node + // from each tipless component as a head, and each handle that reads into + // it as a tail. + + // Choose a fake head arbitrarily + handle_t fake_head = backing->get_handle(*component_ids.begin(), false); + backing_heads.insert(fake_head); + + // Find the fake tails that are to the left of it + backing->follow_edges(fake_head, true, [&](const handle_t& fake_tail) { + backing_tails.insert(fake_tail); + }); + } + } + + +} + +handle_t SourceSinkOverlay::get_source_handle() const { + return source_fwd; +} + +handle_t SourceSinkOverlay::get_sink_handle() const { + return sink_fwd; +} + +bool SourceSinkOverlay::has_node(id_t node_id) const { + return backing->has_node(node_id); +} + +handle_t SourceSinkOverlay::get_handle(const id_t& node_id, bool is_reverse) const { + if (node_id == source_id) { + // They asked for the source node + return is_reverse ? source_rev : source_fwd; + } else if (node_id == sink_id) { + // They asked for the sink node + return is_reverse ? 
sink_rev : sink_fwd; + } else { + // Otherwise they asked for something in the backing graph + handle_t backing_handle = backing->get_handle(node_id, is_reverse); + + // Budge up to make room for the source and sink in each orientation + return as_handle(as_integer(backing_handle) + 4); + } +} + +id_t SourceSinkOverlay::get_id(const handle_t& handle) const { + if (handle == source_fwd || handle == source_rev) { + return source_id; + } else if (handle == sink_fwd || handle == sink_rev) { + return sink_id; + } else { + return backing->get_id(to_backing(handle)); + } +} + +bool SourceSinkOverlay::get_is_reverse(const handle_t& handle) const { + if (handle == source_fwd || handle == sink_fwd) { + return false; + } else if (handle == source_rev || handle == sink_rev) { + return true; + } else { + return backing->get_is_reverse(to_backing(handle)); + } +} + +handle_t SourceSinkOverlay::flip(const handle_t& handle) const { + if (is_ours(handle)) { + // In our block of two handles, orientation is the low bit + return as_handle(as_integer(handle) ^ 1); + } else { + // Make the backing graph flip it + return from_backing(backing->flip(to_backing(handle))); + } +} + +size_t SourceSinkOverlay::get_length(const handle_t& handle) const { + if (is_ours(handle)) { + // Both our fake nodes are the same length + return node_length; + } else { + return backing->get_length(to_backing(handle)); + } +} + +string SourceSinkOverlay::get_sequence(const handle_t& handle) const { + if (handle == source_fwd || handle == sink_rev) { + // Reading into the graph is all '#' + return string(node_length, '#'); + } else if (handle == source_rev || handle == sink_fwd) { + // Reading out of the graph is all '$' + return string(node_length, '$'); + } else { + assert(!is_ours(handle)); + return backing->get_sequence(to_backing(handle)); + } +} + +bool SourceSinkOverlay::follow_edges_impl(const handle_t& handle, bool go_left, const function& iteratee) const { + if (is_ours(handle)) { + // We only care about the right of the source and the left of the sink + if ((handle == source_fwd && !go_left) || (handle == source_rev && go_left)) { + // We want the right of the source (every head node in the backing graph) + // Make sure to put it in the appropriate orientation. + + for (const handle_t& backing_head : backing_heads) { + // Feed each backing graph head to the iteratee, in the + // appropriate orientation depending on which way we want to + // go. + if (!iteratee(from_backing(go_left ? backing->flip(backing_head) : backing_head))) { + // If they say to stop, stop + return false; + } + } + + } else if ((handle == sink_fwd && go_left) || (handle == sink_rev && !go_left)) { + // We want the left of the sink (every tail node in the backing graph) + // Make sure to put it in the appropriate orientation. + + for (const handle_t& backing_tail : backing_tails) { + // Feed each backing graph tail to the iteratee, in the + // appropriate orientation depending on which way we want to + // go. + if (!iteratee(from_backing(go_left ? backing_tail : backing->flip(backing_tail)))) { + // If they say to stop, stop + return false; + } + } + } + return true; + } else { + // The handle refers to a node in the backing graph + auto backing_handle = to_backing(handle); + + if ((backing_heads.count(backing_handle) && go_left) || (backing_heads.count(backing->flip(backing_handle)) && !go_left)) { + // We want to read left off a head (possibly in reverse) into the synthetic source + if (!iteratee(go_left ? 
source_fwd : source_rev)) { + // If they say stop, stop + return false; + } + } + + if ((backing_tails.count(backing_handle) && !go_left) || (backing_tails.count(backing->flip(backing_handle)) && go_left)) { + // We want to read right off a tail (possibly in reverse) into the synthetic sink + if (!iteratee(go_left ? sink_rev : sink_fwd)) { + // If they say stop, stop + return false; + } + } + + // If we get through those, do the actual edges in the backing graph + return backing->follow_edges(backing_handle, go_left, [&](const handle_t& found) -> bool { + return iteratee(from_backing(found)); + }); + } +} + +bool SourceSinkOverlay::for_each_handle_impl(const function& iteratee, bool parallel) const { + + // First do the source and sink we added + if (!iteratee(source_fwd)) { + return false; + } + if (!iteratee(sink_fwd)) { + return false; + } + +#ifdef debug + cerr << "Try backing graph " << (parallel ? "in parallel" : "") << endl; +#endif + return backing->for_each_handle([&](const handle_t& backing_handle) -> bool { + // Now do each backing node, possibly in parallel. +#ifdef debug + cerr << "Invoke iteratee on " << backing->get_id(backing_handle) << endl; +#endif + return iteratee(from_backing(backing_handle)); + }, parallel); +} + +size_t SourceSinkOverlay::get_node_count() const { + return backing->get_node_count() + 2; +} + +id_t SourceSinkOverlay::min_node_id() const { + return min(backing->min_node_id(), min(source_id, sink_id)); +} + +id_t SourceSinkOverlay::max_node_id() const { + return max(backing->max_node_id(), max(source_id, sink_id)); +} + +size_t SourceSinkOverlay::get_degree(const handle_t& handle, bool go_left) const { + if (is_ours(handle)) { + if ((handle == source_fwd && !go_left) || (handle == source_rev && go_left)) { + // We are reading into every graph head + return backing_heads.size(); + } else if ((handle == sink_fwd && go_left) || (handle == sink_rev && !go_left)) { + // We are reading into every graph tail + return backing_tails.size(); + } + // Otherwise we're reading off the outside ends of the source/sink nodes + return 0; + } else { + // We need to find the backing graph degree and possibly adjust it if this is a head or tail + handle_t backing_handle = to_backing(handle); + + size_t degree = backing->get_degree(backing_handle, go_left); + + if (backing_heads.count(backing->forward(backing_handle))) { + // We are a head. Are we going off the left end when forward, or the right end when reverse? + if (go_left != backing->get_is_reverse(backing_handle)) { + // If so we count the synthetic edge. + degree++; + } + } + if (backing_tails.count(backing->forward(backing_handle))) { + // We are a tail. Are we going off the left end when reverse, or the right end when forward? + if (go_left != !backing->get_is_reverse(backing_handle)) { + // If so we count the synthetic edge.
+ degree++; + } + } + + return degree; + } +} + +handle_t SourceSinkOverlay::get_underlying_handle(const handle_t& handle) const { + if (is_ours(handle)) { + throw std::runtime_error("error:[SourceSinkOverlay] cannot request underlying handle of source or sink node"); + } + return to_backing(handle); +} + +} diff --git a/src/source_sink_overlay.hpp b/src/source_sink_overlay.hpp new file mode 100644 index 00000000000..70b9e65b85b --- /dev/null +++ b/src/source_sink_overlay.hpp @@ -0,0 +1,160 @@ +#ifndef VG_SOURCE_SINK_OVERLAY_HPP_INCLUDED +#define VG_SOURCE_SINK_OVERLAY_HPP_INCLUDED + +/** + * \file source_sink_overlay.hpp + * + * Provides SourceSinkOverlay, a HandleGraph implementation that joins all the + * heads and tails of a backing graph to single source and sink nodes. + * + */ + + +#include "handle.hpp" + +#include + + +namespace vg { + +using namespace handlegraph; + +/** + * Present a HandleGraph that is a backing HandleGraph with all its head nodes + * connected to a single source node, and all its tail nodes connected to a + * single sink node. + */ +class SourceSinkOverlay : public ExpandingOverlayGraph { + +public: + /** + * Make a new SourceSinkOverlay. The backing graph must not be modified + * while the overlay exists. + * + * The overlay will project a source node consisting of '#' characters, and + * a sink node consisting of '$' characters. The lengths of the nodes may + * be specified, and default to 1024, the max length that GCSA2 supports. + * The IDs of the nodes will be autodetected from the backing graph's max + * ID if not specified (or given as 0). If either is specified, both must + * be specified. + * + * Also breaks into disconnected components with no tips, unless + * break_disconnected is false. When breaking into such a component, we + * choose an arbitrary node, link the source node to its start, and link + * everything that also went to its start to the sink node. + */ + SourceSinkOverlay(const HandleGraph* backing, size_t length = 1024, id_t source_id = 0, id_t sink_id = 0, + bool break_disconnected = true); + + /// Expose the handle to the synthetic source + handle_t get_source_handle() const; + + /// Expose the handle to the synthetic sink + handle_t get_sink_handle() const; + + //////////////////////////////////////////////////////////////////////////// + // Handle-based interface + //////////////////////////////////////////////////////////////////////////// + + /// Check if a node exists by ID + virtual bool has_node(id_t node_id) const; + + /// Look up the handle for the node with the given ID in the given orientation + virtual handle_t get_handle(const id_t& node_id, bool is_reverse) const; + + /// Get the ID from a handle + virtual id_t get_id(const handle_t& handle) const; + + /// Get the orientation of a handle + virtual bool get_is_reverse(const handle_t& handle) const; + + /// Invert the orientation of a handle (potentially without getting its ID) + virtual handle_t flip(const handle_t& handle) const; + + /// Get the length of a node + virtual size_t get_length(const handle_t& handle) const; + + /// Get the sequence of a node, presented in the handle's local forward + /// orientation. + virtual string get_sequence(const handle_t& handle) const; + + /// Loop over all the handles to next/previous (right/left) nodes. Passes + /// them to a callback which returns false to stop iterating and true to + /// continue. Returns true if we finished and false if we stopped early. 
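+    /// Edges to and from the synthetic source and sink nodes are reported in addition to the edges of the backing graph.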
+ virtual bool follow_edges_impl(const handle_t& handle, bool go_left, const function& iteratee) const; + + /// Loop over all the nodes in the graph in their local forward + /// orientations, in their internal stored order. Stop if the iteratee returns false. + virtual bool for_each_handle_impl(const function& iteratee, bool parallel = false) const; + + /// Return the number of nodes in the graph + virtual size_t get_node_count() const; + + /// Return the smallest ID in the graph. + virtual id_t min_node_id() const; + + /// Return the largest ID in the graph. + virtual id_t max_node_id() const; + + /// Compute the degree of one side of a handle in O(1) time, if the backing + /// graph also provides this facility in O(1) time. Takes O(n) time + /// otherwise in the returned degree. + virtual size_t get_degree(const handle_t& handle, bool go_left) const; + + //////////////////////////////////////////////////////////////////////////// + // Overlay interface + //////////////////////////////////////////////////////////////////////////// + + /// Get the handle in the underlying graph that corresponds to the handle in + /// the overlay. + /// Throws an error if called on either the source or sink node + virtual handle_t get_underlying_handle(const handle_t& handle) const; + +protected: + + /// How long are the projected nodes? + size_t node_length; + + /// What backing graph do we overlay? + const HandleGraph* backing; + + /// What is our projected source node ID? + id_t source_id; + /// What is our projected sink node ID? + id_t sink_id; + + /// We keep a set of backing graph head handles, in backing graph handle + /// space. This also includes anything else we need to hook up to our + /// source node to break into tipless components. + unordered_set backing_heads; + /// And similarly for the tails. These handles read out of their components. + unordered_set backing_tails; + + // We reserve the 4 low numbers of the handles for our new source and sink, and shift everything else up. + // These could have been static, but I couldn't figure out how to properly initialize them using constexpr functions. 
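+    // The layout is: 0 = source forward, 1 = source reverse, 2 = sink forward, 3 = sink reverse, so the
+    // low bit of an overlay-owned handle encodes orientation. Every backing handle maps to its own integer
+    // value plus 4 (e.g. a backing handle packed as 7 becomes overlay handle 11).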
+ const handle_t source_fwd = as_handle(0); + const handle_t source_rev = as_handle(1); + const handle_t sink_fwd = as_handle(2); + const handle_t sink_rev = as_handle(3); + + /// Convert a backing graph handle to our handle to the same node + inline handle_t from_backing(const handle_t& backing_handle) const { + return as_handle(as_integer(backing_handle) + 4); + } + + /// Convert our handle to a backing graph node into a backing graph handle to the same node + inline handle_t to_backing(const handle_t& our_handle) const { + return as_handle(as_integer(our_handle) - 4); + } + + /// Determine if a handle points to an overlay-added node or not + inline bool is_ours(const handle_t& our_handle) const { + return ((uint64_t) as_integer(our_handle)) < 4; + } + +}; + + +} + +#endif diff --git a/src/sparse_union_find.cpp b/src/sparse_union_find.cpp new file mode 100644 index 00000000000..933d8560606 --- /dev/null +++ b/src/sparse_union_find.cpp @@ -0,0 +1,114 @@ + +#include +#include +#include "sparse_union_find.hpp" +// #define debug +namespace vg { + + using namespace std; + + SparseUnionFind::SparseUnionFind(bool include_children, vector node_ids) : + include_children(include_children), node_ids(node_ids), UnionFind(node_ids.size(), include_children) { + for (size_t i = 0; i < node_ids.size(); i++) { + sparse_to_dense[node_ids[i]] = i; + dense_to_sparse[i]= node_ids[i]; + } +#ifdef debug + cout << "============================================================================= " << endl; + cout << "sparse_to_dense" < SparseUnionFind::group(size_t i) { + + + vector to_return = UnionFind::group(sparse_to_dense.at(i)); + + vector sparse_to_return; + //traverse to_return and retrieve ids from dense_to_sparse + for(size_t i =0; i < to_return.size(); i++){ + //iterate through vector + size_t node_to_lookup = to_return[i]; + //do a lookup + size_t translated_node_id= dense_to_sparse[node_to_lookup]; + //push to vector + sparse_to_return.push_back(translated_node_id); + } + return sparse_to_return; + } + + vector> SparseUnionFind::all_groups() { + vector> to_return = UnionFind::all_groups(); + + vector> sparse_to_return; + + //translate from dense to sparse + for(size_t i = 0; i sparse_groups; + for(size_t j = 0; j +#include +#include +#include +#include +#include +#include + +namespace vg { + + using namespace std; + using namespace structures; + + class SparseUnionFind : public UnionFind{ + + public: + vector node_ids; + unordered_map sparse_to_dense;//incoming + unordered_map dense_to_sparse;//outgoing + + + SparseUnionFind(bool include_children, vector node_ids); + + /// Destructor + ~SparseUnionFind(); + + /// Returns the number of indices in the UnionFind + size_t size(); + + /// Returns the group ID that index i belongs to (can change after calling union) + size_t find_group(size_t i); + + /// Merges the group containing index i with the group containing index j + void union_groups(size_t i, size_t j); + + /// Returns the size of the group containing index i + size_t group_size(size_t i); + + /// Returns a vector of the indices in the same group as index i + vector group(size_t i); + + /// Returns all of the groups, each in a separate vector + vector> all_groups(); + + private: + bool include_children; + + }; + + +} +#endif \ No newline at end of file diff --git a/src/splicing.cpp b/src/splicing.cpp new file mode 100644 index 00000000000..2cb7f94ffd6 --- /dev/null +++ b/src/splicing.cpp @@ -0,0 +1,1706 @@ +/** + * \file splicing.cpp + * + * Implements SpliceRegion and some other splicing tools + * 
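+ * Besides SpliceRegion, this includes SpliceStats (splice motif and intron length scoring),
+ * JoinedSpliceGraph, and helpers for trimming, splitting, and fusing spliced alignments.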
+ */ + +#include "splicing.hpp" + +//#define debug_splice_region +//#define debug_trimming +//#define debug_fusing +//#define debug_from_hit +//#define debug_linker_split + +#ifdef debug_splice_region +#include +#endif + +namespace vg { + +SpliceStats::SpliceStats(const GSSWAligner& scorer) { + + // human frequencies from Burset, Seledstov, and Solovyev (2000) + vector> default_motifs{ + {string("GT"), string("AG"), 0.9924}, + {string("GC"), string("AG"), 0.0069}, + {string("AT"), string("AC"), 0.0005} + }; + // mixture model trained on gencode v. 29 + vector default_mixture_weights{ + 0.056053626960353785, + 0.08887092416144658, + 0.24633134729683695, + 0.0008866793308038118, + 0.6078574222505589 + }; + vector> default_component_params{ + {4.531698286987208, 0.137211790877491}, + {5.272613870298457, 0.432711724560919}, + {9.092960704882925, 1.3246330622550786}, + {0.6443259788228138, 0.5969347049425677}, + {7.256815224883574, 1.0409647232592127} + }; + init(default_motifs, default_mixture_weights, default_component_params, scorer); +} + +SpliceStats::SpliceStats(const vector>& motifs, + const vector& lognormal_mixture_weights, + const vector>& lognormal_component_params, + const GSSWAligner& scorer) { + init(motifs, lognormal_mixture_weights, lognormal_component_params, scorer); +} + +size_t SpliceStats::motif_size() const { + return motif_data.size(); +} + +const string& SpliceStats::oriented_motif(size_t motif_num, bool left_side) const { + return left_side ? get<1>(motif_data[motif_num]) : get<0>(motif_data[motif_num]); +} + +bool SpliceStats::motif_is_reverse(size_t motif_num) const { + return motif_num % 2; +} + +string SpliceStats::unoriented_motif(size_t motif_num, bool left_side) const { + return left_side ? get<1>(unaltered_motif_data[motif_num / 2]) : get<0>(unaltered_motif_data[motif_num / 2]); +} + +int32_t SpliceStats::motif_score(size_t motif_num) const { + return get<2>(motif_data[motif_num]); +} + +double SpliceStats::motif_frequency(size_t motif_num) const { + return get<2>(unaltered_motif_data[motif_num / 2]); +} + +int32_t SpliceStats::intron_length_score(int64_t length) const { + return round((intron_length_log_likelihood(length) - mode_log_likelihood) / log_base); +} + +void SpliceStats::update_motifs(const vector>& motifs, + const GSSWAligner& scorer) { + init(motifs, mixture_weights, component_params, scorer); +} + +void SpliceStats::update_intron_length_distribution(const vector& lognormal_mixture_weights, + const vector>& lognormal_component_params, + const GSSWAligner& scorer) { + init(unaltered_motif_data, lognormal_mixture_weights, lognormal_component_params, scorer); +} + +void SpliceStats::update_scoring(const GSSWAligner& scorer) { + init(unaltered_motif_data, mixture_weights, component_params, scorer); +} + +double SpliceStats::intron_length_log_likelihood(int64_t length) const { + double x = length; + double likelihood = 0.0; + for (size_t i = 0; i < mixture_weights.size(); ++i) { + double mu, sigma; + tie(mu, sigma) = component_params[i]; + likelihood += mixture_weights[i] * lognormal_pdf(x, mu, sigma); + } + return log(likelihood); +} + +void SpliceStats::init(const vector>& motifs, + const vector& lognormal_mixture_weights, + const vector>& lognormal_component_params, + const GSSWAligner& scorer) { + + if (lognormal_mixture_weights.size() != lognormal_component_params.size()) { + cerr << "error:[SpliceStats] do not have same number of weights and component parameters" << endl; + exit(1); + } + double total_weight = 0.0; + for (auto wt : 
lognormal_mixture_weights) { + total_weight += wt; + } + if (abs(total_weight - 1.0) > .00001) { + cerr << "error:[SpliceStats] mixture component weights do not sum to 1" << endl; + exit(1); + } + if (motifs.empty()) { + cerr << "error:[SpliceStats] list of motifs is empty" << endl; + exit(1); + } + if (lognormal_mixture_weights.empty()) { + cerr << "error:[SpliceStats] list of intron length distribution parameters is empty" << endl; + exit(1); + } + + // TODO: does this normalization to 1 make sense? + double total_frequency = 0.0; + for (const auto& record : motifs) { + if (get<0>(record).size() != 2 || get<1>(record).size() != 2) { + cerr << "error:[SpliceStats] Splice motif " << get<0>(record) << "-" << get<1>(record) << " is not a pair of dinucleotides." << endl; + exit(1); + } + if (get<2>(record) < 0.0 || get<2>(record) > 1.0) { + cerr << "error:[SpliceStats] Frequency of splice motif " << get<0>(record) << "-" << get<1>(record) << " given as " << get<2>(record) << ". Must be a number between 0 and 1." << endl; + exit(1); + } + total_frequency += get<2>(record); + } + // a little slop for numerical imprecision + if (total_frequency > 1.000001) { + cerr << "error:[SpliceStats] Frequency of splice motifs sum to " << total_frequency << ". Must be a number between 0 and 1." << endl; + exit(1); + } + + // in case we're resetting + motif_data.clear(); + unaltered_motif_data = motifs; + +#ifdef debug_splice_region + cerr << "recording splice table" << endl; +#endif + + motif_data.reserve(motifs.size()); + for (const auto& record : motifs) { + int32_t score = round(log(get<2>(record)) / scorer.log_base); + motif_data.emplace_back(); + get<0>(motif_data.back()) = get<0>(record); + // reverse the second string because it's encountered in reverse when going into + // an intron + get<1>(motif_data.back()) = string(get<1>(record).rbegin(), get<1>(record).rend()); + // convert frequency to a log likelihood + get<2>(motif_data.back()) = score; + + // now do the reverse complement + motif_data.emplace_back(); + get<0>(motif_data.back()) = reverse_complement(get<1>(record)); + get<1>(motif_data.back()) = reverse_complement(string(get<0>(record).rbegin(), get<0>(record).rend())); + get<2>(motif_data.back()) = score; + } + +#ifdef debug_splice_region + for (int i = 0; i < motif_data.size(); ++i) { + const auto& record = motif_data[i]; + cerr << (i % 2 == 0 ? 
"+" : "-") << "\t" << get<0>(record) << "\t" << get<1>(record) << "\t" << get<2>(record) << endl; + } +#endif + + log_base = scorer.log_base; + mixture_weights = lognormal_mixture_weights; + component_params = lognormal_component_params; + + // find the mode of the mixture distribution + + // determine the interval we're going to search in + int64_t mode_range_min = numeric_limits::max(); + int64_t mode_range_max = numeric_limits::min(); + for (const auto& comp_params : component_params) { + double mu, sigma; + tie(mu, sigma) = comp_params; + mode_range_min = min(mode_range_min, floor(exp(mu - sigma * sigma))); + mode_range_max = max(mode_range_max, ceil(exp(mu))); + } + + // search for the modal log likeilhood + mode_log_likelihood = -numeric_limits::max(); + int64_t modal_length = -1; + + // in case of very wide distributions, limit the total number of steps (~100k) + int64_t max_num_steps = 128 * 1024; + int64_t step = max((mode_range_max - mode_range_min) / max_num_steps, 1); + for (int64_t l = mode_range_min; l <= mode_range_max; l += step) { + double log_likelihood = intron_length_log_likelihood(l); + if (log_likelihood > mode_log_likelihood) { + mode_log_likelihood = log_likelihood; + modal_length = l; + } + } + // refine in case the step was too small + for (int64_t l = max(0, modal_length - step); l < modal_length + step; ++l) { + double log_likelihood = intron_length_log_likelihood(l); + if (log_likelihood > mode_log_likelihood) { + mode_log_likelihood = log_likelihood; + } + } +} + +// FIXME: magic numbers in the incremental graph initializer... +SpliceRegion::SpliceRegion(const pos_t& seed_pos, bool search_left, int64_t search_dist, + const HandleGraph& graph, + const DinucleotideMachine& dinuc_machine, + const SpliceStats& splice_stats) + : subgraph(graph, seed_pos, search_left, search_dist + 2, 5, search_dist * search_dist), motif_matches(splice_stats.motif_size()) +{ + +#ifdef debug_splice_region + cerr << "constructing splice region starting from seed pos " << seed_pos << " in direction left? " << search_left << ", max dist " << search_dist << endl; +#endif + + + // add a buffer of 2 bases for the dinucleotide itself + // TODO: feels inelegant to do this here and in the initializer list + search_dist += 2; + + // remember the starting location + handle_t handle = subgraph.handle_at_order(0); + seed = pair(handle, offset(seed_pos)); + + // extract the subgraph and initialize the DP structure + vector>> dinuc_states; + dinuc_states.emplace_back(handle, vector(subgraph.get_length(handle), + dinuc_machine.init_state())); + + while (subgraph.is_extendable()) { + handle = subgraph.extend(); + dinuc_states.emplace_back(handle, vector(subgraph.get_length(handle), + dinuc_machine.init_state())); +#ifdef debug_splice_region + cerr << "extract " << graph.get_id(subgraph.get_underlying_handle(handle)) << " " << graph.get_is_reverse(subgraph.get_underlying_handle(handle)) << " at distance " << subgraph.min_distance_from_start(handle) << endl; +#endif + } + int64_t incr = search_left ? 
-1 : 1; + + // check if we match any motifs at this location and if so remember it + auto record_motif_matches = [&](handle_t handle, int64_t j, + const vector& states) { + for (size_t i = 0; i < splice_stats.motif_size(); ++i) { + if (dinuc_machine.matches(states[j], splice_stats.oriented_motif(i, search_left))) { + if ((j == 0 && !search_left) || (j + 1 == states.size() && search_left)) { + // we need to cross a node boundary to backtrack + subgraph.follow_edges(handle, !search_left, [&](const handle_t& prev) { + if (search_left) { + if (subgraph.get_base(prev, 0) == splice_stats.oriented_motif(i, true).front() && + (prev != seed.first || seed.second != 0)) { + int64_t trav_dist = subgraph.min_distance_from_start(prev) + subgraph.get_length(prev) - 1; + motif_matches[i].emplace_back(prev, 1, trav_dist); +#ifdef debug_splice_region + cerr << "record match to motif " << i << " at " << subgraph.order_of(prev) << "-th node " << subgraph.get_id(prev) << ", ending on node " << subgraph.order_of(handle) << ", dist " << trav_dist << endl; +#endif + } + } + else { + size_t k = subgraph.get_length(prev) - 1; + if (subgraph.get_base(prev, k) == splice_stats.oriented_motif(i, false).front() && + (prev != seed.first || seed.second != subgraph.get_length(seed.first))) { + int64_t trav_dist = subgraph.min_distance_from_start(prev) + k; + motif_matches[i].emplace_back(prev, k, trav_dist); +#ifdef debug_splice_region + cerr << "record match to motif " << i << " at " << subgraph.order_of(prev) << "-th node " << subgraph.get_id(prev) << ", ending on node " << subgraph.order_of(handle) << ", dist " << trav_dist << endl; +#endif + } + } + }); + } + else { + int64_t trav_dist = subgraph.min_distance_from_start(handle); + if (search_left) { + trav_dist += states.size() - j - 2; + } + else { + trav_dist += j - 1; + } +#ifdef debug_splice_region + cerr << "record match to motif " << i << " at " << (j - 2 * incr + !search_left) << "-th node " << subgraph.get_id(handle) << ", ending on same node, dist " << trav_dist << endl; +#endif + motif_matches[i].emplace_back(handle, j - 2 * incr + !search_left, trav_dist); + } + } + } + }; + + + // now actually do the DP + for (size_t i = 0; i < dinuc_states.size(); ++i) { + + handle_t here = dinuc_states[i].first; + vector& states = dinuc_states[i].second; + string seq = subgraph.get_sequence(here); + + // determine where we'll start iterating from + int64_t j; + if (i == 0) { + j = search_left ? offset(seed_pos) - 1 : offset(seed_pos); + } + else { + j = search_left ? seq.size() - 1 : 0; + } + + // determine the bounds of the iteration + int64_t prev_dist = subgraph.min_distance_from_start(here); + int64_t left_end = 0; + int64_t right_end = seq.size(); + if (prev_dist + seq.size() >= search_dist) { + if (search_left) { + left_end = prev_dist + seq.size() - search_dist; + } + else { + right_end = search_dist - prev_dist; + } + } + +#ifdef debug_splice_region + cerr << "node number " << i << ", underlying ID " << graph.get_id(subgraph.get_underlying_handle(here)) << ", iteration bounds: j = " << j << ", incr = " << incr << ", left end = " << left_end << ", right end " << right_end << ", node len = " << seq.size() << endl; +#endif + // are we starting at the boundary of a node? 
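+        // If so, the dinucleotide state here has to be seeded by merging in the states carried over from the ends of the predecessor nodes.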
+ if ((j == 0 && !search_left) || (j == seq.size() - 1 && search_left)) { + // merge all of the incoming transition states + subgraph.follow_edges(here, !search_left, [&](const handle_t& prev) { + vector& incoming_states = dinuc_states[subgraph.order_of(prev)].second; + uint32_t incoming = search_left ? incoming_states.front() : incoming_states.back(); + states[j] = dinuc_machine.merge_state(states[j], dinuc_machine.update_state(incoming, seq[j])); + }); + record_motif_matches(here, j, states); + j += incr; + } + + // carry forward the transitions to the end of the node + for (; j >= left_end && j < right_end; j += incr) { + states[j] = dinuc_machine.update_state(states[j - incr], seq[j]); + record_motif_matches(here, j, states); + } + } +} + +const IncrementalSubgraph& SpliceRegion::get_subgraph() const { + return subgraph; +} + +const pair& SpliceRegion::get_seed_pos() const { + return seed; +} + +const vector>& SpliceRegion::candidate_splice_sites(size_t motif_num) const { + return motif_matches[motif_num]; +} + +JoinedSpliceGraph::JoinedSpliceGraph(const HandleGraph& parent_graph, + const IncrementalSubgraph& left_subgraph, + handle_t left_splice_node, size_t left_splice_offset, + const IncrementalSubgraph& right_subgraph, + handle_t right_splice_node, size_t right_splice_offset) + : parent_graph(&parent_graph), left_subgraph(&left_subgraph), right_subgraph(&right_subgraph), + left_handle_trans(left_subgraph.get_node_count(), -1), right_handle_trans(right_subgraph.get_node_count(), -1), + left_splice_offset(left_splice_offset), right_splice_offset(right_splice_offset) +{ + // TODO: use the handle translator as scratch instead of these temporary vectors? + vector keep_left(left_subgraph.get_node_count(), false); + vector keep_right(right_subgraph.get_node_count(), false); + + // keep handles that can reach the left side of the join + keep_left[left_subgraph.order_of(left_splice_node)] = true; + vector stack(1, left_splice_node); + while (!stack.empty()) { + handle_t here = stack.back(); + stack.pop_back(); + left_subgraph.follow_edges(here, true, [&](const handle_t& prev) { + if (!keep_left[left_subgraph.order_of(prev)]) { + keep_left[left_subgraph.order_of(prev)] = true; + stack.emplace_back(prev); + } + }); + } + + // keep handles that can reach the right side of the join + stack.emplace_back(right_splice_node); + keep_right[right_subgraph.order_of(right_splice_node)] = true; + // TODO: repetitive code + while (!stack.empty()) { + handle_t here = stack.back(); + stack.pop_back(); + right_subgraph.follow_edges(here, false, [&](const handle_t& prev) { + if (!keep_right[right_subgraph.order_of(prev)]) { + keep_right[right_subgraph.order_of(prev)] = true; + stack.emplace_back(prev); + } + }); + } + + for (int64_t i = 0; i < left_subgraph.get_node_count(); ++i) { + if (keep_left[i]) { + left_handle_trans[i] = handle_idxs.size(); + handle_idxs.push_back(i); + } + } + + num_left_handles = handle_idxs.size(); + + // in reverse order + for (int64_t i = right_subgraph.get_node_count() - 1; i >= 0; --i) { + if (keep_right[i]) { + right_handle_trans[i] = handle_idxs.size(); + handle_idxs.push_back(i); + } + } +} + +pair JoinedSpliceGraph::translate_node_ids(Path& path) const { + + pair splice_idxs(numeric_limits::max(), + numeric_limits::max()); + + for (size_t i = 0; i < path.mapping_size(); ++i) { + + Position* position = path.mutable_mapping(i)->mutable_position(); + + // record any splice positions + if (position->node_id() == get_id(left_splice_node())) { + splice_idxs.first = i; + } + else 
if (position->node_id() == get_id(right_splice_node())) { + splice_idxs.second = i; + } + + // project down to the parent graph + size_t j = position->node_id() - 1; + auto subgraph = j < num_left_handles ? left_subgraph : right_subgraph; + handle_t underlying = subgraph->get_underlying_handle(subgraph->handle_at_order(handle_idxs[j])); + if (position->is_reverse()) { + underlying = parent_graph->flip(underlying); + } + // adjust offsets and IDs in the position + auto interval = underlying_interval(get_handle(position->node_id(), position->is_reverse())); + position->set_node_id(parent_graph->get_id(underlying)); + position->set_is_reverse(parent_graph->get_is_reverse(underlying)); + position->set_offset(position->offset() + interval.first); + } + return splice_idxs; +} + +handle_t JoinedSpliceGraph::left_seed_node() const { + return handlegraph::number_bool_packing::pack(0, false); +} + +handle_t JoinedSpliceGraph::right_seed_node() const { + return handlegraph::number_bool_packing::pack(handle_idxs.size() - 1, false); +} + +handle_t JoinedSpliceGraph::left_splice_node() const { + return handlegraph::number_bool_packing::pack(num_left_handles - 1, false); +} + +handle_t JoinedSpliceGraph::right_splice_node() const { + return handlegraph::number_bool_packing::pack(num_left_handles, false); +} + +int64_t JoinedSpliceGraph::min_link_length() const { + handle_t splice_left = left_subgraph->handle_at_order(handle_idxs[num_left_handles - 1]); + handle_t splice_right = right_subgraph->handle_at_order(handle_idxs[num_left_handles]); + return (left_subgraph->min_distance_from_start(splice_left) + + right_subgraph->min_distance_from_start(splice_right) + + left_splice_offset + + right_subgraph->get_length(splice_right) - right_splice_offset); +} + +int64_t JoinedSpliceGraph::max_link_length() const { + handle_t splice_left = left_subgraph->handle_at_order(handle_idxs[num_left_handles - 1]); + handle_t splice_right = right_subgraph->handle_at_order(handle_idxs[num_left_handles]); + return (left_subgraph->max_distance_from_start(splice_left) + + right_subgraph->max_distance_from_start(splice_right) + + left_splice_offset + + right_subgraph->get_length(splice_right) - right_splice_offset); + +} + +bool JoinedSpliceGraph::has_node(id_t node_id) const { + return node_id > 0 && node_id <= handle_idxs.size(); +} + +handle_t JoinedSpliceGraph::get_handle(const id_t& node_id, bool is_reverse) const { + return handlegraph::number_bool_packing::pack(node_id - 1, is_reverse); +} + +id_t JoinedSpliceGraph::get_id(const handle_t& handle) const { + return handlegraph::number_bool_packing::unpack_number(handle) + 1; +} + +bool JoinedSpliceGraph::get_is_reverse(const handle_t& handle) const { + return handlegraph::number_bool_packing::unpack_bit(handle); +} + +handle_t JoinedSpliceGraph::flip(const handle_t& handle) const { + return handlegraph::number_bool_packing::toggle_bit(handle); +} + +size_t JoinedSpliceGraph::get_length(const handle_t& handle) const { + auto interval = underlying_interval(handle); + return interval.second - interval.first; +} + +string JoinedSpliceGraph::get_sequence(const handle_t& handle) const { + auto interval = underlying_interval(handle); + size_t i = handlegraph::number_bool_packing::unpack_number(handle); + const IncrementalSubgraph& subgraph = i < num_left_handles ? 
*left_subgraph : *right_subgraph; + handle_t under = subgraph.handle_at_order(handle_idxs[i]); + if (get_is_reverse(handle)) { + under = subgraph.flip(under); + } + return subgraph.get_subsequence(under, interval.first, interval.second - interval.first); +} + +bool JoinedSpliceGraph::follow_edges_impl(const handle_t& handle, bool go_left, + const function& iteratee) const { + + bool using_left_edges = go_left != get_is_reverse(handle); + size_t i = handlegraph::number_bool_packing::unpack_number(handle); + + auto traverse_within_subgraph = [&](const IncrementalSubgraph& subgraph, + const vector& handle_trans) { + // traverse within the subgraph + handle_t under = subgraph.handle_at_order(handle_idxs[i]); + if (get_is_reverse(handle)) { + under = subgraph.flip(under); + } + return subgraph.follow_edges(under, go_left, [&](const handle_t& next) { + // filter to only the handles that are included in the joined graph + int64_t translated = handle_trans[subgraph.order_of(next)]; + if (translated != -1) { + return iteratee(handlegraph::number_bool_packing::pack(translated, + get_is_reverse(next))); + } + else { + return true; + } + }); + }; + + if (i + 1 < num_left_handles || (i + 1 == num_left_handles && using_left_edges)) { + // internal to the left subgraph + return traverse_within_subgraph(*left_subgraph, left_handle_trans); + } + else if (i > num_left_handles || (i == num_left_handles && !using_left_edges)) { + // internal to the right subgraph + return traverse_within_subgraph(*right_subgraph, right_handle_trans); + } + else if (i + 1 == num_left_handles) { + // rightward across the splice join + return iteratee(handlegraph::number_bool_packing::pack(num_left_handles, + get_is_reverse(handle))); + } + else { + // leftward across the splice join + return iteratee(handlegraph::number_bool_packing::pack(num_left_handles - 1, + get_is_reverse(handle))); + } +} + +bool JoinedSpliceGraph::for_each_handle_impl(const function& iteratee, + bool parallel) const { + bool keep_going = true; + for (size_t i = 0; i < handle_idxs.size() && keep_going; ++i) { + keep_going = iteratee(handlegraph::number_bool_packing::pack(i, false)); + } + // not doing parallel, never expect to use it + return keep_going; +} + +size_t JoinedSpliceGraph::get_node_count() const { + return handle_idxs.size(); +} + +id_t JoinedSpliceGraph::min_node_id() const { + return 1; +} + +id_t JoinedSpliceGraph::max_node_id() const { + return handle_idxs.size(); +} + +char JoinedSpliceGraph::get_base(const handle_t& handle, size_t index) const { + size_t i = handlegraph::number_bool_packing::unpack_number(handle); + const IncrementalSubgraph& subgraph = i < num_left_handles ? *left_subgraph : *right_subgraph; + handle_t under = subgraph.handle_at_order(handle_idxs[i]); + if (get_is_reverse(handle)) { + under = subgraph.flip(under); + } + auto interval = underlying_interval(handle); + return subgraph.get_base(under, interval.first + index); +} + +string JoinedSpliceGraph::get_subsequence(const handle_t& handle, size_t index, size_t size) const { + size_t i = handlegraph::number_bool_packing::unpack_number(handle); + const IncrementalSubgraph& subgraph = i < num_left_handles ? 
*left_subgraph : *right_subgraph; + handle_t under = subgraph.handle_at_order(handle_idxs[i]); + if (get_is_reverse(handle)) { + under = subgraph.flip(under); + } + auto interval = underlying_interval(handle); + index = min(interval.first + index, interval.second); + size = min(size, interval.second - index); + return subgraph.get_subsequence(under, index, size); +} + +pair JoinedSpliceGraph::underlying_interval(const handle_t& handle) const { + size_t i = handlegraph::number_bool_packing::unpack_number(handle); + const IncrementalSubgraph& subgraph = i < num_left_handles ? *left_subgraph : *right_subgraph; + handle_t under = subgraph.handle_at_order(handle_idxs[i]); + size_t begin, end; + if (i == 0) { + begin = -subgraph.min_distance_from_start(under); + } + else if (i == num_left_handles) { + begin = right_splice_offset; + } + else { + begin = 0; + } + if (i + 1 == handle_idxs.size()) { + end = subgraph.get_length(under) + subgraph.min_distance_from_start(under); + } + else if (i + 1 == num_left_handles) { + end = left_splice_offset; + } + else { + end = subgraph.get_length(under); + } + pair return_val(begin, end); + if (get_is_reverse(handle)) { + return_val.first = subgraph.get_length(under) - end; + return_val.second = subgraph.get_length(under) - begin; + } + return return_val; +} + +multipath_alignment_t from_hit(const Alignment& alignment, const HandleGraph& graph, + const pos_t& hit_pos, const MaximalExactMatch& mem, + const GSSWAligner& scorer) { + // TODO: mostly copied from multipath alignment graph + + multipath_alignment_t multipath_aln; + transfer_read_metadata(alignment, multipath_aln); + + // stack for DFS, each record contains tuples of + // (read begin, node offset, next node index, next node handles, + vector>> stack; + stack.emplace_back(mem.begin, offset(hit_pos), 0, + vector{graph.get_handle(id(hit_pos), is_rev(hit_pos))}); + while (!stack.empty()) { + auto& back = stack.back(); + if (get<2>(back) == get<3>(back).size()) { + stack.pop_back(); + continue; + } + + handle_t trav = get<3>(back)[get<2>(back)]; + get<2>(back)++; + +#ifdef debug_from_hit + cerr << "checking node " << graph.get_id(trav) << endl; +#endif + + string node_seq = graph.get_sequence(trav); + size_t node_idx = get<1>(back); + string::const_iterator read_iter = get<0>(back); + + // look for a match along the entire node sequence + for (; node_idx < node_seq.size() && read_iter != mem.end; node_idx++, read_iter++) { + if (node_seq[node_idx] != *read_iter) { +#ifdef debug_from_hit + cerr << "node sequence does not match read" << endl; +#endif + break; + } + } + + if (read_iter == mem.end) { + break; + } + + if (node_idx == node_seq.size()) { + stack.emplace_back(read_iter, 0, 0, vector()); + graph.follow_edges(trav, false, [&](const handle_t& next) { + get<3>(stack.back()).emplace_back(next); + }); + } + } + + subpath_t* subpath = multipath_aln.add_subpath(); + path_t* path = subpath->mutable_path(); + for (size_t i = 0; i < stack.size(); ++i) { + path_mapping_t* mapping = path->add_mapping(); + if (i == 0 && mem.begin != alignment.sequence().begin()) { + edit_t* edit = mapping->add_edit(); + edit->set_to_length(mem.begin - alignment.sequence().begin()); + edit->set_from_length(0); + edit->set_sequence(string(alignment.sequence().begin(), mem.begin)); + } + + handle_t handle = get<3>(stack[i])[get<2>(stack[i]) - 1]; + + position_t* position = mapping->mutable_position(); + position->set_node_id(graph.get_id(handle)); + position->set_is_reverse(graph.get_is_reverse(handle)); + 
position->set_offset(get<1>(stack[i])); + + edit_t* edit = mapping->add_edit(); + int64_t len = min(graph.get_length(handle) - get<1>(stack[i]), + mem.end - get<0>(stack[i])); + edit->set_to_length(len); + edit->set_from_length(len); + + if (i + 1 == stack.size() && mem.end != alignment.sequence().end()) { + edit_t* edit = mapping->add_edit(); + edit->set_to_length(alignment.sequence().end() - mem.end); + edit->set_from_length(0); + edit->set_sequence(string(mem.end, alignment.sequence().end())); + } + } + subpath->set_score(scorer.score_partial_alignment(alignment, graph, *path, + alignment.sequence().begin())); + + identify_start_subpaths(multipath_aln); + return multipath_aln; +} + +tuple trimmed_end(const Alignment& aln, int64_t len, bool from_end, + const HandleGraph& graph, const GSSWAligner& aligner) { + +#ifdef debug_trimming + cerr << "trimming alignment " << pb2json(aln) << " by " << len << ", from end? " << from_end << endl; +#endif + + const Path& path = aln.path(); + path_t dummy_path; + + tuple return_val; + + bool copied_full_path = false; + if (path.mapping_size()) { + if (from_end) { + const Mapping& final_mapping = path.mapping(path.mapping_size() - 1); + if (final_mapping.edit(final_mapping.edit_size() - 1).from_length() == 0) { + // we have to walk further to skip the softclips + len += final_mapping.edit(final_mapping.edit_size() - 1).to_length(); +#ifdef debug_trimming + cerr << "bump walk length up to " << len << " for right side soft-clip" << endl; +#endif + } + if (path.mapping(0).edit(0).from_length() == 0) { + // we don't want to walk onto the softclip on the other end + len = min(len, aln.sequence().size() - path.mapping(0).edit(0).to_length()); +#ifdef debug_trimming + cerr << "cap walk length to " << len << " for left side soft-clip" << endl; +#endif + } + int64_t i = path.mapping_size() - 1; + while (i >= 0 && (len > mapping_to_length(path.mapping(i)) + || mapping_from_length(path.mapping(i)) == 0)) { + auto to_length = mapping_to_length(path.mapping(i)); + len = max(len - to_length, 0); + get<1>(return_val) += to_length; + +#ifdef debug_trimming + cerr << "after mapping " << i << ", remaining length " << len << endl; +#endif + + from_proto_mapping(path.mapping(i), *dummy_path.add_mapping()); + --i; + } + if (i < 0) { +#ifdef debug_trimming + cerr << "walked entire path" << endl; +#endif + get<0>(return_val) = initial_position(path); + get<1>(return_val) = path_to_length(path); + dummy_path.clear_mapping(); + from_proto_path(path, dummy_path); + copied_full_path = true; + } + else { + const Mapping& mapping = path.mapping(i); + int64_t j = mapping.edit_size() - 1; + int64_t from_length = 0; + path_mapping_t* dummy_mapping = nullptr; + while (j >= 0 && (len > mapping.edit(j).to_length() + || mapping.edit(j).from_length() == 0)) { + auto to_length = mapping.edit(j).to_length(); + len = max(len - to_length, 0); + get<1>(return_val) += to_length; + from_length += mapping.edit(j).from_length(); + +#ifdef debug_trimming + cerr << "after edit " << j << ", remaining length " << len << endl; +#endif + if (!dummy_mapping) { + dummy_mapping = dummy_path.add_mapping(); + } + from_proto_edit(mapping.edit(j), *dummy_mapping->add_edit()); + --j; + } + if (j >= 0 && len > 0) { + auto last_from_length = (len * mapping.edit(j).from_length()) / mapping.edit(j).to_length(); + get<1>(return_val) += len; + from_length += last_from_length; + +#ifdef debug_trimming + cerr << "handling final (partial) edit with to length " << len << ", from length " << last_from_length << endl; 
+#endif + + if (!dummy_mapping) { + dummy_mapping = dummy_path.add_mapping(); + } + auto* dummy_edit = dummy_mapping->add_edit(); + dummy_edit->set_from_length(last_from_length); + dummy_edit->set_to_length(len); + if (!mapping.edit(j).sequence().empty()) { + dummy_edit->set_sequence(mapping.edit(j).sequence().substr(mapping.edit(j).to_length() - len, len)); + } + } + const Position& position = mapping.position(); + get_id(get<0>(return_val)) = position.node_id(); + get_is_rev(get<0>(return_val)) = position.is_reverse(); + get_offset(get<0>(return_val)) = position.offset() + mapping_from_length(mapping) - from_length; + if (dummy_mapping) { + auto dummy_position = dummy_mapping->mutable_position(); + dummy_position->set_node_id(id(get<0>(return_val))); + dummy_position->set_is_reverse(is_rev(get<0>(return_val))); + dummy_position->set_offset(offset(get<0>(return_val))); + } + } + } + else { + if (path.mapping(0).edit(0).from_length() == 0) { + // we have to walk further to skip the softclips + len += path.mapping(0).edit(0).to_length(); +#ifdef debug_trimming + cerr << "bump walk length up to " << len << " for left side soft-clip" << endl; +#endif + } + const Mapping& final_mapping = path.mapping(path.mapping_size() - 1); + if (final_mapping.edit(final_mapping.edit_size() - 1).from_length() == 0) { + // we don't want to walk onto the softclip on the other end + len = min(len, aln.sequence().size() - final_mapping.edit(final_mapping.edit_size() - 1).to_length()); +#ifdef debug_trimming + cerr << "cap walk length to " << len << " for right side soft-clip" << endl; +#endif + } + int64_t i = 0; + while (i < path.mapping_size() && (len > mapping_to_length(path.mapping(i)) + || mapping_from_length(path.mapping(i)) == 0 )) { + + auto to_length = mapping_to_length(path.mapping(i)); + len = max(len - to_length, 0); + get<1>(return_val) += to_length; + +#ifdef debug_trimming + cerr << "after mapping " << i << ", remaining length " << len << endl; +#endif + + from_proto_mapping(path.mapping(i), *dummy_path.add_mapping()); + ++i; + } + if (i == path.mapping_size()) { +#ifdef debug_trimming + cerr << "walked entire path" << endl; +#endif + get<0>(return_val) = final_position(path); + get<1>(return_val) = path_to_length(path); + dummy_path.clear_mapping(); + from_proto_path(path, dummy_path); + copied_full_path = true; + } + else { + const Mapping& mapping = path.mapping(i); + int64_t j = 0; + int64_t from_length = 0; + path_mapping_t* dummy_mapping = nullptr; + while (j < mapping.edit_size() && (len > mapping.edit(j).to_length() + || mapping.edit(j).from_length() == 0)) { + auto to_length = mapping.edit(j).to_length(); + len = max(len - to_length, 0); + get<1>(return_val) += to_length; + from_length += mapping.edit(j).from_length(); + +#ifdef debug_trimming + cerr << "after edit " << j << ", remaining length " << len << endl; +#endif + + if (!dummy_mapping) { + dummy_mapping = dummy_path.add_mapping(); + from_proto_position(mapping.position(), *dummy_mapping->mutable_position()); + } + from_proto_edit(mapping.edit(j), *dummy_mapping->add_edit()); + ++j; + } + if (j != mapping.edit_size() && len > 0) { + auto last_from_length = (len * mapping.edit(j).from_length()) / mapping.edit(j).to_length(); + get<1>(return_val) += len; + from_length += last_from_length; + +#ifdef debug_trimming + cerr << "handling final (partial) edit with to length " << len << ", from length " << last_from_length << endl; +#endif + + if (!dummy_mapping) { + dummy_mapping = dummy_path.add_mapping(); + 
from_proto_position(mapping.position(), *dummy_mapping->mutable_position()); + } + auto* dummy_edit = dummy_mapping->add_edit(); + dummy_edit->set_from_length(last_from_length); + dummy_edit->set_to_length(len); + if (!mapping.edit(j).sequence().empty()) { + dummy_edit->set_sequence(mapping.edit(j).sequence().substr(0, len)); + } + } + const Position& position = mapping.position(); + get_id(get<0>(return_val)) = position.node_id(); + get_is_rev(get<0>(return_val)) = position.is_reverse(); + get_offset(get<0>(return_val)) = position.offset() + from_length; + } + } + } + + string::const_iterator begin; + if (from_end) { + begin = aln.sequence().end() - get<1>(return_val); + // TODO: kind of inelegant + if (!copied_full_path) { + // the path was built in reverse, flip around + for (size_t i = 0, end = dummy_path.mapping_size() / 2; i < end; ++i) { + swap(*dummy_path.mutable_mapping(i), + *dummy_path.mutable_mapping(dummy_path.mapping_size() - i - 1)); + } + // the final mapping was also built in reverse + auto* mapping = dummy_path.mutable_mapping(0); + for (size_t i = 0, end = mapping->edit_size() / 2; i < end; ++i) { + swap(*mapping->mutable_edit(i), + *mapping->mutable_edit(mapping->edit_size() - i - 1)); + } + } + } + else { + begin = aln.sequence().begin(); + } + + + get<2>(return_val) = aligner.score_partial_alignment(aln, graph, dummy_path, begin); + +#ifdef debug_trimming + cerr << "scored trimmed subpath " << debug_string(dummy_path) << " with substring " << (begin - aln.sequence().begin()) << ":" << (begin - aln.sequence().begin()) + get<1>(return_val) << ": " << get<2>(return_val) << endl; +#endif + + return return_val; +} + +// TODO: this implementation ended up requiring a lot of duplicated code, i could probably clean it up +bool trim_path(path_t* path, bool from_left, int64_t mapping_idx, int64_t edit_idx, int64_t base_idx) { + + bool do_trim = ((from_left && (mapping_idx != 0 || edit_idx != 0 || base_idx != 0)) || + (!from_left && mapping_idx != path->mapping_size())); + + if (edit_idx == 0 && base_idx == 0) { + // position is past-the-last on a mapping + if (from_left) { + auto mappings = path->mutable_mapping(); + mappings->erase(mappings->begin(), mappings->begin() + mapping_idx); + } + else { + path->mutable_mapping()->resize(mapping_idx); + } + } + else { + // position is inside a mapping + auto mapping = path->mutable_mapping(mapping_idx); + if (base_idx == 0) { + // position is past-the-last on an edit + if (from_left) { + int64_t from_length_removed = 0; + for (int64_t i = 0; i < edit_idx; ++i) { + from_length_removed += mapping->edit(i).from_length(); + } + auto edits = mapping->mutable_edit(); + edits->erase(edits->begin(), edits->begin() + edit_idx); + mapping->mutable_position()->set_offset(mapping->position().offset() + + from_length_removed); + auto mappings = path->mutable_mapping(); + mappings->erase(mappings->begin(), mappings->begin() + mapping_idx); + } + else { + mapping->mutable_edit()->resize(edit_idx); + path->mutable_mapping()->resize(mapping_idx + 1); + } + } + else { + // position is inside an edit + auto edit = mapping->mutable_edit(edit_idx); + if (from_left) { + int64_t from_length_removed = 0; + if (base_idx > 0) { + if (edit->from_length() > 0) { + from_length_removed += base_idx; + } + edit->set_from_length(max(edit->from_length() - base_idx, 0)); + edit->set_to_length(max(edit->to_length() - base_idx, 0)); + if (!edit->sequence().empty()) { + edit->set_sequence(edit->sequence().substr(base_idx, edit->to_length())); + } + if (edit->from_length() 
== 0 && edit->to_length() == 0) { + ++edit_idx; + } + } + for (int64_t i = 0; i < edit_idx; ++i) { + from_length_removed += mapping->edit(i).from_length(); + } + auto edits = mapping->mutable_edit(); + edits->erase(edits->begin(), edits->begin() + edit_idx); + mapping->mutable_position()->set_offset(mapping->position().offset() + + from_length_removed); + auto mappings = path->mutable_mapping(); + mappings->erase(mappings->begin(), mappings->begin() + mapping_idx); + } + else { + if (base_idx < max(edit->from_length(), edit->to_length())) { + edit->set_from_length(min(edit->from_length(), base_idx)); + edit->set_to_length(min(edit->to_length(), base_idx)); + if (!edit->sequence().empty()) { + edit->set_sequence(edit->sequence().substr(0, base_idx)); + } + if (edit->from_length() == 0 && edit->to_length() == 0) { + --edit_idx; + } + } + mapping->mutable_edit()->resize(edit_idx + 1); + path->mutable_mapping()->resize(mapping_idx + 1); + } + } + } + return do_trim; +} + +pair, pair> split_splice_segment(const Alignment& splice_segment, + const tuple& left_trace, + const tuple& right_trace, + int64_t splice_junction_idx, + const GSSWAligner& scorer, + const HandleGraph& graph) { + +#ifdef debug_linker_split + cerr << "splitting splice segment " << pb2json(splice_segment) << endl; + cerr << "split is at index " << splice_junction_idx << endl; + cerr << "left trace: " << get<0>(left_trace) << " " << get<1>(left_trace) << " " << get<2>(left_trace) << endl; + cerr << "right trace: " << get<0>(right_trace) << " " << get<1>(right_trace) << " " << get<2>(right_trace) << endl; +#endif + + // TODO: make the scoring robust to hanging indels at the trace positions + + pair, pair> return_val; + auto& left_path = return_val.first.first; + + // walk the part of the splice segment before the trace on the left side + size_t left_to_length = 0; + size_t left_leading_to_length = 0; + for (int64_t i = 0; i < get<0>(left_trace); ++i) { + left_leading_to_length += mapping_to_length(splice_segment.path().mapping(i)); + } + // special logic to handle the mapping with the traced location + if (get<0>(left_trace) < splice_junction_idx) { + path_mapping_t* post_trace_mapping = nullptr; + size_t trace_leading_from_length = 0; + const auto& trace_mapping = splice_segment.path().mapping(get<0>(left_trace)); + // walk edits that come before the traced location + for (int64_t j = 0; j < get<1>(left_trace); ++j) { + left_leading_to_length += trace_mapping.edit(j).to_length(); + trace_leading_from_length += trace_mapping.edit(j).from_length(); + } + if (get<1>(left_trace) < trace_mapping.edit_size()) { + // the trace ends in an edit + const auto& trace_edit = trace_mapping.edit(get<1>(left_trace)); + if (trace_edit.to_length() != 0) { + left_leading_to_length += get<2>(left_trace); + } + if (trace_edit.from_length() != 0) { + trace_leading_from_length += get<2>(left_trace); + } + if (get<2>(left_trace) < max(trace_edit.from_length(), trace_edit.to_length())) { + + post_trace_mapping = left_path.add_mapping(); + auto edit = post_trace_mapping->add_edit(); + edit->set_from_length(max(0, trace_edit.from_length() - get<2>(left_trace))); + edit->set_to_length(max(0, trace_edit.to_length() - get<2>(left_trace))); + if (!trace_edit.sequence().empty()) { + edit->set_sequence(trace_edit.sequence().substr(get<2>(left_trace), string::npos)); + } + left_to_length += edit->to_length(); + } + } + for (int64_t j = get<1>(left_trace) + 1; j < trace_mapping.edit_size(); ++j) { + if (!post_trace_mapping) { + post_trace_mapping = 
left_path.add_mapping(); + } + from_proto_edit(trace_mapping.edit(j), *post_trace_mapping->add_edit()); + left_to_length += trace_mapping.edit(j).to_length(); + } + if (post_trace_mapping) { + const auto& trace_pos = trace_mapping.position(); + auto pos = post_trace_mapping->mutable_position(); + pos->set_node_id(trace_pos.node_id()); + pos->set_is_reverse(trace_pos.is_reverse()); + pos->set_offset(trace_pos.offset() + trace_leading_from_length); + } + } + + // pull out the left half of the alignment + for (int64_t i = get<0>(left_trace) + 1; i < splice_junction_idx; ++i) { + const auto& mapping = splice_segment.path().mapping(i); + if (mapping_from_length(mapping) == 0 && mapping_to_length(mapping) == 0) { + continue; + } + auto left_mapping = left_path.add_mapping(); + from_proto_position(mapping.position(), *left_mapping->mutable_position()); + for (size_t j = 0; j < mapping.edit_size(); ++j) { + const auto& edit = mapping.edit(j); + if (edit.from_length() != 0 || edit.to_length() != 0) { + from_proto_edit(edit, *left_mapping->add_edit()); + left_to_length += edit.to_length(); + } + } + } + + // and then pull out the right half of the alignment up to the trace point + auto& right_path = return_val.second.first; + for (size_t i = splice_junction_idx; i < get<0>(right_trace); ++i) { + const auto& mapping = splice_segment.path().mapping(i); + if (mapping_from_length(mapping) == 0 && mapping_to_length(mapping) == 0) { + continue; + } + auto right_mapping = right_path.add_mapping(); + from_proto_position(mapping.position(), *right_mapping->mutable_position()); + for (size_t j = 0; j < mapping.edit_size(); ++j) { + const auto& edit = mapping.edit(j); + if (edit.from_length() != 0 || edit.to_length() != 0) { + from_proto_edit(edit, *right_mapping->add_edit()); + } + } + } + + // and special logic for the right trace mapping + if (get<0>(right_trace) >= splice_junction_idx && + get<0>(right_trace) < splice_segment.path().mapping_size() && + (get<1>(right_trace) != 0 || get<2>(right_trace) != 0)) { + auto pre_trace_mapping = right_path.add_mapping(); + const auto& trace_mapping = splice_segment.path().mapping(get<0>(right_trace)); + from_proto_position(trace_mapping.position(), *pre_trace_mapping->mutable_position()); + for (int64_t j = 0; j < get<1>(right_trace); ++j) { + from_proto_edit(trace_mapping.edit(j), *pre_trace_mapping->add_edit()); + } + if (get<1>(right_trace) < trace_mapping.edit_size() && get<2>(right_trace) != 0) { + const auto& trace_edit = trace_mapping.edit(get<1>(right_trace)); + auto edit = pre_trace_mapping->add_edit(); + if (trace_edit.from_length() != 0) { + edit->set_from_length(get<2>(right_trace)); + } + if (trace_edit.to_length() != 0) { + edit->set_to_length(get<2>(right_trace)); + } + if (!trace_edit.sequence().empty()) { + edit->set_sequence(trace_edit.sequence().substr(0, get<2>(right_trace))); + } + } + } + +#ifdef debug_linker_split + cerr << "scoring split segments with read interval starts " << left_leading_to_length << " and " << (left_to_length + left_leading_to_length) << endl; +#endif + + // score the two halves (but don't take the full length bonus, since this isn't actually + // the end of the full read) + return_val.first.second = scorer.score_partial_alignment(splice_segment, graph, left_path, + splice_segment.sequence().begin() + left_leading_to_length, true); + return_val.second.second = scorer.score_partial_alignment(splice_segment, graph, right_path, + splice_segment.sequence().begin() + left_to_length + left_leading_to_length, + true); + +#ifdef 
debug_linker_split + cerr << "left partial score " << return_val.first.second << ", partial path " << debug_string(return_val.first.first) << endl; + cerr << "right partial score " << return_val.second.second << ", partial path " << debug_string(return_val.second.first) << endl; +#endif + + // deletions can span the splice junction, in which case they will have been scored incorrectly + // by taking the gap open penalty twice + if (return_val.first.first.mapping_size() != 0 && return_val.second.first.mapping_size() != 0) { + + int64_t left_del_size = 0, right_del_size = 0; + + // measure left deletion at the end + bool in_deletion = true; + for (int64_t i = left_path.mapping_size() - 1; i >= 0 && in_deletion; --i) { + const auto& mapping = left_path.mapping(i); + for (int64_t j = mapping.edit_size() - 1; j >= 0 && in_deletion; --j) { + const auto& edit = mapping.edit(j); + if (edit.to_length() == 0) { + left_del_size += edit.from_length(); + } + else { + in_deletion = false; + } + } + } + + // measure right deletion at the beginning + in_deletion = true; + for (int64_t i = 0; i < right_path.mapping_size() && in_deletion; ++i) { + const auto& mapping = right_path.mapping(i); + for (int64_t j = 0; j < mapping.edit_size() && in_deletion; ++j) { + const auto& edit = mapping.edit(j); + if (edit.to_length() == 0) { + right_del_size += edit.from_length(); + } + else { + in_deletion = false; + } + } + } + +#ifdef debug_linker_split + cerr << "spanning split, left deletion size: " << left_del_size << ", right deletion size " << right_del_size << endl; +#endif + + if (left_del_size != 0 && right_del_size != 0) { + // split the total gap score between the two (can break dynamic programmability + // a little bit, but i think it's worth it to have a good alignment across the + // splice junction) + int32_t total_gap_score = scorer.score_gap(left_del_size + right_del_size); + int32_t left_gap_score = (left_del_size * total_gap_score) / (left_del_size + right_del_size); + return_val.first.second += (left_gap_score - scorer.score_gap(left_del_size)); + return_val.second.second += (total_gap_score - left_gap_score - scorer.score_gap(right_del_size)); + +#ifdef debug_linker_split + cerr << "re-divided deletion score, now left score " << return_val.first.second << ", right score " << return_val.second.second << endl; +#endif + } + } + + return return_val; +} + +multipath_alignment_t&& fuse_spliced_alignments(const Alignment& alignment, + multipath_alignment_t&& left_mp_aln, multipath_alignment_t&& right_mp_aln, + int64_t left_bridge_point, const Alignment& splice_segment, + int64_t splice_junction_idx, int32_t splice_score, const GSSWAligner& scorer, + const HandleGraph& graph) { + +#ifdef debug_fusing + cerr << "fusing spliced alignments" << endl; + cerr << "left alignment" << endl; + cerr << debug_string(left_mp_aln) << endl; + cerr << "right alignment" << endl; + cerr << debug_string(right_mp_aln) << endl; + cerr << "linker" << endl; + cerr << pb2json(splice_segment) << endl; +#endif + + // TODO: allow multiple splices to happen on the same multipath alignment? 
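+    // Overall plan: locate the splice segment's endpoints on the left and right multipath alignments,
+    // trace the segment against each side, drop subpaths that cannot reach the junction, trim the
+    // remaining subpaths at the traced positions, and then attach the two halves of the split splice
+    // segment in between.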
+ + pos_t pos_left = initial_position(splice_segment.path()); + pos_t pos_right = final_position(splice_segment.path()); + + int64_t right_bridge_point = left_bridge_point + path_to_length(splice_segment.path()); + + auto left_locations = search_multipath_alignment(left_mp_aln, pos_left, left_bridge_point); + auto right_locations = search_multipath_alignment(right_mp_aln, pos_right, right_bridge_point); + +#ifdef debug_fusing + cerr << "left splice locations:" << endl; + for (auto loc : left_locations) { + cerr << "\t" << get<0>(loc) << " " << get<1>(loc) << " " << get<2>(loc) << " " << get<3>(loc) << endl; + } + cerr << "right splice locations:" << endl; + for (auto loc : right_locations) { + cerr << "\t" << get<0>(loc) << " " << get<1>(loc) << " " << get<2>(loc) << " " << get<3>(loc) << endl; + } +#endif + + if (left_locations.empty() || right_locations.empty()) { + cerr << "error: splice segment could not be located on multipath alignment of read " << alignment.name() << endl; + exit(1); + } + + // trace the splice segment from first location on the list + tuple left_path_trace, right_path_trace; + vector> left_mp_aln_trace, right_mp_aln_trace; + tie(left_path_trace, left_mp_aln_trace) = trace_path(left_mp_aln, splice_segment.path(), get<0>(left_locations.front()), + get<1>(left_locations.front()), get<2>(left_locations.front()), + get<3>(left_locations.front()), false, splice_junction_idx); + tie(right_path_trace, right_mp_aln_trace) = trace_path(right_mp_aln, splice_segment.path(), get<0>(right_locations.front()), + get<1>(right_locations.front()), get<2>(right_locations.front()), + get<3>(right_locations.front()), true, splice_junction_idx); + + // trace the splice segment on subsequent locations on the left + for (size_t i = 1; i < left_locations.size(); ++i) { + int64_t j, k, l, m; + tie(j, k, l, m) = left_locations[i]; + auto trace = trace_path(left_mp_aln, splice_segment.path(), j, k, l, m, false, splice_junction_idx); + if (trace.first > left_path_trace) { + left_path_trace = trace.first; + left_mp_aln_trace = move(trace.second); + } + else if (trace.first == left_path_trace) { + for (auto& mp_aln_trace : trace.second) { + left_mp_aln_trace.push_back(mp_aln_trace); + } + } + } + + // trace the splice segment on subsequent locations on the right + for (size_t i = 1; i < right_locations.size(); ++i) { + int64_t j, k, l, m; + tie(j, k, l, m) = right_locations[i]; + auto trace = trace_path(right_mp_aln, splice_segment.path(), j, k, l, m, true, splice_junction_idx); + if (trace.first < right_path_trace) { + right_path_trace = trace.first; + right_mp_aln_trace = move(trace.second); + } + else if (trace.first == right_path_trace) { + for (auto& mp_aln_trace : trace.second) { + right_mp_aln_trace.push_back(mp_aln_trace); + } + } + } + + // now also walk back the multipath locations to match + vector> left_rev_nexts; + for (size_t i = 0; i < left_mp_aln_trace.size(); ++i) { + int64_t s, m, e, b; + tie(s, m, e, b) = left_mp_aln_trace[i]; + } + + + // ensure that the connection locations are unique and in increasing order by subpath index + sort(left_mp_aln_trace.begin(), left_mp_aln_trace.end()); + sort(right_mp_aln_trace.begin(), right_mp_aln_trace.end()); + auto new_left_end = unique(left_mp_aln_trace.begin(), left_mp_aln_trace.end()); + left_mp_aln_trace.erase(new_left_end, left_mp_aln_trace.end()); + auto new_right_end = unique(right_mp_aln_trace.begin(), right_mp_aln_trace.end()); + right_mp_aln_trace.erase(new_right_end, right_mp_aln_trace.end()); + +#ifdef debug_fusing + cerr << 
"left traced path location:" << endl; + cerr << "\t" << get<0>(left_path_trace) << " " << get<1>(left_path_trace) << " " << get<2>(left_path_trace) << endl; + cerr << "left traced multipath locations:" << endl; + for (auto loc : left_mp_aln_trace) { + cerr << "\t" << get<0>(loc) << " " << get<1>(loc) << " " << get<2>(loc) << " " << get<3>(loc) << endl; + } + cerr << "right traced path location:" << endl; + cerr << "\t" << get<0>(right_path_trace) << " " << get<1>(right_path_trace) << " " << get<2>(right_path_trace) << endl; + cerr << "right traced multipath locations:" << endl; + for (auto loc : right_mp_aln_trace) { + cerr << "\t" << get<0>(loc) << " " << get<1>(loc) << " " << get<2>(loc) << " " << get<3>(loc) << endl; + } +#endif + + vector to_keep_left(left_mp_aln.subpath_size(), false); + vector is_bridge_left(left_mp_aln.subpath_size(), false); + for (const auto& pos : left_mp_aln_trace) { + is_bridge_left[get<0>(pos)] = true; + } + for (int64_t i = left_mp_aln.subpath_size() - 1; i >= 0; --i) { + if (is_bridge_left[i]) { + to_keep_left[i] = true; + } + else { + for (auto j : left_mp_aln.subpath(i).next()) { + to_keep_left[i] = to_keep_left[j] || to_keep_left[i]; + } + for (const auto& connection : left_mp_aln.subpath(i).connection()) { + to_keep_left[i] = to_keep_left[connection.next()] || to_keep_left[i]; + } + } + } + +#ifdef debug_fusing + cerr << "deciding what to remove on left:" << endl; + for (size_t i = 0; i < to_keep_left.size(); ++i) { + cerr << "\t" << i << ": keep? " << to_keep_left[i] << ", bridge? " << is_bridge_left[i] << endl; + } +#endif + + vector left_removed_so_far(left_mp_aln.subpath_size() + 1, 0); + int64_t left_loc_idx = 0; + vector left_to_length(left_mp_aln.subpath_size(), 0); + for (int64_t i = 0; i < left_mp_aln.subpath_size(); ++i) { + // keep track of the read interval + int64_t to_len = path_to_length(left_mp_aln.subpath(i).path()); + for (auto n : left_mp_aln.subpath(i).next()) { + left_to_length[n] = left_to_length[i] + to_len; + } + for (const auto& connection : left_mp_aln.subpath(i).connection()) { + left_to_length[connection.next()] = left_to_length[i] + to_len; + } + + if (!to_keep_left[i]) { + left_removed_so_far[i + 1] = left_removed_so_far[i] + 1; + continue; + } + // TODO: what if there are multiple locations with hits on the same subpath... + if (left_loc_idx < left_mp_aln_trace.size() && i == get<0>(left_mp_aln_trace[left_loc_idx])) { + int64_t s, m, e, b; + tie(s, m, e, b) = left_mp_aln_trace[left_loc_idx]; + ++left_loc_idx; + +#ifdef debug_fusing + cerr << "trimming subpath " << s << " at location " << m << " " << e << " " << b << endl; +#endif + + auto path = left_mp_aln.mutable_subpath(i)->mutable_path(); + bool trimmed = trim_path(path, false, m, e, b); + if (trimmed) { + int32_t new_score = scorer.score_partial_alignment(alignment, graph, *path, + alignment.sequence().begin() + left_to_length[i]); + left_mp_aln.mutable_subpath(i)->set_score(new_score); + } + left_mp_aln.mutable_subpath(i)->mutable_next()->clear(); + // TODO: i don't like doing this... 
need to revise for multiple cuts + left_mp_aln.mutable_subpath(i)->mutable_connection()->clear(); + } + + if (left_removed_so_far[i]) { + *left_mp_aln.mutable_subpath(i - left_removed_so_far[i]) = move(*left_mp_aln.mutable_subpath(i)); + } + left_removed_so_far[i + 1] = left_removed_so_far[i]; + } + if (left_removed_so_far.back() != 0) { + left_mp_aln.mutable_subpath()->resize(left_mp_aln.subpath_size() - left_removed_so_far.back()); + for (size_t i = 0; i < left_mp_aln.subpath_size(); ++i) { + auto subpath = left_mp_aln.mutable_subpath(i); + size_t nexts_removed_so_far = 0; + for (size_t j = 0; j < subpath->next_size(); ++j) { + if (to_keep_left[subpath->next(j)]) { + subpath->set_next(j - nexts_removed_so_far, + subpath->next(j) - left_removed_so_far[subpath->next(j)]); + } + else { + ++nexts_removed_so_far; + } + } + if (nexts_removed_so_far != 0) { + subpath->mutable_next()->resize(subpath->next_size() - nexts_removed_so_far); + } + size_t connections_removed_so_far = 0; + for (size_t j = 0; j < subpath->connection_size(); ++j) { + auto connection = subpath->mutable_connection(j); + if (to_keep_left[connection->next()]) { + connection->set_next(connection->next() - left_removed_so_far[connection->next()]); + if (connections_removed_so_far) { + *subpath->mutable_connection(j - connections_removed_so_far) = *connection; + } + } + else { + ++connections_removed_so_far; + } + } + if (connections_removed_so_far != 0) { + subpath->mutable_connection()->resize(subpath->connection_size() - connections_removed_so_far); + } + } + } + + +#ifdef debug_fusing + cerr << "after processing left:" << endl; + cerr << debug_string(left_mp_aln) << endl; +#endif + + size_t left_subpaths_end = left_mp_aln.subpath_size(); + + auto splice_segment_halves = split_splice_segment(splice_segment, left_path_trace, right_path_trace, + splice_junction_idx, scorer, graph); + +#ifdef debug_fusing + cerr << "split the linker:" << endl; + cerr << "left score " << splice_segment_halves.first.second << ", path " << debug_string(splice_segment_halves.first.first) << endl; + cerr << "right score " << splice_segment_halves.second.second << ", path " << debug_string(splice_segment_halves.second.first) << endl; +#endif + + bool have_left_linker = !splice_segment_halves.first.first.mapping().empty(); + bool have_right_linker = !splice_segment_halves.second.first.mapping().empty(); + + if (have_left_linker) { + for (const auto& left_loc : left_mp_aln_trace) { + auto i = get<0>(left_loc) - left_removed_so_far[get<0>(left_loc)]; + // TODO: shouldn't this always be >= 0? the linker's attachment should be preserved... 
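The removal pass above keeps a subpath only if it is one of the traced bridge subpaths or can reach one through its next or connection edges, then compacts the surviving subpaths and rewrites edge targets using the running count of removed entries. A simplified sketch of the same bookkeeping on a bare adjacency list (hypothetical data, not vg's multipath_alignment_t):

```
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    // a small DAG of "subpaths"; edges point from earlier to later indices
    std::vector<std::vector<int64_t>> next = {{1, 2}, {3}, {3}, {}, {}};
    std::vector<bool> is_bridge = {false, false, true, false, false};

    // backward pass: keep anything that is a bridge or can reach one
    std::vector<bool> keep(next.size(), false);
    for (int64_t i = next.size() - 1; i >= 0; --i) {
        keep[i] = is_bridge[i];
        for (auto j : next[i]) {
            keep[i] = keep[i] || keep[j];
        }
    }

    // prefix counts of removed entries, used to rewrite edge targets
    std::vector<int64_t> removed_so_far(next.size() + 1, 0);
    for (size_t i = 0; i < next.size(); ++i) {
        removed_so_far[i + 1] = removed_so_far[i] + (keep[i] ? 0 : 1);
    }

    // compact and re-index, dropping edges into removed subpaths
    std::vector<std::vector<int64_t>> compacted;
    for (size_t i = 0; i < next.size(); ++i) {
        if (!keep[i]) continue;
        std::vector<int64_t> edges;
        for (auto j : next[i]) {
            if (keep[j]) edges.push_back(j - removed_so_far[j]);
        }
        compacted.push_back(edges);
    }
    std::cout << "kept " << compacted.size() << " of " << next.size() << " subpaths\n";
    return 0;
}
```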
+ if (i < left_mp_aln.subpath_size()) { + left_mp_aln.mutable_subpath(i)->add_next(left_mp_aln.subpath_size()); + } + } + + auto subpath = left_mp_aln.add_subpath(); + subpath->set_score(splice_segment_halves.first.second); + *subpath->mutable_path() = move(splice_segment_halves.first.first); + } + + if (have_right_linker) { + if (!have_left_linker) { + // we skipped the left side of the splice, so connect to the left splice points + for (const auto& left_loc : left_mp_aln_trace) { + auto i = get<0>(left_loc) - left_removed_so_far[get<0>(left_loc)]; + auto connection = left_mp_aln.mutable_subpath(i)->add_connection(); + connection->set_next(left_mp_aln.subpath_size()); + connection->set_score(splice_score); + } + } + else { + // the left side also exists as a subpath, connect to it + auto connection = left_mp_aln.mutable_subpath(left_mp_aln.subpath_size() - 1)->add_connection(); + connection->set_next(left_mp_aln.subpath_size()); + connection->set_score(splice_score); + } + + auto subpath = left_mp_aln.add_subpath(); + subpath->set_score(splice_segment_halves.second.second); + *subpath->mutable_path() = move(splice_segment_halves.second.first); + } + +#ifdef debug_fusing + cerr << "after processing linker:" << endl; + cerr << debug_string(left_mp_aln) << endl; +#endif + + size_t right_subpaths_begin = left_mp_aln.subpath_size(); + + // figure out the read position of the subpaths before we start getting rid of edges + vector right_to_length(right_mp_aln.subpath_size(), 0); + for (size_t i = 0; i < right_mp_aln.subpath_size(); ++i) { + // keep track of the read interval + int64_t to_len = path_to_length(right_mp_aln.subpath(i).path()); + for (auto n : right_mp_aln.subpath(i).next()) { + right_to_length[n] = right_to_length[i] + to_len; + } + for (const auto& connection : right_mp_aln.subpath(i).connection()) { + right_to_length[connection.next()] = right_to_length[i] + to_len; + } + } + + vector to_keep_right(right_mp_aln.subpath_size(), false); + vector is_bridge_right(right_mp_aln.subpath_size(), false); + for (const auto& pos : right_mp_aln_trace) { + is_bridge_right[get<0>(pos)] = true; + } + for (int64_t i = 0; i < right_mp_aln.subpath_size(); ++i) { + to_keep_right[i] = to_keep_right[i] || is_bridge_right[i]; + for (auto j : right_mp_aln.subpath(i).next()) { + to_keep_right[j] = to_keep_right[j] || to_keep_right[i]; + } + for (const auto& connection : right_mp_aln.subpath(i).connection()) { + to_keep_right[connection.next()] = to_keep_right[connection.next()] || to_keep_right[i]; + } + } + +#ifdef debug_fusing + cerr << "deciding what to remove on right:" << endl; + for (size_t i = 0; i < to_keep_right.size(); ++i) { + cerr << "\t" << i << ": keep? " << to_keep_right[i] << ", bridge? 
" << is_bridge_right[i] << endl; + } +#endif + + // transfer the subpaths from the right multipath alignment onto the left one + + vector right_removed_so_far(right_mp_aln.subpath_size() + 1, 0); + int64_t right_loc_idx = 0; + for (int64_t i = 0; i < right_mp_aln.subpath_size(); ++i) { + if (!to_keep_right[i]) { + right_removed_so_far[i + 1] = right_removed_so_far[i] + 1; + continue; + } + if (right_loc_idx < right_mp_aln_trace.size() && i == get<0>(right_mp_aln_trace[right_loc_idx])) { + // this is where the splice alignment began + int64_t s, m, e, b; + tie(s, m, e, b) = right_mp_aln_trace[right_loc_idx]; + ++right_loc_idx; + + +#ifdef debug_fusing + cerr << "trimming subpath " << s << " at location " << m << " " << e << " " << b << endl; +#endif + + auto path = right_mp_aln.mutable_subpath(i)->mutable_path(); + int64_t to_len = path_to_length(*path); + bool trimmed = trim_path(path, true, m, e, b); + if (trimmed) { + int64_t new_to_len = path_to_length(*path); + int32_t new_score = scorer.score_partial_alignment(alignment, graph, *path, + (alignment.sequence().begin() + right_to_length[i] + + to_len - new_to_len)); + right_mp_aln.mutable_subpath(i)->set_score(new_score); + } + // add the edges into the right side + if (right_subpaths_begin == left_subpaths_end) { + // both of the splice segments were empty, connect directly to the left splice point + for (const auto& left_loc : left_mp_aln_trace) { + auto i = get<0>(left_loc) - left_removed_so_far[get<0>(left_loc)]; + if (i < left_subpaths_end) { + auto connection = left_mp_aln.mutable_subpath(i)->add_connection(); + connection->set_next(left_mp_aln.subpath_size()); + connection->set_score(splice_score); + } + } + } + else if (!have_right_linker) { + // the splice segment on the right side is empty, make connection the the left side + auto connection = left_mp_aln.mutable_subpath(right_subpaths_begin - 1)->add_connection(); + connection->set_next(left_mp_aln.subpath_size()); + connection->set_score(splice_score); + } + else { + // add the edge from the rightmost splice segment + left_mp_aln.mutable_subpath(right_subpaths_begin - 1)->add_next(left_mp_aln.subpath_size()); + } + } + + *left_mp_aln.add_subpath() = move(*right_mp_aln.mutable_subpath(i)); + right_removed_so_far[i + 1] = right_removed_so_far[i]; + } + // fix up the edges on the transferred subpaths from the right alignment + for (size_t i = right_subpaths_begin; i < left_mp_aln.subpath_size(); ++i) { + + auto subpath = left_mp_aln.mutable_subpath(i); + size_t nexts_removed_so_far = 0; + for (size_t j = 0; j < subpath->next_size(); ++j) { + if (to_keep_right[subpath->next(j)]) { + subpath->set_next(j - nexts_removed_so_far, + subpath->next(j) - right_removed_so_far[subpath->next(j)] + right_subpaths_begin); + } + else { + ++nexts_removed_so_far; + } + } + if (nexts_removed_so_far != 0) { + subpath->mutable_next()->resize(subpath->next_size() - nexts_removed_so_far); + } + size_t connections_removed_so_far = 0; + for (size_t j = 0; j < subpath->connection_size(); ++j) { + auto connection = subpath->mutable_connection(j); + if (to_keep_right[connection->next()]) { + connection->set_next(connection->next() - right_removed_so_far[connection->next()] + right_subpaths_begin); + if (connections_removed_so_far) { + *subpath->mutable_connection(j - connections_removed_so_far) = *connection; + } + } + else { + ++connections_removed_so_far; + } + } + if (connections_removed_so_far != 0) { + subpath->mutable_connection()->resize(subpath->connection_size() - connections_removed_so_far); + } 
+ } + +#ifdef debug_fusing + cerr << "fused alignment:" << endl; + cerr << debug_string(left_mp_aln) << endl; +#endif + + // starts can change pretty drastically, so just clear and reidentify + identify_start_subpaths(left_mp_aln); + + // and remove any empty bits + remove_empty_alignment_sections(left_mp_aln); + +#ifdef debug_fusing + cerr << "final product after removing empty subpaths:" << endl; + cerr << debug_string(left_mp_aln) << endl; +#endif + + // pass the left (where we collected everything) out without copying + return move(left_mp_aln); +} + + +} diff --git a/src/splicing.hpp b/src/splicing.hpp new file mode 100644 index 00000000000..9b240bb5274 --- /dev/null +++ b/src/splicing.hpp @@ -0,0 +1,290 @@ +/** + * \file splicing.hpp + * + * Defines splicing related objects and functions + * + */ +#ifndef VG_SPLICING_HPP_INCLUDED +#define VG_SPLICING_HPP_INCLUDED + +#include "dinucleotide_machine.hpp" +#include "incremental_subgraph.hpp" +#include "aligner.hpp" +#include "multipath_alignment.hpp" +#include "statistics.hpp" + +namespace vg { + +using namespace std; + +/* + * Object that represents: + * 1. A table of the acceptable splice motifs and their scores + * 2. A distribution of intron lengths and their scores + */ +class SpliceStats { +public: + // Defaults to the three canonical human splicing motifs with the frequencies + // reported in Burset, Seledstov, and Solovyev (2000) + // - GT-AG: 0.9924 + // - GC-AG: 0.0069 + // - AT-AC: 0.0005 + // and a default intron length distribution trained on GENCODE v.29. + SpliceStats(const GSSWAligner& scorer); + + // Construct with triples of (5' dinucleotide, 3' dinucleotide, frequency). + // Frequencies may sum to < 1.0, but may not sum to > 1.0. + // Lognormal parameters should be given in pairs of (mu, sigma). 
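splicing.hpp, which begins below, declares SpliceStats::intron_length_score(), backed by a lognormal mixture over intron lengths together with a cached mode log likelihood and the aligner's log base. One plausible way to turn a mixture log-likelihood into an alignment-score-scale integer is to normalize against the mode and divide by the log base, as sketched here; the exact scaling is an assumption for illustration, not taken from this diff, and the parameters are made up:

```
#include <cmath>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

// Lognormal density, as in lognormal_pdf() from statistics.cpp.
double lognormal_pdf(double x, double mu, double sigma) {
    static const double root_2pi = std::sqrt(2.0 * 3.141592653589793);
    if (x <= 0.0) return 0.0;
    double z = (std::log(x) - mu) / sigma;
    return std::exp(-z * z / 2.0) / (sigma * x * root_2pi);
}

// Hypothetical scoring scheme: mixture log-likelihood, shifted so the modal
// intron length scores 0, then scaled into alignment score units by log_base.
int32_t intron_length_score_sketch(int64_t length,
                                   const std::vector<double>& weights,
                                   const std::vector<std::pair<double, double>>& params,
                                   double mode_log_likelihood, double log_base) {
    double likelihood = 0.0;
    for (size_t i = 0; i < weights.size(); ++i) {
        likelihood += weights[i] * lognormal_pdf((double) length, params[i].first, params[i].second);
    }
    double log_likelihood = std::log(likelihood);
    return (int32_t) std::round((log_likelihood - mode_log_likelihood) / log_base);
}

int main() {
    // one-component "mixture" with illustrative parameters (mu, sigma)
    std::vector<double> weights = {1.0};
    std::vector<std::pair<double, double>> params = {{7.5, 1.0}};
    // mode of a lognormal is exp(mu - sigma^2)
    double mode_ll = std::log(lognormal_pdf(std::exp(7.5 - 1.0 * 1.0), 7.5, 1.0));
    std::cout << intron_length_score_sketch(1808, weights, params, mode_ll, 0.25) << "\n";
    return 0;
}
```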
+ SpliceStats(const vector>& motifs, + const vector& lognormal_mixture_weights, + const vector>& lognormal_component_params, + const GSSWAligner& scorer); + + // the number of splicing motifs + size_t motif_size() const; + // the dinucleotide motif on one side in the order that the nucleotides + // are encountered when traversing into the intron + const string& oriented_motif(size_t motif_num, bool left_side) const; + // true if the corresponding motif would be encountered on the reverse + // strand of a transcript + bool motif_is_reverse(size_t motif_num) const; + // the dinucleotide motif on one side ins its standard representation + string unoriented_motif(size_t motif_num, bool left_side) const; + // the score associated with a splicing motif + int32_t motif_score(size_t motif_num) const; + // the frequency of the motif in the input data + double motif_frequency(size_t motif_num) const; + // the score associated with an intron length + int32_t intron_length_score(int64_t length) const; + // change the motifs + void update_motifs(const vector>& motifs, + const GSSWAligner& scorer); + // change the intron distribution + void update_intron_length_distribution(const vector& lognormal_mixture_weights, + const vector>& lognormal_component_params, + const GSSWAligner& scorer); + // must be called if scoring parameters are changed + void update_scoring(const GSSWAligner& scorer); +private: + + // internal function for the constructor + void init(const vector>& motifs, + const vector& lognormal_mixture_weights, + const vector>& lognormal_component_params, + const GSSWAligner& scorer); + + double intron_length_log_likelihood(int64_t length) const; + + // the splice table stored in its most useful form + vector> motif_data; + // the original input data, in case we need to update the scoring + vector> unaltered_motif_data; + // intron length distribution component weights + vector mixture_weights; + // intron length distribution component parameters + vector> component_params; + // the maximum possible log likelihood for an intron length + double mode_log_likelihood; + // a copy of the aligner's log base (it's convenient to have it here) + double log_base; +}; + + +/* + * Object that identifies possible splice sites in a small region of + * the graph and answers queries about them. + */ +class SpliceRegion { +public: + + SpliceRegion(const pos_t& seed_pos, bool search_left, int64_t search_dist, + const HandleGraph& graph, + const DinucleotideMachine& dinuc_machine, + const SpliceStats& splice_stats); + SpliceRegion() = default; + ~SpliceRegion() = default; + + // returns the locations in the of a splice motif and the distance of those location + // from the search position. crashes if given a motif that was not provided + // to the constructor. 
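candidate_splice_sites() (declared just below) reports, for each motif, matching locations paired with their distance from the search position. Setting aside the graph walk and the DinucleotideMachine, the core scan looks roughly like the following sequence-only sketch; the scan_motif helper is hypothetical, and the real search runs over an IncrementalSubgraph:

```
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Find occurrences of a dinucleotide motif within a search distance of the
// start of a sequence, recording (offset, distance-from-seed) pairs.
std::vector<std::pair<size_t, int64_t>> scan_motif(const std::string& seq,
                                                   const std::string& motif,
                                                   int64_t search_dist) {
    std::vector<std::pair<size_t, int64_t>> hits;
    int64_t limit = std::min<int64_t>(search_dist, (int64_t) seq.size() - 2);
    for (int64_t i = 0; i <= limit; ++i) {
        if (seq[i] == motif[0] && seq[i + 1] == motif[1]) {
            hits.emplace_back((size_t) i, i);  // distance equals offset on a single sequence
        }
    }
    return hits;
}

int main() {
    for (auto& hit : scan_motif("CAGTAAGTTTGTAC", "GT", 12)) {
        std::cout << "GT at offset " << hit.first << ", distance " << hit.second << "\n";
    }
    return 0;
}
```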
+ const vector>& candidate_splice_sites(size_t motif_num) const; + + // the underlying subgraph we've extracted + const IncrementalSubgraph& get_subgraph() const; + + // the position that extraction began from + const pair& get_seed_pos() const; + +private: + + IncrementalSubgraph subgraph; + + pair seed; + + vector>> motif_matches; + +}; + +/* + * HandleGraph implementation that fuses two incremental subgraphs at + * an identified splice junction + */ +class JoinedSpliceGraph : public HandleGraph { +public: + + JoinedSpliceGraph(const HandleGraph& parent_graph, + const IncrementalSubgraph& left_subgraph, + handle_t left_splice_node, size_t left_splice_offset, + const IncrementalSubgraph& right_subgraph, + handle_t right_splice_node, size_t right_splice_offset); + + JoinedSpliceGraph() = default; + ~JoinedSpliceGraph() = default; + + ////////////////////////// + /// Specialized interface + ////////////////////////// + + /// Translates an aligned path to the underlying graph of the two incremental subgraphs + /// Also returns the indexes of the left and right splice points. If either is not present, + /// the corresponding index is set to numeric_limits::max() + pair translate_node_ids(Path& path) const; + + /// Get the handle corresponding to the seed node of the left subgraph + handle_t left_seed_node() const; + + /// Get the handle corresponding to the seed node of the right subgraph + handle_t right_seed_node() const; + + /// Get the handle corresponding to the left side of the splice join + handle_t left_splice_node() const; + + /// Get the handle corresponding to the right side of the splice join + handle_t right_splice_node() const; + + /// Get the minimum length of a walk from the left seed to the right seed + int64_t min_link_length() const; + + /// Get the maximum length of a walk from the left seed to the right seed + int64_t max_link_length() const; + + ////////////////////////// + /// HandleGraph interface + ////////////////////////// + + /// Method to check if a node exists by ID + bool has_node(id_t node_id) const; + + /// Look up the handle for the node with the given ID in the given orientation + handle_t get_handle(const id_t& node_id, bool is_reverse = false) const; + + /// Get the ID from a handle + id_t get_id(const handle_t& handle) const; + + /// Get the orientation of a handle + bool get_is_reverse(const handle_t& handle) const; + + /// Invert the orientation of a handle (potentially without getting its ID) + handle_t flip(const handle_t& handle) const; + + /// Get the length of a node + size_t get_length(const handle_t& handle) const; + + /// Get the sequence of a node, presented in the handle's local forward + /// orientation. + string get_sequence(const handle_t& handle) const; + + /// Loop over all the handles to next/previous (right/left) nodes. Passes + /// them to a callback which returns false to stop iterating and true to + /// continue. Returns true if we finished and false if we stopped early. + bool follow_edges_impl(const handle_t& handle, bool go_left, const function& iteratee) const; + + /// Loop over all the nodes in the graph in their local forward + /// orientations, in their internal stored order. Stop if the iteratee + /// returns false. Can be told to run in parallel, in which case stopping + /// after a false return value is on a best-effort basis and iteration + /// order is not defined. 
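The iteration contract documented above is callback-based: the iteratee returns false to stop early and the method reports whether iteration ran to completion. A small usage sketch against any HandleGraph, counting nodes and right-going edges; it assumes vg's handle.hpp wrapper and the public follow_edges/for_each_handle methods that dispatch to these _impl overrides:

```
#include <cstddef>
#include <utility>

#include "handle.hpp"

namespace vg {

// Count nodes and the edges leaving the right side of each forward handle.
std::pair<size_t, size_t> count_nodes_and_edges(const HandleGraph& graph) {
    size_t nodes = 0, edges = 0;
    graph.for_each_handle([&](const handle_t& h) {
        ++nodes;
        graph.follow_edges(h, false, [&](const handle_t& next) {
            ++edges;
            return true;  // keep iterating over edges
        });
        return true;      // keep iterating over handles
    });
    return std::make_pair(nodes, edges);
}

}
```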
+ bool for_each_handle_impl(const function& iteratee, bool parallel = false) const; + + /// Return the number of nodes in the graph + size_t get_node_count() const; + + /// Return the smallest ID in the graph, or some smaller number if the + /// smallest ID is unavailable. Return value is unspecified if the graph is empty. + id_t min_node_id() const; + + /// Return the largest ID in the graph, or some larger number if the + /// largest ID is unavailable. Return value is unspecified if the graph is empty. + id_t max_node_id() const; + + /////////////////////////////////// + /// Optional HandleGraph interface + /////////////////////////////////// + + /// Returns one base of a handle's sequence, in the orientation of the + /// handle. + char get_base(const handle_t& handle, size_t index) const; + + /// Returns a substring of a handle's sequence, in the orientation of the + /// handle. If the indicated substring would extend beyond the end of the + /// handle's sequence, the return value is truncated to the sequence's end. + string get_subsequence(const handle_t& handle, size_t index, size_t size) const; + +private: + + // the index subsequence of the underlying sequence that this handle corresponds + // to (on the same strand as the handle) + pair underlying_interval(const handle_t& handle) const; + + const HandleGraph* parent_graph; + + const IncrementalSubgraph* left_subgraph; + + const IncrementalSubgraph* right_subgraph; + + vector left_handle_trans; + + vector right_handle_trans; + + size_t left_splice_offset; + + size_t right_splice_offset; + + // handles from the underlying subgraphs + vector handle_idxs; + + // the size of the prefix of the handles vector that come from the + // left subgraph + size_t num_left_handles; +}; + +multipath_alignment_t from_hit(const Alignment& alignment, const HandleGraph& graph, + const pos_t& hit_pos, const MaximalExactMatch& mem, + const GSSWAligner& scorer); + +// return the position of the base when trimming a given length from either the start +// or the end of the alignment. softclips do not contributed to the total, and extra +// will be trimmed to avoid splitting an insertion. also returns the total amount of +// sequence trimmed (including softclip) and the score of the sub-alignment that would +// be removed. +tuple trimmed_end(const Alignment& aln, int64_t len, bool from_end, + const HandleGraph& graph, const GSSWAligner& aligner); + +// remove the portion of the path either to the left or right of a given position along it, +// described in terms of the indexes in the path object. returns true if any edits were +// actually removed. +bool trim_path(path_t* path, bool from_left, int64_t mapping_idx, int64_t edit_idx, int64_t base_idx); + +// consumes two multipath alignments and returns a spliced multipath alignment, which is connected +// by the splice segment. the splice segment may be discontiguous at the splice point, which is +// is indicated by giving the index of the first mapping that is past the splice junctinon. 
+// the splice score refers to the score value of th splice itself +multipath_alignment_t&& fuse_spliced_alignments(const Alignment& alignment, + multipath_alignment_t&& left_mp_aln, multipath_alignment_t&& right_mp_aln, + int64_t left_bridge_point, const Alignment& splice_segment, + int64_t splice_junction_idx, int32_t splice_score, const GSSWAligner& scorer, + const HandleGraph& graph); + +} + +#endif // VG_SPLICING_HPP_INCLUDED diff --git a/src/split_strand_graph.cpp b/src/split_strand_graph.cpp new file mode 100644 index 00000000000..1bb64f3b685 --- /dev/null +++ b/src/split_strand_graph.cpp @@ -0,0 +1,99 @@ +/** + * \file split_strand_graph.cpp: contains the implementation of StrandSplitGraph + */ + + +#include "split_strand_graph.hpp" + + +namespace vg { + +using namespace std; + + StrandSplitGraph::StrandSplitGraph(const HandleGraph* graph) : graph(graph){ + // nothing to do + } + + bool StrandSplitGraph::has_node(id_t node_id) const { + return graph->has_node(node_id >> 1); + } + + handle_t StrandSplitGraph::get_handle(const id_t& node_id, bool is_reverse) const { + return handlegraph::number_bool_packing::pack(node_id, is_reverse); + } + + id_t StrandSplitGraph::get_id(const handle_t& handle) const { + return handlegraph::number_bool_packing::unpack_number(handle); + } + + bool StrandSplitGraph::get_is_reverse(const handle_t& handle) const { + return handlegraph::number_bool_packing::unpack_bit(handle); + } + + handle_t StrandSplitGraph::flip(const handle_t& handle) const { + return handlegraph::number_bool_packing::toggle_bit(handle); + } + + size_t StrandSplitGraph::get_length(const handle_t& handle) const { + return graph->get_length(graph->get_handle(get_id(handle) >> 1)); + } + + string StrandSplitGraph::get_sequence(const handle_t& handle) const { + return graph->get_sequence(get_underlying_handle(handle)); + } + + bool StrandSplitGraph::follow_edges_impl(const handle_t& handle, bool go_left, + const function& iteratee) const { + + return graph->follow_edges(get_underlying_handle(handle), go_left, + [&] (const handle_t& next) { + return iteratee(get_handle((graph->get_id(next) << 1) + (graph->get_is_reverse(next) != get_is_reverse(handle)), + get_is_reverse(handle))); + }); + } + + bool StrandSplitGraph::for_each_handle_impl(const function& iteratee, + bool parallel) const { + return graph->for_each_handle([&](const handle_t& underlying_handle) { + id_t node_id = graph->get_id(underlying_handle); + // forward version of the node + bool keep_going = iteratee(get_handle(node_id << 1)); + // reverse version of the node + if (keep_going) { + keep_going = iteratee(get_handle((node_id << 1) | 1)); + } + return keep_going; + }, parallel); + } + + size_t StrandSplitGraph::get_node_count() const { + return graph->get_node_count() << 1; + } + + id_t StrandSplitGraph::min_node_id() const { + return graph->min_node_id() << 1; + } + + id_t StrandSplitGraph::max_node_id() const { + return (graph->max_node_id() << 1) | 1; + } + + handle_t StrandSplitGraph::get_underlying_handle(const handle_t& handle) const { + return graph->get_handle(get_id(handle) >> 1, + ((get_id(handle) & 1) == 1) != get_is_reverse(handle)); + } + + bool StrandSplitGraph::has_overlay_node_for(const nid_t& underlying_id) const { + return this->has_node(underlying_id << 1); + } + + handle_t StrandSplitGraph::get_overlay_handle(const handle_t& underlying_handle) const { + // Get the ID of the node in the underlyign graph + id_t underlying_id = graph->get_id(underlying_handle); + // Get the ID of the named orientation of 
that node, in our graph. + id_t overlay_id = (underlying_id << 1) | (graph->get_is_reverse(underlying_handle) ? 1 : 0); + // Get a handle to the forward orientation of our node + return get_handle(overlay_id, false); + } +} + diff --git a/src/split_strand_graph.hpp b/src/split_strand_graph.hpp new file mode 100644 index 00000000000..d38796dc61b --- /dev/null +++ b/src/split_strand_graph.hpp @@ -0,0 +1,118 @@ +#ifndef VG_SPLIT_STRAND_GRAPH_HPP_INCLUDED +#define VG_SPLIT_STRAND_GRAPH_HPP_INCLUDED + +/** \file + * split_strand_graph.hpp: defines a handle graph overlay that duplicates nodes + * and edges so that both the forward and reverse strand of the underlying graph + * are now on the forward strand + */ + +#include "handle.hpp" +#include "utility.hpp" + +namespace vg { + +using namespace std; + + /** + * A HandleGraph implementation that overlays some other handle graph and splits + * the two strands of its nodes into separate nodes + */ + class StrandSplitGraph : public ExpandingOverlayGraph { + public: + + /// Initialize as the reverse version of another graph, optionally also + /// complementing + StrandSplitGraph(const HandleGraph* graph); + + /// Default constructor -- not actually functional + StrandSplitGraph() = default; + + /// Default destructor + ~StrandSplitGraph() = default; + + ////////////////////////// + /// HandleGraph interface + ////////////////////////// + + // Method to check if a node exists by ID + bool has_node(id_t node_id) const; + + /// Look up the handle for the node with the given ID in the given orientation + handle_t get_handle(const id_t& node_id, bool is_reverse = false) const; + + /// Get the ID from a handle + id_t get_id(const handle_t& handle) const; + + /// Get the orientation of a handle + bool get_is_reverse(const handle_t& handle) const; + + /// Invert the orientation of a handle (potentially without getting its ID) + handle_t flip(const handle_t& handle) const; + + /// Get the length of a node + size_t get_length(const handle_t& handle) const; + + /// Get the sequence of a node, presented in the handle's local forward + /// orientation. + string get_sequence(const handle_t& handle) const; + + /// Loop over all the handles to next/previous (right/left) nodes. Passes + /// them to a callback which returns false to stop iterating and true to + /// continue. Returns true if we finished and false if we stopped early. + bool follow_edges_impl(const handle_t& handle, bool go_left, + const function& iteratee) const; + + /// Loop over all the nodes in the graph in their local forward + /// orientations, in their internal stored order. Stop if the iteratee + /// returns false. Can be told to run in parallel, in which case stopping + /// after a false return value is on a best-effort basis and iteration + /// order is not defined. + bool for_each_handle_impl(const function& iteratee, + bool parallel = false) const; + + /// Return the number of nodes in the graph + size_t get_node_count() const; + + /// Return the smallest ID in the graph, or some smaller number if the + /// smallest ID is unavailable. Return value is unspecified if the graph is empty. + id_t min_node_id() const; + + /// Return the largest ID in the graph, or some larger number if the + /// largest ID is unavailable. Return value is unspecified if the graph is empty. 
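StrandSplitGraph assigns each underlying node two overlay node IDs by bit-packing: (underlying ID << 1) for the forward strand and (underlying ID << 1) | 1 for the reverse strand, which is why has_node(), get_length() and get_underlying_handle() all shift by one. A standalone sketch of the encode/decode round trip on plain integers:

```
#include <cassert>
#include <cstdint>
#include <iostream>

// Overlay node ID for one strand of an underlying node, as in StrandSplitGraph.
int64_t overlay_id(int64_t underlying_id, bool reverse_strand) {
    return (underlying_id << 1) | (reverse_strand ? 1 : 0);
}

// Recover the underlying node ID and which strand the overlay node represents.
int64_t underlying_id(int64_t overlay) { return overlay >> 1; }
bool represents_reverse(int64_t overlay) { return (overlay & 1) == 1; }

int main() {
    int64_t node = 7;
    int64_t fwd = overlay_id(node, false);  // 14
    int64_t rev = overlay_id(node, true);   // 15
    assert(underlying_id(fwd) == node && !represents_reverse(fwd));
    assert(underlying_id(rev) == node && represents_reverse(rev));
    std::cout << "node " << node << " -> overlay " << fwd << " (forward), "
              << rev << " (reverse)\n";
    return 0;
}
```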
+ id_t max_node_id() const; + + /////////////////////////////////// + /// ExpandingOverlayGraph interface + /////////////////////////////////// + + /** + * Returns the handle in the underlying graph that corresponds to a handle in the + * overlay + */ + handle_t get_underlying_handle(const handle_t& handle) const; + + /////////////////////////////////// + /// Extra methods + /////////////////////////////////// + + /** + * Returns true if any nodes in the overlay correspond to the given node in the underlying graph. + */ + bool has_overlay_node_for(const nid_t& underlying_id) const; + + /** + * Returns the handle in the overlay graph that corresponds to a handle + * and orientation in the underlying graph. Reverse versions of + * underlying graph nodes become the locally-forward overlay node that + * represents them. + */ + handle_t get_overlay_handle(const handle_t& underlying_handle) const; + + private: + /// The underlying graph we're making splitting + const HandleGraph* graph = nullptr; + }; +} + +#endif diff --git a/src/srpe.cpp b/src/srpe.cpp deleted file mode 100644 index 45bde9c86c7..00000000000 --- a/src/srpe.cpp +++ /dev/null @@ -1,123 +0,0 @@ -#include "srpe.hpp" - -using namespace std; -namespace vg{ - - double SRPE::discordance_score(vector alns, VG* subgraph){ - // Sum up the mapping scores - // subtract the soft clips - // the discordant insert size reads - // and a flat penalty for discordant orientation - throw runtime_error("Unimplemented!"); - } - - void SRPE::call_svs_paired_end(vg::VG* graph, ifstream& gamstream, vector& bps, string refpath){ - - } - - void SRPE::call_svs_split_read(vg::VG* graph, ifstream& gamstream, vector& bps, string refpath){ - // We're going to do a bunch of split-read mappings now, - // then decide if our orientations support an inversion, an insertion, - // or a deletion. 
- } - - - void SRPE::call_svs(string graphfile, string gamfile, string refpath){ - vg::VG* graph; - if (!graphfile.empty()){ - ifstream in(graphfile); - graph = new VG(in, false); - } - ifstream gamstream; - gamstream.open(gamfile); - // Set up path index - ff.set_my_vg(graph); - ff.soft_clip_limit = 20; - ff.fill_node_to_position(refpath); - - std::function (vector)> merge_breakpoints = [](vector bps){ - vector ret; - BREAKPOINT sent; - sent.start = -100; - ret.push_back(sent); - for (int i = 0; i < bps.size(); i++){ - BREAKPOINT a = bps[i]; - bool merged = false; - for (int j = 0; j < ret.size(); j++){ - if (ret[j].overlap(a, 20)){ - ret[j].other_supports += 1; - merged = true; - } - } - if (!merged){ - ret.push_back(a); - } - } - return ret; - }; - - vector pe_bps; - vector sr_bps; - - call_svs_paired_end(graph, gamstream, pe_bps, refpath); - call_svs_split_read(graph, gamstream, sr_bps, refpath); - vector pe_merged = merge_breakpoints(pe_bps); - vector sr_merged = merge_breakpoints(sr_bps); - vector merged; - merged.insert(merged.begin(), pe_merged.begin(), pe_merged.end()); - merged.insert(merged.begin(), sr_merged.begin(), sr_merged.end()); - merged = merge_breakpoints(merged); - - } - - void SRPE::aln_to_bseq(Alignment& a, bseq1_t* read){ - read->seq = (char*) a.sequence().c_str(); - read->qual = (char*) a.quality().c_str(); - read->l_seq = a.sequence().length(); - } - - /* - typedef struct { - int32_t len; // length of sequence - int32_t nsr; // number of supporting reads - char *seq; // unitig sequence - char *cov; // cov[i]-33 gives per-base coverage at i - int n_ovlp[2]; // number of 5'-end [0] and 3'-end [1] overlaps - fml_ovlp_t *ovlp; // overlaps, of size n_ovlp[0]+n_ovlp[1] - } fml_utg_t; - */ - - /** - * function assemble - * inputs: a vector of Alignments to be assembled (based on their sequences) - * outputs: - */ - void SRPE::assemble(vector alns, vector& unitigs){ - int n_seqs, n_utgs; - n_seqs = alns.size(); - bseq1_t* mr_bseqs = new bseq1_t [alns.size()]; - for (int i = 0; i < n_seqs; ++i){ - aln_to_bseq( alns[i], mr_bseqs + i ); - } - fml_utg_t *utgs; - fml_opt_t opt; - fml_opt_init(&opt); - utgs = fml_assemble(&opt, n_seqs, mr_bseqs, &n_utgs); - for (int i = 0; i < n_utgs; ++i){ - unitigs.push_back( *(utgs + i) ); - } - - fml_utg_destroy(n_utgs, utgs); - } - - void SRPE::assemble(string refpath, int64_t start_pos, int64_t end_pos, vector& unitigs){ - // Get all alignments on from to - } - void SRPE::assemble(int64_t node_id, int64_t pos, int window_size){ - - } - - - -} - diff --git a/src/srpe.hpp b/src/srpe.hpp deleted file mode 100644 index 75708877220..00000000000 --- a/src/srpe.hpp +++ /dev/null @@ -1,199 +0,0 @@ -#ifndef VG_SRPE -#define VG_SRPE -#include -#include -#include -#include "filter.hpp" -#include "index.hpp" -#include "path_index.hpp" -#include "IntervalTree.h" -#include "vg.pb.h" -#include "fml.h" -#include "vg.hpp" -#include -#include "alignment.hpp" -#include "genotypekit.hpp" -using namespace std; -namespace vg{ - - - struct BREAKPOINT{ - string name; - Position position; - vector mates; - - string contig; - int64_t start = -1; - int64_t upper_bound = 100; - int64_t lower_bound = 100; - - // Does the breakpoint point this way --->> - // or this way <<--- - bool isForward; - // 0: Unset, 1: INS, 2: DEL, 3: INV, 4: DUP - int SV_TYPE = 0; - // - - int normal_supports = 0; - int tumor_supports = 0; - - int fragl_supports = 0; - int split_supports = 0; - int other_supports = 0; - - inline int total_supports(){ - return fragl_supports + split_supports 
+ other_supports; - } - inline bool overlap(BREAKPOINT p, int dist){ - - if (start > -1 ){ - if ( abs(start - p.start) < dist){ - return true; - } - } - else{ - if (position.node_id() == p.position.node_id() && abs(position.offset() - p.position.offset()) < dist){ - return true; - } - } - - return false; - } - inline string to_string(){ - stringstream x; - x << "Pos: " << start << " u: " << upper_bound << " l: " << lower_bound << " s: " << total_supports(); - return x.str(); - } - - }; - - - -/** - * Overview: - * Use the GAM/GAM index and a filter to locate Alignments - * which may indicate the presence of - * structural variants at a given site. - * - * Signatures include: - * Deletions/Insertions: Stacked soft clips (tips) - * Inversions: mismatched P/E reads( <-- && --> rather than the expected ( --> <-- ) - * Duplications: Read depth signals - * Translocations: Distant read pairs - */ - -class DepthMap { - /** - * Map - * or - * Map - */ -public: - int8_t* depths; - uint64_t size; - map node_pos; - vg::VG* g_graph; - inline DepthMap(int64_t sz) { depths = new int8_t[sz]; }; - inline DepthMap() {}; - inline DepthMap(vg::VG* graph){ - g_graph = graph; - int64_t tot_size = 0; - std::function count_size = [&](Node* n){ - #pragma omp critical - { - node_pos[n->id()] = tot_size; - tot_size += n->sequence().length(); - } - }; - graph->for_each_node(count_size); - size = tot_size; - depths = new int8_t(size); - }; - inline int8_t get_depth(int64_t node_id, int64_t offset) { return depths[node_id + offset]; }; - inline void set_depth(int64_t node_id, int64_t offset, int8_t d) { depths[node_id + offset] = d; }; - inline void increment_depth(int64_t node_id, int64_t offset) { - - depths[node_pos [node_id] + offset] += 1; -}; - inline void fill_depth(const vg::Path& p){ - for (int i = 0; i < p.mapping_size(); i++){ - Mapping m = p.mapping(i); - Position p = m.position(); - int64_t nodeid = p.node_id(); - int offset = p.offset(); - for (int j = 0; j < m.edit_size(); j++){ - Edit e = m.edit(j); - if (e.from_length() == e.to_length() && e.sequence().empty()){ - for (int x = 0; x < e.from_length(); ++x){ - increment_depth(nodeid, offset + x); - } - } - } - } - }; - -}; - - class SRPE{ - - - - - public: - vector ref_names; - map pindexes; - - vector > intervals; - - void call_svs_paired_end(vg::VG* graph, ifstream& gamstream, vector& bps, string refpath=""); - void call_svs_split_read(vg::VG* graph, ifstream& gamstream, vector& bps, string refpath=""); - void call_svs(string graphfile, string gamfile, string refpath); - - // Calculate a proxy for discordance between a set of Alginments - // and a subgraph (e.g. one that's been modified with a candidate variant) - // Useful for deciding which variant is closest to what's represented in reads - double discordance_score(vector alns, VG* subgraph); - - // Convert Alignments to the read-like objects Fermi-lite uses in assembly - void aln_to_bseq(Alignment& a, bseq1_t* read); - - // Assemble a set of alignments into a set of unitigs - // UNITIGS ARE GRAPH ELEMENTS - you could make them subgraphs. - // Alignments need not map to the graph (e.g. 
they could be unmapped reads) - void assemble(vector alns, vector& unitigs); - - // Assemble a set of Alignments that map along between and , - // which are reference-relative coordinates (a.k.a your standard, linear ref coordinates) - void assemble(string refpath, int64_t start_pos, int64_t end_pos, vector& unitigs); - - // Assemble all reads that overlap a given position (within window_size bp) - void assemble(int64_t node_id, int64_t offset, int window_size); - - // Are multiple references present in the same subgraph? - bool overlapping_refs = false; - - // Maps from node-id to read depth - DepthMap depth; - - // Every SRPE gets its own filter - vg::Filter ff; - - // Every SRPE also gets its own name->alignment map - // and a name->mate map - map name_to_aln; - map aln_to_mate; - - // A graph (or subgraph) for the region this SRPE is handling. - vg::VG* graph; - // xg::XG* xindex; - // gcsa::GCSA* gindex; - // gcsa::LCPArray * lcp_ind; - - // Cap the total coverage at a given position - int max_reads = 125; - - - -}; -} -#endif diff --git a/src/ssw_aligner.cpp b/src/ssw_aligner.cpp index 268eedd040a..3926ab3b1c2 100644 --- a/src/ssw_aligner.cpp +++ b/src/ssw_aligner.cpp @@ -23,7 +23,13 @@ Alignment SSWAligner::align(const string& query, const string& ref) { gap_extension); StripedSmithWaterman::Filter filter; StripedSmithWaterman::Alignment alignment; - aligner.Align(query.c_str(), ref.c_str(), ref.size(), filter, &alignment); + + // We need to send out own mask length, recommended to be half the sequence length and at least 15. + int32_t mask_len = min(max((size_t) 15, query.size()), (size_t) numeric_limits::max()); + + assert(ref.size() <= numeric_limits::max()); + + aligner.Align(query.c_str(), ref.c_str(), (int) ref.size(), filter, &alignment, mask_len); return ssw_to_vg(alignment, query, ref); } @@ -44,7 +50,7 @@ Alignment SSWAligner::ssw_to_vg(const StripedSmithWaterman::Alignment& ssw_aln, for (auto& elem : vcflib::splitCigar(ssw_aln.cigar_string)) { int32_t length = elem.first; - string type = elem.second; + string type(1, elem.second); Edit* edit; //cerr << e->length << e->type << endl; switch (type[0]) { diff --git a/src/ssw_aligner.hpp b/src/ssw_aligner.hpp index d4743a029eb..31fc111542a 100644 --- a/src/ssw_aligner.hpp +++ b/src/ssw_aligner.hpp @@ -5,7 +5,7 @@ #include #include #include "ssw_cpp.h" -#include "vg.pb.h" +#include #include "path.hpp" namespace vg { diff --git a/src/statistics.cpp b/src/statistics.cpp new file mode 100644 index 00000000000..be9c5dd2e1e --- /dev/null +++ b/src/statistics.cpp @@ -0,0 +1,671 @@ +/** + * \file statistics.cpp + * + * Contains implementations of statistical functions + * + */ + +#include "statistics.hpp" + +namespace vg { + + +double median(std::vector &v) { + size_t n = v.size() / 2; + std::nth_element(v.begin(), v.begin()+n, v.end()); + int vn = v[n]; + if (v.size()%2 == 1) { + return vn; + } else { + std::nth_element(v.begin(), v.begin()+n-1, v.end()); + return 0.5*(vn+v[n-1]); + } +} + +// from Python exmaple here: +// https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm +void wellford_update(size_t& count, double& mean, double& M2, double new_val) { + ++count; + double delta = new_val - mean; + mean += delta / (double)count; + double delta2 = new_val - mean; + M2 += delta * delta2; +} + +pair wellford_mean_var(size_t count, double mean, double M2, bool sample_variance) { + if (count == 0 || (sample_variance && count == 1)) { + return make_pair(nan(""), nan("")); + } else { + return 
make_pair(mean, M2 / (double)(sample_variance ? count - 1 : count)); + } +} + +double Phi(double x) { + return 0.5 * (1.0 + std::erf(x / std::sqrt(2.0))); +} + +// Modified from qnorm function in R source: +// https://svn.r-project.org/R/trunk/src/nmath/qnorm.c +double Phi_inv(double p) { + assert(0.0 < p && p < 1.0); + double q, r, val; + + q = p - 0.5; + + /*-- use AS 241 --- */ + /* double ppnd16_(double *p, long *ifault)*/ + /* ALGORITHM AS241 APPL. STATIST. (1988) VOL. 37, NO. 3 + + Produces the normal deviate Z corresponding to a given lower + tail area of P; Z is accurate to about 1 part in 10**16. + + (original fortran code used PARAMETER(..) for the coefficients + and provided hash codes for checking them...) + */ + if (fabs(q) <= .425) {/* 0.075 <= p <= 0.925 */ + r = .180625 - q * q; + val = + q * (((((((r * 2509.0809287301226727 + + 33430.575583588128105) * r + 67265.770927008700853) * r + + 45921.953931549871457) * r + 13731.693765509461125) * r + + 1971.5909503065514427) * r + 133.14166789178437745) * r + + 3.387132872796366608) + / (((((((r * 5226.495278852854561 + + 28729.085735721942674) * r + 39307.89580009271061) * r + + 21213.794301586595867) * r + 5394.1960214247511077) * r + + 687.1870074920579083) * r + 42.313330701600911252) * r + 1.); + } + else { /* closer than 0.075 from {0,1} boundary */ + + /* r = min(p, 1-p) < 0.075 */ + if (q > 0) + r = 1.0 - p; + else + r = p; + + r = sqrt(- log(r)); + /* r = sqrt(-log(r)) <==> min(p, 1-p) = exp( - r^2 ) */ + + if (r <= 5.) { /* <==> min(p,1-p) >= exp(-25) ~= 1.3888e-11 */ + r += -1.6; + val = (((((((r * 7.7454501427834140764e-4 + + .0227238449892691845833) * r + .24178072517745061177) * + r + 1.27045825245236838258) * r + + 3.64784832476320460504) * r + 5.7694972214606914055) * + r + 4.6303378461565452959) * r + + 1.42343711074968357734) + / (((((((r * + 1.05075007164441684324e-9 + 5.475938084995344946e-4) * + r + .0151986665636164571966) * r + + .14810397642748007459) * r + .68976733498510000455) * + r + 1.6763848301838038494) * r + + 2.05319162663775882187) * r + 1.); + } + else { /* very close to 0 or 1 */ + r += -5.; + val = (((((((r * 2.01033439929228813265e-7 + + 2.71155556874348757815e-5) * r + + .0012426609473880784386) * r + .026532189526576123093) * + r + .29656057182850489123) * r + + 1.7848265399172913358) * r + 5.4637849111641143699) * + r + 6.6579046435011037772) + / (((((((r * + 2.04426310338993978564e-15 + 1.4215117583164458887e-7)* + r + 1.8463183175100546818e-5) * r + + 7.868691311456132591e-4) * r + .0148753612908506148525) + * r + .13692988092273580531) * r + + .59983220655588793769) * r + 1.); + } + + if(q < 0.0) + val = -val; + /* return (q >= 0.)? 
r : -r ;*/ + } + return val; +} + +double lognormal_pdf(double x, double mu, double sigma) { + const static double root_2pi = sqrt(2.0 * 3.14159265358979323846); + double density; + if (x > 0.0) { + double z = (log(x) - mu) / sigma; + density = exp(-z * z / 2.0) / (sigma * x * root_2pi); + } + else { + density = 0.0; + } + return density; +} + +// https://stackoverflow.com/a/19039500/238609 +double slope(const std::vector& x, const std::vector& y) { + const auto n = x.size(); + const auto s_x = std::accumulate(x.begin(), x.end(), 0.0); + const auto s_y = std::accumulate(y.begin(), y.end(), 0.0); + const auto s_xx = std::inner_product(x.begin(), x.end(), x.begin(), 0.0); + const auto s_xy = std::inner_product(x.begin(), x.end(), y.begin(), 0.0); + const auto a = (n * s_xy - s_x * s_y) / (n * s_xx - s_x * s_x); + return a; +} + +//https://stats.stackexchange.com/a/7459/14524 +// returns alpha parameter of zipf distribution +double fit_zipf(const vector& y) { + // assume input is log-scaled + // fit a log-log model + assert(y.size()); + vector ly(y.size()); + for (int i = 0; i < ly.size(); ++i) { + //cerr << y[i] << " "; + ly[i] = log(y[i]); + } + //cerr << endl; + vector lx(y.size()); + for (int i = 1; i <= lx.size(); ++i) { + lx[i-1] = log(i); + } + return -slope(lx, ly); +} + +double fit_fixed_shape_max_exponential(const vector& x, double shape, double tolerance) { + + // Fit S for a fixed N with the density of the maximum of N exponential variables + // + // NS exp(-Sx) (1 - exp(-Sx))^(N - 1) + // + // where S is the rate + // where N is the shape + + double x_sum = 0; + double x_max = numeric_limits::lowest(); + for (const double& val : x) { + x_sum += val; + x_max = max(x_max, val); + } + + // compute the log of the 1st and 2nd derivatives for the log likelihood (split up by positive and negative summands) + // we have to do it this wonky way because the exponentiated numbers get very large and cause overflow otherwise + + double log_deriv_neg_part = log(x_sum); + + function log_deriv_pos_part = [&](double rate) { + double accumulator = numeric_limits::lowest(); + for (const double& val : x) { + if (val > 0.0) { + // should always be > 0, but just so we don't blow up on some very small graphs + accumulator = add_log(accumulator, log(val) - rate * val - log(1.0 - exp(-rate * val))); + } + } + accumulator += log(shape - 1.0); + return add_log(accumulator, log(x.size() / rate)); + }; + + function log_deriv2_neg_part = [&](double rate) { + double accumulator = numeric_limits::lowest(); + for (const double& val : x) { + if (val > 0.0) { + // should always be > 0, but just so we don't blow up on some very small graphs + accumulator = add_log(accumulator, 2.0 * log(val) - rate * val - 2.0 * log(1.0 - exp(-rate * val))); + } + } + accumulator += log(shape - 1.0); + return add_log(accumulator, log(x.size() / (rate * rate))); + }; + + // set a maximum so this doesn't get in an infinite loop even when numerical issues + // prevent convergence + size_t max_iters = 1000; + size_t iter = 0; + + // use Newton's method to find the MLE + double rate = 1.0 / x_max; + double prev_rate = rate * (1.0 + 10.0 * tolerance); + while (abs(prev_rate / rate - 1.0) > tolerance && iter < max_iters) { + prev_rate = rate; + double log_d2 = log_deriv2_neg_part(rate); + double log_d_pos = log_deriv_pos_part(rate); + double log_d_neg = log_deriv_neg_part; + // determine if the value of the 1st deriv is positive or negative, and compute the + // whole ratio to the 2nd deriv from the positive and negative parts accordingly 
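The Newton iteration above stays entirely in log space: the positive and negative parts of each derivative are accumulated with add_log and combined with subtract_log, so the large exponentiated terms never overflow. Those helpers are existing vg utilities; the following is a generic sketch of the standard identities they implement, not necessarily vg's exact code:

```
#include <algorithm>
#include <cmath>
#include <iostream>

// log(exp(a) + exp(b)), computed without overflowing on large a or b
double add_log(double a, double b) {
    double hi = std::max(a, b), lo = std::min(a, b);
    return hi + std::log1p(std::exp(lo - hi));
}

// log(exp(a) - exp(b)), requires a >= b
double subtract_log(double a, double b) {
    return a + std::log1p(-std::exp(b - a));
}

int main() {
    // 2e300 + 3e300 would overflow if exponentiated directly
    double a = std::log(2.0) + 300.0 * std::log(10.0);
    double b = std::log(3.0) + 300.0 * std::log(10.0);
    std::cout << add_log(a, b) - 300.0 * std::log(10.0) << "\n";       // ~ log(5)
    std::cout << subtract_log(b, a) - 300.0 * std::log(10.0) << "\n";  // ~ log(1)
    return 0;
}
```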
+ if (log_d_pos > log_d_neg) { + rate += exp(subtract_log(log_d_pos, log_d_neg) - log_d2); + } + else { + rate -= exp(subtract_log(log_d_neg, log_d_pos) - log_d2); + } + ++iter; + } + return rate; +} + + +double fit_fixed_rate_max_exponential(const vector& x, double rate, double tolerance) { + + // Fit N for a fixed S with the density of the maximum of N exponential variables + // + // NS exp(-Sx) (1 - exp(-Sx))^(N - 1) + // + // where S is the rate + // where N is the shape + + function log_likelihood = [&](double shape) { + return max_exponential_log_likelihood(x, rate, shape); + }; + // expand interval until we find a region where the likelihood is decreasing with + // shape increasing + double max_shape = 1.0; + double max_shape_likelihood = log_likelihood(max_shape); + double prev_max_shape_likelihood = max_shape_likelihood - 1.0; + while (prev_max_shape_likelihood <= max_shape_likelihood) { + prev_max_shape_likelihood = max_shape_likelihood; + max_shape *= 2.0; + max_shape_likelihood = log_likelihood(max_shape); + } + + // use golden section search to find the maximum + return golden_section_search(log_likelihood, 0.0, max_shape, tolerance); +} + +pair fit_max_exponential(const vector& x, + double tolerance) { + + // set a maximum so this doesn't get in an infinite loop even when numerical issues + // prevent convergence + size_t max_iters = 1000; + size_t iter = 0; + + // alternate maximizing shape and rate until convergence + double shape = 1.0; + double rate = fit_fixed_shape_max_exponential(x, shape, tolerance / 2.0); + double prev_shape = shape + 10.0 * tolerance; + double prev_rate = rate + 10.0 * tolerance; + while ((abs(prev_rate / rate - 1.0) > tolerance / 2.0 + || abs(prev_shape / shape - 1.0) > tolerance / 2.0) + && iter < max_iters) { + prev_shape = shape; + prev_rate = rate; + + shape = fit_fixed_rate_max_exponential(x, rate, tolerance / 2.0); + rate = fit_fixed_shape_max_exponential(x, shape, tolerance / 2.0); + + ++iter; + } + + return pair(rate, shape); +} + +//tuple fit_offset_max_exponential(const vector& x, +// const function& shape_prior, +// double tolerance) { +// +// // the max log likelihood of the data for a fixed location parameter +// function fit_log_likelihood = [&](double loc) { +// vector x_offset(x.size()); +// for (size_t i = 0; i < x.size(); ++i) { +// x_offset[i] = x[i] - loc; +// } +// pair params = fit_max_exponential(x_offset); +// return max_exponential_log_likelihood(x, params.first, params.second, loc) + log(shape_prior(shape)); +// }; +// +// // the maximum value of location so that all data points are in the support +// double max_loc = *min_element(x.begin(), x.end()); +// // search with exponentially expanding windows backward to find the window +// // that contains the highest likelihood MLE for the location +// double min_loc = max_loc - 1.0; +// double log_likelihood = numeric_limits::lowest(); +// double probe_log_likelihood = fit_log_likelihood(min_loc); +// while (probe_log_likelihood > log_likelihood) { +// log_likelihood = probe_log_likelihood; +// double probe_loc = max_loc - 2.0 * (max_loc - min_loc); +// probe_log_likelihood = fit_log_likelihood(probe_loc); +// min_loc = probe_loc; +// } +// +// // find the MLE location +// double location = golden_section_search(fit_log_likelihood, min_loc, max_loc, tolerance); +// +// // fit the scale and shape given the locatino +// vector x_offset(x.size()); +// for (size_t i = 0; i < x.size(); ++i) { +// x_offset[i] = x[i] - location; +// } +// auto params = fit_max_exponential(x_offset); 
+// +// return make_tuple(params.first, params.second, location); +//} + +double max_exponential_log_likelihood(const vector& x, double rate, double shape, + double location) { + double accumulator_1 = 0.0; + double accumulator_2 = 0.0; + for (const double& val : x) { + if (val <= location) { + // this should be -inf, but doing this avoids some numerical problems + continue; + } + accumulator_1 += log(1.0 - exp(-rate * (val - location))); + accumulator_2 += (val - location); + } + return x.size() * log(rate * shape) - rate * accumulator_2 + (shape - 1.0) * accumulator_1; +} + +pair fit_weibull(const vector& x) { + // Method adapted from Datsiou & Overend (2018) Weibull parameter estimation and + // goodness-of-fit for glass strength data + + assert(x.size() >= 3); + + vector x_local = x; + sort(x_local.begin(), x_local.end()); + + // regress the transformed ordered data points against the inverse CDF + vector> X(x_local.size() - 1, vector(2, 1.0)); + vector y(X.size()); + for (size_t i = 1; i < x_local.size(); ++i) { + X[i - 1][1] = log(x_local[i]); + y[i - 1] = log(-log(1.0 - double(i) / double(x.size()))); + } + vector coefs = regress(X, y); + + // convert the coefficients into the parameters + return make_pair(exp(-coefs[0] / coefs[1]), coefs[1]); +} + +tuple fit_offset_weibull(const vector& x, + double tolerance) { + + // the max log likelihood of the data for a fixed location parameter + function fit_log_likelihood = [&](double loc) { + vector x_offset(x.size()); + for (size_t i = 0; i < x.size(); ++i) { + x_offset[i] = x[i] - loc; + } + pair params = fit_weibull(x_offset); + return weibull_log_likelihood(x, params.first, params.second, loc); + }; + + // the maximum value of location so that all data points are in the support + double max_loc = *min_element(x.begin(), x.end()); + + // search with exponentially expanding windows backward to find the window + // that contains the highest likelihood MLE for the location + double min_loc = max_loc - 1.0; + double log_likelihood = numeric_limits::lowest(); + double probe_log_likelihood = fit_log_likelihood(min_loc); + while (probe_log_likelihood > log_likelihood) { + log_likelihood = probe_log_likelihood; + double probe_loc = max_loc - 2.0 * (max_loc - min_loc); + probe_log_likelihood = fit_log_likelihood(probe_loc); + min_loc = probe_loc; + } + + // find the MLE location + double location = golden_section_search(fit_log_likelihood, min_loc, max_loc, tolerance); + + // fit the scale and shape given the locatino + vector x_offset(x.size()); + for (size_t i = 0; i < x.size(); ++i) { + x_offset[i] = x[i] - location; + } + auto params = fit_weibull(x_offset); + + return make_tuple(params.first, params.second, location); +} + +double weibull_log_likelihood(const vector& x, double scale, double shape, + double location) { + double sum_1 = 0.0, sum_2 = 0.0; + for (const double& val : x) { + sum_1 += log(val - location); + sum_2 += pow((val - location) / scale, shape); + } + return x.size() * (log(shape) - shape * log(scale)) + (shape - 1.0) * sum_1 - sum_2; +} + +double golden_section_search(const function& f, double x_min, double x_max, + double tolerance) { + + const static double inv_phi = (sqrt(5.0) - 1.0) / 2.0; + + // the number of steps needed to achieve the required precision (precalculating avoids + // fiddly floating point issues on the breakout condition) + size_t steps = size_t(ceil(log(tolerance / (x_max - x_min)) / log(inv_phi))); + + // the two interior points we will evaluate the function at + double x_lo = x_min + inv_phi * 
inv_phi * (x_max - x_min); + double x_hi = x_min + inv_phi * (x_max - x_min); + + // the function value at the two interior points + double f_lo = f(x_lo); + double f_hi = f(x_hi); + + for (size_t step = 0; step < steps; ++step) { + if (f_lo < f_hi) { + // there is a max in one of the right two sections + x_min = x_lo; + x_lo = x_hi; + x_hi = x_min + inv_phi * (x_max - x_min); + f_lo = f_hi; + f_hi = f(x_hi); + } + else { + // there is a max in one of the left two sections + x_max = x_hi; + x_hi = x_lo; + x_lo = x_min + inv_phi * inv_phi * (x_max - x_min); + f_hi = f_lo; + f_lo = f(x_lo); + } + } + + // return the midpoint of the interval we narrowed down to + if (f_lo > f_hi) { + return (x_min + x_hi) / 2.0; + } + else { + return (x_lo + x_max) / 2.0; + } +} + +double phred_to_prob(uint8_t phred) { + // Use a statically initialized lookup table + static std::vector prob_by_phred([](void) -> std::vector { + std::vector to_return; + to_return.reserve((int)numeric_limits::max() + 1); + for (int i = 0; i <= numeric_limits::max(); i++) { + to_return.push_back(phred_to_prob((double) i)); + } + return to_return; + }()); + + // Look up in it + return prob_by_phred[phred]; +} + +double phred_for_at_least_one(size_t p, size_t n) { + + /** + * Assume that we have n <= MAX_AT_LEAST_ONE_EVENTS independent events with probability p each. + * Let x be the AT_LEAST_ONE_PRECISION most significant bits of p. Then + * + * phred_at_least_one[(n << AT_LEAST_ONE_PRECISION) + x] + * + * is an approximate phred score of at least one event occurring. + * + * We exploit the magical thread-safety of static local initialization to + * fill this in exactly once when needed. + */ + static std::vector phred_at_least_one([](void) -> std::vector { + // Initialize phred_at_least_one by copying from the result of this function. + std::vector to_return; + size_t values = static_cast(1) << AT_LEAST_ONE_PRECISION; + to_return.resize((MAX_AT_LEAST_ONE_EVENTS + 1) * values, 0.0); + for (size_t n = 1; n <= MAX_AT_LEAST_ONE_EVENTS; n++) { + for (size_t p = 0; p < values; p++) { + // Because each p represents a range of probabilities, we choose a value + // in the middle for the approximation. + double probability = (2 * p + 1) / (2.0 * values); + // Phred for at least one out of n. + to_return[(n << AT_LEAST_ONE_PRECISION) + p] = prob_to_phred(1.0 - std::pow(1.0 - probability, n)); + } + } + return to_return; + }()); + + // Make sure we don't go out of bounds. + assert(n <= MAX_AT_LEAST_ONE_EVENTS); + + p >>= 8 * sizeof(size_t) - AT_LEAST_ONE_PRECISION; + return phred_at_least_one[(n << AT_LEAST_ONE_PRECISION) + p]; +} + +// This is just like phred_for_at_least_one but we don't prob_to_phred +// TODO: combine the code somehow? +double prob_for_at_least_one(size_t p, size_t n) { + + /** + * Assume that we have n <= MAX_AT_LEAST_ONE_EVENTS independent events with probability p each. + * Let x be the AT_LEAST_ONE_PRECISION most significant bits of p. Then + * + * prob_at_least_one[(n << AT_LEAST_ONE_PRECISION) + x] + * + * is an approximate probability of at least one event occurring. + * + * We exploit the magical thread-safety of static local initialization to + * fill this in exactly once when needed. + */ + static std::vector prob_at_least_one([](void) -> std::vector { + // Initialize prob_at_least_one by copying from the result of this function. 
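Both phred_for_at_least_one and prob_for_at_least_one cache approximations of P(at least one of n events) = 1 - (1 - p)^n, indexed by the event count and the AT_LEAST_ONE_PRECISION most significant bits of p, with each probability bucket evaluated at its midpoint. The sketch below shows the indexing and the cached quantity; it takes p as a double for clarity (the real tables take a fixed-point size_t and shift it down), and the 8-bit precision is an illustrative assumption, not necessarily vg's constant:

```
#include <cmath>
#include <cstddef>
#include <iostream>

int main() {
    const size_t PRECISION = 8;                     // assumed table precision
    const size_t buckets = size_t(1) << PRECISION;  // 256 probability buckets

    double p = 0.013;  // per-event probability
    size_t n = 20;     // number of independent events

    // bucket index: the PRECISION most significant bits of p
    size_t bucket = size_t(p * buckets);
    // each bucket is evaluated at its midpoint, as in the table initializers
    double midpoint = (2.0 * bucket + 1.0) / (2.0 * buckets);

    double exact = 1.0 - std::pow(1.0 - p, (double) n);
    double cached = 1.0 - std::pow(1.0 - midpoint, (double) n);

    std::cout << "table index " << ((n << PRECISION) + bucket) << "\n";
    std::cout << "exact " << exact << " vs cached " << cached << "\n";
    return 0;
}
```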
+ std::vector to_return; + size_t values = static_cast(1) << AT_LEAST_ONE_PRECISION; + to_return.resize((MAX_AT_LEAST_ONE_EVENTS + 1) * values, 0.0); + for (size_t n = 1; n <= MAX_AT_LEAST_ONE_EVENTS; n++) { + for (size_t p = 0; p < values; p++) { + // Because each p represents a range of probabilities, we choose a value + // in the middle for the approximation. + double probability = (2 * p + 1) / (2.0 * values); + // Prob for at least one out of n. + to_return[(n << AT_LEAST_ONE_PRECISION) + p] = 1.0 - std::pow(1.0 - probability, n); + } + } + return to_return; + }()); + + // Make sure we don't go out of bounds. + assert(n <= MAX_AT_LEAST_ONE_EVENTS); + + p >>= 8 * sizeof(size_t) - AT_LEAST_ONE_PRECISION; + return prob_at_least_one[(n << AT_LEAST_ONE_PRECISION) + p]; +} + +vector> transpose(const vector>& A) { + vector> AT(A.front().size()); + for (size_t i = 0; i < AT.size(); ++i) { + AT[i].resize(A.size()); + for (size_t j = 0; j < A.size(); ++j) { + AT[i][j] = A[j][i]; + } + } + return AT; +} + +vector> matrix_multiply(const vector>& A, + const vector>& B) { + assert(A.front().size() == B.size()); + + vector> AB(A.size()); + for (size_t i = 0; i < A.size(); ++i) { + AB[i].resize(B.front().size(), 0.0); + for (size_t j = 0; j < B.front().size(); ++j) { + for (size_t k = 0; k < B.size(); ++k) { + AB[i][j] += A[i][k] * B[k][j]; + } + } + } + return AB; +} + +vector matrix_multiply(const vector>& A, + const vector& b) { + assert(A.front().size() == b.size()); + + vector Ab(A.size(), 0.0); + for (size_t i = 0; i < A.size(); ++i) { + for (size_t j = 0; j < A.front().size(); ++j) { + Ab[i] += A[i][j] * b[j]; + } + } + return Ab; +} + +vector> matrix_invert(const vector>& A) { + + // invert by Gaussian elimination + + assert(A.front().size() == A.size()); + + vector> A_inv(A.size()); + + for (size_t i = 0; i < A.size(); ++i) { + A_inv[i].resize(A.size(), 0.0); + A_inv[i][i] = 1.0; + } + + // a non-const local copy + auto A_loc = A; + + // forward loop, make upper triangular + + for (int64_t i = 0; i < A_loc.size(); ++i) { + int64_t ii = i; + while (A_loc[ii][i] == 0.0 && ii < A_loc.size()) { + ++ii; + } + if (ii == A_loc.size()) { + std::runtime_error("error: matrix is not invertible!"); + + } + swap(A_loc[i],A_loc[ii]); + swap(A_inv[i], A_inv[ii]); + + // make the diagonal entry 1 + double factor = A_loc[i][i]; + for (int64_t j = 0; j < A_loc.size(); ++j) { + A_loc[i][j] /= factor; + A_inv[i][j] /= factor; + } + + // make the off diagonals in one column 0's + for (ii = i + 1; ii < A_loc.size(); ++ii) { + factor = A_loc[ii][i]; + for (size_t j = 0; j < A_loc.size(); ++j) { + A_loc[ii][j] -= factor * A_loc[i][j]; + A_inv[ii][j] -= factor * A_inv[i][j]; + } + + } + } + + // backward loop, make identity + + for (int64_t i = A_loc.size() - 1; i >= 0; --i) { + // make the off diagonals in one column 0's + for (int64_t ii = i - 1; ii >= 0; --ii) { + double factor = A_loc[ii][i]; + for (size_t j = 0; j < A_loc.size(); ++j) { + A_loc[ii][j] -= factor * A_loc[i][j]; + A_inv[ii][j] -= factor * A_inv[i][j]; + } + } + } + + return A_inv; +} + +vector regress(const vector>& X, vector& y) { + auto X_t = transpose(X); + return matrix_multiply(matrix_multiply(matrix_invert(matrix_multiply(X_t, X)), X_t), y); +} + +} diff --git a/src/distributions.hpp b/src/statistics.hpp similarity index 58% rename from src/distributions.hpp rename to src/statistics.hpp index fb5a4b3cdb4..c623a330879 100644 --- a/src/distributions.hpp +++ b/src/statistics.hpp @@ -1,21 +1,425 @@ -#ifndef VG_DISTRIBUTIONS_HPP_INCLUDED -#define 
VG_DISTRIBUTIONS_HPP_INCLUDED - -// distributions.hpp: contains functions for probability distributions. -// Functions from here are used to estimate likelihoods in genotyping. We also -// have some portable reimplementations of C++ distributions from -// because the system-provided ones differ in behavior between STL -// implementations and compilers. +#ifndef VG_STATISTICS_HPP_INCLUDED +#define VG_STATISTICS_HPP_INCLUDED +/** + * \file statistics.hpp + * + * Defines a range of statistical functions + * + */ -#include #include +#include +#include #include "utility.hpp" namespace vg { using namespace std; + +double median(std::vector &v); +// Online mean-variance computation with wellfords algorithm (pass 0's to 1st 3 params to start) +void wellford_update(size_t& count, double& mean, double& M2, double new_val); +pair wellford_mean_var(size_t count, double mean, double M2, bool sample_variance = false); + + +template +double stdev(const T& v) { + double sum = std::accumulate(v.begin(), v.end(), 0.0); + double mean = sum / v.size(); + std::vector diff(v.size()); + std::transform(v.begin(), v.end(), diff.begin(), [mean](double x) { return x - mean; }); + double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0); + return std::sqrt(sq_sum / v.size()); +} + +struct SummaryStatistics { + double mean; + double median; + double stdev; + double mode; + size_t number_of_values; + double max_value; + size_t count_of_max; +}; + +/// Returns summary statistics for a multiset of numbers. +template +SummaryStatistics summary_statistics(const std::map& values) { + SummaryStatistics result { 0.0, 0.0, 0.0, 0.0, 0, 0.0, 0 }; + + double sum_of_values = 0.0; + size_t max_freq = 0; + for (auto iter = values.begin(); iter != values.end(); ++iter) { + sum_of_values += iter->first * iter->second; + result.number_of_values += iter->second; + if (iter->second > max_freq) { + result.mode = iter->first; max_freq = iter->second; + } + } + if (result.number_of_values == 0) { + return result; + } + result.mean = sum_of_values / result.number_of_values; + + double accumulator = 0.0; + for (auto iter = values.begin(); iter != values.end(); ++iter) { + double value = iter->first - result.mean; + accumulator += value * value * iter->second; + } + result.stdev = std::sqrt(accumulator / result.number_of_values); + + size_t midpoint = result.number_of_values / 2; + size_t found = 0; + for (auto iter = values.begin(); iter != values.end(); ++iter) { + if (found + iter->second > midpoint) { + result.median = iter->first; + if (result.number_of_values % 2 == 0 && found >= midpoint) { + auto prev = iter; + do { + --prev; + } while (prev->second == 0); + result.median = (result.median + prev->first) / 2.0; + } + break; + } + found += iter->second; + } + + auto back = values.rbegin(); + result.max_value = back->first; + result.count_of_max = back->second; + + return result; +} + +/// The standard normal cumulative distribution function +double Phi(double x); + +/// Inverse CDF of a standard normal distribution. Must have 0 < quantile < 1. +double Phi_inv(double quantile); + +/// Probability density function or log-normal distribution +double lognormal_pdf(double x, double mu, double sigma); + +/* + * Return the log of the sum of two log-transformed values without taking them + * out of log space. + */ +inline double add_log(double log_x, double log_y) { + return log_x > log_y ? 
log_x + log1p(exp(log_y - log_x)) : log_y + log1p(exp(log_x - log_y)); +} + +/* + * Return the log of the difference of two log-transformed values without taking + * them out of log space. + */ +inline double subtract_log(double log_x, double log_y) { + return log_x + log1p(-exp(log_y - log_x)); +} + +/** + * Convert a number ln to the same number log 10. + */ +inline double ln_to_log10(double ln) { + return ln / log(10); +} + +/** + * Convert a number log 10 to the same number ln. + */ +inline double log10_to_ln(double l10) { + return l10 * log(10); +} + +/** + * Given the log10 of a value, retunr the log10 of (that value plus one). + */ +inline double log10_add_one(double x) { + return log10(pow(10, x) + 1); +} + +/** + * Return the log of the sum of two log10-transformed values without taking them + * out of log space. + */ +inline double add_log10(double i, double j) { + if (i < j) { + return log10_add_one(j - i) + ((j - i < 10) ? i : j); + } + return log10_add_one(i - j) + ((i - j <= 10) ? j : i); +} + +/** + * Assume that we have n independent random events that occur with probability + * p each (p is interpreted as a real number between 0 at 0 and 1 at its + * maximum value). Return an approximate probability for at least one event + * occurring as a phred score. + * + * n must be <= MAX_AT_LEAST_ONE_EVENTS. + */ +double phred_for_at_least_one(size_t p, size_t n); + +/** + * Assume that we have n independent random events that occur with probability + * p each (p is interpreted as a real number between 0 at 0 and 1 at its + * maximum value). Return an approximate probability for at least one event + * occurring as a raw probability. + * + * n must be <= MAX_AT_LEAST_ONE_EVENTS. + */ +double prob_for_at_least_one(size_t p, size_t n); + +/// How many events should we allow in phred_for_at_least_one and +/// prob_for_at_least_one? Should be >= our longest supported minimizer index. +constexpr static size_t MAX_AT_LEAST_ONE_EVENTS = 32; +/// Use this many bits for approximate probabilities. +constexpr static size_t AT_LEAST_ONE_PRECISION = 8; + +// normal pdf, from http://stackoverflow.com/a/10848293/238609 +template +T normal_pdf(T x, T m = 0.0, T s = 1.0) +{ + static const T inv_sqrt_2pi = 0.3989422804014327; + T z = (x - m) / s; + + return inv_sqrt_2pi / s * std::exp(T(-0.5) * z * z); +} + + +/// Convert a probability to a natural log probability. +inline double prob_to_logprob(double prob) { + return log(prob); +} +/// Convert natural log probability to a probability +inline double logprob_to_prob(double logprob) { + return exp(logprob); +} +/// Add two probabilities (expressed as logprobs) together and return the result +/// as a logprob. +inline double logprob_add(double logprob1, double logprob2) { + // Pull out the larger one to avoid underflows + double pulled_out = max(logprob1, logprob2); + return pulled_out + prob_to_logprob(logprob_to_prob(logprob1 - pulled_out) + logprob_to_prob(logprob2 - pulled_out)); +} +/// Invert a logprob, and get the probability of its opposite. +inline double logprob_invert(double logprob) { + return prob_to_logprob(1.0 - logprob_to_prob(logprob)); +} + +/// Convert 8-bit Phred quality score to probability of wrongness, using a lookup table. +double phred_to_prob(uint8_t phred); + +/// Convert floating point Phred quality score to probability of wrongness. +inline double phred_to_prob(double phred) { + return pow(10, -phred / 10); +} + +/// Convert probability of wrongness to integer Phred quality score. 
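+/// For example (a quick sanity check of the conversion below, not additional API):
+///
+///     double q = prob_to_phred(0.001);   // 30.0
+///     double p = phred_to_prob(q);       // ~0.001 again
+///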
+inline double prob_to_phred(double prob) { + return -10.0 * log10(prob); +} + +/// Convert a Phred quality score directly to a natural log probability of wrongness. +inline double phred_to_logprob(int phred) { + return (-((double)phred) / 10) / log10(exp(1.0)); +} + +/// Convert a natural log probability of wrongness directly to a Phred quality score. +inline double logprob_to_phred(double logprob ) { + return -10.0 * logprob * log10(exp(1.0)); +} + +/// Take the geometric mean of two logprobs +inline double logprob_geometric_mean(double lnprob1, double lnprob2) { + return log(sqrt(exp(lnprob1 + lnprob2))); +} + +/// Take the geometric mean of two phred-encoded probabilities +inline double phred_geometric_mean(double phred1, double phred2) { + return prob_to_phred(sqrt(phred_to_prob(phred1 + phred2))); +} + +/// Add two probabilities (expressed as phred scores) together and return the result +/// as a phred score. +inline double phred_add(double phred1, double phred2) { + return logprob_to_phred(logprob_add(phred_to_logprob(phred1), phred_to_logprob(phred2))); +} + + + +/** + * Compute the sum of the values in a collection, where the values are log + * probabilities and the result is the log of the total probability. Items must + * be convertible to/from doubles for math. + */ +template +typename Collection::value_type logprob_sum(const Collection& collection) { + + // Set up an alias + using Item = typename Collection::value_type; + + // Pull out the minimum value + auto min_iterator = min_element(begin(collection), end(collection)); + + if(min_iterator == end(collection)) { + // Nothing there, p = 0 + return Item(prob_to_logprob(0)); + } + + auto check_iterator = begin(collection); + ++check_iterator; + if(check_iterator == end(collection)) { + // We only have a single element anyway. We don't want to subtract it + // out because we'll get 0s. + return *min_iterator; + } + + // Pull this much out of every logprob. + Item pulled_out = *min_iterator; + + if(logprob_to_prob(pulled_out) == 0) { + // Can't divide by 0! + // TODO: fix this in selection + pulled_out = prob_to_logprob(1); + } + + Item total(0); + for(auto& to_add : collection) { + // Sum up all the scaled probabilities. + total += logprob_to_prob(to_add - pulled_out); + } + + // Re-log and re-scale + return pulled_out + prob_to_logprob(total); +} + +/** + * Compute the sum of the values in a collection, represented by an iterator + * range, where the values are Phred scores and the result is the Phred score + * of the total probability. Items must be convertible to/from doubles for + * math. + */ +template +typename std::iterator_traits::value_type phred_sum(const Iterator& begin_it, const Iterator& end_it) { + + // Set up an alias for the type we're operating on + using Item = typename std::iterator_traits::value_type; + + // Pull out the minimum probability + auto min_iterator = max_element(begin_it, end_it); + + if (min_iterator == end_it) { + // Nothing there, p = 0 + return Item(logprob_to_phred(prob_to_logprob(0))); + } + + auto check_iterator = begin_it; + ++check_iterator; + if (check_iterator == end_it) { + // We only have a single element anyway. We don't want to subtract it + // out because we'll get 0s. + return *min_iterator; + } + + // Pull this much out of every logprob. + double pulled_out = phred_to_logprob(*min_iterator); + + if (logprob_to_prob(pulled_out) == 0) { + // Can't divide by 0! 
+ // TODO: fix this in selection + pulled_out = prob_to_logprob(1); + } + + double total(0); + for(auto to_add_it = begin_it; to_add_it != end_it; ++to_add_it) { + // Sum up all the scaled probabilities. + total += logprob_to_prob(phred_to_logprob(*to_add_it) - pulled_out); + } + + // Re-log and re-scale + return Item(logprob_to_phred(pulled_out + prob_to_logprob(total))); +} + +/** + * Compute the sum of the values in a collection, where the values are Phred + * scores and the result is the Phred score of the total probability. Items + * must be convertible to/from doubles for math. + */ +template +typename Collection::value_type phred_sum(const Collection& collection) { + return phred_sum(begin(collection), end(collection)); +} + + +double slope(const std::vector& x, const std::vector& y); +double fit_zipf(const vector& y); + +/// Returns the MLE rate parameter for the distribution of (shape) iid exponential RVs +double fit_fixed_shape_max_exponential(const vector& x, double shape, double tolerance = 1e-8); + +/// Returns the MLE estimate for the number of iid exponential RVs the data are maxima of +double fit_fixed_rate_max_exponential(const vector& x, double rate, double tolerance = 1e-8); + +/// Returns the MLE rate and shape parameters of a max exponential +pair fit_max_exponential(const vector& x, double tolerance = 1e-8); + +// TODO: I'm eliminating this algorithm because it is approx non-identifiable for large values of shape +///// Returns the MLE rate, shape, and location parameters of an offset max exponential +//tuple fit_offset_max_exponential(const vector& x, double tolerance = 1e-8); + +/// Return the CDF of a max exponential with the given parameters +inline double max_exponential_cdf(double x, double rate, double shape, double location = 0.0) { + return x > location ? pow(1.0 - exp(-(x - location) * rate), shape) : 0.0; +} + +/// The log likelihood of a max exponential with the given parameters on the given data +double max_exponential_log_likelihood(const vector& x, double rate, double shape, + double location = 0.0); + +/// Returns an estimate of the rate and shape parameters of a Weibull distribution +pair fit_weibull(const vector& x); + +/// Returns an estimate of the rate, shape, and location (minimum value) of a 3-parameter Weibull distribution +tuple fit_offset_weibull(const vector& x, + double tolerance = 1e-8); + +/// Return the CDF of a max exponential with the given parameters +inline double weibull_cdf(double x, double scale, double shape, double location = 0.0) { + return x > location ? 1.0 - exp(-pow((x - location) / scale, shape)) : 0.0; +} + +/// Returns the log likelihood of some data generated by a Weibull distribution +double weibull_log_likelihood(const vector& x, double scale, double shape, + double location = 0.0); + +/// Returns a local maximum of a function within an interval +double golden_section_search(const function& f, double x_min, double x_max, + double tolerance = 1e-8); + +/// A shitty set of linear algebra functions + +vector> transpose(const vector>& A); + +vector> matrix_multiply(const vector>& A, + const vector>& B); + +vector matrix_multiply(const vector>& A, + const vector& b); + +vector> matrix_invert(const vector>& A); + + +/// Returns the coefficients of a regression (does not automatically compute constant) +vector regress(const vector>& X, vector& y); + +/* + ********************************* + * Code ported over from FreeBayes + ********************************* + */ + // We use this slightly nonstandard type for our math. 
We wrap it so it's easy // to change later. @@ -25,14 +429,14 @@ using real_t = long double; * Calculate the natural log of the gamma function of the given argument. */ inline real_t gamma_ln(real_t x) { - - real_t cofactors[] = {76.18009173, + + real_t cofactors[] = {76.18009173, -86.50532033, 24.01409822, -1.231739516, 0.120858003E-2, - -0.536382E-5}; - + -0.536382E-5}; + real_t x1 = x - 1.0; real_t tmp = x1 + 5.5; tmp -= (x1 + 0.5) * log(tmp); @@ -42,7 +446,7 @@ inline real_t gamma_ln(real_t x) { ser += cofactors[j]/x1; } real_t y = (-1.0 * tmp + log(2.50662827465 * ser)); - + return y; } @@ -156,8 +560,8 @@ real_t binomial_cmf_ln(ProbIn success_logprob, size_t trials, size_t successes) for(size_t considered_successes = 0; considered_successes <= successes; considered_successes++) { // For every number of successes up to this one, add in the probability. case_logprobs.push_back(choose_ln(trials, considered_successes) + - success_logprob * considered_successes + - logprob_invert(success_logprob) * (trials - considered_successes)); + success_logprob * considered_successes + + logprob_invert(success_logprob) * (trials - considered_successes)); } // Sum up all those per-case probabilities @@ -194,7 +598,7 @@ bool advance_split(Iter start, Iter end) { #ifdef debug cerr << "Trying to advance split with " << *start << " items in first category" << endl; #endif - + // Try advancing what comes after us. auto next = start; ++next; @@ -245,7 +649,7 @@ bool advance_split(Iter start, Iter end) { cerr << "Could not advance or reset child split" << endl; #endif - + return false; } } @@ -387,14 +791,14 @@ real_t multinomial_censored_sampling_prob_ln(const vector& probs, const while (!stack.empty()) { // Emit the current state - + #ifdef debug cerr << "Category counts:" << endl; for (auto& count : category_counts) { cerr << count << endl; } #endif - + auto case_logprob = multinomial_sampling_prob_ln(probs, category_counts); #ifdef debug @@ -421,11 +825,11 @@ real_t multinomial_censored_sampling_prob_ln(const vector& probs, const // We finally found something to advance, so stop ascending the stack. break; } else { - + #ifdef debug cerr << "Could not advanced class at stack depth " << stack.size() - 1 << endl; #endif - + // Pop off the back of the stack. stack.pop_back(); @@ -446,7 +850,7 @@ real_t multinomial_censored_sampling_prob_ln(const vector& probs, const entry = 0; } it->second.front() = obs.at(it->first); - + // Populate the stack with the next class stack.push_back(it); @@ -478,21 +882,21 @@ template class uniform_real_distribution { public: typedef T result_type; - + uniform_real_distribution(T _a = 0.0, T _b = 1.0) : m_a(_a), m_b(_b) { // Nothing to do! } - + void reset() { // Also nothing to do! } - + template T operator()(Generator &_g) { - double dScale = (m_b - m_a) / ((T)(_g.max() - _g.min()) + (T)1); + double dScale = (m_b - m_a) / ((T)(_g.max() - _g.min()) + (T)1); return (_g() - _g.min()) * dScale + m_a; } - + T a() const { return m_a; } @@ -500,7 +904,7 @@ class uniform_real_distribution { T b() const { return m_b; } - + protected: T m_a; T m_b; @@ -510,15 +914,15 @@ template class normal_distribution { public: typedef T result_type; - + normal_distribution(T _mean = 0.0, T _stddev = 1.0) : m_mean(_mean), m_stddev(_stddev) { // Nothing to do! 
} - + void reset() { m_distU1.reset(); } - + template T operator()(Generator &_g) { // Use Box-Muller algorithm @@ -528,20 +932,113 @@ class normal_distribution { double r = sqrt(-2.0 * log(u1)); return m_mean + m_stddev * r * sin(2.0 * pi * u2); } - + T mean() const { return m_mean; } T stddev() const { return m_stddev; } - + protected: T m_mean; T m_stddev; vg::uniform_real_distribution m_distU1; }; +template +class truncated_normal_distribution { +public: + typedef T result_type; + + truncated_normal_distribution(T _mu = 0.0, + T _sigma = 1.0, + T _a = -numeric_limits::max() / 2.0, + T _b = numeric_limits::max() / 2.0) + : m_mu(_mu), m_sigma(_sigma), m_alpha((_a - _mu) / _sigma), m_beta((_b - _mu) / _sigma) + { + assert(m_sigma > 0.0); + assert(m_alpha <= m_beta); + } + + void reset() { + m_distU1.reset(); + } + + template + T operator()(Generator &_g) { + T u = m_distU1(_g); + return m_mu + m_sigma * Phi_inv(u * Phi(m_beta) + (1.0 - u) * Phi(m_alpha)); + } + + T mean() const { + if (m_alpha != m_beta) { + return m_mu + m_sigma * A(); + } + else { + return m_mu + m_sigma * m_alpha; + } + } + + T stddev() const { + if (m_alpha != m_beta) { + T b = (m_alpha * normal_pdf(m_alpha) - m_beta * normal_pdf(m_beta)) / Z(); + T a = A(); + return m_sigma * std::sqrt(1.0 + b - a * a); + } + else { + return 0.0; + } + } + + T density(T x) const { + T z = (x - m_mu) / m_sigma; + if (z >= m_alpha && z <= m_beta) { + if (m_alpha != m_beta) { + return normal_pdf(z) / (m_sigma * Z()); + } + else { + return numeric_limits::infinity(); + } + } + else { + return 0.0; + } + } + + T cumul(T x) const { + T z = (x - m_mu) / m_sigma; + if (z >= m_alpha && z <= m_beta) { + if (m_alpha != m_beta) { + return (Phi(z) - Phi(m_alpha)) / Z(); + } + else { + return 1.0; + } + } + else if (z > m_beta) { + return 1.0; + } + else { + return 0.0; + } + } + +protected: + T Z() const { + return Phi(m_beta) - Phi(m_alpha); + } + T A() const { + return (normal_pdf(m_alpha) - normal_pdf(m_beta)) / Z(); + } + + T m_mu; + T m_sigma; + T m_alpha; + T m_beta; + vg::uniform_real_distribution m_distU1; +}; + /// We use this widerer to widen the output of a PRNG that generates only /// numbers in a smaller range so they cover a wider int type. template @@ -549,7 +1046,7 @@ class WideningPRNG { public: using result_type = OutType; - + WideningPRNG(PRNG& to_widen) : base(to_widen) { // Nothing to do } @@ -581,7 +1078,7 @@ class WideningPRNG { auto used_bits = numeric_limits::digits - unused_bits; // Get just that max used bit - OutType used_bit = 1 << (used_bits - 1); + OutType used_bit = OutType(1) << (used_bits - 1); // If a generated number from the RNG has this bit flag in it, it has // passed the largest complete power of 2 it can make and needs to be @@ -596,9 +1093,9 @@ class WideningPRNG { used_bit = used_bit >> 1; used_bits--; } - + assert(used_bits > 0); - + OutType generated = 0; int generated_bits = 0; @@ -632,17 +1129,17 @@ template class uniform_int_distribution { public: typedef T result_type; - + uniform_int_distribution(T _a = 0, T _b = numeric_limits::max()) : m_a(_a), m_b(_b) { // Make sure inclusive bounds are valid assert(_b >= _a); } - + void reset() { // Also nothing to do! 
} - - + + template T operator()(Generator &_g) { @@ -658,7 +1155,7 @@ class uniform_int_distribution { // Since they are so big and inclusive we can't always hold their real sizes, so hold size-1 WorkType source_range_size_minus_1 = (WorkType) _g.max() - (WorkType) _g.min(); WorkType dest_range_size_minus_1 = (WorkType) m_b - (WorkType) m_a; - + if (source_range_size_minus_1 >= dest_range_size_minus_1) { // The generator's result is going to be wide enough return generate_from_wide_generator(_g); @@ -672,67 +1169,67 @@ class uniform_int_distribution { return generate_from_wide_generator(widened); } } - + T a() const { return m_a; } - + T b() const { return m_b; } - + protected: - + /// Generate a result when we know the generator will produce a result on a /// range as big as or bigger than ours. template T generate_from_wide_generator(Generator &_g) { // Jordan's strategy: discard anything above the highest multiple of your range, then mod down to your range. - + #ifdef debug cerr << "Source range " << _g.min() << " to " << _g.max() << endl; cerr << "Dest range " << m_a << " to " << m_b << endl; #endif - + // Define an unsigned widest type to work in using WorkType = typename make_unsigned::type>::type; - + // How big are the source and destination ranges? // Since they are so big and inclusive we can't always hold their real sizes, so hold size-1 WorkType source_range_size_minus_1 = (WorkType) _g.max() - (WorkType) _g.min(); WorkType dest_range_size_minus_1 = (WorkType) m_b - (WorkType) m_a; - + // We must be generating a smaller range from a bigger rnage here. assert(source_range_size_minus_1 >= dest_range_size_minus_1); - + if (dest_range_size_minus_1 == source_range_size_minus_1) { // Ranges are the same size. No real work to do. return (WorkType) _g() - (WorkType) _g.min() + (WorkType) m_a; } - + // Otherwise the ranges differ in size. Which means the dest range must // be smaller. Which means the dest range's real size is representable. WorkType dest_range_size = dest_range_size_minus_1 + 1; - + // Find how many numbers we have to clip off of the top of the source // range so the rest can be covered by tiled destination ranges. WorkType remainder = source_range_size_minus_1 % dest_range_size; // Change the remainder from source_range_size_minus_1 to the remainder for the actual source range size remainder = (remainder + 1) % dest_range_size; - + if (remainder == 0) { // We perfectly tiled the source range return ((WorkType) _g() - (WorkType) _g.min()) % dest_range_size + (WorkType) m_a; } - + // Otherwise there are some values we need to reject - + // Sample a value until we get one that isn't too close to the top of the range. WorkType sampled; do { sampled = (WorkType) _g(); } while (_g.max() - sampled < remainder); - + // Convert to destination range. 
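+        // Worked example (values chosen only for illustration): a generator that
+        // produces 0..9 with a destination range of size 4 gives remainder = 2, so
+        // draws of 8 and 9 are rejected by the loop above; the surviving values
+        // 0..7 then tile the destination range exactly twice each, which is what
+        // keeps the result unbiased.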
return (sampled - (WorkType) _g.min()) % dest_range_size + m_a; } @@ -741,14 +1238,14 @@ class uniform_int_distribution { T m_a; T m_b; }; - + /// We provide a partial discrete_distribution implementation that is just the parts we need template class discrete_distribution { public: typedef T result_type; typedef double param_type; - + template discrete_distribution(InputIt first, InputIt last) : m_weights{first, last} { // We can't use an empty weights vector @@ -756,7 +1253,7 @@ class discrete_distribution { // Compute partial sums std::partial_sum(m_weights.begin(), m_weights.end(), std::back_inserter(m_sums)); } - + discrete_distribution(initializer_list weights = {1}) : discrete_distribution(weights.begin(), weights.end()) { // Nothing to do } @@ -764,44 +1261,59 @@ class discrete_distribution { void reset() { // Also nothing to do! } - + template T operator()(Generator &_g) { - + // Set up to generate a double from 0 to max weight vg::uniform_real_distribution backing_dist(0, m_sums.back()); // Do it and find which cumumative sum is greater than it auto winning_iterator = std::lower_bound(m_sums.begin(), m_sums.end(), backing_dist(_g)); - + // Find its category number and return that. return winning_iterator - m_sums.begin(); - + } - -protected: + + protected: // If we ever want to implement the params stuff we need the weights stored. vector m_weights; vector m_sums; - + }; + +// ewen's allele sampling distribution. for use in genotype prior (as in freebayes) +// gives Pr(a1, ...,an;theta) where ai is the number of sampled haplotypes (out of n) that +// have i different alleles at a given locus. theta is the population mutation rate. +// ex: for a single diploid genotype, a={2,0} = heterozygous: 2 alleles occur once. +// a={0,1} = homozygous: 1 allele occurs twice. +// +// https://en.wikipedia.org/wiki/Ewens%27s_sampling_formula +// https://github.com/ekg/freebayes/blob/master/src/Ewens.cpp#L17 +inline real_t ewens_af_prob_ln(const vector& a, real_t theta) { + + // first term (wrt formula as stated on wikipedia) + // n! / (theta * (theta + 1) * ... (theta + n - 1)) + real_t term1_num_ln = factorial_ln(a.size()); + real_t term1_denom_ln = 0.; + for (int i = 0; i < a.size(); ++i) { + term1_denom_ln += log(theta + i); + } + real_t term1_ln = term1_num_ln - term1_denom_ln; + + // second term + // prod [ (theta^aj) / (j^aj * aj!) + real_t term2_ln = 0.; + for (int j = 0; j < a.size(); ++j) { + real_t num = log(pow(theta, a[j])); + real_t denom = log(pow(1. + j, a[j]) + factorial_ln(a[j])); + term2_ln += num - denom; + } + return term1_ln + term2_ln; } -#endif - - - - - - - - - - - - - - - +} +#endif diff --git a/src/stream.cpp b/src/stream.cpp deleted file mode 100644 index 0a684e24d3f..00000000000 --- a/src/stream.cpp +++ /dev/null @@ -1,18 +0,0 @@ -#include "stream.hpp" - -namespace vg { - -namespace stream { - -using namespace std; - -void finish(std::ostream& out) { - // Put an EOF on the stream by making a writer, marking it as EOF, and letting it clean up. 
- BlockedGzipOutputStream bgzip_out(out); - bgzip_out.EndFile(); -} - -} - -} - diff --git a/src/stream.hpp b/src/stream.hpp deleted file mode 100644 index 76529689828..00000000000 --- a/src/stream.hpp +++ /dev/null @@ -1,1040 +0,0 @@ -#ifndef VG_STREAM_HPP_INCLUDED -#define VG_STREAM_HPP_INCLUDED - -// de/serialization of protobuf objects from/to a length-prefixed, gzipped binary stream -// from http://www.mail-archive.com/protobuf@googlegroups.com/msg03417.html - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "blocked_gzip_output_stream.hpp" -#include "blocked_gzip_input_stream.hpp" - -namespace vg { - -namespace stream { - -using namespace std; - -/// Protobuf will refuse to read messages longer than this size. -const size_t MAX_PROTOBUF_SIZE = 1000000000; -/// We aim to generate messages that are this size -const size_t TARGET_PROTOBUF_SIZE = MAX_PROTOBUF_SIZE/2; - -/// Write the EOF marker to the given stream, so that readers won't complain that it might be truncated when they read it in. -/// Internal EOF markers MAY exist, but a file SHOULD have exactly one EOF marker at its end. -void finish(std::ostream& out); - -/// Write objects using adaptive chunking. Takes a stream to write to, a total -/// element count to write, a guess at how many elements should be in a chunk, -/// and a function that, given a destination virtual offset in the output -/// stream (or -1), a start element, and a length, returns a Protobuf object -/// representing that range of elements. -/// -/// Adaptively sets the chunk size, in elements, so that no too-large Protobuf -/// records are serialized. -/// -/// Returns true on success, but throws errors on failure. -template -bool write(std::ostream& out, size_t element_count, size_t chunk_elements, - const std::function& lambda) { - - // How many elements have we serialized so far - size_t serialized = 0; - - BlockedGzipOutputStream bgzip_out(out); - ::google::protobuf::io::CodedOutputStream coded_out(&bgzip_out); - - auto handle = [](bool ok) { - if (!ok) throw std::runtime_error("stream::write: I/O error writing protobuf"); - }; - - while (serialized < element_count) { - - // Work out how many elements can go in this chunk, accounting for the total element count - chunk_elements = std::min(chunk_elements, element_count - serialized); - - // Work out where the chunk is going. - // TODO: we need to back up the coded output stream after every chunk, - // and push the partial buffer into BGZF, and get a new buffer, which - // wastes time. -#ifdef debug - cerr << "Trim stream and determine offset" << endl; -#endif - coded_out.Trim(); - int64_t virtual_offset = bgzip_out.Tell(); -#ifdef debug - cerr << "Offset is " << virtual_offset << endl; -#endif - - // Serialize a chunk -#ifdef debug - cerr << "Go get " << chunk_elements << " elements" << endl; -#endif - std::string chunk_data; - handle(lambda(virtual_offset, serialized, chunk_elements).SerializeToString(&chunk_data)); - - if (chunk_data.size() > MAX_PROTOBUF_SIZE) { - // This is too big! - - if (chunk_elements > 1) { - // But we can make it smaller. Try again at half this size. 
- chunk_elements = chunk_elements / 2; - continue; - } else { - // This single element is too large - throw std::runtime_error("stream::write: message for element " + - std::to_string(serialized) + " too large error writing protobuf"); - } - } else { - // We can send this message -#ifdef debug - cerr << "Writing message/group of " << chunk_data.size() << " bytes and elements " - << serialized << "-" << (serialized + chunk_elements) << endl; -#endif - - // Say we have a group of a single message -#ifdef debug - cerr << "\tWrite group length" << endl; -#endif - coded_out.WriteVarint64(1); - handle(!coded_out.HadError()); - // and prefix each object with its size -#ifdef debug - cerr << "\tWrite message length" << endl; -#endif - coded_out.WriteVarint32(chunk_data.size()); - handle(!coded_out.HadError()); -#ifdef debug - cerr << "\tWrite message data" << endl; -#endif - coded_out.WriteRaw(chunk_data.data(), chunk_data.size()); - handle(!coded_out.HadError()); -#ifdef debug - cerr << "\tMessage/group written" << endl; -#endif - - // Remember how far we've serialized now - serialized += chunk_elements; - - if (chunk_data.size() < TARGET_PROTOBUF_SIZE/2) { - // We were less than half the target size, so try being twice as - // big next time. - chunk_elements *= 2; - } else if (chunk_data.size() > TARGET_PROTOBUF_SIZE && chunk_elements > 1) { - // We were larger than the target size and we can be smaller - chunk_elements /= 2; - } - } - } - - return true; - -} - -/// Write objects using adaptive chunking. Takes a stream to write to, a total -/// element count to write, a guess at how many elements should be in a chunk, -/// and a function that, given a start element and a length, returns a Protobuf -/// object representing that range of elements. -/// -/// Adaptively sets the chunk size, in elements, so that no too-large Protobuf -/// records are serialized. -/// -/// Returns true on success, but throws errors on failure. -template -bool write(std::ostream& out, size_t element_count, size_t chunk_elements, - const std::function& lambda) { - - return write(out, element_count, chunk_elements, - static_cast&>( - [&lambda](int64_t virtual_offset, size_t chunk_start, size_t chunk_length) -> T { - - // Ignore the virtual offset - return lambda(chunk_start, chunk_length); - })); -} - -/// Write objects. count should be equal to the number of objects to write. -/// count is written before the objects, but if it is 0, it is not written. To -/// get the objects, calls lambda with the highest virtual offset that can be -/// seek'd to in order to read the object (or -1 if the stream is not -/// tellable), and the index of the object to retrieve. If not all objects are -/// written, return false, otherwise true. -template -bool write(std::ostream& out, size_t count, const std::function& lambda) { - - // Make all our streams on the stack, in case of error. - BlockedGzipOutputStream bgzip_out(out); - ::google::protobuf::io::CodedOutputStream coded_out(&bgzip_out); - - auto handle = [](bool ok) { - if (!ok) { - throw std::runtime_error("stream::write: I/O error writing protobuf"); - } - }; - - // We can't seek directly to individual messages, because we can only read - // count-prefixed groups. So the highest seek offset is going to be where - // we are now, where the group count is being written. 
- coded_out.Trim(); - int64_t virtual_offset = bgzip_out.Tell(); - - // prefix the chunk with the number of objects, if any objects are to be written - if(count > 0) { - coded_out.WriteVarint64(count); - handle(!coded_out.HadError()); - } - - std::string s; - size_t written = 0; - for (size_t n = 0; n < count; ++n, ++written) { - handle(lambda(virtual_offset, n).SerializeToString(&s)); - if (s.size() > MAX_PROTOBUF_SIZE) { - throw std::runtime_error("stream::write: message too large error writing protobuf"); - } - -#ifdef debug - cerr << "Writing message of " << s.size() << " bytes at " << n << "/" << count << " in group @ " << virtual_offset << endl; -#endif - - // and prefix each object with its size - coded_out.WriteVarint32(s.size()); - handle(!coded_out.HadError()); - coded_out.WriteRaw(s.data(), s.size()); - handle(!coded_out.HadError()); - } - - return !count || written == count; -} - -/// Write objects. count should be equal to the number of objects to write. -/// count is written before the objects, but if it is 0, it is not written. To -/// get the objects, calls lambda with the index of the object to retrieve. If -/// not all objects are written, return false, otherwise true. -template -bool write(std::ostream& out, size_t count, const std::function& lambda) { - return write(out, count, - static_cast&>( - [&lambda](int64_t virtual_offset, size_t object_number) -> T { - // Discard the virtual offset - return lambda(object_number); - })); -} - -/// Start, continue, or finish a buffered stream of objects. -/// If the length of the buffer is greater than the limit, writes the buffer out. -/// Otherwise, leaves the objects in the buffer. -/// Must be called with a buffer limit of 0 after all the objects have been produced, to flush the buffer. -/// When called with a buffer limit of 0, automatically appends an EOF marker. -/// Returns true unless an error occurs. -template -bool write_buffered(std::ostream& out, std::vector& buffer, size_t buffer_limit) { - bool wrote = false; - if (buffer.size() >= buffer_limit) { - std::function lambda = [&buffer](size_t n) { return buffer.at(n); }; -#pragma omp critical (stream_out) - wrote = write(out, buffer.size(), lambda); - buffer.clear(); - } - if (buffer_limit == 0) { - // The session is over. Append the EOF marker. - finish(out); - } - return wrote; -} - -/// Deserialize the input stream into the objects. Skips over groups of objects -/// with count 0. Takes a callback function to be called on the objects, with -/// the object and the blocked gzip virtual offset of its group (or -1 if the -/// input is not blocked gzipped), and another to be called per object group -/// with the group size. 
-template -void for_each_with_group_length(std::istream& in, - const std::function& lambda, - const std::function& handle_count) { - - BlockedGzipInputStream bgzip_in(in); - - // Have a function to complain if any protobuf things report failure - auto handle = [](bool ok) { - if (!ok) { - throw std::runtime_error("[stream::for_each] obsolete, invalid, or corrupt protobuf input"); - } - }; - - while (true) { - // For each count-prefixed group - - // Get the offset we're at, or -1 if we can't seek/tell - int64_t virtual_offset = bgzip_in.Tell(); - -#ifdef debug - cerr << "At virtual offset " << virtual_offset << endl; -#endif - - // Read the count - size_t count; - { - -#ifdef debug - // Peek ahead at the group header - char* data; - int size; - bool worked = bgzip_in.Next((const void**)&data, &size); - if (worked) { - cerr << "Next data is " << size << " bytes: " << endl; - for (size_t i = 0; size > 0 && i < std::min(size, 10); i++) { - cerr << (unsigned int)data[i] << " "; - } - cerr << endl; - bgzip_in.BackUp(size); - } else { - cerr << "Peek failed!" << endl; - } -#endif - - ::google::protobuf::io::CodedInputStream coded_in(&bgzip_in); - bool saw_count = coded_in.ReadVarint64((::google::protobuf::uint64*) &count); - if (!saw_count) { - -#ifdef debug - cerr << "Could not read count. Stopping read." << endl; -#endif - // EOF (probably) - return; - } - -#ifdef debug - cerr << "Found group with count " << count << endl; -#endif - } - - // Call the count callback - handle_count(count); - - // Make a shared buffer string to hold message data for each message. - std::string s; - for (size_t i = 0; i < count; ++i) { - uint32_t msgSize = 0; - - // Make sure to use a new CodedInputStream every time, because each - // one limits all input size on the assumption it is reading a - // single message. - ::google::protobuf::io::CodedInputStream coded_in(&bgzip_in); - - // Alot space for size, and for reading next chunk's length - coded_in.SetTotalBytesLimit(MAX_PROTOBUF_SIZE * 2, MAX_PROTOBUF_SIZE * 2); - - // the messages are prefixed by their size. Insist on reading it. - handle(coded_in.ReadVarint32(&msgSize)); - -#ifdef debug - cerr << "Found message size of " << msgSize << endl; -#endif - - if (msgSize > MAX_PROTOBUF_SIZE) { - throw std::runtime_error("[stream::for_each] protobuf message of " + - std::to_string(msgSize) + " bytes is too long"); - } - -#ifdef debug - cerr << "Reading message of " << msgSize << " bytes at " << i << "/" << count << " in group @ " << virtual_offset << endl; -#endif - - // Note that the message may be 0-size, which is a valid (all default values) Protobuf message. - T object; - if (msgSize > 0) { - // Actually need to parse the nonempty message - handle(coded_in.ReadString(&s, msgSize)); - handle(object.ParseFromString(s)); - } - // Process the message, passing along the virtual offset of the group, if available - lambda(virtual_offset, object); - } - } -} - -template -void for_each(std::istream& in, - const std::function& lambda) { - std::function noop = [](size_t) { }; - for_each_with_group_length(in, lambda, noop); -} - -template -void for_each(std::istream& in, - const std::function& lambda) { - for_each(in, static_cast&>([&lambda](int64_t virtual_offset, T& item) { - lambda(item); - })); -} - -// Parallelized versions of for_each - -// First, an internal implementation underlying several variants below. -// lambda2 is invoked on interleaved pairs of elements from the stream. 
The -// elements of each pair are in order, but the overall order in which lambda2 -// is invoked on pairs is undefined (concurrent). lambda1 is invoked on an odd -// last element of the stream, if any. -template -void for_each_parallel_impl(std::istream& in, - const std::function& lambda2, - const std::function& lambda1, - const std::function& handle_count, - const std::function& single_threaded_until_true) { - - // objects will be handed off to worker threads in batches of this many - const size_t batch_size = 256; - static_assert(batch_size % 2 == 0, "stream::for_each_parallel::batch_size must be even"); - // max # of such batches to be holding in memory - size_t max_batches_outstanding = 256; - // max # we will ever increase the batch buffer to - const size_t max_max_batches_outstanding = 1 << 13; // 8192 - // number of batches currently being processed - size_t batches_outstanding = 0; - - // this loop handles a chunked file with many pieces - // such as we might write in a multithreaded process - #pragma omp parallel default(none) shared(in, lambda1, lambda2, handle_count, batches_outstanding, max_batches_outstanding, single_threaded_until_true) - #pragma omp single - { - auto handle = [](bool retval) -> void { - if (!retval) throw std::runtime_error("obsolete, invalid, or corrupt protobuf input"); - }; - - BlockedGzipInputStream bgzip_in(in); - ::google::protobuf::io::CodedInputStream coded_in(&bgzip_in); - - std::vector *batch = nullptr; - - // process chunks prefixed by message count - size_t count; - while (coded_in.ReadVarint64((::google::protobuf::uint64*) &count)) { - handle_count(count); - for (size_t i = 0; i < count; ++i) { - if (!batch) { - batch = new std::vector(); - batch->reserve(batch_size); - } - - // Reconstruct the CodedInputStream in place to reset its maximum- - // bytes-ever-read counter, because it thinks it's reading a single - // message. - coded_in.~CodedInputStream(); - new (&coded_in) ::google::protobuf::io::CodedInputStream(&bgzip_in); - // Allot space for size, and for reading next chunk's length - coded_in.SetTotalBytesLimit(MAX_PROTOBUF_SIZE * 2, MAX_PROTOBUF_SIZE * 2); - - uint32_t msgSize = 0; - // the messages are prefixed by their size - handle(coded_in.ReadVarint32(&msgSize)); - - if (msgSize > MAX_PROTOBUF_SIZE) { - throw std::runtime_error("[stream::for_each] protobuf message of " + - std::to_string(msgSize) + " bytes is too long"); - } - - - { - std::string s; - if (msgSize > 0) { - // pick off the message (serialized protobuf object) - handle(coded_in.ReadString(&s, msgSize)); - } - // Even empty messages need to be handled; they are all-default Protobuf objects. - batch->push_back(std::move(s)); - } - - if (batch->size() == batch_size) { - // time to enqueue this batch for processing. first, block if - // we've hit max_batches_outstanding. 
- size_t b; -#pragma omp atomic capture - b = ++batches_outstanding; - - bool do_single_threaded = !single_threaded_until_true(); - if (b >= max_batches_outstanding || do_single_threaded) { - - // process this batch in the current thread - { - T obj1, obj2; - for (int i = 0; iat(i))); - handle(obj2.ParseFromString(batch->at(i+1))); - lambda2(obj1,obj2); - } - } // scope obj1 & obj2 - delete batch; -#pragma omp atomic capture - b = --batches_outstanding; - - if (4 * b / 3 < max_batches_outstanding - && max_batches_outstanding < max_max_batches_outstanding - && !do_single_threaded) { - // we went through at least 1/4 of the batch buffer while we were doing this thread's batch - // this looks risky, since we want the batch buffer to stay populated the entire time we're - // occupying this thread on compute, so let's increase the batch buffer size - // (skip this adjustment if you're in single-threaded mode and thus expect the buffer to be - // empty) - max_batches_outstanding *= 2; - } - } - else { - // spawn a task in another thread to process this batch -#pragma omp task default(none) firstprivate(batch) shared(batches_outstanding, lambda2, handle, single_threaded_until_true) - { - { - T obj1, obj2; - for (int i = 0; iat(i))); - handle(obj2.ParseFromString(batch->at(i+1))); - lambda2(obj1,obj2); - } - } // scope obj1 & obj2 - delete batch; -#pragma omp atomic update - batches_outstanding--; - } - } - - batch = nullptr; - } - } - } - - #pragma omp taskwait - // process final batch - if (batch) { - { - T obj1, obj2; - int i = 0; - for (; i < batch->size()-1; i+=2) { - handle(obj1.ParseFromString(batch->at(i))); - handle(obj2.ParseFromString(batch->at(i+1))); - lambda2(obj1, obj2); - } - if (i == batch->size()-1) { // odd last object - handle(obj1.ParseFromString(batch->at(i))); - lambda1(obj1); - } - } // scope obj1 & obj2 - delete batch; - } - } -} - -// parallel iteration over interleaved pairs of elements; error out if there's an odd number of elements -template -void for_each_interleaved_pair_parallel(std::istream& in, - const std::function& lambda2) { - std::function err1 = [](T&){ - throw std::runtime_error("stream::for_each_interleaved_pair_parallel: expected input stream of interleaved pairs, but it had odd number of elements"); - }; - std::function no_count = [](size_t i) {}; - std::function no_wait = [](void) {return true;}; - for_each_parallel_impl(in, lambda2, err1, no_count, no_wait); -} - -template -void for_each_interleaved_pair_parallel_after_wait(std::istream& in, - const std::function& lambda2, - const std::function& single_threaded_until_true) { - std::function err1 = [](T&){ - throw std::runtime_error("stream::for_each_interleaved_pair_parallel: expected input stream of interleaved pairs, but it had odd number of elements"); - }; - std::function no_count = [](size_t i) {}; - for_each_parallel_impl(in, lambda2, err1, no_count, single_threaded_until_true); -} - -// parallelized for each individual element -template -void for_each_parallel(std::istream& in, - const std::function& lambda1, - const std::function& handle_count) { - std::function lambda2 = [&lambda1](T& o1, T& o2) { lambda1(o1); lambda1(o2); }; - std::function no_wait = [](void) {return true;}; - for_each_parallel_impl(in, lambda2, lambda1, handle_count, no_wait); -} - -template -void for_each_parallel(std::istream& in, - const std::function& lambda) { - std::function noop = [](size_t) { }; - for_each_parallel(in, lambda, noop); -} - - -/** - * - * Class that wraps an output stream and allows emitting groups of 
Protobuf - * objects to it, with internal buffering. Handles finishing the file on its - * own, and allows tracking of BGZF virtual offsets within a non-seekable - * stream (as long as the entire stream is controleld by one instance). Cannot - * be copied, but can be moved. - * - * Can call callbacks with the groups emitted and their virtual offsets, for - * indexing purposes. - * - * Note that the callbacks may be called by the ProtobufEmitter's destructor, - * so anything they reference needs to outlive the ProtobufEmitter. - * - * Not thread-safe. May be more efficient than repeated write/write_buffered - * calls because a single BGZF stream can be used. - */ -template -class ProtobufEmitter { -public: - /// Constructor - ProtobufEmitter(std::ostream& out, size_t max_group_size = 1000) : - group(), - max_group_size(max_group_size), - bgzip_out(new BlockedGzipOutputStream(out)) - { - if (bgzip_out->Tell() == -1) { - // Say we are starting at the beginnign of the stream, if we don't know where we are. - bgzip_out->StartFile(); - } - } - - /// Destructor that finishes the file - ~ProtobufEmitter() { - if (bgzip_out.get() != nullptr) { - // Before we are destroyed, write stuff out. - emit_group(); - // Tell our stream to finish the file (since it hasn't been moved away) - bgzip_out->EndFile(); - } - } - - // Prohibit copy - ProtobufEmitter(const ProtobufEmitter& other) = delete; - ProtobufEmitter& operator=(const ProtobufEmitter& other) = delete; - // Allow default move - ProtobufEmitter(ProtobufEmitter&& other) = default; - ProtobufEmitter& operator=(ProtobufEmitter&& other) = default; - - /// Emit the given item. - /// TODO: Not thread safe. - void write(T&& item) { - if (group.size() >= max_group_size) { - emit_group(); - } - group.emplace_back(std::move(item)); - } - - /// Emit a copy of the given item. - /// To use when you have something you can't move. - void write_copy(const T& item) { - if (group.size() >= max_group_size) { - emit_group(); - } - group.push_back(item); - } - - /// Define a type for group emission event listeners - using listener_t = std::function&, int64_t, int64_t)>; - - /// Add an event listener that listens for emitted groups. The listener - /// will be called with the group buffer, the start virtual offset, and the - /// past-end virtual offset. Moves the function passed in. - /// Anything the function uses by reference must outlive this object! - void on_group(listener_t&& listener) { - group_handlers.emplace_back(std::move(listener)); - } - - /// Actually write out everything in the buffer. - /// Doesn't actually flush the underlying streams to disk. - /// Assumes that no more than one group's worht of items are in the buffer. 
- void emit_group() { - - if (group.empty()) { - // Nothing to do - return; - } - - // We can't write a non-empty buffer if our stream is gone/moved away - assert(bgzip_out.get() != nullptr); - - auto handle = [](bool ok) { - if (!ok) { - throw std::runtime_error("stream::ProtobufEmitter::emit_group: I/O error writing protobuf"); - } - }; - - // Work out where the group we emit will start - int64_t virtual_offset = bgzip_out->Tell(); - - ::google::protobuf::io::CodedOutputStream coded_out(bgzip_out.get()); - - // Prefix the group with the number of objects - coded_out.WriteVarint64(group.size()); - handle(!coded_out.HadError()); - - std::string s; - size_t written = 0; - for (auto& item : group) { - handle(item.SerializeToString(&s)); - if (s.size() > MAX_PROTOBUF_SIZE) { - throw std::runtime_error("stream::ProtobufEmitter::emit_group: message too large error writing protobuf"); - } - - #ifdef debug - cerr << "Writing message of " << s.size() << " bytes in group @ " << virtual_offset << endl; - #endif - - // And prefix each object with its size - coded_out.WriteVarint32(s.size()); - handle(!coded_out.HadError()); - coded_out.WriteRaw(s.data(), s.size()); - handle(!coded_out.HadError()); - } - - // Work out where we ended - coded_out.Trim(); - int64_t next_virtual_offset = bgzip_out->Tell(); - - for (auto& handler : group_handlers) { - // Report the group to each group handler that is listening - handler(group, virtual_offset, next_virtual_offset); - } - - // Empty the buffer because everything in it is written - group.clear(); - } - - -private: - - // This is our internal buffer - vector group; - // This is how big we let it get before we dump it - size_t max_group_size; - // Since Protobuf streams can't be copied or moved, we wrap ours in a uniqueptr_t so we can be moved. - unique_ptr bgzip_out; - - // If someone wants to listen in on emitted groups, they can register a handler - vector group_handlers; - -}; - -/** - * Refactored stream::for_each function that follows the unidirectional iterator interface. - * Also supports seeking and telling at the group level in bgzip files. - * Cannot be copied, but can be moved. - */ -template -class ProtobufIterator { -public: - /// Constructor - ProtobufIterator(std::istream& in) : - group_count(0), - group_idx(0), - group_vo(-1), - item_vo(-1), - item_bytes(0), - end_next(false), - bgzip_in(new BlockedGzipInputStream(in)) - { - get_next(); - } - - /// Return true if dereferencing the iterator will produce a valid value, and false otherwise. - inline bool has_next() const { - return item_vo != -1; - } - - /// Advance the iterator to the next message, or the end if this was the last message. - void get_next() { - if (end_next || group_count == group_idx) { - // We have made it to the end of the group we are reading. We will - // start a new group now. - - // Determine exactly where we are positioned, if possible, before - // creating the CodedInputStream to read the group's item count - auto virtual_offset = bgzip_in->Tell(); - - if (virtual_offset == -1) { - // We don't have seek capability, so we just count up the groups we read. - // On construction this is -1; bump it up to 0 for the first group. - group_vo++; - } else { - // We can seek. 
We need to know what offset we are at - group_vo = virtual_offset; - } - - // Start at the start of the new group - group_idx = 0; - - // Make a CodedInputStream to read the group length - ::google::protobuf::io::CodedInputStream coded_in(bgzip_in.get()); - // Alot space for group's length (generously) - coded_in.SetTotalBytesLimit(MAX_PROTOBUF_SIZE * 2, MAX_PROTOBUF_SIZE * 2); - - // Try and read the group's length - if (end_next || !coded_in.ReadVarint64((::google::protobuf::uint64*) &group_count)) { - // We didn't get a length (or we want to end the iteration) - - // This is the end of the input stream, switch to state that - // will match the end constructor - group_vo = -1; - item_vo = -1; - item_bytes = 0; - value = T(); - return; - } - - } - - // Now we know we're in a group. - - // Get the item's virtual offset, if available - auto virtual_offset = bgzip_in->Tell(); - - // We need a fresh CodedInputStream every time, because of the total byte limit - ::google::protobuf::io::CodedInputStream coded_in(bgzip_in.get()); - // Alot space for size and item (generously) - coded_in.SetTotalBytesLimit(MAX_PROTOBUF_SIZE * 2, MAX_PROTOBUF_SIZE * 2); - - // A message starts here - if (virtual_offset == -1) { - // Just track the counter. - item_vo++; - } else { - // We know where here is - item_vo = virtual_offset; - } - - // The messages are prefixed by their size - uint32_t msgSize = 0; - handle(coded_in.ReadVarint32(&msgSize)); - item_bytes = msgSize; - - if (msgSize > MAX_PROTOBUF_SIZE) { - throw std::runtime_error("[stream::ProtobufIterator::get_next] protobuf message of " + - std::to_string(msgSize) + " bytes is too long"); - } - - - // We have a message. - value.Clear(); - if (msgSize) { - // It has non-default field values. Parse them. - std::string s; - handle(coded_in.ReadString(&s, msgSize)); - handle(value.ParseFromString(s)); - } - - // Move on to the next message in the group - group_idx++; - } - -// inline ProtobufIterator operator++( int ) { -// ProtobufIterator temp = *this; -// get_next(); -// return temp; -// } - - inline const T& operator*() const { - return value; - } - - /// Take the current item, which must exist, and advance the iterator to the next one. - inline T take() { - T temp = std::move(value); - get_next(); - // Return by value, which gets moved. - return temp; - } - - /// How many serialized, uncompressed bytes did the currently loaded item take. - /// Will be 0 if no current item is available, but can also be 0 for valid, all-default items. - inline size_t get_item_size() const { - return item_bytes; - } - - /// Return the virtual offset of the group being currently read (i.e. the - /// group to which the current message belongs), to seek back to. You can't - /// seek back to the current message, just to the start of the group. - /// Returns -1 instead if the underlying file doesn't support seek/tell. - /// Returns the past-the-end virtual offset of the file if EOF is reached. - inline int64_t tell_group() const { - if (bgzip_in->Tell() != -1) { - // The backing file supports seek/tell (which we ascertain by attempting it). - if (group_vo == -1) { - // We hit EOF and have no loaded message - return tell_raw(); - } else { - // Return the *group's* virtual offset (not the current one) - return group_vo; - } - } else { - // group_vo holds a count. But we need to say we can't seek. - return -1; - } - } - - /// Seek to the given virtual offset and start reading the group that is there. - /// The next value produced will be the first value in that group. 
- /// Return false if seeking is unsupported or the seek fails. - inline bool seek_group(int64_t virtual_offset) { - if (virtual_offset < 0) { - // That's not allowed - return false; - } - - // Try and do the seek - bool sought = bgzip_in->Seek(virtual_offset); - - if (!sought) { - // We can't seek - return false; - } - - // Get ready to read the group that's here - group_count = 0; - group_idx = 0; - end_next = false; - - // Read it (or detect EOF) - get_next(); - - // It worked! - return true; - } - - /// Return the raw virtual offset that the cursor is at in the file, or -1 - /// for an unseekable/untellable stream. Not necessarily at a group - /// boundary, so cannot be used for seeking. Useful for getting the final - /// virtual offset when the cursor hits the end of a file. - /// This is NOT the virtual offset at which the currently loaded item occurs! - inline int64_t tell_raw() const { - return bgzip_in->Tell(); - } - - /// Return the virtual offset of the currently loaded item, or tell_raw() if at end. - /// Returns -1 for an unseekable/untellable stream. - inline int64_t tell_item() const { - if (bgzip_in->Tell() != -1) { - // The backing file supports seek/tell (which we ascertain by attempting it). - if (item_vo == -1) { - // We hit EOF and have no loaded message - return tell_raw(); - } else { - // Return the item's virtual offset - return item_vo; - } - } else { - // item_vo holds a count. But we need to say we can't seek. - return -1; - } - } - - /// Seek to the given virtual offset and read a single item from there. The - /// next value produced will be the item that is there, and then the - /// iterator will end (unless the user seeks somewhere else). Return false - /// if seeking is unsupported or the seek fails. - /// Will not work right if EOF is sought. - inline bool seek_item_and_stop(int64_t virtual_offset) { - if (virtual_offset < 0) { - // That's not allowed - return false; - } - - // Try and do the seek - bool sought = bgzip_in->Seek(virtual_offset); - - if (!sought) { - // We can't seek - return false; - } - - // Pretend to be in a 1-element group, which will be the last one until someone seeks again. - group_count = 1; - group_idx = 0; - - // Allow an item to be read - end_next = false; - - // Read the element - get_next(); - - // Stop after this one - end_next = true; - - // It worked! - return true; - } - - /// Returns iterators that act like begin() and end() for a stream containing protobuf data - static std::pair, ProtobufIterator> range(std::istream& in) { - return std::make_pair(ProtobufIterator(in), ProtobufIterator()); - } - -private: - - T value; - - // This holds the number of messages that exist in the current group. - size_t group_count; - // This holds the number of messages read in the current group. - size_t group_idx; - // This holds the virtual offset of the current group's start, or the - // number of the current group if seeking is not available. - // If the iterator is the end iterator, this is -1. - int64_t group_vo; - // This holds the virtual offset of the current item, or -1 if seeking is not possible. - // Useful for seeking back to the item later, although you will have to seek to a group to iterate, after that. - int64_t item_vo; - // This holds the number of serialized bytes that the currently loaded item took up in the stream. - // Doesn't count group size or item length overhead. 
- size_t item_bytes; - // This is a flag for whether we should hit the end on the next get_next() - // It is set when we seek to a message individually, and unset when we seek to a group. - bool end_next; - - // Since Protobuf streams can't be copied or moved, we wrap ours in a uniqueptr_t so we can be moved. - unique_ptr bgzip_in; - - void handle(bool ok) { - if (!ok) { - throw std::runtime_error("[stream::ProtobufIterator] obsolete, invalid, or corrupt protobuf input"); - } - } -}; - -/// Produce an std::function that can be invoked with Protobuf objects and save them to the given stream. -/// Easy way to get a dumping callback to feed to something that wants a callback. -/// The passed stream must outlive the resulting function. -template -std::function emit_to(ostream& out) { - // We are going to be clever and make a lambda capture a shared_ptr to an - // emitter, so we can have the emitter last as long as the function we - // return. - shared_ptr> emitter(new ProtobufEmitter(out)); - - return [emitter](const Item& item) { - // Write out each item. - // TODO: Set up so we can use the move operation the cursors support - // Not easy because of https://stackoverflow.com/a/30394755 - emitter->write_copy(item); - }; -} - -} - -} - -#endif diff --git a/src/stream_index.cpp b/src/stream_index.cpp new file mode 100644 index 00000000000..1f42c30bb9c --- /dev/null +++ b/src/stream_index.cpp @@ -0,0 +1,687 @@ +#include "stream_index.hpp" + +#include +#include + +#include +#include + +namespace vg { + +using namespace std; + +BitString::BitString(uint64_t bits, size_t length) : bits(bits), bit_length(length) { + // Make sure we didn't overflow + assert(length <= numeric_limits::max()); + + if (bit_length == 0) { + // Just remove all the bits + this->bits = 0; + } else { + // We won't shift out all the bits so it is safe to shift + this->bits = this->bits << (TOTAL_BITS - bit_length); + } +} + +BitString::BitString() : BitString(0, 0) { + // Nothing to do +} + +auto BitString::to_number() const -> uint64_t { + auto total_bits = CHAR_BIT * sizeof(decltype(this->bits)); + + if (bit_length == 0) { + // We have no used bits + return 0; + } else { + // We won't shift out all the bits so it is safe to shift + return bits >> (TOTAL_BITS - bit_length); + } +} + +auto BitString::drop_prefix(size_t prefix_length) const -> BitString { + if (prefix_length >= bit_length) { + // We are losing all our bits + return BitString(); + } + + // Otherwise shift off the dropped bits and return that + BitString other; + other.bits = bits << prefix_length; + other.bit_length = bit_length - prefix_length; + + return other; +} + +auto BitString::split(size_t prefix_length) const -> pair { + assert(prefix_length <= bit_length); + + // Make the prefix + auto non_prefix_bits = TOTAL_BITS - prefix_length; + BitString prefix; + if (non_prefix_bits < TOTAL_BITS) { + // The prefix will be nonempty + prefix.bits = (bits >> non_prefix_bits) << non_prefix_bits; + prefix.bit_length = prefix_length; + } + + // Return the prefix and the rest + return make_pair(prefix, drop_prefix(prefix_length)); +} + +auto BitString::operator==(const BitString& other) const -> bool { + // There's nothing fancy here because we demand all unused bits in the storage are 0. + return (bits == other.bits && bit_length == other.bit_length); +} + +auto BitString::operator!=(const BitString& other) const -> bool { + // There's nothing fancy here because we demand all unused bits in the storage are 0. 
+ return (bits != other.bits || bit_length != other.bit_length); +} + +auto BitString::common_prefix_length(const BitString& other) const -> size_t { + // Make a mask where identical bits are 0 + auto mask = bits ^ other.bits; + // Count the identical bits (count unset leading bits) with a compiler builtin + size_t identical_bits = __builtin_clzll(mask); + // Return the number of matching bits before the first mismatch, or the length of the shorter BitString + return min(min(identical_bits, (size_t) bit_length), (size_t) other.bit_length); +} + +auto BitString::at_or_before(const BitString& other) const -> bool { + auto first_diff = common_prefix_length(other); + if (first_diff >= length() || first_diff >= other.length()) { + // No differences spotted, so we can't conclusively place ourselves second + return true; + } + + // Otherwise, there's a difference. If we have the zero, we aren't second. + return !drop_prefix(first_diff).peek(); +} + +auto BitString::at_or_after(const BitString& other) const -> bool { + auto first_diff = common_prefix_length(other); + if (first_diff >= length() || first_diff >= other.length()) { + // No differences spotted, so we can't conclusively place ourselves first + return true; + } + + // Otherwise, there's a difference. If we have the one, we aren't first. + return drop_prefix(first_diff).peek(); +} + +auto BitString::peek() const -> bool { + if (bit_length == 0) { + return false; + } + + // If we aren't empty, get the high bit + return ((uint64_t)1 << 63) & bits; +} + +auto BitString::length() const -> size_t { + return bit_length; +} + +auto BitString::empty() const -> bool { + return bit_length == 0; +} + +auto operator<<(ostream& out, const BitString& bs) -> ostream& { + BitString temp = bs; + out << bs.length() << ":"; + while(!temp.empty()) { + // Pop off and print each bit + out << (temp.peek() ? '1' : '0'); + temp = temp.drop_prefix(1); + } + + return out; +} + +const string StreamIndexBase::MAGIC_BYTES = "GAI!"; + +auto StreamIndexBase::bin_to_prefix(bin_t bin) -> BitString { + +#ifdef debug + cerr << "Looking for ID prefix for bin " << bin << endl; +#endif + + // The bin is an offset with the low n bits set, plus an n-bit index. + // We have to figure out the value of the all-1s offset. + bin_t bin_offset; + // And how many bits are used (n) + size_t used_bits; + + // The offset will have as many 1s as the used bits in the number if the + // number is all 1s after some leading 0s. Otherwise the offset will have 1 + // fewer bits than the number. + + // Count the leading 0 bits + size_t leading_zeros = __builtin_clzll(bin); + +#ifdef debug + cerr << "The bin number has " << leading_zeros << " leading zeros" << endl; +#endif + + if (leading_zeros == numeric_limits::digits) { + // All zero bin is the root and gets the empty prefix + // Don't even bother with the rest of the logic +#ifdef debug + cerr << "The bin is bin 0 so the answer is " << BitString() << endl; +#endif + return BitString(); + } else if (leading_zeros == numeric_limits::digits - 1) { + // It must be bin 1, which has a 1-bit offset +#ifdef debug + cerr << "The bin is all zeros but one bit, so it is bin 1" << endl; +#endif + bin_offset = 1; + used_bits = 1; + } else { + // Make an all-1s value as wide as our bin number + bin_offset = ((bin_t)~0) >> leading_zeros; + + // Estimate the used bits; the bits that aren't 0s are 1s in the offset. + used_bits = numeric_limits::digits - leading_zeros; + + if (bin_offset > bin) { + // We need 1 fewer bits for the offset. 
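        // (Editorial worked example, not in the original source: for bin 5
        // (0b101) the all-ones value as wide as the bin is 7, which exceeds 5,
        // so the offset drops to 3 and used_bits to 2; the bin index is then
        // 5 - 3 = 2, giving the 2-bit prefix "10". For bin 7 (0b111) the
        // offset 7 is not too large, so the index is 0 with a 3-bit prefix
        // "000", the first bin of the next level down.)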
+ bin_offset = bin_offset >> 1; + // Correct the used bits in the bin index + used_bits--; + } + } + +#ifdef debug + cerr << "The bin offset value is " << bin_offset << " and the bin index uses " << used_bits << " bits" << endl; + cerr << "The bin index is " << bin - bin_offset << endl; +#endif + + /// Subtract out the offset to get the bin index, and use it as a prefix + /// with the appropriate number of bits based on what the offset was. + BitString result(bin - bin_offset, used_bits); +#ifdef debug + cerr << "It is " << result << endl; +#endif + return result; +} + +auto StreamIndexBase::id_to_prefix(id_t id) -> BitString { + // ID is signed and has only 63 "digits". + // The trees work on 64 bit strings. + return BitString(id, numeric_limits::digits); +} + +auto StreamIndexBase::used_bins_of_range(id_t min_id, id_t max_id, const function& iteratee) const -> bool { + // The iteratee types are the same so we can just pass that along. + return bins_by_id_prefix.traverse_in_order(id_to_prefix(min_id), id_to_prefix(max_id), iteratee); +} + +auto StreamIndexBase::common_bin(id_t a, id_t b) -> bin_t { + // Convert to unsigned numbers + bin_t a_bin = a; + bin_t b_bin = b; + + // Define the offset for the bin + bin_t offset = ((bin_t)~0); + + // We're just going to pop off bits until we find the common prefix. + // Always pop off one bit, even if we are binning a number and itself. + // TODO: Find a faster way to do this with the appropriate instruction intrinsics. + do { + a_bin = a_bin >> 1; + b_bin = b_bin >> 1; + offset = offset >> 1; + } while(a_bin != b_bin); + return a_bin + offset; +} + +auto StreamIndexBase::window_of_id(id_t id) -> window_t { + return id >> WINDOW_SHIFT; +} + +auto StreamIndexBase::add_group(id_t min_id, id_t max_id, int64_t virtual_start, int64_t virtual_past_end) -> void { + + if (min_id < last_group_min_id) { + // Someone is trying to index an unsorted GAM. + // This is probably user error, so complain appropriately: + cerr << "error [vg::GAMIndex]: GAM data being indexed is not sorted. Sort with vg gamsort." << endl; + exit(1); + } + last_group_min_id = min_id; + + // Find the bin for the run + bin_t bin = common_bin(min_id, max_id); + +#ifdef debug + cerr << "Group spanning " << min_id << "-" << max_id << " at " + << virtual_start << "-" << virtual_past_end << " lands in bin " << bin << endl; +#endif + + // Find the existing ranges in the bin. + // We know the previous one, if present, must end at or before this one's start. + auto& ranges = bin_to_ranges[bin]; + + if (ranges.empty()) { + // We just made a new vector of bin ranges. Remember it. + bins_by_id_prefix.insert(bin_to_prefix(bin), bin); + } + + if (!ranges.empty() && ranges.back().second == virtual_start) { + // We fit right after the last range. + ranges.back().second = virtual_past_end; +#ifdef debug + cerr << "Extend existing range to " << ranges.back().first << "-" << ranges.back().second << endl; +#endif + } else { + // We need a new range + bin_to_ranges[bin].emplace_back(virtual_start, virtual_past_end); + } + + for (window_t w = window_of_id(min_id); w <= window_of_id(max_id); w++) { + // For each window that this group overlaps + + if (!window_to_start.count(w)) { + // If it is the first group we encounter in the window, it must also be the earliest-staring group in the window. 
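            // (Editorial note, not in the original source: with WINDOW_SHIFT = 8
            // each linear-index window covers 256 consecutive node IDs, e.g.
            // node 1000 falls in window 1000 >> 8 = 3. Because add_group() is
            // called in virtual offset order, the first group seen for a window
            // necessarily carries the smallest virtual offset overlapping it.)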
+ + // This is the earliest virtual offset to overlap that window + window_to_start[w] = virtual_start; + +#ifdef debug + cerr << "Start window " << w << endl; +#endif + } + } +} + +auto StreamIndexBase::find(id_t node_id) const -> vector> { + vector> to_return; + + find(node_id, [&](int64_t run_start, int64_t run_past_end) -> bool { + // For each run we find, remember it + to_return.emplace_back(run_start, run_past_end); + // Keep getting runs until we run out in the index. We can't actually scan the data. + return true; + }); + + return to_return; +} + +auto StreamIndexBase::find(id_t node_id, const function scan_callback) const -> void { + // Look for a single-node inclusive range + find(node_id, node_id, std::move(scan_callback)); +} + +auto StreamIndexBase::find(id_t min_node, id_t max_node, const function scan_callback) const -> void { + +#ifdef debug + cerr << "Query for node range " << min_node << "-" << max_node << endl; +#endif + + // Find the window that gives us a lower bound on the virtual offset we + // need to be at to find things that touch this node ID. + window_t min_window = window_of_id(min_node); + window_t max_window = window_of_id(max_node); + +#ifdef debug + cerr << "Looking for first filled window of " << min_window << "-" << max_window << endl; +#endif + + // Find the minimum virtual offset we need to consider + int64_t min_vo = 0; + // It will be for the first occupied window at or after the min window but not greater than the max window. + auto found = window_to_start.lower_bound(min_window); + if (found != window_to_start.end() && found->first <= max_window) { + // Some groups overlapped this window, and they started here. + min_vo = found->second; + +#ifdef debug + cerr << "First occupied window is " << found->first << " at offset " << min_vo << endl; +#endif + } else { + // No groups overlapped any window within the range, so don't iterate anything. + +#ifdef debug + cerr << "No windows occupied; range is empty" << endl; +#endif + + return; + } + + // This will hold bins that actually have vectors in the index, as + // iterators to them in the index. + vector used_bins; + + // Loop over the bins we have to deal with + used_bins_of_range(min_node, max_node, [&](bin_t bin_number) -> bool { + // All bins we get should be nonempty + auto found = bin_to_ranges.find(bin_number); + assert(found != bin_to_ranges.end()); + used_bins.push_back(found); + // TODO: Is there a way we can just process all the bins one at a time, + // instead of merging across them? + return true; + }); + + // Define a cursor type within a bin. + // A cursor has an iterator to a pair of start and past-end VOs that occur in a bin. + // But we also need to be able to look at the cursor and know if it is done + // So we also keep the index in used_bins that it belongs to + using bin_cursor_t = pair>::const_iterator, size_t>; + + // As we iterate we will be interested in the smallest cursor (i.e. the cursor to the earliest-starting run not already iterated.) + // Define a way to get that + // Since the queue puts the "greatest" element at the top, our "less" is really a "greater" to turn it around. + auto greater_for_cursors = [&](const bin_cursor_t& a, const bin_cursor_t& b) { + return a.first->first > b.first->first; + }; + + // We want a priority queue of cursors, so we can easily find the next one to use + priority_queue, decltype(greater_for_cursors)> cursor_queue(greater_for_cursors); + + // Set up a cursor in each bin + // TODO: Could we do one cursor per specificity level instead? 
That would be faster. + for (size_t i = 0; i < used_bins.size(); i++) { + // Look at the start of the bin + bin_cursor_t cursor = make_pair(used_bins[i]->second.begin(), i); + + while(cursor.first != used_bins[cursor.second]->second.end() && cursor.first->second < min_vo) { + // Skip any runs that end before the window VO + cursor.first++; + } + + if (cursor.first != used_bins[cursor.second]->second.end()) { + // If there are still runs in the bin, use them. + cursor_queue.push(cursor); + } + +#ifdef debug + cerr << "Bin " << used_bins[i]->first << " overlaps the query and has runs after the window min VO" << endl; +#endif + } + + bool keep_going = true; + + while (keep_going && !cursor_queue.empty()) { + // Loop until the user asks us to stop or we run out of things to give them. + +#ifdef debug + cerr << "Find earliest-starting run we haven't used yet in any bin" << endl; +#endif + + // Pull off the top element + bin_cursor_t top = cursor_queue.top(); + cursor_queue.pop(); + + // The bin windows are proper runs, so they can't overlap. + // So after we deal with this run, we won't have to adjust any other bin cursors. + +#ifdef debug + cerr << "Found run " << top.first->first << "-" << top.first->second << endl; +#endif + + // Call the callback with the range max(min_vo from the window, that run's start) to that run's end. + keep_going &= scan_callback(max(min_vo, top.first->first), top.first->second); + + if (!keep_going) { + // The user is done with runs. They must have found a group that has an out-of-range minimum node ID. + // We are done! + return; + } + + // Move up the min VO to the past the end of the run we just did. + // TODO: We shouldn't need to do this since the runs can't overlap + min_vo = top.first->second; + + // Advance what was the top iterator + top.first++; + if (top.first != used_bins[top.second]->second.end()) { + // We haven't yet hit the end of this bin, so the cursor is eligible to be used again + cursor_queue.push(top); + } else { +#ifdef debug + cerr << "Found bin index " << top.second << " is exhausted" << endl; +#endif + } + } +} + +auto StreamIndexBase::scan_backward(const function scan_callback) const -> void { + // Remember the previous range's start VO, to be the next range's past-end VO. + int64_t prev_vo = numeric_limits::max(); + + for(auto rit = window_to_start.rbegin(); rit != window_to_start.rend(); ++rit) { + + // Go over the window offsets we have stored in reverse order. + // We can use them as handy valid pointers to groups that are easy to go over in reverse order. + + if (!scan_callback(rit->second, prev_vo)) { + // The iteratee is done + return; + } + + // Remember the start VO to be the next end + prev_vo = rit->second; + } +} + +/// Return true if the given ID is in any of the sorted, coalesced, inclusive ranges in the vector, and false otherwise. +/// TODO: Is repeated binary search on the ranges going to be better than an unordered_set of all the individual IDs? 
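// (Editorial illustration, not in the original source: with ranges
// {{1,5},{10,12}} a query for 11 probes {10,12} first and returns true
// immediately, while a query for 7 probes {10,12}, then {1,5}, and
// returns false.)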
+auto StreamIndexBase::is_in_range(const vector>& ranges, id_t id) -> bool { + // Use a binary search + size_t left = 0; + size_t past_right = ranges.size(); + + while (past_right >= left + 1) { + // We have a nonempty interval + + // Find the middle + size_t center = (left + past_right) / 2; + assert(center < ranges.size()); + + // Look at the range there + auto& range = ranges[center]; + + if (id < range.first) { + // If we're before it, go left + past_right = center; + } else if (id > range.second) { + // If we're after it, go right + left = center + 1; + } else { + // If we're in it, return true + return true; + } + } + + // If we get here, it wasn't in any range + return false; + +} + +auto StreamIndexBase::save(ostream& to) const -> void { + // We aren't going to save as Protobuf messages; we're going to save as a bunch of varints. + + // Format is + // Magic bytes + // Index version (varint32) + // Bin count (varint64) + // For each bin: + // Bin number (varint64) + // Run count (varint64) + // For each run: + // Start (varint64) + // Past-end (varint64) + // And then window count (varint64) + // And for each window: + // Window number (varint64) + // Window start (varint64) + + // All the integers are Protobuf variable-length values. + // The result is gzip-compressed. + + ::google::protobuf::io::OstreamOutputStream raw_out(&to); + ::google::protobuf::io::GzipOutputStream gzip_out(&raw_out); + ::google::protobuf::io::CodedOutputStream coded_out(&gzip_out); + + // Save the magic bytes + coded_out.WriteRaw((void*)MAGIC_BYTES.c_str(), MAGIC_BYTES.size()); + + // Save the version + coded_out.WriteVarint32(OUTPUT_VERSION); + + // Save the bin count + coded_out.WriteVarint64(bin_to_ranges.size()); + for (auto& kv : bin_to_ranges) { + // For each bin, save the number + coded_out.WriteVarint64(kv.first); + // And the number of runs + coded_out.WriteVarint64(kv.second.size()); + + for (auto& run : kv.second) { + // For each run, write the VO range + coded_out.WriteVarint64(run.first); + coded_out.WriteVarint64(run.second); + } + } + + // Save the window count + coded_out.WriteVarint64(window_to_start.size()); + for (auto& kv : window_to_start) { + // Save each window's number and start + coded_out.WriteVarint64(kv.first); + coded_out.WriteVarint64(kv.second); + } + +} + +auto StreamIndexBase::load(istream& from) -> void { + + ::google::protobuf::io::IstreamInputStream raw_in(&from); + ::google::protobuf::io::GzipInputStream gzip_in(&raw_in); + + + bin_to_ranges.clear(); + window_to_start.clear(); + + // Define an error handling function + auto handle = [](bool ok) { + if (!ok) throw std::runtime_error("GAMIndex::load detected corrupt index file"); + }; + + // Look for the magic value + + // First read a bit of data + char* buffer; + int buffer_size = 0; + while (buffer_size == 0) { + // We must retry until we get some data, accoridng to the ZeroCopyInputStream spec + handle(gzip_in.Next((const void**)&buffer, &buffer_size)); + } + + // TODO: In theory, we might have arbitrarily small buffers given to us. + // We assume that the buffers are always big enough to actually peek the magic value and back up. + assert(buffer_size >= MAGIC_BYTES.size()); + + // We will fill this in with the version if we find it + uint32_t input_version = 0; + + // Check to see if the magic bytes are there + if (std::equal(MAGIC_BYTES.begin(), MAGIC_BYTES.end(), buffer)) { + // We found the magic bytes! We know this is a versioned GAM index file. 
+ + // Roll back to just after them + gzip_in.BackUp(buffer_size - MAGIC_BYTES.size()); + + // Read the input version + { + ::google::protobuf::io::CodedInputStream coded_in(&gzip_in); + handle(coded_in.ReadVarint32(&input_version)); + } + } else { + // No magic bytes means input version 0 + // Roll back everything + gzip_in.BackUp(buffer_size); + } + + if (input_version > MAX_INPUT_VERSION) { + throw std::runtime_error("GAMIndex::load can understand only up to index version " + to_string(MAX_INPUT_VERSION) + + " and file is version " + to_string(input_version)); + } + + switch (input_version) { + case 0: + case 1: + // Read the number of bins that are used + uint64_t bin_count; + { + // TODO: To avoid hitting the coded input stream's byte limit (why is + // it even at this level?) we destory and recreate it for every + // semantic group. + ::google::protobuf::io::CodedInputStream coded_in(&gzip_in); + handle(coded_in.ReadVarint64(&bin_count)); + } + + for (size_t i = 0; i < bin_count; i++) { + // Read the bin number and run count for each bin + uint64_t bin_number; + uint64_t run_count; + { + ::google::protobuf::io::CodedInputStream coded_in(&gzip_in); + handle(coded_in.ReadVarint64(&bin_number)); + handle(coded_in.ReadVarint64(&run_count)); + } + + // Create the empty bin + auto& runs = bin_to_ranges[bin_number]; + + // Remember it in the bins by ID prefix index + bins_by_id_prefix.insert(bin_to_prefix(bin_number), bin_number); + + for (size_t j = 0; j < run_count; j++) { + // Load each run + uint64_t run_start; + uint64_t run_end; + + { + ::google::protobuf::io::CodedInputStream coded_in(&gzip_in); + handle(coded_in.ReadVarint64(&run_start)); + handle(coded_in.ReadVarint64(&run_end)); + } + + runs.emplace_back(run_start, run_end); + + } + + } + + + // Now count the number of windows + uint64_t window_count; + { + ::google::protobuf::io::CodedInputStream coded_in(&gzip_in); + handle(coded_in.ReadVarint64(&window_count)); + } + + for (size_t i = 0; i < window_count; i++) { + // Load each window + uint64_t window_number; + uint64_t window_start; + + { + ::google::protobuf::io::CodedInputStream coded_in(&gzip_in); + handle(coded_in.ReadVarint64(&window_number)); + handle(coded_in.ReadVarint64(&window_start)); + } + + window_to_start[window_number] = window_start; + } + break; + default: + throw std::runtime_error("Unimplemented GAM index version " + to_string(input_version)); + } + +} + +} diff --git a/src/stream_index.hpp b/src/stream_index.hpp new file mode 100644 index 00000000000..32b6255a412 --- /dev/null +++ b/src/stream_index.hpp @@ -0,0 +1,945 @@ +#ifndef VG_STREAM_INDEX_HPP_INCLUDED +#define VG_STREAM_INDEX_HPP_INCLUDED + +/** + * \file stream_index.hpp + * Contains the StreamIndex template, which allows lookup by relevant node ID in sorted VPKG-formatted files. + */ + +#include +#include +#include +#include +#include + +#include "types.hpp" +#include +#include +#include "scanner.hpp" + +namespace vg { + +using namespace std; + +// For efficiently storing the bins that are populated in the relatively sparse +// bin space that the index uses, we use a compact prefix trie (radix tree) on +// the node ID bit string prefixes that correspond to the bins. + +// To make that work we in turn need a bit string type that is easily +// convertible to/from 64-bit numbers. + +/** + * Represents a string of up to 64 bits. 
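 * (Editorial illustration, not part of the original header: BitString(0b101, 3)
 * represents the three bits "101"; to_number() gives back 5, drop_prefix(1)
 * yields the two-bit string "01", and split(1) returns the pair ("1", "01").
 * Bits are stored left-aligned in the 64-bit word, so equality and prefix
 * comparisons are plain integer operations.)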
+ */ +class BitString { +public: + + /// Make a new BitString representing the low length bits of the given number + BitString(uint64_t bits, size_t length); + + /// Make an empty BitString + BitString(); + + // Copyable and movable + BitString& operator=(const BitString& other) = default; + BitString& operator=(BitString&& other) = default; + BitString(const BitString& other) = default; + BitString(BitString&& other) = default; + + /// Convert the BitString back to a number + uint64_t to_number() const; + + /// Get a suffix of a BitString by dropping the specified number of bits. + /// If we drop all the bits or more, get an empty BitString. + BitString drop_prefix(size_t prefix_length) const; + + /// Split into a prefix of the given length and a suffix of the rest + pair split(size_t prefix_length) const; + + /// Determine if two BitStrings are equal + bool operator==(const BitString& other) const; + + /// Determine if two BitStrings are unequal + bool operator!=(const BitString& other) const; + + /// Get the length of the longest common prefix (index of the first + /// mismatching bit) between this BitString and another. + size_t common_prefix_length(const BitString& other) const; + + /// Return true if one BitString is a prefix of the other, or if this BitString has the 0 at the first differing bit. + bool at_or_before(const BitString& other) const; + + /// Return true if one BitString is a prefix of the other, or if the other BitString has the 0 at the first differing bit. + bool at_or_after(const BitString& other) const; + + /// Peek at the top bit and see if it is a 1 (true) or 0 (false). + /// Empty bit strings get false. + bool peek() const; + + /// Get the length of the BitString + size_t length() const; + + /// Return true if the bit string is empty + bool empty() const; + +protected: + /// Holds the actual bits of the bit string. + /// We store the bits aligned to the left to make finding the first mismatch easier. + /// All bits below the last used bit are 0. + uint64_t bits; + + /// Holds the number of bits that are used. + uint8_t bit_length; + + /// How many total bits are possible? + const static size_t TOTAL_BITS = numeric_limits::digits; +}; + +/// Allow BitStrings to be printed for debugging +ostream& operator<<(ostream& out, const BitString& bs); + +/** + * Represents a radix tree keyed by/internally using BitStrings. + * Each item has a BitString as a key, and items are stored in a trie/prefix tree. + * Each node has at most one item and at most two children. + * Movable but not copyable. + * TODO: Implement copy. + */ +template +class BitStringTree { +public: + + /// Insert the given item under the given key + void insert(const BitString& key, const Item& value); + + /// Enumerate items whose keys match prefixes of the given key, in order from most specific to least specific. + /// Returns false if stopped early. + bool traverse_up(const BitString& key, const function& iteratee) const; + + /// Enumerate items which have the given low and high bit strings as a + /// prefix, or which fall between them, as an in-order traversal of the + /// tree. Returns false if stopped early. 
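    /// (Editorial illustration, hypothetical usage rather than text from the
    /// patch: a BitStringTree<int> mapping ID-prefix keys to bin numbers could
    /// be exercised as
    ///   BitStringTree<int> tree;
    ///   tree.insert(BitString(0b10, 2), 5);
    ///   tree.traverse_up(BitString(0b1011, 4), [](const int& bin) {
    ///       return true; // visited with bin == 5, since "10" prefixes "1011"
    ///   });
    /// )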
+ bool traverse_in_order(const BitString& low, const BitString& high, const function& iteratee) const; + +protected: + struct TreeNode { + /// Each TreeNode represents a prefix over its parent + BitString prefix; + /// Each TreeNode holds a 0 child and a 1 child, at most + /// Their prefixes say what full prefixes they actually correspond to + unique_ptr children[2]; + /// Each TreeNode also may hold an item record + Item content; + /// This is set if we actually have one + bool has_content = false; + + /// Insert the given item at or under this node. + /// Its key must have had our prefix already removed from it. + void insert(const BitString& key, const Item& value); + + /// Search down to the node that corresponds to the given key, or where + /// it would be. Call the iteratee for that node and every parent, from + /// bottom to top, until the iteratee returns false. If not found, do + /// not call the iteratee. Retruns true if the iteratee did not ask to + /// stop, and false otherwise. The key will have already had this + /// node's prefix removed. + bool traverse_up(const BitString& key, const function& iteratee) const; + + /// Iterate over elements in the tree with an in-order traversal + /// between the two given keys, inclusive. Low and high have already + /// had this node's prefix removed. + bool traverse_in_order(const BitString& low, const BitString& high, const function& iteratee) const; + + // Note that depth is bounded so we don't need recursion-breaking in our destructor + + }; + + /// The root node has an empty prefix + TreeNode root; + +}; + +/** + * An index for a node-ID-sorted VPKG-formatted Protobuf file, such as GAM or VG. + * + * Works on a BAI-like concept of bins partitioning node ID space. + * + * Files are serialized as count-prefixed groups of Protobuf messages. These + * groups are the smallest unit that can be deserialized. + * + * Every *group* of messages gets assigned to a bin which is the longest bin + * that completely contains the ID range used in the group. + * + * We define *runs* of adjacent groups which have the same bin, which are the + * basic subject of the index. + * + * We then store an index from bin to the virtual offset ranges (start and + * past-the-end), in order, of runs that are assigned to the bin. + * + * You will get non-contiguous virtual offset ranges for a node ID range when + * some messages run into the range from the left, then messages that start + * later don't, and then messages that start even later do again. + * + * We also have a BAI-style linear index, mapping from tiling windows in node + * ID space to the lowest virtual offset of a group that overlaps the window. + * + * The bin structure is that we partition all of node ID space into bins of + * power-of-2 size, starting with size 2 nodes. We number the bins such that 0 + * is the whole-ID-space bin, divided into 1 and 2, then into 3, 4, 5, and 6, + * then into 7, 8, 9, 10, 11, 12, 13, and 14, and so on. + * + * The tiling windows are just the node IDs down-shifted by a few bits. + * + * Messages that use no nodes (i.e. unmapped reads) are considered to visit + * node ID 0. The maximum and minimum id_t values are used as sentinels, so + * they can't be real nodes. + * + * All find operations are thread-safe with respect to each other. Simultaneous + * adds or finds and adds are prohibited. + * + * Most of the basic index API doesn't depend on the message type. 
So we put it + * in this base class and inherit form it in templates that provide the + * high-level interface in terms of message instances. + */ +class StreamIndexBase { +public: + StreamIndexBase() = default; + + // Bins are identified of unsigned integers of the same width as node IDs. + using bin_t = make_unsigned::type; + + // So are windows, but we give them their own semantic type + using window_t = make_unsigned::type; + + /// Load an index from a file. + /// File holds the index, not the actual data being indexed. + /// Index file format doesn't care what type of message is being indexed. + void load(istream& from); + + /// Save an index to a file. + void save(ostream& to) const; + + // Like the XG we support versioning. + + /// What's the maximum index version number we can read with this code? + const static uint32_t MAX_INPUT_VERSION = 1; + /// What's the version we serialize? + const static uint32_t OUTPUT_VERSION = 1; + /// What magic value do we embed in the compressed index data? + /// TODO: Make this depend on type of message being indexed so we can't mix up index files. + const static string MAGIC_BYTES; + + /////////////////// + // Lower-level virtual-offset-based interface + /////////////////// + + // Note that retrieving all the runs overlapping a node ID or node ID range + // isn't possible. We can use the index to look up addresses to start at, + // but the only way to know when to stop scanning groups is when you find a + // group in the file with a minimum node ID that is too large. Then you + // know to jump to the next start address. + + /// Find all the ranges of run virtual offsets from the first position that + /// might be relevant for the given node ID to the ends of all the bins it + /// is in. Trims ranges by the linear index on the low end, and returns a + /// series of potentially abutting but non-overlapping virtual offset + /// ranges. Does not stop early (because it has no access to the actual + /// reads to tell when it should stop looking at runs in a bin). So you + /// will get ranges covering all runs in a bin that follow the runs you are + /// interested in as well. + vector> find(id_t node_id) const; + + /// Find all the ranges of run virtual offsets to check for reads visiting + /// the given node ID. Relies on a scanning callback, which will be called + /// repeatedly with the start and past-the-end virtual offsets of runs + /// which may contain groups touching the given node ID. When called, the + /// callback should scan the run and return either true if it wants the + /// next run, or false if it encountered a group with an out-of-range start + /// and wants to stop iteration. Runs will be emitted in order, and + /// truncated on the left to either the appropriate lower bound from the + /// linear index, or the past-the-end of the previous run scanned (which + /// should be moot, because runs should not overlap in the index). + void find(id_t node_id, const function scan_callback) const; + + /// Find all the ranges of run virtual offsets to check for reads visiting + /// the given inclusive node ID range. Relies on a scanning callback, which + /// will be called repeatedly with the start and past-the-end virtual + /// offsets of runs which may contain groups touching the given node ID. + /// When called, the callback should scan the run and return either true if + /// it wants the next run, or false if it encountered a group with an + /// out-of-range start and wants to stop iteration. 
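    /// (Editorial illustration, assumed usage rather than text from the patch:
    ///   index.find(100, 200, [&](int64_t start_vo, int64_t past_end_vo) {
    ///       // scan groups in [start_vo, past_end_vo) with a seekable cursor
    ///       return true; // ask for the next candidate run
    ///   });
    /// where 100-200 is a hypothetical inclusive node ID range.)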
Runs will be emitted in + /// order, and truncated on the left to either the appropriate lower bound + /// from the linear index, or the past-the-end of the previous run scanned. + void find(id_t min_node, id_t max_node, const function scan_callback) const; + + /// Iterate over ranges of virtual offsets from the end of the file to the + /// start. The ranges to *not* necessarily correspond to runs. The ending + /// VO of the first range iterated may be numeric_limits::max(). + /// The start VO of each range is guaranteed to be a valid VO of a group. + /// The past-end VO may be the start of the previously iterated range. + /// We have no scan_forward because you can just do that with a cursor. + /// Stops when the callback returns false. + void scan_backward(const function scan_callback) const; + + /// Add a group into the index, based on its minimum and maximum + /// (inclusive) used node IDs. Must be called for all groups in virtual + /// offset order. + void add_group(id_t min_id, id_t max_id, int64_t virtual_start, int64_t virtual_past_end); + + /////////////////// + // Lowest-level functions for thinking about bins and windows. + /////////////////// + + /// Get the ID prefix bits corresponding to a bin + static BitString bin_to_prefix(bin_t bin); + + /// Get the given ID as a bit string + static BitString id_to_prefix(id_t id); + + /// Get the most specific bin that contains both of the given node IDs. + static bin_t common_bin(id_t a, id_t b); + + /// Get the linear index window that the given node ID falls in. The window + /// range for a group is its min nodes' window through its max node's + /// window. + static window_t window_of_id(id_t id); + + /// Iterate over the *populated* bins in the index, in in-order bin tree + /// traversal order, that any of the node IDs in the given inclusive range + /// occur in. Returns false if asked to stop. + bool used_bins_of_range(id_t min_id, id_t max_id, const function& iteratee) const; + + +protected: + // How many bits of a node ID do we truncate to get its linear index window? + const static size_t WINDOW_SHIFT = 8; + + /// Maps from bin number to all the ranges of virtual offsets, in order, for runs that land in the given bin. + /// A run lands in a bin if that bin is the most specific bin that includes both its lowest and highest nodes it uses. + unordered_map>> bin_to_ranges; + + /// Maps from the bit string representing the prefix of node IDs that a bin + /// matches to the bin's bin number. Only contains entries for nonempty + /// bins. + BitStringTree bins_by_id_prefix; + + /// Maps from linear index window to the virtual offset of the first group + /// that overlaps that window (taking the group as a min-to-max node + /// range). If you are looking for reads that visit a node, they can't + /// possibly occur in a group before the first offset stored for the node's + /// window (or any greater window). TODO: Should we make this a vector + /// instead and hope nobody uses high/sparse node IDs? + map window_to_start; + + /// What was the minimum node ID of the last group added? + /// If this isn't strictly increasing, we're trying to index data that is not sorted. + id_t last_group_min_id = numeric_limits::min(); + + /// Return true if the given ID is in any of the sorted, coalesced, inclusive ranges in the vector, and false otherwise. + /// TODO: Is repeated binary search on the ranges going to be better than an unordered_set of all the individual IDs? 
+ static bool is_in_range(const vector>& ranges, id_t id); + +private: + // Not copyable because we contain pointers. + StreamIndexBase(const StreamIndexBase& other) = delete; + StreamIndexBase& operator=(const StreamIndexBase& other) = delete; + +}; + +/** An index that provides a higher-level API in terms of the actual messages + * being indexed. This is the main entry point for users in most cases. + * + * All find operations are thread-safe with respect to each other. Simultaneous + * adds or finds and adds are prohibited. + * + */ +template +class StreamIndex : public StreamIndexBase { +public: + StreamIndex() = default; + + // Methods that actually go get messages for you are going to need a cursor on an open, seekable data file. + using cursor_t = vg::io::ProtobufIterator; + + /////////////////// + // Top-level message-based interface + /////////////////// + + /// Call the given callback with all messages in the index that visit the given node. + void find(cursor_t& cursor, id_t node_id, const function handle_result) const; + + /// Call the given callback with all messages in the index that visit a node in the given inclusive range. + void find(cursor_t& cursor, id_t min_node, id_t max_node, const function handle_result) const; + + /// Call the given callback with all the messages in the index that visit + /// a node in any of the given sorted, coalesced inclusive ranges. + /// Emits each message at most once. + /// If only_fully_contained is set, only messages where *all* the involved nodes are in one of the ranges will match. + void find(cursor_t& cursor, const vector>& ranges, const function handle_result, + bool only_fully_contained = false) const; + + /// Given a cursor at the beginning of a sorted, readable file, index the file. + void index(cursor_t& cursor); + + /// Add a group articulated as a vector of messages, between the given virtual offsets. + /// Must be called in virtual offset order for successive groups. + void add_group(const vector& msgs, int64_t virtual_start, int64_t virtual_past_end); + + // Unhide overloads from the base + using StreamIndexBase::find; + using StreamIndexBase::add_group; + +protected: + + /// Call the given iteratee for each node ID relevant to the given message. + /// IDs may repeat. + /// If the iteratee returns false, stop iteration. + /// Calls the iteratee with 0 only if there are no relevant node IDs *or* the message is relevant to queries for unplaced content. + void for_each_id(const Message& msg, const function iteratee) const; + +}; + +/// Define a GAM index as a stream index over a stream of Alignments +using GAMIndex = StreamIndex; + + +//////////// +// Template Implementations +//////////// + +template +void BitStringTree::insert(const BitString& key, const Item& value) { + root.insert(key, value); +} + +template +bool BitStringTree::traverse_up(const BitString& key, const function& iteratee) const { + return root.traverse_up(key, iteratee); +} + +template +bool BitStringTree::traverse_in_order(const BitString& low, const BitString& high, const function& iteratee) const { + return root.traverse_in_order(low, high, iteratee); +} + +template +void BitStringTree::TreeNode::insert(const BitString& key, const Item& value) { + +#ifdef debug + cerr << "Inserting key " << key << " under " << this << " with value " << value << endl; +#endif + + if (key.empty()) { + // It goes here. 
+ // We can only take one item +#ifdef debug + cerr << "Item belongs here" << endl; +#endif + assert(!has_content); + content = value; + has_content = true; + } else { + // Get the first bit of its prefix + bool lead_bit = key.peek(); + + if (!children[lead_bit]) { + // We need to make a new child to hold this item + children[lead_bit] = unique_ptr(new TreeNode()); + // Populate it + children[lead_bit]->prefix = key; + children[lead_bit]->content = value; + children[lead_bit]->has_content = true; +#ifdef debug + cerr << "Item belongs in new child " << children[lead_bit].get() << " with lead bit " + << lead_bit << " and prefix " << children[lead_bit]->prefix << endl; +#endif + } else { + // We already have a child in this direction. + +#ifdef debug + cerr << "Item belongs on branch with existing child " << children[lead_bit].get() << " with lead bit " + << lead_bit << " and prefix " << children[lead_bit]->prefix << endl; +#endif + + // See where the key diverges from our child's key + auto breakpoint = children[lead_bit]->prefix.common_prefix_length(key); + + if (breakpoint >= children[lead_bit]->prefix.length()) { + // This key belongs inside the child node we have + +#ifdef debug + cerr << "Key " << key << " is a prefix of " << children[lead_bit]->prefix << " so item lives at or under child" << endl; +#endif + + // Insert recursively + children[lead_bit]->insert(key.drop_prefix(breakpoint), value); + } else { + // The item to be added diverges somewhere along the branch to the child. + // And it isn't at the 0th bit because we organized our children by bit 0. + +#ifdef debug + cerr << "Key " << key << " matches " << children[lead_bit]->prefix << " up through " << breakpoint << endl; +#endif + + // Break up the child's key + auto prefix_parts = children[lead_bit]->prefix.split(breakpoint); + + // Create a new node to sit at the split point and wire it in + unique_ptr new_child(new TreeNode()); + new_child->prefix = prefix_parts.first; + children[lead_bit]->prefix = prefix_parts.second; + new_child->children[prefix_parts.second.peek()] = move(children[lead_bit]); + children[lead_bit] = move(new_child); + +#ifdef debug + cerr << "Added new node " << children[lead_bit].get() << " with shared prefix " << children[lead_bit]->prefix + << " and make its " << prefix_parts.second.peek() << " child the old child with prefix " << prefix_parts.second << endl; +#endif + + // Recursively insert either at the breakpoint node or under it in the other child slot + children[lead_bit]->insert(key.drop_prefix(breakpoint), value); + } + } + } +} + +template +bool BitStringTree::TreeNode::traverse_up(const BitString& key, const function& iteratee) const { + if (key.empty()) { + // We are the item being sought + if (has_content) { + // We actually have an item, so send it + return iteratee(content); + } else { + // No item so it can't ask to stop + return true; + } + } else { + // The item must belong to a child slot + // Get the first bit of its prefix + bool lead_bit = key.peek(); + + if (children[lead_bit]) { + // We have a child that would be responsible for the key + + // But how long is the match + auto breakpoint = children[lead_bit]->prefix.common_prefix_length(key); + + if (breakpoint >= children[lead_bit]->prefix.length()) { + // This key belongs inside the child. 
+ // Search recursively + if (children[lead_bit]->traverse_up(key.drop_prefix(breakpoint), iteratee)) { + // The child returned true + if (has_content) { + // We also have an item, so send it + return iteratee(content); + } else { + // We have no item, so just pass up the true + return true; + } + } else { + // The iteratee has already stopped. + return false; + } + } else { + // The key branches off before the child. + // So we have to process ourselves as the bottom node + if (has_content) { + // We actually have an item, so send it + return iteratee(content); + } else { + // No item so it can't ask to stop + return true; + } + } + } else { + // No child exists that is responsible for the key. + // We have no results under us, so just do us. + if (has_content) { + // We have an item, so send it + return iteratee(content); + } else { + // We have no item, so just pass up the true + return true; + } + } + } +} + +template +bool BitStringTree::TreeNode::traverse_in_order(const BitString& low, const BitString& high, + const function& iteratee) const { + + // This is actually pretty easy. We know we're in the range. We just need + // to figure out if our children are in the range, and if so, call them. + + // We use special bit string comparison, so our bounded range is all the + // stuff that we don't have bits showing it comes before the low or after + // the high. + +#ifdef debug + cerr << "Arrived at node " << this << " with range " << low << " - " << high << endl; +#endif + + /// Define a function to process each child. + /// Returns false if we stop early + auto do_child = [&](bool child_index) -> bool { + // Grab a reference to the child's unique_ptr + auto& child = children[child_index]; + + if (child) { + // The child exists. Grab its prefix + auto& child_prefix = child->prefix; + +#ifdef debug + cerr << "Child " << child.get() << " has prefix " << child_prefix << endl; +#endif + + if (low.at_or_before(child_prefix) && high.at_or_after(child_prefix)) { + // The child is in the range. + + // But we need to work out which of the range bounds are + // outside the region which the child is responsible for and + // not pass them on. + BitString child_low = child_prefix.at_or_before(low) ? low.drop_prefix(child_prefix.length()) : BitString(); + BitString child_high = child_prefix.at_or_after(high) ? high.drop_prefix(child_prefix.length()) : BitString(); + + return child->traverse_in_order(child_low, child_high, iteratee); + } + } + + // If we get here we didn't need to visit the child at all. + return true; + }; + + // Do the left child, then us if we have an item, then the right child. 
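    // (Editorial note, not in the original source: the middle term in the
    // return below evaluates to false only when this node has an item and the
    // iteratee returned false for it; an item-less node never stops the
    // traversal.)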
+ return do_child(false) && ((has_content && iteratee(content)) || !has_content) && do_child(true); +} + +template +auto StreamIndex::find(cursor_t& cursor, id_t min_node, id_t max_node, + const function handle_result) const -> void { + + find(cursor, vector>{{min_node, max_node}}, handle_result); + +} + +template +auto StreamIndex::find(cursor_t& cursor, id_t node_id, const function handle_result) const -> void { + find(cursor, node_id, node_id, std::move(handle_result)); +} + +template +auto StreamIndex::find(cursor_t& cursor, const vector>& ranges, + const function handle_result, bool only_fully_contained) const -> void { + +#ifdef debug + cerr << "Begin a find query on ranges:" << endl; + for (auto& range : ranges) { + cerr << "\t" << range.first << "-" << range.second << endl; + } +#endif + + // We need seek support + assert(cursor.tell_group() != -1); + + // Because a node in a later range may appear earlier in the file than a + // node in an earlier range (but in a high-in-the-hierarchy bin), in + // general we need to jump around in the file. TODO: Use a processed_up_to + // counter to constrain us to one sweep in the only_fully_contained case. + + // To prevent us from scanning groups multiple times over, we keep a map + // from already-processed group start VO to the VO of the next group (or + // EOF). We can ride down chains in this map whenever we hit somewhere we + // have already been, instead of actually re-reading anything. + unordered_map next_unprocessed; + + // We access it with this accessor function. It returns the given address + // if the group there has not been read, or the next unprocessed VO (or EOF + // VO) if it has. + auto get_next_unprocessed = [&](int64_t currently_at) { + // If we have to chain through multiple VOs to find the final one, we store them here. + vector chain; + +#ifdef debug + cerr << "Find next unprocessed group after " << currently_at << endl; +#endif + + auto found = next_unprocessed.find(currently_at); + while(found != next_unprocessed.end()) { + // We have a place to go. + + // Remember this place as a place that needs to go to the final place we find. + chain.push_back(currently_at); + +#ifdef debug + cerr << currently_at << " chains to " << found->second << endl; +#endif + + // Advance to the place we found. + currently_at = found->second; + found = next_unprocessed.find(currently_at); + } + + // Now we hit the end. 
Save the final answer back to the map for + // everything but the last item, so we never need to scan it again + for (size_t i = 0; i + 1 < chain.size(); i++) { + next_unprocessed[chain[i]] = currently_at; + } + +#ifdef debug + cerr << "It is " << currently_at << endl; +#endif + + return currently_at; + }; + + // And this accessor marks a group as processed + auto mark_processed = [&](int64_t start_vo, int64_t past_end_vo) { + +#ifdef debug + cerr << "Mark group " << start_vo << " to " << past_end_vo << " as processed" << endl; +#endif + + next_unprocessed[start_vo] = past_end_vo; + }; + + for (auto& range : ranges) { + // For each range of IDs to look up + +#ifdef debug + cerr << "Look up range " << range.first << "-" << range.second << endl; +#endif + + find(range.first, range.second, [&](int64_t start_vo, int64_t past_end_vo) -> bool { + // For each matching range of virtual offsets in the index + +#ifdef debug + cerr << "Look at VOs " << start_vo << "-" << past_end_vo << endl; +#endif + + // Warp the start past any already-processed groups we know about + start_vo = get_next_unprocessed(start_vo); + if (start_vo >= past_end_vo) { + // Skip this whole range and look at the next one + +#ifdef debug + cerr << "The VO range has already been processed." << endl; +#endif + + return true; + } + + // Now the range starts with a group we have never seen before. + + // Seek the cursor, even if we are already at the group in question. + // TODO: We don't have a good way to tell if we are at the beginning of a group or not. + +#ifdef debug + cerr << "Seek cursor to " << start_vo << endl; +#endif + + cursor.seek_group(start_vo); + + // We need to track each group we encounter, so we can tell when an + // entire group is past the top end of the ID range we are + // currently looking up. + int64_t group_vo = cursor.tell_group(); + id_t group_min_id = numeric_limits::max(); + while (cursor.has_current() && cursor.tell_group() < past_end_vo) { + // Read each message until we find a group that starts out of range + + // Which group is this message in? + auto message_group_vo = cursor.tell_group(); + + if (message_group_vo != group_vo) { + // We finished the previous group. + +#ifdef debug + cerr << "Finished group " << group_vo << endl; +#endif + + // Record the group as processed + mark_processed(group_vo, message_group_vo); + + if (group_min_id != numeric_limits::max() && group_min_id > range.second) { + // Everything in the (non-empty) previous group was too high. We don't care about this group; our iteration is over. + +#ifdef debug + cerr << "Group was out of bounds for its range with min id " << group_min_id << " > " << range.second << endl; + cerr << "Move on to next range" << endl; +#endif + + // Stop early. Don't finish this run and don't look at the next runs for this query range. + return false; + } + + // Otherwise we need to start a new group + group_min_id = numeric_limits::max(); + + // Zip the group VO ahead to the next unprocessed group (which may be here, or at EOF) + group_vo = get_next_unprocessed(message_group_vo); + if (group_vo != message_group_vo) { + // We want to go to a different group next. + if (group_vo >= past_end_vo) { + // But it's out of range for this range. Don't go there. +#ifdef debug + cerr << "Next unprocessed VO is out of range" << endl; +#endif + break; + } else { + // Seek there and restart the loop to see if we found anything good. 
+#ifdef debug + cerr << "Seek to next unprocessed VO at " << group_vo << endl; +#endif + cursor.seek_group(group_vo); + continue; + } + } else { + // Otherwise, we are continuing with this group we just found. +#ifdef debug + cerr << "Next unprocessed VO is right here." << endl; +#endif + } + } + + // Filter the message by the query and yield it if it matches + const auto& message = *cursor; + bool message_match = false; + + for_each_id(message, [&](const id_t& found) { + // For each ID touched by the message + + // Min it in, keeping 0 as the sentinel for no nodes touched. + group_min_id = min(group_min_id, found); + if (is_in_range(ranges, found)) { + // We want this node (or unplaced messages like this one). + message_match = true; + if (!only_fully_contained) { + // All we care about is that any of the nodes match. + // we know enough to keep this message. + return false; + } + } else if (only_fully_contained) { + // We need *all* of the nodes to match, and this one didn't. + message_match = false; + // We know enough to discard this message. + return false; + } + // Keep looking + return true; + }); + + if (message_match) { + // This message is one that matches the query. Yield it. + handle_result(message); + } + + // Look for the next message + cursor.advance(); + + } + + if (group_vo < past_end_vo) { + // We finished a final group, from group_vo to past_end_vo +#ifdef debug + cerr << "Finished last group " << group_vo << endl; +#endif + + // Mark it finished + mark_processed(group_vo, past_end_vo); + + if (group_min_id != numeric_limits::max() && group_min_id > range.second) { + // If the (non-empty) last group had all its node IDs past the max + // node ID, we know nothing after it can possibly match, so stop + // iteration. + +#ifdef debug + cerr << "Group was out of bounds with min id " << group_min_id << " > " << range.second << endl; + cerr << "Move on to next range" << endl; +#endif + + return false; + } + } + + // Otherwise, the last group we looked at was not yet out of bounds, so get another range to look at, if one exists. + return true; + }); + + } +} + +template +auto StreamIndex::index(cursor_t& cursor) -> void { + // Keep track of what group we are in + int64_t group_vo = cursor.tell_group(); + // And load all its messages + vector group; + + // We need to have seek support + assert(group_vo != -1); + + while (cursor.has_current()) { + // For each message + + // Work out what group it is in + int64_t message_group_vo = cursor.tell_group(); + + if (message_group_vo != group_vo) { + // This is the start of a new group + + // Record the old group as being up to here + add_group(group, group_vo, message_group_vo); + + // Set up for the new group + group.clear(); + group_vo = message_group_vo; + } + + // Add the message to the group and move on + group.emplace_back(std::move(cursor.take())); + } + + if (!group.empty()) { + // Record the final group. Use wherever the cursor landed at the end as its final virtual offset. + add_group(group, group_vo, cursor.tell_group()); + } +} + +template +auto StreamIndex::add_group(const vector& msgs, int64_t virtual_start, int64_t virtual_past_end) -> void { + // Find the min and max ID visited by any of the messages + id_t min_id = numeric_limits::max(); + id_t max_id = numeric_limits::min(); + + for (auto& msg : msgs) { + // For each message + for_each_id(msg, [&](const id_t& found) { + // For each ID touched by the message + + // Min and max in the ID, keeping 0 to represent no mappings. 
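            // (Editorial note, not in the original source: a message that
            // touches no nodes reports node ID 0 here, so groups of unplaced
            // messages sort to, and are indexed at, the very start of node ID
            // space, as described in the StreamIndexBase documentation.)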
+ min_id = min(min_id, found); + max_id = max(max_id, found); + + // Don't stop early + return true; + }); + } + + add_group(min_id, max_id, virtual_start, virtual_past_end); +} + +template +auto StreamIndex::for_each_id(const Message& msg, const function iteratee) const -> void { + // Visit all the IDs. + // Zeros will come out if the message is empty (unplaced) or has an unplaced child message. + // Duplicates will come out but that is fine. + IDScanner::scan(msg, iteratee); +} + +} + +#endif + + diff --git a/src/stream_sorter.cpp b/src/stream_sorter.cpp new file mode 100644 index 00000000000..79ed6c97a27 --- /dev/null +++ b/src/stream_sorter.cpp @@ -0,0 +1,14 @@ +#include "stream_sorter.hpp" + +/** + * \file stream_sorter.cpp + * StreamSorter: sort a stream of Protobuf messages in VPKG by node ID. + * Unplaced messages come first. + */ + +namespace vg { + +using namespace std; + +} + diff --git a/src/stream_sorter.hpp b/src/stream_sorter.hpp new file mode 100644 index 00000000000..c45ecbf1854 --- /dev/null +++ b/src/stream_sorter.hpp @@ -0,0 +1,540 @@ +#ifndef VG_STREAM_SORTER_HPP_INCLUDED +#define VG_STREAM_SORTER_HPP_INCLUDED + +#include +#include +#include +#include +#include "types.hpp" +#include "progressive.hpp" +#include "stream_index.hpp" +#include "utility.hpp" +#include "vg/io/json2pb.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/** + * \file stream_sorter.hpp + * VPKG-format file sorting tools. + */ +using namespace std; +namespace vg { + +/** + * We define a PositionScanner that scans any VG Protobuf message that has + * Positions or node IDs in it and emits the Positions as is and the node IDs + * wrapped in Positions. This lets us use one sorting implementation on + * Positions even with things like Graphs where non-Position node IDs are + * important. + */ + +/// Provides the ability to sort a stream of Protobuf Messages, either "dumbly" +/// (in memory), or streaming into temporary files. For Alignments, paired +/// Alignments are not necessarily going to end up next to each other, so if +/// sorting by position make sure to set the position cross-references first if +/// you want to be able to find them. +template +class StreamSorter : public Progressive { +public: + + ////////////////// + // Main entry points + ////////////////// + + /// Create a stream sorter, showing sort progress on standard error if + /// show_progress is true. + StreamSorter(bool show_progress = false); + + /// Sort a stream of VPKG-format Protobuf data, using temporary files, + /// limiting the number of simultaneously open input files and the size of + /// in-memory data. Optionally index the sorted file into the given index. + void stream_sort(istream& stream_in, ostream& stream_out, StreamIndex* index_to = nullptr); + + /// Sort a stream of VPKG-format Protobuf data, loading it all into memory and + /// doing a single giant sort operation. Optionally index the sorted file + /// into the given index. + void easy_sort(istream& stream_in, ostream& stream_out, StreamIndex* index_to = nullptr); + + ////////////////// + // Supporting API + ////////////////// + + /// Sort a vector of messages, in place. + void sort(vector& msgs) const; + + /// Return true if out of Messages a and b, a must come before b, and false otherwise. + bool less_than(const Message& a, const Message& b) const; + + /// Determine the minumum Position visited by an Message. 
The minimum
+    /// Position is the lowest node ID visited by the message, with the
+    /// lowest offset visited on that node ID as the offset, and the
+    /// orientation set to false if the forward strand is visited, and true if
+    /// only the reverse strand is visited.
+    Position get_min_position(const Message& msg) const;
+
+    /// Return true if position A is less than position B in our sort, and false otherwise.
+    /// Position order is defined first by node ID, then by strand (forward first), and then by offset within the strand.
+    /// We can't sort by actual base on the forward strand, because we need to be able to sort without knowing the graph's node lengths.
+    bool less_than(const Position& a, const Position& b) const;
+
+private:
+    /// What's the maximum size of messages in serialized, uncompressed bytes to
+    /// load into memory for a single temp file chunk, during the streaming
+    /// sort?
+    /// For reference, a whole-genome GAM file is about 500 GB of uncompressed data
+    size_t max_buf_size = (512 * 1024 * 1024);
+    /// What's the max fan-in when combining temp files, during the streaming sort?
+    /// This will be computed based on the max file descriptor limit from the OS.
+    size_t max_fan_in;
+
+    using cursor_t = vg::io::ProtobufIterator<Message>;
+    using emitter_t = vg::io::ProtobufEmitter<Message>;
+
+    /// Open all the given input files, keeping the streams and cursors in the given lists.
+    /// We use lists because none of these should be allowed to move after creation.
+    void open_all(const vector<string>& filenames, list<ifstream>& streams, list<cursor_t>& cursors);
+
+    /// Merge all the messages from the given list of cursors into the given emitter.
+    /// The total expected number of messages can be passed for progress bar purposes.
+    void streaming_merge(list<cursor_t>& cursors, emitter_t& emitter, size_t expected_messages = 0);
+
+    /// Merge all the given temp input files into one or more temp output
+    /// files, opening no more than max_fan_in input files at a time. The input
+    /// files, which must be from temp_file::create(), will be deleted.
+    ///
+    /// If messages_per_file is specified, it will be used to show progress bars,
+    /// and will be updated for newly-created files.
+    vector<string> streaming_merge(const vector<string>& temp_names_in, unordered_map<string, size_t>* messages_per_file = nullptr);
+};
+
+using GAMSorter = StreamSorter<Alignment>;
+
+//////////////
+// Template Implementations
+//////////////
+
+template<typename Message>
+StreamSorter<Message>::StreamSorter(bool show_progress) {
+    this->show_progress = show_progress;
+
+    // We would like this many FDs max, if not limited below that.
+    max_fan_in = 2048;
+    // We need at least this many to sort practically.
+    int min_fan_in = 100;
+
+    // We need this many extra FDs not used for fan-in
+    int extra_fds = 10;
+
+    // Work out how many FDs we are allowed
+    struct rlimit fd_limit;
+    if (getrlimit(RLIMIT_NOFILE, &fd_limit) != 0) {
+        // We don't know; choose a conservative default.
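+        // For a rough sense of the arithmetic (illustrative numbers only): on a system
+        // with a hard descriptor limit of 256, the else branch below first raises the
+        // soft limit to 256 and then settles on a fan-in of min(256 - 10, 2048) = 246.
+        // Only when the limit cannot be read at all do we fall back to min_fan_in.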
+ max_fan_in = min_fan_in; + cerr << "warning:[vg::StreamSorter]: Cannot determine file descriptor limits; using " + << max_fan_in << " temp file fan-in" << endl; + } else { + // We read the limit + if (fd_limit.rlim_cur != RLIM_INFINITY && fd_limit.rlim_cur < max_fan_in + extra_fds) { + // Max out our FD limit + fd_limit.rlim_cur = min(max_fan_in + extra_fds, fd_limit.rlim_max); + + if (setrlimit(RLIMIT_NOFILE, &fd_limit) != 0) { + // We asked for a value in bound sso we should have succeeded + throw runtime_error("Error adjusting file descriptor limit to " + to_string(fd_limit.rlim_cur) + + " / " + to_string(fd_limit.rlim_max)); + } + } + + if (fd_limit.rlim_cur != RLIM_INFINITY && fd_limit.rlim_cur < max_fan_in + extra_fds) { + // We need to limit ourselves to under the max FD limit + if (fd_limit.rlim_cur < extra_fds + min_fan_in) { + // If we can't at least do a fan-in of 10 we have a big problem. + cerr << "error:[vg::StreamSorter]: Open file limit very low (" << fd_limit.rlim_cur << "); we need " + << (extra_fds + min_fan_in) << endl; + exit(1); + } + + // Set the max fan in to be subject to the limit + max_fan_in = min((size_t)(fd_limit.rlim_cur - extra_fds), max_fan_in); + } + } +} + +template +void StreamSorter::sort(vector& msgs) const { + std::sort(msgs.begin(), msgs.end(), [&](const Message& a, const Message& b) { + return this->less_than(a, b); + }); +} + +template +void StreamSorter::easy_sort(istream& stream_in, ostream& stream_out, StreamIndex* index_to) { + std::vector sort_buffer; + + vg::io::for_each(stream_in, [&](Message &msg) { + sort_buffer.push_back(msg); + }); + + this->sort(sort_buffer); + + // Maintain our own group buffer at a higher scope than the emitter. + vector group_buffer; + + { + // Make an output emitter + vg::io::ProtobufEmitter emitter(stream_out); + + if (index_to != nullptr) { + emitter.on_message([&](const Message& m) { + // Copy every message that is emitted. + // TODO: Just compute indexing stats instead. + group_buffer.push_back(m); + }); + + emitter.on_group([&](int64_t start_vo, int64_t past_end_vo) { + // On every group, tell the index to record the group stats, and clear the buffer. + index_to->add_group(group_buffer, start_vo, past_end_vo); + group_buffer.clear(); + }); + } + + for (auto& msg : sort_buffer) { + // Feed in all the sorted messages + emitter.write(std::move(msg)); + } + + // Emitter destruction will terminate the file with an EOF marker + } +} + +template +void StreamSorter::stream_sort(istream& stream_in, ostream& stream_out, StreamIndex* index_to) { + + // We want to work out the file size, if we can. + size_t file_size = 0; + { + // Save our position + auto here = stream_in.tellg(); + // Go to the end + stream_in.seekg(0, stream_in.end); + // Get its position + auto there = stream_in.tellg(); + // Go back to where we were + stream_in.seekg(here); + + if (stream_in.good()) { + // We can seek in this stream. So how far until the end? + file_size = there - here; + } else { + // It's entirely possible that none of that worked. So clear the error flags and leave the size at 0. + stream_in.clear(); + } + } + + + // Don't give an actual 0 to the progress code or it will NaN + create_progress("break into sorted chunks", file_size == 0 ? 
1 : file_size); + + // Eventually we put sorted chunks of data in temp files and put their names here + vector outstanding_temp_files; + + // This tracks the number of messages in each file, by file name + unordered_map messages_per_file; + // This tracks the total messages observed on input + size_t total_messages_read = 0; + + // This cursor will read in the input file. + cursor_t input_cursor(stream_in); + + #pragma omp parallel shared(stream_in, input_cursor, outstanding_temp_files, messages_per_file, total_messages_read) + { + + while(true) { + + vector thread_buffer; + + #pragma omp critical (input_cursor) + { + // Each thread fights for the file and the winner takes some data + size_t buffered_message_bytes = 0; + while (input_cursor.has_current() && buffered_message_bytes < max_buf_size) { + // Until we run out of input messages or space, buffer each, recording its size. + thread_buffer.emplace_back(std::move(input_cursor.take())); + // Note that the message has to be small enough for its size to fit in a signed int + buffered_message_bytes += thread_buffer.back().ByteSize(); + } + + // Update the progress bar + update_progress(stream_in.tellg()); + } + + if (thread_buffer.empty()) { + // No data was found + break; + } + + // Do a sort of the data we grabbed + this->sort(thread_buffer); + + // Save it to a temp file. + string temp_name = temp_file::create(); + ofstream temp_stream(temp_name); + // OK to save as one massive group here. + // TODO: This write could also be in a thread. + vg::io::write_buffered(temp_stream, thread_buffer, 0); + + #pragma omp critical (outstanding_temp_files) + { + // Remember the temp file name + outstanding_temp_files.push_back(temp_name); + // Remember the messages in the file, for progress purposes + messages_per_file[temp_name] = thread_buffer.size(); + // Remember how many messages we found in the total + total_messages_read += thread_buffer.size(); + } + } + } + + // Now we know the reader thmessages have taken care of the input, and all the data is in temp files. + + destroy_progress(); + + while (outstanding_temp_files.size() > max_fan_in) { + // We can't merge them all at once, so merge subsets of them. + outstanding_temp_files = streaming_merge(outstanding_temp_files, &messages_per_file); + } + + // Now we can merge (and maybe index) the final layer of the tree. + + // Open up cursors into all the files. + list temp_ifstreams; + list temp_cursors; + open_all(outstanding_temp_files, temp_ifstreams, temp_cursors); + + // Maintain our own group buffer at a higher scope than the emitter. + vector group_buffer; + { + // Make an output emitter + emitter_t emitter(stream_out); + + if (index_to != nullptr) { + emitter.on_message([&index_to,&group_buffer](const Message& m) { + // Copy every message that is emitted. + // TODO: Just compute indexing stats instead. + group_buffer.push_back(m); + }); + + emitter.on_group([&index_to,&group_buffer](int64_t start_vo, int64_t past_end_vo) { + // On every group, tell the index to record the group stats, and clear the buffer. 
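+                // As in easy_sort() above, the assumption is that on_message has already
+                // buffered every message of this group, and on_group hands us the group's
+                // start and past-end virtual offsets once the group has been written out.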
+ index_to->add_group(group_buffer, start_vo, past_end_vo); + group_buffer.clear(); + }); + } + + // Merge the cursors into the emitter + streaming_merge(temp_cursors, emitter, total_messages_read); + + } + + // Clean up + temp_cursors.clear(); + temp_ifstreams.clear(); + for (auto& filename : outstanding_temp_files) { + temp_file::remove(filename); + } + +} + +template +void StreamSorter::open_all(const vector& filenames, list& streams, list& cursors) { + // The open files need to live in a collection; the cursors don't own them. + // They also can't be allowed to move since we reference them. + // The cursors also need to live in a collection, because we don't want to be + // moving/copying them and their internal buffers and streams. + // And they can't move after creation either. + + // So everything lives in caller-passed lists. + + for (auto& filename : filenames) { + // Open each file + streams.emplace_back(); + streams.back().open(filename); + // Make a cursor for it + cursors.emplace_back(streams.back()); + } + +} + +template +void StreamSorter::streaming_merge(list& cursors, emitter_t& emitter, size_t expected_messages) { + + create_progress("merge " + to_string(cursors.size()) + " files", expected_messages == 0 ? 1 : expected_messages); + // Count the messages we actually see + size_t observed_messages = 0; + + // Put all the files in a priority queue based on which has a message that comes first. + // We work with pointers to cursors because we don't want to be copying the actual cursors around the heap. + // We also *reverse* the order, because priority queues put the "greatest" element first + auto cursor_order = [&](cursor_t*& a, cursor_t*& b) { + if (b->has_current()) { + if(!a->has_current()) { + // Cursors that aren't empty come first + return true; + } + return less_than(*(*b), *(*a)); + } + return false; + }; + priority_queue, decltype(cursor_order)> cursor_queue(cursor_order); + + for (auto& cursor : cursors) { + // Put the cursor pointers in the queue + cursor_queue.push(&cursor); + } + + while(!cursor_queue.empty() && cursor_queue.top()->has_current()) { + // Until we have run out of data in all the temp files + + // Pop off the winning cursor + cursor_t* winner = cursor_queue.top(); + cursor_queue.pop(); + + // Grab and emit its message, and advance it + emitter.write(std::move(winner->take())); + + // Put it back in the heap if it is not depleted + if (winner->has_current()) { + cursor_queue.push(winner); + } + // TODO: Maybe keep it off the heap for the next loop somehow if it still wins + + observed_messages++; + if (expected_messages != 0) { + update_progress(observed_messages); + } + } + + // We finished the files, so say we're done. + // TODO: Should we warn/fail if we expected the wrong number of messages? + update_progress(expected_messages == 0 ? 1 : expected_messages); + destroy_progress(); + +} + +template +vector StreamSorter::streaming_merge(const vector& temp_files_in, unordered_map* messages_per_file) { + + // What are the names of the merged files we create? + vector temp_files_out; + + // We don't do this loop in parallel because the point of looping is to limit the total currently open files. + for (size_t start_file = 0; start_file < temp_files_in.size(); start_file += max_fan_in) { + // For each range of sufficiently few files, starting at start_file and running for file_count + size_t file_count = min(max_fan_in, temp_files_in.size() - start_file); + + // Open up cursors into all the files. 
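+        // Each pass through this loop handles one batch of at most max_fan_in inputs.
+        // Illustrative numbers: with max_fan_in = 3 and five temp files, the first
+        // batch merges files 0-2 into one new temp file and the second merges files
+        // 3-4 into another; stream_sort() keeps calling this function until the
+        // number of surviving temp files is at most max_fan_in.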
+        list<ifstream> temp_ifstreams;
+        list<cursor_t> temp_cursors;
+        open_all(vector<string>(&temp_files_in[start_file], &temp_files_in[start_file + file_count]), temp_ifstreams, temp_cursors);
+
+        // Work out how many messages to expect
+        size_t expected_messages = 0;
+        if (messages_per_file != nullptr) {
+            for (size_t i = start_file; i < start_file + file_count; i++) {
+                expected_messages += messages_per_file->at(temp_files_in.at(i));
+            }
+        }
+
+        // Open an output file
+        string out_file_name = temp_file::create();
+        ofstream out_stream(out_file_name);
+        temp_files_out.push_back(out_file_name);
+
+        // Make an output emitter
+        emitter_t emitter(out_stream);
+
+        // Merge the cursors into the emitter
+        streaming_merge(temp_cursors, emitter, expected_messages);
+
+        // The output file will be flushed and finished automatically when the emitter goes away.
+
+        // Clean up the input files we used
+        temp_cursors.clear();
+        temp_ifstreams.clear();
+        for (size_t i = start_file; i < start_file + file_count; i++) {
+            temp_file::remove(temp_files_in.at(i));
+        }
+
+        if (messages_per_file != nullptr) {
+            // Save the total messages that should be in the created file, in case we need to do another pass
+            (*messages_per_file)[out_file_name] = expected_messages;
+        }
+    }
+
+    return temp_files_out;
+
+}
+
+template<typename Message>
+bool StreamSorter<Message>::less_than(const Message &a, const Message &b) const {
+    return less_than(get_min_position(a), get_min_position(b));
+}
+
+template<typename Message>
+Position StreamSorter<Message>::get_min_position(const Message& msg) const {
+    // This holds the min Position we get
+    Position min_pos;
+    // We set this to true when we fill in the min
+    bool have_min = false;
+
+    WrappingPositionScanner<Message>::scan(msg, [&](const Position& observed) -> bool {
+        if (!have_min || less_than(observed, min_pos)) {
+            // We have a new min position
+            min_pos = observed;
+            have_min = true;
+        }
+
+        // Keep scanning
+        return true;
+    });
+
+    // If we don't have a min, we should return the empty position (which we already have).
+    // Otherwise we should return the min position.
+ return min_pos; +} + +template +bool StreamSorter::less_than(const Position& a, const Position& b) const { + if (a.node_id() < b.node_id()) { + return true; + } else if (a.node_id() > b.node_id()) { + return false; + } + + if (a.is_reverse() < b.is_reverse()) { + return true; + } else if (a.is_reverse() > b.is_reverse()) { + return false; + } + + if (a.offset() < b.offset()) { + return true; + } + + return false; +} + + +} +#endif diff --git a/src/subcommand/add_main.cpp b/src/subcommand/add_main.cpp index 846dc75f122..f0c09620545 100644 --- a/src/subcommand/add_main.cpp +++ b/src/subcommand/add_main.cpp @@ -16,6 +16,9 @@ #include "../vg.hpp" #include "../variant_adder.hpp" +#include + + using namespace std; @@ -138,9 +141,6 @@ int main_add(int argc, char** argv) { // Configure GCSA2 verbosity so it doesn't spit out loads of extra info gcsa::Verbosity::set(gcsa::Verbosity::SILENT); - // Configure its temp directory to the system temp directory - gcsa::TempFile::setDirectory(temp_file::get_dir()); - // Turn on nested parallelism, so we can parallelize over VCFs and over alignment bands omp_set_nested(1); @@ -161,22 +161,43 @@ int main_add(int argc, char** argv) { } // Load the graph - VG* graph; - get_input_file(optind, argc, argv, [&](istream& in) { - graph = new VG(in, show_progress); - }); + + unique_ptr graph; + string graph_filename = get_input_file_name(optind, argc, argv); + graph = vg::io::VPKG::load_one(graph_filename); + + VG* vg_graph = dynamic_cast(graph.get()); - if (graph == nullptr) { + // Call this to populate the vg_graph if it isn't populated. + auto ensure_vg = [&]() -> vg::VG* { + if (vg_graph == nullptr) { + // Copy instead. + vg_graph = new vg::VG(); + handlealgs::copy_path_handle_graph(graph.get(), vg_graph); + // Give the unique_ptr ownership and delete the graph we loaded. + graph.reset(vg_graph); + // Make sure the paths are all synced up + vg_graph->paths.to_graph(vg_graph->graph); + } + return vg_graph; + }; + + // TODO: We need to move VariantAdder away from vg::VG eventually. + // Right now we always need the vg format graph. + // TODO: deduplicate ensure_vg with other subcommands? + ensure_vg(); + + if (vg_graph == nullptr) { cerr << "error:[vg add]: Could not load graph" << endl; exit(1); } { // Clear existing path ranks (since we invalidate them) - graph->paths.clear_mapping_ranks(); + vg_graph->paths.clear_mapping_ranks(); // Make a VariantAdder for the graph - VariantAdder adder(*graph); + VariantAdder adder(*vg_graph); // Report updates when running interactively adder.print_updates = true; @@ -207,14 +228,12 @@ int main_add(int argc, char** argv) { // TODO: should we sort the graph? 
// Rebuild all the path ranks and stuff - graph->paths.rebuild_mapping_aux(); + vg_graph->paths.rebuild_mapping_aux(); } // Output the modified graph - graph->serialize_to_ostream(std::cout); + vg_graph->serialize_to_ostream(std::cout); - delete graph; - // NB: If you worry about "still reachable but possibly lost" warnings in valgrind, // this would free all the memory used by protobuf: //ShutdownProtobufLibrary(); @@ -223,5 +242,5 @@ int main_add(int argc, char** argv) { } // Register subcommand -static Subcommand vg_add("add", "add variants from a VCF to a graph", main_add); +static Subcommand vg_add("add", "add variants from a VCF to a graph", DEPRECATED, main_add); diff --git a/src/subcommand/align_main.cpp b/src/subcommand/align_main.cpp index 155bafbdf49..33ffb67d779 100644 --- a/src/subcommand/align_main.cpp +++ b/src/subcommand/align_main.cpp @@ -14,8 +14,14 @@ #include "subcommand.hpp" #include "../utility.hpp" -#include "../vg.hpp" -#include "../stream.hpp" +#include "../handle.hpp" +#include "../path.hpp" +#include "../split_strand_graph.hpp" +#include "../dagified_graph.hpp" +#include "../ssw_aligner.hpp" +#include "../aligner.hpp" +#include +#include using namespace std; using namespace vg; @@ -171,13 +177,12 @@ int main_align(int argc, char** argv) { } } - VG* graph = nullptr; + unique_ptr graph; if (ref_seq.empty()) { // Only look at a filename if we don't have an explicit reference // sequence. - get_input_file(optind, argc, argv, [&](istream& in) { - graph = new VG(in); - }); + string graph_filename = get_input_file_name(optind, argc, argv); + graph = vg::io::VPKG::load_one(graph_filename); } ifstream matrix_stream; @@ -188,6 +193,7 @@ int main_align(int argc, char** argv) { exit(1); } } + Alignment alignment; if (!ref_seq.empty()) { @@ -198,10 +204,51 @@ int main_align(int argc, char** argv) { SSWAligner ssw = SSWAligner(match, mismatch, gap_open, gap_extend); alignment = ssw.align(seq, ref_seq); } else { - Aligner aligner = Aligner(match, mismatch, gap_open, gap_extend, full_length_bonus, vg::default_gc_content, seq.size()); - if(matrix_stream.is_open()) aligner.load_scoring_matrix(matrix_stream); - alignment = graph->align(seq, &aligner, true, false, 0, pinned_alignment, pin_left, - banded_global, 0, 0, 0, 0, 0, debug); + + // construct a score matrix + int8_t* score_matrix; + if (matrix_stream.is_open()) { + score_matrix = AlignerClient::parse_matrix(matrix_stream); + } + else { + score_matrix = (int8_t*) malloc(sizeof(int8_t) * 16); + for (size_t i = 0; i < 16; ++i) { + if (i % 5 == 0) { + score_matrix[i] = match; + } + else { + score_matrix[i] = -mismatch; + } + } + } + + // initialize an aligner + Aligner aligner = Aligner(score_matrix, gap_open, gap_extend, full_length_bonus, vg::default_gc_content); + + free(score_matrix); + + // put everything on the forward strand + StrandSplitGraph split(&(*graph)); + + // dagify it as far as we might ever want + DagifiedGraph dag(&split, seq.size() + aligner.longest_detectable_gap(seq.size(), seq.size() / 2)); + + alignment.set_sequence(seq); + if (pinned_alignment) { + aligner.align_pinned(alignment, dag, pin_left); + } + else if (banded_global) { + aligner.align_global_banded(alignment, dag, 1, true); + } + else { + aligner.align(alignment, dag, true); + } + + // translate back from the overlays + translate_oriented_node_ids(*alignment.mutable_path(), [&](vg::id_t node_id) { + handle_t under = split.get_underlying_handle(dag.get_underlying_handle(dag.get_handle(node_id))); + return make_pair(graph->get_id(under), 
graph->get_is_reverse(under)); + }); } if (!seq_name.empty()) { @@ -215,12 +262,8 @@ int main_align(int argc, char** argv) { [&alignment] (size_t n) { return alignment; }; - stream::write(cout, 1, lambda); - stream::finish(cout); - } - - if (graph != nullptr) { - delete graph; + vg::io::write(cout, 1, lambda); + vg::io::finish(cout); } return 0; diff --git a/src/subcommand/annotate_main.cpp b/src/subcommand/annotate_main.cpp index 74c2bcf0d3a..418767dceaf 100644 --- a/src/subcommand/annotate_main.cpp +++ b/src/subcommand/annotate_main.cpp @@ -1,10 +1,16 @@ #include "subcommand.hpp" #include "../vg.hpp" +#include "../xg.hpp" #include "../utility.hpp" #include "../mapper.hpp" -#include "../stream.hpp" +#include +#include #include "../alignment.hpp" #include "../annotation.hpp" +#include "../gff_reader.hpp" +#include "../region_expander.hpp" +#include "../algorithms/alignment_path_offsets.hpp" +#include #include #include @@ -15,20 +21,24 @@ using namespace vg::subcommand; void help_annotate(char** argv) { cerr << "usage: " << argv[0] << " annotate [options] >output.{gam,vg,tsv}" << endl << "graph annotation options:" << endl - << " -x, --xg-name FILE xg index of the graph to annotate (required)" << endl + << " -x, --xg-name FILE xg index or graph to annotate (required)" << endl << " -b, --bed-name FILE a BED file to convert to GAM. May repeat." << endl - << " -f, --gff-name FILE a GFF3/GTF file to convert to GAM. May repeat." << endl + << " -f, --gff-name FILE a GFF3 file to convert to GAM. May repeat." << endl + << " -g, --ggff output at GGFF subgraph annotation file instead of GAM (requires -s)" << endl + << " -s, --snarls FILE file containing snarls to expand GFF intervals into" << endl << "alignment annotation options:" << endl << " -a, --gam FILE file of Alignments to annotate (required)" << endl << " -x, --xg-name FILE xg index of the graph against which the Alignments are aligned (required)" << endl << " -p, --positions annotate alignments with reference positions" << endl + << " -m, --multi-position annotate alignments with multiple reference positions" << endl + << " -l, --search-limit N when annotating with positions, search this far for paths (default: read length)" << endl << " -b, --bed-name FILE annotate alignments with overlapping region names from this BED. May repeat." << endl << " -n, --novelty output TSV table with header describing how much of each Alignment is novel" << endl << " -t, --threads use the specified number of threads" << endl; } /// Find the region of the Mapping's node used by the Mapping, in forward strand space, as start to past_end. -static pair mapping_to_range(const xg::XG* xg_index, const Mapping& mapping) { +static pair mapping_to_range(const HandleGraph* xg_index, const Mapping& mapping) { // How much of the node does it cover? 
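    // Worked example with made-up numbers: a 3 bp match at offset 2 of a 10 bp node
    // covers forward-strand range [2, 5) when the mapping is forward, and
    // [10 - 2 - 3, 10 - 2) = [5, 8) when the mapping is on the reverse strand.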
auto mapping_length = mapping_from_length(mapping); @@ -37,7 +47,7 @@ static pair mapping_to_range(const xg::XG* xg_index, const Mappi if (mapping.position().is_reverse()) { // On the reverse strand we need the node length // TODO: getting it can be slow - auto node_length = xg_index->node_length(mapping.position().node_id()); + auto node_length = xg_index->get_length(xg_index->get_handle(mapping.position().node_id())); node_range.first = node_length - mapping.position().offset() - mapping_length; node_range.second = node_length - mapping.position().offset(); @@ -83,7 +93,11 @@ int main_annotate(int argc, char** argv) { vector gff_names; string gam_name; bool add_positions = false; + bool add_multiple_positions = false; + size_t search_limit = 0; bool novelty = false; + bool output_ggff = false; + string snarls_name; int c; optind = 2; // force optind past command positional argument @@ -92,9 +106,13 @@ int main_annotate(int argc, char** argv) { { {"gam", required_argument, 0, 'a'}, {"positions", no_argument, 0, 'p'}, + {"multi-positions", no_argument, 0, 'm'}, + {"search-limit", required_argument, 0, 'l'}, {"xg-name", required_argument, 0, 'x'}, {"bed-name", required_argument, 0, 'b'}, {"gff-name", required_argument, 0, 'f'}, + {"ggff", no_argument, 0, 'g'}, + {"snarls", required_argument, 0, 's'}, {"novelty", no_argument, 0, 'n'}, {"threads", required_argument, 0, 't'}, {"help", required_argument, 0, 'h'}, @@ -102,7 +120,7 @@ int main_annotate(int argc, char** argv) { }; int option_index = 0; - c = getopt_long (argc, argv, "hx:a:pb:f:nt:h", + c = getopt_long (argc, argv, "hx:a:pml:b:f:gs:nt:h", long_options, &option_index); // Detect the end of the options. @@ -126,11 +144,28 @@ int main_annotate(int argc, char** argv) { case 'f': gff_names.push_back(optarg); break; + + case 'g': + output_ggff = true; + break; + + case 's': + snarls_name = optarg; + break; case 'p': add_positions = true; break; + case 'm': + add_positions = true; + add_multiple_positions = true; + break; + + case 'l': + search_limit = parse(optarg); + break; + case 'n': novelty = true; break; @@ -149,17 +184,32 @@ int main_annotate(int argc, char** argv) { abort (); } } - xg::XG* xg_index = nullptr; + + PathPositionHandleGraph* xg_index = nullptr; + unique_ptr path_handle_graph; + bdsg::ReferencePathOverlayHelper overlay_helper; + if (!xg_name.empty()) { - get_input_file(xg_name, [&](istream& in) { - // Read in the XG index - xg_index = new xg::XG(in); - }); + // Read in the XG index + path_handle_graph = vg::io::VPKG::load_one(xg_name); + xg_index = overlay_helper.apply(path_handle_graph.get()); } else { cerr << "error [vg annotate]: no xg index provided" << endl; return 1; } + + unique_ptr snarl_manager = nullptr; + if (!snarls_name.empty()) { + ifstream snarl_stream; + snarl_stream.open(snarls_name); + if (!snarl_stream) { + cerr << "error:[vg mpmap] Cannot open Snarls file " << snarls_name << endl; + exit(1); + } + snarl_manager = vg::io::VPKG::load_one(snarl_stream); + } + Mapper mapper(xg_index, nullptr, nullptr); if (!gam_name.empty()) { @@ -206,14 +256,14 @@ int main_annotate(int argc, char** argv) { << novel_bp << endl; }; get_input_file(gam_name, [&](istream& in) { - stream::for_each(in, lambda); + vg::io::for_each(in, lambda); }); } else { // We are annotating the actual reads // Make per-thread buffers for writing them vector> buffers; - buffers.resize(get_thread_count()); + buffers.resize(vg::get_thread_count()); // We will need to track mappings from graph node regions to BED features. 
// We don't want each of those mappings to have a copy of the feature name, because that could be big. @@ -256,20 +306,27 @@ int main_annotate(int argc, char** argv) { // Scan the Mappings. We know each Mapping will be all perfect matches. // Record that the alignment covers the given region on the given node. - features_on_node[mapping.position().node_id()].emplace_back(mapping_to_range(xg_index, mapping), interned_name); + features_on_node[mapping.position().node_id()].emplace_back(mapping_to_range(xg_index, mapping), + interned_name); } } }); } get_input_file(gam_name, [&](istream& in) { - stream::for_each_parallel(in, [&](Alignment& aln) { + vg::io::for_each_parallel(in, [&](Alignment& aln) { // For each read if (add_positions) { // Annotate it with its initial position on each path it touches aln.clear_refpos(); - mapper.annotate_with_initial_path_positions(aln); + if (add_multiple_positions) { + // One position per node + vg::algorithms::annotate_with_node_path_positions(*mapper.xindex, aln, search_limit); + } else { + // One position per alignment + vg::algorithms::annotate_with_initial_path_positions(*mapper.xindex, aln, search_limit); + } } if (!features_on_node.empty()) { @@ -303,16 +360,17 @@ int main_annotate(int argc, char** argv) { // Output the alignment auto& buffer = buffers.at(omp_get_thread_num()); buffer.emplace_back(std::move(aln)); - stream::write_buffered(cout, buffer, 1000); + vg::io::write_buffered(cout, buffer, 1000); }); }); for (auto& buffer : buffers) { // Finish each buffer - stream::write_buffered(cout, buffer, 0); + vg::io::write_buffered(cout, buffer, 0); } } - } else { + } + else { // Annotating the graph. We must do something. if (bed_names.empty() && gff_names.empty()) { // We weren't asked to do anything. @@ -320,30 +378,110 @@ int main_annotate(int argc, char** argv) { return 1; } - for (auto& bed_name : bed_names) { - // Convert each BED file to GAM - get_input_file(bed_name, [&](istream& bed_stream) { - vector buffer; - parse_bed_regions(bed_stream, xg_index, &buffer); - stream::write_buffered(cout, buffer, 0); // flush - }); + if (output_ggff) { - // TODO: We'll get an EOF marker per input file. + if (!bed_names.empty()) { + cerr << "error [vg annotate] BED conversion to GGFF is not currently supported. Convert to GFF3 first." 
<< endl; + return 1; + } + + // define a function that converts to GGFF + RegionExpander region_expander(&(*xg_index), &(*snarl_manager)); + function output_ggff_record = [&](const GFFRecord& record) { + + auto subgraph = region_expander.expanded_subgraph(record); + + if (subgraph.empty()) { + cout << "."; + } + + for (auto iter = subgraph.begin(); iter != subgraph.end(); ) { + + cout << iter->first.first << "[" << iter->second.first << ":" << iter->second.second << "]"; + if (iter->first.second) { + cout << "-"; + } + else { + cout << "+"; + } + + ++iter; + if (iter != subgraph.end()) { + cout << ","; + } + } + cout << "\t"; + + if (record.source.empty()) { + cout << "."; + } + else { + cout << record.source; + } + cout << "\t"; + + if (record.type.empty()) { + cout << "."; + } + else { + cout << record.type; + } + cout << "\t"; + + if (isnan(record.score)) { + cout << "."; + } + else { + cout << record.score; + } + cout << "\t"; + + if (record.phase == -1) { + cout << "."; + } + else { + cout << record.phase; + } + cout << "\t"; + + if (record.attributes.empty()) { + cout << "."; + } + else { + cout << record.attributes; + } + cout << "\n"; + }; + + for (auto& gff_name : gff_names) { + get_input_file(gff_name, [&](istream& gff_stream) { + GFFReader gff_reader(gff_stream); + gff_reader.for_each_gff_record(output_ggff_record); + }); + } } - - for (auto& gff_name : gff_names) { - get_input_file(gff_name, [&](istream& gff_stream) { - vector buffer; - parse_gff_regions(gff_stream, xg_index, &buffer); - stream::write_buffered(cout, buffer, 0); // flush - }); + else { + for (auto& bed_name : bed_names) { + // Convert each BED file to GAM + get_input_file(bed_name, [&](istream& bed_stream) { + vector buffer; + parse_bed_regions(bed_stream, xg_index, &buffer); + vg::io::write_buffered(cout, buffer, 0); // flush + }); + + // TODO: We'll get an EOF marker per input file. 
+ } + + for (auto& gff_name : gff_names) { + get_input_file(gff_name, [&](istream& gff_stream) { + vector buffer; + parse_gff_regions(gff_stream, xg_index, &buffer); + vg::io::write_buffered(cout, buffer, 0); // flush + }); + } } } - if (xg_index) { - delete xg_index; - } - return 0; } diff --git a/src/subcommand/augment_main.cpp b/src/subcommand/augment_main.cpp index de945c3abfa..f103462983f 100644 --- a/src/subcommand/augment_main.cpp +++ b/src/subcommand/augment_main.cpp @@ -21,48 +21,49 @@ #include #include "subcommand.hpp" - #include "../option.hpp" - +#include "../xg.hpp" #include "../vg.hpp" -#include "../pileup_augmenter.hpp" +#include "../augment.hpp" +#include "../packer.hpp" +#include "../io/save_handle_graph.hpp" +#include +#include +#include +#include "bdsg/packed_graph.hpp" +#include "bdsg/hash_graph.hpp" +#include using namespace std; using namespace vg; using namespace vg::subcommand; -// this used to be pileup_main() -static Pileups* compute_pileups(VG* graph, const string& gam_file_name, int thread_count, int min_quality, - int max_mismatches, int window_size, int max_depth, bool use_mapq, - bool show_progress); - -// this used to be the first half of call_main() -static void augment_with_pileups(PileupAugmenter& augmenter, Pileups& pileups, bool expect_subgraph, - bool show_progress); - void help_augment(char** argv, ConfigurableParser& parser) { - cerr << "usage: " << argv[0] << " augment [options] > augmented_graph.vg" << endl + cerr << "usage: " << argv[0] << " augment [options] [alignment.gam] > augmented_graph.vg" << endl << "Embed GAM alignments into a graph to facilitate variant calling" << endl << endl << "general options:" << endl - << " -a, --augmentation-mode M augmentation mode. M = {pileup, direct} [direct]" << endl + << " -i, --include-paths merge the paths implied by alignments into the graph" << endl + << " -S, --keep-softclips include softclips from input alignments (they are cut by default)" << endl + << " -B, --label-paths don't augment with alignments, just use them for labeling the graph" << endl << " -Z, --translation FILE save translations from augmented back to base graph to FILE" << endl << " -A, --alignment-out FILE save augmented GAM reads to FILE" << endl + << " -F, --gaf expect (and write) GAF instead of GAM" << endl + << " -s, --subgraph graph is a subgraph of the one used to create GAM. ignore alignments with missing nodes" << endl + << " -m, --min-coverage N minimum coverage of a breakpoint required for it to be added to the graph" << endl + << " -c, --expected-cov N expected coverage. 
used only for memory tuning [default : 128]" << endl + << " -q, --min-baseq N ignore edits whose sequence have average base quality < N" << endl + << " -Q, --min-mapq N ignore alignments with mapping quality < N" << endl + << " -N, --max-n F maximum fraction of N bases in an edit for it to be included [default : 0.25]" << endl + << " -E, --edges-only only edges implied by reads, ignoring edits" << endl << " -h, --help print this help message" << endl << " -p, --progress show progress" << endl << " -v, --verbose print information and warnings about vcf generation" << endl - << " -t, --threads N number of threads to use" << endl - << "pileup options:" << endl - << " -P, --pileup FILE save pileups to FILE" << endl - << " -S, --support FILE save supports to FILE" << endl - << " -g, --min-aug-support N minimum support to augment graph [" - << PileupAugmenter::Default_min_aug_support << "]" << endl - << " -U, --subgraph expect a subgraph and ignore extra pileup entries outside it" << endl - << " -q, --min-quality N ignore bases with PHRED quality < N (default=10)" << endl - << " -m, --max-mismatches N ignore bases with > N mismatches within window centered on read (default=1)" << endl - << " -w, --window-size N size of window to apply -m option (default=0)" << endl - << " -M, --ignore-mapq do not combine mapping qualities with base qualities in pileup" << endl; + << " -t, --threads N number of threads (only 1st pass with -m or -q option is multithreaded)" << endl + << "loci file options:" << endl + << " -l, --include-loci FILE merge all alleles in loci into the graph" << endl + << " -L, --include-gt FILE merge only the alleles in called genotypes into the graph" << endl; // Then report more options parser.print_help(cerr); @@ -70,23 +71,23 @@ void help_augment(char** argv, ConfigurableParser& parser) { int main_augment(int argc, char** argv) { - // augmentation mode - string augmentation_mode = "direct"; - - // load pileupes from here - string pileup_file_name; - - // minimum support to consider adding a variant to the graph - int min_aug_support = PileupAugmenter::Default_min_aug_support; - - // Should we expect a subgraph and ignore pileups for missing nodes/edges? 
- bool expect_subgraph = false; - // Write the translations (as protobuf) to this path string translation_file_name; - // Write the supports (as protobuf) to this path - string support_file_name; + // Include a path in the graph for each GAM + bool include_paths = false; + + // Include the softclips for each path + bool include_softclips = false; + + // Just label the paths with the GAM + bool label_paths = false; + + // Merge alleles from this loci file instead of GAM + string loci_file; + + // Merge only alleles from called genotypes in the loci file + bool called_genotypes_only = false; // Load in GAM alignments to map over to the augmented graph from here string gam_in_file_name; @@ -94,54 +95,68 @@ int main_augment(int argc, char** argv) { // Write the GAM alignments (from gam_in_file_name) projected on the augmented graph here string gam_out_file_name; - // Print some progress messages to screen - bool show_progress = false; + // Expect given graph to be subgraph of that used to create GAM and not + // fail when nodes are missing + bool is_subgraph = false; - // Print verbose message - bool verbose = false; + // Min coverage for graph to be broken at a breakpoint + // Whene non-zero, the Packer will be used to collect breakpoints + size_t min_coverage = 0; - // Number of threads to use (will default to all if not specified) - int thread_count = 0; + // Used to set data_width for Packer + size_t expected_coverage = 128; - // Bases wit quality less than 10 will not be added to the pileup - int min_quality = 10; + // Minimum average base quality in an edit's sequence for it to be used + double min_baseq = 0; - // Bases with more than this many mismatches within the window_size not added - int max_mismatches = 1; + // Minimum mapping quality of an alignment for it to be used + double min_mapq = 0; - // Window size for above (0 effectively turns this check off) - int window_size = 0; + // Maximum fraction of Ns + double max_frac_n = 0.25; - // Hack to prevent protobuf messages from getting too big by limiting depth at - // any given position to max_depth - int max_depth = 1000; - - // Combine MAPQ and PHRED base qualities to determine quality at each position - // If false, only PHRED base quality will be used. - bool use_mapq = true; + // Only add edges (no new sequence) + // The motivation is to help vg call expect all breakpoint edges to be in graph, but in + // practice, it seems they already are. Todo: remove? 
+ double edges_only = false; + + // GAF format toggle + string aln_format = "GAM"; + + // Print some progress messages to screen + bool show_progress = false; + // Print verbose message + bool verbose = false; static const struct option long_options[] = { - // General Options + // Deprecated Options {"augmentation-mode", required_argument, 0, 'a'}, + // General Options {"translation", required_argument, 0, 'Z'}, - {"alignment-out", required_argument, 0, 'A'}, + {"alignment-out", required_argument, 0, 'A'}, + {"include-paths", no_argument, 0, 'i'}, + {"cut-softclips", no_argument, 0, 'C'}, + {"keep-softclips", no_argument, 0, 'S'}, + {"label-paths", no_argument, 0, 'B'}, + {"subgraph", no_argument, 0, 's'}, + {"min-coverage", required_argument, 0, 'm'}, + {"expected-cov", required_argument, 0, 'c'}, + {"min-baseq", required_argument, 0, 'q'}, + {"min-mapq", required_argument, 0, 'Q'}, + {"max-n", required_argument, 0, 'N'}, + {"edges-only", no_argument, 0, 'E'}, + {"gaf", no_argument, 0, 'F'}, {"help", no_argument, 0, 'h'}, {"progress", required_argument, 0, 'p'}, {"verbose", no_argument, 0, 'v'}, {"threads", required_argument, 0, 't'}, - // Pileup Options - {"pileup", required_argument, 0, 'P'}, - {"support", required_argument, 0, 'S'}, - {"min-quality", required_argument, 0, 'q'}, - {"max-mismatches", required_argument, 0, 'm'}, - {"window-size", required_argument, 0, 'w'}, - {"ignore-mapq", no_argument, 0, 'M'}, - {"min-aug-support", required_argument, 0, 'g'}, - {"subgraph", no_argument, 0, 'U'}, + // Loci Options + {"include-loci", required_argument, 0, 'l'}, + {"include-gt", required_argument, 0, 'L'}, {0, 0, 0, 0} }; - static const char* short_options = "a:Z:A:hpvt:P:S:q:m:w:Mg:U"; + static const char* short_options = "a:Z:A:iCSBhpvt:l:L:sm:c:q:Q:N:EF"; optind = 2; // force optind past command positional arguments // This is our command-line parser @@ -149,16 +164,53 @@ int main_augment(int argc, char** argv) { // Parse all the options we have defined here. switch (c) { - // General Options + // Deprecated. case 'a': - augmentation_mode = optarg; + cerr << "[vg augment] warning: -a / --augmentation-mode option is deprecated" << endl; break; + // General Options case 'Z': translation_file_name = optarg; break; case 'A': gam_out_file_name = optarg; break; + case 'i': + include_paths = true; + break; + case 'C': + cerr << "[vg augment] warning: -C / --cut-softclips option is deprecated (now enabled by default)" << endl; + break; + case 'S': + include_softclips = true; + break; + case 'B': + label_paths = true; + break; + case 's': + is_subgraph = true; + break; + case 'm': + min_coverage = parse(optarg); + break; + case 'c': + expected_coverage = parse(optarg); + break; + case 'q': + min_baseq = parse(optarg); + break; + case 'Q': + min_mapq = parse(optarg); + break; + case 'N': + max_frac_n = parse(optarg); + break; + case 'E': + edges_only = true; + break; + case 'F': + aln_format = "GAF"; + break; case 'h': case '?': /* getopt_long already printed an error message. 
*/ @@ -170,35 +222,24 @@ int main_augment(int argc, char** argv) { break; case 'v': verbose = true; - break; - case 't': - thread_count = parse(optarg); - break; - - // Pileup Options - case 'P': - pileup_file_name = optarg; break; - case 'S': - support_file_name = optarg; - break; - case 'q': - min_quality = parse(optarg); - break; - case 'm': - max_mismatches = parse(optarg); + case 't': + { + int num_threads = parse(optarg); + if (num_threads <= 0) { + cerr << "error:[vg call] Thread count (-t) set to " << num_threads << ", must set to a positive integer." << endl; + exit(1); + } + omp_set_num_threads(num_threads); break; - case 'w': - window_size = parse(optarg); + } + // Loci Options + case 'l': + loci_file = optarg; break; - case 'M': - use_mapq = false; - break; - case 'g': - min_aug_support = parse(optarg); - break; - case 'U': - expect_subgraph = true; + case 'L': + loci_file = optarg; + called_genotypes_only = true; break; default: @@ -206,344 +247,202 @@ int main_augment(int argc, char** argv) { } }); - if (argc <= 3) { - help_augment(argv, parser); - return 1; - } - // Parse the command line options, updating optind. parser.parse(argc, argv); - if (thread_count != 0) { - // Use a non-default number of threads - omp_set_num_threads(thread_count); - } - thread_count = get_thread_count(); - - // Parse the arguments - if (optind >= argc) { + // Parse the two positional arguments + if (optind + 1 > argc) { + cerr << "[vg augment] error: too few arguments" << endl; help_augment(argv, parser); return 1; } - string graph_file_name = argv[optind++]; - gam_in_file_name = argv[optind++]; + string graph_file_name = get_input_file_name(optind, argc, argv); + if (optind < argc) { + gam_in_file_name = get_input_file_name(optind, argc, argv); + } + if (gam_in_file_name.empty() && loci_file.empty()) { + cerr << "[vg augment] error: gam file argument required" << endl; + return 1; + } if (gam_in_file_name == "-" && graph_file_name == "-") { cerr << "[vg augment] error: graph and gam can't both be from stdin." << endl; return 1; } - if (gam_in_file_name == "-" && !gam_out_file_name.empty()) { - cerr << "[vg augment] error: cannot stream input gam when using -A option (as it requires 2 passes)" << endl; + if (label_paths && (!gam_out_file_name.empty() || !translation_file_name.empty() || edges_only)) { + cerr << "[vg augment] error: Translation (-Z), GAM (-A) output and edges-only (-E) do not work with \"label-only\" (-B) mode" << endl; return 1; } - - if (augmentation_mode != "pileup" && augmentation_mode != "direct") { - cerr << "[vg augment] error: pileup and direct are currently the only supported augmentation modes (-a)" << endl; + if (include_paths && edges_only) { + cerr <<"vg augment] error: -E cannot be used with -i" << endl; return 1; } - - if (augmentation_mode != "direct" and !gam_out_file_name.empty()) { - cerr << "[vg augment] error: GAM output only works with \"direct\" augmentation mode" << endl; - return 1; - } - - if (augmentation_mode != "pileup" and (!support_file_name.empty() || !pileup_file_name.empty())) { - cerr << "[vg augment] error: Pileup (-P) and Support (-S) output only work with \"pileup\" augmentation mode" << endl; - return 1; + if (gam_in_file_name == "-" && !label_paths) { + cerr << "[vg augment] warning: reading the entire GAM from stdin into memory. 
it is recommended to pass in" + << " a filename rather than - so it can be streamed over two passes" << endl; + if (!gam_out_file_name.empty()) { + cerr << " warning: when streaming in a GAM with -A, the output GAM will lose all non-Path related fields from the input" << endl; + } } - - // read the graph if (show_progress) { cerr << "Reading input graph" << endl; } - VG* graph; - get_input_file(graph_file_name, [&](istream& in) { - graph = new VG(in); - }); - - - Pileups* pileups = nullptr; + + // Read the graph + unique_ptr graph; + graph = vg::io::VPKG::load_one(graph_file_name); + VG* vg_graph = dynamic_cast(graph.get()); + HandleGraph* vectorizable_graph = nullptr; + unique_ptr packer; + bdsg::VectorizableOverlayHelper overlay_helper; - if (!pileup_file_name.empty() || augmentation_mode == "pileup") { - // We will need the computed pileups - - // compute the pileups from the graph and gam - pileups = compute_pileups(graph, gam_in_file_name, thread_count, min_quality, max_mismatches, - window_size, max_depth, use_mapq, show_progress); - } - - if (!pileup_file_name.empty()) { - // We want to write out pileups. - if (show_progress) { - cerr << "Writing pileups" << endl; + if (label_paths) { + // Just add path names with extend() + function lambda = [&](Alignment& alignment) { + if (!include_softclips) { + softclip_trim(alignment); + } + Path simplified_path = simplify(alignment.path()); + *simplified_path.mutable_name() = alignment.name(); + add_path_to_graph(graph.get(), simplified_path); + }; + if (aln_format == "GAM") { + get_input_file(gam_in_file_name, [&](istream& alignment_stream) { + vg::io::for_each(alignment_stream, lambda); + }); + } else { + assert(aln_format == "GAF"); + vg::io::gaf_unpaired_for_each(*graph, gam_in_file_name, lambda); } - ofstream pileup_file(pileup_file_name); - if (!pileup_file) { - cerr << "[vg augment] error: unable to open output pileup file: " << pileup_file_name << endl; - exit(1); + if (vg_graph != nullptr) { + vg_graph->paths.sort_by_mapping_rank(); + vg_graph->paths.rebuild_mapping_aux(); } - pileups->write(pileup_file); } - - if (augmentation_mode == "direct") { - // Augment with the reads - - if (!support_file_name.empty()) { - cerr << "[vg augment] error: support calculation in direct augmentation mode is unimplemented" << endl; - exit(1); - } - - // We don't need any pileups - if (pileups != nullptr) { - delete pileups; - pileups = nullptr; + else { + // the packer's required for any kind of filtering logic -- so we use it when + // baseq is present as well, or n-fraction. + if (min_coverage > 0 || min_baseq || max_frac_n < 1.) { + vectorizable_graph = dynamic_cast(overlay_helper.apply(graph.get())); + size_t data_width = Packer::estimate_data_width(expected_coverage); + size_t bin_count = Packer::estimate_bin_count(get_thread_count()); + packer = make_unique(vectorizable_graph, true, false, false, false, 0, bin_count, data_width); + // makes sure filters are activated. 
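+            // The first pass over the reads uses this Packer to collect the coverage
+            // (and, per the comment above, the base-quality information needed for -q
+            // and -N), which is why it is set up even when no -m threshold was given.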
+ min_coverage = max(size_t(min_coverage), size_t(1)); } - // Load all the reads - vector reads; - // And pull out their paths - vector read_paths; - get_input_file(gam_in_file_name, [&](istream& alignment_stream) { - stream::for_each(alignment_stream, [&](Alignment& alignment) { - // Trim the softclips off of every read - // Work out were to cut - int cut_start = softclip_start(alignment); - int cut_end = softclip_end(alignment); - // Cut the sequence and quality - alignment.set_sequence(alignment.sequence().substr(cut_start, alignment.sequence().size() - cut_start - cut_end)); - if (alignment.quality().size() != 0) { - alignment.set_quality(alignment.quality().substr(cut_start, alignment.quality().size() - cut_start - cut_end)); - } - // Trim the path - *alignment.mutable_path() = trim_hanging_ends(alignment.path()); - - // Save every read - reads.push_back(alignment); - // And the path for the read, separately - // TODO: Make edit use callbacks or something so it doesn't need a vector of paths necessarily - read_paths.push_back(alignment.path()); - }); - }); - - // Augment the graph, rewriting the paths. - // Don't embed paths or break at ends. - auto translation = graph->edit(read_paths, false, true, false); - - // Write the augmented graph - if (show_progress) { - cerr << "Writing augmented graph" << endl; - } - graph->serialize_to_ostream(cout); - - if (!translation_file_name.empty()) { - // Write the translations - if (show_progress) { - cerr << "Writing translation table" << endl; - } - ofstream translation_file(translation_file_name); - if (!translation_file) { - cerr << "[vg augment]: Error opening translation file: " << translation_file_name << endl; - return 1; - } - stream::write_buffered(translation_file, translation, 0); - translation_file.close(); - } - + // Actually do augmentation + vector translation; if (!gam_out_file_name.empty()) { - // Write out the modified GAM - ofstream gam_out_file(gam_out_file_name); if (!gam_out_file) { - cerr << "[vg augment]: Error opening output GAM file: " << gam_out_file_name << endl; + cerr << "[vg augment] error: could not open output GAM file: " << gam_out_file_name << endl; return 1; } - - // We use this buffer and do a buffered write - vector gam_buffer; - for (size_t i = 0; i < reads.size(); i++) { - // Say we are going to write out the alignment - gam_buffer.push_back(reads[i]); - - // Set its path to the corrected embedded path - *gam_buffer.back().mutable_path() = read_paths[i]; - - // Write it back out - stream::write_buffered(gam_out_file, gam_buffer, 100); - } - // Flush the buffer - stream::write_buffered(gam_out_file, gam_buffer, 0); } - } else if (augmentation_mode == "pileup") { - // We want to augment with pileups - - // The PileupAugmenter object will take care of all augmentation - PileupAugmenter augmenter(graph, PileupAugmenter::Default_default_quality, min_aug_support); - - // compute the augmented graph from the pileup - // Note: we can save a fair bit of memory by clearing pileups, and re-reading off of - // pileup_file_name - augment_with_pileups(augmenter, *pileups, expect_subgraph, show_progress); - delete pileups; - pileups = nullptr; - - // write the augmented graph - if (show_progress) { - cerr << "Writing augmented graph" << endl; - } - augmenter.write_augmented_graph(cout, false); - - // write the agumented gam - if (!gam_out_file_name.empty()) { - ofstream gam_out_file(gam_out_file_name); - if (!gam_out_file) { - cerr << "[vg augment]: Error opening output GAM file: " << gam_out_file_name << endl; - return 
1; - } - get_input_file(gam_in_file_name, [&](istream& alignment_stream) { - vector gam_buffer; - function lambda = [&gam_out_file, &gam_buffer, &augmenter](Alignment& alignment) { - list aug_path; - augmenter.map_path(alignment.path(), aug_path, true); - alignment.mutable_path()->clear_mapping(); - for (auto& aug_mapping : aug_path) { - *alignment.mutable_path()->add_mapping() = aug_mapping.to_mapping(); + if (gam_in_file_name == "-" || !loci_file.empty()) { + vector buffer; + if (gam_in_file_name == "-") { + // this is usually bad news, but we gave a warning + function lambda = [&](Alignment& alignment) { + Path& path = *alignment.mutable_path(); + path.set_name(alignment.name()); + buffer.push_back(path); + }; + if (aln_format == "GAM") { + get_input_file(gam_in_file_name, [&](istream& alignment_stream) { + vg::io::for_each(alignment_stream, lambda); + }); + } else { + assert(aln_format == "GAF"); + vg::io::gaf_unpaired_for_each(*graph, gam_in_file_name, lambda); + } + } else if (!loci_file.empty()) { + function lambda = [&graph, &buffer, &called_genotypes_only](Locus& locus) { + // if we are only doing called genotypes, record so we can filter alleles + set alleles_in_genotype; + if (called_genotypes_only) { + for (int i = 0; i < locus.genotype_size(); ++i) { + for (int j = 0; j < locus.genotype(i).allele_size(); ++j) { + alleles_in_genotype.insert(locus.genotype(i).allele(j)); + } } - gam_buffer.push_back(alignment); - stream::write_buffered(gam_out_file, gam_buffer, 100); - }; - stream::for_each(alignment_stream, lambda); - stream::write_buffered(gam_out_file, gam_buffer, 0); - }); + } + for (int i = 0; i < locus.allele_size(); ++i) { + // skip alleles not in the genotype if using only called genotypes + if (!alleles_in_genotype.empty()) { + if (!alleles_in_genotype.count(i)) continue; + } + Path path = simplify(locus.allele(i)); + stringstream name; + name << locus.name() << ":" << i; + path.set_name(name.str()); + buffer.push_back(path); + } + }; + get_input_file(loci_file, [&](istream& loci_stream) { + vg::io::for_each(loci_stream, lambda); + }); + } + + augment(graph.get(), + buffer, + aln_format, + translation_file_name.empty() ? nullptr : &translation, + gam_out_file_name, + include_paths, + include_paths, + !include_softclips, + is_subgraph, + min_baseq, + min_mapq, + packer.get(), + min_coverage, + max_frac_n, + edges_only); + } else { + // much better to stream from a file so we can do two passes without storing in memory + augment(graph.get(), + gam_in_file_name, + aln_format, + translation_file_name.empty() ? 
nullptr : &translation, + gam_out_file_name, + include_paths, + include_paths, + !include_softclips, + is_subgraph, + min_baseq, + min_mapq, + packer.get(), + min_coverage, + max_frac_n, + edges_only); } - // write the translation + // we don't have a streaming interface for translation: write the buffer now if (!translation_file_name.empty()) { - // write the translations + // Write the translations if (show_progress) { cerr << "Writing translation table" << endl; } ofstream translation_file(translation_file_name); if (!translation_file) { - cerr << "[vg augment] error: error opening translation file: " << translation_file_name << endl; + cerr << "[vg augment]: Error opening translation file: " << translation_file_name << endl; return 1; } - augmenter._augmented_graph.write_translations(translation_file); + vg::io::write_buffered(translation_file, translation, 0); translation_file.close(); } + } - // write the supports - if (!support_file_name.empty()) { - // write the supports - if (show_progress) { - cerr << "Writing supports" << endl; - } - ofstream support_file(support_file_name); - if (!support_file) { - cerr << "[vg augment] error: error opening supports file: " << support_file_name << endl; - return 1; - } - augmenter._augmented_graph.write_supports(support_file); - support_file.close(); - } - } else { - cerr << "[vg augment] error: unrecognized augmentation mode" << endl; - exit(1); - } - - if (pileups != nullptr) { - delete pileups; - pileups = nullptr; - } + // Serialize the graph using VPKG. + vg::io::save_handle_graph(graph.get(), cout); - delete graph; - return 0; } -Pileups* compute_pileups(VG* graph, const string& gam_file_name, int thread_count, int min_quality, - int max_mismatches, int window_size, int max_depth, bool use_mapq, - bool show_progress) { - - // Make Pileups makers for each thread. - vector pileups; - for (int i = 0; i < thread_count; ++i) { - pileups.push_back(new Pileups(graph, min_quality, max_mismatches, window_size, max_depth, use_mapq)); - } - - // setup alignment stream - get_input_file(gam_file_name, [&](istream& alignment_stream) { - // compute the pileups. - if (show_progress) { - cerr << "Computing pileups" << endl; - } - - function lambda = [&pileups, &graph](Alignment& aln) { - int tid = omp_get_thread_num(); - pileups[tid]->compute_from_alignment(aln); - }; - stream::for_each_parallel(alignment_stream, lambda); - }); - - // single-threaded (!) 
merge - if (show_progress && pileups.size() > 1) { - cerr << "Merging pileups" << endl; - } - for (int i = 1; i < pileups.size(); ++i) { - pileups[0]->merge(*pileups[i]); - delete pileups[i]; - } - return pileups[0]; -} - -void augment_with_pileups(PileupAugmenter& augmenter, Pileups& pileups, bool expect_subgraph, - bool show_progress) { - - if (show_progress) { - cerr << "Computing augmented graph from the pileup" << endl; - } - - pileups.for_each_node_pileup([&](const NodePileup& node_pileup) { - if (!augmenter._graph->has_node(node_pileup.node_id())) { - // This pileup doesn't belong in this graph - if(!expect_subgraph) { - throw runtime_error("Found pileup for nonexistent node " + to_string(node_pileup.node_id())); - } - // If that's expected, just skip it - return; - } - // Send approved pileups to the augmenter - augmenter.call_node_pileup(node_pileup); - - }); - - pileups.for_each_edge_pileup([&](const EdgePileup& edge_pileup) { - if (!augmenter._graph->has_edge(edge_pileup.edge())) { - // This pileup doesn't belong in this graph - if(!expect_subgraph) { - throw runtime_error("Found pileup for nonexistent edge " + pb2json(edge_pileup.edge())); - } - // If that's expected, just skip it - return; - } - // Send approved pileups to the augmenter - augmenter.call_edge_pileup(edge_pileup); - }); - - // map the edges from original graph - if (show_progress) { - cerr << "Mapping edges into augmented graph" << endl; - } - augmenter.update_augmented_graph(); - - // map the paths from the original graph - if (show_progress) { - cerr << "Mapping paths into augmented graph" << endl; - } - augmenter.map_paths(); -} - // Register subcommand -static Subcommand vg_augment("augment", "augment a graph from an alignment", PIPELINE, 5, main_augment); +static Subcommand vg_augment("augment", "augment a graph from an alignment", PIPELINE, 8, main_augment); diff --git a/src/subcommand/autoindex_main.cpp b/src/subcommand/autoindex_main.cpp new file mode 100644 index 00000000000..f36b5341f9d --- /dev/null +++ b/src/subcommand/autoindex_main.cpp @@ -0,0 +1,374 @@ +/** \file autoindex_main.cpp + * + * Defines the "vg autoindex" subcommand, which produces indexes needed for other subcommands + */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "subcommand.hpp" +#include "index_registry.hpp" +#include "utility.hpp" + +using namespace std; +using namespace vg; +using namespace vg::subcommand; + +int64_t parse_memory_usage(const string& mem_arg) { + if (mem_arg.empty()) { + cerr << "error:[vg autoindex] target memory usage arg is empty" << endl; + exit(1); + } + string mem = mem_arg; + if (mem.back() == 'B') { + mem.pop_back(); + } + int64_t base; + if (mem.back() == 'k') { + base = 1024; + mem.pop_back(); + } + else if (mem.back() == 'M') { + base = 1024 * 1024; + mem.pop_back(); + } + else if (mem.back() == 'G') { + base = 1024 * 1024 * 1024; + mem.pop_back(); + } + else if (isdigit(mem.back())) { + base = 1; + } + else { + cerr << "error:[vg autoindex] unrecognized unit for target memory usage: " << mem.back() << endl; + exit(1); + } + return parse(mem) * base; +} + +string mem_usage_string(int64_t mem) { + assert(mem > 0); + stringstream strm; + strm.precision(1); + if (mem >= 1024 * 1024 * 1024) { + strm << double(mem) / (1024 * 1024 * 1024) << "GB"; + } + else if (mem >= 1024 * 1024) { + strm << double(mem) / (1024 * 1024) << "MB"; + } + else if (mem >= 1024) { + strm << double(mem) / (1024) << "kB"; + } + else { + strm << double(mem) << "B"; 
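// Illustrative behaviour of parse_memory_usage() above (a worked example of the suffix
// logic shown, not additional functionality): an optional trailing 'B' is stripped first,
// then a k/M/G suffix picks the base, and the remaining digits are parsed, so e.g.
//   parse_memory_usage("500M") == 500 * 1024 * 1024 == 524288000
//   parse_memory_usage("4GB")  == 4 * 1024 * 1024 * 1024 == 4294967296
//   parse_memory_usage("2048") == 2048  (bare digits are taken as bytes)
// mem_usage_string() here is the approximate inverse, printing with the largest unit that fits.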
+ } + return strm.str(); +}; + +// expects a string of form "Index Registry Name:filepath1,filepath2,filepath3" +pair> parse_provide_string(const string& str) { + + pair> return_val; + + size_t i = str.find(':'); + if (i >= str.size()) { + cerr << "error: Couldn't parse index provide string: " << str << endl; + exit(1); + } + return_val.first = str.substr(0, i); + while (i < str.size()) { + size_t end = str.find(',', i + 1); + return_val.second.emplace_back(str.substr(i + 1, end - i - 1)); + i = end; + } + if (return_val.second.empty()) { + cerr << "error: Couldn't parse index provide string: " << str << endl; + exit(1); + } + return return_val; +} + +void help_autoindex(char** argv) { + cerr + << "usage: " << argv[0] << " autoindex [options]" << endl + << "options:" << endl + << " output:" << endl + << " -p, --prefix PREFIX prefix to use for all output (default: index)" << endl + << " -w, --workflow NAME workflow to produce indexes for, can be provided multiple" << endl + << " times. options: map, mpmap, rpvg, giraffe (default: map)" << endl + << " input data:" << endl + << " -r, --ref-fasta FILE FASTA file containing the reference sequence (may repeat)" << endl + << " -v, --vcf FILE VCF file with sequence names matching -r (may repeat)" << endl + << " -i, --ins-fasta FILE FASTA file with sequences of INS variants from -v" << endl + << " -g, --gfa FILE GFA file to make a graph from" << endl + << " -x, --tx-gff FILE GTF/GFF file with transcript annotations (may repeat)" << endl + << " -H, --hap-tx-gff FILE GTF/GFF file with transcript annotations of a named haplotype (may repeat)" << endl + << " configuration:" << endl + << " -f, --gff-feature STR GTF/GFF feature type (col. 3) to add to graph (default: " << IndexingParameters::gff_feature_name << ")" << endl + << " -a, --gff-tx-tag STR GTF/GFF tag (in col. 
9) for transcript ID (default: " << IndexingParameters::gff_transcript_tag << ")" << endl + << " logging and computation:" << endl + << " -T, --tmp-dir DIR temporary directory to use for intermediate files" << endl + << " -M, --target-mem MEM target max memory usage (not exact, formatted INT[kMG])" << endl + << " (default: 1/2 of available)" << endl +// TODO: hiding this now that we have rewinding options, since detailed args aren't really in the spirit of this subcommand +// << " --gbwt-buffer-size NUM GBWT construction buffer size in millions of nodes; may need to be" << endl +// << " increased for graphs with long haplotypes (default: " << IndexingParameters::gbwt_insert_batch_size / gbwt::MILLION << ")" << endl +// << " --gcsa-size-limit NUM limit on size of GCSA2 temporary files on disk in bytes" << endl + << " -t, --threads NUM number of threads (default: all available)" << endl + << " -V, --verbosity NUM log to stderr (0 = none, 1 = basic, 2 = debug; default " << (int) IndexingParameters::verbosity << ")" << endl + //<< " -d, --dot print the dot-formatted graph of index recipes and exit" << endl + << " -h, --help print this help message to stderr and exit" << endl; +} + +int main_autoindex(int argc, char** argv) { + + if (argc == 2) { + help_autoindex(argv); + return 1; + } + +#define OPT_KEEP_INTERMEDIATE 1000 +#define OPT_FORCE_UNPHASED 1001 +#define OPT_FORCE_PHASED 1002 +#define OPT_GBWT_BUFFER_SIZE 1003 +#define OPT_GCSA_SIZE_LIMIT 1004 + + // load the registry + IndexRegistry registry = VGIndexes::get_vg_index_registry(); + bool print_dot = false; + vector targets; + vector vcf_names; + bool force_unphased = false; + bool force_phased = false; + int64_t target_mem_usage = IndexRegistry::get_system_memory() / 2; + + string gfa_name; + + int c; + optind = 2; // force optind past command positional argument + while (true) { + static struct option long_options[] = + { + {"prefix", required_argument, 0, 'p'}, + {"workflow", required_argument, 0, 'w'}, + {"ref-fasta", required_argument, 0, 'r'}, + {"vcf", required_argument, 0, 'v'}, + {"ins-fasta", required_argument, 0, 'i'}, + {"gfa", required_argument, 0, 'g'}, + {"tx-gff", required_argument, 0, 'x'}, + {"hap-tx-gff", required_argument, 0, 'H'}, + {"gff-feature", required_argument, 0, 'f'}, + {"gff-tx-tag", required_argument, 0, 'a'}, + {"provide", required_argument, 0, 'P'}, + {"request", required_argument, 0, 'R'}, + {"target-mem", required_argument, 0, 'M'}, + {"gbwt-buffer-size", required_argument, 0, OPT_GBWT_BUFFER_SIZE}, + {"gcsa-size-limit", required_argument, 0, OPT_GCSA_SIZE_LIMIT}, + {"tmp-dir", required_argument, 0, 'T'}, + {"threads", required_argument, 0, 't'}, + {"verbosity", required_argument, 0, 'V'}, + {"dot", no_argument, 0, 'd'}, + {"help", no_argument, 0, 'h'}, + {"keep-intermediate", no_argument, 0, OPT_KEEP_INTERMEDIATE}, + {"force-unphased", no_argument, 0, OPT_FORCE_UNPHASED}, + {"force-phased", no_argument, 0, OPT_FORCE_PHASED}, + {0, 0, 0, 0} + }; + + int option_index = 0; + c = getopt_long (argc, argv, "p:w:r:v:i:g:x:H:a:P:R:f:M:T:t:dV:h", + long_options, &option_index); + + // Detect the end of the options. 
+ if (c == -1) + break; + + switch (c) + { + case 'p': + registry.set_prefix(optarg); + break; + case 'w': + if (optarg == string("map")) { + for (auto& target : VGIndexes::get_default_map_indexes()) { + targets.emplace_back(move(target)); + } + } + else if (optarg == string("mpmap")) { + for (auto& target : VGIndexes::get_default_mpmap_indexes()) { + targets.emplace_back(move(target)); + } + } + else if (optarg == string("giraffe")) { + for (auto& target : VGIndexes::get_default_giraffe_indexes()) { + targets.emplace_back(move(target)); + } + } + else if (optarg == string("rpvg")) { + for (auto& target : VGIndexes::get_default_rpvg_indexes()) { + targets.emplace_back(move(target)); + } + } + else { + cerr << "error: Unrecognized workflow (-w): " << optarg << endl; + return 1; + } + break; + case 'r': + registry.provide("Reference FASTA", optarg); + break; + case 'v': + vcf_names.push_back(optarg); + break; + case 'i': + registry.provide("Insertion Sequence FASTA", optarg); + break; + case 'g': + gfa_name = optarg; + break; + case 'x': + registry.provide("GTF/GFF", optarg); + break; + case 'H': + registry.provide("Haplotype GTF/GFF", optarg); + break; + case 'f': + IndexingParameters::gff_feature_name = optarg; + break; + case 'a': + IndexingParameters::gff_transcript_tag = optarg; + break; + case 'P': + { + auto parsed = parse_provide_string(optarg); + registry.provide(parsed.first, parsed.second); + break; + } + case 'R': + targets.emplace_back(optarg); + break; + case 'M': + target_mem_usage = parse_memory_usage(optarg); + break; + case OPT_GBWT_BUFFER_SIZE: + IndexingParameters::gbwt_insert_batch_size = std::max(parse(optarg), 1ul) * gbwt::MILLION; + break; + case 'T': + temp_file::set_dir(optarg); + break; + case 't': + omp_set_num_threads(parse(optarg)); + break; + case 'V': + { + int verbosity = parse(optarg); + if (verbosity < IndexingParameters::None || verbosity > IndexingParameters::Debug) { + cerr << "error: Verbosity (-V) must be integer in {0, 1, 2}: " << optarg << endl; + return 1; + } + IndexingParameters::verbosity = (IndexingParameters::Verbosity) verbosity; + break; + } + case 'd': + print_dot = true; + break; + case OPT_KEEP_INTERMEDIATE: + registry.set_intermediate_file_keeping(true); + break; + case OPT_FORCE_UNPHASED: + force_unphased = true; + break; + case OPT_FORCE_PHASED: + force_phased = true; + break; + case OPT_GCSA_SIZE_LIMIT: + IndexingParameters::gcsa_size_limit = parse(optarg); + break; + case 'h': + help_autoindex(argv); + return 0; + default: + return 1; + } + } + + if (IndexingParameters::verbosity >= IndexingParameters::Basic) { + cerr << "[vg autoindex] Executing command:"; + for (int i = 0; i < argc; ++i) { + cerr << " " << argv[i]; + } + cerr << endl; + } + + assert(!(force_phased && force_unphased)); + + // we have special logic for VCFs to make it friendly to both phased + // and unphased VCF files + if (!vcf_names.empty()) { + // we interpret it as a phased VCF if any of the VCFs have phasing + bool phased = force_phased; + if (!force_unphased) { + for (size_t i = 0; i < vcf_names.size() && !phased; ++i) { + phased = IndexRegistry::vcf_is_phased(vcf_names[i]); + } + } + + for (auto& vcf_name : vcf_names) { + if (phased) { + registry.provide("VCF w/ Phasing", vcf_name); + } + else { + registry.provide("VCF", vcf_name); + } + } + } + + if (!gfa_name.empty()) { + if (IndexRegistry::gfa_has_haplotypes(gfa_name)) { + registry.provide("Reference GFA w/ Haplotypes", gfa_name); + } + else { + registry.provide("Reference GFA", gfa_name); + } + } + + if 
(print_dot) { + // don't index, just visualize the plan + cout << registry.to_dot(targets); + return 0; + } + + registry.set_target_memory_usage(target_mem_usage); + + if (targets.empty()) { + // default to vg map indexes + targets = VGIndexes::get_default_map_indexes(); + } + // deduplicate + sort(targets.begin(), targets.end()); + targets.resize(unique(targets.begin(), targets.end()) - targets.begin()); + + try { + registry.make_indexes(targets); + } + catch (InsufficientInputException ex) { + cerr << "error:[vg autoindex] Input is not sufficient to create indexes" << endl; + cerr << ex.what(); + return 1; + } + + return 0; + +} + +// Register subcommand +static Subcommand vg_autoindex("autoindex", "mapping tool-oriented index construction from interchange formats", PIPELINE, 1, main_autoindex); + diff --git a/src/subcommand/benchmark_main.cpp b/src/subcommand/benchmark_main.cpp index 7a54dab7e75..165400d36c3 100644 --- a/src/subcommand/benchmark_main.cpp +++ b/src/subcommand/benchmark_main.cpp @@ -14,11 +14,8 @@ #include "../benchmark.hpp" #include "../version.hpp" -#include "../vg.hpp" -#include "../xg.hpp" -#include "../algorithms/extract_connecting_graph.hpp" -#include "../algorithms/topological_sort.hpp" -#include "../algorithms/weakly_connected_components.hpp" +#include "../gbwt_extender.hpp" +#include "../gbwt_helper.hpp" @@ -36,6 +33,10 @@ int main_benchmark(int argc, char** argv) { bool show_progress = false; + // Which experiments should we run? + bool sort_and_order_experiment = false; + bool get_sequence_experiment = true; + int c; optind = 2; // force optind past command positional argument while (true) { @@ -86,96 +87,102 @@ int main_benchmark(int argc, char** argv) { // Turn on nested parallelism, so we can parallelize over VCFs and over alignment bands omp_set_nested(1); - // Generate a test graph - VG vg_mut; - for (size_t i = 1; i < 101; i++) { - // It will have 100 nodes - vg_mut.create_node("ACGTACGT", i); - } - size_t bits = 1; - for (size_t i = 1; i < 101; i++) { - for (size_t j = 1; j < 101; j++) { - if ((bits ^ (i + (j << 3))) % 50 == 0) { - // Make some arbitrary edges - vg_mut.create_edge(i, j, false, false); - } - // Shifts and xors make good PRNGs right? 
- bits = bits ^ (bits << 13) ^ j; - } - } - - const VG vg(vg_mut); - - // And a test XG of it - const xg::XG xg_index(vg_mut.graph); - vector results; - results.push_back(run_benchmark("vg::algorithms topological_order", 1000, [&]() { - vector order = algorithms::topological_order(&vg); - assert(order.size() == vg.node_size()); - })); - - results.push_back(run_benchmark("vg::algorithms sort", 1000, [&]() { - vg_mut = vg; - }, [&]() { - algorithms::sort(&vg_mut); - })); + // We're doing long alignments so we need to raise the WFA score caps + WFAExtender::ErrorModel error_model = WFAExtender::default_error_model; + error_model.mismatches.max = std::numeric_limits::max(); + error_model.gaps.max = std::numeric_limits::max(); + error_model.gap_length.max = std::numeric_limits::max(); - results.push_back(run_benchmark("vg::algorithms orient_nodes_forward", 1000, [&]() { - vg_mut = vg; - }, [&]() { - algorithms::orient_nodes_forward(&vg_mut); - })); + size_t node_length = 32; + for (size_t node_count = 10; node_count <= 320; node_count *= 2) { - results.push_back(run_benchmark("vg::algorithms weakly_connected_components", 1000, [&]() { - auto components = algorithms::weakly_connected_components(&vg); - assert(components.size() == 1); - assert(components.front().size() == vg.node_size()); - })); - - results.push_back(run_benchmark("VG::get_node", 1000, [&]() { - for (size_t rep = 0; rep < 100; rep++) { - for (size_t i = 1; i < 101; i++) { - vg_mut.get_node(i); + // Prepare a GBWT of one long path + std::vector paths; + paths.emplace_back(); + for (size_t i = 0; i < node_count; i++) { + paths.back().push_back(gbwt::Node::encode(i + 1, false)); + } + gbwt::GBWT index = get_gbwt(paths); + + // Turn it into a GBWTGraph. + // Make a SequenceSource we will consult later for getting sequence. 
+ gbwtgraph::SequenceSource source; + uint32_t bits = 0xcafebebe; + auto step_rng = [&bits]() { + // Try out + bits = (bits * 73 + 1375) % 477218579; + }; + for (size_t i = 0; i < node_count; i++) { + std::stringstream ss; + for (size_t j = 0; j < node_length; j++) { + // Pick a deterministic character + ss << "ACGT"[bits & 0x3]; + step_rng(); } + source.add_node(i + 1, ss.str()); } - })); - - results.push_back(run_benchmark("algorithms::extract_connecting_graph on xg", 1000, [&]() { - pos_t pos_1 = make_pos_t(55, false, 0); - pos_t pos_2 = make_pos_t(32, false, 0); + // And then make the graph + gbwtgraph::GBWTGraph graph(index, source); - int64_t max_len = 500; + // Decide what we are going to align + pos_t from_pos = make_pos_t(1, false, 3); + pos_t to_pos = make_pos_t(node_count, false, 11); - VG extractor; + // Synthesize a sequence + std::stringstream seq_stream; + seq_stream << source.get_sequence(get_id(from_pos)).substr(get_offset(from_pos) + 1); + for (nid_t i = get_id(from_pos) + 1; i < get_id(to_pos); i++) { + std::string seq = source.get_sequence(i); + // Add some errors + if (bits & 0x1) { + int offset = bits % seq.size(); + step_rng(); + char replacement = "ACGT"[bits & 0x3]; + step_rng(); + if (bits & 0x1) { + seq[offset] = replacement; + } else { + step_rng(); + if (bits & 0x1) { + seq.insert(offset, 1, replacement); + } else { + seq.erase(offset); + } + } + } + step_rng(); + // And keep the sequence + seq_stream << seq; + } + seq_stream << source.get_sequence(get_id(to_pos)).substr(0, get_offset(to_pos)); - auto trans = algorithms::extract_connecting_graph(&xg_index, &extractor, max_len, pos_1, pos_2, true, true); - - })); - - results.push_back(run_benchmark("algorithms::extract_connecting_graph on vg", 1000, [&]() { - pos_t pos_1 = make_pos_t(55, false, 0); - pos_t pos_2 = make_pos_t(32, false, 0); + std::string to_connect = seq_stream.str(); - int64_t max_len = 500; + // Make the Aligner and Extender + Aligner aligner; + WFAExtender extender(graph, aligner, error_model); - VG extractor; + results.push_back(run_benchmark("connect() on " + std::to_string(node_count) + " node sequence", 1, [&]() { + // Do the alignment + WFAAlignment aligned = extender.connect(to_connect, from_pos, to_pos); + // Make sure it succeeded + assert(aligned); + })); + } - auto trans = algorithms::extract_connecting_graph(&vg, &extractor, max_len, pos_1, pos_2, true, true); - - })); - // Do the control against itself results.push_back(run_benchmark("control", 1000, benchmark_control)); + cout << "# Benchmark results for vg " << Version::get_short() << endl; cout << "# runs\ttest(us)\tstddev(us)\tcontrol(us)\tstddev(us)\tscore\terr\tname" << endl; for (auto& result : results) { cout << result << endl; } - + return 0; } diff --git a/src/subcommand/bugs_main.cpp b/src/subcommand/bugs_main.cpp deleted file mode 100644 index c2e35353e5e..00000000000 --- a/src/subcommand/bugs_main.cpp +++ /dev/null @@ -1,79 +0,0 @@ -/** \file bugs_main.cpp - * - * Defines the "vg bugs" subcommand, which shows and opens Github issues. 
- */ - - -#include -#include -#include -#include - -#include - -#include "subcommand.hpp" - -using namespace std; -using namespace vg; -using namespace vg::subcommand; - -void help_bugs(char** argv){ - cerr << "usage: " << argv[0] << " bugs" << endl - << "options: " << endl - << "--new, -n make a new bug report" << endl - << endl; -} - -int main_bugs(int argc, char** argv){ - - bool new_bug = false; - int c; - while (true) { - static struct option long_options[] = - { - {"new", no_argument, 0, 'n'}, - {0, 0, 0, 0} - }; - - int option_index = 0; - c = getopt_long (argc, argv, "n", - long_options, &option_index); - - /* Detect the end of the options. */ - if (c == -1) - break; - - switch (c) - { - case 'n': - new_bug = true; - break; - case 'h': - case '?': - /* getopt_long already printed an error message. */ - help_bugs(argv); - exit(1); - break; - default: - abort (); - } - } - - // What should we open? - string url = new_bug ? string("https://github.com/vgteam/vg/issues/new") : string("https://github.com/vgteam/vg/issues"); - - // What should we run? -#ifdef __APPLE__ - string open_command = "open"; -#else - string open_command = "xdg-open"; -#endif - - // Open the URL in the appropriate browser (which may be lynx or similar) - return system((open_command + " " + url).c_str()); - -} - -// Register subcommand -static Subcommand vg_bugs("bugs", "show or create bugs", DEVELOPMENT, main_bugs); - diff --git a/src/subcommand/call_main.cpp b/src/subcommand/call_main.cpp index c7a6a553547..abd54291401 100644 --- a/src/subcommand/call_main.cpp +++ b/src/subcommand/call_main.cpp @@ -6,194 +6,825 @@ #include #include #include - +#include #include #include #include "subcommand.hpp" - -#include "../option.hpp" - -#include "../vg.hpp" -#include "../support_caller.hpp" - - +#include "../path.hpp" +#include "../graph_caller.hpp" +#include "../integrated_snarl_finder.hpp" +#include "../xg.hpp" +#include "../gbzgraph.hpp" +#include "../gbwtgraph_helper.hpp" +#include +#include +#include using namespace std; using namespace vg; using namespace vg::subcommand; -void help_call(char** argv, ConfigurableParser& parser) { - cerr << "usage: " << argv[0] << " call [options] > output.vcf" << endl - << "Output variant calls in VCF or Loci format given a graph and pileup" << endl - << endl - << "genotyper options:" << endl - - << "general options:" << endl - << " -z, --translation FILE input translation table" << endl - << " -b, --base-graph FILE base graph. currently needed for XREF tag" << endl - << " -h, --help print this help message" << endl - << " -p, --progress show progress" << endl - << " -v, --verbose print information and warnings about vcf generation" << endl - << " -t, --threads N number of threads to use" << endl; - - // Then report more options - parser.print_help(cerr); -} +/// Helper to ensure that a PathHandleGraph has the VectorizableHandleGraph and PathPositionHandleGraph interfaces. 
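/// (For orientation: this overlay typedef is applied further down in main_call when both a
/// pack file and path positions are needed, i.e. when calling against reference paths with
/// vg pack supports; see the need_path_positions / need_vectorizable checks below.)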
+typedef bdsg::PairOverlayHelper ReferencePathVectorizableOverlayHelper; + +void help_call(char** argv) { + cerr << "usage: " << argv[0] << " call [options] > output.vcf" << endl + << "Call variants or genotype known variants" << endl + << endl + << "support calling options:" << endl + << " -k, --pack FILE Supports created from vg pack for given input graph" << endl + << " -m, --min-support M,N Minimum allele support (M) and minimum site support (N) for call [default = 2,4]" << endl + << " -e, --baseline-error X,Y Baseline error rates for Poisson model for small (X) and large (Y) variants [default = 0.005,0.01]" << endl + << " -B, --bias-mode Use old ratio-based genotyping algorithm as opposed to probabilistic model" << endl + << " -b, --het-bias M,N Homozygous alt/ref allele must have >= M/N times more support than the next best allele [default = 6,6]" << endl + << "GAF options:" << endl + << " -G, --gaf Output GAF genotypes instead of VCF" << endl + << " -T, --traversals Output all candidate traversals in GAF without doing any genotyping" << endl + << " -M, --trav-padding N Extend each flank of traversals (from -T) with reference path by N bases if possible" << endl + << "general options:" << endl + << " -v, --vcf FILE VCF file to genotype (must have been used to construct input graph with -a)" << endl + << " -a, --genotype-snarls Genotype every snarl, including reference calls (use to compare multiple samples)" << endl + << " -A, --all-snarls Genotype all snarls, including nested child snarls (like deconstruct -a)" << endl + << " -c, --min-length N Genotype only snarls with reference traversal length >= N" << endl + << " -C, --max-length N Genotype only snarls with reference traversal length <= N" << endl + << " -f, --ref-fasta FILE Reference fasta (required if VCF contains symbolic deletions or inversions)" << endl + << " -i, --ins-fasta FILE Insertions fasta (required if VCF contains symbolic insertions)" << endl + << " -s, --sample NAME Sample name [default=SAMPLE]" << endl + << " -r, --snarls FILE Snarls (from vg snarls) to avoid recomputing." << endl + << " -g, --gbwt FILE Only call genotypes that are present in given GBWT index." << endl + << " -z, --gbz Only call genotypes that are present in GBZ index (applies only if input graph is GBZ)." << endl + << " -N, --translation FILE Node ID translation (as created by vg gbwt --translation) to apply to snarl names in output" << endl + << " -p, --ref-path NAME Reference path to call on (multiple allowed; defaults to all paths)" << endl + << " -S, --ref-sample NAME Call on all paths with given sample name (cannot be used with -p)" << endl + << " -o, --ref-offset N Offset in reference path (multiple allowed, 1 per path)" << endl + << " -l, --ref-length N Override length of reference in the contig field of output VCF" << endl + << " -d, --ploidy N Ploidy of sample. Only 1 and 2 supported. (default: 2)" << endl + << " -R, --ploidy-regex RULES use the given comma-separated list of colon-delimited REGEX:PLOIDY rules to assign" << endl + << " ploidies to contigs not visited by the selected samples, or to all contigs simulated" << endl + << " from if no samples are used. Unmatched contigs get ploidy 2 (or that from -d)."
<< endl + << " -n, --nested Activate nested calling mode (experimental)" << endl + << " -I, --chains Call chains instead of snarls (experimental)" << endl + << " -t, --threads N number of threads to use" << endl; +} int main_call(int argc, char** argv) { + string pack_filename; + string vcf_filename; + string sample_name = "SAMPLE"; + string snarl_filename; + string gbwt_filename; + bool gbz_paths = false; string translation_file_name; + bool gbz_translation = false; + string ref_fasta_filename; + string ins_fasta_filename; + vector ref_paths; + string ref_sample; + vector ref_path_offsets; + vector ref_path_lengths; + string min_support_string; + string baseline_error_string; + string bias_string; + // require at least some support for all breakpoint edges + // increases sv precision, but at some recall cost. + // think this is worth leaving on by default and not adding an option (famous last words) + bool expect_bp_edges = true; + bool ratio_caller = false; + bool legacy = false; + int ploidy = 2; + // copied over from vg sim + std::vector> ploidy_rules; + + bool traversals_only = false; + bool gaf_output = false; + size_t trav_padding = 0; + bool genotype_snarls = false; + bool nested = false; + bool call_chains = false; + bool all_snarls = false; + int64_t min_ref_allele_len = 0; + int64_t max_ref_allele_len = numeric_limits::max(); - // todo: get rid of this. only used to check if deletion edge is novel. must be some - // way to get that out of the translations - string base_graph_file_name; + // constants + const size_t avg_trav_threshold = 50; + const size_t avg_node_threshold = 50; + const size_t min_depth_bin_width = 50; + const size_t max_depth_bin_width = 50000000; + const double depth_scale_fac = 1.5; + const size_t max_yens_traversals = traversals_only ? 100 : 50; + // used to merge up snarls from chains when generating traversals + const size_t max_chain_edges = 1000; + const size_t max_chain_trivial_travs = 5; - // This manages conversion from an augmented graph to a VCF, and makes the - // actual calls.
- SupportCaller support_caller; - - bool show_progress = false; - int thread_count = 0; - - static const struct option long_options[] = { - {"base-graph", required_argument, 0, 'b'}, - {"translation", required_argument, 0, 'z'}, - {"progress", no_argument, 0, 'p'}, - {"verbose", no_argument, 0, 'v'}, - {"threads", required_argument, 0, 't'}, - {"help", no_argument, 0, 'h'}, - {0, 0, 0, 0} - }; - static const char* short_options = "z:b:pvt:h"; - optind = 2; // force optind past command positional arguments + int c; + optind = 2; // force optind past command positional argument + while (true) { + + static const struct option long_options[] = { + {"pack", required_argument, 0, 'k'}, + {"bias-mode", no_argument, 0, 'B'}, + {"baseline-error", required_argument, 0, 'e'}, + {"het-bias", required_argument, 0, 'b'}, + {"min-support", required_argument, 0, 'm'}, + {"vcf", required_argument, 0, 'v'}, + {"genotype-snarls", no_argument, 0, 'a'}, + {"all-snarls", no_argument, 0, 'A'}, + {"min-length", required_argument, 0, 'c'}, + {"max-length", required_argument, 0, 'C'}, + {"ref-fasta", required_argument, 0, 'f'}, + {"ins-fasta", required_argument, 0, 'i'}, + {"sample", required_argument, 0, 's'}, + {"snarls", required_argument, 0, 'r'}, + {"gbwt", required_argument, 0, 'g'}, + {"gbz", no_argument, 0, 'z'}, + {"translation", required_argument, 0, 'N'}, + {"gbz-translation", no_argument, 0, 'O'}, + {"ref-path", required_argument, 0, 'p'}, + {"ref-sample", required_argument, 0, 'S'}, + {"ref-offset", required_argument, 0, 'o'}, + {"ref-length", required_argument, 0, 'l'}, + {"ploidy", required_argument, 0, 'd'}, + {"ploidy-regex", required_argument, 0, 'R'}, + {"gaf", no_argument, 0, 'G'}, + {"traversals", no_argument, 0, 'T'}, + {"min-trav-len", required_argument, 0, 'M'}, + {"legacy", no_argument, 0, 'L'}, + {"nested", no_argument, 0, 'n'}, + {"chains", no_argument, 0, 'I'}, + {"threads", required_argument, 0, 't'}, + {"help", no_argument, 0, 'h'}, + {0, 0, 0, 0} + }; + + int option_index = 0; + + c = getopt_long (argc, argv, "k:Be:b:m:v:aAc:C:f:i:s:r:g:zN:Op:S:o:l:d:R:GTLM:nt:h", + long_options, &option_index); + + // Detect the end of the options. + if (c == -1) + break; - // This is our command-line parser - ConfigurableParser parser(short_options, long_options, [&](int c) { - // Parse all the options we have defined here. 
switch (c) { - case 'z': - translation_file_name = optarg; + case 'k': + pack_filename = optarg; + break; + case 'B': + ratio_caller = true; break; case 'b': - base_graph_file_name = optarg; + bias_string = optarg; break; - case 'p': - show_progress = true; + case 'm': + min_support_string = optarg; break; + case 'e': + baseline_error_string = optarg; + break; case 'v': - support_caller.verbose = true; + vcf_filename = optarg; + break; + case 'a': + genotype_snarls = true; + break; + case 'A': + all_snarls = true; + break; + case 'c': + min_ref_allele_len = parse(optarg); + break; + case 'C': + max_ref_allele_len = parse(optarg); + break; + case 'f': + ref_fasta_filename = optarg; + break; + case 'i': + ins_fasta_filename = optarg; + break; + case 's': + sample_name = optarg; + break; + case 'r': + snarl_filename = optarg; + break; + case 'g': + gbwt_filename = optarg; + break; + case 'z': + gbz_paths = true; + break; + case 'N': + translation_file_name = optarg; + break; + case 'O': + gbz_translation = true; + break; + case 'p': + ref_paths.push_back(optarg); break; + case 'S': + ref_sample = optarg; + break; + case 'o': + ref_path_offsets.push_back(parse(optarg)); + break; + case 'l': + ref_path_lengths.push_back(parse(optarg)); + break; + case 'd': + ploidy = parse(optarg); + break; + case 'R': + for (auto& rule : split_delims(optarg, ",")) { + // For each comma-separated rule + auto parts = split_delims(rule, ":"); + if (parts.size() != 2) { + cerr << "error: ploidy rules must be REGEX:PLOIDY" << endl; + exit(1); + } + try { + // Parse the regex + std::regex match(parts[0]); + size_t weight = parse(parts[1]); + // Save the rule + ploidy_rules.emplace_back(match, weight); + } catch (const std::regex_error& e) { + // This is not a good regex + cerr << "error: unacceptable regular expression \"" << parts[0] << "\": " << e.what() << endl; + exit(1); + } + } + break; + case 'G': + gaf_output = true; + break; + case 'T': + traversals_only = true; + gaf_output = true; + break; + case 'M': + trav_padding = parse(optarg); + break; + case 'L': + legacy = true; + break; + case 'n': + nested =true; + break; + case 'I': + call_chains =true; + break; case 't': - thread_count = parse(optarg); + { + int num_threads = parse(optarg); + if (num_threads <= 0) { + cerr << "error:[vg call] Thread count (-t) set to " << num_threads << ", must set to a positive integer." << endl; + exit(1); + } + omp_set_num_threads(num_threads); break; + } case 'h': case '?': /* getopt_long already printed an error message. */ - help_call(argv, parser); + help_call(argv); exit(1); break; default: - abort (); + abort (); } - }); - // Register the support_caller converter for configuring with its options. - parser.register_configurable(&support_caller); + } - if (argc <= 3) { - help_call(argv, parser); + if (argc <= 2) { + help_call(argv); return 1; } - - // Parse the command line options, updating optind. 
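// For orientation, a hedged example of the -R/--ploidy-regex rules parsed in the case above
// (file and contig names here are hypothetical): a command along the lines of
//   vg call graph.xg -k aln.pack -R "chrX:1,chrY:1,chrM:1" -s SAMPLE > calls.vcf
// would genotype any reference path whose full name matches chrX, chrY or chrM as haploid,
// while all other paths keep the default ploidy from -d (2). Rules are tried in order and
// the first full regex match wins (see the ref_path_ploidies table built further below).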
- parser.parse(argc, argv); - if (thread_count != 0) { - // Use a non-default number of threads - omp_set_num_threads(thread_count); + // parse the supports (stick together to keep number of options down) + vector support_toks = split_delims(min_support_string, ","); + double min_allele_support = -1; + double min_site_support = -1; + if (support_toks.size() >= 1) { + min_allele_support = parse(support_toks[0]); + min_site_support = min_allele_support; + } + if (support_toks.size() == 2) { + min_site_support = parse(support_toks[1]); + } else if (support_toks.size() > 2) { + cerr << "error [vg call]: -m option expects at most two comma separated numbers M,N" << endl; + return 1; + } + // parse the biases + vector bias_toks = split_delims(bias_string, ","); + double het_bias = -1; + double ref_het_bias = -1; + if (bias_toks.size() >= 1) { + het_bias = parse(bias_toks[0]); + ref_het_bias = het_bias; + } + if (bias_toks.size() == 2) { + ref_het_bias = parse(bias_toks[1]); + } else if (bias_toks.size() > 2) { + cerr << "error [vg call]: -b option expects at most two comma separated numbers M,N" << endl; + return 1; + } + // parse the baseline errors (defaults are in snarl_caller.hpp) + vector error_toks = split_delims(baseline_error_string, ","); + double baseline_error_large = -1; + double baseline_error_small = -1; + if (error_toks.size() == 2) { + baseline_error_small = parse(error_toks[0]); + baseline_error_large = parse(error_toks[1]); + if (baseline_error_small > baseline_error_large) { + cerr << "warning [vg call]: with baseline error -e X,Y option, small variant error (X) normally less than large (Y)" << endl; + } + } else if (error_toks.size() != 0) { + cerr << "error [vg call]: -e option expects exactly two comma-separated numbers X,Y" << endl; + return 1; + } + + if (trav_padding > 0 && traversals_only == false) { + cerr << "error [vg call]: -M option can only be used in conjunction with -T" << endl; + return 1; } - thread_count = get_thread_count(); - // Parse the arguments - if (optind >= argc) { - help_call(argv, parser); + if (!vcf_filename.empty() && genotype_snarls) { + cerr << "error [vg call]: -v and -a options cannot be used together" << endl; return 1; } - string graph_file_name = get_input_file_name(optind, argc, argv); - if (string(support_caller.support_file_name).empty()) { - cerr << "[vg call]: Support file must be specified with -s" << endl; + if ((min_ref_allele_len > 0 || max_ref_allele_len < numeric_limits::max()) && (legacy || !vcf_filename.empty() || nested)) { + cerr << "error [vg call]: -c/-C no supported with -v, -l or -n" << endl; + return 1; + } + if (!ref_paths.empty() && !ref_sample.empty()) { + cerr << "error [vg call]: -S cannot be used with -p" << endl; return 1; } - if (translation_file_name.empty()) { - cerr << "[vg call]: Translation file must be specified with -Z" << endl; + // Read the graph + unique_ptr path_handle_graph; + unique_ptr gbz_graph; + gbwt::GBWT* gbwt_index = nullptr; + PathHandleGraph* graph = nullptr; + string graph_filename = get_input_file_name(optind, argc, argv); + auto input = vg::io::VPKG::try_load_first(graph_filename); + if (get<0>(input)) { + gbz_graph = std::move(get<0>(input)); + graph = gbz_graph.get(); + if (gbz_paths) { + gbwt_index = &gbz_graph->gbz.index; + } + } else if (get<1>(input)) { + path_handle_graph = std::move(get<1>(input)); + graph = path_handle_graph.get(); + } else { + cerr << "Error [vg call]: Input graph is not a GBZ or path handle graph" << endl; return 1; } + if (gbz_paths && !gbz_graph) { + cerr << 
"Error [vg call]: -z can only be used when input graph is in GBZ format" << endl; + return 1; + } + if (gbz_translation && !gbz_graph) { + cerr << "Error [vg call]: -O can only be used when input graph is in GBZ format" << endl; + return 1; + } + + // Read the translation + unique_ptr>> translation; + if (gbz_graph.get() != nullptr && gbz_translation) { + // try to get the translation from the graph + translation = make_unique>>(); + *translation = load_translation_back_map(gbz_graph->gbz.graph); + if (translation->empty()) { + // not worth keeping an empty translation + translation = nullptr; + } + } + if (!translation_file_name.empty()) { + if (!translation->empty()) { + cerr << "Warning [vg call]: Using translation from -N overrides that in input GBZ (you probably don't want to use -N)" << endl; + } + ifstream translation_file(translation_file_name.c_str()); + if (!translation_file) { + cerr << "Error [vg call]: Unable to load translation file: " << translation_file_name << endl; + return 1; + } + translation = make_unique>>(); + *translation = load_translation_back_map(*graph, translation_file); + } + + // Apply overlays as necessary + bool need_path_positions = vcf_filename.empty(); + bool need_vectorizable = !pack_filename.empty(); + bdsg::ReferencePathOverlayHelper pp_overlay_helper; + ReferencePathVectorizableOverlayHelper ppv_overlay_helper; + bdsg::PathVectorizableOverlayHelper pv_overlay_helper; + if (need_path_positions && need_vectorizable) { + graph = dynamic_cast(ppv_overlay_helper.apply(graph)); + } else if (need_path_positions && !need_vectorizable) { + graph = dynamic_cast(pp_overlay_helper.apply(graph)); + } else if (!need_path_positions && need_vectorizable) { + graph = dynamic_cast(pv_overlay_helper.apply(graph)); + } - if (base_graph_file_name.empty()) { - cerr << "[vg call]: Base graph file must be specified with -b" << endl; + // Check our offsets + if (ref_path_offsets.size() != 0 && ref_path_offsets.size() != ref_paths.size()) { + cerr << "error [vg call]: when using -o, the same number paths must be given with -p" << endl; return 1; } - - // read the graph - if (show_progress) { - cerr << "Reading input graph" << endl; + if (!ref_path_offsets.empty() && !vcf_filename.empty()) { + cerr << "error [vg call]: -o cannot be used with -v" << endl; + return 1; + } + // Check our ref lengths + if (ref_path_lengths.size() != 0 && ref_path_lengths.size() != ref_paths.size()) { + cerr << "error [vg call]: when using -l, the same number paths must be given with -p" << endl; + return 1; + } + // Check bias option + if (!bias_string.empty() && !ratio_caller) { + cerr << "error [vg call]: -b can only be used with -B" << endl; + return 1; + } + // Check ploidy option + if (ploidy < 1 || ploidy > 2) { + cerr << "error [vg call]: ploidy (-d) must be either 1 or 2" << endl; + return 1; + } + if (ratio_caller == true && ploidy != 2) { + cerr << "error [vg call]: ploidy (-d) must be 2 when using ratio caller (-B)" << endl; + return 1; + } + if (legacy == true && ploidy != 2) { + cerr << "error [vg call]: ploidy (-d) must be 2 when using legacy caller (-L)" << endl; + return 1; + } + if (!vcf_filename.empty() && !gbwt_filename.empty()) { + cerr << "error [vg call]: gbwt (-g) cannot be used when genotyping VCF (-v)" << endl; + return 1; + } + if (legacy == true && !gbwt_filename.empty()) { + cerr << "error [vg call]: gbwt (-g) cannot be used with legacy caller (-L)" << endl; + return 1; + } + if (gbz_paths && !gbwt_filename.empty()) { + cerr << "error [vg call]: gbwt (-g) cannot be used 
with gbz graph (-z): choose one or the other" << endl; + return 1; } - VG* graph; - get_input_file(graph_file_name, [&](istream& in) { - graph = new VG(in); - }); - // and the base graph - VG* base_graph = NULL; - get_input_file(base_graph_file_name, [&](istream& in) { - base_graph = new VG(in); - }); + // in order to add subpath support, we let all ref_paths be subpaths and then convert coordinates + // on vcf export. the exception is writing the header where we need base paths. we keep + // track of them the best we can here (just for writing the ##contigs) + unordered_map basepath_length_map; - SupportAugmentedGraph augmented_graph; - // Move our input graph into the augmented graph - // TODO: less terrible interface. also shouldn't have to re-index. - swap(augmented_graph.graph, *graph); - swap(augmented_graph.graph.paths, graph->paths); - augmented_graph.graph.paths.rebuild_node_mapping(); - augmented_graph.graph.paths.rebuild_mapping_aux(); - augmented_graph.graph.paths.to_graph(augmented_graph.graph.graph); + // call doesn't always require path positions .. .don't change that now + function compute_path_length = [&] (path_handle_t path_handle) { + PathPositionHandleGraph* pp_graph = dynamic_cast(graph); + if (pp_graph) { + return pp_graph->get_path_length(path_handle); + } else { + size_t len = 0; + graph->for_each_step_in_path(path_handle, [&] (step_handle_t step) { + len += graph->get_length(graph->get_handle_of_step(step)); + }); + return len; + } + }; + + // No paths specified: use them all + if (ref_paths.empty()) { + set ref_sample_names; + graph->for_each_path_handle([&](path_handle_t path_handle) { + const string& name = graph->get_path_name(path_handle); + PathSense path_sense = PathMetadata::parse_sense(name); + if (!Paths::is_alt(name) && path_sense != PathSense::HAPLOTYPE) { + string sample_name = PathMetadata::parse_sample_name(name); + if (ref_sample.empty() || sample_name == ref_sample) { + ref_paths.push_back(name); + // keep track of length best we can using maximum coordinate in event of subpaths + subrange_t subrange; + string base_name = Paths::strip_subrange(name, &subrange); + size_t offset = subrange == PathMetadata::NO_SUBRANGE ? 0 : subrange.first; + size_t& cur_len = basepath_length_map[base_name]; + cur_len = max(cur_len, compute_path_length(path_handle) + offset); + if (sample_name != PathMetadata::NO_SAMPLE_NAME) { + ref_sample_names.insert(sample_name); + } + } + } + }); + if (ref_sample_names.size() > 1 && ref_sample.empty()) { + cerr << "error [vg call]: Multiple reference samples detected: ["; + size_t count = 0; + for (const string& n : ref_sample_names) { + cerr << n; + if (++count >= std::min(ref_sample_names.size(), (size_t)5)) { + if (ref_sample_names.size() > 5) { + cerr << ", ..."; + } + break; + } else { + cerr << ", "; + } + } + cerr << "]. Please use -S to specify a single reference sample or use -p to specify reference paths"; + return 1; + } + } else { + // if paths are given, we convert them to subpaths so that ref paths list corresponds + // to path names in graph. 
subpath handling will only be handled when writing the vcf + // (this way, we add subpath support without changing anything in between) + vector ref_subpaths; + unordered_map ref_path_set; + for (const string& ref_path : ref_paths) { + ref_path_set[ref_path] = false; + } + graph->for_each_path_handle([&](path_handle_t path_handle) { + const string& name = graph->get_path_name(path_handle); + subrange_t subrange; + string base_name = Paths::strip_subrange(name, &subrange); + size_t offset = subrange == PathMetadata::NO_SUBRANGE ? 0 : subrange.first; + if (ref_path_set.count(base_name)) { + ref_subpaths.push_back(name); + // keep track of length best we can + if (ref_path_lengths.empty()) { + size_t& cur_len = basepath_length_map[base_name]; + cur_len = max(cur_len, compute_path_length(path_handle) + offset); + } + ref_path_set[base_name] = true; + } + }); + + // if we have reference lengths, great! this will be the only way to get a correct header in the presence of supbpaths + if (!ref_path_lengths.empty()) { + assert(ref_path_lengths.size() == ref_paths.size()); + for (size_t i = 0; i < ref_paths.size(); ++i) { + basepath_length_map[ref_paths[i]] = ref_path_lengths[i]; + } + } - augmented_graph.base_graph = base_graph; + // Check our paths + for (const auto& ref_path_used : ref_path_set) { + if (!ref_path_used.second) { + cerr << "error [vg call]: Reference path \"" << ref_path_used.first << "\" not found in graph" << endl; + return 1; + } + } + + swap(ref_paths, ref_subpaths); + } + + // build table of ploidys + vector ref_path_ploidies; + for (const string& ref_path : ref_paths) { + int path_ploidy = ploidy; + for (auto& rule : ploidy_rules) { + if (std::regex_match(ref_path, rule.first)) { + path_ploidy = rule.second; + break; + } + } + ref_path_ploidies.push_back(path_ploidy); + } + + // Load or compute the snarls + unique_ptr snarl_manager; + if (!snarl_filename.empty()) { + ifstream snarl_file(snarl_filename.c_str()); + if (!snarl_file) { + cerr << "Error [vg call]: Unable to load snarls file: " << snarl_filename << endl; + return 1; + } + snarl_manager = vg::io::VPKG::load_one(snarl_file); + } else { + IntegratedSnarlFinder finder(*graph); + snarl_manager = unique_ptr(new SnarlManager(std::move(finder.find_snarls_parallel()))); + } - delete graph; + // Make a Packed Support Caller + unique_ptr snarl_caller; + algorithms::BinnedDepthIndex depth_index; - // Load the supports - ifstream support_file(support_caller.support_file_name); - if (!support_file) { - cerr << "[vg call]: Unable to load supports file: " - << string(support_caller.support_file_name) << endl; - return 1; + unique_ptr packer; + unique_ptr support_finder; + if (!pack_filename.empty()) { + // Load our packed supports (they must have come from vg pack on graph) + packer = unique_ptr(new Packer(graph)); + packer->load_from_file(pack_filename); + if (nested) { + // Make a nested packed traversal support finder (using cached veresion important for poisson caller) + support_finder.reset(new NestedCachedPackedTraversalSupportFinder(*packer, *snarl_manager)); + } else { + // Make a packed traversal support finder (using cached veresion important for poisson caller) + support_finder.reset(new CachedPackedTraversalSupportFinder(*packer, *snarl_manager)); + } + + // need to use average support when genotyping as small differences in between sample and graph + // will lead to spots with 0-support, espeically in and around SVs. 
+ support_finder->set_support_switch_threshold(avg_trav_threshold, avg_node_threshold); + + // upweight breakpoint edges even when taking average support otherwise + support_finder->set_min_bp_edge_override(expect_bp_edges); + + // todo: toggle between min / average (or thresholds) via command line + + SupportBasedSnarlCaller* packed_caller = nullptr; + + if (ratio_caller == false) { + // Make a depth index + depth_index = algorithms::binned_packed_depth_index(*packer, ref_paths, min_depth_bin_width, max_depth_bin_width, + depth_scale_fac, 0, true, true); + // Make a new-stype probablistic caller + auto poisson_caller = new PoissonSupportSnarlCaller(*graph, *snarl_manager, *support_finder, depth_index, + //todo: qualities need to be used better in conjunction with + //expected depth. + //packer->has_qualities()); + false); + + // Pass the errors through + poisson_caller->set_baseline_error(baseline_error_small, baseline_error_large); + + packed_caller = poisson_caller; + } else { + // Make an old-style ratio support caller + auto ratio_caller = new RatioSupportSnarlCaller(*graph, *snarl_manager, *support_finder); + if (het_bias >= 0) { + ratio_caller->set_het_bias(het_bias, ref_het_bias); + } + packed_caller = ratio_caller; + } + if (min_allele_support >= 0) { + packed_caller->set_min_supports(min_allele_support, min_allele_support, min_site_support); + } + + snarl_caller = unique_ptr(packed_caller); } - augmented_graph.load_supports(support_file); - // Load the translations - ifstream translation_file(translation_file_name.c_str()); - if (!translation_file) { - cerr << "[vg call]: Unable to load translations file: " << translation_file_name << endl; + if (!snarl_caller) { + cerr << "error [vg call]: pack file (-k) is required" << endl; return 1; } - augmented_graph.load_translations(translation_file); - - if (show_progress) { - cerr << "Calling variants with support caller" << endl; + + unique_ptr alignment_emitter; + if (gaf_output) { + alignment_emitter = vg::io::get_non_hts_alignment_emitter("-", "GAF", {}, get_thread_count(), graph); } - // project the augmented graph to a reference path - // in order to create a VCF of calls. 
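// Rough intuition for the two genotyping models configured above (a sketch, not the exact
// code inside the callers): with -B and -b M,N the ratio caller only allows a homozygous
// call when the best allele's support dominates the runner-up by the bias factor, e.g. with
// hypothetical supports
//   double best = 30, second = 4, het_bias = 6;
//   bool hom_allowed = (best >= het_bias * second);   // 30 >= 24, so a hom call is allowed
// whereas the default PoissonSupportSnarlCaller compares observed support against expected
// coverage from the binned depth index, with the -e small/large baseline error rates.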
- support_caller.call(augmented_graph, {}); + unique_ptr graph_caller; + unique_ptr traversal_finder; + unique_ptr gbwt_index_up; - delete base_graph; + vcflib::VariantCallFile variant_file; + unique_ptr ref_fasta; + unique_ptr ins_fasta; + if (!vcf_filename.empty()) { + // Genotype the VCF + variant_file.parseSamples = false; + variant_file.open(vcf_filename); + if (!variant_file.is_open()) { + cerr << "error: [vg call] could not open " << vcf_filename << endl; + return 1; + } + + // load up the fasta + if (!ref_fasta_filename.empty()) { + ref_fasta = unique_ptr(new FastaReference); + ref_fasta->open(ref_fasta_filename); + } + if (!ins_fasta_filename.empty()) { + ins_fasta = unique_ptr(new FastaReference); + ins_fasta->open(ins_fasta_filename); + } + + VCFGenotyper* vcf_genotyper = new VCFGenotyper(*graph, *snarl_caller, + *snarl_manager, variant_file, + sample_name, ref_paths, ref_path_ploidies, + ref_fasta.get(), + ins_fasta.get(), + alignment_emitter.get(), + traversals_only, + gaf_output, + trav_padding); + graph_caller = unique_ptr(vcf_genotyper); + } else if (legacy) { + // de-novo caller (port of the old vg call code, which requires a support based caller) + LegacyCaller* legacy_caller = new LegacyCaller(*dynamic_cast(graph), + *dynamic_cast(snarl_caller.get()), + *snarl_manager, + sample_name, ref_paths, ref_path_offsets, ref_path_ploidies); + graph_caller = unique_ptr(legacy_caller); + } else { + // flow caller can take any kind of traversal finder. two are supported for now: + + if (!gbwt_filename.empty() || gbz_paths) { + // GBWT traversals + if (!gbz_paths) { + gbwt_index_up = vg::io::VPKG::load_one(gbwt_filename); + gbwt_index = gbwt_index_up.get(); + if (gbwt_index == nullptr) { + cerr << "error:[vg call] unable to load gbwt index file: " << gbwt_filename << endl; + return 1; + } + } + GBWTTraversalFinder* gbwt_traversal_finder = new GBWTTraversalFinder(*graph, *gbwt_index); + traversal_finder = unique_ptr(gbwt_traversal_finder); + } else { + // Flow traversals (Yen's algorithm) + + // todo: do we ever want to toggle in min-support? 
+ function node_support = [&] (handle_t h) { + return support_finder->support_val(support_finder->get_avg_node_support(graph->get_id(h))); + }; + + function edge_support = [&] (edge_t e) { + return support_finder->support_val(support_finder->get_edge_support(e)); + }; + + // create the flow traversal finder + FlowTraversalFinder* flow_traversal_finder = new FlowTraversalFinder(*graph, *snarl_manager, max_yens_traversals, + node_support, edge_support); + traversal_finder = unique_ptr(flow_traversal_finder); + } + + if (nested) { + graph_caller.reset(new NestedFlowCaller(*dynamic_cast(graph), + *dynamic_cast(snarl_caller.get()), + *snarl_manager, + sample_name, *traversal_finder, ref_paths, ref_path_offsets, + ref_path_ploidies, + alignment_emitter.get(), + traversals_only, + gaf_output, + trav_padding, + genotype_snarls)); + } else { + graph_caller.reset(new FlowCaller(*dynamic_cast(graph), + *dynamic_cast(snarl_caller.get()), + *snarl_manager, + sample_name, *traversal_finder, ref_paths, ref_path_offsets, + ref_path_ploidies, + alignment_emitter.get(), + traversals_only, + gaf_output, + trav_padding, + genotype_snarls, + make_pair(min_ref_allele_len, max_ref_allele_len))); + } + } + + string header; + if (!gaf_output) { + // Init The VCF + VCFOutputCaller* vcf_caller = dynamic_cast(graph_caller.get()); + assert(vcf_caller != nullptr); + // Make sure we get the LV/PS tags with -A + vcf_caller->set_nested(all_snarls); + vcf_caller->set_translation(translation.get()); + // Make sure the basepath information we inferred above goes directy to the VCF header + // (and that it does *not* try to read it from the graph paths) + vector header_ref_paths; + vector header_ref_lengths; + bool need_overrides = dynamic_cast(graph_caller.get()) == nullptr; + for (const auto& path_len : basepath_length_map) { + header_ref_paths.push_back(path_len.first); + if (need_overrides) { + header_ref_lengths.push_back(path_len.second); + } + } + header = vcf_caller->vcf_header(*graph, header_ref_paths, header_ref_lengths); + } + + // Call the graph + if (!call_chains) { + + // Call each snarl + // (todo: try chains in normal mode) + graph_caller->call_top_level_snarls(*graph, all_snarls ? GraphCaller::RecurseAlways : GraphCaller::RecurseOnFail); + } else { + // Attempt to call chains instead of snarls so that the output traversals are longer + // Todo: this could probably help in some cases when making VCFs too + graph_caller->call_top_level_chains(*graph, max_chain_edges, max_chain_trivial_travs, + all_snarls ? GraphCaller::RecurseAlways : GraphCaller::RecurseOnFail); + } + + if (!gaf_output) { + // Output VCF + VCFOutputCaller* vcf_caller = dynamic_cast(graph_caller.get()); + assert(vcf_caller != nullptr); + cout << header << flush; + vcf_caller->write_variants(cout, snarl_manager.get()); + } + return 0; } // Register subcommand -static Subcommand vg_call("call", "call variants on an augmented graph", PIPELINE, 5, main_call); +static Subcommand vg_call("call", "call or genotype VCF variants", PIPELINE, 10, main_call); diff --git a/src/subcommand/chain_main.cpp b/src/subcommand/chain_main.cpp new file mode 100644 index 00000000000..10a78c58708 --- /dev/null +++ b/src/subcommand/chain_main.cpp @@ -0,0 +1,268 @@ +/** \file chain_main.cpp + * + * Defines the "vg chain" subcommand, which runs a serialized hit chaining problem. 
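 *
 * For orientation, a minimal problem file accepted by the JSON loader below might look
 * roughly like this (illustrative only; the field layout is inferred from the json_unpack
 * calls, with IDs and offsets given as strings and booleans as 0/1):
 *
 *   {"subgraph": {"node": [{"id": "1", "sequence": "GATTACA"},
 *                          {"id": "2", "sequence": "CAT"}],
 *                 "edge": [{"from": "1", "to": "2"}]},
 *    "items": [{"read_start": "0", "read_end": "7", "score": 7,
 *               "graph_start": {"node_id": "1", "offset": "0"},
 *               "graph_end":   {"node_id": "1", "offset": "7"}}]}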
+ */ + +#include + +#include +#include +#include + +#include + +#include "subcommand.hpp" + +#include "../algorithms/chain_items.hpp" +#include "../integrated_snarl_finder.hpp" + +//#define USE_CALLGRIND + +#ifdef USE_CALLGRIND +#include +#endif + +using namespace std; +using namespace vg; +using namespace vg::subcommand; + +void help_chain(char** argv) { + cerr << "usage: " << argv[0] << " chain [options] input.json" << endl + << "options:" << endl + << " -p, --progress show progress" << endl; +} + +int main_chain(int argc, char** argv) { + + bool show_progress = false; + + + + int c; + optind = 2; // force optind past command positional argument + while (true) { + static struct option long_options[] = + { + {"progress", no_argument, 0, 'p'}, + {"help", no_argument, 0, 'h'}, + {0, 0, 0, 0} + }; + + int option_index = 0; + c = getopt_long (argc, argv, "ph?", + long_options, &option_index); + + /* Detect the end of the options. */ + if (c == -1) + break; + + switch (c) + { + + case 'p': + show_progress = true; + break; + + case 'h': + case '?': + /* getopt_long already printed an error message. */ + help_chain(argv); + exit(1); + break; + + default: + abort (); + + } + } + + if (optind == argc) { + // No positional arguments + help_chain(argv); + exit(1); + } + + string problem_filename = get_input_file_name(optind, argc, argv); + + if (optind != argc) { + // Too many positional arguments + help_chain(argv); + exit(1); + } + + // Load up the JSON file we are supposed to have. + json_error_t json_error; + json_t* problem_json = json_load_file(problem_filename.c_str(), 0, &json_error); + if (!problem_json) { + throw std::runtime_error(json_error.text); + } + assert(json_is_object(problem_json)); + + if (show_progress) { + std::cerr << "Loaded problem from " << problem_filename << std::endl; + } + + // Populate the graph. + // TODO: use more libvgio stuff somehow. + HashGraph graph; + json_t* graph_json = json_object_get(problem_json, "subgraph"); + if (graph_json && json_is_object(graph_json)) { + json_t* nodes_json = json_object_get(graph_json, "node"); + if (nodes_json && json_is_array(nodes_json)) { + // Go through the node array + for (size_t i = 0; i < json_array_size(nodes_json); i++) { + json_t* node_json = json_array_get(nodes_json, i); + if (node_json && json_is_object(node_json)) { + // Make each node + const char* node_id = nullptr; + const char* sequence = nullptr; + + if (json_unpack_ex(node_json, &json_error, 0, "{s:s, s:s}", "id", &node_id, "sequence", &sequence) == 0) { + // This record is well-formed, so make the node + assert(node_id != nullptr); + assert(sequence != nullptr); + graph.create_handle(sequence, vg::parse(node_id)); + } else { + std::cerr << "warning:[vg chain] Unreadable node object at index " << i << ": " << json_error.text << std::endl; + } + } else { + std::cerr << "warning:[vg chain] No node object at index " << i << std::endl; + } + } + } else { + std::cerr << "warning:[vg chain] No nodes" << std::endl; + } + json_t* edges_json = json_object_get(graph_json, "edge"); + if (edges_json && json_is_array(edges_json)) { + // Go through the edge array + for (size_t i = 0; i < json_array_size(edges_json); i++) { + json_t* edge_json = json_array_get(edges_json, i); + if (edge_json && json_is_object(edge_json)) { + // Decode each edge + // Note that Jansson is C and can't use bool; it's "b" will decode an int. 
+ const char* from_id = nullptr; + int from_start = 0; + const char* to_id = nullptr; + int to_end = 0; + + if (json_unpack_ex(edge_json, &json_error, 0, "{s:s, s?b, s:s, s?b}", "from", &from_id, "from_start", &from_start, "to", &to_id, "to_end", &to_end) == 0) { + // This record is well-formed, so make the edge + assert(from_id != nullptr); + assert(to_id != nullptr); + handle_t from_handle = graph.get_handle(vg::parse(from_id), from_start); + handle_t to_handle = graph.get_handle(vg::parse(to_id), to_end); + graph.create_edge(from_handle, to_handle); + } else { + std::cerr << "warning:[vg chain] Unreadable edge object at index " << i << ": " << json_error.text << std::endl; + } + } else { + std::cerr << "warning:[vg chain] No edge object at index " << i << std::endl; + } + } + } else { + std::cerr << "warning:[vg chain] No edges" << std::endl; + } + } else { + std::cerr << "warning:[vg chain] No graph" << std::endl; + } + if (show_progress) { + std::cerr << "Reconstructed " << graph.get_node_count() << " nodes and " << graph.get_edge_count() << " edges" << std::endl; + } + + if (graph.get_node_count() == 0) { + std::cerr << "error:[vg chain] Cannot build indexes for an empty graph" << std::endl; + exit(1); + } + + // Create the chaining space based on it + IntegratedSnarlFinder snarl_finder(graph); + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder); + if (show_progress) { + std::cerr << "Built distance index" << std::endl; + } + + // Decide how to score alignments + vg::Aligner scorer; + + // Create all the items to chain + std::vector items; + json_t* items_json = json_object_get(problem_json, "items"); + if (items_json && json_is_array(items_json)) { + items.reserve(json_array_size(items_json)); + for (size_t i = 0; i < json_array_size(items_json); i++) { + json_t* item_json = json_array_get(items_json, i); + if (item_json && json_is_object(item_json)) { + // For each chainable item we got, decode it. + // Note that Jansson is C and can't use bool; it's "b" will decode an int. + const char* read_start = nullptr; + const char* read_end = nullptr; + const int score = 31; + json_t* graph_start = nullptr; + const char* graph_start_id = nullptr; + const char* graph_start_offset = "0"; + int graph_start_is_reverse = 0; + json_t* graph_end = nullptr; + const char* graph_end_id = nullptr; + const char* graph_end_offset = "0"; + int graph_end_is_reverse = 0; + if (json_unpack_ex(item_json, &json_error, 0, "{s:s, s:s, s?i, s:o, s:o}", + "read_start", &read_start, + "read_end", &read_end, + "score", &score, + "graph_start", &graph_start, + "graph_end", &graph_end) == 0 && + json_unpack_ex(graph_start, &json_error, 0, "{s:s, s?s, s?b}", + "node_id", &graph_start_id, "offset", &graph_start_offset, "is_reverse", &graph_start_is_reverse) == 0 && + json_unpack_ex(graph_end, &json_error, 0, "{s:s, s?s, s?b}", + "node_id", &graph_end_id, "offset", &graph_end_offset, "is_reverse", &graph_end_is_reverse) == 0) { + + // We have an item record. + assert(read_start != nullptr); + assert(read_end != nullptr); + assert(graph_start_id != nullptr); + assert(graph_end_id != nullptr); + + // We can only handle items where they occupy space on just one node. + assert(strcmp(graph_start_id, graph_end_id) == 0); + + // And we assume the lengths match. 
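 + // Example item record (read coordinates and offsets are strings, same node on both ends):
 + // {"read_start": "10", "read_end": "25", "score": 15,
 + //  "graph_start": {"node_id": "7", "offset": "3"}, "graph_end": {"node_id": "7", "offset": "18"}}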
+ size_t start = vg::parse(read_start); + size_t length = vg::parse(read_end) - start; + + // Pack up into an item + items.emplace_back(start, make_pos_t(vg::parse(graph_start_id), graph_start_is_reverse, vg::parse(graph_start_offset)), length, score); + } else { + std::cerr << "warning:[vg chain] Unreadable item object at index " << i << ": " << json_error.text << std::endl; + } + } else { + std::cerr << "warning:[vg chain] No item object at index " << i << std::endl; + } + } + } else { + std::cerr << "warning:[vg chain] No items" << std::endl; + } + if (show_progress) { + std::cerr << "Reconstructed " << items.size() << " chainable items" << std::endl; + } + + // Now we have parsed the JSON, so throw it out. + json_decref(problem_json); + problem_json = nullptr; + +#ifdef USE_CALLGRIND + // We want to profile the chaining, not the loading. + CALLGRIND_START_INSTRUMENTATION; +#endif + + // Do the chaining. We assume items is already sorted right. + std::pair> score_and_chain = vg::algorithms::find_best_chain(items, distance_index, graph, scorer.gap_open, scorer.gap_extension); + + std::cout << "Best chain gets score " << score_and_chain.first << std::endl; + + return 0; +} + +// Register subcommand +static Subcommand vg_chain("chain", "run a serialized chaining problem", DEVELOPMENT, main_chain); + diff --git a/src/subcommand/chunk_main.cpp b/src/subcommand/chunk_main.cpp index 211f6d49924..817babd7fcb 100644 --- a/src/subcommand/chunk_main.cpp +++ b/src/subcommand/chunk_main.cpp @@ -13,21 +13,28 @@ #include "subcommand.hpp" #include "../vg.hpp" -#include "../stream.hpp" +#include +#include +#include #include "../utility.hpp" #include "../chunker.hpp" -#include "../gam_index.hpp" +#include "../stream_index.hpp" #include "../region.hpp" #include "../haplotype_extracter.hpp" #include "../algorithms/sorted_id_ranges.hpp" +#include "../algorithms/find_gbwt.hpp" +#include +#include "../io/save_handle_graph.hpp" using namespace std; using namespace vg; using namespace vg::subcommand; -using namespace xg; +static string chunk_name(const string& out_chunk_prefix, int i, const Region& region, string ext, int gi = 0, bool components = false); static int split_gam(istream& gam_stream, size_t chunk_size, const string& out_prefix, size_t gam_buffer_size = 100); +static void check_read(const Alignment& aln, const HandleGraph* graph); + void help_chunk(char** argv) { cerr << "usage: " << argv[0] << " chunk [options] > [chunk.vg]" << endl @@ -39,22 +46,26 @@ void help_chunk(char** argv) { << "For a single-range chunk (-p or -r), the graph data is sent to standard output instead of a file." 
<< endl << endl << "options:" << endl - << " -x, --xg-name FILE use this xg index to chunk subgraphs" << endl - << " -G, --gbwt-name FILE use this GBWT haplotype index for haplotype extraction" << endl - << " -a, --gam-name FILE chunk this gam file (not stdin, sorted, with FILE.gai index) instead of the graph" << endl + << " -x, --xg-name FILE use this graph or xg index to chunk subgraphs" << endl + << " -G, --gbwt-name FILE use this GBWT haplotype index for haplotype extraction (for -T)" << endl + << " -a, --gam-name FILE chunk this gam file instead of the graph (multiple allowed)" << endl << " -g, --gam-and-graph when used in combination with -a, both gam and graph will be chunked" << endl << "path chunking:" << endl - << " -p, --path TARGET write the chunk in the specified (0-based inclusive)\n" + << " -p, --path TARGET write the chunk in the specified (0-based inclusive, multiple allowed)\n" << " path range TARGET=path[:pos1[-pos2]] to standard output" << endl << " -P, --path-list FILE write chunks for all path regions in (line - separated file). format" << endl << " for each as in -p (all paths chunked unless otherwise specified)" << endl << " -e, --input-bed FILE write chunks for all (0-based end-exclusive) bed regions" << endl + << " -S, --snarls FILE write given path-range(s) and all snarls fully contained in them, as alternative to -c" << endl << "id range chunking:" << endl << " -r, --node-range N:M write the chunk for the specified node range to standard output\n" << " -R, --node-ranges FILE write the chunk for each node range in (newline or whitespace separated) file" << endl << " -n, --n-chunks N generate this many id-range chunks, which are determined using the xg index" << endl << "simple gam chunking:" << endl << " -m, --gam-split-size N split gam (specified with -a, sort/index not required) up into chunks with at most N reads each" << endl + << "component chunking:" << endl + << " -C, --components create a chunk for each connected component. if a targets given with (-p, -P, -r, -R), limit to components containing them" << endl + << " -M, --path-components create a chunk for each path in the graph's connected component" << endl << "general:" << endl << " -s, --chunk-size N create chunks spanning N bases (or nodes with -r/-R) for all input regions." << endl << " -o, --overlap N overlap between chunks when using -s [0]" << endl @@ -64,8 +75,10 @@ void help_chunk(char** argv) { << " -c, --context-steps N expand the context of the chunk this many node steps [1]" << endl << " -l, --context-length N expand the context of the chunk by this many bp [0]" << endl << " -T, --trace trace haplotype threads in chunks (and only expand forward from input coordinates)." << endl - << " Produces a .annotate.txt file with haplotype frequencies for each chunk." << endl + << " Produces a .annotate.txt file with haplotype frequencies for each chunk." << endl + << " --no-embedded-haplotypes Don't load haplotypes from the graph. It is possible to -T without any haplotypes available." << endl << " -f, --fully-contained only return GAM alignments that are fully contained within chunk" << endl + << " -O, --output-fmt Specify output format (vg, pg, hg, gfa). 
[pg (vg with -T)]" << endl << " -t, --threads N for tasks that can be done in parallel, use this many threads [1]" << endl << " -h, --help" << endl; } @@ -79,9 +92,9 @@ int main_chunk(int argc, char** argv) { string xg_file; string gbwt_file; - string gam_file; + vector gam_files; bool gam_and_graph = false; - string region_string; + vector region_strings; string path_list_file; int chunk_size = 0; int overlap = 0; @@ -93,11 +106,18 @@ int main_chunk(int argc, char** argv) { bool id_range = false; string node_range_string; string node_ranges_file; - int threads = 1; bool trace = false; + bool no_embedded_haplotypes = false; bool fully_contained = false; int n_chunks = 0; size_t gam_split_size = 0; + string output_format = "pg"; + bool output_format_set = false; + bool components = false; + bool path_components = false; + string snarl_filename; + + #define OPT_NO_EMBEDDED_HAPLOTYPES 1000 int c; optind = 2; // force optind past command positional argument @@ -114,22 +134,27 @@ int main_chunk(int argc, char** argv) { {"chunk-size", required_argument, 0, 's'}, {"overlap", required_argument, 0, 'o'}, {"input-bed", required_argument, 0, 'e'}, + {"snarls", required_argument, 0, 'S'}, {"output-bed", required_argument, 0, 'E'}, {"prefix", required_argument, 0, 'b'}, {"context", required_argument, 0, 'c'}, {"id-ranges", no_argument, 0, 'r'}, {"id-range", no_argument, 0, 'R'}, - {"trace", required_argument, 0, 'T'}, + {"trace", no_argument, 0, 'T'}, + {"no-embedded-haplotypes", no_argument, 0, OPT_NO_EMBEDDED_HAPLOTYPES}, {"fully-contained", no_argument, 0, 'f'}, {"threads", required_argument, 0, 't'}, {"n-chunks", required_argument, 0, 'n'}, {"context-length", required_argument, 0, 'l'}, {"gam-split-size", required_argument, 0, 'm'}, + {"components", no_argument, 0, 'C'}, + {"path-components", no_argument, 0, 'M'}, + {"output-fmt", required_argument, 0, 'O'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hx:G:a:gp:P:s:o:e:E:b:c:r:R:Tft:n:l:m:", + c = getopt_long (argc, argv, "hx:G:a:gp:P:s:o:e:S:E:b:c:r:R:Tft:n:l:m:CMO:", long_options, &option_index); @@ -149,7 +174,7 @@ int main_chunk(int argc, char** argv) { break; case 'a': - gam_file = optarg; + gam_files.push_back(optarg); break; case 'g': @@ -157,7 +182,7 @@ int main_chunk(int argc, char** argv) { break; case 'p': - region_string = optarg; + region_strings.push_back(optarg); break; case 'P': @@ -175,7 +200,11 @@ int main_chunk(int argc, char** argv) { case 'e': in_bed_file = optarg; break; - + + case 'S': + snarl_filename = optarg; + break; + case 'E': out_bed_file = optarg; break; @@ -212,16 +241,41 @@ int main_chunk(int argc, char** argv) { gam_split_size = parse(optarg); break; + case 'C': + components = true; + break; + + case 'M': + components = true; + path_components = true; + break; + case 'T': trace = true; break; + + case OPT_NO_EMBEDDED_HAPLOTYPES: + no_embedded_haplotypes = true; + break; case 'f': fully_contained = true; break; case 't': - threads = parse(optarg); + { + int num_threads = parse(optarg); + if (num_threads <= 0) { + cerr << "error:[vg chunk] Thread count (-t) set to " << num_threads << ", must set to a positive integer." << endl; + exit(1); + } + omp_set_num_threads(num_threads); + break; + } + + case 'O': + output_format = optarg; + output_format_set = true; break; case 'h': @@ -235,89 +289,151 @@ int main_chunk(int argc, char** argv) { } } - omp_set_num_threads(threads); - // need at most one of -n, -p, -P, -e, -r, -R, -m as an input - if ((n_chunks == 0 ? 0 : 1) + (region_string.empty() ? 
0 : 1) + (path_list_file.empty() ? 0 : 1) + (in_bed_file.empty() ? 0 : 1) + - (node_ranges_file.empty() ? 0 : 1) + (node_range_string.empty() ? 0 : 1) + (gam_split_size == 0 ? 0 : 1) > 1) { - cerr << "error:[vg chunk] at most one of {-n, -p, -P, -e, -r, -R, m} required to specify input regions" << endl; + if ((n_chunks == 0 ? 0 : 1) + (region_strings.empty() ? 0 : 1) + (path_list_file.empty() ? 0 : 1) + (in_bed_file.empty() ? 0 : 1) + + (node_ranges_file.empty() ? 0 : 1) + (node_range_string.empty() ? 0 : 1) + (gam_split_size == 0 ? 0 : 1) + + (path_components ? 1 : 0) > 1) { + cerr << "error:[vg chunk] at most one of {-n, -p, -P, -e, -r, -R, -m, '-M'} required to specify input regions" << endl; return 1; } // need -a if using -f - if ((gam_split_size != 0 || fully_contained) && gam_file.empty()) { + if ((gam_split_size != 0 || fully_contained) && gam_files.empty()) { cerr << "error:[vg chunk] gam file must be specified with -a when using -f or -m" << endl; return 1; } - // context steps default to 1 if using id_ranges. otherwise, force user to specify to avoid - // misunderstandings - if (context_steps < 0 && gam_split_size == 0) { - if (id_range) { - if (!context_length) { - context_steps = 1; - } - } else { - cerr << "error:[vg chunk] context expansion steps must be specified with -c/--context when chunking on paths" << endl; - return 1; + if (components == true && context_steps >= 0) { + cerr << "error:[vg chunk] context cannot be specified (-c) when splitting into components (-C)" << endl; + return 1; + } + + if (!snarl_filename.empty() && context_steps >= 0) { + cerr << "error:[vg chunk] context cannot be specified (-c) when using snarls (-S)" << endl; + return 1; + } + if (!snarl_filename.empty() && region_strings.empty() && path_list_file.empty() && in_bed_file.empty()) { + cerr << "error:[vg chunk] snarl chunking can only be used with path regions (-p -P -e)" << endl; + return 1; + } + + // check the output format + std::transform(output_format.begin(), output_format.end(), output_format.begin(), ::tolower); + if (!vg::io::valid_output_format(output_format)) { + cerr << "error[vg chunk]: invalid output format" << endl; + return 1; + } + if (trace && output_format != "vg") { + // todo: trace code goes through vg conversion anyway and according to unit tests + // fails when not outputting vg + output_format = "vg"; + if (output_format_set) { + cerr << "warning[vg chunk]: ignoring -O and setting output format to vg, as required by -T" << endl; } + } + else if (output_format == "vg") { + cerr << "warning[vg chunk]: the vg-protobuf format is DEPRECATED. you probably want to use PackedGraph (pg) instead" << endl; + } + string output_ext = output_format == "gfa" ? ".gfa" : ".vg"; // figure out which outputs we want. the graph always // needs to be chunked, even if only gam output is requested, // because we use the graph to get the nodes we're looking for. // but we only write the subgraphs to disk if chunk_graph is true. 
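 + // (So -a alone chunks only the GAM, while -a together with -g also writes the graph chunks.)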
- bool chunk_gam = !gam_file.empty() && gam_split_size == 0; + bool chunk_gam = !gam_files.empty() && gam_split_size == 0; bool chunk_graph = gam_and_graph || (!chunk_gam && gam_split_size == 0); + // Load the snarls + unique_ptr snarl_manager; + if (!snarl_filename.empty()) { + ifstream snarl_file(snarl_filename.c_str()); + if (!snarl_file) { + cerr << "error:[vg chunk] Unable to load snarls file: " << snarl_filename << endl; + return 1; + } + snarl_manager = vg::io::VPKG::load_one(snarl_file); + } + // Load our index - xg::XG xindex; - if (chunk_graph || trace || context_steps > 0 || context_length > 0 || (!id_range && gam_split_size == 0)) { + PathPositionHandleGraph* graph = nullptr; + unique_ptr path_handle_graph; + bdsg::ReferencePathOverlayHelper overlay_helper; + + if (chunk_graph || trace || context_steps > 0 || context_length > 0 || (!id_range && gam_split_size == 0) || components) { if (xg_file.empty()) { - cerr << "error:[vg chunk] xg index (-x) required" << endl; + cerr << "error:[vg chunk] graph or xg index (-x) required" << endl; return 1; } ifstream in(xg_file.c_str()); if (!in) { - cerr << "error:[vg chunk] unable to load xg index file " << xg_file << endl; + cerr << "error:[vg chunk] unable to load graph / xg index file " << xg_file << endl; return 1; } - xindex.load(in); + in.close(); + + path_handle_graph = vg::io::VPKG::load_one(xg_file); + graph = overlay_helper.apply(path_handle_graph.get()); in.close(); } // Now load the haplotype data - unique_ptr gbwt_index; - if (trace && !gbwt_file.empty()) { - // We are tracing haplotypes, and we want to use the GBWT instead of the old gPBWT. - gbwt_index = unique_ptr(new gbwt::GBWT()); + + unique_ptr gbwt_index_holder; + const gbwt::GBWT* gbwt_index = nullptr; + if (trace) { + // We are tracing haplotypes. + // We might want a GBWT. - // Open up the index - ifstream in(gbwt_file.c_str()); - if (!in) { - cerr << "error:[vg chunk] unable to load gbwt index file " << gbwt_file << endl; - return 1; + // TODO: Make the tube map stop calling us in trace mode *without* a GBWT? + + if (!gbwt_file.empty()) { + // A GBWT file is specified, so load that + gbwt_index_holder = vg::io::VPKG::load_one(gbwt_file); + if (gbwt_index_holder.get() == nullptr) { + // Complain if we couldn't get it but were supposed to. + cerr << "error:[vg::chunk] unable to load gbwt index file " << gbwt_file << endl; + exit(1); + } + gbwt_index = gbwt_index_holder.get(); } - - // And load it - gbwt_index->load(in); + + if (!gbwt_index && !no_embedded_haplotypes) { + // We didn't get a GBWT from a file, and we are allowed to use the one in the graph, if any. + gbwt_index = vg::algorithms::find_gbwt(path_handle_graph.get()); + } + + // It's OK if gbwt_index is still null here! } - // We need an index on the GAM to chunk it - unique_ptr gam_index; - if (chunk_gam) { - get_input_file(gam_file + ".gai", [&](istream& index_stream) { - gam_index = unique_ptr(new GAMIndex()); - gam_index->load(index_stream); - }); + // We need an index on the GAM to chunk it (if we're not doing components) + vector> gam_indexes; + if (chunk_gam && !components) { + for (auto gam_file : gam_files) { + try { + get_input_file(gam_file + ".gai", [&](istream& index_stream) { + gam_indexes.push_back(unique_ptr(new GAMIndex())); + gam_indexes.back()->load(index_stream); + }); + } catch (...) 
{ + cerr << "error:[vg chunk] unable to load GAM index file: " << gam_file << ".gai" << endl + << " note: a GAM index is required when *not* chunking by components with -C or -M" << endl; + exit(1); + } + } } + // If we're chunking on components with a GAM, this map will be used + // (instead of an index) + unordered_map node_to_component; // parse the regions into a list vector regions; - if (!region_string.empty()) { - Region region; - parse_region(region_string, region); - regions.push_back(region); + if (!region_strings.empty()) { + for (auto& region_string : region_strings) { + Region region; + parse_region(region_string, region); + regions.push_back(region); + } } else if (!path_list_file.empty()) { ifstream pr_stream(path_list_file.c_str()); @@ -340,17 +456,33 @@ int main_chunk(int argc, char** argv) { } else if (id_range) { if (n_chunks) { - // determine the ranges from the xg index itself + // Determine the ranges from the source graph itself. // how many nodes per range? - int nodes_per_chunk = xindex.node_count / n_chunks; + size_t node_count = graph->get_node_count(); + size_t nodes_per_chunk = node_count / n_chunks; + + // We need to articulate our chunks in terms of ID ranges, but we + // have no guarantee that the graph we pull from will be in ID + // order. An XG probably ought to be in topological order anyway. + // So we pull all the IDs and sort them in a big vector in order to + // get the chunk ID breakpoints. + vector rank_to_id(node_count + 1); size_t i = 1; + graph->for_each_handle([&](handle_t handle) { + rank_to_id[i++] = graph->get_id(handle); + }); + + // Sort so we can find the nth ID easily + std::sort(rank_to_id.begin(), rank_to_id.end()); + + i = 1; // iterate through the node ranks to build the regions - while (i < xindex.node_count) { + while (i < node_count) { // make a range from i to i+nodeS_per_range - vg::id_t a = xindex.rank_to_id(i); + vg::id_t a = rank_to_id[i]; size_t j = i + nodes_per_chunk; - if (j > xindex.node_count) j = xindex.node_count; - vg::id_t b = xindex.rank_to_id(j); + if (j > node_count) j = node_count; + vg::id_t b = rank_to_id[j]; Region region; region.start = a; region.end = b; @@ -387,31 +519,56 @@ int main_chunk(int argc, char** argv) { delete range_stream; } } - else { + else if (graph != nullptr && (!components || path_components)) { // every path - size_t max_rank = xindex.max_path_rank(); - for (size_t rank = 1; rank <= max_rank; ++rank) { - Region region; - region.seq = xindex.path_name(rank); - regions.push_back(region); + graph->for_each_path_handle([&](path_handle_t path_handle) { + Region region; + region.seq = graph->get_path_name(path_handle); + if (!Paths::is_alt(region.seq)) { + regions.push_back(region); + } + }); + } + + if (context_steps >= 0 && regions.empty()) { + cerr << "error:[vg chunk] extracting context (-c) requires a region to take context around" << endl; + return 1; + } + + // context steps default to 1 if using id_ranges. 
otherwise, force user to specify to avoid + // misunderstandings + if (context_steps < 0 && gam_split_size == 0) { + if (id_range) { + if (!context_length) { + context_steps = 1; + } + } else if (!components && snarl_filename.empty()){ + cerr << "error:[vg chunk] context (-c) or snarls (-S) must be specified when chunking on paths" << endl; + return 1; } } // validate and fill in sizes for regions that span entire path + function get_path_length = [&](const string& path_name) { + size_t path_length = 0; + for (handle_t handle : graph->scan_path(graph->get_path_handle(path_name))) { + path_length += graph->get_length(handle); + } + return path_length; + }; if (!id_range) { for (auto& region : regions) { - size_t rank = xindex.path_rank(region.seq); - if (rank == 0) { + if (!graph->has_path(region.seq)) { cerr << "error[vg chunk]: input path " << region.seq << " not found in xg index" << endl; return 1; } region.start = max((int64_t)0, region.start); - if (region.end == -1) { - region.end = xindex.path_length(rank); - } else if (!id_range) { - if (region.start < 0 || region.end >= xindex.path_length(rank)) { + if (region.end == -1) { + region.end = get_path_length(region.seq) - 1; + } else if (!id_range && !components) { + if (region.start < 0 || region.end >= get_path_length(region.seq)) { cerr << "error[vg chunk]: input region " << region.seq << ":" << region.start << "-" << region.end - << " is out of bounds of path " << region.seq << " which has length "<< xindex.path_length(rank) + << " is out of bounds of path " << region.seq << " which has length "<< get_path_length(region.seq) << endl; return -1; } @@ -437,30 +594,38 @@ int main_chunk(int argc, char** argv) { swap(regions, chunked_regions); } - // now ready to get our chunk on + // when using -C for components, regions will be derived from the connected components + vector> component_ids; + if (components == true && regions.empty()) { + // no regions given, we find our components from scratch and make some dummy regions + component_ids = handlealgs::weakly_connected_components(graph); + for (int i = 0; i < component_ids.size(); ++i) { + Region region; + region.seq = ""; + region.start = 0; + region.end = 0; + regions.push_back(region); + } + } + // now ready to get our chunk on if (gam_split_size != 0) { - ifstream gam_stream; - // Open the GAM file, whether splitting directly or seeking with an index - gam_stream.open(gam_file); - if (!gam_stream) { - cerr << "error[vg chunk]: unable to open input gam: " << gam_file << endl; - return 1; + for (size_t gi = 0; gi < gam_files.size(); ++gi) { + ifstream gam_stream; + string& gam_file = gam_files[gi]; + // Open the GAM file, whether splitting directly or seeking with an index + gam_stream.open(gam_file); + if (!gam_stream) { + cerr << "error[vg chunk]: unable to open input gam: " << gam_file << endl; + return 1; + } + // just chunk up every N reads in the gam without any path or id logic. Don't do anything else. + string prefix = gi == 0 ? out_chunk_prefix : out_chunk_prefix + std::to_string(gi); + split_gam(gam_stream, gam_split_size, prefix); } - // just chunk up every N reads in the gam without any path or id logic. Don't do anything else. - return split_gam(gam_stream, gam_split_size, out_chunk_prefix); + return 0; } - // what's the name of chunk i? - function chunk_name = - [&out_chunk_prefix](int i, const Region& region, string ext) -> string { - stringstream chunk_name; - string seq = region.seq.empty() ? 
"ids" : region.seq; - chunk_name << out_chunk_prefix << "_" << i << "_" << seq << "_" - << region.start << "_" << region.end << ext; - return chunk_name.str(); - }; - int num_regions = regions.size(); // because we are expanding context, and not cutting nodes, our output @@ -469,30 +634,36 @@ int main_chunk(int argc, char** argv) { vector output_regions(num_regions); // initialize chunkers + size_t threads = get_thread_count(); vector chunkers(threads); for (auto& chunker : chunkers) { - chunker.xg = &xindex; + chunker.graph = graph; } // When chunking GAMs, every thread gets its own cursor to seek into the input GAM. - list gam_streams; - vector cursors; + // Todo: when operating on multiple gams, we make |threads| X |gams| cursors, even though + // we only ever use |threads| threads. + vector> gam_streams_vec(gam_files.size()); + vector> cursors_vec(gam_files.size()); if (chunk_gam) { - cursors.reserve(threads); - for (size_t i = 0; i < threads; i++) { - // Open a stream for every thread - gam_streams.emplace_back(gam_file); - if (!gam_streams.back()) { - cerr << "error[vg chunk]: unable to open GAM file " << gam_file << endl; - return 1; + for (size_t gam_i = 0; gam_i < gam_streams_vec.size(); ++gam_i) { + auto& gam_file = gam_files[gam_i]; + auto& gam_streams = gam_streams_vec[gam_i]; + auto& cursors = cursors_vec[gam_i]; + cursors.reserve(threads); + for (size_t i = 0; i < threads; i++) { + // Open a stream for every thread + gam_streams.emplace_back(gam_file); + if (!gam_streams.back()) { + cerr << "error[vg chunk]: unable to open GAM file " << gam_file << endl; + return 1; + } + // And wrap it in a cursor + cursors.emplace_back(gam_streams.back()); } - - // And wrap it in a cursor - cursors.emplace_back(gam_streams.back()); } } - // extract chunks in parallel #pragma omp parallel for @@ -500,18 +671,31 @@ int main_chunk(int argc, char** argv) { int tid = omp_get_thread_num(); Region& region = regions[i]; PathChunker& chunker = chunkers[tid]; - VG* subgraph = NULL; + unique_ptr subgraph; map trace_thread_frequencies; - if (id_range == false) { - subgraph = new VG(); - chunker.extract_subgraph(region, context_steps, context_length, - trace, *subgraph, output_regions[i]); + if (!component_ids.empty()) { + subgraph = vg::io::new_output_graph(output_format); + chunker.extract_component(component_ids[i], *subgraph, false); + output_regions[i] = region; + } + else if (id_range == false) { + subgraph = vg::io::new_output_graph(output_format); + if (components == true) { + chunker.extract_path_component(region.seq, *subgraph, output_regions[i]); + } else if (snarl_manager.get() != nullptr) { + chunker.extract_snarls(region, *snarl_manager, *subgraph); + output_regions[i] = region; + } else { + chunker.extract_subgraph(region, context_steps, context_length, + trace, *subgraph, output_regions[i]); + } } else { if (chunk_graph || context_steps > 0) { - subgraph = new VG(); + subgraph = vg::io::new_output_graph(output_format); output_regions[i].seq = region.seq; chunker.extract_id_range(region.start, region.end, - context_steps, context_length, trace, + components ? 
numeric_limits::max() : context_steps, + context_length, trace && !components, *subgraph, output_regions[i]); } else { // in this case, there's no need to actually build the subgraph, so we don't @@ -521,31 +705,56 @@ int main_chunk(int argc, char** argv) { } // optionally trace our haplotypes - if (trace && subgraph) { + if (trace && subgraph && gbwt_index) { int64_t trace_start; - int64_t trace_end; + int64_t trace_steps = 0; if (id_range) { trace_start = output_regions[i].start; - trace_end = output_regions[i].end; + trace_steps = output_regions[i].end - trace_start; } else { - trace_start = xindex.node_at_path_position(output_regions[i].seq, - output_regions[i].start); - trace_end = xindex.node_at_path_position(output_regions[i].seq, - output_regions[i].end); + path_handle_t path_handle = graph->get_path_handle(output_regions[i].seq); + step_handle_t trace_start_step = graph->get_step_at_position(path_handle, output_regions[i].start); + step_handle_t trace_end_step = graph->get_step_at_position(path_handle, output_regions[i].end); + // make sure we don't loop forever in next loop + if (output_regions[i].start > output_regions[i].end) { + swap(trace_start_step, trace_end_step); + } + trace_start = graph->get_id(graph->get_handle_of_step(trace_start_step)); + for (; trace_start_step != trace_end_step; trace_start_step = graph->get_next_step(trace_start_step)) { + ++trace_steps; + } + // haplotype_extender is forward only. until it's made bidirectional, try to + // detect backward paths and trace them backwards. this will not cover all possible cases though. + if (graph->get_is_reverse(graph->get_handle_of_step(trace_start_step)) && + graph->get_is_reverse(graph->get_handle_of_step(trace_end_step))) { + trace_start = graph->get_id(graph->get_handle_of_step(trace_end_step)); + } } - int64_t trace_steps = trace_end - trace_start; Graph g; - trace_haplotypes_and_paths(xindex, gbwt_index.get(), trace_start, trace_steps, + trace_haplotypes_and_paths(*graph, *gbwt_index, trace_start, trace_steps, g, trace_thread_frequencies, false); - subgraph->paths.for_each([&trace_thread_frequencies](const Path& path) { - trace_thread_frequencies[path.name()] = 1;}); - subgraph->extend(g); + subgraph->for_each_path_handle([&trace_thread_frequencies, &subgraph](path_handle_t path_handle) { + trace_thread_frequencies[subgraph->get_path_name(path_handle)] = 1;}); + VG* vg_subgraph = dynamic_cast(subgraph.get()); + if (vg_subgraph != nullptr) { + // our graph is in vg format, just extend it + vg_subgraph->extend(g); + } else { + // our graph is not in vg format. covert it, extend it, convert it back + // this can eventually be avoided by handlifying the haplotype tracer + VG vg; + handlealgs::copy_path_handle_graph(subgraph.get(), &vg); + subgraph.reset(); + vg.extend(g); + subgraph = vg::io::new_output_graph(output_format); + handlealgs::copy_path_handle_graph(&vg, subgraph.get()); + } } ofstream out_file; ostream* out_stream = NULL; if (chunk_graph) { - if ((!region_string.empty() || !node_range_string.empty()) && + if ((!region_strings.empty() || !node_range_string.empty()) && (regions.size() == 1) && chunk_size == 0) { // If we are going to output only one chunk, it should go to // stdout instead of to a file on disk @@ -553,7 +762,7 @@ int main_chunk(int argc, char** argv) { } else { // Otherwise, we write files under the specified prefix, using // a prefix-i-seq-start-end convention. 
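 + // (e.g. "<prefix>_0_chr20_1000000_2000000.vg"; with -C/-M the name is instead
 + // "<prefix>_<i>" or "<prefix>_<path name>" plus the extension.)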
- string name = chunk_name(i, output_regions[i], ".vg"); + string name = chunk_name(out_chunk_prefix, i, output_regions[i], output_ext, 0, components); out_file.open(name); if (!out_file) { cerr << "error[vg chunk]: can't open output chunk file " << name << endl; @@ -561,40 +770,63 @@ int main_chunk(int argc, char** argv) { } out_stream = &out_file; } - - subgraph->serialize_to_ostream(*out_stream); + + assert(subgraph); + vg::io::save_handle_graph(subgraph.get(), *out_stream); } // optional gam chunking if (chunk_gam) { - assert(gam_index.get() != nullptr); - GAMIndex::cursor_t& cursor = cursors[tid]; + if (!components) { + // old way: use the gam index + for (size_t gi = 0; gi < gam_indexes.size(); ++gi) { + auto& gam_index = gam_indexes[gi]; + assert(gam_index.get() != nullptr); + GAMIndex::cursor_t& cursor = cursors_vec[gi][tid]; - string gam_name = chunk_name(i, output_regions[i], ".gam"); - ofstream out_gam_file(gam_name); - if (!out_gam_file) { - cerr << "error[vg chunk]: can't open output gam file " << gam_name << endl; - exit(1); - } + string gam_name = chunk_name(out_chunk_prefix, i, output_regions[i], ".gam", gi, components); + ofstream out_gam_file(gam_name); + if (!out_gam_file) { + cerr << "error[vg chunk]: can't open output gam file " << gam_name << endl; + exit(1); + } - // Work out the ID ranges to look up - vector> region_id_ranges; - if (subgraph != NULL) { - // Use the regions from the graph - region_id_ranges = vg::algorithms::sorted_id_ranges(subgraph); + // Work out the ID ranges to look up + vector> region_id_ranges; + if (subgraph) { + // Use the regions from the graph + region_id_ranges = vg::algorithms::sorted_id_ranges(subgraph.get()); + } else { + // Use the region we were asked for + region_id_ranges = {{region.start, region.end}}; + } + + auto emit = vg::io::emit_to(out_gam_file); + + auto handle_read = [&](const Alignment& aln) { + check_read(aln, graph); + emit(aln); + }; + + gam_index->find(cursor, region_id_ranges, handle_read, fully_contained); + } } else { - // Use the region we were asked for - region_id_ranges = {{region.start, region.end}}; +#pragma omp critical (node_to_component) + { + // we're doing components, just use stl map, which we update here + subgraph->for_each_handle([&](handle_t sg_handle) { + // note, if components overlap, this is arbitrary. up to user to only use + // path components if they are disjoint + node_to_component[subgraph->get_id(sg_handle)] = i; + }); + } } - - gam_index->find(cursor, region_id_ranges, stream::emit_to(out_gam_file), fully_contained); } - // trace annotations if (trace) { // Even if we have only one chunk, the trace annotation data always // ends up in a file. - string annot_name = chunk_name(i, output_regions[i], ".annotate.txt"); + string annot_name = chunk_name(out_chunk_prefix, i, output_regions[i], ".annotate.txt", 0, components); ofstream out_annot_file(annot_name); if (!out_annot_file) { cerr << "error[vg chunk]: can't open output trace annotation file " << annot_name << endl; @@ -604,8 +836,6 @@ int main_chunk(int argc, char** argv) { out_annot_file << tf.first << "\t" << tf.second << endl; } } - - delete subgraph; } // write a bed file if asked giving a more explicit linking of chunks to files @@ -618,13 +848,80 @@ int main_chunk(int argc, char** argv) { const Region& oregion = output_regions[i]; string seq = id_range ? "ids" : oregion.seq; obed << seq << "\t" << oregion.start << "\t" << (oregion.end + 1) - << "\t" << chunk_name(i, oregion, chunk_gam ? 
".gam" : ".vg"); + << "\t" << chunk_name(out_chunk_prefix, i, oregion, chunk_gam ? ".gam" : output_ext, 0, components); if (trace) { - obed << "\t" << chunk_name(i, oregion, ".annotate.txt"); + obed << "\t" << chunk_name(out_chunk_prefix, i, oregion, ".annotate.txt", 0, components); } obed << "\n"; } } + + // write out component gams + if (chunk_gam && components) { + + // buffer size of each component, total across threads + static const size_t output_buffer_total_size = 100000; + // split the buffer into threads + size_t output_buffer_size = max((size_t)1, output_buffer_total_size / threads); + // number of thread_buffers + size_t num_buffers = num_regions * threads; + + vector> output_buffers(num_buffers); + vector append_buffer(num_regions, false); + + // protect our output buffers + std::mutex* output_buffer_locks = new std::mutex[num_regions]; + + // We may have too many components to keep a buffer open for each one. So we open them as-needed only when flushing. + function flush_gam_buffer = [&](size_t buffer_idx) { + size_t comp_number = buffer_idx / threads; + string gam_name = chunk_name(out_chunk_prefix, comp_number, output_regions[comp_number], ".gam", 0, components); + { + std::lock_guard guard(output_buffer_locks[comp_number]); + ofstream out_gam_file(gam_name, append_buffer[comp_number] ? std::ios_base::app : std::ios_base::out); + if (!out_gam_file) { + cerr << "error[vg chunk]: can't open output gam file " << gam_name << endl; + exit(1); + } + vg::io::write_buffered(out_gam_file, output_buffers[buffer_idx], output_buffers[buffer_idx].size()); + append_buffer[comp_number] = true; + } + output_buffers[buffer_idx].clear(); + }; + + function chunk_gam_callback = [&](Alignment& aln) { + check_read(aln, graph); + + // we're going to lose unmapped reads right here + if (aln.path().mapping_size() > 0) { + nid_t aln_node_id = aln.path().mapping(0).position().node_id(); + unordered_map::iterator comp_it = node_to_component.find(aln_node_id); + if (comp_it != node_to_component.end()) { + int32_t aln_component = comp_it->second; + size_t buffer_idx = aln_component * threads + omp_get_thread_num(); + output_buffers[buffer_idx].push_back(aln); + if (output_buffers[buffer_idx].size() >= output_buffer_size) { + flush_gam_buffer(buffer_idx); + } + } + } + }; + + for (auto gam_file : gam_files) { + get_input_file(gam_file, [&](istream& gam_stream) { + vg::io::for_each_parallel(gam_stream, chunk_gam_callback); + }); + } +#pragma omp parallel for + for (size_t buffer_idx = 0; buffer_idx < num_buffers; ++buffer_idx) { + if (!output_buffers[buffer_idx].empty()) { + flush_gam_buffer(buffer_idx); + + } + } + + delete [] output_buffer_locks; + } return 0; } @@ -632,30 +929,190 @@ int main_chunk(int argc, char** argv) { // Register subcommand static Subcommand vg_chunk("chunk", "split graph or alignment into chunks", main_chunk); +// Output name of a chunk +string chunk_name(const string& out_chunk_prefix, int i, const Region& region, string ext, int gi, bool components) { + stringstream chunk_name; + string seq = region.seq.empty() ? 
"ids" : region.seq; + chunk_name << out_chunk_prefix; + if (gi > 0) { + chunk_name << "-" << gi; + } + if (!components) { + chunk_name << "_" << i << "_" << seq << "_" << region.start << "_" << region.end; + } else if (region.seq.empty()) { + chunk_name << "_" << i; + } else { + chunk_name << "_" << region.seq; + } + chunk_name << ext; + return chunk_name.str(); +} + // Split out every chunk_size reads into a different file int split_gam(istream& gam_stream, size_t chunk_size, const string& out_prefix, size_t gam_buffer_size) { ofstream out_file; size_t count = 0; - vector gam_buffer; - // Todo: try parallel stream. The only snag is that we'd have to either know - // a-priori if it's interleaved or not, or else make a new stream function that handles - // the last element instead of throwing error (very trivial as for_each_parallel_impl supports this) - stream::for_each(gam_stream, [&](Alignment& alignment) { - if (count++ % chunk_size == 0) { - if (out_file.is_open()) { - out_file.close(); + + // We're going to skip parsing the GAM reads and just treat these as type-tagged opaque messages. + vg::io::MessageIterator gam_iterator(gam_stream); + + // We're going to do multithreaded output of batches of this size. + // If our reads are paired, these had better be even. + // We want to use a reasonably substantial size here instead of + // gam_buffer_size which is pretty small, so we don't spin too much on the + // OMP task machinery. + size_t batch_size = std::min((size_t) 1000, chunk_size); + + // We use this to merge together compressed data from multiple threads. + unique_ptr gam_multiplexer; + + // We need to know how many threads there will be. + size_t thread_count = get_thread_count(); + + // Each thread needs a place to keep a MessageEmitter + vector> emitters(thread_count); + + // We fill in batch buffers in the main thread and give them away to tasks to write. + vector* batch_in_progress = nullptr; + + #pragma omp parallel shared(gam_multiplexer, emitters) + { + #pragma omp single + { + while (gam_iterator.has_current()) { + // There's a read message to process. + if (count++ % chunk_size == 0) { + // We're at read 0, or the first read past the end of a chunk. + + // Wait for all tasks + #pragma omp taskwait + + for (size_t i = 0; i < thread_count; i++) { + // Flush everything tasks have written into the multiplexer + if (emitters[i]) { + emitters[i]->flush(); + emitters[i].reset(); + gam_multiplexer->register_breakpoint(i); + } + } + + if (out_file.is_open()) { + // Destroy the old multiplexer to flush + gam_multiplexer.reset(); + // And close the file out. + out_file.close(); + } + stringstream out_name; + out_name << out_prefix << setfill('0') <(); + batch_in_progress->reserve(batch_size); + } + + + // Grab the message, paired with its tag, and advance. + // TODO: Stop copying tag? + batch_in_progress->emplace_back(std::move(gam_iterator.take())); + if (batch_in_progress->back().first.empty()) { + // This is untagged data; assume it's GAM. + batch_in_progress->back().first = "GAM"; + } + if (!batch_in_progress->back().second) { + // This is just a tag alone; throw this away. + batch_in_progress->pop_back(); + count--; + } + + if (batch_in_progress->size() == batch_size || count % chunk_size == 0 || !gam_iterator.has_current()) { + // We've hit the batch size, or we've hit the last read that fits in this chunk, or we've hit the end of the input. 
+ // Launch a task to deal with this batch + #pragma omp task firstprivate(batch_in_progress) + { + // Get our thread + size_t thread = omp_get_thread_num(); + + // Find our GAM emitter + auto& emitter_ptr = emitters[thread]; + if (!emitter_ptr) { + // Make it exist if it doesn't yet. Make sure to compress and to pass along the buffer size. + emitter_ptr.reset(new vg::io::MessageEmitter(gam_multiplexer->get_thread_stream(thread), true, gam_buffer_size)); + } + + for (auto& message : *batch_in_progress) { + // Send each message over to the emitter, with the tag, moving the message body + emitter_ptr->write(message.first, std::move(*message.second)); + } + + // Throw out our assigned batch copy. + delete batch_in_progress; + + if (gam_multiplexer->want_breakpoint(thread)) { + // The multiplexer wants our data. + // Flush and create a breakpoint. + emitter_ptr->flush(); + gam_multiplexer->register_breakpoint(thread); + // TODO: No EOF marker we could remove??? + } + } + + + // Task frees the batch, so null out our pointer to it. + batch_in_progress = nullptr; } } - gam_buffer.push_back(alignment); - stream::write_buffered(out_file, gam_buffer, gam_buffer_size); - }); + + // Wait for the final tasks. + #pragma omp taskwait + + for (size_t i = 0; i < thread_count; i++) { + // Flush everything tasks have written into the multiplexer + if (emitters[i]) { + emitters[i]->flush(); + emitters[i].reset(); + gam_multiplexer->register_breakpoint(i); + } + } + + // Get rid of the multiplexer to flush + gam_multiplexer.reset(); + + // There will be no final batch, because when we hit EOF we launch a task + assert(batch_in_progress == nullptr); + } + } + + if (out_file.is_open()) { + // Close out the file. + out_file.close(); + } return 0; } +/// Stop and print an error if the graph exists and the read does not appear to +/// actually be aligned against the graph. +static void check_read(const Alignment& aln, const HandleGraph* graph) { + if (!graph) { + return; + } + // Make sure the nodes it visits could be the nodes in the graph. + AlignmentValidity validity = alignment_is_valid(aln, graph); + if (!validity) { + #pragma omp critical (cerr) + { + std::cerr << "error:[vg chunk] Alignment " << aln.name() << " cannot be interpreted against this graph: " << validity.message << std::endl; + std::cerr << "Make sure that you are using the same graph that the reads were mapped to!" 
<< std::endl; + } + exit(1); + } +} + diff --git a/src/subcommand/circularize_main.cpp b/src/subcommand/circularize_main.cpp index 94f86f52d85..04b91929e9c 100644 --- a/src/subcommand/circularize_main.cpp +++ b/src/subcommand/circularize_main.cpp @@ -12,7 +12,11 @@ #include "subcommand.hpp" +#include "../utility.hpp" +#include "../handle.hpp" #include "../vg.hpp" +#include +#include using namespace std; using namespace vg; @@ -117,42 +121,49 @@ int main_circularize(int argc, char** argv){ paths_to_circularize.push_back(path); } + // TODO: if we settle on a uniform serialzation method that covers the VG class, the code is ready to be switched VG* graph; get_input_file(optind, argc, argv, [&](istream& in) { graph = new VG(in); }); // Check if paths are in graph: - for (string p : paths_to_circularize){ - bool paths_in_graph = true; - if (!graph->paths.has_path(p)){ + for (const string& p : paths_to_circularize){ + if (!graph->has_path(p)){ cerr << "ERROR: PATH NOT IN GRAPH - " << p << endl; - paths_in_graph = false; - } - - if (!paths_in_graph){ exit(1); } - } if (describe){ - for (auto& p : graph->paths._paths){ - cout << p.first << endl; - } + graph->for_each_path_handle([&](const path_handle_t& path_handle) { + cout << graph->get_path_name(path_handle) << endl; + }); exit(0); } if (head > 0 && tail > head){ - graph->circularize(head, tail); + graph->create_edge(graph->get_handle(tail), graph->get_handle(head)); } else{ - graph->circularize(paths_to_circularize); + for (const auto& path_name : paths_to_circularize) { + path_handle_t path = graph->get_path_handle(path_name); + if (graph->get_step_count(path) > 0) { + graph->create_edge(graph->get_handle_of_step(graph->path_back(path)), + graph->get_handle_of_step(graph->path_begin(path))); + } + graph->set_circularity(path, true); + } } - - graph->serialize_to_ostream(std::cout); - delete graph; - + + graph->serialize_to_ostream(cout); +// SerializableHandleGraph* to_serialize = dynamic_cast(&(*graph)); +// if (!to_serialize) { +// cerr << "error: graph format is not serializable!" << endl; +// return 1; +// } +// to_serialize->serialize(std::cout); + return 0; } diff --git a/src/subcommand/clip_main.cpp b/src/subcommand/clip_main.cpp new file mode 100644 index 00000000000..ddbd5956862 --- /dev/null +++ b/src/subcommand/clip_main.cpp @@ -0,0 +1,383 @@ +#include "subcommand.hpp" +#include "../vg.hpp" +#include "../utility.hpp" +#include "../integrated_snarl_finder.hpp" +#include "../io/save_handle_graph.hpp" +#include +#include +#include +#include "../clip.hpp" +#include + +#include +#include + +using namespace vg; +using namespace vg::subcommand; +using namespace vg::io; + +void help_clip(char** argv) { + cerr << "usage: " << argv[0] << " [options] " << endl + << "Chop out variation within path intervals of a vg graph" << endl + << endl + << "input options: " << endl + << " -b, --bed FILE BED regions corresponding to path intervals of the graph to target" << endl + << " -r, --snarls FILE Snarls from vg snarls (recomputed if not given unless -d and -P used)." 
<< endl + << "depth clipping options: " << endl + << " -d, --depth N Clip out nodes and edges with path depth below N" << endl + << "stub clipping options:" << endl + << " -s, --stubs Clip out all stubs (nodes with degree-0 sides that aren't on reference)" << endl + << " -S, --stubbify-paths Clip out all edges necessary to ensure selected reference paths have exactly two stubs" << endl + << "snarl complexity clipping options: [default mode]" << endl + << " -n, --max-nodes N Only clip out snarls with > N nodes" << endl + << " -e, --max-edges N Only clip out snarls with > N edges" << endl + << " -N --max-nodes-shallow N Only clip out snarls with > N nodes not including nested snarls" << endl + << " -E --max-edges-shallow N Only clip out snarls with > N edges not including nested snarls" << endl + << " -a, --max-avg-degree N Only clip out snarls with average degree > N" << endl + << " -l, --max-reflen-prop F Ignore snarls whose reference traversal spans more than F (0<=F<=1) of the whole reference path" << endl + << " -L, --max-reflen N Ignore snarls whose reference traversal spans more than N bp" << endl + << "big deletion edge clipping options:" << endl + << " -D, --max-deletion-edge N Clip out all edges whose endpoints have distance > N on a reference path" << endl + << " -c, --context N Search up to at most N steps from reference paths for candidate deletion edges [1]" << endl + << "general options: " << endl + << " -P, --path-prefix STRING Do not clip out alleles on paths beginning with given prefix (such references must be specified either with -P or -b). Multiple allowed" << endl + << " -m, --min-fragment-len N Don't write novel path fragment if it is less than N bp long" << endl + << " -B, --output-bed Write BED-style file of affected intervals instead of clipped graph. 
" << endl + << " Columns 4-9 are: snarl node-count edge-count shallow-node-count shallow-edge-count avg-degree" << endl + << " -t, --threads N number of threads to use [default: all available]" << endl + << " -v, --verbose Print some logging messages" << endl + << endl; +} + +int main_clip(int argc, char** argv) { + + string bed_path; + string snarls_path; + vector ref_prefixes; + int64_t min_depth = -1; + int64_t min_fragment_len = 0; + bool verbose = false; + bool depth_clipping = false; + bool stub_clipping = false; + bool stubbify_reference = false; + + size_t max_nodes = 0; + size_t max_edges = 0; + size_t max_nodes_shallow = 0; + size_t max_edges_shallow = 0; + double max_avg_degree = 0.; + double max_reflen_prop = numeric_limits::max(); + size_t max_reflen = numeric_limits::max(); + bool out_bed = false; + bool snarl_option = false; + + int64_t max_deletion = -1; + int64_t context_steps = -1; + + if (argc == 2) { + help_clip(argv); + return 1; + } + + int c; + optind = 2; // force optind past command positional argument + while (true) { + static struct option long_options[] = + { + {"help", no_argument, 0, 'h'}, + {"bed", required_argument, 0, 'b'}, + {"depth", required_argument, 0, 'd'}, + {"stubs", no_argument, 0, 's'}, + {"stubbify-paths", no_argument, 0, 'S'}, + {"max-nodes", required_argument, 0, 'n'}, + {"max-edges", required_argument, 0, 'e'}, + {"max-nodes-shallow", required_argument, 0, 'N'}, + {"max-edges-shallow", required_argument, 0, 'E'}, + {"max-avg-degree", required_argument, 0, 'a'}, + {"max-reflen-prop", required_argument, 0, 'l'}, + {"max-reflen", required_argument, 0, 'L'}, + {"max-deletion", required_argument, 0, 'D'}, + {"context", required_argument, 0, 'c'}, + {"path-prefix", required_argument, 0, 'P'}, + {"snarls", required_argument, 0, 'r'}, + {"min-fragment-len", required_argument, 0, 'm'}, + {"output-bed", no_argument, 0, 'B'}, + {"threads", required_argument, 0, 't'}, + {"verbose", required_argument, 0, 'v'}, + {0, 0, 0, 0} + + }; + int option_index = 0; + c = getopt_long (argc, argv, "hb:d:sSn:e:N:E:a:l:L:D:c:P:r:m:Bt:v", + long_options, &option_index); + + // Detect the end of the options. + if (c == -1) + break; + + switch (c) + { + + case '?': + case 'h': + help_clip(argv); + return 0; + case 'b': + bed_path = optarg; + break; + case 'd': + min_depth = parse(optarg); + break; + case 's': + stub_clipping = true; + break; + case 'S': + stubbify_reference = true; + break; + case 'n': + max_nodes = parse(optarg); + snarl_option = true; + break; + case 'e': + max_edges = parse(optarg); + snarl_option = true; + break; + case 'N': + max_nodes_shallow = parse(optarg); + snarl_option = true; + break; + case 'E': + max_edges_shallow = parse(optarg); + snarl_option = true; + break; + case 'a': + max_avg_degree = parse(optarg); + snarl_option = true; + break; + case 'l': + max_reflen_prop = parse(optarg); + snarl_option = true; + break; + case 'L': + max_reflen = parse(optarg); + snarl_option = true; + break; + case 'D': + max_deletion = parse(optarg); + break; + case 'c': + context_steps = parse(optarg); + break; + case 'P': + ref_prefixes.push_back(optarg); + break; + case 'r': + snarls_path = optarg; + break; + case 'm': + min_fragment_len = parse(optarg); + break; + case 'B': + out_bed = true; + break; + case 'v': + verbose = true; + break; + case 't': + { + int num_threads = parse(optarg); + if (num_threads <= 0) { + cerr << "error:[vg clip] Thread count (-t) set to " << num_threads << ", must set to a positive integer." 
<< endl; + exit(1); + } + omp_set_num_threads(num_threads); + break; + } + default: + abort(); + } + } + + if (bed_path.empty() == ref_prefixes.empty()) { + cerr << "error:[vg-clip] Reference intervals must be specified with one of -b or -P" << endl; + return 1; + } + + if ((min_depth >= 0 || max_deletion >= 0 || stub_clipping || stubbify_reference) && (snarl_option || out_bed)) { + cerr << "error:[vg-clip] bed output (-B) and snarl complexity options (-n, -e, -N, -E, -a, -l, -L) cannot be used with -d, -D, -s or -S" << endl; + return 1; + } + + // to do: I think it could be a good idea to combine these options + if (min_depth >= 0 && max_deletion >= 0) { + cerr << "error:[vg-clip] -d cannot (yet?) be used with -D" << endl; + return 1; + } + + // ditto about combining + if ((stub_clipping || stubbify_reference) && (min_depth >= 0 || max_deletion >= 0)) { + cerr << "error:[vg-clip] -s and -S cannot (yet?) be used with -d or -D" << endl; + return 1; + } + + if (context_steps >= 0 && max_deletion < 0) { + cerr << "error:[vg-clip] -c can only be used with -D" << endl; + return 1; + } + + if (stubbify_reference && ref_prefixes.empty()) { + cerr << "error:[vg-clip] -S can only be used with -P" << endl; + return 1; + } + + // default to same + if (max_deletion > 0 && context_steps < 0) { + context_steps = max_deletion; + } + + // load the graph + string graph_path = get_input_file_name(optind, argc, argv); + unique_ptr graph = vg::io::VPKG::load_one(graph_path); + + // optional overlay only needed with bed regions input + bdsg::PathPositionOverlayHelper overlay_helper; + PathPositionHandleGraph* pp_graph = nullptr; + + unique_ptr snarl_manager; + vector bed_regions; + + // need the path positions unless we're doing depth, deletion or stub clipping without regions + bool need_pp = !(bed_path.empty() && (min_depth >= 0 || max_deletion >= 0 || stub_clipping)); + + // need snarls if input regions are provided, or doing snarl based clipping + bool need_snarls = !bed_path.empty() || (min_depth < 0 && max_deletion < 0 && !stub_clipping); + + if (need_pp) { + pp_graph = overlay_helper.apply(graph.get()); + if (verbose) { + cerr << "[vg clip]: Computed path position overlay of input graph" << endl; + } + } + + if (need_snarls) { + // Load or compute the snarls which are required for targetting bed regions + if (!snarls_path.empty()) { + ifstream snarl_file(snarls_path.c_str()); + if (!snarl_file) { + cerr << "Error [vg clip]: Unable to load snarls file: " << snarls_path << endl; + return 1; + } + snarl_manager = vg::io::VPKG::load_one(snarl_file); + if (verbose) { + cerr << "[vg clip]: Loaded " << snarl_manager->num_snarls() << " snarls" << endl; + } + } else { + IntegratedSnarlFinder finder(*graph); + snarl_manager = unique_ptr(new SnarlManager(std::move(finder.find_snarls_parallel()))); + if (verbose) { + cerr << "[vg clip]: Computed " << snarl_manager->num_snarls() << " snarls" << endl; + } + } + + // load the bed file + if (!bed_path.empty()) { + parse_bed_regions(bed_path, bed_regions); + if (verbose) { + cerr << "[vg clip]: Loaded " << bed_regions.size() << " BED regions" << endl; + } + // contig names left in this set are *not* in the graph + unordered_set contig_set; + for (const Region& region : bed_regions) { + contig_set.insert(region.seq); + } + graph->for_each_path_handle([&] (path_handle_t path_handle) { + string base_name = Paths::strip_subrange(graph->get_path_name(path_handle)); + if (contig_set.count(base_name)) { + // todo: should take into account coordinate comp + 
contig_set.erase(base_name); + } + }); + vector bed_regions_in_graph; + for (const Region& region : bed_regions) { + if (!contig_set.count(region.seq)) { + bed_regions_in_graph.push_back(region); + } + } + if (bed_regions_in_graph.size() != bed_regions.size()) { + if (verbose) { + cerr << "[vg clip]: Dropped " << (bed_regions.size() - bed_regions_in_graph.size()) << " BED regions whose sequence names do not correspond to paths in the graph" << endl; + } + if (bed_regions_in_graph.empty()) { + cerr << "warning:[vg-clip] No BED region found that lies on path in graph (use vg paths -Lv to list paths that are in the graph)" << endl; + } + } + swap(bed_regions, bed_regions_in_graph); + } else { + assert(need_pp); + assert(!ref_prefixes.empty()); + // load the bed regions from the reference path prefix + pp_graph->for_each_path_handle([&](path_handle_t path_handle) { + string path_name = pp_graph->get_path_name(path_handle); + subrange_t subrange; + path_name = Paths::strip_subrange(path_name, &subrange); + int64_t offset = subrange == PathMetadata::NO_SUBRANGE ? 0 : subrange.first; + for (const string& ref_prefix : ref_prefixes) { + if (path_name.compare(0, ref_prefix.length(), ref_prefix) == 0) { + Region region = {path_name, offset, offset + (int64_t)pp_graph->get_path_length(path_handle) - 1}; + bed_regions.push_back(region); + break; + } + } + }); + if (verbose) { + cerr << "[vg clip]: Inferred " << bed_regions.size() << " BED regions from paths in the graph" << endl; + } + } + } + + if (min_depth >= 0) { + // run the depth clipping + if (bed_path.empty()) { + // do the whole graph + clip_low_depth_nodes_and_edges(graph.get(), min_depth, ref_prefixes, min_fragment_len, verbose); + } else { + // do the contained snarls + clip_contained_low_depth_nodes_and_edges(graph.get(), pp_graph, bed_regions, *snarl_manager, false, min_depth, min_fragment_len, verbose); + } + + } else if (max_deletion >= 0) { + // run the deletion edge clipping on the whole graph + clip_deletion_edges(graph.get(), max_deletion, context_steps, ref_prefixes, min_fragment_len, verbose); + } else if (stub_clipping || stubbify_reference) { + // run the stub clipping + if (bed_path.empty()) { + // do the whole graph + if (stubbify_reference) { + // important that this is done first, as it can actually create non-reference stubs that'd need removal below + stubbify_ref_paths(graph.get(), ref_prefixes, min_fragment_len, verbose); + } + if (stub_clipping) { + clip_stubs(graph.get(), ref_prefixes, min_fragment_len, verbose); + } + } else { + assert(stub_clipping && !stubbify_reference); + // do the contained snarls + clip_contained_stubs(graph.get(), pp_graph, bed_regions, *snarl_manager, false, min_fragment_len, verbose); + } + }else { + // run the alt-allele clipping + clip_contained_snarls(graph.get(), pp_graph, bed_regions, *snarl_manager, false, min_fragment_len, + max_nodes, max_edges, max_nodes_shallow, max_edges_shallow, max_avg_degree, max_reflen_prop, max_reflen, out_bed, verbose); + } + + // write the graph + if (!out_bed) { + vg::io::save_handle_graph(graph.get(), std::cout); + } + + return 0; +} + + +// Register subcommand +static Subcommand vg_clip("clip", "remove BED regions (other other nodes from their snarls) from a graph", main_clip); diff --git a/src/subcommand/cluster_main.cpp b/src/subcommand/cluster_main.cpp new file mode 100644 index 00000000000..266a624e622 --- /dev/null +++ b/src/subcommand/cluster_main.cpp @@ -0,0 +1,405 @@ +/** + * \file cluster_main.cpp: experimental snarl clustering test harness + */ 
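 +// A sketch of the flow (see main_cluster below): load the graph plus either a
 +// GCSA2/LCP pair or a minimizer index along with a distance index, turn each
 +// read's MEM or minimizer hits into seed positions, and feed them to
 +// SnarlDistanceIndexClusterer.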
+ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "subcommand.hpp" + +#include "../snarl_seed_clusterer.hpp" +#include "../mapper.hpp" +#include "../annotation.hpp" +#include "../xg.hpp" +#include +#include +#include + +#include +#include + +//#define USE_CALLGRIND + +#ifdef USE_CALLGRIND +#include +#endif + +using namespace std; +using namespace vg; +using namespace vg::subcommand; + +void help_cluster(char** argv) { + cerr + << "usage: " << argv[0] << " cluster [options] input.gam > output.gam" << endl + << "Find and cluster mapping seeds." << endl + << endl + << "basic options:" << endl + << " -x, --xg-name FILE use this xg index or graph (required)" << endl + << " -g, --gcsa-name FILE use this GCSA2/LCP index pair (both FILE and FILE.lcp)" << endl + << " -m, --minimizer-name FILE use this minimizer index" << endl + << " -d, --dist-name FILE cluster using this distance index (required)" << endl + << " -c, --hit-cap INT ignore minimizers with more than this many locations [10]" << endl + << "computational parameters:" << endl + << " -t, --threads INT number of compute threads to use" << endl; +} + +int main_cluster(int argc, char** argv) { + + if (argc == 2) { + help_cluster(argv); + return 1; + } + + // initialize parameters with their default options + string xg_name; + string gcsa_name; + string minimizer_name; + string distance_name; + // How close should two hits be to be in the same cluster? + size_t distance_limit = 1000; + size_t hit_cap = 10; + + int c; + optind = 2; // force optind past command positional argument + while (true) { + static struct option long_options[] = + { + {"help", no_argument, 0, 'h'}, + {"xg-name", required_argument, 0, 'x'}, + {"gcsa-name", required_argument, 0, 'g'}, + {"minimizer-name", required_argument, 0, 'm'}, + {"dist-name", required_argument, 0, 'd'}, + {"hit-cap", required_argument, 0, 'c'}, + {"threads", required_argument, 0, 't'}, + {0, 0, 0, 0} + }; + + int option_index = 0; + c = getopt_long (argc, argv, "hx:g:m:d:c:t:", + long_options, &option_index); + + + // Detect the end of the options. + if (c == -1) + break; + + switch (c) + { + case 'x': + xg_name = optarg; + if (xg_name.empty()) { + cerr << "error:[vg cluster] Must provide XG file with -x." << endl; + exit(1); + } + break; + + case 'g': + gcsa_name = optarg; + if (gcsa_name.empty()) { + cerr << "error:[vg cluster] Must provide GCSA file with -g." << endl; + exit(1); + } + break; + + case 'm': + minimizer_name = optarg; + if (minimizer_name.empty()) { + cerr << "error:[vg cluster] Must provide minimizer file with -m." << endl; + exit(1); + } + break; + + case 'd': + distance_name = optarg; + if (distance_name.empty()) { + cerr << "error:[vg cluster] Must provide distance index file with -d." << endl; + exit(1); + } + break; + + case 'c': + hit_cap = parse(optarg); + break; + + case 't': + { + int num_threads = parse(optarg); + if (num_threads <= 0) { + cerr << "error:[vg cluster] Thread count (-t) set to " << num_threads << ", must set to a positive integer." 
<< endl; + exit(1); + } + omp_set_num_threads(num_threads); + } + break; + + case 'h': + case '?': + default: + help_cluster(argv); + exit(1); + break; + } + } + + + if (xg_name.empty()) { + cerr << "error:[vg cluster] Finding clusters requires an XG index, must provide XG file (-x)" << endl; + exit(1); + } + + if (gcsa_name.empty() && minimizer_name.empty()) { + cerr << "error:[vg cluster] Finding clusters requires a GCSA2 index or minimizer index (-g, -m)" << endl; + exit(1); + } + + + if (distance_name.empty()) { + cerr << "error:[vg cluster] Finding clusters requires a distance index, must provide distance index file (-d)" << endl; + exit(1); + } + + // create in-memory objects + unique_ptr path_handle_graph = vg::io::VPKG::load_one(xg_name); + bdsg::PathPositionOverlayHelper overlay_helper; + PathPositionHandleGraph* xg_index = overlay_helper.apply(path_handle_graph.get()); + unique_ptr gcsa_index; + unique_ptr lcp_index; + if (!gcsa_name.empty()) { + gcsa_index = vg::io::VPKG::load_one(gcsa_name); + lcp_index = vg::io::VPKG::load_one(gcsa_name + ".lcp"); + } + unique_ptr minimizer_index; + if (!minimizer_name.empty()) { + minimizer_index = vg::io::VPKG::load_one(minimizer_name); + } + unique_ptr distance_index = vg::io::VPKG::load_one(distance_name); + + // Make the clusterer + SnarlDistanceIndexClusterer clusterer(*distance_index); + + // Make a Mapper to look up MEM seeds + unique_ptr mapper; + if (gcsa_index) { + // We will find MEMs using a Mapper + mapper = make_unique(xg_index, gcsa_index.get(), lcp_index.get()); + } + // Otherwise we will find minimizers using the minimizer_index + + get_input_file(optind, argc, argv, [&](istream& in) { + // Open up the input GAM + + // Make the output emitter + vg::io::ProtobufEmitter emitter(cout); + +#ifdef USE_CALLGRIND + // We want to profile the clustering and the code around it. + CALLGRIND_START_INSTRUMENTATION; +#endif + + vg::io::for_each_parallel(in, [&](Alignment& aln) { + // For each input alignment + + // We will find all the seed hits + vector seeds; + + // If working with MEMs, this will hold all the MEMs + vector mems; + // If working with minimizers, this will hold all the minimizers in the query + vector minimizers; + // And either way this will map from seed to MEM or minimizer that generated it + vector seed_to_source; + + if (mapper) { + // Find MEMs + double lcp_avg, fraction_filtered; + mems = mapper->find_mems_deep(aln.sequence().begin(), aln.sequence().end(), lcp_avg, fraction_filtered); + + // Convert to position seeds + for (size_t i = 0; i < mems.size(); i++) { + auto& mem = mems[i]; + for (gcsa::node_type n : mem.nodes) { + // Convert from GCSA node_type packing to a pos_t + seeds.push_back(make_pos_t(n)); + // And remember which MEM the seed came from. + seed_to_source.push_back(i); + } + } + } else { + // Find minimizers + assert(minimizer_index); + + // Find minimizers in the query + minimizers = minimizer_index->minimizers(aln.sequence()); + + for (size_t i = 0; i < minimizers.size(); i++) { + // For each minimizer + if (hit_cap != 0 && minimizer_index->count(minimizers[i]) <= hit_cap) { + // The minimizer is infrequent enough to be informative, so feed it into clustering + + // Locate it in the graph. We do not have to reverse the hits for a + // reverse minimizers, as the clusterer only cares about node ids. 
+ auto hits = minimizer_index->find(minimizers[i]); + for (auto hit = hits.first; hit != hits.first + hits.second; ++hit) { + // For each position, remember it and what minimizer it came from + seeds.push_back(hit->position.decode()); + seed_to_source.push_back(i); + } + } + } + + } + vector seed_clusters; + for (pos_t pos : seeds) { + seed_clusters.emplace_back(); + seed_clusters.back().pos = pos; + } + + + // Cluster the seeds. Get sets of input seed indexes that go together. + // Make sure to time it. + std::chrono::time_point start = std::chrono::system_clock::now(); + vector clusters = clusterer.cluster_seeds(seed_clusters, distance_limit); + std::chrono::time_point end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end-start; + + // Compute the covered portion of the read represented by each cluster + vector read_coverage_by_cluster; + for (auto& cluster : clusters) { + // We set bits in here to true when query anchors cover them + vector covered(aln.sequence().size()); + // We use this to convert iterators to indexes + auto start = aln.sequence().begin(); + + for (auto hit_index : cluster.seeds) { + // For each hit in the cluster, work out what anchor sequence it is from. + size_t source_index = seed_to_source.at(hit_index); + + if (mapper) { + // Using MEMs + for (size_t i = (mems[source_index].begin - start); i < (mems[source_index].end - start); i++) { + // Set all the bits in read space for that MEM + covered[i] = true; + } + } else { + // Using minimizers + // The offset of a reverse minimizer is the endpoint of the kmer + size_t start_offset = minimizers[source_index].offset; + if (minimizers[source_index].is_reverse) { + start_offset = start_offset + 1 - minimizer_index->k(); + } + for (size_t i = start_offset; i < start_offset + minimizer_index->k(); i++) { + // Set all the bits in read space for that minimizer. + // Each minimizr is a length-k exact match starting at a position + covered[i] = true; + } + } + } + + // Count up the covered positions + size_t covered_count = 0; + for (auto bit : covered) { + covered_count += bit; + } + + // Turn that into a fraction + read_coverage_by_cluster.push_back(covered_count / (double) covered.size()); + } + + // Make a vector of cluster indexes to sort + vector cluster_indexes_in_order; + for (size_t i = 0; i < clusters.size(); i++) { + cluster_indexes_in_order.push_back(i); + } + + // Put the most covering cluster's index first + std::sort(cluster_indexes_in_order.begin(), cluster_indexes_in_order.end(), [&](const size_t& a, const size_t& b) -> bool { + // Return true if a must come before b, and false otherwise + return read_coverage_by_cluster.at(a) > read_coverage_by_cluster.at(b); + }); + + // Find the seeds in the clusters tied for best. 
+ vector best; + if (!clusters.empty()) { + // How much does the best cluster cover + double best_coverage = read_coverage_by_cluster.at(cluster_indexes_in_order.front()); + for (size_t i = 0; i < cluster_indexes_in_order.size() && + read_coverage_by_cluster.at(cluster_indexes_in_order[i]) >= best_coverage; i++) { + + // For each cluster covering that much or more of the read + for (auto seed_index : clusters.at(cluster_indexes_in_order[i]).seeds) { + // For each seed in those clusters + + // Mark that seed as being part of the best cluster(s) + best.push_back(seeds.at(seed_index)); + } + + } + + } + + // Decide if they are in the right place for the original alignment or not + unordered_set true_nodes; + for (auto& mapping : aln.path().mapping()) { + true_nodes.insert(mapping.position().node_id()); + } + // We are in the right place if we share any nodes + bool have_overlap = false; + for (auto& pos : best) { + if (true_nodes.count(get_id(pos))) { + // The cluster had a position on a node that the real alignment had. + have_overlap = true; + } + } + + // We also want to know if we overlap any non-filtered hit + bool have_hit_overlap = false; + for (auto& pos : seeds) { + if (true_nodes.count(get_id(pos))) { + // The hit set had a position on a node that the real alignment had. + have_hit_overlap = true; + } + } + + // And we need a vector of cluster sizes + vector cluster_sizes; + cluster_sizes.reserve(clusters.size()); + for (auto& cluster : clusters) { + cluster_sizes.push_back((double)cluster.seeds.size()); + } + + // Tag the alignment with cluster accuracy + set_annotation(aln, "best_cluster_overlap", have_overlap); + // And with any-hit overlap + set_annotation(aln, "any_seed_overlap", have_hit_overlap); + // And with cluster time + set_annotation(aln, "cluster_seconds", elapsed_seconds.count()); + // And with hit count clustered + set_annotation(aln, "seed_count", (double)seeds.size()); + // And with cluster count returned + set_annotation(aln, "cluster_count", (double)clusters.size()); + // And with size of each cluster + set_annotation(aln, "cluster_sizes", cluster_sizes); + // And with the coverage of the read in the best cluster + set_annotation(aln, "best_cluster_coverage", clusters.empty() ? 0.0 : + read_coverage_by_cluster.at(cluster_indexes_in_order.front())); + + + // TODO: parallelize this + #pragma omp critical (cout) + emitter.write(std::move(aln)); + }); + }); + + return 0; +} + +// Register subcommand +static Subcommand vg_cluster("cluster", "find and cluster mapping seeds", DEVELOPMENT, main_cluster); + + diff --git a/src/subcommand/combine_main.cpp b/src/subcommand/combine_main.cpp new file mode 100644 index 00000000000..723414887b9 --- /dev/null +++ b/src/subcommand/combine_main.cpp @@ -0,0 +1,231 @@ +/** \file combine_main.cpp + * + * Defines the "vg combine" subcommand + */ + + +#include +#include +#include + +#include + +#include +#include +#include + +#include "subcommand.hpp" + +#include "../handle.hpp" +#include "../vg.hpp" +#include "../io/save_handle_graph.hpp" + +using namespace std; +using namespace vg; +using namespace vg::subcommand; + +void help_combine(char** argv) { + cerr << "usage: " << argv[0] << " combine [options] [graph2.vg ...] >merged.vg" << endl + << "Combines one or more graphs into a single file, regardless of input format." << endl + << "Node IDs will be modified as needed to resolve conflicts (in same manner as vg ids -j)." 
<< endl + << endl + << "Options:" << endl + << " -c, --cat-proto Merge graphs by converting each to Protobuf (if not already) and catting the results." + << " Node IDs not modified [DEPRECATED]" << endl + << " -p, --connect-paths Add edges necessary to connect paths with the same name present in different graphs." << endl + << " ex: If path x is present in graphs N-1 and N, then an edge connecting the last node of x in N-1 " << endl + << " and the first node of x in N will be added." << endl; +} + +static int cat_proto_graphs(int argc, char** argv); + +int main_combine(int argc, char** argv) { + + if (argc == 2) { + help_combine(argv); + return 1; + } + + bool connect_paths = false; + bool cat_proto = false; + + int c; + optind = 2; // force optind past command positional argument + while (true) { + static struct option long_options[] = + { + {"help", no_argument, 0, 'h'}, + {"connect-paths", no_argument, 0, 'p'}, + {"cat-proto", no_argument, 0, 'c'}, + {0, 0, 0, 0} + }; + + int option_index = 0; + c = getopt_long (argc, argv, "hpc", + long_options, &option_index); + + // Detect the end of the options. + if (c == -1) + break; + + switch (c) + { + case 'p': + connect_paths = true; + break; + case 'c': + cat_proto = true; + break; + case 'h': + case '?': + help_combine(argv); + exit(1); + break; + + default: + abort (); + } + } + + if (cat_proto) { + if (connect_paths) + cerr << "warning [vg combine]: --cat-proto/-c option is deprecated and will be removed in a future version of vg." << endl; + return cat_proto_graphs(argc, argv); + } + + unique_ptr first_graph; + string first_graph_filename = get_input_file_name(optind, argc, argv); + first_graph = vg::io::VPKG::load_one(first_graph_filename); + int64_t max_node_id = first_graph->max_node_id(); + + while (optind < argc) { + + unique_ptr graph; + string graph_filename = get_input_file_name(optind, argc, argv); + graph = vg::io::VPKG::load_one(graph_filename); + + // join the id spaces if necessary + int64_t delta = max_node_id - graph->min_node_id(); + if (delta >= 0) { + graph->increment_node_ids(delta + 1); + } + max_node_id = graph->max_node_id(); + + if (connect_paths) { + handlealgs::append_path_handle_graph(graph.get(), first_graph.get(), true); + } else { + graph->for_each_path_handle([&](path_handle_t path_handle) { + string path_name = graph->get_path_name(path_handle); + if (first_graph->has_path(path_name)) { + cerr << "error [vg combine]: Paths with name \"" << path_name << "\" found in multiple input graphs. If they are consecutive subpath ranges, they can be connected by using the -p option." << endl; + exit(1); + } + }); + handlealgs::copy_path_handle_graph(graph.get(), first_graph.get()); + } + } + + // Serialize the graph using VPKG. + vg::io::save_handle_graph(first_graph.get(), cout); + + return 0; +} + +// Register subcommand +static Subcommand vg_combine("combine", "merge multiple graph files together", main_combine); + + +// This is the original vg combine logic, which itself mimics using "cat" to join up protobuf files +// Since it relies on the Protobuf format itself, particularly the ability to stream together chunks that +// would otherwise be invalid individually, it is probably never going to be ported to the handle graph +// api, which is why it's been relegated to the deprecated bin +int cat_proto_graphs(int argc, char** argv) { + + while (optind < argc) { + get_input_file(optind, argc, argv, [&](istream& in) { + // We're producing output in uncompressed, "VG"-type-tagged, VPKG Protobuf format.
+ // We will check if this file is uncompressed or compressed VG-type-tagged data. + + if (vg::io::BlockedGzipInputStream::SmellsLikeGzip(in)) { + // It is compressed. + + // Save our start position + auto start = in.tellg(); + + { + // Try decompressing. + vg::io::BlockedGzipInputStream decompressed(in); + if (decompressed.IsBGZF() && vg::io::MessageIterator::sniff_tag(decompressed) == "VG") { + // We have Blocked GZIP which we can potentially just forward. + // It looks like compressed VG Protobuf data. + + // Decompress it all to stdout, using the ZeroCopyInputStream API. + char* buffer = nullptr; + int bytes = 0; + while (cout && decompressed.Next((const void**) &buffer, &bytes)) { + // Each time we get bytes, write them to stdout. + cout.write(buffer, bytes); + } + + if (!cout) { + cerr << "error [vg combine]: Could not write decompressed data to output stream." << endl; + exit(1); + } + + // Do the next input file + return; + } + } + + // We may have hit EOF. + in.clear(); + + // If we get here, it wasn't compressed VG Protobuf. + // So we need to go back to the start of the file, since the decompressor read some. + in.seekg(start); + + } else if (vg::io::MessageIterator::sniff_tag(in) == "VG") { + // It isn't compressed, but it looks like uncompressed VG Protobuf. + // Send the uncompressed data to stdout. + cout << in.rdbuf(); + + if (!cout) { + cerr << "error [vg combine]: Could not write raw data to output stream." << endl; + exit(1); + } + + // Do the next input file + return; + } + + // If we get here, it isn't compressed or uncompressed VG protobuf. + // Read it as a PathHandleGraph + unique_ptr graph = vg::io::VPKG::load_one(in); + + // Convert to vg::VG + VG* vg_graph = dynamic_cast(graph.get()); + if (vg_graph == nullptr) { + vg_graph = new vg::VG(); + handlealgs::copy_path_handle_graph(graph.get(), vg_graph); + // Give the unique_ptr ownership and delete the graph we loaded. + graph.reset(vg_graph); + // Make sure the paths are all synced up + vg_graph->paths.to_graph(vg_graph->graph); + } + + { + // Save to stdout, uncompressed + vg::io::ProtobufEmitter emitter(cout, false); + vg_graph->serialize_to_emitter(emitter); + // Make sure the emitter goes away and writes before we check on the stream. + } + + if (!cout) { + cerr << "error [vg combine]: Could not write converted graph to output stream." 
<< endl; + exit(1); + } + + }); + } + return 0; +} diff --git a/src/subcommand/compare_main.cpp b/src/subcommand/compare_main.cpp deleted file mode 100644 index 849d3cbe42d..00000000000 --- a/src/subcommand/compare_main.cpp +++ /dev/null @@ -1,148 +0,0 @@ -/** \file compare_main.cpp - * - * Defines the "vg compare" subcommand - */ - - -#include -#include -#include - -#include - -#include "subcommand.hpp" - -#include "../vg.hpp" -#include "../index.hpp" - -using namespace std; -using namespace vg; -using namespace vg::subcommand; - -void help_compare(char** argv) { - cerr << "usage: " << argv[0] << " compare [options] graph1 graph2" << endl - << "Compare kmer sets of two graphs" << endl - << endl - << "options:" << endl - << " -d, --db-name1 FILE use this db for graph1 (defaults to .index/)" << endl - << " -e, --db-name2 FILE use this db for graph2 (defaults to .index/)" << endl - << " -t, --threads N number of threads to use" << endl; -} - -int main_compare(int argc, char** argv) { - - if (argc <= 3) { - help_compare(argv); - return 1; - } - - string db_name1; - string db_name2; - int num_threads = 1; - - int c; - optind = 2; // force optind past command positional argument - while (true) { - static struct option long_options[] = - { - {"help", no_argument, 0, 'h'}, - {"db-name1", required_argument, 0, 'd'}, - {"db-name2", required_argument, 0, 'e'}, - {"threads", required_argument, 0, 't'}, - {0, 0, 0, 0} - }; - - int option_index = 0; - c = getopt_long (argc, argv, "hd:e:t:", - long_options, &option_index); - - // Detect the end of the options. - if (c == -1) - break; - - switch (c) - { - - case 'd': - db_name1 = optarg; - break; - - case 'e': - db_name2 = optarg; - break; - - case 't': - num_threads = parse(optarg); - break; - - case 'h': - case '?': - help_compare(argv); - exit(1); - break; - - default: - abort (); - } - } - - omp_set_num_threads(num_threads); - - if (db_name1.empty()) { - db_name1 = get_input_file_name(optind, argc, argv); - } - if (db_name2.empty()) { - db_name2 = get_input_file_name(optind, argc, argv); - } - - // Note: only supporting rocksdb index for now. - - Index index1; - index1.open_read_only(db_name1); - - Index index2; - index2.open_read_only(db_name2); - - pair index1_vs_index2; - pair index2_vs_index1; - - // Index::compare is not parallel, but at least we can do the - // two directions at the same time... 
-#pragma omp parallel sections - { -#pragma omp section - { - index1_vs_index2 = index1.compare_kmers(index2); - } -#pragma omp section - { - index2_vs_index1 = index2.compare_kmers(index1); - } - } - {// <-- for emacs - assert(index1_vs_index2.first == index2_vs_index1.first); - - int64_t db1_count = index1_vs_index2.first + index1_vs_index2.second; - int64_t db2_count = index2_vs_index1.first + index2_vs_index1.second; - int64_t db1_only = index1_vs_index2.second; - int64_t db2_only = index2_vs_index1.second; - int64_t db1_and_db2 = index1_vs_index2.first; - int64_t db1_or_db2 = db1_only + db2_only + db1_and_db2; - - cout << "{\n" - << "\"db1_path\": " << "\"" << db_name1 << "\"" << ",\n" - << "\"db2_path\": " << "\"" << db_name2 << "\"" << ",\n" - << "\"db1_total\": " << db1_count << ",\n" - << "\"db2_total\": " << db2_count << ",\n" - << "\"db1_only\": " << db1_only << ",\n" - << "\"db2_only\": " << db2_only << ",\n" - << "\"intersection\": " << db1_and_db2 << ",\n" - << "\"union\": " << db1_or_db2 << "\n" - << "}" << endl; - } - return 0; -} - -// Register subcommand -static Subcommand vg_compare("compare", "compare the kmer space of two graphs", main_compare); - diff --git a/src/subcommand/concat_main.cpp b/src/subcommand/concat_main.cpp index b14588acc60..bf12821e931 100644 --- a/src/subcommand/concat_main.cpp +++ b/src/subcommand/concat_main.cpp @@ -11,8 +11,13 @@ #include #include "subcommand.hpp" - +#include "../option.hpp" +#include "../xg.hpp" #include "../vg.hpp" +#include +#include +#include "../io/save_handle_graph.hpp" +#include using namespace std; using namespace vg; @@ -20,9 +25,13 @@ using namespace vg::subcommand; void help_concat(char** argv) { cerr << "usage: " << argv[0] << " concat [options] [graph2.vg ...] >merged.vg" << endl - << "Concatenates graphs in order by adding edges from the tail nodes of the" << endl - << "predecessor to the head nodes of the following graph. Node IDs are" << endl - << "compacted, so care should be taken if consistent IDs are required." << endl; + << "Concatenates graphs in order by adding edges from the tail nodes of the" << endl + << "predecessor to the head nodes of the following graph. If node ID spaces overlap " + << "between graphs, they will be resolved (as in vg ids -j)" << endl + << endl + << "Options:" << endl + << " -p, --only-join-paths Only add edges necessary to join up appended paths (as opposed to between all heads/tails)" << endl + << endl; } int main_concat(int argc, char** argv) { @@ -32,17 +41,20 @@ int main_concat(int argc, char** argv) { return 1; } + bool only_join_paths = false; + int c; optind = 2; // force optind past command positional argument while (true) { static struct option long_options[] = { {"help", no_argument, 0, 'h'}, + {"only-join-paths", no_argument, 0, 'p'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "h", + c = getopt_long (argc, argv, "hp", long_options, &option_index); // Detect the end of the options.
@@ -51,38 +63,48 @@ int main_concat(int argc, char** argv) { switch (c) { - case 'h': - case '?': - help_concat(argv); - exit(1); - break; - - default: - abort (); + case 'p': + only_join_paths = true; + break; + case 'h': + case '?': + help_concat(argv); + exit(1); + break; + default: + abort (); } } - list graphs; + unique_ptr first_graph; + get_input_file(optind, argc, argv, [&](istream& in) { + first_graph = vg::io::VPKG::load_one(in); + }); + int64_t max_node_id = first_graph->max_node_id(); while (optind < argc) { - VG* graph; + + unique_ptr graph; get_input_file(optind, argc, argv, [&](istream& in) { - graph = new VG(in); - }); - graphs.push_back(graph); - } + graph = vg::io::VPKG::load_one(in); + }); + + // join the id spaces if necessary + int64_t delta = max_node_id - graph->min_node_id(); + if (delta >= 0) { + graph->increment_node_ids(delta + 1); + } + max_node_id = graph->max_node_id(); - VG merged; - for (list::iterator g = graphs.begin(); g != graphs.end(); ++g) { - merged.append(**g); + handlealgs::append_path_handle_graph(graph.get(), first_graph.get(), only_join_paths); } - // output - merged.serialize_to_ostream(std::cout); + // Serialize the graph using VPKG. + vg::io::save_handle_graph(first_graph.get(), cout); return 0; } // Register subcommand -static Subcommand vg_concat("concat", "concatenate graphs tail-to-head", main_concat); +static Subcommand vg_concat("concat", "concatenate graphs tail-to-head", DEPRECATED, main_concat); diff --git a/src/subcommand/construct_main.cpp b/src/subcommand/construct_main.cpp index fd256be6252..153aa2a2cb3 100644 --- a/src/subcommand/construct_main.cpp +++ b/src/subcommand/construct_main.cpp @@ -7,11 +7,13 @@ #include "subcommand.hpp" -#include "../stream.hpp" +#include #include "../constructor.hpp" #include "../msa_converter.hpp" #include "../region.hpp" +#include + using namespace std; using namespace vg; using namespace vg::subcommand; @@ -22,10 +24,10 @@ void help_construct(char** argv) { << "construct from a reference and variant calls:" << endl << " -r, --reference FILE input FASTA reference (may repeat)" << endl << " -v, --vcf FILE input VCF (may repeat)" << endl - << " -n, --rename V=F rename contig V in the VCFs to contig F in the FASTAs (may repeat)" << endl + << " -n, --rename V=F match contig V in the VCFs to contig F in the FASTAs (may repeat)" << endl << " -a, --alt-paths save paths for alts of variants by variant ID" << endl - << " -R, --region REGION specify a particular chromosome or 1-based inclusive region" << endl - << " -C, --region-is-chrom don't attempt to parse the region (use when the reference" << endl + << " -R, --region REGION specify a VCF contig name or 1-based inclusive region (may repeat, if on different contigs)" << endl + << " -C, --region-is-chrom don't attempt to parse the regions (use when the reference" << endl << " sequence name could be inadvertently parsed as a region)" << endl << " -z, --region-size N variants per region to parallelize (default: 1024)" << endl << " -t, --threads N use N threads to construct graph (defaults to numCPUs)" << endl @@ -33,12 +35,15 @@ void help_construct(char** argv) { << " -I, --insertions FILE a FASTA file containing insertion sequences "<< endl << " (referred to in VCF) to add to graph." 
<< endl << " -f, --flat-alts N don't chop up alternate alleles from input VCF" << endl + << " -l, --parse-max N don't chop up alternate alleles from input VCF longer than N (default: 100)" << endl + << " -i, --no-trim-indels don't remove the 1bp reference base from alt alleles of indels." << endl + << " -N, --in-memory construct the entire graph in memory before outputting it." < fasta_filenames; vector vcf_filenames; vector insertion_filenames; - string region; + vector regions; bool region_is_chrom = false; string msa_filename; - int max_node_size = 1000; + int max_node_size = 32; bool keep_paths = true; + bool construct_in_memory = false; string msa_format = "fasta"; bool show_progress = false; @@ -88,13 +94,16 @@ int main_construct(int argc, char** argv) { {"threads", required_argument, 0, 't'}, {"region", required_argument, 0, 'R'}, {"region-is-chrom", no_argument, 0, 'C'}, - {"node-max", required_argument, 0, 'm'},\ + {"node-max", required_argument, 0, 'm'}, {"flat-alts", no_argument, 0, 'f'}, + {"parse-max", required_argument, 0, 'l'}, + {"no-trim-indels", no_argument, 0, 'i'}, + {"in-memory", no_argument, 0, 'N'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "v:r:n:ph?z:t:R:m:as:CfSI:M:dF:", + c = getopt_long (argc, argv, "v:r:n:ph?z:t:R:m:aCfl:SI:M:dF:iN", long_options, &option_index); /* Detect the end of the options. */ @@ -119,6 +128,14 @@ int main_construct(int argc, char** argv) { keep_paths = false; break; + case 'i': + constructor.trim_indels = false; + break; + + case 'N': + construct_in_memory = true; + break; + case 'r': fasta_filenames.push_back(optarg); break; @@ -162,7 +179,7 @@ int main_construct(int argc, char** argv) { break; case 'R': - region = optarg; + regions.push_back(optarg); break; case 'C': @@ -180,6 +197,10 @@ int main_construct(int argc, char** argv) { case 'f': constructor.flat = true; break; + + case 'l': + constructor.max_parsed_variant_size = parse(optarg); + break; case 'h': case '?': @@ -189,8 +210,7 @@ int main_construct(int argc, char** argv) { break; default: - abort (); - + throw runtime_error("Not implemented: " + to_string(c)); } } @@ -209,22 +229,13 @@ int main_construct(int argc, char** argv) { // Actually use the Constructor. // TODO: If we aren't always going to use the Constructor, refactor the subcommand to not always create and configure it. - // We need a callback to handle pieces of graph as they are produced. - auto callback = [&](Graph& big_chunk) { - // Wrap the chunk in a vg object that can properly divide it into - // reasonably sized serialized chunks. - VG* g = new VG(big_chunk, false, true); -#pragma omp critical (cout) - g->serialize_to_ostream_as_part(cout); - }; - // Copy shared parameters into the constructor constructor.max_node_size = max_node_size; constructor.show_progress = show_progress; - - if (!region.empty()) { - // We want to limit to a certain region + unordered_set used_region_contigs; + for (auto& region : regions) { + // We want to limit to one or more region if (!region_is_chrom) { // We are allowed to parse the region. 
// Break out sequence name and region bounds @@ -234,6 +245,12 @@ int main_construct(int argc, char** argv) { seq_name, start_pos, stop_pos); + + if (used_region_contigs.count(seq_name)) { + cerr << "error:[vg construct] cannot construct multiple regions of " << seq_name << endl; + exit(1); + } + used_region_contigs.insert(seq_name); if (start_pos > 0 && stop_pos > 0) { // These are 0-based, so if both are nonzero we got a real set of coordinates @@ -259,79 +276,57 @@ int main_construct(int argc, char** argv) { } } - // This will own all the VCF files - vector> variant_files; - for (auto& vcf_filename : vcf_filenames) { - // Make sure each VCF file exists. Otherwise Tabix++ may exit with a non- - // helpful message. - - // We can't invoke stat woithout a place for it to write. But all we - // really want is its return value. - struct stat temp; - if(stat(vcf_filename.c_str(), &temp)) { - cerr << "error:[vg construct] file \"" << vcf_filename << "\" not found" << endl; - return 1; - } - vcflib::VariantCallFile* variant_file = new vcflib::VariantCallFile(); - variant_file->parseSamples = false; // Major speedup if there are many samples. - variant_files.emplace_back(variant_file); - variant_file->open(vcf_filename); - if (!variant_file->is_open()) { - cerr << "error:[vg construct] could not open" << vcf_filename << endl; - return 1; - } - } if (fasta_filenames.empty()) { cerr << "error:[vg construct] a reference is required for graph construction" << endl; return 1; } - vector> references; - for (auto& fasta_filename : fasta_filenames) { - // Open each FASTA file - FastaReference* reference = new FastaReference(); - references.emplace_back(reference); - reference->open(fasta_filename); - } - - vector > insertions; - for (auto& insertion_filename : insertion_filenames){ - // Open up those insertion files - FastaReference* insertion = new FastaReference(); - insertions.emplace_back(insertion); - insertion->open(insertion_filename); - } - - // Make vectors of just bare pointers - vector vcf_pointers; - for(auto& vcf : variant_files) { - vcf_pointers.push_back(vcf.get()); - } - vector fasta_pointers; - for(auto& fasta : references) { - fasta_pointers.push_back(fasta.get()); - } - vector ins_pointers; - for (auto& ins : insertions){ - ins_pointers.push_back(ins.get()); - } - - if (ins_pointers.size() > 1){ + if (insertion_filenames.size() > 1){ cerr << "Error: only one insertion file may be provided." << endl; exit(1); } - // Construct the graph. - constructor.construct_graph(fasta_pointers, vcf_pointers, - ins_pointers, callback); - - // Now all the graph chunks are written out. - // Add an EOF marker - stream::finish(cout); - - // NB: If you worry about "still reachable but possibly lost" warnings in valgrind, - // this would free all the memory used by protobuf: - //ShutdownProtobufLibrary(); + if (construct_in_memory) { + // Build the whole thing into memory + bdsg::HashGraph constructed; + constructor.construct_graph(fasta_filenames, vcf_filenames, insertion_filenames, &constructed); + constructed.serialize(cout); + } else { + // Make an emitter that serializes the actual Graph objects, with buffering. + // But just serialize one graph at a time in each group. + // Make sure to compress the output. + vg::io::ProtobufEmitter emitter(cout, true, 1); + + // We need a callback to handle pieces of graph as they are produced. 
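+ // The Constructor may invoke this callback from several worker threads at once, which is why writes to the emitter below are wrapped in a critical section.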
+ auto callback = [&](Graph& big_chunk) { + // Sort the nodes by ID so that the serialized chunks come out in sorted order + // TODO: We still interleave chunks from different threads working on different contigs + std::sort(big_chunk.mutable_node()->begin(), big_chunk.mutable_node()->end(), [](const Node& a, const Node& b) -> bool { + // Return true if a comes before b + return a.id() < b.id(); + }); + + // We don't validate the chunk because its end node may be held + // back for the next chunk, while edges and path mappings for it + // still live in this chunk. Also, we no longer create a VG to + // re-chunk the chunk (because we can now handle chunks up to about + // 1 GB serialized), and the VG class has the validator. + + // One thread at a time can write to the emitter and the output stream + #pragma omp critical (emitter) + emitter.write_copy(big_chunk); + }; + + // Construct the graph. + constructor.construct_graph(fasta_filenames, vcf_filenames, insertion_filenames, callback); + + // The output will be flushed when the ProtobufEmitter we use in the callback goes away. + // Don't add an extra EOF marker or anything. + + // NB: If you worry about "still reachable but possibly lost" warnings in valgrind, + // this would free all the memory used by protobuf: + //ShutdownProtobufLibrary(); + } } else if (!msa_filename.empty()) { @@ -358,5 +353,5 @@ int main_construct(int argc, char** argv) { } // Register subcommand -static Subcommand vg_construct("construct", "graph construction", PIPELINE, 1, main_construct); +static Subcommand vg_construct("construct", "graph construction", PIPELINE, 2, main_construct); diff --git a/src/subcommand/convert_main.cpp b/src/subcommand/convert_main.cpp new file mode 100644 index 00000000000..710bdb4458a --- /dev/null +++ b/src/subcommand/convert_main.cpp @@ -0,0 +1,697 @@ +#include "subcommand.hpp" +#include "../vg.hpp" +#include "../utility.hpp" +#include "xg.hpp" +#include "../algorithms/gfa_to_handle.hpp" +#include "../algorithms/find_gbwtgraph.hpp" +#include "../io/save_handle_graph.hpp" +#include "../gfa.hpp" +#include "../gbwt_helper.hpp" +#include "../gbwtgraph_helper.hpp" +#include +#include +#include + +#include "bdsg/packed_graph.hpp" +#include "bdsg/hash_graph.hpp" + +#include +#include + +#include +#include + +using namespace vg; +using namespace vg::subcommand; +using namespace vg::io; + +//------------------------------------------------------------------------------ + +// We need a type for describing what kind of input to parse. +enum input_type { input_handlegraph, input_gam, input_gaf, input_gfa, input_gbwtgraph }; +const input_type INPUT_DEFAULT = input_handlegraph; + +// We also need a type for a tri-state for deciding what kind of GFA output algorithm to use. +enum algorithm_type { algorithm_auto, algorithm_vg, algorithm_gbwtgraph }; +const algorithm_type ALGORITHM_DEFAULT = algorithm_auto; + +void help_convert(char** argv); +void no_multiple_inputs(input_type input); +// Generate an XG with nodes, edges, and paths from input. +// Promote haplotype-sense paths for the samples in ref_samples to reference sense. +// Copy across other haplotype-sense paths unless drop_haplotypes is true. +void graph_to_xg_adjusting_paths(const PathHandleGraph* input, xg::XG* output, const std::unordered_set& ref_samples, bool drop_haplotypes); +// Copy paths from input to output. +// Promote haplotype-sense paths for the samples in ref_samples to reference sense. +// Copy across other haplotype-sense paths unless drop_haplotypes is true.
+void add_and_adjust_paths(const PathHandleGraph* input, MutablePathHandleGraph* output, const std::unordered_set& ref_samples, bool drop_haplotypes); + + +//------------------------------------------------------------------------------ + +int main_convert(int argc, char** argv) { + + string output_format; + input_type input = INPUT_DEFAULT; + int64_t input_rgfa_rank = 0; + string gfa_trans_path; + string input_aln; + string gbwt_name; + unordered_set ref_samples; + bool drop_haplotypes = false; + set rgfa_paths; + vector rgfa_prefixes; + bool rgfa_pline = false; + bool wline = true; + algorithm_type gfa_output_algorithm = ALGORITHM_DEFAULT; + int num_threads = omp_get_max_threads(); // For GBWTGraph to GFA. + + if (argc == 2) { + help_convert(argv); + return 1; + } + + constexpr int OPT_REF_SAMPLE = 1000; + constexpr int OPT_GBWTGRAPH_ALGORITHM = 1001; + constexpr int OPT_VG_ALGORITHM = 1002; + + int c; + optind = 2; // force optind past command positional argument + while (true) { + static struct option long_options[] = + { + {"help", no_argument, 0, 'h'}, + {"gfa-in", no_argument, 0, 'g'}, + {"in-rgfa-rank", required_argument, 0, 'r'}, + {"gbwt-in", required_argument, 0, 'b'}, + {"ref-sample", required_argument, 0, OPT_REF_SAMPLE}, + {"drop-haplotypes", no_argument, 0, 'H'}, + {"vg-out", no_argument, 0, 'v'}, + {"hash-out", no_argument, 0, 'a'}, + {"packed-out", no_argument, 0, 'p'}, + {"xg-out", no_argument, 0, 'x'}, + {"gfa-out", no_argument, 0, 'f'}, + {"rgfa-path", required_argument, 0, 'P'}, + {"rgfa-prefix", required_argument, 0, 'Q'}, + {"rgfa-pline", no_argument, 0, 'B'}, + {"gfa-trans", required_argument, 0, 'T'}, + {"no-wline", no_argument, 0, 'W'}, + {"gbwtgraph-algorithm", no_argument, 0, OPT_GBWTGRAPH_ALGORITHM}, + {"vg-algorithm", no_argument, 0, OPT_VG_ALGORITHM}, + {"gam-to-gaf", required_argument, 0, 'G'}, + {"gaf-to-gam", required_argument, 0, 'F'}, + {"threads", required_argument, 0, 't'}, + {0, 0, 0, 0} + + }; + int option_index = 0; + c = getopt_long (argc, argv, "hgr:b:HvxapxfP:Q:BT:WG:F:t:", + long_options, &option_index); + + // Detect the end of the options. 
+ if (c == -1) + break; + + switch (c) + { + + case '?': + case 'h': + help_convert(argv); + return 0; + case 'g': + no_multiple_inputs(input); + input = input_gfa; + break; + case 'r': + input_rgfa_rank = stol(optarg); + break; + case 'b': + no_multiple_inputs(input); + input = input_gbwtgraph; + gbwt_name = optarg; + break; + case OPT_REF_SAMPLE: + ref_samples.insert(optarg); + break; + case 'H': + drop_haplotypes = true; + break; + case 'v': + output_format = "vg"; + break; + case 'a': + output_format = "hash"; + break; + case 'p': + output_format = "packed"; + break; + case 'x': + output_format = "xg"; + break; + case 'f': + output_format = "gfa"; + break; + case 'P': + rgfa_paths.insert(optarg); + break; + case 'Q': + rgfa_prefixes.push_back(optarg); + break; + case 'B': + rgfa_pline = true; + break; + case 'T': + gfa_trans_path = optarg; + break; + case 'W': + wline = false; + break; + case OPT_GBWTGRAPH_ALGORITHM: + gfa_output_algorithm = algorithm_gbwtgraph; + break; + case OPT_VG_ALGORITHM: + gfa_output_algorithm = algorithm_vg; + break; + case 'G': + no_multiple_inputs(input); + input = input_gam; + input_aln = optarg; + break; + case 'F': + no_multiple_inputs(input); + input = input_gaf; + input_aln = optarg; + break; + case 't': + { + num_threads = parse(optarg); + if (num_threads <= 0) { + cerr << "error:[vg convert] Thread count (-t) set to " << num_threads << ", must set to a positive integer." << endl; + exit(1); + } + omp_set_num_threads(num_threads); + } + break; + default: + abort(); + } + } + + if (!gfa_trans_path.empty() && input != input_gfa) { + cerr << "error [vg convert]: -T can only be used with -g" << endl; + return 1; + } + if (output_format != "gfa" && (!rgfa_paths.empty() || !rgfa_prefixes.empty() || !wline)) { + cerr << "error [vg convert]: -P, -Q, and -W can only be used with -f" << endl; + return 1; + } + if (gfa_output_algorithm == algorithm_gbwtgraph) { + if (output_format != "gfa") { + cerr << "error [vg convert]: Only GFA output format can be used with the GBWTGraph library GFA conversion algorithm" << endl; + return 1; + } + if (input == input_gfa) { + cerr << "error [vg convert]: GFA input cannot be used with the GBWTGraph library GFA conversion algorithm" << endl; + return 1; + } + if (!(rgfa_paths.empty() && rgfa_prefixes.empty() && wline)) { + cerr << "error [vg convert]: GFA output options (-P, -Q, -W) cannot be used with the GBWTGraph library GFA conversion algorithm" << endl; + return 1; + } + } + if (output_format == "gfa" && !ref_samples.empty()) { + cerr << "error [vg convert]: paths cannot be converted to reference sense when writing GFA output" << endl; + return 1; + } + if (output_format == "vg") { + cerr << "[vg convert] warning: vg-protobuf output (-v / --vg-out) is deprecated. please use -p instead." << endl; + } + + + // with -F or -G we convert an alignment and not a graph + if (input == input_gam || input == input_gaf) { + if (!output_format.empty()) { + cerr << "error [vg convert]: Alignment conversion options (-F and -G) cannot be used " + << "with any graph conversion options" << endl; + return 1; + } + + unique_ptr input_graph; + string input_graph_filename = get_input_file_name(optind, argc, argv); + input_graph = vg::io::VPKG::load_one(input_graph_filename); + + unique_ptr emitter = get_non_hts_alignment_emitter("-", (input == input_gam) ? 
"GAF" : "GAM", {}, get_thread_count(), + input_graph.get()); + std::function lambda = [&] (Alignment& aln) { + emitter->emit_singles({aln}); + }; + if (input == input_gam) { + get_input_file(input_aln, [&](istream& in) { + vg::io::for_each_parallel(in, lambda); + }); + } else { + gaf_unpaired_for_each_parallel(*input_graph, input_aln, lambda); + } + return 0; + } + + if (output_format.empty()) { + // default to PackedGraph + output_format = "packed"; + } + + // allocate a graph using the graph_type string to decide a class + unique_ptr output_graph; + if (output_format == "vg") { + output_graph = unique_ptr(new VG()); + } else if (output_format == "hash") { + output_graph = unique_ptr(new bdsg::HashGraph()); + } else if (output_format == "packed") { + output_graph = unique_ptr(new bdsg::PackedGraph()); + } else if (output_format == "xg") { + output_graph = unique_ptr(new xg::XG()); + } else if (output_format == "gfa") { + // we need an intermediary for going gfa to gfa, use packed graph + output_graph = unique_ptr(new bdsg::PackedGraph()); + } + PathHandleGraph* output_path_graph = dynamic_cast(output_graph.get()); + + unique_ptr input_graph; + unique_ptr input_gbwt; + + if (input == input_gfa) { + // we have to check this manually since we're not using the istream-based loading + // functions in order to be able to use the disk-backed loading algorithm + if (optind >= argc) { + cerr << "error [vg convert]: no input graph supplied" << endl; + return 1; + } + string input_stream_name = argv[optind]; + if (output_format == "xg") { + xg::XG* xg_graph = dynamic_cast(output_graph.get()); + + // Need to go through a handle graph + bdsg::HashGraph intermediate; + cerr << "warning [vg convert]: currently cannot convert GFA directly to XG; converting through another format" << endl; + algorithms::gfa_to_path_handle_graph(input_stream_name, &intermediate, + input_rgfa_rank, gfa_trans_path); + graph_to_xg_adjusting_paths(&intermediate, xg_graph, ref_samples, drop_haplotypes); + } + else { + // If the GFA doesn't have forward references, we can handle it + // efficiently even if we are streaming it, so we shouldn't warn about + // "-" input here. + try { + if (output_path_graph != nullptr) { + MutablePathMutableHandleGraph* mutable_output_graph = dynamic_cast(output_path_graph); + assert(mutable_output_graph != nullptr); + algorithms::gfa_to_path_handle_graph(input_stream_name, mutable_output_graph, + input_rgfa_rank, gfa_trans_path); + } + else { + MutableHandleGraph* mutable_output_graph = dynamic_cast(output_graph.get()); + assert(mutable_output_graph != nullptr); + algorithms::gfa_to_handle_graph(input_stream_name, mutable_output_graph, + gfa_trans_path); + } + } catch (algorithms::GFAFormatError& e) { + cerr << "error [vg convert]: Input GFA is not acceptable." << endl; + cerr << e.what() << endl; + exit(1); + } catch (std::ios_base::failure& e) { + cerr << "error [vg convert]: IO error processing input GFA." << endl; + cerr << e.what() << endl; + exit(1); + } + } + } + else { + if (input == input_gbwtgraph) { + // We need to read the input as a GBWTGraph file and attach it to a GBWT. 
+ get_input_file(optind, argc, argv, [&](istream& in) { + input_graph = vg::io::VPKG::load_one(in); + }); + gbwtgraph::GBWTGraph* gbwt_graph = dynamic_cast(input_graph.get()); + if (gbwt_graph == nullptr) { + cerr << "error [vg convert]: input graph is not a GBWTGraph" << endl; + exit(1); + } + input_gbwt = vg::io::VPKG::load_one(gbwt_name); + gbwt_graph->set_gbwt(*input_gbwt); + } else if (input == input_handlegraph) { + string input_graph_filename = get_input_file_name(optind, argc, argv); + input_graph = vg::io::VPKG::load_one(input_graph_filename); + } else { + throw std::runtime_error("Unimplemented input type"); + } + + PathHandleGraph* input_path_graph = dynamic_cast(input_graph.get()); + + // Convert HandleGraph to HandleGraph. + if (output_format != "gfa") { + // XG output. + if (output_format == "xg") { + xg::XG* xg_graph = dynamic_cast(output_graph.get()); + if (input_path_graph != nullptr) { + // We can convert to XG with paths, which we might adjust + graph_to_xg_adjusting_paths(input_path_graph, xg_graph, ref_samples, drop_haplotypes); + } else { + // No paths, just convert to xg without paths + xg_graph->from_handle_graph(*input_graph); + } + } + // PathHandleGraph (possibly with haplotypes) to PathHandleGraph. + else if (input_path_graph != nullptr && output_path_graph != nullptr) { + MutablePathMutableHandleGraph* mutable_output_graph = dynamic_cast(output_path_graph); + assert(mutable_output_graph != nullptr); + // ID hint (TODO: currently not needed since only odgi used it) + mutable_output_graph->set_id_increment(input_graph->min_node_id()); + // Copy the graph as-is + handlealgs::copy_handle_graph(input_graph.get(), mutable_output_graph); + // Copy the paths across with possibly some rewriting + add_and_adjust_paths(input_path_graph, mutable_output_graph, ref_samples, drop_haplotypes); + } + // HandleGraph output. + else { + if (input_path_graph != nullptr) { + cerr << "warning [vg convert]: output format does not support paths, they are being dropped from the input" << endl; + } + MutableHandleGraph* mutable_output_graph = dynamic_cast(output_graph.get()); + assert(mutable_output_graph != nullptr); + // ID hint (TODO: currently not needed since only odgi used it) + mutable_output_graph->set_id_increment(input_graph->min_node_id()); + handlealgs::copy_handle_graph(input_graph.get(), mutable_output_graph); + } + } + } + + // GFA output. + if (output_format == "gfa") { + if (gfa_output_algorithm == algorithm_auto) { + // Determine algorithm to use. + if (!rgfa_paths.empty() || !rgfa_prefixes.empty() || rgfa_pline || !wline) { + // We've asked for special conversion options that only the vg algorithm supports. + gfa_output_algorithm = algorithm_vg; + } else if (vg::algorithms::find_gbwtgraph(input_graph.get())) { + // There's a GBWTGraph available so use that algorithm. + gfa_output_algorithm = algorithm_gbwtgraph; + } else { + // No GBWTGraph is available so use the VG algorithm. + gfa_output_algorithm = algorithm_vg; + } + } + if (gfa_output_algorithm == algorithm_gbwtgraph) { + // We need to find a GBWTGraph to use for this + const gbwtgraph::GBWTGraph* gbwt_graph = vg::algorithms::find_gbwtgraph(input_graph.get()); + if (gbwt_graph == nullptr) { + cerr << "error [vg convert]: input graph does not have a GBWTGraph, so GBWTGraph library GFA conversion algorithm cannot be used." 
<< endl; + return 1; + } + + gbwtgraph::GFAExtractionParameters parameters; + parameters.num_threads = num_threads; + gbwtgraph::gbwt_to_gfa(*gbwt_graph, std::cout, parameters); + } else if (gfa_output_algorithm == algorithm_vg) { + // Use HandleGraph GFA conversion code + const PathHandleGraph* graph_to_write; + if (input == input_gfa) { + graph_to_write = dynamic_cast(output_graph.get()); + } else { + graph_to_write = dynamic_cast(input_graph.get()); + } + for (const string& path_name : rgfa_paths) { + if (!graph_to_write->has_path(path_name)) { + cerr << "error [vg convert]: specified path " << path_name << " not found in graph" << endl; + return 1; + } + } + if (!rgfa_prefixes.empty()) { + graph_to_write->for_each_path_matching({}, {}, {}, [&](path_handle_t path_handle) { + // Scan for any paths of any sense matching an rGFA prefix. + string path_name = graph_to_write->get_path_name(path_handle); + for (const string& prefix : rgfa_prefixes) { + if (path_name.substr(0, prefix.length()) == prefix) { + rgfa_paths.insert(path_name); + continue; + } + } + }); + } + graph_to_gfa(graph_to_write, std::cout, rgfa_paths, rgfa_pline, wline); + } else { + throw std::runtime_error("Unimplemented GFA output algorithm"); + } + } + // Serialize the output graph. + else { + vg::io::save_handle_graph(output_graph.get(), cout); + } + + return 0; +} + +//------------------------------------------------------------------------------ + +void help_convert(char** argv) { + cerr << "usage: " << argv[0] << " convert [options] " << endl + << "input options:" << endl + << " -g, --gfa-in input in GFA format" << endl + << " -r, --in-rgfa-rank N import rgfa tags with rank <= N as paths [default=0]" << endl + << " -b, --gbwt-in FILE input graph is a GBWTGraph using the GBWT in FILE" << endl + << " --ref-sample STR change haplotypes for this sample to reference paths (may repeat)" << endl + << "gfa input options (use with -g):" << endl + << " -T, --gfa-trans FILE write gfa id conversions to FILE" << endl + << "output options:" << endl + << " -v, --vg-out output in VG's original Protobuf format [DEPRECATED: use -p instead]." << endl + << " -a, --hash-out output in HashGraph format" << endl + << " -p, --packed-out output in PackedGraph format [default]" << endl + << " -x, --xg-out output in XG format" << endl + << " -f, --gfa-out output in GFA format" << endl + << " -H, --drop-haplotypes do not include haplotype paths in the output (useful with GBWTGraph / GBZ inputs)" << endl + << "gfa output options (use with -f):" << endl + << " -P, --rgfa-path STR write given path as rGFA tags instead of lines (multiple allowed, only rank-0 supported)" << endl + << " -Q, --rgfa-prefix STR write paths with given prefix as rGFA tags instead of lines (multiple allowed, only rank-0 supported)" << endl + << " -B, --rgfa-pline paths written as rGFA tags also written as lines" << endl + << " -W, --no-wline write all paths as GFA P-lines instead of W-lines. Allows handling multiple phase blocks and subranges used together." << endl + << " --gbwtgraph-algorithm Always use the GBWTGraph library GFA algorithm. Not compatible with other GBWT output options or non-GBWT graphs." << endl + << " --vg-algorithm Always use the VG GFA algorithm. Works with all options and graph types, but can't preserve original GFA coordinates."
<< endl + << "alignment options:" << endl + << " -G, --gam-to-gaf FILE convert GAM FILE to GAF" << endl + << " -F, --gaf-to-gam FILE convert GAF FILE to GAM" << endl + << "general options:" << endl + << " -t, --threads N use N threads (defaults to numCPUs)" << endl; +} + +void no_multiple_inputs(input_type input) { + if (input != INPUT_DEFAULT) { + std::cerr << "error [vg convert]: cannot combine input types (GFA, GBWTGraph, GBZ, GAM, GAF)" << std::endl; + std::exit(EXIT_FAILURE); + } +} + +//------------------------------------------------------------------------------ + +/// Check to make sure the haplotype paths with sample names in the given set +/// have no more than one phase block per sample/haplotype/contig combination. +/// Also return a map from sample name to set of observed haplotype numbers (so +/// we can include them in reference-sense paths only if needed). +std::unordered_map> check_duplicate_path_names(const PathHandleGraph* input, const std::unordered_set& ref_samples) { + // Check to make sure no ref samples have fragmented haplotypes. If they + // do, we can't drop the phase block and can't change the sense to + // reference. Store set of phase blocks by sample, haplotype, contig. + // If we stored just counts, we couldn't handle multiple subranges on the + // same phase block properly. + std::unordered_map, std::unordered_set> phase_block_sets; + // Also determine whether to strip the haplotype numbers; if there are + // multiple haplotypes stored for a sample (and the stored set exceeds size + // 1) we will keep them. + std::unordered_map> sample_to_haplotypes; + if (!ref_samples.empty()) { + input->for_each_path_matching({PathSense::HAPLOTYPE}, ref_samples, {}, [&](const path_handle_t& path) { + // For each path in these samples' haplotypes... + + auto sample = input->get_sample_name(path); + auto haplotype = input->get_haplotype(path); + auto contig = input->get_locus_name(path); + auto phase_block = input->get_phase_block(path); + + // Find the place to remember phase blocks for it + auto& phase_block_set = phase_block_sets[std::tuple(sample, haplotype, contig)]; + + // Insert the phase block + phase_block_set.insert(phase_block); + + if (phase_block_set.size() > 1) { + // We can't resolve these. + std::cerr << "error [vg convert]: multiple phase blocks on sample " << sample + << " haplotype " << haplotype + << " contig " << contig + << " prevent promoting the sample to a reference" << std::endl; + std::exit(EXIT_FAILURE); + } + + // Log its haplotypes + sample_to_haplotypes[sample].insert(haplotype); + }); + } + + return sample_to_haplotypes; +} + +void graph_to_xg_adjusting_paths(const PathHandleGraph* input, xg::XG* output, const std::unordered_set& ref_samples, bool drop_haplotypes) { + // Building an XG uses a slightly different interface, so we duplicate some + // code from the normal MutablePathMutableHandleGraph build. + // TODO: Find a way to unify the duplicated code? + + // Make sure we can safely promote any haplotypes to reference, and get the + // information we need to determine if we need to keep haplotype numbers + // when doing so. + auto sample_to_haplotypes = check_duplicate_path_names(input, ref_samples); + + // Enumerate nodes. + auto for_each_sequence = [&](const std::function& lambda) { + input->for_each_handle([&](const handle_t& handle) { + lambda(input->get_sequence(handle), input->get_id(handle)); + }); + }; + + // Enumerate edges. 
+ auto for_each_edge = [&](const std::function& lambda) { + input->for_each_edge([&](const edge_t& edge) { + lambda(input->get_id(edge.first), input->get_is_reverse(edge.first), + input->get_id(edge.second), input->get_is_reverse(edge.second)); + }); + }; + + // Enumerate path steps. + auto for_each_path_element = [&](const std::function& lambda) { + + // Define a function to copy over a path. + // XG construction relies on name-encoded path metadata. + auto copy_path = [&](const path_handle_t& path, const std::string new_name) { + bool is_circular = input->get_is_circular(path); + for (handle_t handle : input->scan_path(path)) { + lambda(new_name, input->get_id(handle), input->get_is_reverse(handle), "", false, is_circular); + } + // TODO: Should we preserve empty paths here? + }; + + // Copy over the generic and existing reference paths + input->for_each_path_matching({PathSense::GENERIC, PathSense::REFERENCE}, {}, {}, [&](const path_handle_t& path) { + copy_path(path, input->get_path_name(path)); + }); + + if (!ref_samples.empty()) { + // Copy all haplotype paths matching the ref samples as reference + input->for_each_path_matching({PathSense::HAPLOTYPE}, ref_samples, {}, [&](const path_handle_t& path) { + + // Compose the new reference-ified metadata + std::string sample = input->get_sample_name(path); + std::string locus = input->get_locus_name(path); + int64_t haplotype; + if (sample_to_haplotypes[sample].size() > 1) { + // We should preserve the haplotype because we have multiple + // haplotype phases of this sample. + haplotype = input->get_haplotype(path); + } else { + // We should drop the haplotype number because this sample has only + // one haplotype phase. + haplotype = PathMetadata::NO_HAPLOTYPE; + } + auto subrange = input->get_subrange(path); + + // Make a new name with reference-ified metadata. + // Phase block is safe to discard because we checked for duplicates without it. + auto new_name = PathMetadata::create_path_name(PathSense::REFERENCE, + sample, + locus, + haplotype, + PathMetadata::NO_PHASE_BLOCK, + subrange); + + // Copy out to the xg + copy_path(path, new_name); + }); + } + + if (!drop_haplotypes) { + // Copy across any other haplotypes. + input->for_each_path_matching({PathSense::HAPLOTYPE}, {}, {}, [&](const path_handle_t& path) { + if (ref_samples.count(input->get_sample_name(path))) { + // Skip those we already promoted to reference sense + return; + } + copy_path(path, input->get_path_name(path)); + }); + } + }; + + // Build XG. + output->from_enumerators(for_each_sequence, for_each_edge, for_each_path_element, false); +} + +void add_and_adjust_paths(const PathHandleGraph* input, MutablePathHandleGraph* output, const std::unordered_set& ref_samples, bool drop_haplotypes) { + + // Make sure we aren't working with fragmented haplotypes that can't convert to reference sense.
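+ // check_duplicate_path_names() also reports how many haplotypes each promoted sample has, so we know below whether the haplotype number can be dropped from the new reference-sense paths.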
+ auto sample_to_haplotypes = check_duplicate_path_names(input, ref_samples); + + // Copy all generic and reference paths that exist already + input->for_each_path_matching({PathSense::GENERIC, PathSense::REFERENCE}, {}, {}, [&](const path_handle_t& path) { + handlegraph::algorithms::copy_path(input, path, output); + }); + + if (!ref_samples.empty()) { + // Copy all haplotype paths matching the ref samples as reference + input->for_each_path_matching({PathSense::HAPLOTYPE}, ref_samples, {}, [&](const path_handle_t& path) { + + // Compose the new reference-ified metadata + std::string sample = input->get_sample_name(path); + std::string locus = input->get_locus_name(path); + int64_t haplotype; + if (sample_to_haplotypes[sample].size() > 1) { + // We should preserve the haplotype because we have multiple + // haplotype phases of this sample. + haplotype = input->get_haplotype(path); + } else { + // We should drop the haplotype number because this sample has only + // one haplotype phase. + haplotype = PathMetadata::NO_HAPLOTYPE; + } + auto subrange = input->get_subrange(path); + bool is_circular = input->get_is_circular(path); + + // Make a new path with reference-ified metadata. + // Phase block is safe to discard because we checked for duplicates without it. + path_handle_t into_path = output->create_path(PathSense::REFERENCE, + sample, + locus, + haplotype, + PathMetadata::NO_PHASE_BLOCK, + subrange, + is_circular); + + // Copy across the steps + handlegraph::algorithms::copy_path(input, path, output, into_path); + }); + } + + if (!drop_haplotypes) { + // Copy across any other haplotypes. + input->for_each_path_matching({PathSense::HAPLOTYPE}, {}, {}, [&](const path_handle_t& path) { + if (ref_samples.count(input->get_sample_name(path))) { + // Skip those we already promoted to reference sense + return; + } + handlegraph::algorithms::copy_path(input, path, output); + }); + } +} + +//------------------------------------------------------------------------------ + +// Register subcommand +static Subcommand vg_convert("convert", "convert graphs between handle-graph compliant formats as well as GFA", main_convert); diff --git a/src/subcommand/crash_main.cpp b/src/subcommand/crash_main.cpp deleted file mode 100644 index acf02e67fb6..00000000000 --- a/src/subcommand/crash_main.cpp +++ /dev/null @@ -1,125 +0,0 @@ -/** \file crash_main.cpp - * - * Defines the "vg crash" subcommand, which throws errors to test the backtrace system. - */ - - -#include -#include -#include -#include - -#include -#include -#include - -#include "subcommand.hpp" - -#include "../benchmark.hpp" -#include "../utility.hpp" - -using namespace std; -using namespace vg; -using namespace vg::subcommand; - -void help_crash(char** argv){ - cerr << "usage: " << argv[0] << " crash [options]" << endl - << "Throw an error to test error handling" << endl - << endl - << "options: " << endl - << " -t, --threads N number of threads to run" << endl - << endl; -} - -// Give stack unwinding something to do -void recurse_and_error(size_t i) { - i--; - if (i == 0) { - cerr << "Thread " << omp_get_thread_num() << " now crashing!" << endl; - throw runtime_error("Intentional test error from thread " + to_string(omp_get_thread_num())); - } else { - recurse_and_error(i); - cerr << "Don't tail call optimize me!" << endl; - } -} - -int main_crash(int argc, char** argv){ - - int c; - optind = 2; // force optind past command positional argument - while (true) { - static struct option long_options[] = - - { - /* These options set a flag. 
*/ - {"help", no_argument, 0, 'h'}, - {"threads", required_argument, 0, 't'}, - {0, 0, 0, 0} - }; - - int option_index = 0; - c = getopt_long (argc, argv, "ht:", - long_options, &option_index); - - - /* Detect the end of the options. */ - if (c == -1) - break; - - switch (c) - { - - case 't': - omp_set_num_threads(parse(optarg)); - break; - - case 'h': - case '?': - /* getopt_long already printed an error message. */ - help_crash(argv); - exit(1); - break; - - - default: - cerr << "Unimplemented option " << (char) c << endl; - exit(1); - } - } - - #pragma omp parallel for - for (size_t i = 0; i < 1000; i++) { - // Start a bunch of loop iterations that may be on arbitrary threads - #pragma omp task - { - // Make each a different size for more nondeterminism - int iter_count = rand() % 200; - - for (size_t j = 0; j < iter_count; j++) { - // Do some busy work - benchmark_control(); - // Make sure to have lots of sleeps to give opportunities to other threads to get signals - usleep(1); - } - if (i == 432) { - // In one thread, throw an error - recurse_and_error(10); - } - for (size_t j = 0; j < iter_count; j++) { - // Do more busy work - benchmark_control(); - usleep(1); - } - } - } - cerr << "Waiting for tasks" << endl; - #pragma omp taskwait - - - - return 0; -} - -// Register subcommand -static Subcommand vg_crash("crash", "throw an error", DEVELOPMENT, main_crash); - diff --git a/src/subcommand/deconstruct_main.cpp b/src/subcommand/deconstruct_main.cpp index 019e04c874f..975336f6aa4 100644 --- a/src/subcommand/deconstruct_main.cpp +++ b/src/subcommand/deconstruct_main.cpp @@ -14,29 +14,73 @@ #include "../vg.hpp" #include "../deconstructor.hpp" +#include "../integrated_snarl_finder.hpp" +#include "../gbwtgraph_helper.hpp" +#include "../gbwt_helper.hpp" +#include "../gbzgraph.hpp" +#include +#include +#include +#include +#include + +//#define USE_CALLGRIND + +#ifdef USE_CALLGRIND +#include +#endif using namespace std; using namespace vg; using namespace vg::subcommand; void help_deconstruct(char** argv){ - cerr << "usage: " << argv[0] << " deconstruct [options] -p .vg" << endl + cerr << "usage: " << argv[0] << " deconstruct [options] [-p|-P] " << endl << "Outputs VCF records for Snarls present in a graph (relative to a chosen reference path)." << endl << "options: " << endl - << "--path / -p REQUIRED: A reference path to deconstruct against." << endl + << " -p, --path NAME A reference path to deconstruct against (multiple allowed)." << endl + << " -P, --path-prefix NAME All paths [excluding GBWT threads / non-reference GBZ paths] beginning with NAME used as reference (multiple allowed)." << endl + << " Other non-ref paths not considered as samples. " << endl + << " -r, --snarls FILE Snarls file (from vg snarls) to avoid recomputing." << endl + << " -g, --gbwt FILE only consider alt traversals that correspond to GBWT threads FILE (not needed for GBZ graph input)." << endl + << " -T, --translation FILE Node ID translation (as created by vg gbwt --translation) to apply to snarl names and AT fields in output" << endl + << " -O, --gbz-translation Use the ID translation from the input gbz to apply snarl names to snarl names and AT fields in output" << endl + << " -e, --path-traversals Only consider traversals that correspond to paths in the graph." << endl + << " -a, --all-snarls Process all snarls, including nested snarls (by default only top-level snarls reported)." << endl + << " -d, --ploidy N Expected ploidy. 
If more traversals found, they will be flagged as conflicts (default: 2)" << endl + << " -c, --context-jaccard N Set context mapping size used to disambiguate alleles at sites with multiple reference traversals (default: 10000)." << endl + << " -u, --untangle-travs Use context mapping to determine the reference-relative positions of each step in allele traversals (AP INFO field)." << endl + << " -K, --keep-conflicted Retain conflicted genotypes in output." << endl + << " -S, --strict-conflicts Drop genotypes when we have more than one haplotype for any given phase (set by default when using GBWT input)." << endl + << " -C, --contig-only-ref Only use the CONTIG name (and not SAMPLE#CONTIG#HAPLOTYPE etc) for the reference if possible (ie there is only one reference sample)." << endl + << " -t, --threads N Use N threads" << endl + << " -v, --verbose Print some status messages" << endl << endl; } int main_deconstruct(int argc, char** argv){ - //cerr << "WARNING: EXPERIMENTAL" << endl; if (argc <= 2) { help_deconstruct(argv); return 1; } vector refpaths; + vector refpath_prefixes; string graphname; - string outfile = ""; + string snarl_file_name; + string gbwt_file_name; + string translation_file_name; + bool gbz_translation = false; + bool path_restricted_traversals = false; + bool show_progress = false; + int ploidy = 2; + bool set_ploidy = false; + bool all_snarls = false; + bool keep_conflicted = false; + bool strict_conflicts = false; + int context_jaccard_window = 10000; + bool untangle_traversals = false; + bool contig_only_ref = false; int c; optind = 2; // force optind past command positional argument @@ -45,48 +89,280 @@ int main_deconstruct(int argc, char** argv){ { {"help", no_argument, 0, 'h'}, {"path", required_argument, 0, 'p'}, + {"path-prefix", required_argument, 0, 'P'}, + {"path-sep", required_argument, 0, 'H'}, + {"snarls", required_argument, 0, 'r'}, + {"gbwt", required_argument, 0, 'g'}, + {"translation", required_argument, 0, 'T'}, + {"gbz-translation", no_argument, 0, 'O'}, + {"path-traversals", no_argument, 0, 'e'}, + {"ploidy", required_argument, 0, 'd'}, + {"context-jaccard", required_argument, 0, 'c'}, + {"untangle-travs", no_argument, 0, 'u'}, + {"all-snarls", no_argument, 0, 'a'}, + {"keep-conflicted", no_argument, 0, 'K'}, + {"strict-conflicts", no_argument, 0, 'S'}, + {"contig-only-ref", no_argument, 0, 'C'}, + {"threads", required_argument, 0, 't'}, + {"verbose", no_argument, 0, 'v'}, {0, 0, 0, 0} - }; - int option_index = 0; - c = getopt_long (argc, argv, "hp:", - long_options, &option_index); + int option_index = 0; + c = getopt_long (argc, argv, "hp:P:H:r:g:T:OeKSCd:c:uat:v", + long_options, &option_index); - // Detect the end of the options. - if (c == -1) - break; + // Detect the end of the options. 
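// getopt_long() returns -1 once every option has been consumed; the remaining positional
// argument (the input graph) is read later via get_input_file_name(optind, argc, argv).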
+ if (c == -1) + break; - switch (c) - { - case 'p': - refpaths = split(optarg, ","); - break; - case '?': - case 'h': - help_deconstruct(argv); - return 1; - default: - help_deconstruct(argv); - abort(); + switch (c) + { + case 'p': + refpaths.push_back(optarg); + break; + case 'P': + refpath_prefixes.push_back(optarg); + break; + case 'H': + cerr << "Warning [vg deconstruct]: -H is deprecated, and will be ignored" << endl; + break; + case 'r': + snarl_file_name = optarg; + break; + case 'g': + gbwt_file_name = optarg; + break; + case 'T': + translation_file_name = optarg; + break; + case 'O': + gbz_translation = true; + break; + case 'e': + path_restricted_traversals = true; + break; + case 'd': + ploidy = parse(optarg); + set_ploidy = true; + break; + case 'c': + context_jaccard_window = parse(optarg); + break; + case 'u': + untangle_traversals = true; + break; + case 'a': + all_snarls = true; + break; + case 'K': + keep_conflicted = true; + break; + case 'S': + strict_conflicts = true; + break; + case 'C': + contig_only_ref = true; + break; + case 't': + omp_set_num_threads(parse(optarg)); + break; + case 'v': + show_progress = true; + break; + case '?': + case 'h': + help_deconstruct(argv); + return 1; + default: + help_deconstruct(argv); + abort(); + } + + } + + // Read the graph + + unique_ptr path_handle_graph_up; + unique_ptr gbz_graph; + gbwt::GBWT* gbwt_index = nullptr; + PathHandleGraph* path_handle_graph = nullptr; + + string path_handle_graph_filename = get_input_file_name(optind, argc, argv); + auto input = vg::io::VPKG::try_load_first(path_handle_graph_filename); + if (get<0>(input)) { + gbz_graph = std::move(get<0>(input)); + path_handle_graph = gbz_graph.get(); + gbwt_index = &gbz_graph->gbz.index; + } else if (get<1>(input)) { + path_handle_graph_up = std::move(get<1>(input)); + path_handle_graph = path_handle_graph_up.get(); + } else { + cerr << "Error [vg deconstruct]: Input graph is not a GBZ or path handle graph" << endl; + return 1; + } + + if (!gbz_graph && gbz_translation) { + cerr << "Error [vg deconstruct]: -O can only be used when input graph is in GBZ format" << endl; + } + + if (set_ploidy && !path_restricted_traversals && gbwt_file_name.empty() && !gbz_graph) { + cerr << "Error [vg deconstruct]: -d can only be used with -e or -g or GBZ input" << endl; + return 1; + } + + if ((!gbwt_file_name.empty() || gbz_graph) && path_restricted_traversals && !gbz_graph) { + cerr << "Error [vg deconstruct]: -e cannot be used with -g or GBZ input" << endl; + return 1; + } + + if (!gbwt_file_name.empty() || gbz_graph) { + // context jaccard depends on having steps for each alt traversal, which is + // not something we have on hand when getting traversals from the GBWT/GBZ + // so we toggle it off in this case to prevent outputting invalid VCFs (GTs go missing) + // at sites with multiple reference paths + context_jaccard_window = 0; + } + + // We might need to apply an overlay to get good path position queries + bdsg::ReferencePathOverlayHelper overlay_helper; + + // Set up to time making the overlay + clock_t overlay_start_clock = clock(); + std::chrono::time_point overlay_start_time = std::chrono::system_clock::now(); + + // Make the overlay + PathPositionHandleGraph* graph = overlay_helper.apply(path_handle_graph); + + // See how long that took + clock_t overlay_stop_clock = clock(); + std::chrono::time_point overlay_stop_time = std::chrono::system_clock::now(); + double overlay_cpu_seconds = (overlay_stop_clock - overlay_start_clock) / (double)CLOCKS_PER_SEC; + 
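// clock() measures CPU time summed over the whole process, while the system_clock timestamps
// measure wall-clock time; reporting both below shows how much the overlay construction
// benefited from running on multiple threads.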
std::chrono::duration overlay_seconds = overlay_stop_time - overlay_start_time; + + if (show_progress && graph != dynamic_cast(path_handle_graph)) { + std::cerr << "Computed overlay in " << overlay_seconds.count() << " seconds using " << overlay_cpu_seconds << " CPU seconds." << std::endl; + } + + // Read the GBWT + unique_ptr gbwt_index_up; + if (!gbwt_file_name.empty()) { + if (gbwt_index) { + cerr << "Warning [vg deconstruct]: Using GBWT from -g overrides that in input GBZ (you probably don't want to use -g)" << endl; + } + gbwt_index_up = vg::io::VPKG::load_one(gbwt_file_name); + if (!gbwt_index_up) { + cerr << "Error [vg deconstruct]: Unable to load gbwt index file: " << gbwt_file_name << endl; + return 1; + } + gbwt_index = gbwt_index_up.get(); + } + + if (!refpaths.empty()) { + // Check our paths + for (const string& ref_path : refpaths) { + if (!graph->has_path(ref_path)) { + cerr << "error [vg deconstruct]: Reference path \"" << ref_path << "\" not found in graph/gbwt" << endl; + return 1; } + } + } + + if (refpaths.empty() && refpath_prefixes.empty()) { + // No paths specified: use them all + graph->for_each_path_handle([&](path_handle_t path_handle) { + const string& name = graph->get_path_name(path_handle); + if (!Paths::is_alt(name) && PathMetadata::parse_sense(name) != PathSense::HAPLOTYPE) { + refpaths.push_back(name); + } + }); + } + // Read the translation + unique_ptr>> translation; + if (gbz_graph.get() != nullptr && gbz_translation) { + // try to get the translation from the graph + translation = make_unique>>(); + *translation = load_translation_back_map(gbz_graph->gbz.graph); + if (translation->empty()) { + // not worth keeping an empty translation + translation = nullptr; + } + } + if (!translation_file_name.empty()) { + if (!translation->empty()) { + cerr << "Warning [vg deconstruct]: Using translation from -T overrides that in input GBZ (you probably don't want to use -T)" << endl; } - graphname = argv[optind]; - vg::VG* graph; - if (!graphname.empty()){ - ifstream gstream(graphname); - graph = new vg::VG(gstream); + ifstream translation_file(translation_file_name.c_str()); + if (!translation_file) { + cerr << "Error [vg deconstruct]: Unable to load translation file: " << translation_file_name << endl; + return 1; } + translation = make_unique>>(); + *translation = load_translation_back_map(*graph, translation_file); + } + + // Load or compute the snarls + unique_ptr snarl_manager; + if (!snarl_file_name.empty()) { + ifstream snarl_file(snarl_file_name.c_str()); + if (!snarl_file) { + cerr << "Error [vg deconstruct]: Unable to load snarls file: " << snarl_file_name << endl; + return 1; + } + if (show_progress) { + cerr << "Loading snarls" << endl; + } + snarl_manager = vg::io::VPKG::load_one(snarl_file); + } else { + IntegratedSnarlFinder finder(*graph); + if (show_progress) { + cerr << "Finding snarls" << endl; + } + snarl_manager = unique_ptr(new SnarlManager(std::move(finder.find_snarls_parallel()))); + } + + // process the prefixes to find ref paths + if (!refpath_prefixes.empty()) { + graph->for_each_path_handle([&](const path_handle_t& path_handle) { + string path_name = graph->get_path_name(path_handle); + for (auto& prefix : refpath_prefixes) { + if (path_name.compare(0, prefix.size(), prefix) == 0) { + refpaths.push_back(path_name); + break; + } + } + }); + } + + if (refpaths.empty()) { + cerr << "Error [vg deconstruct]: No specified reference path or prefix found in graph" << endl; + return 1; + } - // load graph +#ifdef USE_CALLGRIND + // We want to 
profile stuff that accesses paths, not the loading. + CALLGRIND_START_INSTRUMENTATION; +#endif - // Deconstruct - Deconstructor dd; - dd.deconstruct(refpaths, graph); + // Deconstruct + Deconstructor dd; + if (show_progress) { + cerr << "Deconstructing top-level snarls" << endl; + } + dd.set_translation(translation.get()); + dd.set_nested(all_snarls); + dd.deconstruct(refpaths, graph, snarl_manager.get(), path_restricted_traversals, ploidy, + all_snarls, + context_jaccard_window, + untangle_traversals, + keep_conflicted, + strict_conflicts, + !contig_only_ref, + gbwt_index); return 0; } // Register subcommand -static Subcommand vg_deconstruct("deconstruct", "convert a graph into VCF relative to a reference", main_deconstruct); +static Subcommand vg_deconstruct("deconstruct", "create a VCF from variation in the graph", TOOLKIT, main_deconstruct); diff --git a/src/subcommand/depth_main.cpp b/src/subcommand/depth_main.cpp new file mode 100644 index 00000000000..54284f67fd5 --- /dev/null +++ b/src/subcommand/depth_main.cpp @@ -0,0 +1,277 @@ +/** \file depth_main.cpp + * + * Estimate sequencing depth from a (packed) alignment. + */ + + +#include +#include +#include + +#include +#include + +#include "subcommand.hpp" + +#include +#include +#include "../handle.hpp" +#include +#include "../utility.hpp" +#include "../packer.hpp" +#include "algorithms/coverage_depth.hpp" + +using namespace std; +using namespace vg; +using namespace vg::subcommand; + +void help_depth(char** argv) { + cerr << "usage: " << argv[0] << " depth [options] <graph>" << endl + << "options:" << endl + << " packed coverage depth (print 1-based positional depths along path):" << endl + << " -k, --pack FILE supports created from vg pack for given input graph" << endl + << " -d, --count-dels count deletion edges within the bin as covering reference positions" << endl + << " GAM/GAF coverage depth (print for depth):" << endl + << " -g, --gam FILE read alignments from this GAM file (could be '-' for stdin)" << endl + << " -a, --gaf FILE read alignments from this GAF file (could be '-' for stdin)" << endl + << " -n, --max-nodes N maximum nodes to consider [1000000]" << endl + << " -s, --random-seed N random seed for sampling nodes to consider" << endl + << " -Q, --min-mapq N ignore alignments with mapping quality < N [0]" << endl + << " path coverage depth (print 1-based positional depths along path):" << endl + << " activate by specifying -p without -k" << endl + << " -c, --count-cycles count each time a path steps on a position (by default paths are only counted once)" << endl + << " common options:" << endl + << " -p, --ref-path NAME reference path to call on (multiple allowed.
defaults to all paths)" << endl + << " -P, --paths-by STR select the paths with the given name prefix" << endl + << " -b, --bin-size N bin size (in bases) [1] (2 extra columns printed when N>1: bin-end-pos and stddev)" << endl + << " -m, --min-coverage N ignore nodes with less than N coverage depth [1]" << endl + << " -t, --threads N number of threads to use [all available]" << endl; +} + +int main_depth(int argc, char** argv) { + + if (argc == 2) { + help_depth(argv); + return 1; + } + + string pack_filename; + unordered_set ref_paths_input_set; + vector path_prefixes; + size_t bin_size = 1; + bool count_dels = false; + + string gam_filename; + string gaf_filename; + size_t max_nodes = 1000000; + int random_seed = time(NULL); + size_t min_mapq = 0; + bool count_cycles = false; + + size_t min_coverage = 1; + + int c; + optind = 2; // force optind past command positional argument + while (true) { + + static const struct option long_options[] = { + {"pack", required_argument, 0, 'k'}, + {"ref-path", required_argument, 0, 'p'}, + {"paths-by", required_argument, 0, 'P'}, + {"bin-size", required_argument, 0, 'b'}, + {"count-dels", no_argument, 0, 'd'}, + {"gam", required_argument, 0, 'g'}, + {"gaf", required_argument, 0, 'a'}, + {"max-nodes", required_argument, 0, 'n'}, + {"random-seed", required_argument, 0, 's'}, + {"min-mapq", required_argument, 0, 'Q'}, + {"min-coverage", required_argument, 0, 'm'}, + {"count-cycles", no_argument, 0, 'c'}, + {"threads", required_argument, 0, 't'}, + {"help", no_argument, 0, 'h'}, + {0, 0, 0, 0} + }; + + int option_index = 0; + c = getopt_long (argc, argv, "hk:p:P:b:dg:a:n:s:Q:m:ct:", + long_options, &option_index); + + // Detect the end of the options. + if (c == -1) + break; + + switch (c) + { + case 'k': + pack_filename = optarg; + break; + case 'p': + ref_paths_input_set.insert(optarg); + break; + case 'P': + path_prefixes.push_back(optarg); + break; + case 'b': + bin_size = parse(optarg); + break; + case 'd': + count_dels = true; + break; + case 'g': + gam_filename = optarg; + break; + case 'a': + gaf_filename = optarg; + break; + case 'n': + max_nodes = parse(optarg); + break; + case 's': + random_seed = parse(optarg); + break; + case 'Q': + min_mapq = parse(optarg); + break; + case 'm': + min_coverage = parse(optarg); + break; + case 'c': + count_cycles = true; + break; + case 't': + { + int num_threads = parse(optarg); + if (num_threads <= 0) { + cerr << "error:[vg depth] Thread count (-t) set to " << num_threads << ", must be set to a positive integer." << endl; + exit(1); + } + omp_set_num_threads(num_threads); + break; + } + case 'h': + case '?': + /* getopt_long already printed an error message. */ + help_depth(argv); + exit(1); + break; + default: + abort (); + } + } + + if (argc <= 2) { + help_depth(argv); + return 1; + } + + size_t input_count = pack_filename.empty() ?
0 : 1; + if (!gam_filename.empty()) ++input_count; + if (!gaf_filename.empty()) ++input_count; + if (input_count > 1) { + cerr << "error:[vg depth] At most one of a pack file (-k), a GAM file (-g), or a GAF file (-a) must be given" << endl; + exit(1); + } + + // Read the graph + unique_ptr path_handle_graph; + string path_handle_graph_filename = get_input_file_name(optind, argc, argv); + path_handle_graph = vg::io::VPKG::load_one(path_handle_graph_filename); + PathHandleGraph* graph = path_handle_graph.get(); + + // Apply the overlay if necessary + bdsg::PathVectorizableOverlayHelper overlay_helper; + if (!pack_filename.empty()) { + graph = dynamic_cast(overlay_helper.apply(path_handle_graph.get())); + assert(graph != nullptr); + } + + // Process the pack (or paths) + unique_ptr packer; + if (!pack_filename.empty() || input_count == 0) { + if (!pack_filename.empty()) { + // Load our packed supports (they must have come from vg pack on graph) + packer = unique_ptr(new Packer(graph)); + packer->load_from_file(pack_filename); + } + + // we want our paths sorted by the subpath parse so the output is sorted + map, string> ref_paths; + unordered_set base_path_set; + + graph->for_each_path_handle([&](path_handle_t path_handle) { + string path_name = graph->get_path_name(path_handle); + subrange_t subrange; + string base_name = Paths::strip_subrange(path_name, &subrange); + base_path_set.insert(base_name); + // just take anything if no selection + bool use_it = !Paths::is_alt(path_name) && path_prefixes.empty() && ref_paths_input_set.empty(); + + // then look in the input paths -p + if (!use_it && ref_paths_input_set.count(base_name)) { + use_it = true; + } + + // then look in the prefixes + for (size_t i = 0; i < path_prefixes.size() && !use_it; ++i) { + if (path_name.substr(0, path_prefixes[i].length()) == path_prefixes[i]) { + use_it = true; + } + } + if (use_it) { + auto coord = make_pair(base_name, subrange == PathMetadata::NO_SUBRANGE ? 0 : subrange.first); + assert(!ref_paths.count(coord)); + ref_paths[coord] = path_name; + } + }); + + for (const auto& ref_name : ref_paths_input_set) { + if (!base_path_set.count(ref_name)) { + cerr << "error:[vg depth] Path \"" << ref_name << "\" not found in graph" << endl; + } + } + + for (const auto& ref_coord_path : ref_paths) { + const string& ref_path = ref_coord_path.second; + const string& base_path = ref_coord_path.first.first; + const size_t subpath_offset = ref_coord_path.first.second; + + if (bin_size > 1) { + vector> binned_depth; + if (!pack_filename.empty()) { + binned_depth = algorithms::binned_packed_depth(*packer, ref_path, bin_size, min_coverage, count_dels); + } else { + binned_depth = algorithms::binned_path_depth(*graph, ref_path, bin_size, min_coverage, count_cycles); + } + for (auto& bin_cov : binned_depth) { + // bins can ben nan if min_coverage filters everything out. just skip + if (!isnan(get<3>(bin_cov))) { + cout << base_path << "\t" << (get<0>(bin_cov) + 1 + subpath_offset)<< "\t" << (get<1>(bin_cov) + 1 + subpath_offset) << "\t" << get<2>(bin_cov) + << "\t" << sqrt(get<3>(bin_cov)) << endl; + } + } + } else { + if (!pack_filename.empty()) { + algorithms::packed_depths(*packer, ref_path, min_coverage, cout); + } else { + algorithms::path_depths(*graph, ref_path, min_coverage, count_cycles, cout); + } + } + } + } + + // Process the gam + if (!gam_filename.empty() || !gaf_filename.empty()) { + const string& mapping_filename = !gam_filename.empty() ? 
gam_filename : gaf_filename; + pair mapping_cov; + mapping_cov = algorithms::sample_mapping_depth(*graph, mapping_filename, max_nodes, random_seed, + min_coverage, min_mapq, !gam_filename.empty() ? "GAM" : "GAF"); + cout << mapping_cov.first << "\t" << sqrt(mapping_cov.second) << endl; + } + + return 0; + +} + +// Register subcommand +static Subcommand vg_depth("depth", "estimate sequencing depth", main_depth); + diff --git a/src/subcommand/dotplot_main.cpp b/src/subcommand/dotplot_main.cpp index 40941774ffb..5157b91d4ef 100644 --- a/src/subcommand/dotplot_main.cpp +++ b/src/subcommand/dotplot_main.cpp @@ -14,8 +14,12 @@ #include "../vg.hpp" #include "../xg.hpp" +#include +#include #include "../position.hpp" +#include "algorithms/nearest_offsets_in_paths.hpp" + using namespace std; using namespace vg; using namespace vg::subcommand; @@ -24,7 +28,7 @@ void help_dotplot(char** argv) { cerr << "usage: " << argv[0] << " dotplot [options]" << endl << "options:" << endl << " input:" << endl - << " -x, --xg FILE use the graph in the XG index FILE" << endl; + << " -x, --xg FILE use the graph or the XG index FILE" << endl; //<< " output:" << endl; } @@ -76,19 +80,20 @@ int main_dotplot(int argc, char** argv) { cerr << "[vg dotplot] Error: an xg index is required" << endl; exit(1); } else { - xg::XG xgidx; - ifstream in(xg_file.c_str()); - xgidx.load(in); + unique_ptr path_handle_graph = vg::io::VPKG::load_one(xg_file); + bdsg::PathPositionOverlayHelper overlay_helper; + PathPositionHandleGraph* xindex = overlay_helper.apply(path_handle_graph.get()); + cout << "query.name" << "\t" << "query.pos" << "\t" << "orientation" << "\t" << "target.name" << "\t" << "target.pos" << endl; - xgidx.for_each_handle([&](const handle_t& h) { - vg::id_t id = xgidx.get_id(h); - for (size_t i = 0; i < xgidx.node_length(id); ++i) { + xindex->for_each_handle([&](const handle_t& h) { + vg::id_t id = xindex->get_id(h); + for (size_t i = 0; i < xindex->get_length(xindex->get_handle(id)); ++i) { pos_t p = make_pos_t(id, false, i); - map > > offsets = xgidx.offsets_in_paths(p); + map > > offsets = algorithms::offsets_in_paths(&(*xindex), p); // cross the offsets in output for (auto& o : offsets) { auto& name1 = o.first; diff --git a/src/subcommand/explode_main.cpp b/src/subcommand/explode_main.cpp index 2a8c4bb28b6..f5be68feb4a 100644 --- a/src/subcommand/explode_main.cpp +++ b/src/subcommand/explode_main.cpp @@ -9,7 +9,7 @@ #include "subcommand.hpp" #include "../vg.hpp" -#include "../stream.hpp" +#include #include "../utility.hpp" @@ -71,6 +71,10 @@ int main_explode(int argc, char** argv) { } } + cerr << "vg explode is deprecated. Please use \"vg chunk -C source.vg -b part_dir/component\" for same* functionality as \"vg explode source.vg part_dir\"" << endl + << " * (unlike explode, the output directory must already exist when running chunk, though)" << endl; + return 1; + VG* graph; get_input_file(optind, argc, argv, [&](istream& in) { graph = new VG(in); @@ -89,63 +93,81 @@ int main_explode(int argc, char** argv) { size_t component_index = 0; // Track all the nodes we've already assigned to subgraphs - set used; + unordered_set used; - graph->for_each_node([&](Node* start) { + graph->for_each_handle([&](const handle_t& start) { if (!used.count(start)) { // It's a new connected component! 
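// Do a breadth-first walk out from this handle, copying every reached node, its edges,
// and the paths that touch it into a separate graph for this component.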
+ // TODO: this could be replaced by any handle graph type now VG component; // We want to track the path names in each component - set path_names; + set paths; + + deque queue{start}; - graph->for_each_connected_node(start, [&](Node* n) { - // Mark this connected node as used in a component. - used.insert(n); + // Mark this connected node as used in a component. + used.insert(start); + + while (!queue.empty()) { - // Copy node over - component.create_node(n->sequence(), n->id()); + handle_t handle = queue.front(); + queue.pop_front(); - // Copy over its edges - for (auto* e : graph->edges_of(n)) { - component.add_edge(*e); - } + // Copy node over + handle_t new_handle = component.create_handle(graph->get_sequence(handle), + graph->get_id(handle)); - // Copy paths over - for (auto& path : graph->paths.get_node_mapping_by_path_name(n)) { - // Some paths might not actually touch this node at all. - bool nonempty = false; - for (auto& m : path.second) { - component.paths.append_mapping(path.first, *m); - nonempty = true; + // Copy over its edges and queue the next handles + graph->follow_edges(handle, false, [&](const handle_t& next) { + if (component.has_node(graph->get_id(next))) { + component.create_edge(new_handle, component.get_handle(graph->get_id(next), + graph->get_is_reverse(next))); } - if (nonempty) { - // This path had mappings, so it qualifies for the component - path_names.insert(path.first); + if (!used.count(next)) { + queue.push_back(next); + used.insert(next); } - } - }); + }); + graph->follow_edges(handle, true, [&](const handle_t& prev) { + if (component.has_node(graph->get_id(prev))) { + component.create_edge(component.get_handle(graph->get_id(prev), + graph->get_is_reverse(prev)), new_handle); + } + if (!used.count(prev)) { + queue.push_back(prev); + used.insert(prev); + } + }); + + // Record paths + graph->for_each_step_on_handle(handle, [&](const step_handle_t& step) { + paths.insert(graph->get_path_handle_of_step(step)); + }); + } - // We inserted mappings into the component in more or less arbitrary - // order, so sort them by rank. 
- component.paths.sort_by_mapping_rank(); - // Then rebuild the other path indexes - component.paths.rebuild_mapping_aux(); + // Copy the paths over + for (path_handle_t path_handle : paths) { + path_handle_t new_path_handle = component.create_path_handle(graph->get_path_name(path_handle), + graph->get_is_circular(path_handle)); + for (handle_t handle : graph->scan_path(path_handle)) { + component.append_step(new_path_handle, component.get_handle(graph->get_id(handle), + graph->get_is_reverse(handle))); + } + } // Save the component string filename = output_dir + "/component" + to_string(component_index) + ".vg"; // Now report what paths went into the component in parseable TSV cout << filename; - for (auto& path_name : path_names) { - cout << "\t" << path_name; + for (auto& path_handle : paths) { + cout << "\t" << graph->get_path_name(path_handle); } cout << endl; component.serialize_to_file(filename); - - component_index++; } }); @@ -158,6 +180,6 @@ int main_explode(int argc, char** argv) { } // Register subcommand -static Subcommand vg_explode("explode", "split graph into connected components", main_explode); +static Subcommand vg_explode("explode", "split graph into connected components", DEPRECATED, main_explode); diff --git a/src/subcommand/filter_main.cpp b/src/subcommand/filter_main.cpp index 55a3c4d0ece..7a233e6986e 100644 --- a/src/subcommand/filter_main.cpp +++ b/src/subcommand/filter_main.cpp @@ -14,7 +14,10 @@ #include "subcommand.hpp" #include "../vg.hpp" +#include "../xg.hpp" #include "../readfilter.hpp" +#include +#include using namespace std; using namespace vg; @@ -25,27 +28,35 @@ void help_filter(char** argv) { << "Filter alignments by properties." << endl << endl << "options:" << endl + << " -M, --input-mp-alns input is multipath alignments (GAMP) rather than GAM" << endl << " -n, --name-prefix NAME keep only reads with this prefix in their names [default='']" << endl + << " -N, --name-prefixes FILE keep reads with names with one of many prefixes, one per nonempty line" << endl + << " -a, --subsequence NAME keep reads that contain this subsequence" << endl + << " -A, --subsequences FILE keep reads that contain one of these subsequences, one per nonempty line" << endl + << " -p, --proper-pairs keep reads that are annotated as being properly paired" << endl + << " -P, --only-mapped keep reads that are mapped" << endl << " -X, --exclude-contig REGEX drop reads with refpos annotations on contigs matching the given regex (may repeat)" << endl - << " -s, --min-secondary N minimum score to keep secondary alignment [default=0]" << endl - << " -r, --min-primary N minimum score to keep primary alignment [default=0]" << endl + << " -F, --exclude-feature NAME drop reads with the given feature in the \"features\" annotation (may repeat)" << endl + << " -s, --min-secondary N minimum score to keep secondary alignment" << endl + << " -r, --min-primary N minimum score to keep primary alignment" << endl << " -O, --rescore re-score reads using default parameters and only alignment information" << endl << " -f, --frac-score normalize score based on length" << endl << " -u, --substitutions use substitution count instead of score" << endl << " -o, --max-overhang N filter reads whose alignments begin or end with an insert > N [default=99999]" << endl << " -m, --min-end-matches N filter reads that don't begin with at least N matches on each end" << endl << " -S, --drop-split remove split reads taking nonexistent edges" << endl - << " -x, --xg-name FILE use this xg index (required for -R, -S, 
and -D)" << endl - << " -R, --regions-file only output alignments that intersect regions (BED file with 0-based coordinates expected)" << endl - << " -B, --output-basename output to file(s) (required for -R). The ith file will correspond to the ith BED region" << endl - << " -A, --append-regions append to alignments created with -RB" << endl - << " -c, --context STEPS expand the context of the subgraph this many steps when looking up chunks" << endl + << " -x, --xg-name FILE use this xg index or graph (required for -S and -D)" << endl << " -v, --verbose print out statistics on numbers of reads filtered by what." << endl + << " -V, --no-output print out statistics (as above) but do not write out filtered GAM." << endl << " -q, --min-mapq N filter alignments with mapping quality < N" << endl << " -E, --repeat-ends N filter reads with tandem repeat (motif size <= 2N, spanning >= N bases) at either end" << endl << " -D, --defray-ends N clip back the ends of reads that are ambiguously aligned, up to N bases" << endl << " -C, --defray-count N stop defraying after N nodes visited (used to keep runtime in check) [default=99999]" << endl << " -d, --downsample S.P filter out all but the given portion 0.P of the reads. S may be an integer seed as in SAMtools" << endl + << " -i, --interleaved assume interleaved input. both ends will be filtered out if either fails filter" << endl + << " -I, --interleaved-all assume interleaved input. both ends will be filtered out if *both* fail filters" << endl + << " -b, --min-base-quality Q:F filter reads with where fewer than fraction F bases have base quality >= PHRED score Q." << endl + << " -U, --complement apply the complement of the filter implied by the other arguments." << endl << " -t, --threads N number of threads [1]" << endl; } @@ -55,12 +66,45 @@ int main_filter(int argc, char** argv) { help_filter(argv); return 1; } - - // This is the better design for a subcommand: we have a class that - // implements it and encapsulates all the default parameters, and then we - // just feed in overrides in the option parsing code. Thsi way we don't have - // multiple defaults all over the place. - ReadFilter filter; + + bool input_gam = true; + vector name_prefixes; + vector excluded_refpos_contigs; + unordered_set excluded_features; + vector subsequences; + bool set_min_primary = false; + double min_primary; + bool set_min_secondary = false; + double min_secondary; + bool rescore = false; + bool frac_score = false; + bool sub_score = false; + bool set_max_overhang = false; + int max_overhang; + bool set_min_end_matches = false; + int min_end_matches; + bool drop_split = false; + bool set_min_mapq = false; + int min_mapq; + bool verbose = false; + bool write_output = true; + bool set_repeat_size = false; + int repeat_size; + bool set_defray_length = false; + int defray_length; + bool set_defray_count = false; + int defray_count; + bool set_downsample = false; + uint64_t seed; + double downsample_probability; + bool interleaved = false; + bool filter_on_all = false; + bool set_min_base_quality = false; + int min_base_quality; + double min_base_quality_fraction; + bool complement_filter = false; + bool only_proper_pairs = false; + bool only_mapped = false; // What XG index, if any, should we load to support the other options? 
string xg_name; @@ -70,8 +114,15 @@ int main_filter(int argc, char** argv) { while (true) { static struct option long_options[] = { + {"input-mp-alns", no_argument, 0, 'M'}, {"name-prefix", required_argument, 0, 'n'}, + {"name-prefixes", required_argument, 0, 'N'}, + {"subsequence", required_argument, 0, 'a'}, + {"subsequences", required_argument, 0, 'A'}, + {"proper-pairs", no_argument, 0, 'p'}, + {"only-mapped", no_argument, 0, 'P'}, {"exclude-contig", required_argument, 0, 'X'}, + {"exclude-feature", required_argument, 0, 'F'}, {"min-secondary", required_argument, 0, 's'}, {"min-primary", required_argument, 0, 'r'}, {"rescore", no_argument, 0, 'O'}, @@ -81,22 +132,22 @@ int main_filter(int argc, char** argv) { {"min-end-matches", required_argument, 0, 'm'}, {"drop-split", no_argument, 0, 'S'}, {"xg-name", required_argument, 0, 'x'}, - {"regions-file", required_argument, 0, 'R'}, - {"output-basename", required_argument, 0, 'B'}, - {"append-regions", no_argument, 0, 'A'}, - {"context", required_argument, 0, 'c'}, {"verbose", no_argument, 0, 'v'}, {"min-mapq", required_argument, 0, 'q'}, {"repeat-ends", required_argument, 0, 'E'}, {"defray-ends", required_argument, 0, 'D'}, {"defray-count", required_argument, 0, 'C'}, {"downsample", required_argument, 0, 'd'}, + {"interleaved", no_argument, 0, 'i'}, + {"interleaved-all", no_argument, 0, 'I'}, + {"min-base-quality", required_argument, 0, 'b'}, + {"complement", no_argument, 0, 'U'}, {"threads", required_argument, 0, 't'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "n:X:s:r:Od:e:fauo:m:Sx:R:B:Ac:vq:E:D:C:d:t:", + c = getopt_long (argc, argv, "Mn:N:a:A:pPX:F:s:r:Od:e:fauo:m:Sx:vVq:E:D:C:d:iIb:Ut:", long_options, &option_index); /* Detect the end of the options. */ @@ -105,67 +156,109 @@ int main_filter(int argc, char** argv) { switch (c) { + case 'M': + input_gam = false; + break; case 'n': - filter.name_prefix = optarg; + name_prefixes.push_back(optarg); + break; + case 'N': + get_input_file(optarg, [&](istream& in) { + // Parse the input file right here in the option parsing. + for (string line; getline(in, line);) { + // For each line + if (line.empty()) { + // No empty lines + break; + } + name_prefixes.push_back(line); + } + }); + break; + case 'a': + subsequences.push_back(optarg); + break; + case 'A': + get_input_file(optarg, [&](istream& in) { + // Parse the input file right here in the option parsing. 
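// Note that, as with -N above, reading stops at the first empty line rather than skipping it.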
+ for (string line; getline(in, line);) { + // For each line + if (line.empty()) { + // No empty lines + break; + } + subsequences.push_back(line); + } + }); + break; + case 'p': + only_proper_pairs = true; + break; + case 'P': + only_mapped = true; break; case 'X': - filter.excluded_refpos_contigs.push_back(parse(optarg)); + excluded_refpos_contigs.push_back(parse(optarg)); + break; + case 'F': + excluded_features.insert(optarg); break; case 's': - filter.min_secondary = parse(optarg); + set_min_secondary = true; + min_secondary = parse(optarg); break; case 'r': - filter.min_primary = parse(optarg); + set_min_primary = true; + min_primary = parse(optarg); break; case 'O': - filter.rescore = true; + rescore = true; break; case 'f': - filter.frac_score = true; + frac_score = true; break; case 'u': - filter.sub_score = true; + sub_score = true; break; case 'o': - filter.max_overhang = parse(optarg); + set_max_overhang = true; + max_overhang = parse(optarg); break; case 'm': - filter.min_end_matches = parse(optarg); + set_min_end_matches = true; + min_end_matches = parse(optarg); break; case 'S': - filter.drop_split = true; + drop_split = true; case 'x': xg_name = optarg; break; - case 'R': - filter.regions_file = optarg; - break; - case 'B': - filter.outbase = optarg; - break; - case 'A': - filter.append_regions = true; - break; - case 'c': - filter.context_size = parse(optarg); - break; case 'q': - filter.min_mapq = parse(optarg); + set_min_mapq = true; + min_mapq = parse(optarg); break; case 'v': - filter.verbose = true; + verbose = true; + break; + case 'V': + verbose = true; + write_output = false; break; case 'E': - filter.repeat_size = parse(optarg); + set_repeat_size = true; + repeat_size = parse(optarg); break; case 'D': - filter.defray_length = parse(optarg); + set_defray_length = true; + defray_length = parse(optarg); break; case 'C': - filter.defray_count = parse(optarg); + set_defray_count = true; + defray_count = parse(optarg); break; case 'd': { + set_downsample = true; // We need to split out the seed and the probability in S.P string opt_string(optarg); @@ -180,30 +273,46 @@ int main_filter(int argc, char** argv) { // Everything including and after the decimal point is the probability auto prob_string = opt_string.substr(point); - filter.downsample_probability = parse(prob_string); + downsample_probability = parse(prob_string); // Everything before that is the seed auto seed_string = opt_string.substr(0, point); // Use the 0 seed by default even if no actual seed shows up - int32_t seed = 0; if (seed_string != "") { // If there was a seed (even 0), parse it seed = parse(seed_string); } - - if (seed != 0) { - // We want a nonempty mask. - - // Use the C rng like Samtools does to get a mask. 
- // See https://github.com/samtools/samtools/blob/60138c42cf04c5c473dc151f3b9ca7530286fb1b/sam_view.c#L298-L299 - srand(seed); - filter.downsample_seed_mask = rand(); - } } } break; + case 'i': + interleaved = true; + break; + case 'I': + interleaved = true; + filter_on_all = true; + break; + case 'b': + { + set_min_base_quality = true; + vector parts = split_delims(string(optarg), ":"); + if (parts.size() != 2) { + cerr << "[vg filter] Error: -b expects value in form of :" << endl; + return 1; + } + min_base_quality = parse(parts[0]); + min_base_quality_fraction = parse(parts[1]); + if (min_base_quality_fraction < 0 || min_base_quality_fraction > 1) { + cerr << "[vg filter] Error: second part of -b input must be between 0 and 1" << endl; + return 1; + } + } + break; + case 'U': + complement_filter = true; + break; case 't': - filter.threads = parse(optarg); + omp_set_num_threads(parse(optarg)); break; case 'h': @@ -217,15 +326,7 @@ int main_filter(int argc, char** argv) { abort (); } } - - if (filter.threads < 1) { - cerr << "error:[vg filter]: Cannot use " << filter.threads << " threads." << endl; - exit(1); - } - - omp_set_num_threads(filter.threads); - // setup alignment stream if (optind >= argc) { help_filter(argv); return 1; @@ -233,30 +334,94 @@ int main_filter(int argc, char** argv) { // What should our return code be? int error_code = 0; - - get_input_file(optind, argc, argv, [&](istream& in) { - // Open up the alignment stream - - // If the user gave us an XG index, we probably ought to load it up. - // TODO: make sure if we add any other error exits from this function we - // remember to delete this! - xg::XG* xindex = nullptr; - if (!xg_name.empty()) { - // read the xg index - ifstream xg_stream(xg_name); - if(!xg_stream) { - cerr << "Unable to open xg index: " << xg_name << endl; - error_code = 1; - return; + + // Sort the prefixes for reads we will accept, for efficient search + sort(name_prefixes.begin(), name_prefixes.end()); + + // If the user gave us an XG index, we probably ought to load it up. + PathPositionHandleGraph* xindex = nullptr; + unique_ptr path_handle_graph; + bdsg::ReferencePathOverlayHelper overlay_helper; + if (!xg_name.empty()) { + // read the xg index + path_handle_graph = vg::io::VPKG::load_one(xg_name); + xindex = overlay_helper.apply(path_handle_graph.get()); + } + + // template lambda to set parameters + auto set_params = [&](auto& filter) { + filter.name_prefixes = name_prefixes; + filter.subsequences = subsequences; + filter.excluded_refpos_contigs = excluded_refpos_contigs; + filter.excluded_features = excluded_features; + if (set_min_secondary) { + filter.min_secondary = min_secondary; + } + if (set_min_primary) { + filter.min_primary = min_primary; + } + filter.rescore = rescore; + filter.frac_score = frac_score; + filter.sub_score = sub_score; + if (set_max_overhang){ + filter.max_overhang = max_overhang; + } + if (set_min_end_matches) { + filter.min_end_matches = min_end_matches; + } + filter.drop_split = drop_split; + if (set_min_mapq) { + filter.min_mapq = min_mapq; + } + filter.verbose = verbose; + filter.write_output = write_output; + if (set_repeat_size) { + filter.repeat_size = repeat_size; + } + if (set_defray_length){ + filter.defray_length = defray_length; + } + if (set_defray_count) { + filter.defray_count = defray_count; + } + if (set_downsample) { + filter.downsample_probability = downsample_probability; + if (seed != 0) { + // We want a nonempty mask. + + // Use the C rng like Samtools does to get a mask. 
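// For example, "-d 42.1" keeps roughly 10% of reads: the integer part 42 seeds srand() and
// rand() then supplies the mask, so the same seed always selects the same subset
// (with seed 0 the mask is simply left at its default).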
+ // See https://github.com/samtools/samtools/blob/60138c42cf04c5c473dc151f3b9ca7530286fb1b/sam_view.c#L298-L299 + srand(seed); + filter.downsample_seed_mask = rand(); } - xindex = new xg::XG(xg_stream); } + filter.only_proper_pairs = only_proper_pairs; + filter.only_mapped = only_mapped; + filter.interleaved = interleaved; + filter.filter_on_all = filter_on_all; + if (set_min_base_quality) { + filter.min_base_quality = min_base_quality; + filter.min_base_quality_fraction = min_base_quality_fraction; + } + filter.complement_filter = complement_filter; + filter.threads = get_thread_count(); + filter.graph = xindex; + }; - // Read in the alignments and filter them. - error_code = filter.filter(&in, xindex); + // Read in the alignments and filter them. + get_input_file(optind, argc, argv, [&](istream& in) { + // Open up the alignment stream - if(xindex != nullptr) { - delete xindex; + // Read in the alignments and filter them. + if (input_gam) { + ReadFilter filter; + set_params(filter); + error_code = filter.filter(&in); + } + else { + ReadFilter filter; + set_params(filter); + error_code = filter.filter(&in); } }); @@ -264,5 +429,5 @@ int main_filter(int argc, char** argv) { } // Register subcommand -static Subcommand vg_vectorize("filter", "filter reads", main_filter); +static Subcommand vg_filter("filter", "filter reads", main_filter); diff --git a/src/subcommand/find_main.cpp b/src/subcommand/find_main.cpp index ccadf50820b..406eb2a13a2 100644 --- a/src/subcommand/find_main.cpp +++ b/src/subcommand/find_main.cpp @@ -1,62 +1,68 @@ #include "subcommand.hpp" -#include "../vg.hpp" #include "../utility.hpp" #include "../mapper.hpp" -#include "../stream.hpp" +#include +#include +#include +#include "../io/save_handle_graph.hpp" +#include +#include #include "../region.hpp" -#include "../gam_index.hpp" +#include "../stream_index.hpp" +#include "../algorithms/subgraph.hpp" #include "../algorithms/sorted_id_ranges.hpp" +#include "../algorithms/approx_path_distance.hpp" +#include "../algorithms/extract_connecting_graph.hpp" +#include "../algorithms/walk.hpp" +#include #include #include using namespace vg; using namespace vg::subcommand; +using namespace vg::io; void help_find(char** argv) { cerr << "usage: " << argv[0] << " find [options] >sub.vg" << endl << "options:" << endl - << " -d, --db-name DIR use this db (defaults to .index/)" << endl - << " -x, --xg-name FILE use this xg index (instead of rocksdb db)" << endl << "graph features:" << endl + << " -x, --xg-name FILE use this xg index or graph (instead of rocksdb db)" << endl << " -n, --node ID find node(s), return 1-hop context as graph" << endl << " -N, --node-list FILE a white space or line delimited list of nodes to collect" << endl + << " --mapping FILE also include nodes that map to the selected node ids" << endl << " -e, --edges-end ID return edges on end of node with ID" << endl << " -s, --edges-start ID return edges on start of node with ID" << endl << " -c, --context STEPS expand the context of the subgraph this many steps" << endl << " -L, --use-length treat STEPS in -c or M in -r as a length in bases" << endl - << " -p, --path TARGET find the node(s) in the specified path range(s) TARGET=path[:pos1[-pos2]]" << endl << " -P, --position-in PATH find the position of the node (specified by -n) in the given path" << endl - << " -R, --rank-in PATH find the rank of the node (specified by -n) in the given path" << endl << " -I, --list-paths write out the path names in the index" << endl - << " -X, --approx-pos ID get the approximate 
position of this node" << endl << " -r, --node-range N:M get nodes from N to M" << endl << " -G, --gam GAM accumulate the graph touched by the alignments in the GAM" << endl + << " --connecting-start POS find the graph connecting from POS (node ID, + or -, node offset) to --connecting-end" << endl + << " --connecting-end POS find the graph connecting to POS (node ID, + or -, node offset) from --connecting-start" << endl + << " --connecting-range INT traverse up to INT bases when going from --connecting-start to --connecting-end (default: 100)" << endl + << "subgraphs by path range:" << endl + << " -p, --path TARGET find the node(s) in the specified path range(s) TARGET=path[:pos1[-pos2]]" << endl + << " -R, --path-bed FILE read our targets from the given BED FILE" << endl + << " -E, --path-dag with -p or -R, gets any node in the partial order from pos1 to pos2, assumes id sorted DAG" << endl + << " -W, --save-to PREFIX instead of writing target subgraphs to stdout," << endl + << " write one per given target to a separate file named PREFIX[path]:[start]-[end].vg" << endl + << " -K, --subgraph-k K instead of graphs, write kmers from the subgraphs" << endl + << " -H, --gbwt FILE when enumerating kmers from subgraphs, determine their frequencies in this GBWT haplotype index" << endl << "alignments:" << endl - << " -d, --db-name DIR use this RocksDB database to retrieve alignments" << endl << " -l, --sorted-gam FILE use this sorted, indexed GAM file" << endl - << " -a, --alignments write all alignments from input sorted GAM or RocksDB" << endl << " -o, --alns-on N:M write alignments which align to any of the nodes between N and M (inclusive)" << endl << " -A, --to-graph VG get alignments to the provided subgraph" << endl << "sequences:" << endl - << " -g, --gcsa FILE use this GCSA2 index of the sequence space of the graph" << endl - << " -z, --kmer-size N split up --sequence into kmers of size N" << endl - << " -j, --kmer-stride N step distance between succesive kmers in sequence (default 1)" << endl - << " -S, --sequence STR search for sequence STR using --kmer-size kmers" << endl + << " -g, --gcsa FILE use this GCSA2 index of the sequence space of the graph (required for sequence queries)" << endl + << " -S, --sequence STR search for sequence STR using" << endl << " -M, --mems STR describe the super-maximal exact matches of the STR (gcsa2) in JSON" << endl << " -B, --reseed-length N find non-super-maximal MEMs inside SMEMs of length at least N" << endl << " -f, --fast-reseed use fast SMEM reseeding algorithm" << endl << " -Y, --max-mem N the maximum length of the MEM (default: GCSA2 order)" << endl << " -Z, --min-mem N the minimum length of the MEM (default: 1)" << endl - << " -k, --kmer STR return a graph of edges and nodes matching this kmer" << endl - << " -T, --table instead of a graph, return a table of kmers" << endl - << " (works only with kmers in the index)" << endl - << " -C, --kmer-count report approximate count of kmer (-k) in db" << endl << " -D, --distance return distance on path between pair of nodes (-n). 
if -P not used, best path chosen heurstically" << endl - << "haplotypes:" << endl - << " -H, --haplotypes FILE count xg threads in agreement with alignments in the GAM" << endl - << " -t, --extract-threads extract the threads, writing them as paths to the .vg stream on stdout" << endl - << " -q, --threads-named S return all threads whose names are prefixed with string S (multiple allowed)" << endl << " -Q, --paths-named S return all paths whose names are prefixed with S (multiple allowed)" << endl; } @@ -68,21 +74,17 @@ int main_find(int argc, char** argv) { return 1; } - string db_name; string sequence; - int kmer_size=0; - int kmer_stride = 1; vector kmers; vector node_ids; - string node_list_file; + string node_list_file, node_mapping_file; int context_size=0; bool use_length = false; - bool count_kmers = false; bool kmer_table = false; - vector targets; + vector targets_str; + vector targets; string path_name; bool position_in = false; - bool rank_in = false; string range; string gcsa_in; string xg_name; @@ -90,14 +92,15 @@ int main_find(int argc, char** argv) { int mem_reseed_length = 0; bool use_fast_reseed = true; string sorted_gam_name; - bool get_alignments = false; bool get_mappings = false; string aln_on_id_range; vg::id_t start_id = 0; vg::id_t end_id = 0; bool pairwise_distance = false; - string haplotype_alignments; string gam_file; + pos_t connecting_start = make_pos_t(0, false, 0); + pos_t connecting_end = make_pos_t(0, false, 0); + size_t connecting_range = 100; int max_mem_length = 0; int min_mem_length = 1; string to_graph_file; @@ -105,8 +108,17 @@ int main_find(int argc, char** argv) { vector extract_thread_patterns; bool extract_paths = false; vector extract_path_patterns; - vg::id_t approx_id = 0; bool list_path_names = false; + bool path_dag = false; + string bed_targets_file; + string save_to_prefix; + int subgraph_k = 0; + string gbwt_name; + + constexpr int OPT_MAPPING = 1000; + constexpr int OPT_CONNECTING_START = 1001; + constexpr int OPT_CONNECTING_END = 1002; + constexpr int OPT_CONNECTING_RANGE = 1003; int c; optind = 2; // force optind past command positional argument @@ -114,48 +126,45 @@ int main_find(int argc, char** argv) { static struct option long_options[] = { //{"verbose", no_argument, &verbose_flag, 1}, - {"db-name", required_argument, 0, 'd'}, {"xg-name", required_argument, 0, 'x'}, {"gcsa", required_argument, 0, 'g'}, {"node", required_argument, 0, 'n'}, {"node-list", required_argument, 0, 'N'}, + {"mapping", required_argument, 0, OPT_MAPPING}, {"edges-end", required_argument, 0, 'e'}, {"edges-start", required_argument, 0, 's'}, - {"kmer", required_argument, 0, 'k'}, - {"table", no_argument, 0, 'T'}, {"sequence", required_argument, 0, 'S'}, {"mems", required_argument, 0, 'M'}, {"reseed-length", required_argument, 0, 'B'}, {"fast-reseed", no_argument, 0, 'f'}, - {"kmer-stride", required_argument, 0, 'j'}, - {"kmer-size", required_argument, 0, 'z'}, {"context", required_argument, 0, 'c'}, {"use-length", no_argument, 0, 'L'}, - {"kmer-count", no_argument, 0, 'C'}, {"path", required_argument, 0, 'p'}, + {"path-bed", required_argument, 0, 'R'}, + {"path-dag", no_argument, 0, 'E'}, + {"save-to", required_argument, 0, 'W'}, {"position-in", required_argument, 0, 'P'}, - {"rank-in", required_argument, 0, 'R'}, {"node-range", required_argument, 0, 'r'}, {"sorted-gam", required_argument, 0, 'l'}, - {"alignments", no_argument, 0, 'a'}, {"mappings", no_argument, 0, 'm'}, {"alns-on", required_argument, 0, 'o'}, {"distance", no_argument, 0, 'D'}, - 
{"haplotypes", required_argument, 0, 'H'}, {"gam", required_argument, 0, 'G'}, + {"connecting-start", required_argument, 0, OPT_CONNECTING_START}, + {"connecting-end", required_argument, 0, OPT_CONNECTING_END}, + {"connecting-range", required_argument, 0, OPT_CONNECTING_RANGE}, {"to-graph", required_argument, 0, 'A'}, {"max-mem", required_argument, 0, 'Y'}, {"min-mem", required_argument, 0, 'Z'}, - {"extract-threads", no_argument, 0, 't'}, - {"threads-named", required_argument, 0, 'q'}, {"paths-named", required_argument, 0, 'Q'}, - {"approx-pos", required_argument, 0, 'X'}, {"list-paths", no_argument, 0, 'I'}, + {"subgraph-k", required_argument, 0, 'K'}, + {"gbwt", required_argument, 0, 'H'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "d:x:n:e:s:o:k:hc:LS:z:j:CTp:P:r:l:amg:M:R:B:fDH:G:N:A:Y:Z:tq:X:IQ:", + c = getopt_long (argc, argv, "x:n:e:s:o:hc:LS:p:P:r:l:mg:M:B:fDG:N:A:Y:Z:IQ:ER:W:K:H:", long_options, &option_index); // Detect the end of the options. @@ -164,9 +173,6 @@ int main_find(int argc, char** argv) { switch (c) { - case 'd': - db_name = optarg; - break; case 'x': xg_name = optarg; @@ -176,10 +182,6 @@ int main_find(int argc, char** argv) { gcsa_in = optarg; break; - case 'k': - kmers.push_back(optarg); - break; - case 'S': sequence = optarg; break; @@ -204,21 +206,17 @@ int main_find(int argc, char** argv) { case 'Z': min_mem_length = parse(optarg); break; - - case 'j': - kmer_stride = parse(optarg); - break; - case 'z': - kmer_size = parse(optarg); + case 'p': + targets_str.push_back(optarg); break; - case 'C': - count_kmers = true; + case 'R': + bed_targets_file = optarg; break; - case 'p': - targets.push_back(optarg); + case 'E': + path_dag = true; break; case 'P': @@ -226,9 +224,8 @@ int main_find(int argc, char** argv) { position_in = true; break; - case 'R': - path_name = optarg; - rank_in = true; + case 'W': + save_to_prefix = optarg; break; case 'c': @@ -247,6 +244,10 @@ int main_find(int argc, char** argv) { node_list_file = optarg; break; + case OPT_MAPPING: + node_mapping_file = optarg; + break; + case 'e': end_id = parse(optarg); break; @@ -255,10 +256,6 @@ int main_find(int argc, char** argv) { start_id = parse(optarg); break; - case 'T': - kmer_table = true; - break; - case 'r': range = optarg; break; @@ -266,10 +263,6 @@ int main_find(int argc, char** argv) { case 'l': sorted_gam_name = optarg; break; - - case 'a': - get_alignments = true; - break; case 'I': list_path_names = true; @@ -287,36 +280,39 @@ int main_find(int argc, char** argv) { pairwise_distance = true; break; - case 'H': - haplotype_alignments = optarg; - break; - - case 't': - extract_threads = true; - break; - - case 'q': - extract_threads = true; - extract_thread_patterns.push_back(optarg); - break; - case 'Q': extract_paths = true; extract_path_patterns.push_back(optarg); break; - case 'X': - approx_id = parse(optarg); - break; - case 'G': gam_file = optarg; break; + + case OPT_CONNECTING_START: + connecting_start = parse(optarg); + break; + + case OPT_CONNECTING_END: + connecting_end = parse(optarg); + break; + + case OPT_CONNECTING_RANGE: + connecting_range = parse(optarg); + break; case 'A': to_graph_file = optarg; break; + case 'K': + subgraph_k = atoi(optarg); + break; + + case 'H': + gbwt_name = optarg; + break; + case 'h': case '?': help_find(argv); @@ -332,8 +328,8 @@ int main_find(int argc, char** argv) { return 1; } - if (db_name.empty() && gcsa_in.empty() && xg_name.empty() && sorted_gam_name.empty()) { - cerr << "[vg find] find requires -d, -g, -x, or -l to 
know where to find its database" << endl; + if (gcsa_in.empty() && xg_name.empty() && sorted_gam_name.empty()) { + cerr << "[vg find] find requires -g, -x, or -l to know where to find its database" << endl; return 1; } @@ -347,6 +343,11 @@ int main_find(int argc, char** argv) { exit(1); } + if ((id(connecting_start) == 0) != (id(connecting_end) == 0)) { + cerr << "error:[vg find] --connecting-start and --connecting-end must be specified together." << endl; + exit(1); + } + // process input node list if (!node_list_file.empty()) { ifstream nli; @@ -358,27 +359,67 @@ int main_find(int argc, char** argv) { string line; while (getline(nli, line)){ for (auto& idstr : split_delims(line, " \t")) { - node_ids.push_back(parse(idstr.c_str())); + node_ids.push_back(parse(idstr.c_str())); } } nli.close(); } - // open RocksDB index - unique_ptr vindex; - if (!db_name.empty()) { - vindex = unique_ptr(new Index()); - vindex->open_read_only(db_name); + // Add the duplicate nodes that map to the original node ids according to the + // provided node mapping. + if (!node_mapping_file.empty() && !node_ids.empty()) { + gcsa::NodeMapping mapping; + sdsl::load_from_file(mapping, node_mapping_file); + std::unordered_set original_ids(node_ids.begin(), node_ids.end()); + for (gcsa::size_type id = mapping.begin(); id < mapping.end(); id++) { + if (original_ids.find(mapping(id)) != original_ids.end()) { + node_ids.push_back(id); + } + } } - xg::XG xindex; + PathPositionHandleGraph* xindex = nullptr; + unique_ptr path_handle_graph; + bdsg::PathPositionOverlayHelper overlay_helper; + bool input_gfa = false; if (!xg_name.empty()) { - ifstream in(xg_name.c_str()); - xindex.load(in); + path_handle_graph = vg::io::VPKG::load_one(xg_name); + input_gfa = dynamic_cast(path_handle_graph.get()) != nullptr; + xindex = overlay_helper.apply(path_handle_graph.get()); + + // Remove node ids that do not exist in the graph. + std::vector final_ids; + for (nid_t id : node_ids) { + if (xindex->has_node(id)) { + final_ids.push_back(id); + } else { + std::cerr << "warning: [vg find] no node with id " << id << " in the graph" << std::endl; + } + } + node_ids = final_ids; + } + function()> get_output_graph = [&]() { + if (input_gfa) { + return unique_ptr(new GFAHandleGraph()); + } + // todo: move away from VG here + return unique_ptr(new VG()); + }; + + unique_ptr gbwt_index; + if (!gbwt_name.empty()) { + // We are tracing haplotypes, and we want to use the GBWT instead of the old gPBWT. + // Load the GBWT from its container + gbwt_index = vg::io::VPKG::load_one(gbwt_name.c_str()); + if (gbwt_index.get() == nullptr) { + // Complain if we couldn't. + cerr << "error:[vg find] unable to load gbwt index file" << endl; + return 1; + } } unique_ptr gam_index; - unique_ptr> gam_cursor; + unique_ptr> gam_cursor; if (!sorted_gam_name.empty()) { // Load the GAM index gam_index = unique_ptr(new GAMIndex()); @@ -388,31 +429,6 @@ int main_find(int argc, char** argv) { }); } - if (get_alignments) { - // Dump all the alignments - if (vindex.get() != nullptr) { - // Dump from RocksDB - - vector output_buf; - auto lambda = [&output_buf](const Alignment& aln) { - output_buf.push_back(aln); - stream::write_buffered(cout, output_buf, 100); - }; - vindex->for_each_alignment(lambda); - stream::write_buffered(cout, output_buf, 0); - } else if (gam_index.get() != nullptr) { - // Dump from sorted GAM - // TODO: This is basically a noop. - get_input_file(sorted_gam_name, [&](istream& in) { - // Stream the alignments in and then stream them back out. 
- stream::for_each(in, stream::emit_to(cout)); - }); - } else { - cerr << "error [vg find]: Cannot find alignments without a RocksDB index or a sorted GAM" << endl; - exit(1); - } - } - if (!aln_on_id_range.empty()) { // Parse the range vector parts = split_delims(aln_on_id_range, ":"); @@ -423,37 +439,20 @@ int main_find(int argc, char** argv) { convert(parts.front(), start_id); convert(parts.back(), end_id); } - - if (vindex.get() != nullptr) { - // Find in RocksDB - - // We need a set of all the IDs. - vector ids; - for (auto i = start_id; i <= end_id; ++i) { - ids.push_back(i); - } - - vector output_buf; - auto lambda = [&output_buf](const Alignment& aln) { - output_buf.push_back(aln); - stream::write_buffered(cout, output_buf, 100); - }; - vindex->for_alignment_to_nodes(ids, lambda); - stream::write_buffered(cout, output_buf, 0); - } else if (gam_index.get() != nullptr) { + if (gam_index.get() != nullptr) { // Find in sorted GAM get_input_file(sorted_gam_name, [&](istream& in) { // Make a cursor for input // TODO: Refactor so we can put everything with the GAM index inside one get_input_file call to deduplicate code - stream::ProtobufIterator cursor(in); + vg::io::ProtobufIterator cursor(in); // Find the alignments and dump them to cout - gam_index->find(cursor, start_id, end_id, stream::emit_to(cout)); + gam_index->find(cursor, start_id, end_id, vg::io::emit_to(cout)); }); } else { - cerr << "error [vg find]: Cannot find alignments on range without a RocksDB index or a sorted GAM" << endl; + cerr << "error [vg find]: Cannot find alignments on range without a sorted GAM" << endl; exit(1); } } @@ -462,27 +461,8 @@ int main_find(int argc, char** argv) { // Find alignments touching a graph // Load up the graph - ifstream tgi(to_graph_file); - unique_ptr graph = unique_ptr(new VG(tgi)); - - if (vindex.get() != nullptr) { - // Collet the IDs in a vector - vector ids; - graph->for_each_node([&](Node* n) { ids.push_back(n->id()); }); - - // Throw out the graph - graph.reset(); - - // Find in RocksDB - vector output_buf; - auto lambda = [&output_buf](const Alignment& aln) { - output_buf.push_back(aln); - stream::write_buffered(cout, output_buf, 100); - }; - vindex->for_alignment_to_nodes(ids, lambda); - stream::write_buffered(cout, output_buf, 0); - - } else if (gam_index.get() != nullptr) { + auto graph = vg::io::VPKG::load_one(to_graph_file); + if (gam_index.get() != nullptr) { // Find in sorted GAM // Get the ID ranges from the graph @@ -493,65 +473,64 @@ int main_find(int argc, char** argv) { get_input_file(sorted_gam_name, [&](istream& in) { // Make a cursor for input // TODO: Refactor so we can put everything with the GAM index inside one get_input_file call to deduplicate code - stream::ProtobufIterator cursor(in); + vg::io::ProtobufIterator cursor(in); // Find the alignments and send them to cout - gam_index->find(cursor, ranges, stream::emit_to(cout)); + gam_index->find(cursor, ranges, vg::io::emit_to(cout)); }); } else { - cerr << "error [vg find]: Cannot find alignments on graph without a RocksDB index or a sorted GAM" << endl; + cerr << "error [vg find]: Cannot find alignments on graph without a sorted GAM" << endl; exit(1); } - - } if (!xg_name.empty()) { if (!node_ids.empty() && path_name.empty() && !pairwise_distance) { - // get the context of the node - vector graphs; - set ids; - for (auto node_id : node_ids) ids.insert(node_id); + auto output_graph = get_output_graph(); + auto& graph = *output_graph; for (auto node_id : node_ids) { - Graph g; - xindex.neighborhood(node_id, 
context_size, g, !use_length); - if (context_size == 0) { - for (auto& edge : xindex.edges_of(node_id)) { - // if both ends of the edge are in our targets, keep them - if (ids.count(edge.to()) && ids.count(edge.from())) { - *g.add_edge() = edge; - } - } - } - graphs.push_back(g); + graph.create_handle(xindex->get_sequence(xindex->get_handle(node_id)), node_id); } - VG result_graph; - for (auto& graph : graphs) { - // Allow duplicate nodes and edges (from e.g. multiple -n options); silently collapse them. - result_graph.extend(graph); + if (context_size > 0) { + if (use_length) { + vg::algorithms::expand_subgraph_by_length(*xindex, graph, context_size); + } else { + vg::algorithms::expand_subgraph_by_steps(*xindex, graph, context_size); + } + } else { + vg::algorithms::add_connecting_edges_to_subgraph(*xindex, graph); } - result_graph.remove_orphan_edges(); + vg::algorithms::add_subpaths_to_subgraph(*xindex, graph); + + VG* vg_graph = dynamic_cast(&graph); + if (vg_graph) { + vg_graph->remove_orphan_edges(); - // Order the mappings by rank. TODO: how do we handle breaks between - // different sections of a path with a single name? - result_graph.paths.sort_by_mapping_rank(); + // Order the mappings by rank. TODO: how do we handle breaks between + // different sections of a path with a single name? + vg_graph->paths.sort_by_mapping_rank(); + } // return it - result_graph.serialize_to_ostream(cout); + vg::io::save_handle_graph(&graph, cout); // TODO: We're serializing graphs all with their own redundant EOF markers if we use multiple functions simultaneously. } else if (end_id != 0) { - for (auto& e : xindex.edges_on_end(end_id)) { - cout << (e.from_start() ? -1 : 1) * e.from() << "\t" << (e.to_end() ? -1 : 1) * e.to() << endl; - } + xindex->follow_edges(xindex->get_handle(end_id), false, [&](handle_t next) { + edge_t e = xindex->edge_handle(xindex->get_handle(end_id, false), next); + cout << (xindex->get_is_reverse(e.first) ? -1 : 1) * xindex->get_id(e.first) << "\t" + << (xindex->get_is_reverse(e.second) ? -1 : 1) * xindex->get_id(e.second) << endl; + }); } else if (start_id != 0) { - for (auto& e : xindex.edges_on_start(start_id)) { - cout << (e.from_start() ? -1 : 1) * e.from() << "\t" << (e.to_end() ? -1 : 1) * e.to() << endl; - } + xindex->follow_edges(xindex->get_handle(start_id), true, [&](handle_t next) { + edge_t e = xindex->edge_handle(xindex->get_handle(start_id, true), next); + cout << (xindex->get_is_reverse(e.first) ? -1 : 1) * xindex->get_id(e.first) << "\t" + << (xindex->get_is_reverse(e.second) ? -1 : 1) * xindex->get_id(e.second) << endl; + }); } - if (!node_ids.empty() && !path_name.empty() && !pairwise_distance && (position_in || rank_in)) { + if (!node_ids.empty() && !path_name.empty() && !pairwise_distance && position_in) { // Go get the positions of these nodes in this path - if (xindex.path_rank(path_name) == 0) { + if (xindex->has_path(path_name) == false) { // This path doesn't exist, and we'll get a segfault or worse if // we go look for positions in it. cerr << "[vg find] error, path \"" << path_name << "\" not found in index" << endl; @@ -560,12 +539,15 @@ int main_find(int argc, char** argv) { // Note: this isn't at all consistent with -P option with rocksdb, which couts a range // and then mapping, but need this info right now for scripts/chunked_call + path_handle_t path_handle = xindex->get_path_handle(path_name); for (auto node_id : node_ids) { cout << node_id; - for (auto r : (position_in ? 
xindex.position_in_path(node_id, path_name) - : xindex.node_ranks_in_path(node_id, path_name))) { - cout << "\t" << r; - } + assert(position_in); + xindex->for_each_step_on_handle(xindex->get_handle(node_id), [&](step_handle_t step_handle) { + if (xindex->get_path_handle_of_step(step_handle) == path_handle) { + cout << "\t" << xindex->get_position_of_step(step_handle); + } + }); cout << endl; } } @@ -574,51 +556,161 @@ int main_find(int argc, char** argv) { cerr << "[vg find] error, exactly 2 nodes (-n) required with -D" << endl; exit(1); } - cout << xindex.min_approx_path_distance(node_ids[0], node_ids[1]) << endl; - return 0; - } - if (approx_id != 0) { - cout << xindex.node_start(approx_id) << endl; + cout << vg::algorithms::min_approx_path_distance(dynamic_cast(&*xindex), make_pos_t(node_ids[0], false, 0), make_pos_t(node_ids[1], false, 0), 1000) << endl; return 0; } if (list_path_names) { - size_t m = xindex.max_path_rank(); - for (size_t i = 1; i <= m; ++i) { - cout << xindex.path_name(i) << endl; - } + xindex->for_each_path_handle([&](path_handle_t path_handle) { + cout << xindex->get_path_name(path_handle) << endl; + }); + } + // handle targets from BED + if (!bed_targets_file.empty()) { + parse_bed_regions(bed_targets_file, targets); + } + // those given on the command line + for (auto& target : targets_str) { + Region region; + parse_region(target, region); + targets.push_back(region); } if (!targets.empty()) { - Graph graph; + auto output_graph = get_output_graph(); + auto& graph = *output_graph; + auto prep_graph = [&](void) { + if (context_size > 0) { + if (use_length) { + vg::algorithms::expand_subgraph_by_length(*xindex, graph, context_size); + } else { + vg::algorithms::expand_subgraph_by_steps(*xindex, graph, context_size); + } + } else { + vg::algorithms::add_connecting_edges_to_subgraph(*xindex, graph); + } + vg::algorithms::add_subpaths_to_subgraph(*xindex, graph); + VG* vg_graph = dynamic_cast(&graph); + if (vg_graph) { + vg_graph->remove_orphan_edges(); + + // Order the mappings by rank. TODO: how do we handle breaks between + // different sections of a path with a single name? 
+ vg_graph->paths.sort_by_mapping_rank(); + } + }; for (auto& target : targets) { // Grab each target region - string name; - int64_t start, end; - parse_region(target, name, start, end); - if(xindex.path_rank(name) == 0) { + if(xindex->has_path(target.seq) == false) { // Passing a nonexistent path to get_path_range produces Undefined Behavior - cerr << "[vg find] error, path " << name << " not found in index" << endl; + cerr << "[vg find] error, path " << target.seq << " not found in index" << endl; exit(1); } + path_handle_t path_handle = xindex->get_path_handle(target.seq); // no coordinates given, we do whole thing (0,-1) - if (start < 0 && end < 0) { - start = 0; + if (target.start < 0 && target.end < 0) { + target.start = 0; + } + vg::algorithms::extract_path_range(*xindex, path_handle, target.start, target.end, graph); + if (path_dag) { + // find the start and end node of this + // and fill things in + nid_t id_start = std::numeric_limits::max(); + nid_t id_end = 1; + graph.for_each_handle([&](handle_t handle) { + nid_t id = graph.get_id(handle); + id_start = std::min(id_start, id); + id_end = std::max(id_end, id); + }); + vg::algorithms::extract_id_range(*xindex, id_start, id_end, graph); + } + if (!save_to_prefix.empty()) { + prep_graph(); + // write to our save_to file + stringstream s; + s << save_to_prefix << target.seq; + if (target.end >= 0) s << ":" << target.start << ":" << target.end; + s << ".vg"; + ofstream out(s.str().c_str()); + vg::io::save_handle_graph(&graph, out); + out.close(); + // reset our graph + dynamic_cast(graph).clear(); + } + if (subgraph_k) { + prep_graph(); // don't forget to prep the graph, or the kmer set will be wrong[ + // enumerate the kmers, calculating including their start positions relative to the reference + // and write to stdout? + bool use_gbwt = false; + if (!gbwt_name.empty()) { + use_gbwt = true; + } + vg::algorithms::for_each_walk( + graph, subgraph_k, 0, + [&](const vg::algorithms::walk_t& walk) { + // get the reference-relative position + string start_str, end_str; + for (auto& p : vg::algorithms::nearest_offsets_in_paths(xindex, walk.begin, subgraph_k*2)) { + const uint64_t& start_p = p.second.front().first; + const bool& start_rev = p.second.front().second; + if (p.first == path_handle && (!start_rev && start_p >= target.start || start_rev && start_p <= target.end)) { + start_str = target.seq + ":" + std::to_string(start_p) + (p.second.front().second ? "-" : "+"); + } + } + for (auto& p : vg::algorithms::nearest_offsets_in_paths(xindex, walk.end, subgraph_k*2)) { + const uint64_t& end_p = p.second.front().first; + const bool& end_rev = p.second.front().second; + if (p.first == path_handle && (!end_rev && end_p <= target.end || end_rev && end_p >= target.start)) { + end_str = target.seq + ":" + std::to_string(end_p) + (p.second.front().second ? 
"-" : "+"); + } + } + if (!start_str.empty() && !end_str.empty()) { + stringstream ss; + ss << target.seq << ":" << target.start << "-" << target.end << "\t" + << walk.seq << "\t" << start_str << "\t" << end_str << "\t"; + uint64_t on_path = 0; + for (auto& h : walk.path) { + xindex->for_each_step_on_handle(xindex->get_handle(graph.get_id(h), graph.get_is_reverse(h)), + [&](const step_handle_t& step) { + if (xindex->get_path_handle_of_step(step) == path_handle) { + ++on_path; + } + }); + } + // get haplotype frequency + if (use_gbwt) { + ss << walk_haplotype_frequency(graph, *gbwt_index, walk) << "\t"; + } else { + ss << 0 << "\t"; + } + if (on_path == walk.path.size()) { + ss << "ref" << "\t"; + } else { + ss << "non.ref" << "\t"; + } + for (auto& h : walk.path) { + ss << graph.get_id(h) << (graph.get_is_reverse(h)?"-":"+") << ","; + } + if (use_gbwt) { + ss << "\t"; + for (auto& name : walk_haplotype_names(graph, *gbwt_index, walk)) { + ss << name << ","; + } + } + // write our record +#pragma omp critical (cout) + cout << ss.str() << std::endl; + } + }); } - xindex.get_path_range(name, start, end, graph); } - if (context_size > 0) { - xindex.expand_context(graph, context_size, true, !use_length); + if (save_to_prefix.empty() && !subgraph_k) { + prep_graph(); + vg::io::save_handle_graph(&graph, cout); } - VG vgg; vgg.extend(graph); // removes dupes - - // Order the mappings by rank. TODO: how do we handle breaks between - // different sections of a path with a single name? - vgg.paths.sort_by_mapping_rank(); - - vgg.serialize_to_ostream(cout); } if (!range.empty()) { - Graph graph; - int64_t id_start=0, id_end=0; + auto output_graph = get_output_graph(); + auto& graph = *output_graph; + nid_t id_start=0, id_end=0; vector parts = split_delims(range, ":"); if (parts.size() == 1) { cerr << "[vg find] error, format of range must be \"N:M\" where start id is N and end id is M, got " << range << endl; @@ -627,99 +719,52 @@ int main_find(int argc, char** argv) { convert(parts.front(), id_start); convert(parts.back(), id_end); if (!use_length) { - xindex.get_id_range(id_start, id_end, graph); + vg::algorithms::extract_id_range(*xindex, id_start, id_end, graph); } else { // treat id_end as length instead. - xindex.get_id_range_by_length(id_start, id_end, graph, true); + size_t length = 0; + nid_t found_id_end = id_start; + for (nid_t cur_id = id_start; length < id_end; ++cur_id) { + if (!xindex->has_node(cur_id)) { + break; + } + length += xindex->get_length(xindex->get_handle(cur_id)); + found_id_end = cur_id; + } + vg::algorithms::extract_id_range(*xindex, id_start, found_id_end, graph); } if (context_size > 0) { - xindex.expand_context(graph, context_size, true, !use_length); - } - VG vgg; vgg.extend(graph); // removes dupes - vgg.remove_orphan_edges(); - vgg.serialize_to_ostream(cout); - } - if(!haplotype_alignments.empty()) { - // What should we do with each alignment? - function lambda = [&xindex](Alignment& aln) { - // Count the amtches to the path. The path might be empty, in - // which case it will yield the biggest size_t you can have. - size_t matches = xindex.count_matches(aln.path()); - - // We do this single-threaded, at least for now, so we don't - // need to worry about coordinating output, and we can just - // spit out the counts as bare numbers. 
- cout << matches << endl; - }; - if (haplotype_alignments == "-") { - stream::for_each(std::cin, lambda); - } else { - ifstream in; - in.open(haplotype_alignments.c_str()); - if(!in.is_open()) { - cerr << "[vg find] error: could not open alignments file " << haplotype_alignments << endl; - exit(1); + if (use_length) { + vg::algorithms::expand_subgraph_by_length(*xindex, graph, context_size); + } else { + vg::algorithms::expand_subgraph_by_steps(*xindex, graph, context_size); } - stream::for_each(in, lambda); - } - - } - if (extract_threads) { - bool extract_reverse = false; - map > threads; - if (extract_thread_patterns.empty()) { - threads = xindex.extract_threads(extract_reverse); } else { - for (auto& pattern : extract_thread_patterns) { - for (auto& t : xindex.extract_threads_matching(pattern, extract_reverse)) { - threads[t.first] = t.second; - } - } + vg::algorithms::add_connecting_edges_to_subgraph(*xindex, graph); } - for(auto t : threads) { - // Convert to a Path - auto& thread = *t.second.begin(); - auto& thread_name = t.first; - Path path; - for(xg::XG::ThreadMapping& m : thread) { - // Convert all the mappings - Mapping mapping; - mapping.mutable_position()->set_node_id(m.node_id); - mapping.mutable_position()->set_is_reverse(m.is_reverse); - Edit* e = mapping.add_edit(); - size_t l = xindex.node_length(m.node_id); - e->set_from_length(l); - e->set_to_length(l); - *(path.add_mapping()) = mapping; - } - - // Get each thread's name - path.set_name(thread_name); + vg::algorithms::add_subpaths_to_subgraph(*xindex, graph); - // We need a Graph for serialization purposes. We do one chunk per - // thread in case the threads are long. - Graph g; - *(g.add_path()) = path; - - // Dump the graph with its mappings. TODO: can we restrict these to - vector gb = { g }; - stream::write_buffered(cout, gb, 0); + VG* vg_graph = dynamic_cast(&graph); + if (vg_graph) { + vg_graph->remove_orphan_edges(); } + vg::io::save_handle_graph(&graph, cout); } if (extract_paths) { - vector paths; for (auto& pattern : extract_path_patterns) { - for (auto& p : xindex.paths_by_prefix(pattern)) { - paths.push_back(p); - } - } - for(auto& path : paths) { - // We need a Graph for serialization purposes. - Graph g; - *(g.add_path()) = xindex.path(path.name()); - // Dump the graph with its mappings. TODO: can we restrict these to - vector gb = { g }; - stream::write_buffered(cout, gb, 0); + + // We want to write uncompressed protobuf Graph objects containing our paths. + vg::io::ProtobufEmitter out(cout, false); + + xindex->for_each_path_handle([&](path_handle_t path_handle) { + string path_name = xindex->get_path_name(path_handle); + if (pattern.length() <= path_name.length() && path_name.compare(0, pattern.length(), pattern) == 0) { + // We need a Graph for serialization purposes. 
+ Graph g; + *g.add_path() = path_from_path_handle(*xindex, path_handle); + out.write(std::move(g)); + } + }); } } if (!gam_file.empty()) { @@ -739,7 +784,7 @@ int main_find(int argc, char** argv) { } }; if (gam_file == "-") { - stream::for_each(std::cin, lambda); + vg::io::for_each(std::cin, lambda); } else { ifstream in; in.open(gam_file.c_str()); @@ -747,105 +792,25 @@ int main_find(int argc, char** argv) { cerr << "[vg find] error: could not open alignments file " << gam_file << endl; exit(1); } - stream::for_each(in, lambda); + vg::io::for_each(in, lambda); } // now we have the nodes to get - Graph graph; + auto output_graph = get_output_graph(); + auto& graph = *output_graph; for (auto& node : nodes) { - *graph.add_node() = xindex.node(node); + handle_t node_handle = xindex->get_handle(node); + graph.create_handle(xindex->get_sequence(node_handle), xindex->get_id(node_handle)); } - xindex.expand_context(graph, max(1, context_size), true); // get connected edges - VG vgg; vgg.extend(graph); - vgg.serialize_to_ostream(cout); + vg::algorithms::expand_subgraph_by_steps(*xindex, graph, max(1, context_size)); // get connected edges + vg::algorithms::add_connecting_edges_to_subgraph(*xindex, graph); + vg::io::save_handle_graph(&graph, cout); } - } else if (!db_name.empty()) { - if (!node_ids.empty() && path_name.empty()) { - // get the context of the node - vector graphs; - for (auto node_id : node_ids) { - VG g; - vindex->get_context(node_id, g); - if (context_size > 0) { - vindex->expand_context(g, context_size); - } - graphs.push_back(g); - } - VG result_graph; - for (auto& graph : graphs) { - // Allow duplicate nodes and edges (from e.g. multiple -n options); silently collapse them. - result_graph.extend(graph); - } - result_graph.remove_orphan_edges(); - // return it - result_graph.serialize_to_ostream(cout); - } else if (end_id != 0) { - vector edges; - vindex->get_edges_on_end(end_id, edges); - for (vector::iterator e = edges.begin(); e != edges.end(); ++e) { - cout << (e->from_start() ? -1 : 1) * e->from() << "\t" << (e->to_end() ? -1 : 1) * e->to() << endl; - } - } else if (start_id != 0) { - vector edges; - vindex->get_edges_on_start(start_id, edges); - for (vector::iterator e = edges.begin(); e != edges.end(); ++e) { - cout << (e->from_start() ? -1 : 1) * e->from() << "\t" << (e->to_end() ? -1 : 1) * e->to() << endl; - } - } - if (!node_ids.empty() && !path_name.empty()) { - int64_t path_id = vindex->get_path_id(path_name); - for (auto node_id : node_ids) { - list> path_prev, path_next; - int64_t prev_pos=0, next_pos=0; - bool prev_backward, next_backward; - if (vindex->get_node_path_relative_position(node_id, false, path_id, - path_prev, prev_pos, prev_backward, - path_next, next_pos, next_backward)) { - - // Negate IDs for backward nodes - cout << node_id << "\t" << path_prev.front().first * (path_prev.front().second ? -1 : 1) << "\t" << prev_pos - << "\t" << path_next.back().first * (path_next.back().second ? 
-1 : 1) << "\t" << next_pos << "\t"; - - Mapping m = vindex->path_relative_mapping(node_id, false, path_id, - path_prev, prev_pos, prev_backward, - path_next, next_pos, next_backward); - cout << pb2json(m) << endl; - } - } - } - if (!targets.empty()) { - VG graph; - for (auto& target : targets) { - string name; - int64_t start, end; - parse_region(target, name, start, end); - // end coordinate is exclusive for get_path() - if (end >= 0) { - ++end; - } - vindex->get_path(graph, name, start, end); - } - if (context_size > 0) { - vindex->expand_context(graph, context_size); - } - graph.remove_orphan_edges(); - graph.serialize_to_ostream(cout); - } - if (!range.empty()) { - VG graph; - int64_t id_start=0, id_end=0; - vector parts = split_delims(range, ":"); - if (parts.size() == 1) { - cerr << "[vg find] error, format of range must be \"N:M\" where start id is N and end id is M, got " << range << endl; - exit(1); - } - convert(parts.front(), id_start); - convert(parts.back(), id_end); - vindex->get_range(id_start, id_end, graph); - if (context_size > 0) { - vindex->expand_context(graph, context_size); - } - graph.remove_orphan_edges(); - graph.serialize_to_ostream(cout); + if (id(connecting_start) != 0) { + // Extract a connecting graph + auto output_graph = get_output_graph(); + auto& graph = *output_graph; + vg::algorithms::extract_connecting_graph(xindex, &graph, connecting_range, connecting_start, connecting_end); + vg::io::save_handle_graph(&graph, cout); } } @@ -853,100 +818,42 @@ int main_find(int argc, char** argv) { if (!sequence.empty()) { if (gcsa_in.empty()) { - if (get_mems) { - cerr << "error:[vg find] a GCSA index must be passed to get MEMs" << endl; - return 1; - } - set kmer_sizes = vindex->stored_kmer_sizes(); - if (kmer_sizes.empty()) { - cerr << "error:[vg find] index does not include kmers, add with vg index -k" << endl; - return 1; - } - if (kmer_size == 0) { - kmer_size = *kmer_sizes.begin(); - } - for (int i = 0; i <= sequence.size()-kmer_size; i+=kmer_stride) { - kmers.push_back(sequence.substr(i,kmer_size)); - } - } else { - // let's use the GCSA index - - // Configure GCSA2 verbosity so it doesn't spit out loads of extra info - gcsa::Verbosity::set(gcsa::Verbosity::SILENT); - - // Configure its temp directory to the system temp directory - gcsa::TempFile::setDirectory(temp_file::get_dir()); - - // Open it - ifstream in_gcsa(gcsa_in.c_str()); - gcsa::GCSA gcsa_index; - gcsa_index.load(in_gcsa); - gcsa::LCPArray lcp_index; - // default LCP is the gcsa base name +.lcp - string lcp_in = gcsa_in + ".lcp"; - ifstream in_lcp(lcp_in.c_str()); - lcp_index.load(in_lcp); - //range_type find(const char* pattern, size_type length) const; - //void locate(size_type path, std::vector& results, bool append = false, bool sort = true) const; - //locate(i, results); - if (!get_mems) { - auto paths = gcsa_index.find(sequence.c_str(), sequence.length()); - //cerr << paths.first << " - " << paths.second << endl; - for (gcsa::size_type i = paths.first; i <= paths.second; ++i) { - std::vector ids; - gcsa_index.locate(i, ids); - for (auto id : ids) { - cout << gcsa::Node::decode(id) << endl; - } - } - } else { - // for mems we need to load up the gcsa and lcp structures into the mapper - Mapper mapper(&xindex, &gcsa_index, &lcp_index); - mapper.fast_reseed = use_fast_reseed; - // get the mems - double lcp_max, fraction_filtered; - auto mems = mapper.find_mems_deep(sequence.begin(), sequence.end(), lcp_max, fraction_filtered, max_mem_length, min_mem_length, mem_reseed_length); - - // dump them 
to stdout - cout << mems_to_json(mems) << endl; - - } + cerr << "error:[vg find] need GCSA index to query sequences" << endl; + return 1; } - } - - if (!kmers.empty()) { - if (count_kmers) { - for (auto& kmer : kmers) { - cout << kmer << "\t" << vindex->approx_size_of_kmer_matches(kmer) << endl; - } - } else if (kmer_table) { - for (auto& kmer : kmers) { - map > > positions; - vindex->get_kmer_positions(kmer, positions); - for (auto& k : positions) { - for (auto& p : k.second) { - cout << k.first << "\t" << p.first << "\t" << p.second << endl; - } + + // Configure GCSA2 verbosity so it doesn't spit out loads of extra info + gcsa::Verbosity::set(gcsa::Verbosity::SILENT); + + // Open it + auto gcsa_index = vg::io::VPKG::load_one(gcsa_in); + // default LCP is the gcsa base name +.lcp + auto lcp_index = vg::io::VPKG::load_one(gcsa_in + ".lcp"); + + //range_type find(const char* pattern, size_type length) const; + //void locate(size_type path, std::vector& results, bool append = false, bool sort = true) const; + //locate(i, results); + if (!get_mems) { + auto paths = gcsa_index->find(sequence.c_str(), sequence.length()); + //cerr << paths.first << " - " << paths.second << endl; + for (gcsa::size_type i = paths.first; i <= paths.second; ++i) { + std::vector ids; + gcsa_index->locate(i, ids); + for (auto id : ids) { + cout << gcsa::Node::decode(id) << endl; } } } else { - vector graphs; - for (auto& kmer : kmers) { - VG g; - vindex->get_kmer_subgraph(kmer, g); - if (context_size > 0) { - vindex->expand_context(g, context_size); - } - graphs.push_back(g); - } - - VG result_graph; - for (auto& graph : graphs) { - // Allow duplicate nodes and edges (from multiple kmers); silently collapse them. - result_graph.extend(graph); - } - result_graph.remove_orphan_edges(); - result_graph.serialize_to_ostream(cout); + // for mems we need to load up the gcsa and lcp structures into the mapper + Mapper mapper(xindex, gcsa_index.get(), lcp_index.get()); + mapper.fast_reseed = use_fast_reseed; + // get the mems + double lcp_avg, fraction_filtered; + auto mems = mapper.find_mems_deep(sequence.begin(), sequence.end(), lcp_avg, fraction_filtered, max_mem_length, min_mem_length, mem_reseed_length); + + // dump them to stdout + cout << mems_to_json(mems) << endl; + } } @@ -954,4 +861,4 @@ int main_find(int argc, char** argv) { } -static Subcommand vg_msga("find", "use an index to find nodes, edges, kmers, paths, or positions", TOOLKIT, main_find); +static Subcommand vg_msga("find", "use an index to find nodes, edges, kmers, paths, or positions", DEVELOPMENT, main_find); diff --git a/src/subcommand/gamcompare_main.cpp b/src/subcommand/gamcompare_main.cpp index 848972adb8e..7cd27270513 100644 --- a/src/subcommand/gamcompare_main.cpp +++ b/src/subcommand/gamcompare_main.cpp @@ -8,11 +8,13 @@ #include #include -#include +#include "subcommand.hpp" #include "../alignment.hpp" +#include "../snarl_distance_index.hpp" #include "../vg.hpp" -#include "../stream.hpp" +#include +#include using namespace std; using namespace vg; @@ -22,10 +24,63 @@ void help_gamcompare(char** argv) { cerr << "usage: " << argv[0] << " gamcompare aln.gam truth.gam >output.gam" << endl << endl << "options:" << endl - << " -r, --range N distance within which to consider reads correct" << endl - << " -T, --tsv output TSV (correct, mq, aligner, read) comaptible with plot-qq.R instead of GAM" << endl - << " -a, --aligner aligner name for TSV output [\"vg\"]" << endl - << " -t, --threads N number of threads to use" << endl; + << " -d, --distance-index 
FILE use distances from this distance index instead of path position annotations" << endl + << " -r, --range N distance within which to consider reads correct" << endl + << " -n, --rename Q=T interpret the given query contig name as the given truth contig (may repeat)" << endl + << " -T, --tsv output TSV (correct, mq, aligner, read) compatible with plot-qq.R instead of GAM" << endl + << " -a, --aligner aligner name for TSV output [\"vg\"]" << endl + << " -s, --score-alignment get a correctness score of the alignment (higher is better)" << endl + << " -t, --threads N number of threads to use" << endl; +} + +// A gapless alignment between a read and a single node. +struct MappingRun { + pos_t start; // Starting position in the graph. + size_t read_offset; // Starting position in the read. + size_t length; // Length of the alignment. + + size_t limit() const { + return this->read_offset + this->length; + } + + // Get the graph position at read offset `offset >= this->read_offset`. + pos_t pos_at(size_t offset) const { + pos_t result = this->start; + get_offset(result) += offset - this->read_offset; + return result; + } +}; + +// Returns the maximal MappingRuns for the alignment. +std::vector base_mappings(const Alignment& aln) { + std:vector result; + size_t read_offset = 0; + const Path& path = aln.path(); + for (size_t i = 0; i < path.mapping_size(); i++) { + const Mapping& mapping = path.mapping(i); + pos_t start = make_pos_t(mapping.position()); + size_t length = 0; // Number of consecutive matches/mismatches. + for (size_t j = 0; j < mapping.edit_size(); j++) { + const Edit& edit = mapping.edit(j); + if (edit.from_length() == edit.to_length()) { + length += edit.to_length(); + } else { + if (length > 0) { + result.push_back({ start, read_offset, length }); + get_offset(start) += length; + read_offset += length; + length = 0; + } + get_offset(start) += edit.from_length(); + read_offset += edit.to_length(); + } + } + if (length > 0) { + result.push_back({ start, read_offset, length }); + read_offset += length; + } + } + return result; } int main_gamcompare(int argc, char** argv) { @@ -39,6 +94,10 @@ int main_gamcompare(int argc, char** argv) { int64_t range = -1; bool output_tsv = false; string aligner_name = "vg"; + bool score_alignment = false; + string distance_name; + // Map from query contigs to corresponding truth contigs + std::unordered_map renames; int c; optind = 2; @@ -46,15 +105,18 @@ int main_gamcompare(int argc, char** argv) { static struct option long_options[] = { {"help", no_argument, 0, 'h'}, + {"distance-index", required_argument, 0, 'd'}, {"range", required_argument, 0, 'r'}, + {"rename", required_argument, 0, 'n'}, {"tsv", no_argument, 0, 'T'}, {"aligner", required_argument, 0, 'a'}, + {"score-alignment", no_argument, 0, 's'}, {"threads", required_argument, 0, 't'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hr:Ta:t:", + c = getopt_long (argc, argv, "hd:r:n:Ta:st:", long_options, &option_index); // Detect the end of the options. 
@@ -67,6 +129,27 @@ int main_gamcompare(int argc, char** argv) { range = parse(optarg); break; + case 'n': + { + // Parse the rename old=new + string key_value(optarg); + auto found = key_value.find('='); + if (found == string::npos || found == 0 || found + 1 == key_value.size()) { + cerr << "error:[vg gamcompare] could not parse rename " << key_value << endl; + exit(1); + } + // Parse out the two parts + string query_contig = key_value.substr(0, found); + string truth_contig = key_value.substr(found + 1); + // Add the name mapping + renames.emplace(query_contig, truth_contig); + } + break; + + case 'd': + distance_name = optarg; + break; + case 'T': output_tsv = true; break; @@ -75,6 +158,10 @@ int main_gamcompare(int argc, char** argv) { aligner_name = optarg; break; + case 's': + score_alignment = true; + break; + case 't': threads = parse(optarg); omp_set_num_threads(threads); @@ -91,90 +178,263 @@ int main_gamcompare(int argc, char** argv) { } } + // We need to read the second argument first, so we can't use get_input_file with its free error checking. string test_file_name = get_input_file_name(optind, argc, argv); string truth_file_name = get_input_file_name(optind, argc, argv); - // We will collect all the truth positions - string_hash_map > > > true_positions; - function record_truth = [&true_positions](Alignment& aln) { + // True path positions. For each alignment name, store a mapping from reference path names + // to sets of (sequence offset, is_reverse). There is usually either one position per + // alignment or one position per node. + vg::string_hash_map > > > true_path_positions; + function record_path_positions = [&true_path_positions](Alignment& aln) { auto val = alignment_refpos_to_path_offsets(aln); #pragma omp critical (truth_table) - true_positions[aln.name()] = val; + true_path_positions[aln.name()] = val; }; + + // True graph positions. For each alignment name, we find the maximal read intervals that correspond + // to a gapless alignment between the read and a single node. + vg::string_hash_map> true_graph_positions; + function record_graph_positions = [&true_graph_positions](Alignment& aln) { + if (aln.path().mapping_size() > 0) { +#pragma omp critical (truth_table) + true_graph_positions[aln.name()] = base_mappings(aln); + } + }; + if (truth_file_name == "-") { - assert(test_file_name != "-"); - stream::for_each_parallel(std::cin, record_truth); + // Read truth fropm standard input, if it looks good. + if (test_file_name == "-") { + cerr << "error[vg gamcompare]: Standard input can only be used for truth or test file, not both" << endl; + exit(1); + } + if (!std::cin) { + cerr << "error[vg gamcompare]: Unable to read standard input when looking for true reads" << endl; + exit(1); + } + if (distance_name.empty()) { + vg::io::for_each_parallel(std::cin, record_path_positions); + } else { + vg::io::for_each_parallel(std::cin, record_graph_positions); + } } else { + // Read truth from this file, if it looks good. 
ifstream truth_file_in(truth_file_name); - stream::for_each_parallel(truth_file_in, record_truth); + if (!truth_file_in) { + cerr << "error[vg gamcompare]: Unable to read " << truth_file_name << " when looking for true reads" << endl; + exit(1); + } + if (distance_name.empty()) { + vg::io::for_each_parallel(truth_file_in, record_path_positions); + } else { + vg::io::for_each_parallel(truth_file_in, record_graph_positions); + } + } + if (score_alignment && range == -1) { + cerr << "error[vg gamcompare]: Score-alignment requires range" << endl; + exit(1); + } + + // Load the distance index. + unique_ptr distance_index; + if (!distance_name.empty()) { + distance_index = vg::io::VPKG::load_one(distance_name); } - // We have a buffer for annotated alignments - vector buf; + // We have a buffered emitter for annotated alignments, if we're not outputting text + std::unique_ptr> emitter; + if (!output_tsv) { + emitter = std::unique_ptr>(new vg::io::ProtobufEmitter(cout)); + } + + // We have an ordinary buffer we use for text output + vector text_buffer; - // We have an output function to dump all the reads in the buffer in the correct format - auto flush_buffer = [&buf,&output_tsv,&aligner_name]() { + // We have an output function to dump all the reads in the text buffer in TSV + auto flush_text_buffer = [&text_buffer,&output_tsv,&aligner_name]() { // We print exactly one header line. static bool header_printed = false; - if (output_tsv) { - // Output TSV to standard out in the format plot-qq.R needs. - if (!header_printed) { - // It needs a header - cout << "correct\tmq\taligner\tread" << endl; - header_printed = true; - } - - for (auto& aln : buf) { - // Dump each alignment - cout << (aln.correctly_mapped() ? "1" : "0") << "\t"; - cout << aln.mapping_quality() << "\t"; - cout << aligner_name << "\t"; - cout << aln.name() << endl; - } - - } else { - // Output by serializing Alignment objects - write_alignments(cout, buf); + // Output TSV to standard out in the format plot-qq.R needs. + if (!header_printed) { + // It needs a header + cout << "correct\tmq\taligner\tread" << endl; + header_printed = true; } - buf.clear(); + for (auto& aln : text_buffer) { + // Dump each alignment + cout << (aln.correctly_mapped() ? "1" : "0") << "\t"; + cout << aln.mapping_quality() << "\t"; + cout << aligner_name << "\t"; + cout << aln.name() << endl; + } + text_buffer.clear(); }; - + + // We want to count correct reads + vector correct_counts(vg::get_thread_count(), 0); + + //Get stats for calculating the score + vector read_count_by_thread (vg::get_thread_count(), 0); + vector> mapq_count_by_thread (vg::get_thread_count()); + vector> correct_count_by_mapq_by_thread(vg::get_thread_count()); + for (size_t i = 0 ; i < vg::get_thread_count() ; i++) { + mapq_count_by_thread[i].resize(61, 0); + correct_count_by_mapq_by_thread[i].resize(61,0); + } + // This function annotates every read with distance and correctness, and batch-outputs them. - function annotate_test = [&buf,&flush_buffer,&true_positions,&range](Alignment& aln) { - auto f = true_positions.find(aln.name()); - if (f != true_positions.end()) { - auto& true_position = f->second; - alignment_set_distance_to_correct(aln, true_position); - - if (range != -1) { - // We are flagging reads correct/incorrect. - // It is correct if there is a path for its minimum distance and it is in range on that path. 
- aln.set_correctly_mapped(aln.to_correct().name() != "" && aln.to_correct().offset() <= range); + function annotate_test = [&](Alignment& aln) { + bool found = false; + if (distance_name.empty()) { + //If the distance index isn't used + auto iter = true_path_positions.find(aln.name()); + if (iter != true_path_positions.end()) { + alignment_set_distance_to_correct(aln, iter->second, &renames); + found = true; + } + } else { + //If the distance index gets used + auto iter = true_graph_positions.find(aln.name()); + if (iter != true_graph_positions.end() && aln.path().mapping_size() > 0) { + std::vector read_mappings = base_mappings(aln); + int64_t distance = std::numeric_limits::max(); + auto read_iter = read_mappings.begin(); + auto truth_iter = iter->second.begin(); + // Break the read into maximal intervals such that each interval corresponds + // to a gapless alignment between the read and a single node both in the true + // alignment and the candidate alignment. Compute the distance for each + // interval and use the minimum distance over all intervals. + while (read_iter != read_mappings.end() && truth_iter != iter->second.end()) { + size_t start = std::max(read_iter->read_offset, truth_iter->read_offset); + size_t limit = std::min(read_iter->limit(), truth_iter->limit()); + if (start < limit) { + pos_t read_pos = read_iter->pos_at(start); + pos_t truth_pos = truth_iter->pos_at(start); + size_t forward = minimum_distance(*distance_index, read_pos, truth_pos); + if (forward != std::numeric_limits::max()) { + distance = std::min((int64_t)forward, distance); + } + size_t reverse = minimum_distance(*distance_index, truth_pos, read_pos); + if (reverse != std::numeric_limits::max()) { + distance = std::min((int64_t)reverse, distance); + } + } + if (read_iter->limit() <= limit) { + ++read_iter; + } + if (truth_iter->limit() <= limit) { + ++truth_iter; + } + } + Position result; + result.set_name("graph"); + result.set_offset(distance); + *aln.mutable_to_correct() = result; + found = true; } + } + if (found && range != -1) { + // We are flagging reads correct/incorrect. + // It is correct if there is a path for its minimum distance and it is in range on that path. 
+ bool correctly_mapped = (aln.to_correct().name() != "" && aln.to_correct().offset() <= range); + + // Annotate it as such + aln.set_correctly_mapped(correctly_mapped); + if (correctly_mapped) { + correct_counts.at(omp_get_thread_num()) += 1; + } + auto mapq = aln.mapping_quality(); + if (mapq) { + if (mapq >= mapq_count_by_thread.at(omp_get_thread_num()).size()) { + mapq_count_by_thread.at(omp_get_thread_num()).resize(mapq+1, 0); + correct_count_by_mapq_by_thread.at(omp_get_thread_num()).resize(mapq+1, 0); + } + + read_count_by_thread.at(omp_get_thread_num()) += 1; + mapq_count_by_thread.at(omp_get_thread_num()).at(mapq) += 1; + if (correctly_mapped) { + correct_count_by_mapq_by_thread.at(omp_get_thread_num()).at(mapq) += 1; + } + } } -#pragma omp critical (buf) +#pragma omp critical { - buf.push_back(aln); - if (buf.size() > 1000) { - flush_buffer(); + if (output_tsv) { + text_buffer.emplace_back(std::move(aln)); + if (text_buffer.size() > 1000) { + flush_text_buffer(); + } + } else { + emitter->write(std::move(aln)); } } }; if (test_file_name == "-") { - assert(truth_file_name != "-"); - stream::for_each_parallel(std::cin, annotate_test); + if (!std::cin) { + cerr << "error[vg gamcompare]: Unable to read standard input when looking for reads under test" << endl; + exit(1); + } + vg::io::for_each_parallel(std::cin, annotate_test); } else { ifstream test_file_in(test_file_name); - stream::for_each_parallel(test_file_in, annotate_test); + if (!test_file_in) { + cerr << "error[vg gamcompare]: Unable to read " << test_file_name << " when looking for reads under test" << endl; + exit(1); + } + vg::io::for_each_parallel(test_file_in, annotate_test); + } + + if (output_tsv) { + // Save whatever's in the buffer at the end. + flush_text_buffer(); + } + + + if (range != -1) { + // We are flagging reads correct/incorrect. So report the total correct. + size_t total_correct = 0; + for (auto& count : correct_counts) { + total_correct += count; + } + + cerr << total_correct << " reads correct" << endl; } - // Save whatever's in the buffer at the end. - flush_buffer(); - cout.flush(); + if (score_alignment) { + //Get a goodness score of the alignment that takes into account correctness and mapq calibration + size_t total_reads = 0; + vector mapq_count (61, 0); + vector correct_count_by_mapq (61, 0); + for (size_t i = 0 ; i < vg::get_thread_count() ; i++) { + total_reads += read_count_by_thread.at(i); + for (size_t mq = 0 ; mq < mapq_count_by_thread.at(i).size() ; mq++) { + if (mq >= mapq_count.size()) { + mapq_count.resize(mq+1, 0); + correct_count_by_mapq.resize(mq+1, 0); + } + mapq_count.at(mq) += mapq_count_by_thread.at(i).at(mq); + correct_count_by_mapq.at(mq) += correct_count_by_mapq_by_thread.at(i).at(mq); + } + } + size_t accumulated_count = 0; + size_t accumulated_correct_count = 0; + float mapping_goodness_score = 0.0; + for (int i = mapq_count.size()-1 ; i >= 0 ; i--) { + accumulated_count += mapq_count[i]; + accumulated_correct_count += correct_count_by_mapq[i]; + double fraction_incorrect = accumulated_count == 0 ? 0.0 : + (float) (accumulated_count - accumulated_correct_count) / (float) accumulated_count; + fraction_incorrect = fraction_incorrect == 0.0 ? 
1.0/ (float) total_reads : fraction_incorrect; + mapping_goodness_score -= log10(fraction_incorrect) * mapq_count[i]; + } + cerr << "mapping goodness score: " << mapping_goodness_score / total_reads << endl; + + } + return 0; } diff --git a/src/subcommand/gampcompare_main.cpp b/src/subcommand/gampcompare_main.cpp new file mode 100644 index 00000000000..fd8b5b4ec3e --- /dev/null +++ b/src/subcommand/gampcompare_main.cpp @@ -0,0 +1,302 @@ +// gampcompare_main.cpp: defines a GAMP to GAM annotation function + +#include +#include +#include + +#include +#include +#include + +#include + +#include "../algorithms/alignment_path_offsets.hpp" +#include "../multipath_alignment.hpp" +#include "../alignment.hpp" +#include "../vg.hpp" +#include +#include +#include + +using namespace std; +using namespace vg; +using namespace vg::subcommand; + +void help_gampcompare(char** argv) { + cerr << "usage: " << argv[0] << " gampcompare [options] alngraph.xg aln.gamp truth.gam > output.tsv" << endl + << endl + << "options:" << endl + << " -G, --gam alignments are in GAM format rather than GAMP" << endl + << " -r, --range N distance within which to consider reads correct [100]" << endl + << " -a, --aligner STR aligner name for TSV output [\"vg\"]" << endl + << " -d, --distance report minimum distance along a path rather than correctness" << endl + << " -t, --threads N number of threads to use [1]" << endl; +} + +int main_gampcompare(int argc, char** argv) { + + if (argc == 2) { + help_gampcompare(argv); + exit(1); + } + + int threads = 1; + int64_t range = 100; + string aligner_name = "vg"; + int buffer_size = 10000; + bool gam_input = false; + bool report_distance = false; + + int c; + optind = 2; + while (true) { + static struct option long_options[] = + { + {"help", no_argument, 0, 'h'}, + {"range", required_argument, 0, 'r'}, + {"gam", no_argument, 0, 'G'}, + {"aligner", required_argument, 0, 'a'}, + {"distance", required_argument, 0, 'd'}, + {"threads", required_argument, 0, 't'}, + {0, 0, 0, 0} + }; + + int option_index = 0; + c = getopt_long (argc, argv, "hr:a:t:Gd", + long_options, &option_index); + + // Detect the end of the options. + if (c == -1) break; + + switch (c) + { + + case 'r': + range = parse(optarg); + break; + + case 'a': + aligner_name = optarg; + break; + + case 'd': + report_distance = true; + break; + + case 't': + threads = parse(optarg); + omp_set_num_threads(threads); + break; + + case 'G': + gam_input = true; + break; + + case 'h': + case '?': + help_gampcompare(argv); + exit(1); + break; + + default: + abort (); + } + } + + // We need to read the second argument first, so we can't use get_input_file with its free error checking. 
+ string graph_file_name = get_input_file_name(optind, argc, argv); + string test_file_name = get_input_file_name(optind, argc, argv); + string truth_file_name = get_input_file_name(optind, argc, argv); + + if ((truth_file_name == "-") + (test_file_name == "-") + (graph_file_name == "-") > 1) { + cerr << "error[vg gampcompare]: Standard input can only be used for one input file" << endl; + exit(1); + } + + // Load the graph we mapped to + unique_ptr path_handle_graph; + if (graph_file_name == "-") { + path_handle_graph = vg::io::VPKG::load_one(std::cin); + } + else { + ifstream graph_stream(graph_file_name); + if (!graph_stream) { + cerr << "error:[vg mpmap] Cannot open graph file " << graph_file_name << endl; + exit(1); + } + path_handle_graph = vg::io::VPKG::load_one(graph_stream); + } + + bdsg::PathPositionOverlayHelper overlay_helper; + PathPositionHandleGraph* path_position_handle_graph = overlay_helper.apply(path_handle_graph.get()); + + // We will collect all the truth positions + string_hash_map > > > true_positions; + function record_truth = [&true_positions](Alignment& aln) { + auto val = alignment_refpos_to_path_offsets(aln); +#pragma omp critical (truth_table) + true_positions[move(*aln.mutable_name())] = move(val); + }; + + if (truth_file_name == "-") { + // Read truth fropm standard input, if it looks good. + if (!std::cin) { + cerr << "error[vg gampcompare]: Unable to read standard input when looking for true reads" << endl; + exit(1); + } + vg::io::for_each_parallel(std::cin, record_truth); + } + else { + // Read truth from this file, if it looks good. + ifstream truth_file_in(truth_file_name); + if (!truth_file_in) { + cerr << "error[vg gampcompare]: Unable to read " << truth_file_name << " when looking for true reads" << endl; + exit(1); + } + vg::io::for_each_parallel(truth_file_in, record_truth); + } + + // A buffer we use for the TSV output + vector>> buffers(get_thread_count()); + + // We have an output function to dump all the reads in the text buffer in TSV + auto flush_buffer = [&](vector>& buffer) { + // We print exactly one header line. + static bool header_printed = false; + // Output TSV to standard out in the format plot-qq.R needs. + if (!header_printed) { + // It needs a header + if (report_distance) { + cout << "distance"; + } + else { + cout << "correct"; + } + cout << "\tmapped\tmq\tgroupmq\taligner\tread" << endl; + header_printed = true; + } + for (auto& result : buffer) { + // Dump each alignment + if (report_distance) { + cout << get<0>(result); + } + else { + cout << (get<0>(result) <= range); + } + cout << '\t' << get<1>(result) << '\t' << get<2>(result) << '\t' << get<3>(result) << '\t' << aligner_name << '\t' << get<4>(result) << endl; + } + buffer.clear(); + }; + + // We want to count correct reads + vector correct_counts(get_thread_count(), 0); + + // This function annotates every read with distance and correctness, and batch-outputs them. 
+ function evaluate_correctness = [&](MultipathAlignment& proto_mp_aln) { + + // check the multipath mapping for correctness + int64_t abs_dist = numeric_limits::max(); + auto f = true_positions.find(proto_mp_aln.name()); + if (f != true_positions.end()) { + + multipath_alignment_t mp_aln; + from_proto_multipath_alignment(proto_mp_aln, mp_aln); + + auto& true_positions = f->second; + auto mapped_positions = algorithms::multipath_alignment_path_offsets(*path_position_handle_graph, + mp_aln); + for (auto it = true_positions.begin(); it != true_positions.end(); ++it) { + // TODO: it really should be possible to do this with only path handles instead of names + auto path_handle = path_position_handle_graph->get_path_handle(it->first); + if (mapped_positions.count(path_handle)) { + // the true and mapped positions share this path + auto& path_true_positions = it->second; + auto& path_mapped_positions = mapped_positions[path_handle]; + // check all pairs of positions + for (size_t i = 0; i < path_true_positions.size(); ++i) { + for (size_t j = 0; j < path_mapped_positions.size(); ++j) { + if (path_true_positions[i].second == path_mapped_positions[j].second) { + // there is a pair of positions on the same strand of the same path + abs_dist = min(abs_dist, + abs(path_true_positions[i].first - path_mapped_positions[j].first)); + } + } + } + } + } + } + + if (abs_dist <= range) { + correct_counts[omp_get_thread_num()]++; + } + + // group mapq defaults to regular mapq + int64_t group_mapq = proto_mp_aln.mapping_quality(); + if (has_annotation(proto_mp_aln, "group_mapq")) { + group_mapq = get_annotation(proto_mp_aln, "group_mapq"); + } + + // put the result on the IO buffer + auto& buffer = buffers[omp_get_thread_num()]; + buffer.emplace_back(abs_dist, proto_mp_aln.subpath_size() > 0, proto_mp_aln.mapping_quality(), group_mapq, move(*proto_mp_aln.mutable_name())); + if (buffer.size() > buffer_size) { +#pragma omp critical + flush_buffer(buffer); + } + }; + + function evaluate_gam_correctness = [&](Alignment& aln) { + // TODO: kinda ugly, but whatever + multipath_alignment_t mp_aln; + to_multipath_alignment(aln, mp_aln); + MultipathAlignment proto_mp_aln; + to_proto_multipath_alignment(mp_aln, proto_mp_aln); + // we need the name to survive transit in multipath_alignment_t + proto_mp_aln.set_name(aln.name()); + // now use the same evaluation function + evaluate_correctness(proto_mp_aln); + }; + + if (test_file_name == "-") { + if (!std::cin) { + cerr << "error[vg gampcompare]: Unable to read standard input when looking for mapped reads" << endl; + exit(1); + } + if (gam_input) { + vg::io::for_each_parallel(std::cin, evaluate_gam_correctness); + } + else { + vg::io::for_each_parallel(std::cin, evaluate_correctness); + } + } else { + ifstream test_file_in(test_file_name); + if (!test_file_in) { + cerr << "error[vg gampcompare]: Unable to read " << test_file_name << " when looking for mapped reads" << endl; + exit(1); + } + if (gam_input) { + vg::io::for_each_parallel(test_file_in, evaluate_gam_correctness); + } + else { + vg::io::for_each_parallel(test_file_in, evaluate_correctness); + } + } + + // Empty out whatever's in the buffers at the end. + for (auto& buffer : buffers) { + flush_buffer(buffer); + } + + // We are flagging reads correct/incorrect. So report the total correct. 
+ size_t total_correct = 0; + for (auto& count : correct_counts) { + total_correct += count; + } + + cerr << total_correct << " reads correct" << endl; + + return 0; +} + +// Register subcommand +static Subcommand vg_gampcompare("gampcompare", "compare multipath alignment positions", main_gampcompare); diff --git a/src/subcommand/gamsort_main.cpp b/src/subcommand/gamsort_main.cpp index 32ed97a4ba7..2275cbd41ea 100644 --- a/src/subcommand/gamsort_main.cpp +++ b/src/subcommand/gamsort_main.cpp @@ -1,9 +1,8 @@ -#include "../gamsorter.hpp" -#include "../gam_index.hpp" -#include "../stream.hpp" +#include "../stream_sorter.hpp" +#include +#include "../stream_index.hpp" #include #include "subcommand.hpp" -#include "../index.hpp" /** * GAM sort main @@ -17,11 +16,8 @@ void help_gamsort(char **argv) cerr << "gamsort: sort a GAM file, or index a sorted GAM file" << endl << "Usage: " << argv[1] << " [Options] gamfile" << endl << "Options:" << endl - << " -s / --sorted Input GAM is already sorted." << endl << " -i / --index FILE produce an index of the sorted GAM file" << endl << " -d / --dumb-sort use naive sorting algorithm (no tmp files, faster for small GAMs)" << endl - << " -r / --rocks DIR Just use the old RocksDB-style indexing scheme for sorting, using the given database name." << endl - << " -a / --aln-index Create the old RocksDB-style node-to-alignment index." << endl << " -p / --progress Show progress." << endl << " -t / --threads Use the specified number of threads." << endl << endl; @@ -30,10 +26,7 @@ void help_gamsort(char **argv) int main_gamsort(int argc, char **argv) { string index_filename; - string rocksdb_filename; - bool dumb_sort = false; - bool is_sorted = false; - bool do_aln_index = false; + bool easy_sort = false; bool show_progress = false; // We limit the max threads, and only allow thread count to be lowered, to // prevent tcmalloc from giving each thread a very large heap for many @@ -49,13 +42,11 @@ int main_gamsort(int argc, char **argv) {"index", required_argument, 0, 'i'}, {"dumb-sort", no_argument, 0, 'd'}, {"rocks", required_argument, 0, 'r'}, - {"aln-index", no_argument, 0, 'a'}, - {"is-sorted", no_argument, 0, 's'}, {"progress", no_argument, 0, 'p'}, {"threads", required_argument, 0, 't'}, {0, 0, 0, 0}}; int option_index = 0; - c = getopt_long(argc, argv, "i:dhr:aspt:", + c = getopt_long(argc, argv, "i:dhpt:", long_options, &option_index); // Detect the end of the options. @@ -68,16 +59,7 @@ int main_gamsort(int argc, char **argv) index_filename = optarg; break; case 'd': - dumb_sort = true; - break; - case 's': - is_sorted = true; - break; - case 'r': - rocksdb_filename = optarg; - break; - case 'a': - do_aln_index = true; + easy_sort = true; break; case 'p': show_progress = true; @@ -105,73 +87,26 @@ int main_gamsort(int argc, char **argv) GAMSorter gs(show_progress); - if (!rocksdb_filename.empty()) { - // Do the sort the old way - write a big ol' - // RocksDB index of alignments, then dump them - // from that DB. Loses unmapped reads. 
- Index rocks; - - unique_ptr index; - if (!index_filename.empty()) { - // Make a new-style GAM index also - index = unique_ptr(new GAMIndex()); - } - - // Index the alignments in RocksDB - rocks.open_for_bulk_load(rocksdb_filename); - int64_t aln_idx = 0; - function lambda_reader = [&rocks](Alignment& aln) { - rocks.put_alignment(aln); - }; - stream::for_each_parallel(gam_in, lambda_reader); - - // Set up the emitter - stream::ProtobufEmitter output(cout); - if (index.get() != nullptr) { - output.on_group([&index](const vector& group, int64_t start_vo, int64_t past_end_vo) { - // If we are making a sorted GAM index, record the group. - // The index will outlive the emitter so this is safe to call in the emitter's destructor. - index->add_group(group, start_vo, past_end_vo); - }); - } - - // Print them out again in order - auto lambda_writer = [&output](const Alignment& aln) { - output.write_copy(aln); - }; - rocks.for_each_alignment(lambda_writer); - - rocks.flush(); - rocks.close(); - - if (index.get() != nullptr) { - // Save the index - ofstream index_out(index_filename); - index->save(index_out); - } - + // Do a normal GAMSorter sort + unique_ptr index; + + if (!index_filename.empty()) { + // Make an index + index = unique_ptr(new GAMIndex()); + } + + if (easy_sort) { + // Sort in a single pass in memory + gs.easy_sort(gam_in, cout, index.get()); } else { - // Do a normal GAMSorter sort - unique_ptr index; - - if (!index_filename.empty()) { - // Make an index - index = unique_ptr(new GAMIndex()); - } - - if (dumb_sort) { - // Sort in a single pass in memory - gs.dumb_sort(gam_in, cout, index.get()); - } else { - // Sort using fan-in-limited temp file merging - gs.stream_sort(gam_in, cout, index.get()); - } - - if (index.get() != nullptr) { - // Save the index - ofstream index_out(index_filename); - index->save(index_out); - } + // Sort using fan-in-limited temp file merging + gs.stream_sort(gam_in, cout, index.get()); + } + + if (index.get() != nullptr) { + // Save the index + ofstream index_out(index_filename); + index->save(index_out); } }); diff --git a/src/subcommand/gbwt_main.cpp b/src/subcommand/gbwt_main.cpp index 284a71e02a9..2977fb7f14d 100644 --- a/src/subcommand/gbwt_main.cpp +++ b/src/subcommand/gbwt_main.cpp @@ -1,79 +1,509 @@ /** \file gbwt_main.cpp * - * Defines the "vg gbwt" subcommand, which wraps up access for commands we'd otherwise find - * in the gbwt submodule. */ + * Defines the "vg gbwt" subcommand for building, merging, and manipulating GBWT indexes + * and GBWTGraphs. + */ #include #include #include -#include -#include +#include #include "subcommand.hpp" +#include "../gbwt_helper.hpp" +#include "../gbwtgraph_helper.hpp" +#include "../haplotype_indexer.hpp" +#include "../path.hpp" +#include "../region.hpp" +#include "../algorithms/find_translation.hpp" -#include "../xg.hpp" +#include + +#include +#include +#include -using namespace std; using namespace vg; -using namespace vg::subcommand; -#include +struct GBWTConfig { + // Build mode also defines the type of input args. + enum build_mode { build_none, build_vcf, build_gfa, build_paths, build_alignments, build_gbz, build_gbwtgraph }; + enum merge_mode { merge_none, merge_insert, merge_fast, merge_parallel }; + enum path_cover_mode { path_cover_none, path_cover_augment, path_cover_local, path_cover_greedy }; + + // Requirements and modes. + bool produces_one_gbwt = false; // Steps 1-4 eventually produce one input GBWT regardless of the number of input args. 
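// [Editor's note, illustrative only, not part of this changeset] The three mode enums above,
// together with the output-related fields declared further down, decide which of the numbered
// steps main_gbwt() (defined below) actually runs. A minimal sketch of that dispatch, using
// the field names from this struct (the helper function itself is hypothetical):
//
//   std::vector<int> steps_to_run(const GBWTConfig& c) {
//       std::vector<int> s;
//       if (c.build != GBWTConfig::build_none) s.push_back(1);              // build input GBWTs
//       if (c.merge != GBWTConfig::merge_none) s.push_back(2);              // merge them
//       if (!c.to_remove.empty() || !c.tags_to_set.empty()) s.push_back(3); // alter the GBWT
//       if (c.path_cover != GBWTConfig::path_cover_none) s.push_back(4);    // path cover
//       if (!c.graph_output.empty()) s.push_back(5);                        // GBWTGraph
//       if (!c.r_index_name.empty()) s.push_back(6);                        // r-index
//       if (c.metadata_mode) s.push_back(7);                                // metadata
//       if (c.thread_mode) s.push_back(8);                                  // threads
//       return s;
//   }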
+ build_mode build = build_none; + merge_mode merge = merge_none; + path_cover_mode path_cover = path_cover_none; + bool metadata_mode = false, thread_mode = false; + + // Input GBWT construction. + HaplotypeIndexer haplotype_indexer; + bool gam_format = false, inputs_as_jobs = false, parse_only = false; + size_t build_jobs = default_build_jobs(); + + // GFA parsing. + gbwtgraph::GFAParsingParameters gfa_parameters = get_best_gbwtgraph_gfa_parsing_parameters(); + + // Parallel merging. + gbwt::MergeParameters merge_parameters; + + // GBWTGraph construction. + bool gbz_format = false; + + // Other parameters and flags. + bool show_progress = false; + bool count_threads = false; + bool metadata = false, contigs = false, haplotypes = false, samples = false, list_names = false, thread_names = false, tags = false; + bool include_named_paths = false; + size_t num_paths = default_num_paths(), context_length = default_context_length(); + bool num_paths_set = false; + size_t search_threads = omp_get_max_threads(); + + // Input data. + std::vector input_filenames; + std::string gbwt_name; // There is a single input GBWT to load. + std::string graph_name; + std::string gbwtgraph_name; + + // File names. + std::string gbwt_output; // Output GBWT. + std::string thread_output; // Threads in SDSL format. + std::string graph_output; // Output GBWTGraph. + std::string segment_translation; // Segment to node translation output. + std::string r_index_name; // Output r-index. + + // Sample names and metadata + std::set to_remove; // Sample names to remove. + std::map tags_to_set; // Tag changes to apply to the GBWT + + GBWTConfig() { + this->merge_parameters.setMergeJobs(default_merge_jobs()); + } + + static size_t default_build_jobs() { + return std::max(static_cast(1), static_cast(omp_get_max_threads() / 2)); + } + + static constexpr size_t default_num_paths() { + return gbwtgraph::PATH_COVER_DEFAULT_N; + } + + static constexpr size_t default_num_paths_local() { + return gbwtgraph::LOCAL_HAPLOTYPES_DEFAULT_N; + } + + static constexpr size_t default_context_length() { + return gbwtgraph::PATH_COVER_DEFAULT_K; + } + + static size_t default_merge_jobs() { + return std::min(static_cast(gbwt::MergeParameters::MERGE_JOBS), std::max(static_cast(1), static_cast(omp_get_max_threads() / 2))); + } +}; + +struct GraphHandler { + enum graph_type { graph_none, graph_path, graph_source, graph_gbz, graph_gbwtgraph }; + + std::unique_ptr path_graph = nullptr; + std::unique_ptr sequence_source = nullptr; + std::unique_ptr gbwt_graph = nullptr; + graph_type in_use = graph_none; + + // Load the `PathHandleGraph` specified in the config and release other graphs. + // No effect if the handler already contains a `PathHandleGraph`. + void get_graph(const GBWTConfig& config); + + // Take the ownership of the provided `SequenceSource` and store it in the handler. + // Releases other graphs. + void use(std::unique_ptr& source); + + // Load the GBZ specified in the config, store the GBWT in the GBWTHandler and the + // graph in this handler. + // NOTE: The graph will become invalid if the GBWT in the GBWTHandler changes. + void load_gbz(GBWTHandler& gbwts, GBWTConfig& config); + + // Load the GBWTGraph specified in the config, store the GBWT in the + // GBWTHandler and the graph in this handler. + // NOTE: The graph will become invalid if the GBWT in the GBWTHandler changes. 
+ void load_gbwtgraph(GBWTHandler& gbwts, GBWTConfig& config); -#include + void clear(); + // If the handler contains a `SequenceSource`, serialize it according to the config. + void serialize_segment_translation(const GBWTConfig& config) const; +}; + +//---------------------------------------------------------------------------- + +GBWTConfig parse_gbwt_config(int argc, char** argv); +void validate_gbwt_config(GBWTConfig& config); + +void step_1_build_gbwts(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& config); +void step_2_merge_gbwts(GBWTHandler& gbwts, GBWTConfig& config); +void step_3_alter_gbwt(GBWTHandler& gbwts, GBWTConfig& config); +void step_4_path_cover(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& config); +void step_5_gbwtgraph(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& config); +void step_6_r_index(GBWTHandler& gbwts, GBWTConfig& config); +void step_7_metadata(GBWTHandler& gbwts, GBWTConfig& config); +void step_8_threads(GBWTHandler& gbwts, GBWTConfig& config); + +void report_time_memory(const std::string& what, double start_time, const GBWTConfig& config); +void print_metadata(std::ostream& out, const GBWTHandler& gbwts); + +//---------------------------------------------------------------------------- + +int main_gbwt(int argc, char** argv) { + GBWTConfig config = parse_gbwt_config(argc, argv); + validate_gbwt_config(config); + + // Let GBWT operate silently. + gbwt::Verbosity::set(gbwt::Verbosity::SILENT); + + // This is the data we are using. + GBWTHandler gbwts; + gbwts.filename = config.gbwt_name; + gbwts.show_progress = config.show_progress; + GraphHandler graphs; + + // Input GBWT construction. + if (config.build != GBWTConfig::build_none) { + step_1_build_gbwts(gbwts, graphs, config); + } + + // Merge multiple input GBWTs. + if (config.merge != GBWTConfig::merge_none) { + step_2_merge_gbwts(gbwts, config); + } + + // Edit the GBWT (remove samples, apply tags). + if (!config.to_remove.empty() || !config.tags_to_set.empty()) { + step_3_alter_gbwt(gbwts, config); + } + + // Path cover construction. + if (config.path_cover != GBWTConfig::path_cover_none) { + step_4_path_cover(gbwts, graphs, config); + } + + // Now we can serialize the GBWT to a separate file. + if (!config.gbwt_output.empty() && !config.gbz_format) { + double start = gbwt::readTimer(); + gbwts.serialize(config.gbwt_output); + report_time_memory("GBWT serialized", start, config); + } + + // Serialize the segment translation if necessary. + if (!config.segment_translation.empty()) { + graphs.serialize_segment_translation(config); + } + + // GBWTGraph construction and serialization. + if (!config.graph_output.empty()) { + step_5_gbwtgraph(gbwts, graphs, config); + } + + // We no longer need the graph. + graphs.clear(); + + // R-index construction. + if (!config.r_index_name.empty()) { + step_6_r_index(gbwts, config); + } + + // Metadata options. + if (config.metadata_mode) { + step_7_metadata(gbwts, config); + } + + // Thread options. + if (config.thread_mode) { + step_8_threads(gbwts, config); + } + + return 0; +} + +//---------------------------------------------------------------------------- void help_gbwt(char** argv) { - cerr << "usage: " << argv[0] << " [options] [args]" << endl - << "Manipulate GBWTs." 
<< endl - << "merging:" << endl - << " -m, --merge merge the GBWT files from the input args and write to output" << endl - << " -o, --output X write output GBWT to X" << endl - << " -b, --batches N use batches of N sequences for merging (default: " << gbwt::DynamicGBWT::MERGE_BATCH_SIZE << ")" << endl - << " -f, --fast fast merging algorithm (node ids must not overlap; implies -m)" << endl - << " -p, --progress show progress and statistics" << endl - << "threads:" << endl - << " -c, --count-threads print the number of threads" << endl - << " -e, --extract FILE extract threads in SDSL format to FILE" << endl - << endl; -} - -int main_gbwt(int argc, char** argv) -{ + std::cerr << "usage: " << argv[0] << " gbwt [options] [args]" << std::endl; + std::cerr << std::endl; + std::cerr << "Manipulate GBWTs. Input GBWTs are loaded from input args or built in earlier steps." << std::endl; + std::cerr << "The input graph is provided with one of -x, -G, or -Z" << std::endl; + std::cerr << std::endl; + std::cerr << "General options:" << std::endl; + std::cerr << " -x, --xg-name FILE read the graph from FILE" << std::endl; + std::cerr << " -o, --output FILE write output GBWT to FILE" << std::endl; + std::cerr << " -d, --temp-dir DIR use directory DIR for temporary files" << std::endl; + std::cerr << " -p, --progress show progress and statistics" << std::endl; + std::cerr << std::endl; + std::cerr << "GBWT construction parameters (for steps 1 and 4):" << std::endl; + std::cerr << " --buffer-size N GBWT construction buffer size in millions of nodes (default " << (gbwt::DynamicGBWT::INSERT_BATCH_SIZE / gbwt::MILLION) << ")" << std::endl; + std::cerr << " --id-interval N store path ids at one out of N positions (default " << gbwt::DynamicGBWT::SAMPLE_INTERVAL << ")" << std::endl; + std::cerr << std::endl; + std::cerr << "Multithreading:" << std::endl; + std::cerr << " --num-jobs N use at most N parallel build jobs (for -v and -G; default " << GBWTConfig::default_build_jobs() << ")" << std::endl; + std::cerr << " --num-threads N use N parallel search threads (for -b and -r; default " << omp_get_max_threads() << ")" << std::endl; + std::cerr << std::endl; + std::cerr << "Step 1: GBWT construction (requires -o and one of { -v, -G, -Z, -E, A }):" << std::endl; + std::cerr << " -v, --vcf-input index the haplotypes in the VCF files specified in input args in parallel" << std::endl; + std::cerr << " (inputs must be over different contigs; requires -x, implies -f)" << std::endl; + std::cerr << " (does not store graph contigs in the GBWT)" << std::endl; + std::cerr << " --preset X use preset X (available: 1000gp)" << std::endl; + std::cerr << " --inputs-as-jobs create one build job for each input instead of using first-fit heuristic" << std::endl; + std::cerr << " --parse-only store the VCF parses without building GBWTs" << std::endl; + std::cerr << " (use -o for the file name prefix; skips subsequent steps)" << std::endl; + std::cerr << " --ignore-missing do not warn when variants are missing from the graph" << std::endl; + std::cerr << " --actual-phasing do not interpret unphased homozygous genotypes as phased" << std::endl; + std::cerr << " --force-phasing replace unphased genotypes with randomly phased ones" << std::endl; + std::cerr << " --discard-overlaps skip overlapping alternate alleles if the overlap cannot be resolved" << std::endl; + std::cerr << " instead of creating a phase break" << std::endl; + std::cerr << " --batch-size N index the haplotypes in batches of N samples (default 200)" << std::endl; // 
FIXME source for the default + std::cerr << " --sample-range X-Y index samples X to Y (inclusive, 0-based)" << std::endl; + std::cerr << " --rename V=P VCF contig V matches path P in the graph (may repeat)" << std::endl; + std::cerr << " --vcf-variants variants in the graph use VCF contig names instead of path names" << std::endl; + std::cerr << " --vcf-region C:X-Y restrict VCF contig C to coordinates X to Y (inclusive, 1-based; may repeat)" << std::endl; + std::cerr << " --exclude-sample X do not index the sample with name X (faster than -R; may repeat)" << std::endl; + std::cerr << " -G, --gfa-input index the walks or paths in the GFA file (one input arg)" << std::endl; + std::cerr << " --max-node N chop long segments into nodes of at most N bp (default " << gbwtgraph::MAX_NODE_LENGTH << ", use 0 to disable)" << std::endl; + std::cerr << " --path-regex X parse metadata as haplotypes from path names using regex X instead of vg-parser-compatible rules" << std::endl; + std::cerr << " --path-fields X parse metadata as haplotypes, mapping regex submatches to these fields instead of using vg-parser-compatible rules" << std::endl; + std::cerr << " --translation FILE write the segment to node translation table to FILE" << std::endl; + std::cerr << " -Z, --gbz-input extract GBWT and GBWTGraph from GBZ input (one input arg)" << std::endl; + std::cerr << " --translation FILE write the segment to node translation table to FILE" << std::endl; + std::cerr << " -I, --gg-in FILE load GBWTGraph from FILE and GBWT from input (one input arg) " << std::endl; + std::cerr << " -E, --index-paths index the embedded non-alt paths in the graph (requires -x, no input args)" << std::endl; + std::cerr << " -A, --alignment-input index the alignments in the GAF files specified in input args (requires -x)" << std::endl; + std::cerr << " --gam-format the input files are in GAM format instead of GAF format" << std::endl; + std::cerr << std::endl; + std::cerr << "Step 2: Merge multiple input GBWTs (requires -o):" << std::endl; + std::cerr << " -m, --merge use the insertion algorithm" << std::endl; + std::cerr << " -f, --fast fast merging algorithm (node ids must not overlap)" << std::endl; + std::cerr << " -b, --parallel use the parallel algorithm" << std::endl; + std::cerr << " --chunk-size N search in chunks of N sequences (default " << gbwt::MergeParameters::CHUNK_SIZE << ")" << std::endl; + std::cerr << " --pos-buffer N use N MiB position buffers for each search thread (default " << gbwt::MergeParameters::POS_BUFFER_SIZE << ")" << std::endl; + std::cerr << " --thread-buffer N use N MiB thread buffers for each search thread (default " << gbwt::MergeParameters::THREAD_BUFFER_SIZE << ")" << std::endl; + std::cerr << " --merge-buffers N merge 2^N thread buffers into one file per merge job (default " << gbwt::MergeParameters::MERGE_BUFFERS << ")" << std::endl; + std::cerr << " --merge-jobs N run N parallel merge jobs (default " << GBWTConfig::default_merge_jobs() << ")" << std::endl; + std::cerr << std::endl; + std::cerr << "Step 3: Alter GBWT (requires -o and one input GBWT):" << std::endl; + std::cerr << " -R, --remove-sample X remove the sample with name X from the index (may repeat)" << std::endl; + std::cerr << " --set-tag K=V set a GBWT tag (may repeat)" << std::endl; + std::cerr << std::endl; + std::cerr << "Step 4: Path cover GBWT construction (requires -o, -x, and one of { -a, -l, -P }):" << std::endl; + std::cerr << " -a, --augment-gbwt add a path cover of missing components (one input GBWT)" << std::endl; + 
std::cerr << " -l, --local-haplotypes sample local haplotypes (one input GBWT)" << std::endl; + std::cerr << " -P, --path-cover build a greedy path cover (no input GBWTs)" << std::endl; + std::cerr << " -n, --num-paths N find N paths per component (default " << GBWTConfig::default_num_paths_local() << " for -l, " << GBWTConfig::default_num_paths() << " otherwise)" << std::endl; + std::cerr << " -k, --context-length N use N-node contexts (default " << GBWTConfig::default_context_length() << ")" << std::endl; + std::cerr << " --pass-paths include named graph paths in local haplotype or greedy path cover GBWT" << std::endl; + std::cerr << std::endl; + std::cerr << "Step 5: GBWTGraph construction (requires an input graph and one input GBWT):" << std::endl; + std::cerr << " -g, --graph-name FILE build GBWTGraph and store it in FILE" << std::endl; + std::cerr << " --gbz-format serialize both GBWT and GBWTGraph in GBZ format (makes -o unnecessary)" << std::endl; + std::cerr << std::endl; + std::cerr << "Step 6: R-index construction (one input GBWT):" << std::endl; + std::cerr << " -r, --r-index FILE build an r-index and store it in FILE" << std::endl; + std::cerr << std::endl; + std::cerr << "Step 7: Metadata (one input GBWT):" << std::endl; + std::cerr << " -M, --metadata print basic metadata" << std::endl; + std::cerr << " -C, --contigs print the number of contigs" << std::endl; + std::cerr << " -H, --haplotypes print the number of haplotypes" << std::endl; + std::cerr << " -S, --samples print the number of samples" << std::endl; + std::cerr << " -L, --list-names list contig/sample names (use with -C or -S)" << std::endl; + std::cerr << " -T, --thread-names list thread names" << std::endl; + std::cerr << " --tags list GBWT tags" << std::endl; + std::cerr << std::endl; + std::cerr << "Step 8: Threads (one input GBWT):" << std::endl; + std::cerr << " -c, --count-threads print the number of threads" << std::endl; + std::cerr << " -e, --extract FILE extract threads in SDSL format to FILE" << std::endl; + std::cerr << std::endl; +} + +//---------------------------------------------------------------------------- + +void use_preset(std::string preset_name, GBWTConfig& config) { + for (char& c : preset_name) { + c = std::tolower(c); + } + if (preset_name == "1000gp") { + config.haplotype_indexer.gbwt_buffer_size = 200; + config.haplotype_indexer.samples_in_batch = 100; + config.haplotype_indexer.force_phasing = true; + config.haplotype_indexer.discard_overlaps = true; + } else { + std::cerr << "error: [vg gbwt] invalid preset: " << preset_name << std::endl; + std::exit(EXIT_FAILURE); + } +} + +void no_multiple_input_types(const GBWTConfig& config) { + if (config.build != GBWTConfig::build_none) { + std::cerr << "error: [vg gbwt] only one input type can be specified for step 1" << std::endl; + std::exit(EXIT_FAILURE); + } +} + +void no_multiple_cover_types(const GBWTConfig& config) { + if (config.path_cover != GBWTConfig::path_cover_none) + { + std::cerr << "error: [vg gbwt] only one path cover type can be specified for step 4" << std::endl; + std::exit(EXIT_FAILURE); + } +} + +GBWTConfig parse_gbwt_config(int argc, char** argv) { if (argc == 2) { help_gbwt(argv); - return 1; + std::exit(EXIT_FAILURE); } - bool merge = false; - size_t batch_size = gbwt::DynamicGBWT::MERGE_BATCH_SIZE; - bool fast_merging = false; - bool show_progress = false; - bool count_threads = false; - string gbwt_output, thread_output; + // Long options with no corresponding short options. 
+ constexpr int OPT_BUFFER_SIZE = 1000; + constexpr int OPT_ID_INTERVAL = 1001; + constexpr int OPT_NUM_JOBS = 1002; + constexpr int OPT_NUM_THREADS = 1003; + constexpr int OPT_PRESET = 1100; + constexpr int OPT_INPUTS_AS_JOBS = 1102; + constexpr int OPT_PARSE_ONLY = 1103; + constexpr int OPT_IGNORE_MISSING = 1104; + constexpr int OPT_ACTUAL_PHASING = 1105; + constexpr int OPT_FORCE_PHASING = 1106; + constexpr int OPT_DISCARD_OVERLAPS = 1107; + constexpr int OPT_BATCH_SIZE = 1108; + constexpr int OPT_SAMPLE_RANGE = 1109; + constexpr int OPT_RENAME = 1110; + constexpr int OPT_VCF_VARIANTS = 1111; + constexpr int OPT_VCF_REGION = 1112; + constexpr int OPT_EXCLUDE_SAMPLE = 1113; + constexpr int OPT_MAX_NODE = 1114; + constexpr int OPT_PATH_REGEX = 1115; + constexpr int OPT_PATH_FIELDS = 1116; + constexpr int OPT_TRANSLATION = 1117; + constexpr int OPT_GAM_FORMAT = 1118; + constexpr int OPT_CHUNK_SIZE = 1200; + constexpr int OPT_POS_BUFFER = 1201; + constexpr int OPT_THREAD_BUFFER = 1202; + constexpr int OPT_MERGE_BUFFERS = 1203; + constexpr int OPT_MERGE_JOBS = 1204; + constexpr int OPT_SET_TAG = 1300; + constexpr int OPT_PASS_PATHS = 1400; + constexpr int OPT_GBZ_FORMAT = 1500; + constexpr int OPT_TAGS = 1700; + + // Make a collection of all the known tags and their descriptions. Use an ordered map so that we can do some typo guessing. + // Values are description and list of prohibited characters. + const std::map>> KNOWN_TAGS = { + {gbwtgraph::REFERENCE_SAMPLE_LIST_GBWT_TAG, {"a space-separated list of PanSN-valid sample/assembly names of references in the graph", {'#'}}} + }; - int c; - optind = 2; // force optind past command positional argument - while (true) { - static struct option long_options[] = - { - // Merging - {"merge", no_argument, 0, 'm'}, - {"output", required_argument, 0, 'o'}, - {"batches", required_argument, 0, 'b'}, - {"fast", no_argument, 0, 'f'}, - {"progress", no_argument, 0, 'p'}, + static struct option long_options[] = + { + // General + { "xg-name", required_argument, 0, 'x' }, + { "output", required_argument, 0, 'o' }, + { "temp-dir", required_argument, 0, 'd' }, + { "progress", no_argument, 0, 'p' }, + + // GBWT construction parameters + { "buffer-size", required_argument, 0, OPT_BUFFER_SIZE }, + { "id-interval", required_argument, 0, OPT_ID_INTERVAL }, - // Threads - {"count-threads", no_argument, 0, 'c'}, - {"extract", required_argument, 0, 'e'}, + // Multithreading parameters + { "num-jobs", required_argument, 0, OPT_NUM_JOBS }, + { "num-threads", required_argument, 0, OPT_NUM_THREADS }, - {"help", no_argument, 0, 'h'}, - {0, 0, 0, 0} - }; + // Input GBWT construction: VCF + { "vcf-input", no_argument, 0, 'v' }, + { "preset", required_argument, 0, OPT_PRESET }, + { "inputs-as-jobs", no_argument, 0, OPT_INPUTS_AS_JOBS }, + { "parse-only", no_argument, 0, OPT_PARSE_ONLY }, + { "ignore-missing", no_argument, 0, OPT_IGNORE_MISSING }, + { "actual-phasing", no_argument, 0, OPT_ACTUAL_PHASING }, + { "force-phasing", no_argument, 0, OPT_FORCE_PHASING }, + { "discard-overlaps", no_argument, 0, OPT_DISCARD_OVERLAPS }, + { "batch-size", required_argument, 0, OPT_BATCH_SIZE }, + { "sample-range", required_argument, 0, OPT_SAMPLE_RANGE }, + { "rename", required_argument, 0, OPT_RENAME }, + { "vcf-variants", no_argument, 0, OPT_VCF_VARIANTS }, + { "vcf-region", required_argument, 0, OPT_VCF_REGION }, + { "exclude-sample", required_argument, 0, OPT_EXCLUDE_SAMPLE }, + // Input GBWT construction: GFA + { "gfa-input", no_argument, 0, 'G' }, + { "max-node", required_argument, 0, 
OPT_MAX_NODE }, + { "path-regex", required_argument, 0, OPT_PATH_REGEX }, + { "path-fields", required_argument, 0, OPT_PATH_FIELDS }, + { "translation", required_argument, 0, OPT_TRANSLATION }, + + // Input GBWT construction: GBZ + { "gbz-input", no_argument, 0, 'Z' }, + + // Input GBWT construction: GBWTGraph and GBWT + { "gg-in", required_argument, 0, 'I' }, + + // Input GBWT construction: paths + { "index-paths", no_argument, 0, 'E' }, + + // Input GBWT construction: GAF/GAM + { "alignment-input", no_argument, 0, 'A' }, + { "gam-format", no_argument, 0, OPT_GAM_FORMAT }, + + // Merging + { "merge", no_argument, 0, 'm' }, + { "fast", no_argument, 0, 'f' }, + { "parallel", no_argument, 0, 'b' }, + { "chunk-size", required_argument, 0, OPT_CHUNK_SIZE }, + { "pos-buffer", required_argument, 0, OPT_POS_BUFFER }, + { "thread-buffer", required_argument, 0, OPT_THREAD_BUFFER }, + { "merge-buffers", required_argument, 0, OPT_MERGE_BUFFERS }, + { "merge-jobs", required_argument, 0, OPT_MERGE_JOBS }, + + // Alter GBWT + { "remove-sample", required_argument, 0, 'R' }, + { "set-tag", required_argument, 0, OPT_SET_TAG }, + + // Path cover + { "augment-gbwt", no_argument, 0, 'a' }, + { "local-haplotypes", no_argument, 0, 'l' }, + { "path-cover", no_argument, 0, 'P' }, + { "num-paths", required_argument, 0, 'n' }, + { "context-length", required_argument, 0, 'k' }, + { "pass-paths", no_argument, 0, OPT_PASS_PATHS }, + + // GBWTGraph + { "graph-name", required_argument, 0, 'g' }, + { "gbz-format", no_argument, 0, OPT_GBZ_FORMAT }, + + // R-index + { "r-index", required_argument, 0, 'r' }, + + // Metadata + { "metadata", no_argument, 0, 'M' }, + { "contigs", no_argument, 0, 'C' }, + { "haplotypes", no_argument, 0, 'H' }, + { "samples", no_argument, 0, 'S' }, + { "list-names", no_argument, 0, 'L' }, + { "thread-names", no_argument, 0, 'T' }, + { "tags", no_argument, 0, OPT_TAGS }, + + // Threads + { "count-threads", no_argument, 0, 'c' }, + { "extract", required_argument, 0, 'e' }, + + { "help", no_argument, 0, 'h' }, + { 0, 0, 0, 0 } + }; + + int c; + optind = 2; // force optind past command positional argument + GBWTConfig config; + while (true) { int option_index = 0; - c = getopt_long(argc, argv, "mo:b:fpce:h?", long_options, &option_index); + c = getopt_long(argc, argv, "x:o:d:pvGZI:EAmfbR:alPn:k:g:r:MCHSLTce:h?", long_options, &option_index); /* Detect the end of the options. */ if (c == -1) @@ -81,168 +511,1299 @@ int main_gbwt(int argc, char** argv) switch (c) { + // General + case 'x': + config.graph_name = optarg; + break; + case 'o': + config.gbwt_output = optarg; + break; + case 'd': + temp_file::set_dir(optarg); + break; + case 'p': + config.show_progress = true; + break; + + // GBWT construction parameters + case OPT_BUFFER_SIZE: + config.haplotype_indexer.gbwt_buffer_size = std::max(parse(optarg), 1ul); + config.gfa_parameters.automatic_batch_size = false; // User-defined buffer size overrides heuristics. 
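// [Editor's note, illustrative only, not part of this changeset] --buffer-size is given in
// millions of nodes, as the help text above states. A worked example with a hypothetical
// value: passing --buffer-size 200 stores gbwt_buffer_size = 200 here, and the copy into the
// GFA parameters further down multiplies it out:
//
//   config.gfa_parameters.batch_size = 200 * gbwt::MILLION;   // 200,000,000 nodes
//
// Setting automatic_batch_size = false simply records that this user-supplied value should
// override the GFA parser's own batch-size heuristic.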
+ break; + case OPT_ID_INTERVAL: + config.haplotype_indexer.id_interval = parse(optarg); + break; + + // Multithreading parameters + case OPT_NUM_JOBS: + config.build_jobs = parse(optarg); + break; + case OPT_NUM_THREADS: + config.search_threads = std::max(parse(optarg), 1ul); + break; + + // Input GBWT construction: VCF + case 'v': + no_multiple_input_types(config); + config.build = GBWTConfig::build_vcf; + config.produces_one_gbwt = true; + break; + case OPT_PRESET: + use_preset(optarg, config); + break; + case OPT_INPUTS_AS_JOBS: + config.inputs_as_jobs = true; + break; + case OPT_PARSE_ONLY: + config.parse_only = true; + break; + case OPT_IGNORE_MISSING: + config.haplotype_indexer.warn_on_missing_variants = false; + break; + case OPT_ACTUAL_PHASING: + config.haplotype_indexer.phase_homozygous = false; + break; + case OPT_FORCE_PHASING: + config.haplotype_indexer.force_phasing = true; + break; + case OPT_DISCARD_OVERLAPS: + config.haplotype_indexer.discard_overlaps = true; + break; + case OPT_BATCH_SIZE: + config.haplotype_indexer.samples_in_batch = std::max(parse(optarg), 1ul); + break; + case OPT_SAMPLE_RANGE: + { + // Parse first-last + string range(optarg); + size_t found = range.find("-"); + if(found == std::string::npos || found == 0 || found + 1 == range.size()) { + cerr << "error: [vg gbwt] cannot parse range " << range << endl; + std::exit(EXIT_FAILURE); + } + config.haplotype_indexer.sample_range.first = parse(range.substr(0, found)); + config.haplotype_indexer.sample_range.second = parse(range.substr(found + 1)) + 1; + } + break; + case OPT_RENAME: + { + // Parse old=new + string key_value(optarg); + auto found = key_value.find('='); + if (found == string::npos || found == 0 || found + 1 == key_value.size()) { + cerr << "error: [vg gbwt] cannot parse rename " << key_value << endl; + std::exit(EXIT_FAILURE); + } + // Parse out the two parts + string vcf_contig = key_value.substr(0, found); + string graph_contig = key_value.substr(found + 1); + // Add the name mapping + config.haplotype_indexer.path_to_vcf[graph_contig] = vcf_contig; + } + break; + case OPT_VCF_VARIANTS: + config.haplotype_indexer.rename_variants = false; + break; + case OPT_VCF_REGION: + { + // Parse contig:first-last + std::string region(optarg); + Region parsed; + parse_region(region, parsed); + if (parsed.start <= 0 || parsed.end <= 0) { + // We need both range bounds, and we can't accept 0 since input is 1-based. + cerr << "error: [vg gbwt] cannot parse 1-based region " << region << endl; + } + // Make sure to correct the coordinates to 0-based exclusive-end, from 1-based inclusive-end + config.haplotype_indexer.regions[parsed.seq] = std::make_pair((size_t) (parsed.start - 1), (size_t) parsed.end); + } + break; + case OPT_EXCLUDE_SAMPLE: + config.haplotype_indexer.excluded_samples.insert(optarg); + break; + + // Input GBWT construction: GFA + case 'G': + no_multiple_input_types(config); + config.build = GBWTConfig::build_gfa; + config.produces_one_gbwt = true; + break; + case OPT_MAX_NODE: + config.gfa_parameters.max_node_length = parse(optarg); + break; + case OPT_PATH_REGEX: + if (config.gfa_parameters.path_name_formats.size() != 1) { + // We need to override the existing rules we set up when we set + // up the config, and make sure we have a place for the other + // option. 
+ config.gfa_parameters.path_name_formats.clear(); + config.gfa_parameters.path_name_formats.emplace_back("", "", PathSense::HAPLOTYPE); + } + config.gfa_parameters.path_name_formats.back().regex = optarg; + break; + case OPT_PATH_FIELDS: + if (config.gfa_parameters.path_name_formats.size() != 1) { + // We need to override the existing rules we set up when we set + // up the config, and make sure we have a place for the other + // option. + config.gfa_parameters.path_name_formats.clear(); + config.gfa_parameters.path_name_formats.emplace_back("", "", PathSense::HAPLOTYPE); + } + config.gfa_parameters.path_name_formats.back().fields = optarg; + break; + case OPT_TRANSLATION: + config.segment_translation = optarg; + break; + + // Input GBWT construction: GBZ + case 'Z': + no_multiple_input_types(config); + config.build = GBWTConfig::build_gbz; + config.produces_one_gbwt = true; + break; + + // Input GBWT construction: GBWTGraph and GBWT + case 'I': + no_multiple_input_types(config); + config.build = GBWTConfig::build_gbwtgraph; + config.gbwtgraph_name = optarg; + config.produces_one_gbwt = true; + break; + + // Input GBWT construction: Paths + case 'E': + no_multiple_input_types(config); + config.build = GBWTConfig::build_paths; + config.produces_one_gbwt = true; + break; + + // Input GBWT construction: GAF/GAM + case 'A': + no_multiple_input_types(config); + config.build = GBWTConfig::build_alignments; + config.produces_one_gbwt = true; + break; + case OPT_GAM_FORMAT: + config.gam_format = true; + break; + // Merging case 'm': - merge = true; + config.merge = GBWTConfig::merge_insert; + config.produces_one_gbwt = true; break; - case 'o': - gbwt_output = optarg; + case 'f': + config.merge = GBWTConfig::merge_fast; + config.produces_one_gbwt = true; break; case 'b': - batch_size = parse(optarg); + config.merge = GBWTConfig::merge_parallel; + config.produces_one_gbwt = true; break; - case 'f': - fast_merging = true; - merge = true; + case OPT_CHUNK_SIZE: + config.merge_parameters.setChunkSize(parse(optarg)); break; - case 'p': - show_progress = true; + case OPT_POS_BUFFER: + config.merge_parameters.setPosBufferSize(parse(optarg)); + break; + case OPT_THREAD_BUFFER: + config.merge_parameters.setThreadBufferSize(parse(optarg)); + break; + case OPT_MERGE_BUFFERS: + config.merge_parameters.setMergeBuffers(parse(optarg)); + break; + case OPT_MERGE_JOBS: + config.merge_parameters.setMergeJobs(parse(optarg)); + break; + + // Alter GBWT + case 'R': + config.to_remove.insert(optarg); + break; + case OPT_SET_TAG: + { + std::string argument(optarg); + size_t separator = argument.find('='); + if (separator == std::string::npos) { + // We can't parse this as key = value. + std::cerr << "Error: expected '=' in " << argument << std::endl; + std::exit(EXIT_FAILURE); + } + auto tag_name = argument.substr(0, separator); + auto tag_value = argument.substr(separator + 1); + // See if this tag is known + auto tag_record = KNOWN_TAGS.lower_bound(tag_name); + if (tag_record == KNOWN_TAGS.end() && !KNOWN_TAGS.empty()) { + // This tag is larger than all known tags. Closest match is last tag. + --tag_record; + } + if (tag_record != KNOWN_TAGS.end()) { + auto& tag_description = tag_record->second.first; + auto& tag_prohibited_characters = tag_record->second.second; + // Tag is either known, or is unknown but there's a known tag to compare it with. + if (tag_name != tag_record->first) { + // This is an unknown tag, but we have an idea what it should be. 
+ std::cerr << "warning: [vg gbwt] tag \"" << tag_name << "\" is not a tag with a meaning recognized by vg; maybe you meant \"" << tag_record->first << "\" which would be " << tag_description << std::endl; + } else { + // This is a known tag, so validate it. + for (auto& letter : tag_value) { + if (tag_prohibited_characters.count(letter)) { + // This letter isn't allowed. + std::cerr << "error: [vg gbwt] tag \"" << tag_name << "\" contains prohibited character \"" << letter << "\". It needs to be " << tag_description << " and may not contain any of:"; + for (auto& c : tag_prohibited_characters) { + std::cerr << " '" << c << "'"; + } + std::cerr << std::endl; + std::exit(EXIT_FAILURE); + } + } + } + } + config.tags_to_set.emplace(tag_name, tag_value); + } + break; + + // Path cover + case 'a': + no_multiple_cover_types(config); + config.path_cover = GBWTConfig::path_cover_augment; + break; + case 'l': + no_multiple_cover_types(config); + config.path_cover = GBWTConfig::path_cover_local; + if (!config.num_paths_set) { + config.num_paths = GBWTConfig::default_num_paths_local(); + } + break; + case 'P': + no_multiple_cover_types(config); + config.path_cover = GBWTConfig::path_cover_greedy; + config.produces_one_gbwt = true; + break; + case 'n': + config.num_paths = parse(optarg); + config.num_paths_set = true; + break; + case 'k': + config.context_length = parse(optarg); + break; + case OPT_PASS_PATHS: + config.include_named_paths = true; + break; + + // GBWTGraph + case 'g': + config.graph_output = optarg; + break; + case OPT_GBZ_FORMAT: + config.gbz_format = true; break; + // Build r-index + case 'r': + config.r_index_name = optarg; + break; + + // Metadata + case 'M': + config.metadata = true; + config.metadata_mode = true; + break; + case 'C': + config.contigs = true; + config.metadata_mode = true; + break; + case 'H': + config.haplotypes = true; + config.metadata_mode = true; + break; + case 'S': + config.samples = true; + config.metadata_mode = true; + break; + case 'L': + config.list_names = true; + config.metadata_mode = true; + break; + case 'T': + config.thread_names = true; + config.metadata_mode = true; + break; + case OPT_TAGS: + config.tags = true; + config.metadata_mode = true; + break; + + // Threads case 'c': - count_threads = true; + config.count_threads = true; + config.thread_mode = true; break; case 'e': - thread_output = optarg; + config.thread_output = optarg; + config.thread_mode = true; break; case 'h': case '?': /* getopt_long already printed an error message. */ help_gbwt(argv); - exit(1); + std::exit(EXIT_FAILURE); break; default: - abort (); + std::exit(EXIT_FAILURE); } } - gbwt::Verbosity::set(gbwt::Verbosity::SILENT); + // The remaining args are input args. + for (int arg = optind; arg < argc; arg++) { + config.input_filenames.push_back(argv[arg]); + } + // We can load a single input GBWT if we did not use any build options. + if (config.input_filenames.size() == 1 && config.build == GBWTConfig::build_none) { + config.gbwt_name = config.input_filenames.front(); + } - if (merge) { + // Copy information from primary fields to redundant fields. 
+ config.haplotype_indexer.show_progress = config.show_progress; + config.gfa_parameters.show_progress = config.show_progress; + config.gfa_parameters.parallel_jobs = config.build_jobs; + config.gfa_parameters.batch_size = config.haplotype_indexer.gbwt_buffer_size * gbwt::MILLION; + config.gfa_parameters.sample_interval = config.haplotype_indexer.id_interval; + + return config; +} + +//---------------------------------------------------------------------------- + +void validate_gbwt_config(GBWTConfig& config) { + // We can either write GBWT in SDSL format to a separate file or with GBWTGraph in GBZ format. + // However, `--parse-only` uses `gbwt_output` for other purposes. + bool has_gbwt_output = !config.gbwt_output.empty() || (config.gbz_format && !config.graph_output.empty() && !config.parse_only); + + // We have one input GBWT after steps 1-4. + bool one_input_gbwt = config.input_filenames.size() == 1 || config.produces_one_gbwt; - // Ugly hack here. GBWT prints to stdout, and we want to direct it to stderr. - std::streambuf* cout_buf = cout.rdbuf(); - cout.rdbuf(cerr.rdbuf()); + // We can load a PathHandleGraph from a file, get a SequenceSource from parsing GFA, or get a GBWTGraph from GBZ or GG/GBWT. + bool has_graph_input = !config.graph_name.empty() || config.build == GBWTConfig::build_gfa || config.build == GBWTConfig::build_gbz || config.build == GBWTConfig::build_gbwtgraph; - size_t input_files = argc - optind; - size_t total_inserted = 0; - if (input_files <= 1) { - cerr << "[vg gbwt] error: at least two input gbwt files required to merge" << endl; - return 1; + if (config.build == GBWTConfig::build_gbz) { + // If we "build" a GBWT by loading it from a GBZ, we just need to make + // sure that we know enough to actually load it. + if (!config.graph_name.empty()) { + std::cerr << "error: [vg gbwt] GBZ input does not use -x" << std::endl; + std::exit(EXIT_FAILURE); } - if (gbwt_output.empty()) { - cerr << "[vg gbwt] error: output file must be specified with -o" << endl; + if (config.input_filenames.size() != 1) { + std::cerr << "error: [vg gbwt] GBZ input requires one input arg" << std::endl; + std::exit(EXIT_FAILURE); } - if (show_progress) { - gbwt::printHeader("Algorithm"); cout << (fast_merging ? "fast" : "insert") << endl; - gbwt::printHeader("Input files"); cout << input_files << endl; - gbwt::printHeader("Output name"); cout << gbwt_output << endl; - if(!fast_merging) { gbwt::printHeader("Batch size"); cout << batch_size << endl; } - cout << endl; + } else if (config.build == GBWTConfig::build_gbwtgraph) { + // If we "build" a GBWT by loading it from a GG and a GBWT, we just need to make + // sure that we know enough to actually load it. + if (!config.graph_name.empty()) { + std::cerr << "error: [vg gbwt] GBWTGraph input does not use -x" << std::endl; + std::exit(EXIT_FAILURE); + } + if (config.input_filenames.size() != 1) { + std::cerr << "error: [vg gbwt] GBWTGraph input requires one input arg" << std::endl; + std::exit(EXIT_FAILURE); + } + } else if (config.build != GBWTConfig::build_none) { + if (!has_gbwt_output) { + // If we build our GBWT by doing anything other than loading it + // from a GBZ, we need to have somewhere to put it. 
+ std::cerr << "error: [vg gbwt] GBWT construction requires output GBWT" << std::endl; + std::exit(EXIT_FAILURE); } + if (config.build == GBWTConfig::build_vcf) { + if (config.graph_name.empty() || config.input_filenames.empty()) { + std::cerr << "error: [vg gbwt] GBWT construction from VCF files requires -x and input args" << std::endl; + std::exit(EXIT_FAILURE); + } + if (config.parse_only) { + config.haplotype_indexer.batch_file_prefix = config.gbwt_output; + } + } else if (config.build == GBWTConfig::build_gfa) { + if (!config.graph_name.empty()) { + std::cerr << "error: [vg gbwt] GBWT construction from GFA does not use -x" << std::endl; + std::exit(EXIT_FAILURE); + } + if (config.input_filenames.size() != 1) { + std::cerr << "error: [vg gbwt] GBWT construction from GFA requires one input arg" << std::endl; + std::exit(EXIT_FAILURE); + } + } else if (config.build == GBWTConfig::build_alignments) { + if (config.graph_name.empty() || config.input_filenames.empty()) { + std::cerr << "error: [vg gbwt] GBWT construction from alignments requires -x and input args" << std::endl; + std::exit(EXIT_FAILURE); + } + } else if (config.build == GBWTConfig::build_paths) { + if (config.graph_name.empty()) { + std::cerr << "error: [vg gbwt] GBWT construction from embedded paths requires -x" << std::endl; + std::exit(EXIT_FAILURE); + } + if (!config.input_filenames.empty()) { + std::cerr << "error: [vg gbwt] GBWT construction from embedded paths does not use input args" << std::endl; + std::exit(EXIT_FAILURE); + } + } + } - double start = gbwt::readTimer(); + if (config.merge != GBWTConfig::merge_none) { + if (config.input_filenames.size() < 2 || !has_gbwt_output) { + std::cerr << "error: [vg gbwt] merging requires multiple input GBWTs and output GBWT" << std::endl; + std::exit(EXIT_FAILURE); + } + } - if(fast_merging) - { - vector indexes(argc - optind); - for(int i = optind; i < argc; i++) - { - string input_name = argv[i]; - sdsl::load_from_file(indexes[i - optind], input_name); - if (show_progress) { - gbwt::printStatistics(indexes[i - optind], input_name); - } - total_inserted += indexes[i - optind].size(); + if (!config.to_remove.empty()) { + if (config.build == GBWTConfig::build_gbz) { + std::cerr << "error: [vg gbwt] the GBWT extracted from GBZ cannot have paths modified" << std::endl; + } + if (config.build == GBWTConfig::build_gbwtgraph) { + std::cerr << "error: [vg gbwt] the GBWT loaded with a GBWTGraph cannot have paths modified" << std::endl; + } + if (!(config.input_filenames.size() == 1 || config.merge != GBWTConfig::merge_none) || !has_gbwt_output) { + std::cerr << "error: [vg gbwt] removing a sample requires one input GBWT and output GBWT" << std::endl; + std::exit(EXIT_FAILURE); + } + } + + if (!config.tags_to_set.empty()) { + if (!(config.input_filenames.size() == 1 || config.merge != GBWTConfig::merge_none) || !has_gbwt_output) { + std::cerr << "error: [vg gbwt] setting tags requires one input GBWT and output GBWT" << std::endl; + std::exit(EXIT_FAILURE); + } + } + + if (config.path_cover != GBWTConfig::path_cover_none) { + if (!has_gbwt_output || config.graph_name.empty()) { + std::cerr << "error: [vg gbwt] path cover options require -x and output GBWT" << std::endl; + std::exit(EXIT_FAILURE); + } + if (config.path_cover == GBWTConfig::path_cover_greedy && !config.input_filenames.empty()) { + std::cerr << "error: [vg gbwt] greedy path cover does not use input GBWTs" << std::endl; + std::exit(EXIT_FAILURE); + } + if ((config.path_cover == GBWTConfig::path_cover_local || 
config.path_cover == GBWTConfig::path_cover_augment) && !(config.input_filenames.size() == 1 || config.merge != GBWTConfig::merge_none)) {
+            std::cerr << "error: [vg gbwt] path cover options -a and -l require one input GBWT" << std::endl;
+            std::exit(EXIT_FAILURE);
+        }
+        if (config.num_paths == 0) {
+            std::cerr << "error: [vg gbwt] number of paths must be non-zero for path cover" << std::endl;
+            std::exit(EXIT_FAILURE);
+        }
+        if (config.context_length < gbwtgraph::PATH_COVER_MIN_K) {
+            std::cerr << "error: [vg gbwt] context length must be at least " << gbwtgraph::PATH_COVER_MIN_K << " for path cover" << std::endl;
+            std::exit(EXIT_FAILURE);
+        }
+    }
+
+    if (!config.segment_translation.empty()) {
+        if (config.build != GBWTConfig::build_gfa && config.build != GBWTConfig::build_gbz) {
+            std::cerr << "error: [vg gbwt] segment to node translation requires GFA or GBZ input" << std::endl;
+            std::exit(EXIT_FAILURE);
+        }
+    }
+
+    if (!config.graph_output.empty()) {
+        if (!has_graph_input || !one_input_gbwt) {
+            std::cerr << "error: [vg gbwt] GBWTGraph construction requires an input graph and one input GBWT" << std::endl;
+            std::exit(EXIT_FAILURE);
+        }
+    }
+
+    if (!config.r_index_name.empty()) {
+        if (!one_input_gbwt) {
+            std::cerr << "error: [vg gbwt] r-index construction requires one input GBWT" << std::endl;
+            std::exit(EXIT_FAILURE);
+        }
+    }
+
+    if (config.metadata_mode) {
+        if (!one_input_gbwt) {
+            std::cerr << "error: [vg gbwt] metadata operations require one input GBWT" << std::endl;
+            std::exit(EXIT_FAILURE);
+        }
+    }
+
+    if (config.thread_mode) {
+        if (!one_input_gbwt) {
+            std::cerr << "error: [vg gbwt] thread operations require one input GBWT" << std::endl;
+            std::exit(EXIT_FAILURE);
+        }
+    }
+
+    for (auto& format : config.gfa_parameters.path_name_formats) {
+        // Check the path name format. We don't check the regex syntax here,
+        // but we can make sure that we're asking for things consistent with
+        // the path sense so that we can make a GBWTGraph out of them later.
+        // TODO: Do we still need to let people make GBWTs that can't make a GBWTGraph?
+ if (format.regex.empty()) { + std::cerr << "error: [vg gbwt] path name format regex is missing" << std::endl; + std::exit(EXIT_FAILURE); + } + switch(format.sense) { + case PathSense::GENERIC: + if (format.fields.find("C") == std::string::npos && format.fields.find("c") == std::string::npos) { + std::cerr << "error: [vg gbwt] path name fields do not set required contig for regex " + << format.regex << " and fields " << format.fields << std::endl; + std::exit(EXIT_FAILURE); + } + if (format.fields.find("S") != std::string::npos || format.fields.find("s") != std::string::npos) { + std::cerr << "error: [vg gbwt] path name fields set unusable sample for regex " + << format.regex << " and fields " << format.fields << std::endl; + std::exit(EXIT_FAILURE); + } + if (format.fields.find("H") != std::string::npos || format.fields.find("h") != std::string::npos) { + std::cerr << "error: [vg gbwt] path name fields set unusable haplotype number for regex " + << format.regex << " and fields " << format.fields << std::endl; + std::exit(EXIT_FAILURE); + } + break; + case PathSense::HAPLOTYPE: + if (format.fields.find("S") == std::string::npos && format.fields.find("s") == std::string::npos) { + std::cerr << "error: [vg gbwt] path name fields do not set required sample for regex " + << format.regex << " and fields " << format.fields << std::endl; + std::exit(EXIT_FAILURE); } - gbwt::GBWT merged(indexes); - sdsl::store_to_file(merged, gbwt_output); - if (show_progress) { - gbwt::printStatistics(merged, gbwt_output); + // Fall-through because haplotypes also need contigs. + case PathSense::REFERENCE: + if (format.fields.find("C") == std::string::npos && format.fields.find("c") == std::string::npos) { + std::cerr << "error: [vg gbwt] path name fields do not set required contig for regex " + << format.regex << " and fields " << format.fields << std::endl; + std::exit(EXIT_FAILURE); } + break; + default: + std::cerr << "error: [vg gbwt] path sense is unimplemented: " << (int)format.sense << std::endl; + std::exit(EXIT_FAILURE); } - else - { - gbwt::DynamicGBWT index; - { - string input_name = argv[optind]; - sdsl::load_from_file(index, input_name); - if (show_progress) { - gbwt::printStatistics(index, input_name); + } +} + +//---------------------------------------------------------------------------- + +struct job_type { + std::string filename; + std::vector paths; + size_t size; + + // Large jobs first. + bool operator<(const job_type& another) const { + return (this->size > another.size); + } + + typedef std::pair path_type; + + void insert(path_type path) { + this->paths.push_back(path.first); + this->size += path.second; + } +}; + +std::vector determine_jobs(std::unique_ptr& graph, const GBWTConfig& config) { + + std::vector result; + + // Determine the non-alt paths. + std::vector paths; + size_t max_length = 0; + graph->for_each_path_handle([&](path_handle_t path_handle) { + if (!Paths::is_alt(graph->get_path_name(path_handle))) { + paths.emplace_back(path_handle, graph->get_step_count(path_handle)); + max_length = std::max(max_length, paths.back().second); + } + }); + if (paths.empty()) { + return result; + } + + struct vcf_paths { + size_t file; + std::vector paths; + + // In descending order by length. 
+ void sort_paths() { + std::sort(this->paths.begin(), this->paths.end(), [](job_type::path_type a, job_type::path_type b) -> bool { + return (a.second > b.second); + }); + } + + void insert(job_type::path_type path) { + this->paths.push_back(path); + } + + bool empty() const { + return this->paths.empty(); + } + }; + + // Initialize the files. + std::vector paths_by_file; + for (size_t i = 0; i < config.input_filenames.size(); i++) { + paths_by_file.push_back({ i, {} }); + } + + // Determine which VCF file contains each path. + std::vector path_found_in(paths.size(), config.input_filenames.size()); + for (size_t i = 0; i < config.input_filenames.size(); i++) { + std::string filename = config.input_filenames[i]; + vcflib::VariantCallFile variant_file; + variant_file.parseSamples = false; + variant_file.open(filename); + if (!variant_file.is_open()) { + std::cerr << "error: [vg gbwt] cannot open VCF file " << filename << std::endl; + std::exit(EXIT_FAILURE); + } + for (size_t j = 0; j < paths.size(); j++) { + std::string contig_name = graph->get_path_name(paths[j].first); + if (config.haplotype_indexer.path_to_vcf.find(contig_name) != config.haplotype_indexer.path_to_vcf.end()) { + contig_name = config.haplotype_indexer.path_to_vcf.at(contig_name); + } + variant_file.setRegion(contig_name); + vcflib::Variant var(variant_file); + if (!(variant_file.is_open() && variant_file.getNextVariant(var) && var.sequenceName == contig_name)) { + continue; + } + if (path_found_in[j] < config.input_filenames.size()) { + std::cerr << "error: [vg gbwt] contig " << contig_name << " found in files " << config.input_filenames[path_found_in[j]] << " and " << filename << std::endl; + std::exit(EXIT_FAILURE); + } + paths_by_file[i].insert(paths[j]); + path_found_in[j] = i; + } + } + + // Special case: Each input file is a single job. + if (config.inputs_as_jobs) { + for (const vcf_paths& curr : paths_by_file) { + if (curr.empty()) { + continue; + } + job_type job({ config.input_filenames[curr.file], {}, 0 }); + for (auto path : curr.paths) { + job.insert(path); + } + result.push_back(job); + } + return result; + } + + // First-fit heuristic: Create jobs of size at most max_length from each file. + for (size_t i = 0; i < paths_by_file.size(); i++) { + paths_by_file[i].sort_paths(); + } + for (const vcf_paths& curr : paths_by_file) { + if (curr.empty()) { + continue; + } + std::vector jobs; + for (auto path : curr.paths) { + bool inserted = false; + for (size_t i = 0; i < jobs.size(); i++) { + if (jobs[i].size + path.second <= max_length) { + jobs[i].insert(path); + inserted = true; + break; } } - for (int curr = optind + 1; curr < argc; curr++) + if (!inserted) { + jobs.push_back({ config.input_filenames[curr.file], {}, 0 }); + jobs.back().insert(path); + } + } + result.insert(result.end(), jobs.begin(), jobs.end()); + } + + // Sort the jobs in descending order by size. 
+ std::sort(result.begin(), result.end()); + return result; +} + +void use_or_save(std::unique_ptr& index, GBWTHandler& gbwts, std::vector& filenames, size_t i, bool show_progress) { + if (filenames.size() == 1) { + gbwts.use(*index); + } else { + std::string temp = temp_file::create("gbwt-" + std::to_string(i) + "-"); + if (show_progress) { + #pragma omp critical { - string input_name = argv[curr]; - gbwt::GBWT next; - sdsl::load_from_file(next, input_name); - if (show_progress) { - gbwt::printStatistics(next, input_name); + std::cerr << "Job " << i << ": Saving the GBWT to " << temp << std::endl; + } + } + save_gbwt(*index, temp, false); + filenames[i] = temp; + } +} + +void step_1_build_gbwts(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& config) { + double start = gbwt::readTimer(); + if (config.show_progress) { + std::cerr << "Building input GBWTs" << std::endl; + } + gbwts.unbacked(); // We will build a new GBWT. + if (config.build != GBWTConfig::build_gfa && config.build != GBWTConfig::build_gbz && config.build != GBWTConfig::build_gbwtgraph) { + graphs.get_graph(config); + } + + if (config.build == GBWTConfig::build_vcf) { + if (config.show_progress) { + std::cerr << "Input type: VCF" << std::endl; + } + omp_set_num_threads(config.build_jobs); + // Process each VCF contig corresponding to a non-alt path. + std::vector jobs = determine_jobs(graphs.path_graph, config); + if (jobs.size() > 1 && config.merge == GBWTConfig::merge_none) { + config.merge = GBWTConfig::merge_fast; + } + std::vector> vcf_parses(jobs.size()); + if (config.show_progress) { + std::cerr << "Parsing " << jobs.size() << " VCF files using up to " << config.build_jobs << " parallel jobs" << std::endl; + } + #pragma omp parallel for schedule(dynamic, 1) + for (size_t i = 0; i < jobs.size(); i++) { + std::string job_name = "Job " + std::to_string(i); + if (config.show_progress) { + #pragma omp critical + { + std::cerr << job_name << ": File " << jobs[i].filename << ", paths {"; + for (path_handle_t handle : jobs[i].paths) { + std::cerr << " " << graphs.path_graph->get_path_name(handle); + } + std::cerr << " }" << std::endl; } - index.merge(next, batch_size); - total_inserted += next.size(); } - sdsl::store_to_file(index, gbwt_output); - if (show_progress) { - gbwt::printStatistics(index, gbwt_output); + vcf_parses[i] = config.haplotype_indexer.parse_vcf(jobs[i].filename, *(graphs.path_graph), jobs[i].paths, job_name); + } + graphs.clear(); // Delete the graph to save memory. + if (!config.parse_only) { + std::vector gbwt_files(vcf_parses.size(), ""); + if (config.show_progress) { + std::cerr << "Building " << vcf_parses.size() << " GBWTs using up to " << config.build_jobs << " parallel jobs" << std::endl; + } + #pragma omp parallel for schedule(dynamic, 1) + for (size_t i = 0; i < vcf_parses.size(); i++) { + std::string job_name = "Job " + std::to_string(i); + std::unique_ptr parsed = config.haplotype_indexer.build_gbwt(vcf_parses[i], job_name); + use_or_save(parsed, gbwts, gbwt_files, i, config.show_progress); + } + if (vcf_parses.size() > 1) { + config.input_filenames = gbwt_files; // Use the temporary GBWTs as inputs. 
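// [Editor's note, illustrative only, not part of this changeset] Recap of the multi-job VCF
// path with a hypothetical example: if determine_jobs() saw non-alt paths of 100, 60, 50 and
// 40 steps coming from a single VCF (so max_length = 100), first-fit packing in descending
// length order yields three jobs: {100}, {60, 40} and {50}. Each job's GBWT is written to a
// temporary file by use_or_save(), those filenames replace config.input_filenames here, and
// because jobs.size() > 1 the merge mode was defaulted to merge_fast above (unless a merge
// algorithm had already been chosen), so step 2 then combines the per-job GBWTs into the
// final index.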
} } - - double seconds = gbwt::readTimer() - start; + } else if (config.build == GBWTConfig::build_gfa) { + if(config.show_progress) { + std::cerr << "Input type: GFA" << std::endl; + } + auto result = gbwtgraph::gfa_to_gbwt(config.input_filenames.front(), config.gfa_parameters); + if (result.first.get() == nullptr || result.second.get() == nullptr) { + std::cerr << "error: [vg gbwt] GBWT construction from GFA failed" << std::endl; + std::exit(EXIT_FAILURE); + } + gbwts.use(*(result.first)); + graphs.use(result.second); + } else if (config.build == GBWTConfig::build_gbz) { + if(config.show_progress) { + std::cerr << "Input type: GBZ" << std::endl; + } + graphs.load_gbz(gbwts, config); + } else if (config.build == GBWTConfig::build_gbwtgraph) { + if(config.show_progress) { + std::cerr << "Input type: GBWTGraph" << std::endl; + } + graphs.load_gbwtgraph(gbwts, config); + } else if (config.build == GBWTConfig::build_paths) { + if(config.show_progress) { + std::cerr << "Input type: embedded paths" << std::endl; + } + std::unique_ptr temp = config.haplotype_indexer.build_gbwt(*(graphs.path_graph)); + gbwts.use(*temp); + } else if (config.build == GBWTConfig::build_alignments) { + if (config.show_progress) { + std::cerr << "Input type: " << (config.gam_format ? "GAM" : "GAF") << std::endl; + } + std::unique_ptr temp = config.haplotype_indexer.build_gbwt(*(graphs.path_graph), config.input_filenames, (config.gam_format ? "GAM" : "GAF")); + gbwts.use(*temp); + } - if (show_progress) { + report_time_memory("GBWTs built", start, config); + if (config.parse_only) { + std::exit(EXIT_SUCCESS); // VCF parsing does not produce GBWTs to continue with. + } +} +//---------------------------------------------------------------------------- + +void step_2_merge_gbwts(GBWTHandler& gbwts, GBWTConfig& config) { + double start = gbwt::readTimer(); + if (config.show_progress) { + std::string algo_name; + if (config.merge == GBWTConfig::merge_fast) { + algo_name = "fast"; + } else if (config.merge == GBWTConfig::merge_insert) { + algo_name = "insertion"; + } else if (config.merge == GBWTConfig::merge_parallel) { + algo_name = "parallel"; + } + std::cerr << "Merging " << config.input_filenames.size() << " input GBWTs (" << algo_name << " algorithm)" << std::endl; + } + + if (config.merge == GBWTConfig::merge_fast) { + std::vector indexes(config.input_filenames.size()); + for (size_t i = 0; i < config.input_filenames.size(); i++) { + load_gbwt(indexes[i], config.input_filenames[i], config.show_progress); + } + if (config.show_progress) { + std::cerr << "Merging the GBWTs" << std::endl; + } + gbwt::GBWT merged(indexes); + gbwts.use(merged); + } else if (config.merge == GBWTConfig::merge_insert) { + gbwts.filename = config.input_filenames.front(); + gbwts.use_dynamic(); + for (size_t i = 1; i < config.input_filenames.size(); i++) { + gbwt::GBWT next; + load_gbwt(next, config.input_filenames[i], config.show_progress); + if (next.size() > 2 * gbwts.dynamic.size()) { + std::cerr << "warning: [vg gbwt] merging " << config.input_filenames[i] << " into a substantially smaller index" << std::endl; + std::cerr << "warning: [vg gbwt] merging would be faster in another order" << std::endl; + } + if (config.show_progress) { + std::cerr << "Inserting " << next.sequences() << " sequences of total length " << next.size() << std::endl; + } + gbwts.dynamic.merge(next); + } + } else if (config.merge == GBWTConfig::merge_parallel) { + gbwts.filename = config.input_filenames.front(); + gbwts.use_dynamic(); + 
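//----------------------------------------------------------------------------
// [Editor's note, not part of the patch] Both the insertion and parallel
// merging branches warn when a larger index is merged into a smaller one:
// gbwt::DynamicGBWT::merge() inserts the argument's sequences into the
// receiving index, so the cost is driven by the size of the inserted index
// and the receiver should be the larger GBWT. A rough sketch of the intended
// order (file names are placeholders; assumes the usual SDSL-style load/store
// calls used elsewhere in this file):
//
//     gbwt::DynamicGBWT base;                    // receiving index
//     sdsl::load_from_file(base, "large.gbwt");  // start from the larger GBWT
//     gbwt::GBWT smaller;
//     sdsl::load_from_file(smaller, "small.gbwt");
//     base.merge(smaller);                       // insert the smaller index
//     sdsl::store_to_file(base, "merged.gbwt");
//----------------------------------------------------------------------------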
omp_set_num_threads(config.search_threads); + for (size_t i = 1; i < config.input_filenames.size(); i++) { + gbwt::DynamicGBWT next; + load_gbwt(next, config.input_filenames[i], config.show_progress); + if (next.size() > 2 * gbwts.dynamic.size()) { + std::cerr << "warning: [vg gbwt] merging " << config.input_filenames[i] << " into a substantially smaller index" << std::endl; + std::cerr << "warning: [vg gbwt] merging would be faster in another order" << std::endl; + } + if (config.show_progress) { + std::cerr << "Inserting " << next.sequences() << " sequences of total length " << next.size() << std::endl; + } + gbwts.dynamic.merge(next, config.merge_parameters); + } + } + gbwts.unbacked(); // We modified the GBWT. + + if (config.show_progress) { + print_metadata(std::cerr, gbwts); + report_time_memory("GBWTs merged", start, config); + } +} + +//---------------------------------------------------------------------------- + +void remove_samples(GBWTHandler& gbwts, GBWTConfig& config) { + double start = gbwt::readTimer(); + if (config.show_progress) { + std::cerr << "Removing " << config.to_remove.size() << " sample(s) from the index" << std::endl; + } + + gbwts.use_dynamic(); + if (!(gbwts.dynamic.hasMetadata() && gbwts.dynamic.metadata.hasPathNames() && gbwts.dynamic.metadata.hasSampleNames())) { + std::cerr << "error: [vg gbwt] the index does not contain metadata with thread and sample names" << std::endl; + std::exit(EXIT_FAILURE); + } + + // Remove the samples one at a time, because old sample/path ids may be invalidated. + for (const std::string& sample_name : config.to_remove) { + gbwt::size_type sample_id = gbwts.dynamic.metadata.sample(sample_name); + if (sample_id >= gbwts.dynamic.metadata.samples()) { + std::cerr << "warning: [vg gbwt] the index does not contain sample " << sample_name << std::endl; + continue; + } + std::vector path_ids = gbwts.dynamic.metadata.removeSample(sample_id); + if (path_ids.empty()) { + std::cerr << "warning: [vg gbwt] no threads associated with sample " << sample_name << std::endl; + continue; + } + if (config.show_progress) { + std::cerr << "Removing " << path_ids.size() << " threads for sample " << sample_name << std::endl; + } + gbwts.dynamic.remove(path_ids); + } + gbwts.unbacked(); // We modified the GBWT. 
+ + report_time_memory("Samples removed", start, config); +} + +void set_tags(GBWTHandler& gbwts, GBWTConfig& config) { + double start = gbwt::readTimer(); + if (config.show_progress) { + std::cerr << "Setting " << config.tags_to_set.size() << " tags on the GBWT" << std::endl; + } + + gbwts.use_compressed(); + for (auto& kv : config.tags_to_set) { + gbwts.compressed.tags.set(kv.first, kv.second); + } + // We modified the GBWT (we assume some tags got set) + gbwts.unbacked(); + + report_time_memory("Tags set", start, config); +} + +void step_3_alter_gbwt(GBWTHandler& gbwts, GBWTConfig& config) { + if (!config.to_remove.empty()) { + remove_samples(gbwts, config); + } + if (!config.tags_to_set.empty()) { + set_tags(gbwts, config); + } +} + +//---------------------------------------------------------------------------- - cout << "Inserted " << total_inserted << " nodes in " << seconds << " seconds (" - << (total_inserted / seconds) << " nodes/second)" << endl; - cout << "Memory usage " << gbwt::inGigabytes(gbwt::memoryUsage()) << " GB" << endl; - cout << endl; +void step_4_path_cover(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& config) { + double start = gbwt::readTimer(); + if (config.show_progress) { + std::cerr << "Finding a " << config.num_paths << "-path cover with context length " << config.context_length << std::endl; + } + + graphs.get_graph(config); + + // We need to drop paths that are alt allele paths and not pass them + // through from a graph that has them to the synthesized GBWT. + std::function path_filter = [&graphs](const path_handle_t& path) { + return !Paths::is_alt(graphs.path_graph->get_path_name(path)); + }; + + if (config.path_cover == GBWTConfig::path_cover_greedy) { + if (config.show_progress) { + std::cerr << "Algorithm: greedy" << std::endl; + } + gbwt::GBWT cover = gbwtgraph::path_cover_gbwt(*(graphs.path_graph), + config.num_paths, + config.context_length, + config.haplotype_indexer.gbwt_buffer_size * gbwt::MILLION, + config.haplotype_indexer.id_interval, + config.include_named_paths, + &path_filter, + config.show_progress); + gbwts.use(cover); + } else if (config.path_cover == GBWTConfig::path_cover_augment) { + if (config.show_progress) { + std::cerr << "Algorithm: augment" << std::endl; + } + gbwts.use_dynamic(); + gbwtgraph::augment_gbwt(*(graphs.path_graph), + gbwts.dynamic, + config.num_paths, + config.context_length, + config.haplotype_indexer.gbwt_buffer_size * gbwt::MILLION, + config.haplotype_indexer.id_interval, + config.show_progress); + } else { + if (config.show_progress) { + std::cerr << "Algorithm: local haplotypes" << std::endl; } + gbwts.use_compressed(); + gbwt::GBWT cover = gbwtgraph::local_haplotypes(*(graphs.path_graph), + gbwts.compressed, + config.num_paths, + config.context_length, + config.haplotype_indexer.gbwt_buffer_size * gbwt::MILLION, + config.haplotype_indexer.id_interval, + config.include_named_paths, + &path_filter, + config.show_progress); + gbwts.use(cover); + } + gbwts.unbacked(); // We modified the GBWT. 
+ + report_time_memory("Path cover built", start, config); +} + +//---------------------------------------------------------------------------- + +void step_5_gbwtgraph(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& config) { + double start = gbwt::readTimer(); + if (config.show_progress) { + std::cerr << "Building GBWTGraph" << std::endl; + } + + gbwts.use_compressed(); + gbwtgraph::GBWTGraph graph; + if (graphs.in_use == GraphHandler::graph_source) { + graph = gbwtgraph::GBWTGraph(gbwts.compressed, *(graphs.sequence_source)); + } else if (graphs.in_use == GraphHandler::graph_gbz || graphs.in_use == GraphHandler::graph_gbwtgraph) { + graph = std::move(*(graphs.gbwt_graph)); + graphs.clear(); + } else { + graphs.get_graph(config); + if (config.show_progress) { + std::cerr << "Starting the construction" << std::endl; + } + graph = gbwtgraph::GBWTGraph(gbwts.compressed, *(graphs.path_graph), vg::algorithms::find_translation(graphs.path_graph.get())); + } + if (config.gbz_format) { + save_gbz(gbwts.compressed, graph, config.graph_output, config.show_progress); + } else { + save_gbwtgraph(graph, config.graph_output, config.show_progress); + } + + report_time_memory("GBWTGraph built", start, config); +} + +//---------------------------------------------------------------------------- + +void step_6_r_index(GBWTHandler& gbwts, GBWTConfig& config) { + double start = gbwt::readTimer(); + if (config.show_progress) { + std::cerr << "Building r-index" << std::endl; + } + + omp_set_num_threads(config.search_threads); + gbwts.use_compressed(); + if (config.show_progress) { + std::cerr << "Starting the construction" << std::endl; + } + gbwt::FastLocate r_index(gbwts.compressed); + save_r_index(r_index, config.r_index_name, config.show_progress); + + report_time_memory("R-index built", start, config); +} + +//---------------------------------------------------------------------------- - // Revert the hack. - cout.rdbuf(cout_buf); +void step_7_metadata(GBWTHandler& gbwts, GBWTConfig& config) { + gbwts.use_compressed(); + + // Use this to get the metadata object for the operations that need it, or + // fail if it's not there. + auto get_metadata = [&gbwts]() -> const gbwt::Metadata& { + if (!gbwts.compressed.hasMetadata()) { + std::cerr << "error: [vg gbwt] the GBWT does not contain metadata" << std::endl; + std::exit(EXIT_FAILURE); + } + return gbwts.compressed.metadata; + }; + + if (config.metadata) { + // Make sure the metadata exists. + get_metadata(); + print_metadata(std::cout, gbwts); } - if (count_threads) { - if (optind >= argc) { - cerr << "[vg gbwt] error: no input files given" << endl; - return 1; + if (config.contigs) { + auto& metadata = get_metadata(); + if (config.list_names) { + if (gbwts.compressed.metadata.hasContigNames()) { + for (size_t i = 0; i < metadata.contigs(); i++) { + std::cout << metadata.contig(i) << std::endl; + } + } else { + std::cerr << "error: [vg gbwt] the metadata does not contain contig names" << std::endl; + std::exit(EXIT_FAILURE); + } + } else { + std::cout << metadata.contigs() << std::endl; } + } - size_t total_threads = 0; - for (int i = optind; i < argc; i++) { - gbwt::GBWT index; - sdsl::load_from_file(index, argv[i]); - total_threads += index.sequences() / 2; // Ignore reverse complements. 
+ if (config.haplotypes) { + std::cout << get_metadata().haplotypes() << std::endl; + } + + if (config.samples) { + auto& metadata = get_metadata(); + if (config.list_names) { + if (metadata.hasSampleNames()) { + for (size_t i = 0; i < metadata.samples(); i++) { + std::cout << metadata.sample(i) << std::endl; + } + } else { + std::cerr << "error: [vg gbwt] the metadata does not contain sample names" << std::endl; + std::exit(EXIT_FAILURE); + } + } else { + std::cout << metadata.samples() << std::endl; } - cout << total_threads << endl; } - if (!thread_output.empty()) { - if (optind + 1 != argc) { - cerr << "[vg gbwt] error: option -e requires one input file" << endl; - return 1; + if (config.thread_names) { + auto& metadata = get_metadata(); + if (metadata.hasPathNames()) { + // Precompute some metadata + auto gbwt_reference_samples = gbwtgraph::parse_reference_samples_tag(gbwts.compressed); + for (size_t i = 0; i < metadata.paths(); i++) { + PathSense sense = gbwtgraph::get_path_sense(gbwts.compressed, i, gbwt_reference_samples); + std::cout << gbwtgraph::compose_path_name(gbwts.compressed, i, sense) << std::endl; + } + } else { + std::cerr << "error: [vg gbwt] the metadata does not contain thread names" << std::endl; + } + } + + if (config.tags) { + // This only needs GBWT tag metadata. + // TODO: the gbwt::Tags object doesn't have its own enumeration API. + // Just reach in and grab them. + for (auto& kv : gbwts.compressed.tags.tags) { + std::cout << kv.first << "\t" << kv.second << std::endl; } + } +} + +//---------------------------------------------------------------------------- - gbwt::GBWT index; - sdsl::load_from_file(index, argv[optind]); - gbwt::size_type node_width = gbwt::bit_length(index.sigma() - 1); - gbwt::text_buffer_type out(thread_output, std::ios::out, gbwt::MEGABYTE, node_width); - for (gbwt::size_type id = 0; id < index.sequences(); id += 2) { // Ignore reverse complements. - gbwt::vector_type sequence = index.extract(id); +void step_8_threads(GBWTHandler& gbwts, GBWTConfig& config) { + // Extract threads in SDSL format. + if (!config.thread_output.empty()) { + double start = gbwt::readTimer(); + if (config.show_progress) { + std::cerr << "Extracting threads to " << config.thread_output << std::endl; + } + gbwts.use_compressed(); + if (config.show_progress) { + std::cerr << "Starting the extraction" << std::endl; + } + gbwt::size_type node_width = gbwt::bit_length(gbwts.compressed.sigma() - 1); + gbwt::text_buffer_type out(config.thread_output, std::ios::out, gbwt::MEGABYTE, node_width); + for (gbwt::size_type id = 0; id < gbwts.compressed.sequences(); id += 2) { // Ignore reverse complements. + gbwt::vector_type sequence = gbwts.compressed.extract(id); for (auto node : sequence) { out.push_back(node); } out.push_back(gbwt::ENDMARKER); } out.close(); + report_time_memory("Threads extracted", start, config); } - return 0; + // There are two sequences for each thread. 
+ if (config.count_threads) { + gbwts.use_compressed(); + std::cout << (gbwts.compressed.sequences() / 2) << std::endl; + } } +//---------------------------------------------------------------------------- -// Register subcommand -static Subcommand vg_gbwt("gbwt", "Manipuate GBWTs", main_gbwt); +void GraphHandler::get_graph(const GBWTConfig& config) { + if (this->in_use == graph_path) { + return; + } else { + if (config.show_progress) { + std::cerr << "Loading input graph from " << config.graph_name << std::endl; + } + this->clear(); + this->path_graph = vg::io::VPKG::load_one(config.graph_name); + if (this->path_graph == nullptr) { + std::cerr << "error: [vg gbwt] cannot load graph " << config.graph_name << std::endl; + std::exit(EXIT_FAILURE); + } + this->in_use = graph_path; + } +} + +void GraphHandler::use(std::unique_ptr& source) { + this->clear(); + this->sequence_source = std::move(source); + this->in_use = graph_source; +} + +void GraphHandler::load_gbz(GBWTHandler& gbwts, GBWTConfig& config) { + if (this->in_use == graph_gbz) { + return; + } else { + this->clear(); + gbwtgraph::GBZ gbz; + vg::load_gbz(gbz, config.input_filenames.front(), config.show_progress); + gbwts.use(gbz.index); + this->gbwt_graph = std::make_unique(std::move(gbz.graph)); + this->gbwt_graph->set_gbwt(gbwts.compressed); + this->in_use = graph_gbz; + } +} + +void GraphHandler::load_gbwtgraph(GBWTHandler& gbwts, GBWTConfig& config) { + if (this->in_use == graph_gbwtgraph) { + return; + } else { + this->clear(); + // Load the GBWT + gbwt::GBWT input_gbwt; + vg::load_gbwt(input_gbwt, config.input_filenames.front(), config.show_progress); + gbwts.use(input_gbwt); + + // Then load the GBWTGraph + this->gbwt_graph = std::make_unique(); + vg::load_gbwtgraph(*this->gbwt_graph, config.gbwtgraph_name, config.show_progress); + // And connect it + this->gbwt_graph->set_gbwt(gbwts.compressed); + this->in_use = graph_gbwtgraph; + } +} + +void GraphHandler::clear() { + this->path_graph.reset(); + this->sequence_source.reset(); + this->gbwt_graph.reset(); + this->in_use = graph_none; +} + +void GraphHandler::serialize_segment_translation(const GBWTConfig& config) const { + double start = gbwt::readTimer(); + if (config.show_progress) { + std::cerr << "Serializing segment to node translation to " << config.segment_translation << std::endl; + } + std::ofstream out(config.segment_translation, std::ios_base::binary); + if (this->in_use == graph_source) { + if (this->sequence_source->uses_translation()) { + auto& translation = this->sequence_source->segment_translation; + for (auto iter = translation.begin(); iter != translation.end(); ++iter) { + out << "T\t" << iter->first << "\t" << iter->second.first; + for (nid_t i = iter->second.first + 1; i < iter->second.second; i++) { + out << "," << i; + } + out << "\n"; + } + } + } else if (this->in_use == graph_gbz) { + this->gbwt_graph->for_each_segment([&](const std::string& name, std::pair nodes) -> bool { + out << "T\t" << name << "\t" << nodes.first; + for (nid_t i = nodes.first + 1; i < nodes.second; i++) { + out << "," << i; + } + out << "\n"; + return true; + }); + } + + out.close(); + report_time_memory("Translation serialized", start, config); +} + +//---------------------------------------------------------------------------- + +void report_time_memory(const std::string& what, double start_time, const GBWTConfig& config) { + if (config.show_progress) { + double seconds = gbwt::readTimer() - start_time; + std::cerr << what << " in " << seconds << " seconds, " << 
gbwt::inGigabytes(gbwt::memoryUsage()) << " GiB" << std::endl; + std::cerr << std::endl; + } +} + +void print_metadata(std::ostream& out, const GBWTHandler& gbwts) { + if (gbwts.in_use == GBWTHandler::index_compressed) { + gbwt::operator<<(out, gbwts.compressed.metadata) << std::endl; + } else if (gbwts.in_use == GBWTHandler::index_dynamic) { + gbwt::operator<<(out, gbwts.dynamic.metadata) << std::endl; + } +} + +//---------------------------------------------------------------------------- + +// Register subcommand +static vg::subcommand::Subcommand vg_gbwt("gbwt", "build and manipulate GBWTs", vg::subcommand::TOOLKIT, main_gbwt); diff --git a/src/subcommand/genotype_main.cpp b/src/subcommand/genotype_main.cpp index 7714332658e..f062e956f58 100644 --- a/src/subcommand/genotype_main.cpp +++ b/src/subcommand/genotype_main.cpp @@ -1,12 +1,10 @@ #include #include "subcommand.hpp" -#include "index.hpp" -#include "stream.hpp" -#include "genotyper.hpp" -#include "genotypekit.hpp" -#include "variant_recall.hpp" -#include "stream.hpp" +#include +#include "../genotyper.hpp" +#include "../genotypekit.hpp" +#include "../variant_recall.hpp" /** * GAM sort main */ @@ -15,13 +13,12 @@ using namespace std; using namespace vg; using namespace vg::subcommand; void help_genotype(char** argv) { - cerr << "usage: " << argv[0] << " genotype [options] [reads.index/] > " << endl - << "Compute genotypes from a graph and an indexed collection of reads" << endl + cerr << "usage: " << argv[0] << " genotype [options] alignments.gam > " << endl + << "Compute genotypes from a graph and a collection of reads" << endl << endl << "options:" << endl << " -j, --json output in JSON" << endl << " -v, --vcf output in VCF" << endl - << " -G, --gam GAM a GAM file to use with variant recall (or in place of index)" << endl << " -V, --recall-vcf VCF recall variants in a specific VCF file." << endl << " -F, --fasta FASTA" << endl << " -I, --insertions INS" << endl @@ -35,7 +32,7 @@ void help_genotype(char** argv) { << " -A, --no_indel_realign disable indel realignment" << endl << " -d, --het_prior_denom denominator for prior probability of heterozygousness" << endl << " -P, --min_per_strand min unique reads per strand for a called allele to accept a call" << endl - << " -E, --no_embed dont embed gam edits into grpah" << endl + << " -E, --no_embed don't embed gam edits into graph" << endl << " -T, --traversal traversal finder to use {reads, exhaustive, representative, adaptive} (adaptive)" << endl << " -p, --progress show progress" << endl << " -t, --threads N number of threads to use" << endl; @@ -77,7 +74,6 @@ int main_genotype(int argc, char** argv) { string gam_file; string fasta; string insertions_file; - bool useindex = true; // Should we use mapping qualities? bool use_mapq = true; @@ -113,7 +109,6 @@ int main_genotype(int argc, char** argv) { {"progress", no_argument, 0, 'p'}, {"threads", required_argument, 0, 't'}, {"recall-vcf", required_argument, 0, 'V'}, - {"gam", required_argument, 0, 'G'}, {"fasta", required_argument, 0, 'F'}, {"insertions", required_argument, 0, 'I'}, {"call", no_argument, 0, 'z'}, @@ -123,7 +118,7 @@ int main_genotype(int argc, char** argv) { }; int option_index = 0; - c = getopt_long (argc, argv, "hjvr:c:s:o:l:a:QAd:P:pt:V:I:G:F:zET:", + c = getopt_long (argc, argv, "hjvr:c:s:o:l:a:QAd:P:pt:V:I:F:zET:", long_options, &option_index); /* Detect the end of the options. 
*/ @@ -196,10 +191,6 @@ int main_genotype(int argc, char** argv) { case 'F': fasta = optarg; break; - case 'G': - gam_file = optarg; - useindex = false; - break; case 'E': embed_gam_edits = false; break; @@ -234,6 +225,15 @@ int main_genotype(int argc, char** argv) { get_input_file(optind, argc, argv, [&](istream& in) { graph = new VG(in); }); + + // get GAM + if (optind < argc){ + gam_file = get_input_file_name(optind, argc, argv); + + } else { + cerr << "[vg genotype] GAM file must be specified as positional argument" << endl; + return 1; + } if (just_call){ string gamfi(gam_file); @@ -242,25 +242,6 @@ int main_genotype(int argc, char** argv) { exit(0); } - // setup reads index - string reads_index_name = ""; - if (optind < argc){ - reads_index_name = get_input_file_name(optind, argc, argv); - - } else { - if (gam_file.empty()) { - cerr << "[vg genotype] Index argument must be specified when not using -G" << endl; - return 1; - } - } - - // This holds the RocksDB index that has all our reads, indexed by the nodes they visit. - Index index; - if (useindex){ - index.open_read_only(reads_index_name); - gam_file = reads_index_name; - } - // Build the set of all the node IDs to operate on vector graph_ids; graph->for_each_node([&](Node* node) { @@ -280,7 +261,7 @@ int main_genotype(int argc, char** argv) { insertions.emplace_back(ins); ins->open(insertions_file); } - variant_recall(graph, vars, lin_ref, insertions, gam_file, useindex); + variant_recall(graph, vars, lin_ref, insertions, gam_file); return 0; } @@ -301,29 +282,19 @@ int main_genotype(int argc, char** argv) { return alignment.path().mapping_size() > 0; }; - if (useindex) { - // Extract all the alignments - index.for_alignment_to_nodes(graph_ids, [&](const Alignment& alignment) { - // Only take alignments that don't visit nodes not in the graph - if (alignment_contained(alignment)) { - alignments.push_back(alignment); - } - }); - } else { - // load in all reads (activated by passing GAM directly with -G). - // This is used by, ex., toil-vg, which has already used the gam index - // to extract relevant reads - ifstream gam_reads(gam_file.c_str()); - if (!gam_reads) { - cerr << "[vg genotype] Error opening gam: " << gam_file << endl; - return 1; - } - stream::for_each(gam_reads, [&alignments, &alignment_contained](Alignment& alignment) { - if (alignment_contained(alignment)) { - alignments.push_back(alignment); - } - }); + // load in all reads (activated by passing GAM directly with -G). + // This is used by, ex., toil-vg, which has already used the gam index + // to extract relevant reads + ifstream gam_reads(gam_file.c_str()); + if (!gam_reads) { + cerr << "[vg genotype] Error opening gam: " << gam_file << endl; + return 1; } + vg::io::for_each(gam_reads, [&alignments, &alignment_contained](Alignment& alignment) { + if (alignment_contained(alignment)) { + alignments.push_back(alignment); + } + }); if(show_progress) { cerr << "Loaded " << alignments.size() << " alignments" << endl; @@ -367,12 +338,7 @@ int main_genotype(int argc, char** argv) { AugmentedGraph augmented_graph; // Move our input graph into the augmented graph - // TODO: less terrible interface. also shouldn't have to re-index. swap(augmented_graph.graph, *graph); - swap(augmented_graph.graph.paths, graph->paths); - augmented_graph.graph.paths.rebuild_node_mapping(); - augmented_graph.graph.paths.rebuild_mapping_aux(); - augmented_graph.graph.paths.to_graph(augmented_graph.graph.graph); // Do the actual augmentation using vg edit. 
If augmentation was already // done, just embeds the reads. Reads will be taken by the AugmentedGraph diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp new file mode 100644 index 00000000000..85addf1c322 --- /dev/null +++ b/src/subcommand/giraffe_main.cpp @@ -0,0 +1,1696 @@ +/** + * \file giraffe_main.cpp: G(ir)AF (Graph Alignment Format) Fast Emitter: a fast short-read-to-haplotypes mapper + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "subcommand.hpp" +#include "options.hpp" + +#include "../snarl_seed_clusterer.hpp" +#include "../mapper.hpp" +#include "../annotation.hpp" +#include +#include +#include "../hts_alignment_emitter.hpp" +#include "../minimizer_mapper.hpp" +#include "../index_registry.hpp" +#include "../watchdog.hpp" +#include "../crash.hpp" +#include + +#include "../gbwtgraph_helper.hpp" +#include "../recombinator.hpp" + +#include +#include + +//#define USE_CALLGRIND + +#ifdef USE_CALLGRIND +#include +#endif + +#include +#ifdef __linux__ +#include +#include +/// Bind perf_event_open for counting instructions. +/// See +static long perf_event_open(struct perf_event_attr* hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) { + return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); +} +#endif + +using namespace std; +using namespace vg; +using namespace vg::subcommand; + +/// Options struct for options for the Giraffe driver (i.e. this file) +struct GiraffeMainOptions { + /// How long should we wait while mapping a read before complaining, in seconds. + static constexpr size_t default_watchdog_timeout = 10; + size_t watchdog_timeout = default_watchdog_timeout; +}; + +/// Options struct for scoring-related parameters. Defaults are in aligner.hpp. 
+struct ScoringOptions { + int8_t match = default_match; + int8_t mismatch = default_mismatch; + int8_t gap_open = default_gap_open; + int8_t gap_extend = default_gap_extension; + int8_t full_length_bonus = default_full_length_bonus; +}; + +static GroupedOptionGroup get_options() { + GroupedOptionGroup parser; + + // Configure Giraffe program settings + auto& main_opts = parser.add_group("program options"); + main_opts.add_range( + "watchdog-timeout", + &GiraffeMainOptions::watchdog_timeout, + GiraffeMainOptions::default_watchdog_timeout, + "complain after INT seconds working on a read or read pair" + ); + + // Configure scoring + auto& scoring_opts = parser.add_group("scoring options"); + scoring_opts.add_range( + "match", + &ScoringOptions::match, + default_match, + "use this match score" + ); + scoring_opts.add_range( + "mismatch", + &ScoringOptions::mismatch, + default_mismatch, + "use this mismatch penalty" + ); + scoring_opts.add_range( + "gap-open", + &ScoringOptions::gap_open, + default_gap_open, + "use this gap open penalty" + ); + scoring_opts.add_range( + "gap-extend", + &ScoringOptions::gap_extend, + default_gap_extension, + "use this gap extension penalty" + ); + scoring_opts.add_range( + "full-l-bonus", + &ScoringOptions::full_length_bonus, + default_full_length_bonus, + "the full-length alignment bonus" + ); + + // Configure output settings on the MinimizerMapper + auto& result_opts = parser.add_group("result options"); + result_opts.add_range( + "max-multimaps", 'M', + &MinimizerMapper::max_multimaps, + MinimizerMapper::default_max_multimaps, + "produce up to INT alignments for each read" + ); + + // Configure normal Giraffe mapping computation + auto& comp_opts = parser.add_group("computational parameters"); + comp_opts.add_range( + "hit-cap", 'c', + &MinimizerMapper::hit_cap, + MinimizerMapper::default_hit_cap, + "use all minimizers with at most INT hits" + ); + comp_opts.add_range( + "hard-hit-cap", 'C', + &MinimizerMapper::hard_hit_cap, + MinimizerMapper::default_hard_hit_cap, + "ignore all minimizers with more than INT hits" + ); + comp_opts.add_range( + "score-fraction", 'F', + &MinimizerMapper::minimizer_score_fraction, + MinimizerMapper::default_minimizer_score_fraction, + "select minimizers between hit caps until score is FLOAT of total" + ); + comp_opts.add_range( + "max-min", 'U', + &MinimizerMapper::max_unique_min, + MinimizerMapper::default_max_unique_min, + "use at most INT minimizers", + size_t_is_nonzero + ); + comp_opts.add_range( + "num-bp-per-min", + &MinimizerMapper::num_bp_per_min, + MinimizerMapper::default_num_bp_per_min, + "use maximum of number minimizers calculated by READ_LENGTH / INT and --max-min" + ); + comp_opts.add_range( + "distance-limit", 'D', + &MinimizerMapper::distance_limit, + MinimizerMapper::default_distance_limit, + "cluster using this distance limit" + ); + comp_opts.add_range( + "max-extensions", 'e', + &MinimizerMapper::max_extensions, + MinimizerMapper::default_max_extensions, + "extend up to INT clusters" + ); + comp_opts.add_range( + "max-alignments", 'a', + &MinimizerMapper::max_alignments, + MinimizerMapper::default_max_alignments, + "align up to INT extensions" + ); + comp_opts.add_range( + "cluster-score", 's', + &MinimizerMapper::cluster_score_threshold, + MinimizerMapper::default_cluster_score_threshold, + "only extend clusters if they are within INT of the best score", + double_is_nonnegative + ); + comp_opts.add_range( + "pad-cluster-score", 'S', + &MinimizerMapper::pad_cluster_score_threshold, + 
MinimizerMapper::default_pad_cluster_score_threshold, + "also extend clusters within INT of above threshold to get a second-best cluster", + double_is_nonnegative + ); + comp_opts.add_range( + "cluster-coverage", 'u', + &MinimizerMapper::cluster_coverage_threshold, + MinimizerMapper::default_cluster_coverage_threshold, + "only extend clusters if they are within FLOAT of the best read coverage", + double_is_nonnegative + ); + comp_opts.add_range( + "extension-score", 'v', + &MinimizerMapper::extension_score_threshold, + MinimizerMapper::default_extension_score_threshold, + "only align extensions if their score is within INT of the best score", + int_is_nonnegative + ); + comp_opts.add_range( + "extension-set", 'w', + &MinimizerMapper::extension_set_score_threshold, + MinimizerMapper::default_extension_set_score_threshold, + "only align extension sets if their score is within INT of the best score", + double_is_nonnegative + ); + comp_opts.add_flag( + "no-dp", 'O', + &MinimizerMapper::do_dp, + MinimizerMapper::default_do_dp, + "disable all gapped alignment" + ); + comp_opts.add_range( + "rescue-attempts", 'r', + &MinimizerMapper::max_rescue_attempts, + MinimizerMapper::default_max_rescue_attempts, + "attempt up to INT rescues per read in a pair" + ); + comp_opts.add_range( + "max-fragment-length", 'L', + &MinimizerMapper::max_fragment_length, + MinimizerMapper::default_max_fragment_length, + "assume that fragment lengths should be smaller than INT when estimating the fragment length distribution" + ); + comp_opts.add_flag( + "exclude-overlapping-min", + &MinimizerMapper::exclude_overlapping_min, + MinimizerMapper::default_exclude_overlapping_min, + "exclude overlapping minimizers" + ); + comp_opts.add_range( + "paired-distance-limit", + &MinimizerMapper::paired_distance_stdevs, + MinimizerMapper::default_paired_distance_stdevs, + "cluster pairs of read using a distance limit FLOAT standard deviations greater than the mean" + ); + comp_opts.add_range( + "rescue-subgraph-size", + &MinimizerMapper::rescue_subgraph_stdevs, + MinimizerMapper::default_rescue_subgraph_stdevs, + "search for rescued alignments FLOAT standard deviations greater than the mean" + ); + comp_opts.add_range( + "rescue-seed-limit", + &MinimizerMapper::rescue_seed_limit, + MinimizerMapper::default_rescue_seed_limit, + "attempt rescue with at most INT seeds" + ); + + // Configure chaining + auto& chaining_opts = parser.add_group("long-read/chaining parameters"); + chaining_opts.add_flag( + "align-from-chains", + &MinimizerMapper::align_from_chains, + MinimizerMapper::default_align_from_chains, + "chain up extensions to create alignments, instead of doing each separately" + ); + chaining_opts.add_range( + "chaining-cluster-distance", + &MinimizerMapper::chaining_cluster_distance, + MinimizerMapper::default_chaining_cluster_distance, + "maximum distance to cluster over before chaining" + ); + chaining_opts.add_range( + "precluster-connection-coverage-threshold", + &MinimizerMapper::precluster_connection_coverage_threshold, + MinimizerMapper::default_precluster_connection_coverage_threshold, + "threshold of precluster pair coverage below the base, after which to stop reseeding between preclusters" + ); + chaining_opts.add_range( + "min-precluster-connections", + &MinimizerMapper::min_precluster_connections, + MinimizerMapper::default_min_precluster_connections, + "minimum number of precluster connections to reseed over" + ); + chaining_opts.add_range( + "max-precluster-connections", + 
&MinimizerMapper::max_precluster_connections, + MinimizerMapper::default_max_precluster_connections, + "maximum number of precluster connections to reseed over" + ); + chaining_opts.add_range( + "max-lookback-bases", + &MinimizerMapper::max_lookback_bases, + MinimizerMapper::default_max_lookback_bases, + "maximum distance to look back when chaining" + ); + chaining_opts.add_range( + "min-lookback-items", + &MinimizerMapper::min_lookback_items, + MinimizerMapper::default_min_lookback_items, + "minimum items to consider coming from when chaining" + ); + chaining_opts.add_range( + "lookback-item-hard-cap", + &MinimizerMapper::lookback_item_hard_cap, + MinimizerMapper::default_lookback_item_hard_cap, + "maximum items to consider coming from when chaining" + ); + + chaining_opts.add_range( + "chain-score-threshold", + &MinimizerMapper::chain_score_threshold, + MinimizerMapper::default_chain_score_threshold, + "only align chains if their score is within this many points of the best score", + double_is_nonnegative + ); + chaining_opts.add_range( + "min-chains", + &MinimizerMapper::min_chains, + MinimizerMapper::default_min_chains, + "ignore score threshold to get this many chains aligned", + int_is_nonnegative + ); + chaining_opts.add_range( + "chain-min-score", + &MinimizerMapper::chain_min_score, + MinimizerMapper::default_chain_min_score, + "do not align chains with less than this score", + int_is_nonnegative + ); + + chaining_opts.add_range( + "max-chain-connection", + &MinimizerMapper::max_chain_connection, + MinimizerMapper::default_max_chain_connection, + "maximum distance across which to connect seeds when aligning a chain" + ); + chaining_opts.add_range( + "max-tail-length", + &MinimizerMapper::max_tail_length, + MinimizerMapper::default_max_tail_length, + "maximum length of a tail to align before forcing softclipping when aligning a chain" + ); + chaining_opts.add_range( + "max-dp-cells", + &MinimizerMapper::max_dp_cells, + MinimizerMapper::default_max_dp_cells, + "maximum number of alignment cells to allow in a tail with GSSW" + ); + return parser; +} + +// Try stripping all suffixes in the vector, one at a time, and return on failure. +std::string strip_suffixes(std::string filename, const std::vector& suffixes) { + for (const std::string& suffix : suffixes) { + if (filename.length() > suffix.length() && filename.substr(filename.length() - suffix.length()) == suffix) { + filename = filename.substr(0, filename.length() - suffix.length()); + } else { + break; + } + } + return filename; +} + +// Returns the name of the sampled GBZ. +string sample_haplotypes(const vector>& indexes, string& basename, string& sample_name, string& haplotype_file, string& kff_file, bool progress); + +//---------------------------------------------------------------------------- + +void help_giraffe(char** argv, const BaseOptionGroup& parser, bool full_help) { + cerr + << "usage:" << endl + << " " << argv[0] << " giraffe -Z graph.gbz [-d graph.dist -m graph.min] [other options] > output.gam" << endl + << " " << argv[0] << " giraffe -Z graph.gbz --haplotype-name graph.hapl --kff-name sample.kff [other options] > output.gam" << endl + << endl + << "Fast haplotype-aware short read mapper." 
<< endl + << endl; + + cerr + << "basic options:" << endl + << " -Z, --gbz-name FILE map to this GBZ graph" << endl + << " -d, --dist-name FILE cluster using this distance index" << endl + << " -m, --minimizer-name FILE use this minimizer index" << endl + << " -p, --progress show progress" << endl + << " -t, --threads INT number of mapping threads to use" << endl + << " -b, --parameter-preset NAME set computational parameters (fast / default) [default]" << endl + << " -h, --help print full help with all available options" << endl; + + cerr + << "input options:" << endl + << " -G, --gam-in FILE read and realign GAM-format reads from FILE" << endl + << " -f, --fastq-in FILE read and align FASTQ-format reads from FILE (two are allowed, one for each mate)" << endl + << " -i, --interleaved GAM/FASTQ input is interleaved pairs, for paired-end alignment" << endl; + + cerr + << "haplotype sampling:" << endl + << " --haplotype-name FILE sample from haplotype information in FILE" << endl + << " --kff-name FILE sample according to kmer counts in FILE" << endl + << " --index-basename STR name prefix for generated graph/index files (default: from graph name)" << endl; + + cerr + << "alternate graphs:" << endl + << " -x, --xg-name FILE map to this graph (if no -Z / -g), or use this graph for HTSLib output" << endl + << " -g, --graph-name FILE map to this GBWTGraph (if no -Z)" << endl + << " -H, --gbwt-name FILE use this GBWT index (when mapping to -x / -g)" << endl; + + cerr + << "output options:" << endl + << " -N, --sample NAME add this sample name" << endl + << " -R, --read-group NAME add this read group" << endl + << " -o, --output-format NAME output the alignments in NAME format (gam / gaf / json / tsv / SAM / BAM / CRAM) [gam]" << endl + << " --ref-paths FILE ordered list of paths in the graph, one per line or HTSlib .dict, for HTSLib @SQ headers" << endl + << " --named-coordinates produce GAM outputs in named-segment (GFA) space" << endl; + if (full_help) { + cerr + << " -P, --prune-low-cplx prune short and low complexity anchors during linear format realignment" << endl + << " -n, --discard discard all output alignments (for profiling)" << endl + << " --output-basename NAME write output to a GAM file beginning with the given prefix for each setting combination" << endl + << " --report-name NAME write a TSV of output file and mapping speed to the given file" << endl + << " --show-work log how the mapper comes to its conclusions about mapping locations" << endl; + } + + if (full_help) { + cerr + << "Giraffe parameters:" << endl + << " -A, --rescue-algorithm NAME use algorithm NAME for rescue (none / dozeu / gssw) [dozeu]" << endl + << " --fragment-mean FLOAT force the fragment length distribution to have this mean (requires --fragment-stdev)" << endl + << " --fragment-stdev FLOAT force the fragment length distribution to have this standard deviation (requires --fragment-mean)" << endl + << " --track-provenance track how internal intermediate alignment candidates were arrived at" << endl + << " --track-correctness track if internal intermediate alignment candidates are correct (implies --track-provenance)" << endl + << " -B, --batch-size INT number of reads or pairs per batch to distribute to threads [" << vg::io::DEFAULT_PARALLEL_BATCHSIZE << "]" << endl; + + auto helps = parser.get_help(); + print_table(helps, cerr); + } +} + +//---------------------------------------------------------------------------- + +int main_giraffe(int argc, char** argv) { + + std::chrono::time_point launch = 
std::chrono::system_clock::now(); + + // For haplotype sampling. + gbwt::Verbosity::set(gbwt::Verbosity::SILENT); + + // Set up to parse options + GroupedOptionGroup parser = get_options(); + + if (argc == 2) { + help_giraffe(argv, parser, false); + return 1; + } + + #define OPT_OUTPUT_BASENAME 1001 + #define OPT_REPORT_NAME 1002 + #define OPT_TRACK_PROVENANCE 1003 + #define OPT_TRACK_CORRECTNESS 1004 + #define OPT_FRAGMENT_MEAN 1005 + #define OPT_FRAGMENT_STDEV 1006 + #define OPT_REF_PATHS 1010 + #define OPT_SHOW_WORK 1011 + #define OPT_NAMED_COORDINATES 1012 + constexpr int OPT_HAPLOTYPE_NAME = 1100; + constexpr int OPT_KFF_NAME = 1101; + constexpr int OPT_INDEX_BASENAME = 1102; + + // initialize parameters with their default options + + // This holds and manages finding our indexes. + IndexRegistry registry = VGIndexes::get_vg_index_registry(); + + // Indexes provided to IndexRegistry in the arguments. We do not apply them + // immediately, because we may want to do haplotype sampling. + vector> provided_indexes; + string index_basename, index_basename_override; + + // For haplotype sampling. + string haplotype_name, kff_name; + + string output_basename; + string report_name; + bool show_progress = false; + + // Main Giraffe program options struct + // Not really initialized until after we load all the indexes though... + GiraffeMainOptions main_options; + // Scoring options struct + ScoringOptions scoring_options; + // What GAM should we realign? + string gam_filename; + // What FASTQs should we align. + // Note: multiple FASTQs are not interpreted as paired. + string fastq_filename_1; + string fastq_filename_2; + // Is the input interleaved/are we in paired-end mode? + bool interleaved = false; + // True if fastq_filename_2 or interleaved is set. + bool paired = false; + string param_preset = "default"; + //Attempt up to this many rescues of reads with no pairs + bool forced_rescue_attempts = false; + // Which rescue algorithm do we use? + MinimizerMapper::RescueAlgorithm rescue_algorithm = MinimizerMapper::rescue_dozeu; + //Did we force the fragment length distribution? + bool forced_mean = false; + //And if so what is it? + double fragment_mean = 0.0; + bool forced_stdev = false; + double fragment_stdev = 0.0; + // How many pairs should we be willing to buffer before giving up on fragment length estimation? + size_t MAX_BUFFERED_PAIRS = 100000; + // What sample name if any should we apply? + string sample_name; + // What read group if any should we apply? + string read_group; + // Should we track candidate provenance? + bool track_provenance = MinimizerMapper::default_track_provenance; + // Should we track candidate correctness? + bool track_correctness = MinimizerMapper::default_track_correctness; + // Should we log our mapping decision making? + bool show_work = MinimizerMapper::default_show_work; + + // Should we throw out our alignments instead of outputting them? + bool discard_alignments = false; + // How many reads per batch to run at a time? + uint64_t batch_size = vg::io::DEFAULT_PARALLEL_BATCHSIZE; + + // Chain all the ranges and get a function that loops over all combinations. + auto for_each_combo = parser.get_iterator(); + + + // Formats for alignment output. + std::string output_format = "GAM"; + std::set output_formats = { "GAM", "GAF", "JSON", "TSV", "SAM", "BAM", "CRAM" }; + + // For HTSlib formats, where do we get sequence header info? + std::string ref_paths_name; + // And should we drop low complexity anchors when surjectng? 
+ bool prune_anchors = false; + + // For GAM format, should we report in named-segment space instead of node ID space? + bool named_coordinates = false; + + // Map algorithm names to rescue algorithms + std::map rescue_algorithms = { + { "none", MinimizerMapper::rescue_none }, + { "dozeu", MinimizerMapper::rescue_dozeu }, + { "gssw", MinimizerMapper::rescue_gssw }, + }; + std::map algorithm_names = { + { MinimizerMapper::rescue_none, "none" }, + { MinimizerMapper::rescue_dozeu, "dozeu" }, + { MinimizerMapper::rescue_gssw, "gssw" }, + }; + + // Map preset names to presets + std::map presets; + // We have a fast preset that sets a bunch of stuff + presets["fast"] + .add_entry("hit-cap", 10) + .add_entry("hard-hit-cap", 500) + .add_entry("score-fraction", 0.5) + .add_entry("max-multimaps", 1) + .add_entry("max-extensions", 400) + .add_entry("max-alignments", 8) + .add_entry("cluster-score", 50) + .add_entry("pad-cluster-score", 0) + .add_entry("cluster-coverage", 0.2) + .add_entry("extension-set", 20) + .add_entry("extension-score", 1); + // And a default preset that doesn't. + presets["default"]; + // And a chaining preset (TODO: make into PacBio and Nanopore) + presets["chaining"] + .add_entry("align-from-chains", true) + .add_entry("watchdog-timeout", 30); + + std::vector long_options = + { + {"help", no_argument, 0, 'h'}, + {"gbz-name", required_argument, 0, 'Z'}, + {"xg-name", required_argument, 0, 'x'}, + {"graph-name", required_argument, 0, 'g'}, + {"gbwt-name", required_argument, 0, 'H'}, + {"minimizer-name", required_argument, 0, 'm'}, + {"dist-name", required_argument, 0, 'd'}, + {"progress", no_argument, 0, 'p'}, + {"haplotype-name", required_argument, 0, OPT_HAPLOTYPE_NAME}, + {"kff-name", required_argument, 0, OPT_KFF_NAME}, + {"index-basename", required_argument, 0, OPT_INDEX_BASENAME}, + {"gam-in", required_argument, 0, 'G'}, + {"fastq-in", required_argument, 0, 'f'}, + {"interleaved", no_argument, 0, 'i'}, + {"max-multimaps", required_argument, 0, 'M'}, + {"sample", required_argument, 0, 'N'}, + {"read-group", required_argument, 0, 'R'}, + {"output-format", required_argument, 0, 'o'}, + {"ref-paths", required_argument, 0, OPT_REF_PATHS}, + {"prune-low-cplx", no_argument, 0, 'P'}, + {"named-coordinates", no_argument, 0, OPT_NAMED_COORDINATES}, + {"discard", no_argument, 0, 'n'}, + {"output-basename", required_argument, 0, OPT_OUTPUT_BASENAME}, + {"report-name", required_argument, 0, OPT_REPORT_NAME}, + {"fast-mode", no_argument, 0, 'b'}, + {"rescue-algorithm", required_argument, 0, 'A'}, + {"fragment-mean", required_argument, 0, OPT_FRAGMENT_MEAN }, + {"fragment-stdev", required_argument, 0, OPT_FRAGMENT_STDEV }, + {"track-provenance", no_argument, 0, OPT_TRACK_PROVENANCE}, + {"track-correctness", no_argument, 0, OPT_TRACK_CORRECTNESS}, + {"show-work", no_argument, 0, OPT_SHOW_WORK}, + {"batch-size", required_argument, 0, 'B'}, + {"threads", required_argument, 0, 't'}, + }; + parser.make_long_options(long_options); + long_options.push_back({0, 0, 0, 0}); + + std::string short_options = "hZ:x:g:H:m:d:pG:f:iM:N:R:o:Pnb:B:t:A:"; + parser.make_short_options(short_options); + + int c; + optind = 2; // force optind past command positional argument + while (true) { + + + int option_index = 0; + c = getopt_long (argc, argv, short_options.c_str(), + &long_options[0], &option_index); + + + // Detect the end of the options. 
+ if (c == -1) + break; + + if (parser.parse(c, optarg)) { + // Parser took care of it + continue; + } + + // Otherwise handle it manually + switch (c) + { + case 'Z': + if (!optarg || !*optarg) { + cerr << "error:[vg giraffe] Must provide GBZ file with -Z." << endl; + exit(1); + } + if (!std::ifstream(optarg).is_open()) { + cerr << "error:[vg giraffe] Couldn't open GBZ file " << optarg << endl; + exit(1); + } + provided_indexes.emplace_back("Giraffe GBZ", optarg); + + // If we have a GBZ we probably want to use its name as the base name. + // But see -g. + index_basename = strip_suffixes(std::string(optarg), { ".gbz", ".giraffe" }); + + break; + + case 'x': + if (!optarg || !*optarg) { + cerr << "error:[vg giraffe] Must provide graph file with -x." << endl; + exit(1); + } + if (!std::ifstream(optarg).is_open()) { + cerr << "error:[vg giraffe] Couldn't open graph file " << optarg << endl; + exit(1); + } + provided_indexes.emplace_back("XG", optarg); + + // If we have an xg we probably want to use its name as the base name. + // But see -g. + index_basename = split_ext(optarg).first; + + break; + + case 'g': + if (!optarg || !*optarg) { + cerr << "error:[vg giraffe] Must provide GBWTGraph file with -g." << endl; + exit(1); + } + if (!std::ifstream(optarg).is_open()) { + cerr << "error:[vg giraffe] Couldn't open GBWTGraph file " << optarg << endl; + exit(1); + } + provided_indexes.emplace_back("GBWTGraph", optarg); + + // But if we have a GBWTGraph we probably want to use *its* name as the base name. + // Whichever is specified last will win, unless we also have a FASTA input name. + index_basename = split_ext(optarg).first; + + break; + + case 'H': + if (!optarg || !*optarg) { + cerr << "error:[vg giraffe] Must provide GBWT file with -H." << endl; + exit(1); + } + if (!std::ifstream(optarg).is_open()) { + cerr << "error:[vg giraffe] Couldn't open GBWT file " << optarg << endl; + exit(1); + } + provided_indexes.emplace_back("Giraffe GBWT", optarg); + break; + + case 'm': + if (!optarg || !*optarg) { + cerr << "error:[vg giraffe] Must provide minimizer file with -m." << endl; + exit(1); + } + if (!std::ifstream(optarg).is_open()) { + cerr << "error:[vg giraffe] Couldn't open minimizer file " << optarg << endl; + exit(1); + } + provided_indexes.emplace_back("Minimizers", optarg); + break; + + case 'd': + if (!optarg || !*optarg) { + cerr << "error:[vg giraffe] Must provide distance index file with -d." << endl; + exit(1); + } + if (!std::ifstream(optarg).is_open()) { + cerr << "error:[vg giraffe] Couldn't open distance index file " << optarg << endl; + exit(1); + } + provided_indexes.emplace_back("Giraffe Distance Index", optarg); + break; + + case 'p': + show_progress = true; + break; + + case OPT_HAPLOTYPE_NAME: + haplotype_name = optarg; + break; + case OPT_KFF_NAME: + kff_name = optarg; + break; + case OPT_INDEX_BASENAME: + index_basename_override = optarg; + break; + + case 'G': + gam_filename = optarg; + if (gam_filename.empty()) { + cerr << "error:[vg giraffe] Must provide GAM file with -G." << endl; + exit(1); + } + break; + + case 'f': + if (fastq_filename_1.empty()) { + fastq_filename_1 = optarg; + if (fastq_filename_1.empty()) { + cerr << "error:[vg giraffe] Must provide FASTQ file with -f." << endl; + exit(1); + } + } + else if (fastq_filename_2.empty()) { + fastq_filename_2 = optarg; + if (fastq_filename_2.empty()) { + cerr << "error:[vg giraffe] Must provide FASTQ file with -f." 
<< endl; + exit(1); + } + paired = true; + } else { + cerr << "error:[vg giraffe] Cannot specify more than two FASTQ files." << endl; + exit(1); + } + break; + + case 'i': + interleaved = true; + paired = true; + break; + + case 'N': + sample_name = optarg; + break; + + case 'R': + read_group = optarg; + break; + + case 'o': + { + output_format = optarg; + for (char& c : output_format) { + c = std::toupper(c); + } + if (output_formats.find(output_format) == output_formats.end()) { + std::cerr << "error: [vg giraffe] Invalid output format: " << optarg << std::endl; + std::exit(1); + } + } + break; + + case OPT_REF_PATHS: + ref_paths_name = optarg; + break; + + case 'P': + prune_anchors = true; + break; + + case OPT_NAMED_COORDINATES: + named_coordinates = true; + break; + + case 'n': + discard_alignments = true; + break; + + case OPT_OUTPUT_BASENAME: + output_basename = optarg; + break; + + case OPT_REPORT_NAME: + report_name = optarg; + break; + case 'b': + param_preset = optarg; + { + auto found = presets.find(param_preset); + if (found == presets.end()) { + // Complain this isn't a preset. + std::cerr << "error: [vg giraffe] invalid parameter preset: " << param_preset << std::endl; + exit(1); + } else { + // Apply the preset values. + found->second.apply(parser); + } + } + break; + + case 'A': + { + std::string algo_name = optarg; + for (char& c : algo_name) { + c = std::tolower(c); + } + auto iter = rescue_algorithms.find(algo_name); + if (iter == rescue_algorithms.end()) { + std::cerr << "error: [vg giraffe] Invalid rescue algorithm: " << optarg << std::endl; + std::exit(1); + } + rescue_algorithm = iter->second; + } + break; + + case OPT_FRAGMENT_MEAN: + forced_mean = true; + fragment_mean = parse(optarg); + break; + + case OPT_FRAGMENT_STDEV: + forced_stdev = true; + fragment_stdev = parse(optarg); + break; + + case OPT_TRACK_PROVENANCE: + track_provenance = true; + break; + + case OPT_TRACK_CORRECTNESS: + track_provenance = true; + track_correctness = true; + break; + + case OPT_SHOW_WORK: + show_work = true; + // Also turn on saving explanations + Explainer::save_explanations = true; + break; + + case 'B': + batch_size = parse(optarg); + break; + + case 't': + { + int num_threads = parse(optarg); + if (num_threads <= 0) { + cerr << "error:[vg giraffe] Thread count (-t) set to " << num_threads << ", must set to a positive integer." << endl; + exit(1); + } + omp_set_num_threads(num_threads); + } + break; + + case 'h': + case '?': + default: + help_giraffe(argv, parser, true); + exit(1); + break; + } + } + + + // Get positional arguments before validating user intent + if (have_input_file(optind, argc, argv)) { + // Must be the FASTA, but check. + + string fasta_filename = get_input_file_name(optind, argc, argv); + + auto fasta_parts = split_ext(fasta_filename); + if (fasta_parts.second == "gz") { + fasta_parts = split_ext(fasta_parts.first); + } + if (fasta_parts.second != "fa" && fasta_parts.second != "fasta" && fasta_parts.second != "fna") { + cerr << "error:[vg giraffe] FASTA file " << fasta_filename << " is not named like a FASTA" << endl; + exit(1); + } + + provided_indexes.emplace_back("Reference FASTA", fasta_filename); + // Everything else should be named like the FASTA by default + index_basename = fasta_parts.first; + + if (have_input_file(optind, argc, argv)) { + // Next one must be VCF, but check. + // TODO: Unify with FASTA check? 
+ + string vcf_filename = get_input_file_name(optind, argc, argv); + + auto vcf_parts = split_ext(vcf_filename); + if (vcf_parts.second == "gz") { + vcf_parts = split_ext(vcf_parts.first); + } + if (vcf_parts.second != "vcf") { + cerr << "error:[vg giraffe] VCF file " << vcf_filename << " is not named like a VCF" << endl; + exit(1); + } + + // Determine if it is phased or not + string file_type = IndexRegistry::vcf_is_phased(vcf_filename) ? "VCF w/ Phasing" : "VCF"; + + // Feed it to the index registry to maybe use + provided_indexes.emplace_back(file_type, vcf_filename); + } + } + + // If we don't want rescue, let the user see we don't try it. + if (parser.get_option_value("rescue-attempts") == 0 || rescue_algorithm == MinimizerMapper::rescue_none) { + // Replace any parsed values + parser.set_option_value("rescue-attempts", 0); + rescue_algorithm = MinimizerMapper::rescue_none; + } + + // Now all the arguments are parsed, so see if they make sense + + // Decide if we are outputting to an htslib format + bool hts_output = (output_format == "SAM" || output_format == "BAM" || output_format == "CRAM"); + + if (!ref_paths_name.empty() && !hts_output) { + cerr << "warning:[vg giraffe] Reference path file (--ref-paths) is only used when output format (-o) is SAM, BAM, or CRAM." << endl; + ref_paths_name = ""; + } + + if (output_format != "GAM" && !output_basename.empty()) { + cerr << "error:[vg giraffe] Using an output basename (--output-basename) only makes sense for GAM format (-o)" << endl; + exit(1); + } + + if (interleaved && !fastq_filename_2.empty()) { + cerr << "error:[vg giraffe] Cannot designate both interleaved paired ends (-i) and separate paired end file (-f)." << endl; + exit(1); + } + + if (!fastq_filename_1.empty() && !gam_filename.empty()) { + cerr << "error:[vg giraffe] Cannot designate both FASTQ input (-f) and GAM input (-G) in same run." << endl; + exit(1); + } + + if (have_input_file(optind, argc, argv)) { + // TODO: work out how to interpret additional files as reads. + cerr << "error:[vg giraffe] Extraneous input file: " << get_input_file_name(optind, argc, argv) << endl; + exit(1); + } + + if ((forced_mean && ! forced_stdev) || (!forced_mean && forced_stdev)) { + cerr << "warning:[vg giraffe] Both a mean and standard deviation must be specified for the fragment length distribution" << endl; + cerr << " Detecting fragment length distribution automatically" << endl; + forced_mean = false; + forced_stdev = false; + fragment_mean = 0.0; + fragment_stdev = 0.0; + } + if ((forced_mean || forced_stdev || forced_rescue_attempts) && (!paired)) { + cerr << "warning:[vg giraffe] Attempting to set paired-end parameters but running in single-end mode" << endl; + } + + bool haplotype_sampling = !haplotype_name.empty() & !kff_name.empty(); + if (!index_basename_override.empty()) { + index_basename = index_basename_override; + } + if (haplotype_sampling) { + // If we do haplotype sampling, we get a new GBZ and later build indexes for it. + string gbz_name = sample_haplotypes(provided_indexes, index_basename, sample_name, haplotype_name, kff_name, show_progress); + registry.provide("Giraffe GBZ", gbz_name); + index_basename = split_ext(gbz_name).first; + } else { + // Otherwise we use the provided indexes. + for (auto& index : provided_indexes) { + registry.provide(index.first, index.second); + } + } + registry.set_prefix(index_basename); + + // The IndexRegistry doesn't try to infer index files based on the + // basename, so do that here. 
We can have multiple extension options that + // we try in order of priority. + unordered_map> indexes_and_extensions = { + {"Giraffe GBZ", {"giraffe.gbz", "gbz"}}, + {"XG", {"xg"}}, + {"Giraffe GBWT", {"gbwt"}}, + {"GBWTGraph", {"gg"}}, + {"Giraffe Distance Index", {"dist"}}, + {"Minimizers", {"min"}} + }; + for (auto& completed : registry.completed_indexes()) { + // Drop anything we already got from the list + indexes_and_extensions.erase(completed); + } + for (auto& index_and_extensions : indexes_and_extensions) { + // For each index type + for (auto& extension : index_and_extensions.second) { + // For each extension in priority order + string inferred_filename = registry.get_prefix() + "." + extension; + if (ifstream(inferred_filename).is_open()) { + // A file with the appropriate name exists and we can read it. + if (haplotype_sampling) { + // If we did haplotype sampling, we are going to overwrite existing indexes. + cerr << "warning:[vg giraffe] " << inferred_filename << " exists and will be overwritten" << endl; + } else { + // Report it because this may not be desired behavior. + cerr << "Guessing that " << inferred_filename << " is " << index_and_extensions.first << endl; + registry.provide(index_and_extensions.first, inferred_filename); + } + // Skip other extension options for the index + break; + } + } + } + + // create in-memory objects + + // Don't try and use all the memory. + // TODO: add memory options like autoindex? + registry.set_target_memory_usage(IndexRegistry::get_system_memory() / 2); + + auto index_targets = VGIndexes::get_default_giraffe_indexes(); + +#ifdef debug + for (auto& needed : index_targets) { + cerr << "Want index: " << needed << endl; + } +#endif + + try { + if (show_progress) { + cerr << "Preparing Indexes" << endl; + } + registry.make_indexes(index_targets); + } + catch (InsufficientInputException ex) { + cerr << "error:[vg giraffe] Input is not sufficient to create indexes" << endl; + cerr << ex.what(); + return 1; + } + +#ifdef debug + for (auto& completed : registry.completed_indexes()) { + cerr << "Have index: " << completed << endl; + for (auto& filename : registry.require(completed)) { + cerr << "\tAt: " << filename << endl; + } + } +#endif + + // Grab the minimizer index + if (show_progress) { + cerr << "Loading Minimizer Index" << endl; + } + auto minimizer_index = vg::io::VPKG::load_one(registry.require("Minimizers").at(0)); + + // Grab the GBZ + if (show_progress) { + cerr << "Loading GBZ" << endl; + } + auto gbz = vg::io::VPKG::load_one(registry.require("Giraffe GBZ").at(0)); + + // Grab the distance index + if (show_progress) { + cerr << "Loading Distance Index v2" << endl; + } + auto distance_index = vg::io::VPKG::load_one(registry.require("Giraffe Distance Index").at(0)); + + if (show_progress) { + cerr << "Paging in Distance Index v2" << endl; + } + std::chrono::time_point preload_start = std::chrono::system_clock::now(); + // Make sure the distance index is paged in from disk. + // This does a blocking load; a nonblocking hint to the kernel doesn't seem to help at all. + distance_index->preload(true); + std::chrono::time_point preload_end = std::chrono::system_clock::now(); + std::chrono::duration di2_preload_seconds = preload_end - preload_start; + + // If we are tracking correctness, we will fill this in with a graph for + // getting offsets along ref paths. + PathPositionHandleGraph* path_position_graph = nullptr; + // If we need an overlay for position lookup, we might be pointing into + // this overlay. 
We want one that's good for reference path queries. + bdsg::ReferencePathOverlayHelper overlay_helper; + // And we might load an XG + unique_ptr xg_graph; + if (track_correctness || hts_output) { + // Usually we will get our paths from the GBZ + PathHandleGraph* base_graph = &gbz->graph; + // But if an XG is around, we should use that instead. Otherwise, it's not possible to provide paths when using an old GBWT/GBZ that doesn't have them. + if (registry.available("XG")) { + if (show_progress) { + cerr << "Loading XG Graph" << endl; + } + xg_graph = vg::io::VPKG::load_one(registry.require("XG").at(0)); + base_graph = xg_graph.get(); + } + + // Apply the overlay if needed. + path_position_graph = overlay_helper.apply(base_graph); + } + + // Set up the mapper + if (show_progress) { + cerr << "Initializing MinimizerMapper" << endl; + } + MinimizerMapper minimizer_mapper(gbz->graph, *minimizer_index, &*distance_index, path_position_graph); + if (forced_mean && forced_stdev) { + minimizer_mapper.force_fragment_length_distr(fragment_mean, fragment_stdev); + } + + std::chrono::time_point init = std::chrono::system_clock::now(); + std::chrono::duration init_seconds = init - launch; + if (show_progress) { + cerr << "Loading and initialization: " << init_seconds.count() << " seconds" << endl; + cerr << "Of which Distance Index v2 paging: " << di2_preload_seconds.count() << " seconds" << endl; + } + + // Set up to write a report of mapping speed if requested, instead of just dumping to stderr. + ofstream report; + if (!report_name.empty()) { + // Open the report + report.open(report_name); + if (!report) { + // Make sure it worked + cerr << "error[vg giraffe]: Could not open report file " << report_name << endl; + exit(1); + } + + // Add a header + report << "#file\treads/second/thread" << endl; + } + + // We need to loop over all the ranges... + for_each_combo([&]() { + + // Work out where to send the output. Default to stdout. + string output_filename = "-"; + if (!output_basename.empty()) { + // Compose a name using all the parameters. 
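The loading-and-initialization timing above is plain std::chrono arithmetic: two system_clock time points and a double-valued duration whose count() is in seconds. A minimal sketch of that timing pattern with the template arguments written out in full; the sleep is only a stand-in for index loading:

#include <chrono>
#include <cstdio>
#include <thread>

int main() {
    std::chrono::time_point<std::chrono::system_clock> start = std::chrono::system_clock::now();

    std::this_thread::sleep_for(std::chrono::milliseconds(50));  // stand-in for loading indexes

    std::chrono::time_point<std::chrono::system_clock> end = std::chrono::system_clock::now();
    std::chrono::duration<double> elapsed = end - start;         // seconds, as a double
    std::printf("Loading and initialization: %.3f seconds\n", elapsed.count());
    return 0;
}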
+ stringstream s; + + s << output_basename; + + if (interleaved) { + s << "-i"; + } + // Make a slug of the other options + parser.print_options(s, true); + s << ".gam"; + + output_filename = s.str(); + } + + if (show_progress) { + if (discard_alignments) { + cerr << "Discarding output alignments" << endl; + } else { + cerr << "Mapping reads to \"" << output_filename << "\" (" << output_format << ")" << endl; + } + } + + // Show and apply all the parser-managed options + if (show_progress) { + parser.print_options(cerr); + } + parser.apply(minimizer_mapper); + parser.apply(main_options); + parser.apply(scoring_options); + + if (show_progress && interleaved) { + cerr << "--interleaved" << endl; + } + + if (show_progress && prune_anchors) { + cerr << "--prune-low-cplx" << endl; + } + + if (show_progress && track_provenance) { + cerr << "--track-provenance " << endl; + } + minimizer_mapper.track_provenance = track_provenance; + + if (show_progress && track_correctness) { + cerr << "--track-correctness " << endl; + } + minimizer_mapper.track_correctness = track_correctness; + + if (show_progress && show_work) { + cerr << "--show-work " << endl; + } + minimizer_mapper.show_work = show_work; + + if (show_progress && paired) { + if (forced_mean && forced_stdev) { + cerr << "--fragment-mean " << fragment_mean << endl; + cerr << "--fragment-stdev " << fragment_stdev << endl; + } + cerr << "--rescue-algorithm " << algorithm_names[rescue_algorithm] << endl; + } + minimizer_mapper.rescue_algorithm = rescue_algorithm; + + minimizer_mapper.sample_name = sample_name; + minimizer_mapper.read_group = read_group; + + // Apply scoring parameters, after they have been parsed + minimizer_mapper.set_alignment_scores(scoring_options.match, scoring_options.mismatch, scoring_options.gap_open, scoring_options.gap_extend, scoring_options.full_length_bonus); + + // Work out the number of threads we will have + size_t thread_count = omp_get_max_threads(); + + // Set up counters per-thread for total reads mapped + vector reads_mapped_by_thread(thread_count, 0); + + // For timing, we may run one thread first and then switch to all threads. So track both start times. + std::chrono::time_point first_thread_start; + std::chrono::time_point all_threads_start; + + // We also time in terms of CPU time + clock_t cpu_time_before; + + // We may also have access to perf stats. + vector perf_fds; + +#ifdef __linux__ + // Set up a counter for executed instructions. + // See + struct perf_event_attr perf_config; + memset(&perf_config, 0, sizeof(struct perf_event_attr)); + perf_config.type = PERF_TYPE_HARDWARE; + perf_config.size = sizeof(struct perf_event_attr); + perf_config.config = PERF_COUNT_HW_INSTRUCTIONS; + perf_config.exclude_kernel = 1; + perf_config.exclude_hv = 1; + + perf_fds.resize(thread_count); + + perf_fds[omp_get_thread_num()] = perf_event_open(&perf_config, 0, -1, -1, 0); + if (show_progress && perf_fds[omp_get_thread_num()] == -1) { + int problem = errno; + cerr << "Not counting CPU instructions because perf events are unavailable: " << strerror(problem) << endl; + perf_fds.clear(); + } + + // Each OMP thread will call this to make sure perf is on. 
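The Linux-only block above counts retired user-space instructions through the perf events interface. Below is a minimal, self-contained sketch of the same counter setup, assuming Linux; the raw syscall stands in for whatever perf_event_open wrapper the surrounding code uses, and unlike the code above this sketch opens the counter disabled and enables it explicitly:

#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <cstdint>
#include <cstdio>
#include <cstring>

// There is no glibc wrapper for perf_event_open, so call the syscall directly.
static int open_instruction_counter() {
    struct perf_event_attr attr;
    memset(&attr, 0, sizeof(attr));
    attr.type = PERF_TYPE_HARDWARE;
    attr.size = sizeof(attr);
    attr.config = PERF_COUNT_HW_INSTRUCTIONS;
    attr.disabled = 1;        // start stopped; enabled explicitly below
    attr.exclude_kernel = 1;  // count user-space instructions only
    attr.exclude_hv = 1;
    // pid = 0 (this thread), cpu = -1 (any CPU), no group, no flags.
    return static_cast<int>(syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0));
}

int main() {
    int fd = open_instruction_counter();
    if (fd == -1) { perror("perf_event_open"); return 1; }

    ioctl(fd, PERF_EVENT_IOC_RESET, 0);
    ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);

    volatile uint64_t x = 0;
    for (int i = 0; i < 1000000; i++) { x += i; }  // work to measure

    ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
    long long instructions = 0;
    if (read(fd, &instructions, sizeof(instructions)) == sizeof(instructions)) {
        printf("executed ~%lld user-space instructions\n", instructions);
    }
    close(fd);
    return 0;
}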
+ auto ensure_perf_for_thread = [&]() { + if (!perf_fds.empty() && perf_fds[omp_get_thread_num()] == 0) { + perf_fds[omp_get_thread_num()] = perf_event_open(&perf_config, 0, -1, -1, 0); + } + }; + + // Main thread will call this to turn it off + auto stop_perf_for_thread = [&]() { + if (!perf_fds.empty() && perf_fds[omp_get_thread_num()] != 0) { + ioctl(perf_fds[omp_get_thread_num()], PERF_EVENT_IOC_DISABLE, 0); + } + }; + + // Main thread will call this when mapping starts to reset the counter. + auto reset_perf_for_thread = [&]() { + if (!perf_fds.empty() && perf_fds[omp_get_thread_num()] != 0) { + ioctl(perf_fds[omp_get_thread_num()], PERF_EVENT_IOC_RESET, 0); + } + }; + + // TODO: we won't count the output thread, but it will appear in CPU time! +#endif + + // Establish a watchdog to find reads that take too long to map. + // If we see any, we will issue a warning. + unique_ptr watchdog(new Watchdog(thread_count, chrono::seconds(main_options.watchdog_timeout))); + + { + + // Look up all the paths we might need to surject to. + vector> paths; + if (hts_output) { + // For htslib we need a non-empty list of paths. + assert(path_position_graph != nullptr); + paths = get_sequence_dictionary(ref_paths_name, {}, *path_position_graph); + } + + // Set up output to an emitter that will handle serialization and surjection. + // Unless we want to discard all the alignments in which case do that. + unique_ptr alignment_emitter; + if (discard_alignments) { + alignment_emitter = make_unique(); + } else { + // We actually want to emit alignments. + // Encode flags describing what we want to happen. + int flags = ALIGNMENT_EMITTER_FLAG_NONE; + if (prune_anchors) { + // When surjecting, do anchor pruning. + flags |= ALIGNMENT_EMITTER_FLAG_HTS_PRUNE_SUSPICIOUS_ANCHORS; + } + if (named_coordinates) { + // When not surjecting, use named segments instead of node IDs. + flags |= ALIGNMENT_EMITTER_FLAG_VG_USE_SEGMENT_NAMES; + } + + // We send along the positional graph when we have it, and otherwise we send the GBWTGraph which is sufficient for GAF output. + // TODO: What if we need both a positional graph and a NamedNodeBackTranslation??? + const HandleGraph* emitter_graph = path_position_graph ? (const HandleGraph*)path_position_graph : (const HandleGraph*)&(gbz->graph); + + alignment_emitter = get_alignment_emitter(output_filename, output_format, + paths, thread_count, + emitter_graph, flags); + } + +#ifdef USE_CALLGRIND + // We want to profile the alignment, not the loading. + CALLGRIND_START_INSTRUMENTATION; +#endif + + // Start timing overall mapping time now that indexes are loaded. + first_thread_start = std::chrono::system_clock::now(); + cpu_time_before = clock(); + +#ifdef __linux__ + reset_perf_for_thread(); +#endif + + if (interleaved || !fastq_filename_2.empty()) { + //Map paired end from either one gam or fastq file or two fastq files + + // a buffer to hold read pairs that can't be unambiguously mapped before the fragment length distribution + // is estimated + // note: sufficient to have only one buffer because multithreading code enforces single threaded mode + // during distribution estimation + vector> ambiguous_pair_buffer; + + // Track whether the distribution was ready, so we can detect when it becomes ready and capture the all-threads start time. 
+ bool distribution_was_ready = false; + + // Define how to know if the paired end distribution is ready + auto distribution_is_ready = [&]() { + bool is_ready = minimizer_mapper.fragment_distr_is_finalized(); + if (is_ready && !distribution_was_ready) { + // It has become ready now. + distribution_was_ready = true; + + if (show_progress) { + // Report that it is now ready + #pragma omp critical (cerr) + { + cerr << "Using fragment length estimate: " << minimizer_mapper.get_fragment_length_mean() << " +/- " << minimizer_mapper.get_fragment_length_stdev() << endl; + } + } + + // Remember when now is. + all_threads_start = std::chrono::system_clock::now(); + } + return is_ready; + }; + + // Define a way to force the distribution ready + auto require_distribution_finalized = [&]() { + if (!minimizer_mapper.fragment_distr_is_finalized()){ + cerr << "warning[vg::giraffe]: Finalizing fragment length distribution before reaching maximum sample size" << endl; + cerr << " mapped " << minimizer_mapper.get_fragment_length_sample_size() + << " reads single ended with " << ambiguous_pair_buffer.size() << " pairs of reads left unmapped" << endl; + cerr << " mean: " << minimizer_mapper.get_fragment_length_mean() << ", stdev: " + << minimizer_mapper.get_fragment_length_stdev() << endl; + minimizer_mapper.finalize_fragment_length_distr(); + } + }; + + // Define how to align and output a read pair, in a thread. + auto map_read_pair = [&](Alignment& aln1, Alignment& aln2) { + try { + set_crash_context(aln1.name() + ", " + aln2.name()); + + auto thread_num = omp_get_thread_num(); +#ifdef __linux__ + ensure_perf_for_thread(); +#endif + + if (watchdog) { + watchdog->check_in(thread_num, aln1.name() + ", " + aln2.name()); + } + + toUppercaseInPlace(*aln1.mutable_sequence()); + toUppercaseInPlace(*aln2.mutable_sequence()); + + pair, vector> mapped_pairs = minimizer_mapper.map_paired(aln1, aln2, ambiguous_pair_buffer); + if (!mapped_pairs.first.empty() && !mapped_pairs.second.empty()) { + //If we actually tried to map this paired end + + // Work out whether it could be properly paired or not, if that is relevant. + // If we're here, let the read be properly paired in + // HTSlib terms no matter how far away it is in linear + // space (on the same contig), because it went into + // pair distribution estimation. + // TODO: The semantics are weird here. 0 means + // "properly paired at any distance" and + // numeric_limits::max() doesn't. + int64_t tlen_limit = 0; + if (hts_output && minimizer_mapper.fragment_distr_is_finalized()) { + tlen_limit = minimizer_mapper.get_fragment_length_mean() + 6 * minimizer_mapper.get_fragment_length_stdev(); + } + // Emit it + alignment_emitter->emit_mapped_pair(std::move(mapped_pairs.first), std::move(mapped_pairs.second), tlen_limit); + // Record that we mapped a read. + reads_mapped_by_thread.at(thread_num) += 2; + } + + if (!minimizer_mapper.fragment_distr_is_finalized() && ambiguous_pair_buffer.size() >= MAX_BUFFERED_PAIRS) { + // We risk running out of memory if we keep this up. + cerr << "warning[vg::giraffe]: Encountered " << ambiguous_pair_buffer.size() << " ambiguously-paired reads before finding enough" << endl + << " unambiguously-paired reads to learn fragment length distribution. Are you sure" << endl + << " your reads are paired and your graph is not a hairball?" 
<< endl; + require_distribution_finalized(); + } + + if (watchdog) { + watchdog->check_out(thread_num); + } + + clear_crash_context(); + + } catch (const std::exception& ex) { + report_exception(ex); + } + }; + + if (!gam_filename.empty()) { + // GAM file to remap + get_input_file(gam_filename, [&](istream& in) { + // Map pairs of reads to the emitter + vg::io::for_each_interleaved_pair_parallel_after_wait(in, map_read_pair, distribution_is_ready); + }); + } else if (!fastq_filename_2.empty()) { + //A pair of FASTQ files to map + fastq_paired_two_files_for_each_parallel_after_wait(fastq_filename_1, fastq_filename_2, map_read_pair, distribution_is_ready, batch_size); + + + } else if ( !fastq_filename_1.empty()) { + // An interleaved FASTQ file to map, map all its pairs in parallel. + fastq_paired_interleaved_for_each_parallel_after_wait(fastq_filename_1, map_read_pair, distribution_is_ready, batch_size); + } + + // Now map all the ambiguous pairs + // Make sure fragment length distribution is finalized first. + require_distribution_finalized(); + for (pair& alignment_pair : ambiguous_pair_buffer) { + try { + set_crash_context(alignment_pair.first.name() + ", " + alignment_pair.second.name()); + auto mapped_pairs = minimizer_mapper.map_paired(alignment_pair.first, alignment_pair.second); + // Work out whether it could be properly paired or not, if that is relevant. + int64_t tlen_limit = 0; + if (hts_output && minimizer_mapper.fragment_distr_is_finalized()) { + tlen_limit = minimizer_mapper.get_fragment_length_mean() + 6 * minimizer_mapper.get_fragment_length_stdev(); + } + // Emit the read + alignment_emitter->emit_mapped_pair(std::move(mapped_pairs.first), std::move(mapped_pairs.second), tlen_limit); + // Record that we mapped a read. + reads_mapped_by_thread.at(omp_get_thread_num()) += 2; + clear_crash_context(); + } catch (const std::exception& ex) { + report_exception(ex); + } + } + } else { + // Map single-ended + + // All the threads start at once. + all_threads_start = first_thread_start; + + // Define how to align and output a read, in a thread. + auto map_read = [&](Alignment& aln) { + try { + set_crash_context(aln.name()); + auto thread_num = omp_get_thread_num(); +#ifdef __linux__ + ensure_perf_for_thread(); +#endif + if (watchdog) { + watchdog->check_in(thread_num, aln.name()); + } + + toUppercaseInPlace(*aln.mutable_sequence()); + + // Map the read with the MinimizerMapper. + minimizer_mapper.map(aln, *alignment_emitter); + // Record that we mapped a read. + reads_mapped_by_thread.at(thread_num)++; + + if (watchdog) { + watchdog->check_out(thread_num); + } + clear_crash_context(); + } catch (const std::exception& ex) { + report_exception(ex); + } + }; + + if (!gam_filename.empty()) { + // GAM file to remap + get_input_file(gam_filename, [&](istream& in) { + // Open it and map all the reads in parallel. + vg::io::for_each_parallel(in, map_read, batch_size); + }); + } + + if (!fastq_filename_1.empty()) { + // FASTQ file to map, map all its reads in parallel. + fastq_unpaired_for_each_parallel(fastq_filename_1, map_read, batch_size); + } + } + + } // Make sure alignment emitter is destroyed and all alignments are on disk. 
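The paired-end path above holds back pairs it cannot place confidently in ambiguous_pair_buffer until enough unambiguous pairs have been mapped to estimate the fragment length distribution, then finalizes the estimate (forcing it if necessary) and remaps the buffered pairs. The sketch below isolates that buffering pattern, using a toy running estimator and a hypothetical sample-size threshold in place of the mapper's real one:

#include <cmath>
#include <cstdio>
#include <vector>

// Toy fragment-length estimator: finalized after a fixed number of samples.
struct FragmentDistr {
    size_t n = 0; double sum = 0, sum_sq = 0; bool finalized = false;
    static constexpr size_t NEEDED = 1000;  // hypothetical sample size
    void add(double len) { n++; sum += len; sum_sq += len * len; if (n >= NEEDED) finalized = true; }
    double mean() const { return n ? sum / n : 0; }
    double stdev() const { return n > 1 ? std::sqrt((sum_sq - sum * sum / n) / (n - 1)) : 0; }
};

struct ReadPair { double observed_fragment; bool unambiguous; };

int main() {
    FragmentDistr distr;
    std::vector<ReadPair> buffer;                        // pairs held back until the estimate is ready
    std::vector<ReadPair> input(5000, {500.0, true});
    input[10].unambiguous = false;                       // a pair we cannot place confidently yet

    for (const auto& p : input) {
        if (!distr.finalized && !p.unambiguous) { buffer.push_back(p); continue; }
        if (!distr.finalized) distr.add(p.observed_fragment);
        // ... map p normally here ...
    }
    if (!distr.finalized) distr.finalized = true;        // force-finalize, as the real code warns it may
    std::printf("mean %.1f, buffered %zu pairs to remap\n", distr.mean(), buffer.size());
    // ... now remap everything in buffer with the finalized distribution ...
    return 0;
}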
+ + // Now mapping is done + std::chrono::time_point end = std::chrono::system_clock::now(); + clock_t cpu_time_after = clock(); +#ifdef __linux__ + stop_perf_for_thread(); +#endif + + // Compute wall clock elapsed + std::chrono::duration all_threads_seconds = end - all_threads_start; + std::chrono::duration first_thread_additional_seconds = all_threads_start - first_thread_start; + + // Compute CPU time elapsed + double cpu_seconds = (cpu_time_after - cpu_time_before) / (double)CLOCKS_PER_SEC; + + // Compute instructions used + long long total_instructions = 0; + for (auto& perf_fd : perf_fds) { + if (perf_fd > 0) { + long long thread_instructions; + if (read(perf_fd, &thread_instructions, sizeof(long long)) != sizeof(long long)) { + // Read failed for some reason. + cerr << "warning:[vg giraffe] Could not count CPU instructions executed" << endl; + thread_instructions = 0; + } + if (close(perf_fd)) { + int problem = errno; + cerr << "warning:[vg giraffe] Error closing perf event instruction counter: " << strerror(problem) << endl; + } + total_instructions += thread_instructions; + } + } + + // How many reads did we map? + size_t total_reads_mapped = 0; + for (auto& reads_mapped : reads_mapped_by_thread) { + total_reads_mapped += reads_mapped; + } + + // Compute speed (as reads per thread-second) + double reads_per_second_per_thread = total_reads_mapped / (all_threads_seconds.count() * thread_count + first_thread_additional_seconds.count()); + // And per CPU second (including any IO threads) + double reads_per_cpu_second = total_reads_mapped / cpu_seconds; + double mega_instructions_per_read = total_instructions / (double)total_reads_mapped / 1E6; + double mega_instructions_per_second = total_instructions / cpu_seconds / 1E6; + + if (show_progress) { + // Log to standard error + cerr << "Mapped " << total_reads_mapped << " reads across " + << thread_count << " threads in " + << all_threads_seconds.count() << " seconds with " + << first_thread_additional_seconds.count() << " additional single-threaded seconds." << endl; + cerr << "Mapping speed: " << reads_per_second_per_thread + << " reads per second per thread" << endl; + + cerr << "Used " << cpu_seconds << " CPU-seconds (including output)." << endl; + cerr << "Achieved " << reads_per_cpu_second + << " reads per CPU-second (including output)" << endl; + + if (total_instructions != 0) { + cerr << "Used " << total_instructions << " CPU instructions (not including output)." << endl; + cerr << "Mapping slowness: " << mega_instructions_per_read + << " M instructions per read at " << mega_instructions_per_second + << " M mapping instructions per inclusive CPU-second" << endl; + } + + cerr << "Memory footprint: " << gbwt::inGigabytes(gbwt::memoryUsage()) << " GB" << endl; + } + + + if (report) { + // Log output filename and mapping speed in reads/second/thread to report TSV + report << output_filename << "\t" << reads_per_second_per_thread << endl; + } + + }); + + return 0; +} + +//---------------------------------------------------------------------------- + +string sample_haplotypes(const vector>& indexes, string& basename, string& sample_name, string& haplotype_file, string& kff_file, bool progress) { + + if (progress) { + std::cerr << "Sampling haplotypes" << std::endl; + } + + // Sanity checks. + if (haplotype_file.empty() || kff_file.empty()) { + std::cerr << "error:[vg giraffe] Haplotype sampling requires --haplotype-name and --kff-name." << std::endl; + std::exit(EXIT_FAILURE); + } + + // Determine output name. 
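The speed report above divides total mapped reads by thread-seconds, where the denominator is the all-threads wall time multiplied by the thread count plus the extra single-threaded seconds spent learning the fragment length distribution, and separately by CPU-seconds derived from clock(). A small worked example of those two formulas with invented numbers:

#include <cstdio>

int main() {
    // Invented measurements for illustration only.
    double total_reads_mapped = 1000000;
    double all_threads_seconds = 120.0;             // wall time with every thread mapping
    double first_thread_additional_seconds = 30.0;  // single-threaded warm-up time
    double thread_count = 16;
    double cpu_seconds = 2100.0;                    // (clock_after - clock_before) / CLOCKS_PER_SEC

    double reads_per_second_per_thread =
        total_reads_mapped / (all_threads_seconds * thread_count + first_thread_additional_seconds);
    double reads_per_cpu_second = total_reads_mapped / cpu_seconds;

    std::printf("%.1f reads/second/thread, %.1f reads/CPU-second\n",
                reads_per_second_per_thread, reads_per_cpu_second);
    // 1000000 / (120 * 16 + 30) = ~512.8 reads/second/thread; 1000000 / 2100 = ~476.2 reads/CPU-second
    return 0;
}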
+ std::string sample = sample_name; + if (sample.empty()) { + sample = file_base_name(kff_file); + if (progress) { + std::cerr << "Guessing from " << kff_file << " that sample name is " << sample << std::endl; + } + } + if (sample == "giraffe") { + std::cerr << "warning:[vg giraffe] Using \"giraffe\" as a sample name may lead to filename collisions." << std::endl; + } + std::string output_name = basename + "." + sample + ".gbz"; + + // Load GBZ. + gbwtgraph::GBZ gbz; + if (indexes.size() == 1 && indexes[0].first == "Giraffe GBZ") { + load_gbz(gbz, indexes[0].second, progress); + } else if (indexes.size() == 2 && indexes[0].first == "Giraffe GBWT" && indexes[1].first == "GBWTGraph") { + load_gbz(gbz, indexes[0].second, indexes[1].second, progress); + } else if (indexes.size() == 2 && indexes[0].first == "GBWTGraph" && indexes[1].first == "Giraffe GBWT") { + load_gbz(gbz, indexes[1].second, indexes[0].second, progress); + } else { + std::cerr << "error:[vg giraffe] Haplotype sampling requires either -Z or -g and -H with no other indexes." << std::endl; + std::exit(EXIT_FAILURE); + } + + // Load haplotype information. + if (progress) { + std::cerr << "Loading haplotype information from " << haplotype_file << std::endl; + } + Haplotypes haplotypes; + sdsl::simple_sds::load_from(haplotypes, haplotype_file); + + // Sample haplotypes. + Haplotypes::Verbosity verbosity = (progress ? Haplotypes::verbosity_basic : Haplotypes::verbosity_silent); + Recombinator recombinator(gbz, verbosity); + Recombinator::Parameters parameters; + parameters.include_reference = true; + gbwt::GBWT sampled_gbwt; + try { + sampled_gbwt = recombinator.generate_haplotypes(haplotypes, kff_file, parameters); + } catch (const std::runtime_error& e) { + std::cerr << "error:[vg giraffe] Haplotype sampling failed: " << e.what() << std::endl; + std::exit(EXIT_FAILURE); + } + + // Create GBWTGraph and save GBZ. + if (progress) { + std::cerr << "Building GBWTGraph" << std::endl; + } + gbwtgraph::GBWTGraph sampled_graph = gbz.graph.subgraph(sampled_gbwt); + save_gbz(sampled_gbwt, sampled_graph, output_name, progress); + + return output_name; +} + +//---------------------------------------------------------------------------- + +// Register subcommand +static Subcommand vg_giraffe("giraffe", "fast haplotype-aware short read alignment", PIPELINE, 6, main_giraffe); diff --git a/src/subcommand/haplotypes_main.cpp b/src/subcommand/haplotypes_main.cpp new file mode 100644 index 00000000000..451a964b8b5 --- /dev/null +++ b/src/subcommand/haplotypes_main.cpp @@ -0,0 +1,1306 @@ +/** \file haplotypes_main.cpp + * + * Defines the "vg haplotypes" subcommand, which will ultimately sample haplotypes. + * + * This is currently highly experimental. 
+ */ + +#include "subcommand.hpp" + +#include "../hash_map.hpp" +#include "../recombinator.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +using namespace vg; + +//---------------------------------------------------------------------------- + +constexpr size_t DEFAULT_MAX_THREADS = 16; + +size_t haplotypes_default_threads() { + size_t threads = omp_get_max_threads(); + threads = std::max(threads, size_t(1)); + return std::min(threads, DEFAULT_MAX_THREADS); +} + +constexpr size_t haplotypes_default_k() { + return Haplotypes::Header::DEFAULT_K; +} + +constexpr size_t haplotypes_default_w() { + return gbwtgraph::Key64::WINDOW_LENGTH; +} + +constexpr size_t haplotypes_default_subchain_length() { + return HaplotypePartitioner::SUBCHAIN_LENGTH; +} + +constexpr size_t haplotypes_default_n() { + return Recombinator::NUM_HAPLOTYPES; +} + +constexpr size_t haplotypes_default_coverage() { + return Recombinator::COVERAGE; +} + +constexpr double haplotypes_default_discount() { + return Recombinator::PRESENT_DISCOUNT; +} + +constexpr double haplotypes_default_adjustment() { + return Recombinator::HET_ADJUSTMENT; +} + +constexpr double haplotypes_default_absent() { + return Recombinator::ABSENT_SCORE; +} + +struct HaplotypesConfig { + enum OperatingMode { + mode_invalid, + mode_sample_graph, + mode_preprocess, + mode_sample_haplotypes, + mode_map_variants, + mode_extract, + }; + + OperatingMode mode = mode_invalid; + Haplotypes::Verbosity verbosity = Haplotypes::verbosity_silent; + + // File names. + std::string graph_name; + std::string gbz_output, haplotype_output, score_output; + std::string distance_name, r_index_name; + std::string haplotype_input, kmer_input, vcf_input; + + // Computational parameters. + size_t k = haplotypes_default_k(), w = haplotypes_default_w(); + HaplotypePartitioner::Parameters partitioner_parameters; + Recombinator::Parameters recombinator_parameters; + + // A prefix to add to VCF contig names to get GBWT contig names. + std::string contig_prefix; + + // For extracting local haplotypes. + size_t chain_id = std::numeric_limits::max(); + size_t subchain_id = std::numeric_limits::max(); + + // Other parameters. + size_t threads = haplotypes_default_threads(); + bool validate = false; + + HaplotypesConfig(int argc, char** argv, size_t max_threads); +}; + +void preprocess_graph(const gbwtgraph::GBZ& gbz, Haplotypes& haplotypes, HaplotypesConfig& config); + +void sample_haplotypes(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const HaplotypesConfig& config); + +void map_variants(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const HaplotypesConfig& config); + +void extract_haplotypes(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const HaplotypesConfig& config); + +//---------------------------------------------------------------------------- + +int main_haplotypes(int argc, char** argv) { + double start = gbwt::readTimer(); + gbwt::Verbosity::set(gbwt::Verbosity::SILENT); + size_t max_threads = omp_get_max_threads(); + omp_set_num_threads(haplotypes_default_threads()); + + // Parse the arguments. + HaplotypesConfig config(argc, argv, max_threads); + + // Load the graph. + gbwtgraph::GBZ gbz; + load_gbz(gbz, config.graph_name, config.verbosity >= Haplotypes::verbosity_basic); + + // Generate or load haplotype information. 
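HaplotypesConfig above supports several operating modes that are chosen purely from which inputs and outputs the user supplied; the actual decision happens after option parsing, further below. A reduced sketch of that mode-selection idea, keeping the mode names but using a hypothetical cut-down field set:

#include <cstdio>
#include <string>

enum class Mode { invalid, sample_graph, preprocess, sample_haplotypes };

// Reduced stand-in for HaplotypesConfig: only the fields the mode choice needs.
struct Config {
    std::string haplotype_input, haplotype_output, kmer_input, gbz_output;
};

static Mode choose_mode(const Config& c) {
    if (c.haplotype_input.empty() && !c.kmer_input.empty() && !c.gbz_output.empty()) {
        return Mode::sample_graph;       // one-shot: preprocess and sample in the same run
    }
    if (c.haplotype_input.empty() && !c.haplotype_output.empty()) {
        return Mode::preprocess;         // only generate the .hapl file
    }
    if (!c.haplotype_input.empty() && !c.kmer_input.empty() && !c.gbz_output.empty()) {
        return Mode::sample_haplotypes;  // reuse an existing .hapl file
    }
    return Mode::invalid;
}

int main() {
    Config c;
    c.kmer_input = "sample.kff";
    c.gbz_output = "sampled.gbz";
    std::printf("mode %d\n", static_cast<int>(choose_mode(c)));  // sample_graph
    return 0;
}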
+ Haplotypes haplotypes; + if (config.mode == HaplotypesConfig::mode_sample_graph || config.mode == HaplotypesConfig::mode_preprocess) { + preprocess_graph(gbz, haplotypes, config); + } else { + if (config.verbosity >= Haplotypes::verbosity_basic) { + std::cerr << "Loading haplotype information from " << config.haplotype_input << std::endl; + } + sdsl::simple_sds::load_from(haplotypes, config.haplotype_input); + } + + // Save haplotype information if necessary. + if (!config.haplotype_output.empty()) { + if (config.verbosity >= Haplotypes::verbosity_basic) { + std::cerr << "Writing haplotype information to " << config.haplotype_output << std::endl; + } + sdsl::simple_sds::serialize_to(haplotypes, config.haplotype_output); + } + + // Sample the haplotypes. + if (config.mode == HaplotypesConfig::mode_sample_graph || config.mode == HaplotypesConfig::mode_sample_haplotypes) { + sample_haplotypes(gbz, haplotypes, config); + } + + // Map variants to subchains. + if (config.mode == HaplotypesConfig::mode_map_variants) { + map_variants(gbz, haplotypes, config); + } + + // Extract local haplotypes in FASTA format. + if (config.mode == HaplotypesConfig::mode_extract) { + extract_haplotypes(gbz, haplotypes, config); + } + + if (config.verbosity >= Haplotypes::verbosity_basic) { + double seconds = gbwt::readTimer() - start; + double gib = gbwt::inGigabytes(gbwt::memoryUsage()); + std::cerr << "Used " << seconds << " seconds, " << gib << " GiB" << std::endl; + } + return 0; +} + +static vg::subcommand::Subcommand vg_haplotypes("haplotypes", "haplotype sampling based on kmer counts", vg::subcommand::DEVELOPMENT, main_haplotypes); + +//---------------------------------------------------------------------------- + +void help_haplotypes(char** argv, bool developer_options) { + std::string usage = " " + std::string(argv[0]) + " " + std::string(argv[1]) + " [options] "; + std::cerr << "Usage:" << std::endl; + std::cerr << usage << "-k kmers.kff -g output.gbz graph.gbz" << std::endl; + std::cerr << usage << "-H output.hapl graph.gbz" << std::endl; + std::cerr << usage << "-i graph.hapl -k kmers.kff -g output.gbz graph.gbz" << std::endl; + if (developer_options) { + std::cerr << usage << "-i graph.hapl --vcf-input variants.vcf graph.gbz > output.tsv" << std::endl; + std::cerr << usage << "-i graph.hapl -k kmers.kff --extract M:N graph.gbz > output.fa" << std::endl; + } + std::cerr << std::endl; + + std::cerr << "Haplotype sampling based on kmer counts." 
<< std::endl; + std::cerr << std::endl; + std::cerr << "Output files:" << std::endl; + std::cerr << " -g, --gbz-output X write the output GBZ to X" << std::endl; + std::cerr << " -H, --haplotype-output X write haplotype information to X" << std::endl; + std::cerr << std::endl; + std::cerr << "Input files:" << std::endl; + std::cerr << " -d, --distance-index X use this distance index (default: .dist)" << std::endl; + std::cerr << " -r, --r-index X use this r-index (default: .ri)" << std::endl; + std::cerr << " -i, --haplotype-input X use this haplotype information (default: generate)" << std::endl; + std::cerr << " -k, --kmer-input X use kmer counts from this KFF file (required for --gbz-output)" << std::endl; + std::cerr << std::endl; + std::cerr << "Options for generating haplotype information:" << std::endl; + std::cerr << " --kmer-length N kmer length for building the minimizer index (default: " << haplotypes_default_k() << ")" << std::endl; + std::cerr << " --window-length N window length for building the minimizer index (default: " << haplotypes_default_w() << ")" << std::endl; + std::cerr << " --subchain-length N target length (in bp) for subchains (default: " << haplotypes_default_subchain_length() << ")" << std::endl; + std::cerr << std::endl; + std::cerr << "Options for sampling haplotypes:" << std::endl; + std::cerr << " --coverage N kmer coverage in the KFF file (default: estimate)" << std::endl; + std::cerr << " --num-haplotypes N generate N haplotypes (default: " << haplotypes_default_n() << ")" << std::endl; + std::cerr << " --present-discount F discount scores for present kmers by factor F (default: " << haplotypes_default_discount() << ")" << std::endl; + std::cerr << " --het-adjustment F adjust scores for heterozygous kmers by F (default: " << haplotypes_default_adjustment() << ")" << std::endl; + std::cerr << " --absent-score F score absent kmers -F/+F (default: " << haplotypes_default_absent() << ")" << std::endl; + std::cerr << " --diploid-sampling choose the best pair from the greedily selected haplotypes" << std::endl; + std::cerr << " --include-reference include named and reference paths in the output" << std::endl; + std::cerr << std::endl; + std::cerr << "Other options:" << std::endl; + std::cerr << " -v, --verbosity N verbosity level (0 = silent, 1 = basic, 2 = detailed, 3 = debug; default: 0)" << std::endl; + std::cerr << " -t, --threads N approximate number of threads (default: " << haplotypes_default_threads() << " on this system)" << std::endl; + std::cerr << std::endl; + if (developer_options) { + std::cerr << "Developer options:" << std::endl; + std::cerr << " --validate validate the generated information (may be slow)" << std::endl; + std::cerr << " --vcf-input X map the variants in VCF file X to subchains" << std::endl; + std::cerr << " --contig-prefix X a prefix for transforming VCF contig names into GBWT contig names" << std::endl; + std::cerr << " --extract M:N extract haplotypes in chain M, subchain N in FASTA format" << std::endl; + std::cerr << " --score-output X write haplotype scores to X" << std::endl; + std::cerr << std::endl; + } +} + +//---------------------------------------------------------------------------- + +HaplotypesConfig::HaplotypesConfig(int argc, char** argv, size_t max_threads) { + constexpr int OPT_KMER_LENGTH = 1200; + constexpr int OPT_WINDOW_LENGTH = 1201; + constexpr int OPT_SUBCHAIN_LENGTH = 1202; + constexpr int OPT_COVERAGE = 1300; + constexpr int OPT_NUM_HAPLOTYPES = 1301; + constexpr int OPT_PRESENT_DISCOUNT = 1302; + 
constexpr int OPT_HET_ADJUSTMENT = 1303; + constexpr int OPT_ABSENT_SCORE = 1304; + constexpr int OPT_DIPLOID_SAMPLING = 1305; + constexpr int OPT_INCLUDE_REFERENCE = 1306; + constexpr int OPT_VALIDATE = 1400; + constexpr int OPT_VCF_INPUT = 1500; + constexpr int OPT_CONTIG_PREFIX = 1501; + constexpr int OPT_EXTRACT = 1600; + constexpr int OPT_SCORE_OUTPUT = 1601; + + static struct option long_options[] = + { + { "gbz-output", required_argument, 0, 'g' }, + { "haplotype-output", required_argument, 0, 'H' }, + { "distance-index", required_argument, 0, 'd' }, + { "r-index", required_argument, 0, 'r' }, + { "haplotype-input", required_argument, 0, 'i' }, + { "kmer-input", required_argument, 0, 'k' }, + { "kmer-length", required_argument, 0, OPT_KMER_LENGTH }, + { "window-length", required_argument, 0, OPT_WINDOW_LENGTH }, + { "subchain-length", required_argument, 0, OPT_SUBCHAIN_LENGTH }, + { "coverage", required_argument, 0, OPT_COVERAGE }, + { "num-haplotypes", required_argument, 0, OPT_NUM_HAPLOTYPES }, + { "present-discount", required_argument, 0, OPT_PRESENT_DISCOUNT }, + { "het-adjustment", required_argument, 0, OPT_HET_ADJUSTMENT }, + { "absent-score", required_argument, 0, OPT_ABSENT_SCORE }, + { "diploid-sampling", no_argument, 0, OPT_DIPLOID_SAMPLING }, + { "include-reference", no_argument, 0, OPT_INCLUDE_REFERENCE }, + { "verbosity", required_argument, 0, 'v' }, + { "threads", required_argument, 0, 't' }, + { "validate", no_argument, 0, OPT_VALIDATE }, + { "vcf-input", required_argument, 0, OPT_VCF_INPUT }, + { "contig-prefix", required_argument, 0, OPT_CONTIG_PREFIX }, + { "extract", required_argument, 0, OPT_EXTRACT }, + { "score-output", required_argument, 0, OPT_SCORE_OUTPUT }, + { 0, 0, 0, 0 } + }; + + // Process the arguments. + int c; + optind = 2; // force optind past command positional argument + while (true) { + int option_index = 0; + c = getopt_long(argc, argv, "g:H:d:r:i:k:v:t:h", long_options, &option_index); + if (c == -1) { break; } // End of options. 
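As in the giraffe parser earlier in this diff, the long-only options above are given integer codes outside the printable-character range so they can share one getopt_long switch with the short options. A minimal self-contained sketch of that pattern with hypothetical option names:

#include <cstdio>
#include <cstdlib>
#include <getopt.h>

// Codes above 255 cannot collide with any short option character.
constexpr int OPT_LONG_ONLY = 1000;

int main(int argc, char** argv) {
    static struct option long_options[] = {
        { "output", required_argument, 0, 'o' },               // also reachable as -o
        { "long-only", required_argument, 0, OPT_LONG_ONLY },  // no short form
        { 0, 0, 0, 0 }
    };
    int c;
    while ((c = getopt_long(argc, argv, "o:", long_options, nullptr)) != -1) {
        switch (c) {
        case 'o':
            std::printf("output = %s\n", optarg);
            break;
        case OPT_LONG_ONLY:
            std::printf("long-only = %s\n", optarg);
            break;
        default:
            return EXIT_FAILURE;
        }
    }
    return 0;
}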
+ + switch (c) + { + case 'g': + this->gbz_output = optarg; + break; + case 'H': + this->haplotype_output = optarg; + break; + + case 'd': + this->distance_name = optarg; + break; + case 'r': + this->r_index_name = optarg; + break; + case 'i': + this->haplotype_input = optarg; + break; + case 'k': + this->kmer_input = optarg; + break; + + case OPT_KMER_LENGTH: + this->k = parse(optarg); + if (this->k == 0 || this->k > gbwtgraph::Key64::KMER_MAX_LENGTH) { + std::cerr << "error: [vg haplotypes] kmer length must be between 1 and " << gbwtgraph::Key64::KMER_MAX_LENGTH << std::endl; + std::exit(EXIT_FAILURE); + } + break; + case OPT_WINDOW_LENGTH: + this->w = parse(optarg); + if (this->w == 0) { + std::cerr << "error: [vg haplotypes] window length cannot be 0" << std::endl; + std::exit(EXIT_FAILURE); + } + break; + case OPT_SUBCHAIN_LENGTH: + this->partitioner_parameters.subchain_length = parse(optarg); + if (this->partitioner_parameters.subchain_length == 0) { + std::cerr << "error: [vg haplotypes] subchain length cannot be 0" << std::endl; + std::exit(EXIT_FAILURE); + } + break; + case OPT_COVERAGE: + this->recombinator_parameters.coverage = parse(optarg); + break; + case OPT_NUM_HAPLOTYPES: + this->recombinator_parameters.num_haplotypes = parse(optarg); + if (this->recombinator_parameters.num_haplotypes == 0) { + std::cerr << "error: [vg haplotypes] number of haplotypes cannot be 0" << std::endl; + std::exit(EXIT_FAILURE); + } + break; + case OPT_PRESENT_DISCOUNT: + this->recombinator_parameters.present_discount = parse(optarg); + if (this->recombinator_parameters.present_discount < 0.0 || this->recombinator_parameters.present_discount > 1.0) { + std::cerr << "error: [vg haplotypes] present discount must be between 0.0 and 1.0" << std::endl; + std::exit(EXIT_FAILURE); + } + break; + case OPT_HET_ADJUSTMENT: + this->recombinator_parameters.het_adjustment = parse(optarg); + if (this->recombinator_parameters.het_adjustment < 0.0) { + std::cerr << "error: [vg haplotypes] het adjustment must be non-negative" << std::endl; + std::exit(EXIT_FAILURE); + } + break; + case OPT_ABSENT_SCORE: + this->recombinator_parameters.absent_score = parse(optarg); + if (this->recombinator_parameters.absent_score < 0.0) { + std::cerr << "error: [vg haplotypes] absent score must be non-negative" << std::endl; + std::exit(EXIT_FAILURE); + } + break; + case OPT_DIPLOID_SAMPLING: + this->recombinator_parameters.diploid_sampling = true; + break; + case OPT_INCLUDE_REFERENCE: + this->recombinator_parameters.include_reference = true; + break; + + case 'v': + { + size_t level = parse(optarg); + if (level > Haplotypes::verbosity_debug) { + std::cerr << "error: [vg haplotypes] invalid verbosity level: " << level << std::endl; + std::exit(EXIT_FAILURE); + } + this->verbosity = static_cast(level); + } + break; + case 't': + this->threads = parse(optarg); + if (this->threads == 0 || this->threads > max_threads) { + std::cerr << "error: [vg haplotypes] cannot run " << this->threads << " threads in parallel on this system" << std::endl; + std::exit(EXIT_FAILURE); + } + omp_set_num_threads(this->threads); + break; + + case OPT_VALIDATE: + this->validate = true; + break; + case OPT_VCF_INPUT: + this->vcf_input = optarg; + break; + case OPT_CONTIG_PREFIX: + this->contig_prefix = optarg; + break; + case OPT_EXTRACT: + { + std::string arg = optarg; + size_t offset = arg.find(':'); + if (offset == 0 || offset == std::string::npos || offset + 1 >= arg.length()) { + std::cerr << "error: [vg haplotypes] cannot parse chain:subchain from " 
<< arg << std::endl; + std::exit(EXIT_FAILURE); + } + this->chain_id = parse(arg.substr(0, offset)); + this->subchain_id = parse(arg.substr(offset + 1)); + } + break; + case OPT_SCORE_OUTPUT: + this->score_output = optarg; + break; + + case 'h': + case '?': + help_haplotypes(argv, true); + std::exit(EXIT_FAILURE); + default: + std::abort(); + } + } + + // Determine input graph and set operating mode. + if (optind + 1 != argc) { + help_haplotypes(argv, false); + std::exit(EXIT_FAILURE); + } + this->graph_name = argv[optind]; + if (this->haplotype_input.empty() && !this->kmer_input.empty() && !this->gbz_output.empty()) { + this->mode = mode_sample_graph; + } else if (this->haplotype_input.empty() && !this->haplotype_output.empty()) { + this->mode = mode_preprocess; + } else if (!this->haplotype_input.empty() && !this->kmer_input.empty() && !this->gbz_output.empty()) { + this->mode = mode_sample_haplotypes; + } else if (!this->haplotype_input.empty() && !this->vcf_input.empty()) { + this->mode = mode_map_variants; + } else if (!this->haplotype_input.empty() && !this->kmer_input.empty() && + this->chain_id < std::numeric_limits::max() && this->subchain_id < std::numeric_limits::max()) { + this->mode = mode_extract; + } + if (this->mode == mode_invalid) { + help_haplotypes(argv, false); + std::exit(EXIT_FAILURE); + } +} + +//---------------------------------------------------------------------------- + +void validate_haplotypes(const Haplotypes& haplotypes, + const gbwtgraph::GBWTGraph& graph, + const gbwt::FastLocate& r_index, + const HaplotypePartitioner::minimizer_index_type& minimizer_index, + size_t expected_chains, + HaplotypePartitioner::Verbosity verbosity); + +bool ends_with(const std::string& str, const std::string& suffix) { + if (str.length() < suffix.length()) { + return false; + } + return (str.substr(str.length() - suffix.length()) == suffix); +} + +std::string get_name(const std::string& graph_name, const std::string& extension) { + size_t length = graph_name.length(); + if (ends_with(graph_name, gbwtgraph::GBZ::EXTENSION)) { + length -= gbwtgraph::GBZ::EXTENSION.length(); + } + return graph_name.substr(0, length) + extension; +} + +void preprocess_graph(const gbwtgraph::GBZ& gbz, Haplotypes& haplotypes, HaplotypesConfig& config) { + double start = gbwt::readTimer(); + if (config.verbosity >= Haplotypes::verbosity_basic) { + std::cerr << "Generating haplotype information" << std::endl; + } + + // Distance index. + if (config.distance_name.empty()) { + config.distance_name = get_name(config.graph_name, ".dist"); + if (config.verbosity >= Haplotypes::verbosity_basic) { + std::cerr << "Guessing that distance index is " << config.distance_name << std::endl; + } + } + SnarlDistanceIndex distance_index; + if (config.verbosity >= Haplotypes::verbosity_basic) { + std::cerr << "Loading distance index from " << config.distance_name << std::endl; + } + distance_index.deserialize(config.distance_name); + size_t expected_chains = 0; + distance_index.for_each_child(distance_index.get_root(), [&](const handlegraph::net_handle_t&) { + expected_chains++; + }); + + // Minimizer index. 
+ HaplotypePartitioner::minimizer_index_type minimizer_index(config.k, config.w, false); + { + double minimizer = gbwt::readTimer(); + if (config.verbosity >= Haplotypes::verbosity_basic) { + std::cerr << "Building minimizer index" << std::endl; + } + gbwtgraph::index_haplotypes(gbz.graph, minimizer_index); + if (config.verbosity >= Haplotypes::verbosity_basic) { + double seconds = gbwt::readTimer() - minimizer; + std::cerr << "Built the minimizer index in " << seconds << " seconds" << std::endl; + } + } + + // R-index. + if (config.r_index_name.empty()) { + config.r_index_name = get_name(config.graph_name, gbwt::FastLocate::EXTENSION); + if (config.verbosity >= Haplotypes::verbosity_basic) { + std::cerr << "Guessing that r-index is " << config.r_index_name << std::endl; + } + } + gbwt::FastLocate r_index; + load_r_index(r_index, config.r_index_name, config.verbosity >= Haplotypes::verbosity_basic); + r_index.setGBWT(gbz.index); + + // Partition the haplotypes. + HaplotypePartitioner partitioner(gbz, r_index, distance_index, minimizer_index, config.verbosity); + try { + haplotypes = partitioner.partition_haplotypes(config.partitioner_parameters); + } + catch (const std::runtime_error& e) { + std::cerr << e.what() << std::endl; + std::exit(EXIT_FAILURE); + } + if (config.verbosity >= Haplotypes::verbosity_basic) { + double seconds = gbwt::readTimer() - start; + std::cerr << "Generated haplotype information in " << seconds << " seconds" << std::endl; + } + + // Validate the haplotypes. + if (config.validate) { + validate_haplotypes(haplotypes, gbz.graph, r_index, minimizer_index, expected_chains, config.verbosity); + } +} + +//---------------------------------------------------------------------------- + +size_t threads_to_jobs(size_t threads) { + size_t jobs = std::round(0.85 * threads); + return std::max(jobs, size_t(1)); +} + +void validate_subgraph(const gbwtgraph::GBWTGraph& graph, const gbwtgraph::GBWTGraph& subgraph, HaplotypePartitioner::Verbosity verbosity); + +void sample_haplotypes(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const HaplotypesConfig& config) { + omp_set_num_threads(threads_to_jobs(config.threads)); + Recombinator recombinator(gbz, config.verbosity); + gbwt::GBWT merged = recombinator.generate_haplotypes(haplotypes, config.kmer_input, config.recombinator_parameters); + omp_set_num_threads(config.threads); // Restore the number of threads. + + // Build and serialize GBWTGraph. + if (config.verbosity >= Haplotypes::verbosity_basic) { + std::cerr << "Building GBWTGraph" << std::endl; + } + double checkpoint = gbwt::readTimer(); + gbwtgraph::GBWTGraph output_graph = gbz.graph.subgraph(merged); + if (config.verbosity >= Haplotypes::verbosity_basic) { + double seconds = gbwt::readTimer() - checkpoint; + std::cerr << "Built the GBWTGraph in " << seconds << " seconds" << std::endl; + } + save_gbz(merged, output_graph, config.gbz_output, config.verbosity >= Haplotypes::verbosity_basic); + + // Validate the graph. + if (config.validate) { + // TODO: How could we validate the haplotypes? 
+ validate_subgraph(gbz.graph, output_graph, config.verbosity); + } +} + +//---------------------------------------------------------------------------- + +gbwt::size_type path_for_contig(const gbwtgraph::GBZ& gbz, gbwt::size_type contig_id, const std::string& contig_name) { + gbwt::size_type path_id = gbz.index.metadata.paths(); + size_t found_paths = 0; + for (size_t i = 0; i < gbz.graph.named_paths.size(); i++) { + gbwt::size_type candidate = gbz.graph.named_paths[i].id; + if (gbz.index.metadata.path(candidate).contig == contig_id) { + path_id = candidate; + found_paths++; + } + } + if (found_paths != 1) { + std::cerr << "error: [vg haplotypes] found " << found_paths << " named/reference paths for contig " << contig_name << std::endl; + std::exit(EXIT_FAILURE); + } + return path_id; +} + +std::pair seq_chain_for_path(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, gbwt::size_type path_id, const std::string& contig_name) { + gbwt::size_type sequence_id = gbwt::Path::encode(path_id, false); + gbwt::size_type reverse_id = gbwt::Path::encode(path_id, true); + size_t found_chains = 0; + std::pair result(gbwt::invalid_sequence(), haplotypes.components()); + for (size_t chain_id = 0; chain_id < haplotypes.components(); chain_id++) { + const Haplotypes::Subchain& subchain = haplotypes.chains[chain_id].subchains.front(); + for (size_t i = 0; i < subchain.sequences.size(); i++) { + if (subchain.sequences[i].first == sequence_id) { + result.first = sequence_id; + result.second = chain_id; + found_chains++; + break; + } + if (subchain.sequences[i].first == reverse_id) { + result.first = reverse_id; + result.second = chain_id; + found_chains++; + break; + } + } + } + if (found_chains != 1) { + std::cerr << "error: [vg haplotypes] found " << found_chains << " top-level chains for contig " << contig_name << std::endl; + std::exit(EXIT_FAILURE); + } + return result; +} + +struct ReferenceInterval { + enum order { before, overlap, after }; + + Haplotypes::Subchain::subchain_t type; + + size_t id; + + // Semiopen range of reference positions for the internal parts of the subchain. + size_t start, end; + + // Where is this interval relative to the specified interval? + order compare(std::pair interval) { + if (this->end <= interval.first) { + return before; + } else if (this->start >= interval.second) { + return after; + } else { + return overlap; + } + } + + size_t length() const { + return this->end - this->start; + } + + std::string to_string() const { + std::string result; + switch (this->type) { + case Haplotypes::Subchain::normal: + result.push_back('N'); + break; + case Haplotypes::Subchain::prefix: + result.push_back('P'); + break; + case Haplotypes::Subchain::suffix: + result.push_back('S'); + break; + case Haplotypes::Subchain::full_haplotype: + result.push_back('F'); + break; + } + result += std::to_string(this->id) + "(" + std::to_string(this->start) + ".." + std::to_string(this->end) + ")"; + return result; + } +}; + +std::vector subchain_intervals(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, gbwt::size_type sequence_id, size_t chain_id, bool reverse) { + gbwt::size_type actual_sequence_id = (reverse ? 
gbwt::Path::reverse(sequence_id) : sequence_id); + gbwt::vector_type path = gbz.index.extract(actual_sequence_id); + size_t total_length = 0; + for (auto gbwt_node : path) { + total_length += gbz.graph.get_length(gbwtgraph::GBWTGraph::node_to_handle(gbwt_node)); + } + + const Haplotypes::TopLevelChain& chain = haplotypes.chains[chain_id]; + std::vector result; + size_t seq_offset = 0, node_offset = 0; + for (size_t subchain_id = 0; subchain_id < chain.subchains.size(); subchain_id++) { + size_t actual_subchain_id; + Haplotypes::Subchain subchain; + if (reverse) { + actual_subchain_id = chain.subchains.size() - 1 - subchain_id; + switch (chain.subchains[actual_subchain_id].type) { + case Haplotypes::Subchain::prefix: + subchain.type = Haplotypes::Subchain::suffix; + break; + case Haplotypes::Subchain::suffix: + subchain.type = Haplotypes::Subchain::prefix; + break; + default: + subchain.type = chain.subchains[actual_subchain_id].type; + break; + } + subchain.start = gbwt::Node::reverse(chain.subchains[actual_subchain_id].end); + subchain.end = gbwt::Node::reverse(chain.subchains[actual_subchain_id].start); + } else { + actual_subchain_id = subchain_id; + subchain.type = chain.subchains[actual_subchain_id].type; + subchain.start = chain.subchains[actual_subchain_id].start; + subchain.end = chain.subchains[actual_subchain_id].end; + } + ReferenceInterval interval { subchain.type, actual_subchain_id, 0, total_length }; + if (subchain.has_start()) { + while (node_offset < path.size() && path[node_offset] != subchain.start) { + seq_offset += gbz.graph.get_length(gbwtgraph::GBWTGraph::node_to_handle(path[node_offset])); + node_offset++; + } + if (node_offset < path.size()) { + seq_offset += gbz.graph.get_length(gbwtgraph::GBWTGraph::node_to_handle(path[node_offset])); + node_offset++; + } + interval.start = seq_offset; + } else if (subchain.type == Haplotypes::Subchain::prefix) { + // If a prefix follows a suffix, they cover the same interval. + interval.start = result.back().start; + } + if (subchain.has_end()) { + while (node_offset < path.size() && path[node_offset] != subchain.end) { + seq_offset += gbz.graph.get_length(gbwtgraph::GBWTGraph::node_to_handle(path[node_offset])); + node_offset++; + } + interval.end = seq_offset; + // If a prefix follows a suffix, they cover the same interval. + if (subchain.type == Haplotypes::Subchain::prefix) { + result.back().end = interval.end; + } + } + result.push_back(interval); + } + + return result; +} + +void map_variants(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const HaplotypesConfig& config) { + if (!gbz.index.metadata.hasContigNames()) { + std::cerr << "error: [vg haplotypes] cannot map variant positions without contig names in the GBWT index" << std::endl; + } + + // Read variants from the VCF file. + if (config.verbosity >= Haplotypes::verbosity_basic) { + std::cerr << "Reading VCF file " << config.vcf_input << std::endl; + } + vcflib::VariantCallFile variant_file; + variant_file.parseSamples = false; // Just in case there are many samples. + std::string temp_filename = config.vcf_input; + variant_file.open(temp_filename); + if (!variant_file.is_open()) { + std::cerr << "error: [vg haplotypes] cannot open VCF file " << config.vcf_input << std::endl; + std::exit(EXIT_FAILURE); + } + std::unordered_map contig_to_offset; // VCF contig name to offset in `variant positions`. + std::vector>> variant_positions; // Semiopen 0-based ranges of sequence positions. 
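ReferenceInterval::compare above classifies a semiopen [start, end) subchain interval as before, overlapping, or after a semiopen query range, which is what lets the variant-mapping code below binary-search the sorted intervals and then widen a hit to every overlapping subchain. A tiny standalone check of that classification rule:

#include <cassert>
#include <cstdio>
#include <utility>

enum class Order { before, overlap, after };

// Classify semiopen [start, end) against a semiopen query [first, second).
static Order compare(size_t start, size_t end, std::pair<size_t, size_t> query) {
    if (end <= query.first) return Order::before;    // interval ends at or before the query starts
    if (start >= query.second) return Order::after;  // interval starts at or after the query ends
    return Order::overlap;
}

int main() {
    assert(compare(0, 10, {10, 20}) == Order::before);   // touching endpoints do not overlap
    assert(compare(30, 40, {10, 20}) == Order::after);
    assert(compare(15, 25, {10, 20}) == Order::overlap);
    std::puts("semiopen interval checks passed");
    return 0;
}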
+ vcflib::Variant var(variant_file); + size_t total_variants = 0; + while (variant_file.is_open() && variant_file.getNextVariant(var)) { + size_t offset; + auto iter = contig_to_offset.find(var.sequenceName); + if (iter == contig_to_offset.end()) { + offset = variant_positions.size(); + contig_to_offset[var.sequenceName] = offset; + variant_positions.push_back({}); + } else { + offset = iter->second; + } + size_t start = var.zeroBasedPosition(); + variant_positions[offset].push_back({ start, start + var.ref.length() }); + total_variants++; + } + for (auto& positions : variant_positions) { + std::sort(positions.begin(), positions.end()); + } + if (config.verbosity >= Haplotypes::verbosity_detailed) { + std::cerr << "Read " << total_variants << " variants over " << variant_positions.size() << " contigs" << std::endl; + } + + // Map VCF contig names to GBWT sequence ids for named/reference paths and top-level chain. + std::vector contig_names(contig_to_offset.size(), ""); + std::vector> offset_to_seq_chain(contig_to_offset.size(), { gbwt::invalid_sequence(), haplotypes.components() }); + for (auto iter = contig_to_offset.begin(); iter != contig_to_offset.end(); ++iter) { + std::string contig_name = config.contig_prefix + iter->first; + gbwt::size_type contig_id = gbz.index.metadata.contig(contig_name); + if (contig_id >= gbz.index.metadata.contigs()) { + std::cerr << "error: [vg haplotypes] no contig " << contig_name << " in the GBWT index" << std::endl; + std::exit(EXIT_FAILURE); + } + contig_names[iter->second] = contig_name; + gbwt::size_type path_id = path_for_contig(gbz, contig_id, contig_name); + std::pair seq_chain = seq_chain_for_path(gbz, haplotypes, path_id, contig_name); + offset_to_seq_chain[iter->second] = seq_chain; + if (config.verbosity >= Haplotypes::verbosity_debug) { + std::cerr << "VCF contig " << iter->first << ", GBWT contig " << contig_name + << ": contig id " << contig_id + << ", path id " << path_id + << ", reverse " << gbwt::Path::is_reverse(seq_chain.first) + << ", chain " << seq_chain.second << std::endl; + } + } + + // Output (contig[interval], top-level chain, subchains, subchain lengths) + for (auto iter = contig_to_offset.begin(); iter != contig_to_offset.end(); ++iter) { + std::string contig_name = config.contig_prefix + iter->first; + size_t offset = iter->second; + gbwt::size_type sequence_id = offset_to_seq_chain[offset].first; + gbwt::size_type chain_id = offset_to_seq_chain[offset].second; + auto ref_intervals = subchain_intervals(gbz, haplotypes, sequence_id, chain_id, gbwt::Path::is_reverse(sequence_id)); + for (auto interval : variant_positions[offset]) { + size_t low = 0, high = ref_intervals.size(); + bool found = false; + while (!found && low < high) { + size_t mid = low + (high - low) / 2; + switch (ref_intervals[mid].compare(interval)) { + case ReferenceInterval::before: + low = mid + 1; + break; + case ReferenceInterval::overlap: + low = mid; + while (low > 0 && ref_intervals[low - 1].compare(interval) == ReferenceInterval::overlap) { + low--; + } + high = mid + 1; + while (high < ref_intervals.size() && ref_intervals[high].compare(interval) == ReferenceInterval::overlap) { + high++; + } + found = true; + break; + case ReferenceInterval::after: + high = mid; + break; + } + } + std::cout << iter->first << "[" << interval.first << ".." 
<< interval.second << "]\t" << chain_id << "\t"; + if (low >= high) { + if (low > 0) { + std::cout << ref_intervals[low - 1].to_string(); + } + std::cout << ".."; + if (low < ref_intervals.size()) { + std::cout << ref_intervals[low].to_string(); + } + } else { + for (size_t i = low; i < high; i++) { + if (i > low) { + std::cout << ","; + } + std::cout << ref_intervals[i].to_string(); + } + } + std::cout << "\t"; + for (size_t i = low; i < high; i++) { + if (i > low) { + std::cout << ","; + } + std::cout << ref_intervals[i].length(); + } + std::cout << std::endl; + } + } +} +//---------------------------------------------------------------------------- + +void extract_haplotypes(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const HaplotypesConfig& config) { + if (config.verbosity >= Haplotypes::verbosity_basic) { + std::cerr << "Extracting haplotypes from chain " << config.chain_id << ", subchain " << config.subchain_id << std::endl; + } + + Recombinator recombinator(gbz, config.verbosity); + auto result = recombinator.extract_sequences( + haplotypes, config.kmer_input, + config.chain_id, config.subchain_id, config.recombinator_parameters + ); + if (config.verbosity >= Haplotypes::verbosity_detailed) { + std::cerr << "Found " << result.size() << " haplotypes" << std::endl; + } + for (auto& sequence : result) { + write_fasta_sequence(sequence.name, sequence.sequence, std::cout); + } + + if (!config.score_output.empty()) { + std::ofstream out(config.score_output, std::ios_base::binary); + if (!out) { + std::cerr << "error: [vg haplotypes] cannot open score file " << config.score_output << " for writing" << std::endl; + return; + } + for (auto& sequence : result) { + out << sequence.name; + for (size_t i = 0; i < config.recombinator_parameters.num_haplotypes; i++) { + if (i < sequence.scores.size()) { + out << "\t" << sequence.scores[i].first << "\t" << sequence.scores[i].second; + } else { + out << "\t-\t-"; + } + } + out << "\n"; + } + } +} + +//---------------------------------------------------------------------------- + +void validate_error(const std::string& header, const std::string& message) { + std::cerr << "error: [vg haplotypes] "; + if (!header.empty()) { + std::cerr << header << ": "; + } + std::cerr << message << std::endl; + std::exit(EXIT_FAILURE); +} + +template +std::string expected_got(T expected, T got) { + return "expected " + std::to_string(expected) + ", got " + std::to_string(got); +} + +template +std::string pair_to_string(std::pair value) { + return "(" + std::to_string(value.first) + ", " + std::to_string(value.second) + ")"; +} + +void validate_error_chain(size_t chain_id, const std::string& message) { + validate_error("chain " + std::to_string(chain_id), message); +} + +void validate_error_subchain(size_t chain_id, size_t subchain_id, const std::string& message) { + validate_error("chain " + std::to_string(chain_id) + ", subchain " + std::to_string(subchain_id), message); +} + +void validate_error_sequence(size_t chain_id, size_t subchain_id, size_t sequence_id, const std::string& message) { + std::string header = "chain " + std::to_string(chain_id) + ", subchain " + std::to_string(subchain_id) + ", sequence " + std::to_string(sequence_id); + validate_error(header, message); +} + +std::string validate_unary_path(const HandleGraph& graph, handle_t from, handle_t to) { + hash_set visited; + handle_t curr = from; + while (curr != to) { + if (visited.find(curr) != visited.end()) { + return "incoming path contains a cycle"; + } + visited.insert(curr); + 
handle_t successor = empty_gbwtgraph_handle(); + size_t successors = 0; + graph.follow_edges(curr, false, [&](const handle_t& next) { + successor = next; + successors++; + }); + if (successors != 1) { + return "incoming path is not unary"; + } + curr = successor; + } + return ""; +} + +// Returns true if the path from (start, offset) reaches end without revisiting start. +bool trace_path(const gbwt::GBWT& index, gbwt::node_type start, gbwt::size_type offset, gbwt::node_type end) { + gbwt::edge_type pos(start, offset); + while (pos.first != end) { + pos = index.LF(pos); + if (pos.first == gbwt::ENDMARKER || pos.first == start) { + return false; + } + } + return true; +} + +// Returns the given haplotype over the given subchain. +std::string get_haplotype(const gbwtgraph::GBWTGraph& graph, Haplotypes::sequence_type sequence, + gbwt::node_type from, gbwt::node_type to, size_t k) { + std::string haplotype; + gbwt::edge_type pos; + + // Initial node with three cases (from start, suffix of a long `from`, short `from`). + if (from == gbwt::ENDMARKER) { + pos = graph.index->start(sequence.first); + gbwtgraph::view_type view = graph.get_sequence_view(gbwtgraph::GBWTGraph::node_to_handle(pos.first)); + haplotype.append(view.first, view.second); + } else { + pos = gbwt::edge_type(from, sequence.second); + gbwtgraph::view_type view = graph.get_sequence_view(gbwtgraph::GBWTGraph::node_to_handle(pos.first)); + if (view.second >= k) { + haplotype.append(view.first + view.second - (k - 1), k - 1); + } else { + haplotype.append(view.first, view.second); + } + } + + while (true) { + pos = graph.index->LF(pos); + if (pos.first == gbwt::ENDMARKER) { + break; + } + gbwtgraph::view_type view = graph.get_sequence_view(gbwtgraph::GBWTGraph::node_to_handle(pos.first)); + if (pos.first == to) { + haplotype.append(view.first, std::min(view.second, k - 1)); + break; + } else { + haplotype.append(view.first, view.second); + } + } + + return haplotype; +} + +void validate_chain(const Haplotypes::TopLevelChain& chain, + const gbwtgraph::GBWTGraph& graph, + const gbwt::FastLocate& r_index, + const HaplotypePartitioner::minimizer_index_type& minimizer_index, + size_t chain_id, + HaplotypePartitioner::Verbosity verbosity) { + if (chain.offset != chain_id) { + validate_error_chain(chain_id, "stored id is " + std::to_string(chain.offset)); + } + if (chain.subchains.empty()) { + validate_error_chain(chain_id, "the chain is empty"); + } + + const Haplotypes::Subchain* prev = nullptr; + for (size_t subchain_id = 0; subchain_id < chain.subchains.size(); subchain_id++) { + const Haplotypes::Subchain& subchain = chain.subchains[subchain_id]; + + // Check that the subchain is of an appropriate type. + switch (subchain.type) { + case Haplotypes::Subchain::normal: + break; + case Haplotypes::Subchain::prefix: + if (subchain_id > 0 && prev->type != Haplotypes::Subchain::suffix) { + validate_error_subchain(chain_id, subchain_id, "a prefix inside a fragment"); + } + break; + case Haplotypes::Subchain::suffix: + break; + case Haplotypes::Subchain::full_haplotype: + if (chain.subchains.size() != 1) { + validate_error_subchain(chain_id, subchain_id, "full haplotypes in a nontrivial chain"); + } + break; + } + + // Check that the boundary nodes have been defined. 
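Editor's note, not part of the patch: `get_haplotype()` above keeps only the last k-1 bases of the subchain's start node and the first k-1 bases of its end node, since only those can take part in k-mers that cross the subchain boundary. A minimal standalone sketch of that trimming, using plain strings instead of GBWT positions (the function name `assemble_subchain` and the example sequences are hypothetical):

```cpp
// Simplified model of the boundary trimming in get_haplotype(): take a suffix
// of the start node and a prefix of the end node, each at most k-1 bases, and
// the full sequence of everything in between.
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

std::string assemble_subchain(const std::vector<std::string>& node_seqs, size_t k) {
    std::string haplotype;
    for (size_t i = 0; i < node_seqs.size(); i++) {
        const std::string& seq = node_seqs[i];
        if (i == 0) {
            // Suffix of the start node, at most k-1 bases.
            size_t take = std::min(seq.size(), k - 1);
            haplotype.append(seq.substr(seq.size() - take));
        } else if (i + 1 == node_seqs.size()) {
            // Prefix of the end node, at most k-1 bases.
            haplotype.append(seq.substr(0, std::min(seq.size(), k - 1)));
        } else {
            haplotype.append(seq);
        }
    }
    return haplotype;
}

int main() {
    // With k = 4, only "TTA" from the start node and "GGC" from the end node survive.
    std::cout << assemble_subchain({"GATTA", "CA", "GGCAT"}, 4) << std::endl; // TTACAGGC
}
```

The real code walks the haplotype with `graph.index->LF()` instead of iterating a vector, but the boundary arithmetic is the same.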
+ if (subchain.has_start() && subchain.start == gbwt::ENDMARKER) { + validate_error_subchain(chain_id, subchain_id, "missing start node"); + } + if (subchain.has_end() && subchain.end == gbwt::ENDMARKER) { + validate_error_subchain(chain_id, subchain_id, "missing end node"); + } + + // Check that the kmer presence bitvector is of appropriate length. + size_t total_kmers = subchain.sequences.size() * subchain.kmers.size(); + if (subchain.kmers_present.size() != total_kmers) { + std::string message = expected_got(total_kmers, subchain.kmers_present.size()) + " kmer occurrences"; + validate_error_subchain(chain_id, subchain_id, message); + } + + // Check that there is a unary path from the previous subchain if the + // appropriate boundary nodes are present. + if (subchain_id > 0 && prev->has_end() && subchain.has_start()) { + std::string message = validate_unary_path(graph, gbwtgraph::GBWTGraph::node_to_handle(prev->end), gbwtgraph::GBWTGraph::node_to_handle(subchain.start)); + if (!message.empty()) { + validate_error_subchain(chain_id, subchain_id, message); + } + } + + // Sequences: normal subchains. + if (subchain.type == Haplotypes::Subchain::normal) { + std::vector da = r_index.decompressDA(subchain.start); + hash_set selected; + for (size_t i = 0; i < da.size(); i++) { + if (trace_path(*(graph.index), subchain.start, i, subchain.end)) { + selected.insert(Haplotypes::sequence_type(da[i], i)); + } + } + if (subchain.sequences.size() != selected.size()) { + std::string message = expected_got(selected.size(), subchain.sequences.size()) + " sequences (normal)"; + validate_error_subchain(chain_id, subchain_id, message); + } + for (size_t i = 0; i < subchain.sequences.size(); i++) { + if (selected.find(subchain.sequences[i]) == selected.end()) { + std::string message = "invalid value " + pair_to_string(subchain.sequences[i]); + validate_error_sequence(chain_id, subchain_id, i, message); + } + } + } + + // Sequences: prefixes and suffixes. + if (subchain.type == Haplotypes::Subchain::prefix || subchain.type == Haplotypes::Subchain::suffix) { + gbwt::node_type node = (subchain.has_start() ? subchain.start : subchain.end); + std::vector da = r_index.decompressDA(node); + if (subchain.sequences.size() != da.size()) { + std::string message = expected_got(da.size(), subchain.sequences.size()) + " sequences (prefix / suffix)"; + validate_error_subchain(chain_id, subchain_id, message); + } + hash_set truth; + for (size_t i = 0; i < da.size(); i++) { + truth.insert({ da[i], i }); + } + for (size_t i = 0; i < subchain.sequences.size(); i++) { + if (truth.find(subchain.sequences[i]) == truth.end()) { + std::string message = "invalid value " + pair_to_string(subchain.sequences[i]); + validate_error_sequence(chain_id, subchain_id, i, message); + } + } + } + + // Sequences: full haplotypes. + if (subchain.type == Haplotypes::Subchain::full_haplotype) { + if (subchain.sequences.empty()) { + validate_error_subchain(chain_id, subchain_id, "full haplotypes without sequences"); + } + } + + // Kmers. 
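Editor's note, not part of the patch: `kmers_present` is a flat bitvector laid out as a sequences-by-kmers matrix in row-major order, which is why the length check above multiplies the two sizes and the loops below index it as `i * subchain.kmers.size() + j`. A toy illustration of the layout (all names and sizes hypothetical):

```cpp
// Row-major sequences-by-kmers presence matrix, modeled with std::vector<bool>.
#include <cassert>
#include <cstddef>
#include <vector>

int main() {
    size_t num_sequences = 3, num_kmers = 4;
    std::vector<bool> kmers_present(num_sequences * num_kmers, false);

    // Mark kmer 2 as present in sequence 1.
    size_t i = 1, j = 2;
    kmers_present[i * num_kmers + j] = true;

    // The length check in validate_chain() is the same product.
    assert(kmers_present.size() == num_sequences * num_kmers);
    assert(kmers_present[1 * num_kmers + 2]);
    return 0;
}
```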
+ if (subchain.type != Haplotypes::Subchain::full_haplotype) { + hash_set all_kmers; + for (size_t i = 0; i < subchain.kmers.size(); i++) { + all_kmers.insert(subchain.kmers[i].first); + } + if (all_kmers.size() != subchain.kmers.size()) { + std::string message = expected_got(subchain.kmers.size(), all_kmers.size()) + " kmers"; + validate_error_subchain(chain_id, subchain_id, message); + } + hash_map used_kmers; // (kmer used in haplotypes, number of sequences that contain it) + hash_map missing_kmers; // (kmer not used in haplotypes, number of sequences that contain it) + for (size_t i = 0; i < subchain.sequences.size(); i++) { + std::string haplotype = get_haplotype(graph, subchain.sequences[i], subchain.start, subchain.end, minimizer_index.k()); + auto minimizers = minimizer_index.minimizers(haplotype); + hash_map unique_minimizers; // (kmer, used in the sequence) + for (auto& minimizer : minimizers) { + if (minimizer_index.count(minimizer) == 1) { + unique_minimizers[minimizer.key.get_key()] = false; + } + } + for (size_t j = 0, offset = i * subchain.kmers.size(); j < subchain.kmers.size(); j++, offset++) { + if (subchain.kmers_present[offset]) { + auto iter = unique_minimizers.find(subchain.kmers[j].first); + if (iter == unique_minimizers.end()) { + std::string message = "kmer " + std::to_string(j) + " not present in the haplotype"; + validate_error_sequence(chain_id, subchain_id, i, message); + } + used_kmers[subchain.kmers[j].first]++; + iter->second = true; + } else { + if (unique_minimizers.find(subchain.kmers[j].first) != unique_minimizers.end()) { + std::string message = "kmer " + std::to_string(j) + " is present in the haplotype"; + validate_error_sequence(chain_id, subchain_id, i, message); + } + } + } + for (auto iter = unique_minimizers.begin(); iter != unique_minimizers.end(); ++iter) { + if (!iter->second) { + missing_kmers[iter->first]++; + } + } + } + size_t invalid_count = 0; + for (size_t kmer_id = 0; kmer_id < subchain.kmers.size(); kmer_id++) { + size_t count = 0; + auto iter = used_kmers.find(subchain.kmers[kmer_id].first); + if (iter == used_kmers.end() || iter->second != subchain.kmers[kmer_id].second) { + invalid_count++; + } + } + if (invalid_count > 0) { + std::string message = "invalid occurrence count for "+ std::to_string(invalid_count) + " kmers"; + validate_error_subchain(chain_id, subchain_id, message); + } + size_t missing_informative_kmers = 0; + for (auto iter = missing_kmers.begin(); iter != missing_kmers.end(); ++iter) { + if (iter->second < subchain.sequences.size()) { + missing_informative_kmers++; + } + } + if (missing_informative_kmers > 0) { + std::string message = "missing " + std::to_string(missing_informative_kmers) + " informative kmers"; + validate_error_subchain(chain_id, subchain_id, message); + } + } + + prev = &subchain; + } +} + +std::string subchain_to_string(size_t chain_id, size_t subchain_id, const Haplotypes::Subchain& subchain) { + return "chain " + std::to_string(chain_id) + ", subchain " + std::to_string(subchain_id) + " (" + subchain.to_string() + ")"; +} + +void validate_haplotypes(const Haplotypes& haplotypes, + const gbwtgraph::GBWTGraph& graph, + const gbwt::FastLocate& r_index, + const HaplotypePartitioner::minimizer_index_type& minimizer_index, + size_t expected_chains, + HaplotypePartitioner::Verbosity verbosity) { + if (verbosity >= Haplotypes::verbosity_basic) { + std::cerr << "Validating the haplotype information" << std::endl; + } + double start = gbwt::readTimer(); + + // Header information. 
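Editor's note, not part of the patch: the occurrence-count check just above reduces to "the number of sequences whose presence bit is set for kmer j must equal the stored count `kmers[j].second`". A self-contained toy version of that bookkeeping with plain containers (all names and values hypothetical):

```cpp
// Count, per kmer, how many sequences claim it and compare with the stored counts.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <unordered_map>
#include <utility>
#include <vector>

int main() {
    // kmers[j] = {kmer key, expected number of sequences containing it}
    std::vector<std::pair<uint64_t, size_t>> kmers { {101, 2}, {202, 1} };
    // present[sequence][kmer] plays the role of the kmers_present bitvector.
    std::vector<std::vector<bool>> present { {true, false}, {true, true} };

    std::unordered_map<uint64_t, size_t> used;
    for (const auto& row : present) {
        for (size_t j = 0; j < kmers.size(); j++) {
            if (row[j]) { used[kmers[j].first]++; }
        }
    }
    for (const auto& kmer : kmers) {
        // Otherwise validate_chain() reports an invalid occurrence count.
        assert(used[kmer.first] == kmer.second);
    }
    return 0;
}
```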
+ if (haplotypes.k() != minimizer_index.k()) { + validate_error("k-mer length", expected_got(minimizer_index.k(), haplotypes.k())); + } + if (haplotypes.components() != expected_chains) { + validate_error("graph components", expected_got(expected_chains, haplotypes.components())); + } + if (haplotypes.components() != haplotypes.chains.size()) { + validate_error("top-level chains", expected_got(haplotypes.components(), haplotypes.chains.size())); + } + std::vector chains_per_job(haplotypes.jobs(), 0); + for (size_t chain = 0; chain < haplotypes.components(); chain++) { + size_t job_id = haplotypes.chains[chain].job_id; + if (job_id >= haplotypes.jobs()) { + validate_error_chain(chain, "job id " + std::to_string(job_id) + " >= " + std::to_string(haplotypes.jobs())); + } + chains_per_job[job_id]++; + } + for (size_t job_id = 0; job_id < chains_per_job.size(); job_id++) { + if (chains_per_job[job_id] == 0) { + validate_error("", "job " + std::to_string(job_id) + " is empty"); + } + } + + // Cached paths. + if (haplotypes.jobs_for_cached_paths.size() != graph.named_paths.size()) { + validate_error("cached paths", expected_got(graph.named_paths.size(), haplotypes.jobs_for_cached_paths.size())); + } + + // Haplotype information is valid + if (verbosity >= HaplotypePartitioner::Verbosity::verbosity_detailed) { + std::cerr << "Validating subchains, sequences, and kmers" << std::endl; + } + #pragma omp parallel for schedule(dynamic, 1) + for (size_t chain = 0; chain < haplotypes.components(); chain++) { + validate_chain(haplotypes.chains[chain], graph, r_index, minimizer_index, chain, verbosity); + } + + // Kmers are globally unique. + if (verbosity >= HaplotypePartitioner::Verbosity::verbosity_detailed) { + std::cerr << "Validating kmer specificity" << std::endl; + } + hash_map> kmers; + for (size_t chain_id = 0; chain_id < haplotypes.components(); chain_id++) { + const Haplotypes::TopLevelChain& chain = haplotypes.chains[chain_id]; + for (size_t subchain_id = 0; subchain_id < chain.subchains.size(); subchain_id++) { + const Haplotypes::Subchain& subchain = chain.subchains[subchain_id]; + for (size_t i = 0; i < subchain.kmers.size(); i++) { + auto iter = kmers.find(subchain.kmers[i].first); + if (iter != kmers.end()) { + const Haplotypes::Subchain& prev = haplotypes.chains[iter->second.first].subchains[iter->second.second]; + if (chain_id == iter->second.first && subchain_id == iter->second.second + 1 && subchain.type == Haplotypes::Subchain::prefix && prev.type == Haplotypes::Subchain::suffix) { + // A prefix subchain may overlap the preceding suffix subchain and + // contain the same kmers. 
+ } else { + std::string message = subchain.to_string() + ": kmer " + std::to_string(i) + " also found in " + subchain_to_string(iter->second.first, iter->second.second, prev); + validate_error_subchain(chain_id, subchain_id, message); + } + } + kmers[subchain.kmers[i].first] = { chain_id, subchain_id }; + } + } + } + kmers.clear(); + + if (verbosity >= Haplotypes::verbosity_basic) { + double seconds = gbwt::readTimer() - start; + std::cerr << "Validated the haplotype information in " << seconds << " seconds" << std::endl; + } +} + +//---------------------------------------------------------------------------- + +void validate_nodes(const gbwtgraph::GBWTGraph& graph, const gbwtgraph::GBWTGraph& subgraph) { + nid_t last_node = 0; + bool nodes_ok = subgraph.for_each_handle([&](const handle_t& handle) -> bool { + last_node = subgraph.get_id(handle); + return graph.has_node(last_node); + }); + if (!nodes_ok) { + validate_error("", "invalid node " + std::to_string(last_node)); + } +} + +void validate_edges(const gbwtgraph::GBWTGraph& graph, const gbwtgraph::GBWTGraph& subgraph) { + edge_t last_edge(gbwtgraph::GBWTGraph::node_to_handle(0), gbwtgraph::GBWTGraph::node_to_handle(0)); + bool edges_ok = subgraph.for_each_edge([&](const edge_t& edge) -> bool { + last_edge = edge; + return graph.has_edge(edge.first, edge.second); + }); + if (!edges_ok) { + validate_error("", "invalid edge " + to_string_gbwtgraph(last_edge.first) + " to " + to_string_gbwtgraph(last_edge.second)); + } +} + +void validate_subgraph(const gbwtgraph::GBWTGraph& graph, const gbwtgraph::GBWTGraph& subgraph, HaplotypePartitioner::Verbosity verbosity) { + if (verbosity >= Haplotypes::verbosity_basic) { + std::cerr << "Validating the output subgraph" << std::endl; + } + double start = gbwt::readTimer(); + + std::thread nodes(validate_nodes, std::cref(graph), std::cref(subgraph)); + std::thread edges(validate_edges, std::cref(graph), std::cref(subgraph)); + nodes.join(); + edges.join(); + + if (verbosity >= Haplotypes::verbosity_basic) { + double seconds = gbwt::readTimer() - start; + std::cerr << "Validated the subgraph in " << seconds << " seconds" << std::endl; + } +} + +//---------------------------------------------------------------------------- + diff --git a/src/subcommand/help_main.cpp b/src/subcommand/help_main.cpp index f6bee35f000..6a113a3a3b1 100644 --- a/src/subcommand/help_main.cpp +++ b/src/subcommand/help_main.cpp @@ -46,6 +46,8 @@ int main_help(int argc, char** argv){ cerr << endl; } + cerr << "For technical support, please visit: https://www.biostars.org/tag/vg/" << endl << endl; + return 0; } diff --git a/src/subcommand/ids_main.cpp b/src/subcommand/ids_main.cpp index caf1722d6e4..0646bb34605 100644 --- a/src/subcommand/ids_main.cpp +++ b/src/subcommand/ids_main.cpp @@ -14,8 +14,13 @@ #include "../vg.hpp" #include "../vg_set.hpp" -#include "../algorithms/topological_sort.hpp" - +#include +#include +#include +#include "bdsg/packed_graph.hpp" +#include "bdsg/hash_graph.hpp" +#include +#include "../io/save_handle_graph.hpp" #include using namespace std; @@ -110,19 +115,50 @@ int main_ids(int argc, char** argv) { } if (!join && mapping_name.empty()) { - VG* graph; - get_input_file(optind, argc, argv, [&](istream& in) { - graph = new VG(in); - }); - - if (sort) { - // Set up the nodes so we go through them in topological order - algorithms::sort(graph); - } - - if (compact || sort) { - // Compact only, or compact to re-assign IDs after sort - graph->compact_ids(); + unique_ptr graph; + string graph_filename = 
get_input_file_name(optind, argc, argv); + graph = vg::io::VPKG::load_one(graph_filename); + + if (sort || compact) { + // We need to reassign IDs + hash_map new_ids; + + if (compact && !sort) { + // We are compacting, but do not need to topologically sort + + // Loop over all the nodes in the graph's order and assign them new IDs in ID order. + // This is slower than it needs to be, but gets us nice results even on graphs that don't preserve node order. + // TODO: counting all the nodes may be an O(1) scan of the graph to save some vector copies. + vector all_ids; + all_ids.reserve(graph->get_node_count()); + graph->for_each_handle([&](const handle_t& h) { + all_ids.emplace_back(graph->get_id(h)); + }); + std::sort(all_ids.begin(), all_ids.end()); + + // Now invert the vector's mapping + new_ids.reserve(all_ids.size()); + for (nid_t i = 1; i < all_ids.size() + 1; i++) { + new_ids[all_ids[i - 1]] = i; + } + } else { + // We are sorting to assign IDs, which inherently compacts. + + // We only need to sort the ID numbers, not the graph's iteration order (if any). + auto handle_order = handlealgs::topological_order(graph.get()); + + // Now invert the order's mapping + new_ids.reserve(handle_order.size()); + for (nid_t i = 1; i < handle_order.size() + 1; i++) { + new_ids[graph->get_id(handle_order[i - 1])] = i; + } + } + + // Now assign the IDs. If we find any e.g. dangling paths or + // edges with no nodes we will crash. + graph->reassign_node_ids([&](const nid_t& old_id) { + return new_ids.at(old_id); + }); } if (increment != 0) { @@ -130,11 +166,10 @@ int main_ids(int argc, char** argv) { } if (decrement != 0) { - graph->decrement_node_ids(decrement); + graph->increment_node_ids(-increment); } - graph->serialize_to_ostream(std::cout); - delete graph; + vg::io::save_handle_graph(graph.get(), cout); } else { vector graph_file_names; @@ -144,7 +179,7 @@ int main_ids(int argc, char** argv) { } VGset graphs(graph_file_names); - vg::id_t max_node_id = (join ? graphs.merge_id_space() : graphs.get_max_id()); + vg::id_t max_node_id = (join ? 
graphs.merge_id_space() : graphs.max_node_id()); if (!mapping_name.empty()) { gcsa::NodeMapping mapping(max_node_id + 1); std::ofstream out(mapping_name, std::ios_base::binary); diff --git a/src/subcommand/index_main.cpp b/src/subcommand/index_main.cpp index 60abf5872ed..7bc9c34da3a 100644 --- a/src/subcommand/index_main.cpp +++ b/src/subcommand/index_main.cpp @@ -1,4 +1,4 @@ -// index.cpp: define the "vg index" subcommand, which makes xg, GCSA2, GBWT, and RocksDB indexes +// index.cpp: define the "vg index" subcommand, which makes xg, GCSA2, and GBWT indexes #include #include @@ -11,19 +11,26 @@ #include "subcommand.hpp" #include "../vg.hpp" -#include "../index.hpp" -#include "../gam_index.hpp" -#include "../stream.hpp" +#include "../haplotype_indexer.hpp" +#include "xg.hpp" +#include +#include +#include "../io/save_handle_graph.hpp" +#include "../stream_index.hpp" #include "../vg_set.hpp" #include "../utility.hpp" #include "../region.hpp" -#include "../snarls.hpp" -#include "../distance.hpp" +#include "../integrated_snarl_finder.hpp" +#include "../snarl_distance_index.hpp" +#include "../source_sink_overlay.hpp" +#include "../gbwt_helper.hpp" +#include "../gbwtgraph_helper.hpp" +#include "../gcsa_helper.hpp" -#include #include -#include #include +#include +#include using namespace std; using namespace vg; @@ -38,16 +45,16 @@ void help_index(char** argv) { << " -t, --threads N number of threads to use" << endl << " -p, --progress show progress" << endl << "xg options:" << endl - << " -x, --xg-name FILE use this file to store a succinct, queryable version of the graph(s)" << endl - << " -F, --thread-db FILE read thread database from FILE (may repeat)" << endl - << "gbwt options:" << endl + << " -x, --xg-name FILE use this file to store a succinct, queryable version of the graph(s), or read for GCSA or distance indexing" << endl + << " -L, --xg-alts include alt paths in xg" << endl + << "gbwt options (more in vg gbwt):" << endl << " -v, --vcf-phasing FILE generate threads from the haplotypes in the VCF file FILE" << endl - << " -e, --parse-only FILE store the VCF parsing with prefix FILE without generating threads" << endl + << " -W, --ignore-missing don't warn when variants in the VCF are missing from the graph; silently skip them" << endl << " -T, --store-threads generate threads from the embedded paths" << endl - << " -M, --store-gam FILE generate threads from the alignments in FILE (many allowed)" << endl + << " -M, --store-gam FILE generate threads from the alignments in gam FILE (many allowed)" << endl + << " -F, --store-gaf FILE generate threads from the alignments in gaf FILE (many allowed)" << endl << " -G, --gbwt-name FILE store the threads as GBWT in FILE" << endl - << " -H, --write-haps FILE store the threads as sequences in FILE" << endl - << " -F, --thread-db FILE write thread database to FILE" << endl + << " -z, --actual-phasing do not make unphased homozygous genotypes phased"<< endl << " -P, --force-phasing replace unphased genotypes with randomly phased ones" << endl << " -o, --discard-overlaps skip overlapping alternate alleles if the overlap cannot be resolved" << endl << " -B, --batch-size N number of samples per batch (default 200)" << endl @@ -55,11 +62,12 @@ void help_index(char** argv) { << " -n, --id-interval N store haplotype ids at one out of N positions (default 1024)" << endl << " -R, --range X..Y process samples X to Y (inclusive)" << endl << " -r, --rename V=P rename contig V in the VCFs to path P in the graph (may repeat)" << endl + << " --rename-variants 
when renaming contigs, find variants in the graph based on the new name" << endl << " -I, --region C:S-E operate on only the given 1-based region of the given VCF contig (may repeat)" << endl << " -E, --exclude SAMPLE exclude any samples with the given name from haplotype indexing" << endl << "gcsa options:" << endl - << " -g, --gcsa-out FILE output a GCSA2 index instead of a rocksdb index" << endl - << " -i, --dbg-in FILE use kmers from FILE instead of input VG (may repeat)" << endl + << " -g, --gcsa-out FILE output a GCSA2 index to the given file" << endl + //<< " -i, --dbg-in FILE use kmers from FILE instead of input VG (may repeat)" << endl << " -f, --mapping FILE use this node mapping in GCSA2 construction" << endl << " -k, --kmer-size N index kmers of size N in the graph (default " << gcsa::Key::MAX_LENGTH << ")" << endl << " -X, --doubling-steps N use this number of doubling steps for GCSA2 construction (default " << gcsa::ConstructionParameters::DOUBLING_STEPS << ")" << endl @@ -67,67 +75,19 @@ void help_index(char** argv) { << " -V, --verify-index validate the GCSA2 index using the input kmers (important for testing)" << endl << "gam indexing options:" << endl << " -l, --index-sorted-gam input is sorted .gam format alignments, store a GAI index of the sorted GAM in INPUT.gam.gai" << endl - << "rocksdb options:" << endl - << " -d, --db-name store the RocksDB index in " << endl - << " -m, --store-mappings input is .gam format, store the mappings in alignments by node" << endl - << " -a, --store-alignments input is .gam format, store the alignments by node" << endl - << " -A, --dump-alignments graph contains alignments, output them in sorted order" << endl - << " -N, --node-alignments input is (ideally, sorted) .gam format," << endl - << " cross reference nodes by alignment traversals" << endl - << " -D, --dump print the contents of the db to stdout" << endl - << " -C, --compact compact the index into a single level (improves performance)" << endl + << "vg in-place indexing options:" << endl + << " --index-sorted-vg input is ID-sorted .vg format graph chunks, store a VGI index of the sorted vg in INPUT.vg.vgi" << endl << "snarl distance index options" << endl - << " -c --dist-graph FILE generate snarl distane index from VG in FILE" << endl - << " -s --snarl-name FILE load snarls from FILE" << endl - << " -j --dist-name FILE use this file to store a snarl-based distance index" << endl; + << " -j --dist-name FILE use this file to store a snarl-based distance index" << endl + << " --snarl-limit N don't store snarl distances for snarls with more than N nodes (default 10000)" << endl; } -// Convert gbwt::node_type to ThreadMapping. -xg::XG::ThreadMapping gbwt_to_thread_mapping(gbwt::node_type node) { - xg::XG::ThreadMapping thread_mapping = { (int64_t)(gbwt::Node::id(node)), gbwt::Node::is_reverse(node) }; - return thread_mapping; +void multiple_thread_sources() { + std::cerr << "error: [vg index] cannot generate threads from multiple sources (VCF, GAM, GAF, paths)" << std::endl; + std::cerr << "error: [vg index] GBWT indexes can be built separately and merged with vg gbwt" << std::endl; + std::exit(EXIT_FAILURE); } -// Convert Path to a GBWT path. 
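Editor's note, not part of the patch: the helpers being removed here translate between vg node ids and GBWT node identifiers via `gbwt::Node::encode()` / `id()` / `is_reverse()`, which pack the orientation into the lowest bit. A standalone model of that packing, assuming the usual `id * 2 + orientation` layout:

```cpp
// Orientation packed into the low bit of a GBWT node identifier (illustrative model).
#include <cassert>
#include <cstdint>

uint64_t encode_node(uint64_t node_id, bool is_reverse) { return node_id * 2 + (is_reverse ? 1 : 0); }
uint64_t decoded_id(uint64_t gbwt_node)   { return gbwt_node / 2; }
bool     decoded_reverse(uint64_t gbwt_node) { return gbwt_node & 1; }

int main() {
    uint64_t n = encode_node(42, true);
    assert(decoded_id(n) == 42 && decoded_reverse(n));
    return 0;
}
```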
-gbwt::vector_type path_to_gbwt(const Path& path) { - gbwt::vector_type result(path.mapping_size()); - for (size_t i = 0; i < result.size(); i++) { - result[i] = gbwt::Node::encode(path.mapping(i).position().node_id(), path.mapping(i).position().is_reverse()); - } - return result; -} - -// Find all predecessor nodes of the path, ignoring self-loops. -gbwt::vector_type predecessors(const xg::XG& xg_index, const Path& path) { - gbwt::vector_type result; - if (path.mapping_size() == 0) { - return result; - } - - vg::id_t first_node = path.mapping(0).position().node_id(); - bool is_reverse = path.mapping(0).position().is_reverse(); - auto pred_edges = (is_reverse ? xg_index.edges_on_end(first_node) : xg_index.edges_on_start(first_node)); - for (auto& edge : pred_edges) { - if (edge.from() == edge.to()) { - continue; // Self-loop. - } - if (edge.from() == first_node) { // Reverse the edge if it is from this node. - result.push_back(gbwt::Node::encode(edge.to(), !(edge.to_end()))); - } else { - result.push_back(gbwt::Node::encode(edge.from(), edge.from_start())); - } - } - - return result; -} - -std::vector parseGenotypes(const std::string& vcf_line, size_t num_samples); - -// Thread database files written by vg index -G and read by vg index -x. -// These should probably be in thread_database.cpp or something like that. -void write_thread_db(const std::string& filename, const std::vector& thread_names, size_t haplotype_count); -void read_thread_db(const std::vector& filenames, std::vector& thread_names, size_t& haplotype_count); - int main_index(int argc, char** argv) { if (argc == 2) { @@ -135,32 +95,29 @@ int main_index(int argc, char** argv) { return 1; } + #define OPT_BUILD_VGI_INDEX 1000 + #define OPT_RENAME_VARIANTS 1001 + #define OPT_DISTANCE_SNARL_LIMIT 1002 + // Which indexes to build. - bool build_xg = false, build_gbwt = false, write_threads = false, build_gpbwt = false, build_gcsa = false, build_rocksdb = false, build_dist = false; + bool build_xg = false, build_gbwt = false, build_gcsa = false, build_dist = false; // Files we should read. - string vcf_name, mapping_name, dist_graph; - vector thread_db_names; + string vcf_name, mapping_name; vector dbg_names; // Files we should write. - string xg_name, gbwt_name, parse_name, threads_name, gcsa_name, rocksdb_name, dist_name, snarl_name; + string xg_name, gbwt_name, gcsa_name, dist_name; + // General bool show_progress = false; // GBWT - bool index_haplotypes = false, index_paths = false, index_gam = false; - bool parse_only = false; - vector gam_file_names; - bool force_phasing = false, discard_overlaps = false; - size_t samples_in_batch = 200; - size_t gbwt_buffer_size = gbwt::DynamicGBWT::INSERT_BATCH_SIZE / gbwt::MILLION; // Millions of nodes. - size_t id_interval = gbwt::DynamicGBWT::SAMPLE_INTERVAL; - std::pair sample_range(0, ~(size_t)0); // The semiopen range of samples to process. - map path_to_vcf; // Path name conversion from --rename. - map> regions; // Region restrictions for contigs, in VCF name space, as 0-based exclusive-end ranges. - unordered_set excluded_samples; // Excluded sample names from --exclude. 
+ HaplotypeIndexer haplotype_indexer; + enum thread_source_type { thread_source_none, thread_source_vcf, thread_source_paths, thread_source_gam, thread_source_gaf }; + thread_source_type thread_source = thread_source_none; + vector aln_file_names; // GCSA gcsa::size_type kmer_size = gcsa::Key::MAX_LENGTH; @@ -168,17 +125,16 @@ int main_index(int argc, char** argv) { bool verify_gcsa = false; // Gam index (GAI) - bool build_gam_index = false; + bool build_gai_index = false; + + // VG in-place index (VGI) + bool build_vgi_index = false; - // RocksDB - bool dump_index = false; - bool store_alignments = false; - bool store_node_alignments = false; - bool store_mappings = false; - bool dump_alignments = false; + // Include alt paths in xg + bool xg_alts = false; - // Unused? - bool compact = false; + //Distance index + size_t snarl_limit = 50000; int c; optind = 2; // force optind past command positional argument @@ -193,14 +149,16 @@ int main_index(int argc, char** argv) { // XG {"xg-name", required_argument, 0, 'x'}, {"thread-db", required_argument, 0, 'F'}, + {"xg-alts", no_argument, 0, 'L'}, // GBWT {"vcf-phasing", required_argument, 0, 'v'}, - {"parse-only", required_argument, 0, 'e'}, + {"ignore-missing", no_argument, 0, 'W'}, {"store-threads", no_argument, 0, 'T'}, {"store-gam", required_argument, 0, 'M'}, + {"store-gaf", required_argument, 0, 'F'}, {"gbwt-name", required_argument, 0, 'G'}, - {"write-haps", required_argument, 0, 'H'}, + {"actual-phasing", no_argument, 0, 'z'}, {"force-phasing", no_argument, 0, 'P'}, {"discard-overlaps", no_argument, 0, 'o'}, {"batch-size", required_argument, 0, 'B'}, @@ -208,11 +166,12 @@ int main_index(int argc, char** argv) { {"id-interval", required_argument, 0, 'n'}, {"range", required_argument, 0, 'R'}, {"rename", required_argument, 0, 'r'}, + {"rename-variants", no_argument, 0, OPT_RENAME_VARIANTS}, {"region", required_argument, 0, 'I'}, {"exclude", required_argument, 0, 'E'}, // GCSA - {"gcsa-name", required_argument, 0, 'g'}, + {"gcsa-out", required_argument, 0, 'g'}, {"dbg-in", required_argument, 0, 'i'}, {"mapping", required_argument, 0, 'f'}, {"kmer-size", required_argument, 0, 'k'}, @@ -222,25 +181,18 @@ int main_index(int argc, char** argv) { // GAM index (GAI) {"index-sorted-gam", no_argument, 0, 'l'}, - - // RocksDB - {"db-name", required_argument, 0, 'd'}, - {"store-mappings", no_argument, 0, 'm'}, - {"store-alignments", no_argument, 0, 'a'}, - {"dump-alignments", no_argument, 0, 'A'}, - {"node-alignments", no_argument, 0, 'N'}, - {"dump", no_argument, 0, 'D'}, - {"compact", no_argument, 0, 'C'}, + + // VG in-place index (VGI) + {"index-sorted-vg", no_argument, 0, OPT_BUILD_VGI_INDEX}, //Snarl distance index - {"dist-graph", required_argument, 0, 'c'}, - {"snarl-name", required_argument, 0, 's'}, + {"snarl-limit", required_argument, 0, OPT_DISTANCE_SNARL_LIMIT}, {"dist-name", required_argument, 0, 'j'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "b:t:px:F:v:e:TM:G:H:PoB:u:n:R:r:I:E:g:i:f:k:X:Z:Vld:maANDCc:s:j:h", + c = getopt_long (argc, argv, "b:t:px:Lv:WTM:F:G:zPoB:u:n:R:r:I:E:g:i:f:k:X:Z:Vlj:h", long_options, &option_index); // Detect the end of the options. 
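Editor's note, not part of the patch: the option cases below all funnel into a single `thread_source` value, and `multiple_thread_sources()` aborts as soon as two different sources are requested; only the alignment inputs (-M/-F) may be repeated, once per input file. A simplified standalone model of that guard (the helper `set_source` is hypothetical):

```cpp
// Mutually exclusive thread sources: VCF, embedded paths, GAM, or GAF.
#include <cstdlib>
#include <iostream>

enum thread_source_type { none_src, vcf_src, paths_src, gam_src, gaf_src };

void set_source(thread_source_type& current, thread_source_type wanted, bool repeatable) {
    // -M/-F may be given several times (one file each); other sources may not repeat or mix.
    if (current != none_src && !(repeatable && current == wanted)) {
        std::cerr << "error: cannot generate threads from multiple sources" << std::endl;
        std::exit(EXIT_FAILURE);
    }
    current = wanted;
}

int main() {
    thread_source_type source = none_src;
    set_source(source, gam_src, true);   // -M aln1.gam
    set_source(source, gam_src, true);   // -M aln2.gam: fine
    set_source(source, vcf_src, false);  // adding -v afterwards: exits with an error
}
```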
@@ -258,6 +210,7 @@ int main_index(int argc, char** argv) { break; case 'p': show_progress = true; + haplotype_indexer.show_progress = true; break; // XG @@ -265,51 +218,64 @@ int main_index(int argc, char** argv) { build_xg = true; xg_name = optarg; break; - case 'F': - thread_db_names.push_back(optarg); + case 'L': + xg_alts = true; break; // GBWT case 'v': - index_haplotypes = true; - build_xg = true; + if (thread_source != thread_source_none) { + multiple_thread_sources(); + } + thread_source = thread_source_vcf; vcf_name = optarg; break; - case 'e': - parse_only = true; - parse_name = optarg; + case 'W': + haplotype_indexer.warn_on_missing_variants = false; break; case 'T': - index_paths = true; - build_xg = true; + if (thread_source != thread_source_none) { + multiple_thread_sources(); + } + thread_source = thread_source_paths; break; case 'M': - index_gam = true; + if (thread_source != thread_source_none && thread_source != thread_source_gam) { + multiple_thread_sources(); + } + thread_source = thread_source_gam; build_gbwt = true; - gam_file_names.push_back(optarg); + aln_file_names.push_back(optarg); + break; + case 'F': + if (thread_source != thread_source_none && thread_source != thread_source_gaf) { + multiple_thread_sources(); + } + thread_source = thread_source_gaf; + build_gbwt = true; + aln_file_names.push_back(optarg); break; case 'G': build_gbwt = true; gbwt_name = optarg; break; - case 'H': - write_threads = true; - threads_name = optarg; + case 'z': + haplotype_indexer.phase_homozygous = false; break; case 'P': - force_phasing = true; + haplotype_indexer.force_phasing = true; break; case 'o': - discard_overlaps = true; + haplotype_indexer.discard_overlaps = true; break; case 'B': - samples_in_batch = std::max(parse(optarg), 1ul); + haplotype_indexer.samples_in_batch = std::max(parse(optarg), 1ul); break; case 'u': - gbwt_buffer_size = std::max(parse(optarg), 1ul); + haplotype_indexer.gbwt_buffer_size = std::max(parse(optarg), 1ul); break; case 'n': - id_interval = parse(optarg); + haplotype_indexer.id_interval = parse(optarg); break; case 'R': { @@ -320,8 +286,8 @@ int main_index(int argc, char** argv) { cerr << "error: [vg index] could not parse range " << temp << endl; exit(1); } - sample_range.first = parse(temp.substr(0, found)); - sample_range.second = parse(temp.substr(found + 2)) + 1; + haplotype_indexer.sample_range.first = parse(temp.substr(0, found)); + haplotype_indexer.sample_range.second = parse(temp.substr(found + 2)) + 1; } break; case 'r': @@ -337,9 +303,12 @@ int main_index(int argc, char** argv) { string vcf_contig = key_value.substr(0, found); string graph_contig = key_value.substr(found + 1); // Add the name mapping - path_to_vcf[graph_contig] = vcf_contig; + haplotype_indexer.path_to_vcf[graph_contig] = vcf_contig; } break; + case OPT_RENAME_VARIANTS: + haplotype_indexer.rename_variants = true; + break; case 'I': { // We want to parse this region specifier @@ -353,11 +322,11 @@ int main_index(int argc, char** argv) { } // Make sure to correct the coordinates to 0-based exclusive-end, from 1-based inclusive-end - regions[parsed.seq] = make_pair((size_t) (parsed.start - 1), (size_t) parsed.end); + haplotype_indexer.regions[parsed.seq] = make_pair((size_t) (parsed.start - 1), (size_t) parsed.end); } break; case 'E': - excluded_samples.insert(optarg); + haplotype_indexer.excluded_samples.insert(optarg); break; // GCSA @@ -366,6 +335,7 @@ int main_index(int argc, char** argv) { gcsa_name = optarg; break; case 'i': + cerr << "warning: -i option is 
deprecated" << endl; dbg_names.push_back(optarg); break; case 'f': @@ -386,47 +356,22 @@ int main_index(int argc, char** argv) { // Gam index (GAI) case 'l': - build_gam_index = true; - break; - - // RocksDB - case 'd': - build_rocksdb = true; - rocksdb_name = optarg; - break; - case 'm': - store_mappings = true; - break; - case 'a': - store_alignments = true; - break; - case 'A': - dump_alignments = true; + build_gai_index = true; break; - case 'N': - store_node_alignments = true; - break; - case 'D': - dump_index = true; - break; - - case 'C': - compact = true; + + // VGI index + case OPT_BUILD_VGI_INDEX: + build_vgi_index = true; break; //Snarl distance index - case 'c': - build_dist = true; - dist_graph = optarg; - break; - case 's': - build_dist = true; - snarl_name = optarg; - break; case 'j': build_dist = true; dist_name = optarg; break; + case OPT_DISTANCE_SNARL_LIMIT: + snarl_limit = parse(optarg); + break; case 'h': case '?': @@ -444,18 +389,35 @@ int main_index(int argc, char** argv) { file_names.push_back(file_name); } - if (xg_name.empty() && gbwt_name.empty() && parse_name.empty() && threads_name.empty() && gcsa_name.empty() && rocksdb_name.empty() && !build_gam_index && dist_graph.empty() ) { + + if (xg_name.empty() && gbwt_name.empty() && + gcsa_name.empty() && !build_gai_index && !build_vgi_index && dist_name.empty()) { cerr << "error: [vg index] index type not specified" << endl; return 1; } - if ((build_gbwt || write_threads) && !(index_haplotypes || index_paths || index_gam)) { + if (build_gbwt && thread_source == thread_source_none) { cerr << "error: [vg index] cannot build GBWT without threads" << endl; return 1; } - if (parse_only && (index_paths || index_gam)) { - cerr << "error: [vg index] --parse-only does not work with --store-threads or --store-gam" << endl; + if (thread_source != thread_source_none && !build_gbwt) { + cerr << "error: [vg index] no GBWT output specified for the threads" << endl; + return 1; + } + + if (thread_source == thread_source_gam || thread_source == thread_source_gaf) { + for (const auto& name : aln_file_names) { + if (name == "-") { + cerr << "error: [vg index] GAM (-M) and GAF (-F) input files cannot be read from stdin (-)" << endl; + return 1; + } + } + } + + if (thread_source != thread_source_none && file_names.size() != 1) { + cerr << "error: [vg index] exactly one graph required for generating threads" << std::endl; + cerr << "error: [vg index] you may combine the graphs with vg index -x combined.xg --xg-alts" << std::endl; return 1; } @@ -464,466 +426,173 @@ int main_index(int argc, char** argv) { //return 1; } - if (file_names.size() != 1 && build_gam_index) { + if (file_names.size() != 1 && build_gai_index) { cerr << "error: [vg index] can only index exactly one sorted GAM file at a time" << endl; return 1; } + if (file_names.size() != 1 && build_vgi_index) { + cerr << "error: [vg index] can only index exactly one sorted VG file at a time" << endl; + return 1; + } + + if (file_names.size() > 1 && build_dist) { + // Allow zero filenames for the index-from-xg mode + cerr << "error: [vg index] can only create one distance index at a time" << endl; + return 1; + } + if (build_gcsa && kmer_size > gcsa::Key::MAX_LENGTH) { cerr << "error: [vg index] GCSA2 cannot index with kmer size greater than " << gcsa::Key::MAX_LENGTH << endl; return 1; } - - if ((build_gbwt || write_threads) && thread_db_names.size() > 1) { - cerr << "error: [vg index] cannot use multiple thread database files with -G or -H" << endl; - return 1; + + if (build_xg && 
build_gcsa && file_names.empty()) { + // Really we want to build a GCSA by *reading* and XG + build_xg = false; + // We'll continue in the build_gcsa section + std::cerr << "warning: [vg index] providing input XG with option -x is deprecated" << std::endl; + } + if (build_dist && file_names.empty()) { + //If we want to build the distance index from the xg + build_xg = false; + std::cerr << "warning: [vg index] providing input XG with option -x is deprecated" << std::endl; } - // Build XG - xg::XG* xg_index = new xg::XG(); - map alt_paths; + + // Build XG. Include alt paths in the XG if requested with -L. if (build_xg) { if (file_names.empty()) { // VGset or something segfaults when we feed it no graphs. cerr << "error: [vg index] at least one graph is required to build an xg index" << endl; return 1; } + if (show_progress) { + cerr << "Building XG index" << endl; + } + xg::XG xg_index; VGset graphs(file_names); - build_gpbwt = !build_gbwt & !write_threads & !parse_only; - graphs.to_xg(*xg_index, index_paths & build_gpbwt, Paths::is_alt, index_haplotypes ? &alt_paths : nullptr); + graphs.to_xg(xg_index, (xg_alts ? [](const string&) {return false;} : Paths::is_alt), nullptr); if (show_progress) { - cerr << "Built base XG index" << endl; + cerr << "Saving XG index to " << xg_name << endl; } + // Save the XG. + vg::io::save_handle_graph(&xg_index, xg_name); } // Generate threads - if (index_haplotypes || index_paths || index_gam) { - - if (!build_gbwt && !(parse_only && index_haplotypes) && !write_threads && !build_gpbwt) { - cerr << "error: [vg index] No output format specified for the threads" << endl; - return 1; + if (thread_source != thread_source_none) { + + // Load the only input graph. + unique_ptr path_handle_graph; + path_handle_graph = vg::io::VPKG::load_one(file_names[0]); + + std::unique_ptr gbwt_index(nullptr); + if (thread_source == thread_source_vcf) { + std::vector parse_files = haplotype_indexer.parse_vcf(vcf_name, *path_handle_graph); + path_handle_graph.reset(); // Save memory by deleting the graph. + gbwt_index = haplotype_indexer.build_gbwt(parse_files); + } else if (thread_source == thread_source_paths) { + gbwt_index = haplotype_indexer.build_gbwt(*path_handle_graph); + } else if (thread_source == thread_source_gam) { + gbwt_index = haplotype_indexer.build_gbwt(*path_handle_graph, aln_file_names, "GAM"); + } else if (thread_source == thread_source_gaf) { + gbwt_index = haplotype_indexer.build_gbwt(*path_handle_graph, aln_file_names, "GAF"); + } + if (build_gbwt && gbwt_index.get() != nullptr) { + save_gbwt(*gbwt_index, gbwt_name, show_progress); } + } // End of thread indexing. - // Use the same temp directory as VG. - gbwt::TempFile::setDirectory(temp_file::get_dir()); - - // if we already made the xg index we can determine the - size_t id_width; - if (!index_gam) { - id_width = gbwt::bit_length(gbwt::Node::encode(xg_index->get_max_id(), true)); - } else { // indexing a GAM - if (show_progress) { - cerr << "Finding maximum node id in GAM..." 
<< endl; - } - vg::id_t max_id = 0; - function lambda = [&](Alignment& aln) { - gbwt::vector_type buffer; - for (auto& m : aln.path().mapping()) { - max_id = max(m.position().node_id(), max_id); - } - }; - for (auto& file_name : gam_file_names) { - get_input_file(file_name, [&](istream& in) { - stream::for_each_parallel(in, lambda); - }); - } - id_width = gbwt::bit_length(gbwt::Node::encode(max_id, true)); - } - - if (show_progress) { - cerr << "Node id width: " << id_width << endl; - } - - vector thread_names; // Store thread names in insertion order. - vector all_phase_threads; // Store all threads if building gPBWT. - size_t haplotype_count = 0; - - // Do we build GBWT? - gbwt::GBWTBuilder* gbwt_builder = 0; - if (build_gbwt) { - if (show_progress) { - cerr << "GBWT parameters: buffer size " << gbwt_buffer_size << ", id interval " << id_interval << endl; - } - gbwt::Verbosity::set(gbwt::Verbosity::SILENT); // Make the construction thread silent. - gbwt_builder = new gbwt::GBWTBuilder(id_width, gbwt_buffer_size * gbwt::MILLION, id_interval); - } - - // Do we write threads? - gbwt::text_buffer_type binary_file; - if (write_threads) { - if (show_progress) { cerr << "Writing the threads to " << threads_name << endl; } - binary_file = gbwt::text_buffer_type(threads_name, std::ios::out, gbwt::MEGABYTE, id_width); - } - - // Store a thread and its name. - auto store_thread = [&](const gbwt::vector_type& to_save, const std::string& thread_name) { - if (build_gbwt) { - gbwt_builder->insert(to_save, true); // Insert in both orientations. - } - if (write_threads) { - for (auto node : to_save) { binary_file.push_back(node); } - binary_file.push_back(gbwt::ENDMARKER); - } - if (build_gpbwt) { - xg::XG::thread_t temp; - temp.reserve(to_save.size()); - for (auto node : to_save) { temp.push_back(gbwt_to_thread_mapping(node)); } - all_phase_threads.push_back(temp); - } - thread_names.push_back(thread_name); - }; - - // Convert paths to threads - if (index_paths & !build_gpbwt) { - if (show_progress) { - cerr << "Converting paths to threads..." << endl; - } - for (size_t path_rank = 1; path_rank <= xg_index->max_path_rank(); path_rank++) { - const xg::XGPath& path = xg_index->get_path(xg_index->path_name(path_rank)); - if (path.ids.size() == 0) { - continue; - } - gbwt::vector_type buffer(path.ids.size()); - for (size_t i = 0; i < path.ids.size(); i++) { - buffer[i] = gbwt::Node::encode(path.node(i), path.is_reverse(i)); - } - store_thread(buffer, xg_index->path_name(path_rank)); - } - haplotype_count++; // We assume that the XG index contains the reference paths. - } + // Build GCSA + if (build_gcsa) { - if (index_gam) { - if (show_progress) { - cerr << "Converting GAM to threads..." << endl; - } - function lambda = [&](Alignment& aln) { - gbwt::vector_type buffer; - for (auto& m : aln.path().mapping()) { - buffer.push_back(gbwt::Node::encode(m.position().node_id(), m.position().is_reverse())); - } - store_thread(buffer, aln.name()); - haplotype_count++; - }; - for (auto& file_name : gam_file_names) { - get_input_file(file_name, [&](istream& in) { - stream::for_each_parallel(in, lambda); - }); - } + // Configure GCSA2 verbosity so it doesn't spit out loads of extra info + if (!show_progress) { + gcsa::Verbosity::set(gcsa::Verbosity::SILENT); } - // Generate haplotypes - if (index_haplotypes) { - vcflib::VariantCallFile variant_file; - variant_file.parseSamples = false; // vcflib parsing is very slow if there are many samples. 
- variant_file.open(vcf_name); - if (!variant_file.is_open()) { - cerr << "error: [vg index] could not open " << vcf_name << endl; - return 1; - } else if (show_progress) { - cerr << "Opened variant file " << vcf_name << endl; - } - std::mt19937 rng(0xDEADBEEF); - std::uniform_int_distribution random_bit(0, 1); - - // How many samples are there? - size_t num_samples = variant_file.sampleNames.size(); - if (num_samples == 0) { - cerr << "error: [vg index] The variant file does not contain phasings" << endl; - return 1; - } - - // Remember the sample names - const vector& sample_names = variant_file.sampleNames; + double start = gcsa::readTimer(); - // Determine the range of samples. - sample_range.second = std::min(sample_range.second, num_samples); - haplotype_count += 2 * (sample_range.second - sample_range.first); // Assuming a diploid genome + // Generate temporary kmer files + bool delete_kmer_files = false; + if (dbg_names.empty()) { if (show_progress) { - cerr << "Haplotype generation parameters:" << endl; - cerr << "- Samples " << sample_range.first << " to " << (sample_range.second - 1) << endl; - cerr << "- Batch size " << samples_in_batch << endl; - if (force_phasing) { - cerr << "- Force phasing" << endl; - } - if (discard_overlaps) { - cerr << "- Discard overlaps" << endl; - } + cerr << "Generating kmer files..." << endl; } - - // Process each VCF contig corresponding to an XG path. - size_t max_path_rank = xg_index->max_path_rank(); - for (size_t path_rank = 1; path_rank <= max_path_rank; path_rank++) { - string path_name = xg_index->path_name(path_rank); - string vcf_contig_name = path_to_vcf.count(path_name) ? path_to_vcf[path_name] : path_name; + + if (!file_names.empty()) { + // Get the kmers from a VGset. + VGset graphs(file_names); + size_t kmer_bytes = params.getLimitBytes(); + dbg_names = graphs.write_gcsa_kmers_binary(kmer_size, kmer_bytes); + params.reduceLimit(kmer_bytes); + delete_kmer_files = true; + } else if (!xg_name.empty()) { + // Get the kmers from an XG or other single graph + + // Load the graph + auto single_graph = vg::io::VPKG::load_one(xg_name); + + auto make_kmers_for_component = [&](const HandleGraph* g) { + // Make an overlay on it to add source and sink nodes + // TODO: Don't use this directly; unify this code with VGset's code. + SourceSinkOverlay overlay(g, kmer_size); + + // Get the size limit + size_t kmer_bytes = params.getLimitBytes(); + + // Write the kmer temp file + dbg_names.push_back(write_gcsa_kmers_to_tmpfile(overlay, kmer_size, kmer_bytes, + overlay.get_id(overlay.get_source_handle()), + overlay.get_id(overlay.get_sink_handle()))); + + // Feed back into the size limit + params.reduceLimit(kmer_bytes); + delete_kmer_files = true; + }; + if (show_progress) { - cerr << "Processing path " << path_name << " as VCF contig " << vcf_contig_name << endl; - } - string parse_file = parse_name + '_' + vcf_contig_name; - - // Structures to parse the VCF file into. - const xg::XGPath& path = xg_index->get_path(path_name); - gbwt::VariantPaths variants(path.ids.size()); - std::vector phasings; - - // Add the reference to VariantPaths. - for (size_t i = 0; i < path.ids.size(); i++) { - variants.appendToReference(gbwt::Node::encode(path.node(i), path.is_reverse(i))); - } - variants.indexReference(); - - // Create a PhasingInformation for each batch. - for (size_t batch_start = sample_range.first; batch_start < sample_range.second; batch_start += samples_in_batch) { - if (parse_only) { - // Use a permanent file. 
- phasings.emplace_back(parse_file, batch_start, std::min(samples_in_batch, sample_range.second - batch_start)); - variants.addFile(phasings.back().name(), phasings.back().offset(), phasings.back().size()); - } else { - // Use a temporary file. - phasings.emplace_back(batch_start, std::min(samples_in_batch, sample_range.second - batch_start)); - } + cerr << "Finding connected components..." << endl; } - - // Set the VCF region or process the entire contig. - if (regions.count(vcf_contig_name)) { - auto region = regions[vcf_contig_name]; + + // Get all the components in the graph, which we can process separately to save memory. + std::vector> components = handlealgs::weakly_connected_components(single_graph.get()); + + if (components.size() == 1) { + // Only one component if (show_progress) { - cerr << "- Setting region " << region.first << " to " << region.second << endl; + cerr << "Processing single component graph..." << endl; } - variant_file.setRegion(vcf_contig_name, region.first, region.second); + make_kmers_for_component(single_graph.get()); } else { - variant_file.setRegion(vcf_contig_name); - } - - // Parse the variants and the phasings. - vcflib::Variant var(variant_file); - size_t variants_processed = 0; - std::vector was_diploid(sample_range.second, true); // Was the sample diploid at the previous site? - while (variant_file.is_open() && variant_file.getNextVariant(var) && var.sequenceName == vcf_contig_name) { - // Skip variants with non-DNA sequence, as they are not included in the graph. - bool isDNA = allATGC(var.ref); - for (vector::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { - if (!allATGC(*a)) isDNA = false; - } - if (!isDNA) { - continue; - } - - // Determine the reference nodes for the current variant and create a variant site. - // If the variant is not an insertion, there should be a path for the ref allele. - - std::string var_name = make_variant_id(var); - std::string ref_path_name = "_alt_" + var_name + "_0"; - auto ref_path_iter = alt_paths.find(ref_path_name); - gbwt::vector_type ref_path; - size_t ref_pos = variants.invalid_position(); - if (ref_path_iter != alt_paths.end() && ref_path_iter->second.mapping_size() != 0) { - ref_path = path_to_gbwt(ref_path_iter->second); - ref_pos = variants.firstOccurrence(ref_path.front()); - if (ref_pos == variants.invalid_position()) { - cerr << "warning: [vg index] Invalid ref path for " << var_name << " at " - << var.sequenceName << ":" << var.position << endl; - continue; - } - } else { // Try using alt paths instead. - bool found = false; - for (size_t alt_index = 1; alt_index < var.alleles.size(); alt_index++) { - std::string alt_path_name = "_alt_" + var_name + "_" + to_string(alt_index); - size_t candidate_pos = 0; - bool candidate_found = false; - auto alt_path_iter = alt_paths.find(alt_path_name); - if (alt_path_iter != alt_paths.end()) { - gbwt::vector_type pred_nodes = predecessors(*xg_index, alt_path_iter->second); - for (auto node : pred_nodes) { - size_t pred_pos = variants.firstOccurrence(node); - if (pred_pos != variants.invalid_position()) { - candidate_pos = std::max(candidate_pos, pred_pos + 1); - candidate_found = true; - found = true; - } - } - // For each alternate allele, find the rightmost reference node among - // its predecessors. If multiple alleles have candidates for the - // reference position, choose the leftmost one. 
- if (candidate_found) { - ref_pos = std::min(ref_pos, candidate_pos); - } - } - } - if (!found) { - cerr << "warning: [vg index] Alt and ref paths for " << var_name - << " at " << var.sequenceName << ":" << var.position - << " missing/empty! Was the variant skipped during construction?" << endl; - continue; - } - } - variants.addSite(ref_pos, ref_pos + ref_path.size()); - - // Add alternate alleles to the site. - for (size_t alt_index = 1; alt_index < var.alleles.size(); alt_index++) { - std::string alt_path_name = "_alt_" + var_name + "_" + to_string(alt_index); - auto alt_path_iter = alt_paths.find(alt_path_name); - if (alt_path_iter != alt_paths.end()) { - variants.addAllele(path_to_gbwt(alt_path_iter->second)); - } else { - variants.addAllele(ref_path); + for (size_t i = 0; i < components.size(); i++) { + // Run separately on each component. + // Don't run in parallel or size limit tracking won't work. + + if (show_progress) { + cerr << "Selecting component " << i << "/" << components.size() << "..." << endl; } - } - - // Store the phasings in PhasingInformation structures. - std::vector genotypes = parseGenotypes(var.originalLine, num_samples); - for (size_t batch = 0; batch < phasings.size(); batch++) { - std::vector current_phasings; - for (size_t sample = phasings[batch].offset(); sample < phasings[batch].limit(); sample++) { - string& sample_name = variant_file.sampleNames[sample]; - current_phasings.emplace_back(genotypes[sample], was_diploid[sample]); - was_diploid[sample] = current_phasings.back().diploid; - if(force_phasing) { - current_phasings.back().forcePhased([&]() { - return random_bit(rng); - }); - } + + bdsg::PackedSubgraphOverlay component_graph(single_graph.get()); + for (auto& id : components[i]) { + // Add each node to the subgraph. + // TODO: use a handle-returning component + // finder so we don't need to get_handle here. + component_graph.add_node(single_graph->get_handle(id, false)); } - phasings[batch].append(current_phasings); - } - variants_processed++; - } // End of variants. - if (show_progress) { - cerr << "- Parsed " << variants_processed << " variants" << endl; - size_t phasing_bytes = 0; - for (size_t batch = 0; batch < phasings.size(); batch++) { - phasing_bytes += phasings[batch].bytes(); - } - cerr << "- Phasing information: " << gbwt::inMegabytes(phasing_bytes) << " MB" << endl; - } - - // Save memory: - // - Delete the alt paths if we no longer need them. - // - Delete the XG index if we no longer need it. - // - Close the phasings files. - if (path_rank == max_path_rank) { - alt_paths.clear(); - if (xg_name.empty()) { - delete xg_index; - xg_index = nullptr; - } - } - for (size_t batch = 0; batch < phasings.size(); batch++) { - phasings[batch].close(); - } - - // Save the VCF parse or generate the haplotypes. 
- if (parse_only) { - sdsl::store_to_file(variants, parse_file); - } else { - for (size_t batch = 0; batch < phasings.size(); batch++) { - gbwt::generateHaplotypes(variants, phasings[batch], - [&](gbwt::size_type sample) -> bool { - return (excluded_samples.find(sample_names[sample]) == excluded_samples.end()); - }, - [&](const gbwt::Haplotype& haplotype) { - stringstream sn; - sn << "_thread_" << sample_names[haplotype.sample] - << "_" << path_name - << "_" << haplotype.phase - << "_" << haplotype.count; - store_thread(haplotype.path, sn.str()); - }, - [&](gbwt::size_type, gbwt::size_type) -> bool { - return discard_overlaps; - }); + if (show_progress) { - cerr << "- Processed samples " << phasings[batch].offset() << " to " << (phasings[batch].offset() + phasings[batch].size() - 1) << endl; + cerr << "Processing component " << i << "/" << components.size() << "..." << endl; } + + make_kmers_for_component(&component_graph); } - } // End of haplotype generation for the current contig. - } // End of contigs. - } // End of haplotypes. - - // Store the thread database. Write it to disk if a filename is given, - // or store it in the XG index if building gPBWT or if the XG index - // will be written to disk. - alt_paths.clear(); - if (!parse_only) { - if (build_gbwt) { - gbwt_builder->finish(); - if (show_progress) { cerr << "Saving GBWT to disk..." << endl; } - sdsl::store_to_file(gbwt_builder->index, gbwt_name); - delete gbwt_builder; gbwt_builder = nullptr; - } - if (write_threads) { - binary_file.close(); - } - if (build_gbwt || write_threads) { - if (!thread_db_names.empty()) { - write_thread_db(thread_db_names.front(), thread_names, haplotype_count); - } else if (!xg_name.empty()) { - if (show_progress) { - cerr << "Storing " << thread_names.size() << " thread names from " - << haplotype_count << " haplotypes in the XG index..." << endl; - } - xg_index->set_thread_names(thread_names); - xg_index->set_haplotype_count(haplotype_count); - } - } - if (build_gpbwt) { - if (show_progress) { - cerr << "Inserting all phase threads into DAG..." << endl; } - xg_index->insert_threads_into_dag(all_phase_threads, thread_names); - xg_index->set_haplotype_count(haplotype_count); - } - } - } // End of thread indexing. - - // Save XG - if (!xg_name.empty()) { - if (!thread_db_names.empty()) { - vector thread_names; - size_t haplotype_count = 0; - read_thread_db(thread_db_names, thread_names, haplotype_count); - if (show_progress) { - cerr << thread_names.size() << " threads for " - << haplotype_count << " haplotypes in " - << thread_db_names.size() << " file(s)" << endl; - } - xg_index->set_thread_names(thread_names); - xg_index->set_haplotype_count(haplotype_count); - } - - if (show_progress) { - cerr << "Saving XG index to disk..." << endl; - } - ofstream db_out(xg_name); - xg_index->serialize(db_out); - db_out.close(); - } - delete xg_index; xg_index = nullptr; - - // Build GCSA - if (build_gcsa) { - - // Configure GCSA2 verbosity so it doesn't spit out loads of extra info - if (!show_progress) { - gcsa::Verbosity::set(gcsa::Verbosity::SILENT); - } - - // Use the same temp directory as VG. - gcsa::TempFile::setDirectory(temp_file::get_dir()); - - double start = gcsa::readTimer(); - - // Generate temporary kmer files - bool delete_kmer_files = false; - if (dbg_names.empty()) { - if (show_progress) { - cerr << "Generating kmer files..." 
<< endl; + } else { + cerr << "error: [vg index] cannot generate GCSA index without either a vg or an xg" << endl; + exit(1); } - VGset graphs(file_names); - graphs.show_progress = show_progress; - size_t kmer_bytes = params.getLimitBytes(); - dbg_names = graphs.write_gcsa_kmers_binary(kmer_size, kmer_bytes); - params.reduceLimit(kmer_bytes); - delete_kmer_files = true; } // Build the index @@ -942,11 +611,8 @@ int main_index(int argc, char** argv) { } // Save the indexes - if (show_progress) { - cerr << "Saving the index to disk..." << endl; - } - sdsl::store_to_file(gcsa_index, gcsa_name); - sdsl::store_to_file(lcp_array, gcsa_name + ".lcp"); + save_gcsa(gcsa_index, gcsa_name, show_progress); + save_lcp(lcp_array, gcsa_name + ".lcp", show_progress); // Verify the index if (verify_gcsa) { @@ -966,16 +632,15 @@ int main_index(int argc, char** argv) { } } - if (build_gam_index) { + if (build_gai_index) { // Index a sorted GAM file. - GAMIndex index; get_input_file(file_names.at(0), [&](istream& in) { // Grab the input GAM stream and wrap it in a cursor - stream::ProtobufIterator cursor(in); + vg::io::ProtobufIterator cursor(in); // Index the file - GAMIndex index; + StreamIndex index; index.index(cursor); // Save the GAM index in the appropriate place. @@ -987,215 +652,98 @@ int main_index(int argc, char** argv) { } index.save(index_out); }); - } - - if (build_rocksdb) { - - Index index; - - if (compact) { - index.open_for_write(rocksdb_name); - index.compact(); - index.flush(); - index.close(); - } - - if (store_node_alignments && file_names.size() > 0) { - index.open_for_bulk_load(rocksdb_name); - int64_t aln_idx = 0; - function lambda = [&index,&aln_idx](Alignment& aln) { - index.cross_alignment(aln_idx++, aln); - }; - for (auto& file_name : file_names) { - get_input_file(file_name, [&](istream& in) { - stream::for_each_parallel(in, lambda); - }); - } - index.flush(); - index.close(); - } - - if (store_alignments && file_names.size() > 0) { - index.open_for_bulk_load(rocksdb_name); - function lambda = [&index](Alignment& aln) { - index.put_alignment(aln); - }; - for (auto& file_name : file_names) { - get_input_file(file_name, [&](istream& in) { - stream::for_each_parallel(in, lambda); - }); - } - index.flush(); - index.close(); - } - - if (dump_alignments) { - vector output_buf; - index.open_read_only(rocksdb_name); - auto lambda = [&output_buf](const Alignment& aln) { - output_buf.push_back(aln); - stream::write_buffered(cout, output_buf, 100); - }; - index.for_each_alignment(lambda); - stream::write_buffered(cout, output_buf, 0); - index.close(); - } - - if (store_mappings && file_names.size() > 0) { - index.open_for_bulk_load(rocksdb_name); - function lambda = [&index](Alignment& aln) { - const Path& path = aln.path(); - for (int i = 0; i < path.mapping_size(); ++i) { - index.put_mapping(path.mapping(i)); - } - }; - for (auto& file_name : file_names) { - get_input_file(file_name, [&](istream& in) { - stream::for_each_parallel(in, lambda); - }); + + if (build_vgi_index) { + // Index an ID-sorted VG file. + get_input_file(file_names.at(0), [&](istream& in) { + // Grab the input VG stream and wrap it in a cursor + vg::io::ProtobufIterator cursor(in); + + // Index the file + StreamIndex index; + index.index(cursor); + + // Save the index in the appropriate place. + // TODO: Do we really like this enforced naming convention just because samtools does it?
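/*
 * Illustrative sketch (not part of the patch): the .gai/.vgi indexing added above
 * follows one pattern: wrap the sorted Protobuf stream in a ProtobufIterator cursor,
 * feed it to a StreamIndex, and save the result next to the data file. The template
 * arguments are not visible in the flattened diff, so the <Alignment> parameters and
 * the ".gai" suffix below (mirroring the ".vgi" convention shown above) are assumptions.
 */
get_input_file(file_names.at(0), [&](istream& in) {
    // Cursor over the ID-sorted GAM records.
    vg::io::ProtobufIterator<Alignment> cursor(in);
    // Build the index in memory, then write it alongside the input file.
    StreamIndex<Alignment> index;
    index.index(cursor);
    ofstream index_out(file_names.at(0) + ".gai");
    index.save(index_out);
});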
+ ofstream index_out(file_names.at(0) + ".vgi"); + if (!index_out.good()) { + cerr << "error: [vg index] could not open " << file_names.at(0) << ".vgi" << endl; + exit(1); } - index.flush(); - index.close(); - } - - if (dump_index) { - index.open_read_only(rocksdb_name); - index.dump(cout); - index.close(); - } - + index.save(index_out); + }); + } - //Build snarl distance index + //Build a snarl-based minimum distance index if (build_dist) { - if (dist_graph.empty()) { - cerr << "error: [vg index] distance index requires a vg file" << endl; + if (file_names.empty() && xg_name.empty()) { + cerr << "error: [vg index] one graph is required to build a distance index" << endl; return 1; + } else if (file_names.size() > 1 || (file_names.size() == 1 && !xg_name.empty())) { + cerr << "error: [vg index] only one graph at a time can be used to build a distance index" << endl; } else if (dist_name.empty()) { cerr << "error: [vg index] distance index requires an output file" << endl; return 1; - } else if (snarl_name.empty()) { - cerr << "error: [vg index] distance index requires a snarl file" << endl; - return 1; - - } else { - ifstream vg_stream(dist_graph); - if (!vg_stream) { - cerr << "error: [vg index] cannot open VG file" << endl; - exit(1); - } - VG vg(vg_stream); - vg_stream.close(); - - ifstream snarl_stream(snarl_name); - if (!snarl_stream) { - cerr << "error: [vg index] cannot open Snarls file" << endl; - exit(1); - } - SnarlManager* snarl_manager = new SnarlManager(snarl_stream); - snarl_stream.close(); - - int64_t cap = 20; //TODO: Take this as an argument or something - DistanceIndex di (&vg, snarl_manager, cap); + } else { + //Get graph and build dist index - - ofstream dist_out(dist_name); - di.serialize(dist_out); - dist_out.close(); + if (file_names.empty() && !xg_name.empty()) { + // We were given a -x specifically to read as XG + + auto xg = vg::io::VPKG::load_one(xg_name); + + IntegratedSnarlFinder snarl_finder(*xg.get()); + // Create the SnarlDistanceIndex + SnarlDistanceIndex distance_index; + + //Fill it in + fill_in_distance_index(&distance_index, xg.get(), &snarl_finder, snarl_limit); + // Save it + distance_index.serialize(dist_name); + } else { + // May be GBZ or a HandleGraph. + auto options = vg::io::VPKG::try_load_first(file_names.at(0)); + + if (get<0>(options)) { + // We have a GBZ graph + auto& gbz = get<0>(options); + + // Create the SnarlDistanceIndex + IntegratedSnarlFinder snarl_finder(gbz->graph); + + //Make a distance index and fill it in + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &(gbz->graph), &snarl_finder, snarl_limit); + // Save it + distance_index.serialize(dist_name); + } else if (get<1>(options)) { + // We were given a graph generically + auto& graph = get<1>(options); + + // Create the SnarlDistanceIndex + IntegratedSnarlFinder snarl_finder(*graph.get()); + + //Make a distance index and fill it in + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, graph.get(), &snarl_finder, snarl_limit); + // Save it + distance_index.serialize(dist_name); + } else { + cerr << "error: [vg index] input is not a graph or GBZ" << endl; + return 1; + } + } } } - if (show_progress) { cerr << "Memory usage: " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; } return 0; } -std::vector parseGenotypes(const std::string& vcf_line, size_t num_samples) { - std::vector result; - - // The 9th tab-separated field should start with "GT". 
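/*
 * Illustrative sketch (not part of the patch): the new distance-index branch above
 * reduces to the same three steps for any graph input: find snarls with an
 * IntegratedSnarlFinder, fill a SnarlDistanceIndex from them, and serialize it.
 * `graph_name` is a hypothetical input path and the load_one template argument is
 * assumed, since the flattened diff does not show it; the other names come from
 * the surrounding code.
 */
auto graph = vg::io::VPKG::load_one<PathHandleGraph>(graph_name);
// Decompose the graph into snarls.
IntegratedSnarlFinder snarl_finder(*graph);
// Build the minimum distance index and write it to disk.
SnarlDistanceIndex distance_index;
fill_in_distance_index(&distance_index, graph.get(), &snarl_finder, snarl_limit);
distance_index.serialize(dist_name);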
- size_t offset = 0; - for (int i = 0; i < 8; i++) { - size_t pos = vcf_line.find('\t', offset); - if (pos == std::string::npos) { - std::cerr << "error: [vg index] VCF line does not contain genotype information" << std::endl; - std::exit(EXIT_FAILURE); - } - offset = pos + 1; - } - if (vcf_line.substr(offset, 2) != "GT") { - std::cerr << "error: [vg index] VCF line does not contain genotype information" << std::endl; - std::exit(EXIT_FAILURE); - } - - // Genotype strings are the first colon-separated fields in the 10th+ tab-separated fields. - offset = vcf_line.find('\t', offset); - while (offset != std::string::npos && offset + 1 < vcf_line.length()) { - offset++; - size_t pos = vcf_line.find_first_of("\t:", offset); - if (pos == std::string::npos) { - pos = vcf_line.length(); - } - result.emplace_back(vcf_line.substr(offset, pos - offset)); - offset = vcf_line.find('\t', offset); - } - - if (result.size() != num_samples) { - std::cerr << "error: [vg index] expected " << num_samples << " samples, got " << result.size() << std::endl; - std::exit(EXIT_FAILURE); - } - - return result; -} - -void write_thread_db(const std::string& filename, const std::vector& thread_names, size_t haplotype_count) { - std::ofstream out(filename, std::ios_base::binary); - if (!out) { - std::cerr << "error: [vg index] cannot write thread database to " << filename << std::endl; - } - - out.write(reinterpret_cast(&haplotype_count), sizeof(haplotype_count)); - size_t thread_count = thread_names.size(); - out.write(reinterpret_cast(&thread_count), sizeof(thread_count)); - for (const std::string& name : thread_names) { - size_t name_length = name.length(); - out.write(reinterpret_cast(&name_length), sizeof(name_length)); - out.write(name.data(), name_length); - } - out.close(); -} - -void read_thread_db(const std::vector& filenames, std::vector& thread_names, size_t& haplotype_count) { - thread_names.clear(); - haplotype_count = 0; - - for (const std::string& filename : filenames) { - std::ifstream in(filename, std::ios_base::binary); - if (!in) { - std::cerr << "error: [vg index] cannot read thread database from " << filename << std::endl; - std::exit(EXIT_FAILURE); - } - - size_t new_haplotype_count = 0; - in.read(reinterpret_cast(&new_haplotype_count), sizeof(new_haplotype_count)); - haplotype_count = std::max(haplotype_count, new_haplotype_count); - size_t threads_remaining = 0; - in.read(reinterpret_cast(&threads_remaining), sizeof(threads_remaining)); - while (threads_remaining > 0) { - size_t name_length = 0; - in.read(reinterpret_cast(&name_length), sizeof(name_length)); - std::vector buffer(name_length); - in.read(buffer.data(), name_length); - thread_names.emplace_back(buffer.begin(), buffer.end()); - threads_remaining--; - } - in.close(); - } -} - // Register subcommand -static Subcommand vg_construct("index", "index graphs or alignments for random access or mapping", PIPELINE, 2, main_index); +static Subcommand vg_construct("index", "index graphs or alignments for random access or mapping", PIPELINE, 4, main_index); diff --git a/src/subcommand/inject_main.cpp b/src/subcommand/inject_main.cpp index f9736b01a71..a9ed8cbcb8f 100644 --- a/src/subcommand/inject_main.cpp +++ b/src/subcommand/inject_main.cpp @@ -10,9 +10,13 @@ #include +#include "../utility.hpp" #include "../alignment.hpp" #include "../vg.hpp" -#include "../stream.hpp" +#include "../xg.hpp" +#include +#include +#include using namespace std; using namespace vg; @@ -22,7 +26,7 @@ void help_inject(char** argv) { cerr << "usage: " << argv[0] << " 
inject [options] input.[bam|sam|cram] >output.gam" << endl << endl << "options:" << endl - << " -x, --xg-name FILE use the graph in this xg index" << endl + << " -x, --xg-name FILE use this graph or xg index (required, non-XG formats also accepted)" << endl << " -t, --threads N number of threads to use" << endl; } @@ -33,7 +37,7 @@ int main_inject(int argc, char** argv) { } string xg_name; - int threads = 1; + int threads = get_thread_count(); int c; optind = 2; @@ -62,7 +66,6 @@ int main_inject(int argc, char** argv) { case 't': threads = parse(optarg); - omp_set_num_threads(threads); break; case 'h': @@ -75,28 +78,25 @@ int main_inject(int argc, char** argv) { abort (); } } + + omp_set_num_threads(threads); string file_name = get_input_file_name(optind, argc, argv); - xg::XG* xgidx = nullptr; - ifstream xg_stream(xg_name); - if(xg_stream) { - xgidx = new xg::XG(xg_stream); - } - if (!xg_stream || xgidx == nullptr) { - cerr << "[vg inject] error: could not open xg index" << endl; - return 1; + // We require an XG index + if (xg_name.empty()) { + cerr << "error[vg inject]: XG index (-x) is required" << endl; + exit(1); } + unique_ptr path_handle_graph = vg::io::VPKG::load_one(xg_name); + bdsg::PathPositionOverlayHelper overlay_helper; + PathPositionHandleGraph* xgidx = overlay_helper.apply(path_handle_graph.get()); - vector buf; + vg::io::ProtobufEmitter buf(cout); function lambda = [&buf](Alignment& aln) { #pragma omp critical (buf) { - buf.push_back(aln); - if (buf.size() > 1000) { - write_alignments(cout, buf); - buf.clear(); - } + buf.write(std::move(aln)); } }; if (threads > 1) { @@ -104,11 +104,6 @@ int main_inject(int argc, char** argv) { } else { hts_for_each(file_name, lambda, xgidx); } - write_alignments(cout, buf); - buf.clear(); - // Finish the stream with an EOF marker - stream::finish(cout); - cout.flush(); return 0; } diff --git a/src/subcommand/join_main.cpp b/src/subcommand/join_main.cpp index 39d2a87ccfe..23921d24a22 100644 --- a/src/subcommand/join_main.cpp +++ b/src/subcommand/join_main.cpp @@ -88,5 +88,5 @@ int main_join(int argc, char** argv) { } // Register subcommand -static Subcommand vg_join("join", "combine graphs via a new head", main_join); +static Subcommand vg_join("join", "combine graphs via a new head", DEPRECATED, main_join); diff --git a/src/subcommand/kmers_main.cpp b/src/subcommand/kmers_main.cpp index c693d8e2b7b..ebb31597082 100644 --- a/src/subcommand/kmers_main.cpp +++ b/src/subcommand/kmers_main.cpp @@ -9,6 +9,7 @@ #include #include +#include #include "subcommand.hpp" @@ -22,25 +23,20 @@ using namespace vg::subcommand; void help_kmers(char** argv) { cerr << "usage: " << argv[0] << " kmers [options] [graph2.vg ...] >kmers.tsv" << endl - << "Generates kmers of the graph(s). Output is: kmer id pos" << endl + << "Generates kmers from both strands of the graph(s). 
Output is: kmer id pos" << endl << endl - << "options:" << endl + << "general options:" << endl << " -k, --kmer-size N print kmers of size N in the graph" << endl - << " -e, --edge-max N only consider paths which make edge choices at <= this many points" << endl - << " -j, --kmer-stride N step distance between succesive kmers in paths (default 1)" << endl << " -t, --threads N number of threads to use" << endl - << " -d, --ignore-dups filter out duplicated kmers in normal output" << endl - << " -n, --allow-negs don't filter out relative negative positions of kmers in normal output" << endl + << " -p, --progress show progress" << endl + << "gcsa options:" << endl << " -g, --gcsa-out output a table suitable for input to GCSA2:" << endl << " kmer, starting position, previous characters," << endl << " successive characters, successive positions." << endl - << " Forward and reverse strand kmers are reported." << endl - << " -B, --gcsa-binary Write the GCSA graph in binary format." << endl - << " -F, --forward-only When producing GCSA2 output, don't describe the reverse strand" << endl - << " -P, --path-only Only consider kmers if they occur in a path embedded in the graph" << endl + << " -B, --gcsa-binary write the GCSA graph in binary format (implies -g)" << endl << " -H, --head-id N use the specified ID for the GCSA2 head sentinel node" << endl << " -T, --tail-id N use the specified ID for the GCSA2 tail sentinel node" << endl - << " -p, --progress show progress" << endl; + << "" << endl; } int main_kmers(int argc, char** argv) { @@ -50,20 +46,15 @@ int main_kmers(int argc, char** argv) { return 1; } - int kmer_size = 0; - bool path_only = false; - int edge_max = 0; - int kmer_stride = 1; + // General options. + size_t kmer_size = 0; bool show_progress = false; + + // GCSA options. Head and tail for distributed kmer generation. bool gcsa_out = false; - bool allow_dups = true; - bool allow_negs = false; - // for distributed GCSA2 kmer generation + bool gcsa_binary = false; int64_t head_id = 0; int64_t tail_id = 0; - bool forward_only = false; - bool gcsa_binary = false; - bool handle_alg = false; int c; optind = 2; // force optind past command positional argument @@ -71,25 +62,27 @@ int main_kmers(int argc, char** argv) { static struct option long_options[] = { - {"help", no_argument, 0, 'h'}, + // General options. {"kmer-size", required_argument, 0, 'k'}, - {"kmer-stride", required_argument, 0, 'j'}, - {"edge-max", required_argument, 0, 'e'}, {"threads", required_argument, 0, 't'}, - {"gcsa-out", no_argument, 0, 'g'}, - {"ignore-dups", no_argument, 0, 'd'}, - {"allow-negs", no_argument, 0, 'n'}, {"progress", no_argument, 0, 'p'}, + + // GCSA options. + {"gcsa-out", no_argument, 0, 'g'}, + {"gcsa-binary", no_argument, 0, 'B'}, {"head-id", required_argument, 0, 'H'}, {"tail-id", required_argument, 0, 'T'}, + + // Obsolete options. + {"edge-max", required_argument, 0, 'e'}, {"forward-only", no_argument, 0, 'F'}, - {"gcsa-binary", no_argument, 0, 'B'}, - {"path-only", no_argument, 0, 'P'}, + + {"help", no_argument, 0, 'h'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hk:j:pt:e:gdnH:T:FBP", + c = getopt_long (argc, argv, "k:t:pgBH:T:e:Fh", long_options, &option_index); // Detect the end of the options. @@ -98,58 +91,40 @@ int main_kmers(int argc, char** argv) { switch (c) { - + // General options. 
case 'k': - kmer_size = parse(optarg); - break; - - case 'j': - kmer_stride = parse(optarg); - break; - - case 'e': - edge_max = parse(optarg); + kmer_size = parse(optarg); break; - case 't': omp_set_num_threads(parse(optarg)); break; + case 'p': + show_progress = true; + break; + // GCSA options. case 'g': gcsa_out = true; break; - - case 'F': - forward_only = true; - break; - - - case 'P': - path_only = true; - break; - - case 'd': - allow_dups = false; - break; - - case 'n': - allow_negs = true; - break; - - case 'p': - show_progress = true; + case 'B': + gcsa_out = true; + gcsa_binary = true; break; - case 'H': head_id = parse(optarg); break; - case 'T': tail_id = parse(optarg); break; - case 'B': - gcsa_binary = true; + // Obsolete options. + case 'e': + cerr << "error: [vg kmers] Option --edge-max is obsolete. Use vg prune to prune the graph instead." << endl; + std::exit(EXIT_FAILURE); + break; + case 'F': + cerr << "error: [vg kmers] Option --forward-only is obsolete" << endl; + std::exit(EXIT_FAILURE); break; case 'h': @@ -163,6 +138,11 @@ int main_kmers(int argc, char** argv) { } } + if (kmer_size == 0) { + cerr << "error: [vg kmers] --kmer-size was not specified" << endl; + std::exit(EXIT_FAILURE); + } + vector graph_file_names; while (optind < argc) { string file_name = get_input_file_name(optind, argc, argv); @@ -174,14 +154,6 @@ int main_kmers(int argc, char** argv) { graphs.show_progress = show_progress; if (gcsa_out) { - if (edge_max != 0) { - // I have been passing this option to vg index -g for months - // thinking it worked. But it can't work. So we should tell the user - // they're wrong. - cerr << "error:[vg kmers] Cannot limit edge crossing (-e) when generating GCSA kmers (-g)." - << " Use vg mod -p to prune the graph instead." 
<< endl; - exit(1); - } if (!gcsa_binary) { graphs.write_gcsa_kmers_ascii(cout, kmer_size, head_id, tail_id); } else { @@ -189,7 +161,6 @@ int main_kmers(int argc, char** argv) { graphs.write_gcsa_kmers_binary(cout, kmer_size, limit, head_id, tail_id); } } else { - //function auto lambda = [](const kmer_t& kmer) { #pragma omp critical (cout) cout << kmer << endl; @@ -202,5 +173,5 @@ int main_kmers(int argc, char** argv) { } // Register subcommand -static Subcommand vg_kmers("kmers", "enumerate kmers of the graph", main_kmers); +static Subcommand vg_kmers("kmers", "enumerate kmers of the graph", DEPRECATED, main_kmers); diff --git a/src/subcommand/locify_main.cpp b/src/subcommand/locify_main.cpp deleted file mode 100644 index b70de90932c..00000000000 --- a/src/subcommand/locify_main.cpp +++ /dev/null @@ -1,360 +0,0 @@ -/** \file locify_main.cpp - * - * Defines the "vg locify" subcommand - */ - - -#include -#include -#include - -#include - -#include "subcommand.hpp" - -#include "../vg.hpp" -#include "../index.hpp" -#include "../convert.hpp" -#include "../stream.hpp" - -using namespace std; -using namespace vg; -using namespace vg::subcommand; - -void help_locify(char** argv){ - cerr << "usage: " << argv[0] << " locify [options] " << endl - << " -l, --loci FILE input loci over which to locify the alignments" << endl - << " -a, --aln-idx DIR use this rocksdb alignment index (from vg index -N)" << endl - << " -x, --xg-idx FILE use this xg index" << endl - << " -n, --name-alleles generate names for each allele rather than using full Paths" << endl - << " -f, --forwardize flip alignments on the reverse strand to the forward" << endl - << " -s, --sorted-loci FILE write the non-nested loci out in their sorted order" << endl - << " -b, --n-best N keep only the N-best alleles by alignment support" << endl - << " -o, --out-loci FILE rewrite the loci with only N-best alleles kept" << endl; - // TODO -- add some basic filters that are useful downstream in whatshap -} - -int main_locify(int argc, char** argv){ - string gam_idx_name; - string loci_file; - Index gam_idx; - string xg_idx_name; - bool name_alleles = false; - bool forwardize = false; - string loci_out, sorted_loci; - int n_best = 0; - - if (argc <= 2){ - help_locify(argv); - exit(1); - } - - int c; - optind = 2; // force optind past command positional argument - while (true) { - static struct option long_options[] = - { - {"help", no_argument, 0, 'h'}, - {"gam-idx", required_argument, 0, 'g'}, - {"loci", required_argument, 0, 'l'}, - {"xg-idx", required_argument, 0, 'x'}, - {"name-alleles", no_argument, 0, 'n'}, - {"forwardize", no_argument, 0, 'f'}, - {"sorted-loci", required_argument, 0, 's'}, - {"loci-out", required_argument, 0, 'o'}, - {"n-best", required_argument, 0, 'b'}, - {0, 0, 0, 0} - }; - - int option_index = 0; - c = getopt_long (argc, argv, "hl:x:g:nfo:b:s:", - long_options, &option_index); - - // Detect the end of the options. 
- if (c == -1) - break; - - switch (c) - { - case 'g': - gam_idx_name = optarg; - break; - - case 'l': - loci_file = optarg; - break; - - case 'x': - xg_idx_name = optarg; - break; - - case 'n': - name_alleles = true; - break; - - case 'f': - forwardize = true; - break; - - case 'o': - loci_out = optarg; - break; - - case 's': - sorted_loci = optarg; - break; - - case 'b': - n_best = parse(optarg); - name_alleles = true; - break; - - case 'h': - case '?': - help_locify(argv); - exit(1); - break; - - default: - abort (); - } - } - - if (!gam_idx_name.empty()) { - gam_idx.open_read_only(gam_idx_name); - } - - if (xg_idx_name.empty()) { - cerr << "[vg locify] Error: no xg index provided" << endl; - return 1; - } - ifstream xgstream(xg_idx_name); - xg::XG xgidx(xgstream); - - std::function(string, char)> strsplit = [&](string x, char delim){ - - vector ret; - stringstream ss; - std::string tok; - while (getline(ss, tok, delim)){ - ret.push_back(tok); - } - return ret; - - }; - - vector locus_names; - map > locus_allele_names; - map alignments_with_loci; - map > pos_to_loci; - map > locus_to_pos; - map > locus_allele_support; - map > locus_to_best_n_alleles; - map > locus_to_keep; - int count = 0; - - std::function lambda = [&](Locus& l){ - locus_names.push_back(l.name()); - set nodes_in_locus; - for (int i = 0; i < l.allele_size(); ++i) { - auto& allele = l.allele(i); - for (int j = 0; j < allele.mapping_size(); ++j) { - auto& position = allele.mapping(j).position(); - nodes_in_locus.insert(position.node_id()); - } - // for position in mapping - map ref_positions; - map edits; - decompose(allele, ref_positions, edits); - // warning: uses only reference positions!!! - for (auto& pos : ref_positions) { - pos_to_loci[pos.first].insert(l.name()); - locus_to_pos[l.name()].insert(pos.first); - } - } - // void for_alignment_in_range(int64_t id1, int64_t id2, std::function lambda); - std::function fill_alns = [&](const Alignment& a){ - // TODO reverse complementing alleles ? 
- // overlap is stranded - //matching - // find the most-matching allele - map > matches; - for (int i = 0; i < l.allele_size(); ++i) { - auto& allele = l.allele(i); - matches[overlap(a.path(), allele)].push_back(i); - } - assert(l.allele_size()); - int best = matches.rbegin()->second.front(); - Locus matching; - matching.set_name(l.name()); - if (name_alleles) { - //map > locus_allele_names; - auto& allele = l.allele(best); - string s; - allele.SerializeToString(&s); - auto& l_names = locus_allele_names[l.name()]; - auto f = l_names.find(s); - int name_int = 0; - if (f == l_names.end()) { - int next_id = l_names.size() + 1; - l_names[s] = next_id; - name_int = next_id; - } else { - name_int = f->second; - } - string allele_name = vg::convert(name_int); - Path p; - p.set_name(allele_name); - *matching.add_allele() = p; - if (n_best) { - // record support for this allele - // we'll use to filter the locus records later - locus_allele_support[l.name()][name_int]++; - } - } else { - *matching.add_allele() = l.allele(best); - // TODO get quality score relative to this specific allele / alignment - // record in the alignment we'll save - } - if (alignments_with_loci.find(a.name()) == alignments_with_loci.end()) { - alignments_with_loci[a.name()] = a; - } - Alignment& aln = alignments_with_loci[a.name()]; - *aln.add_locus() = matching; - }; - vector nodes_vec; - for (auto& id : nodes_in_locus) nodes_vec.push_back(id); - gam_idx.for_alignment_to_nodes(nodes_vec, fill_alns); - }; - - if (!loci_file.empty()){ - ifstream ifi(loci_file); - stream::for_each(ifi, lambda); - } else { - cerr << "[vg locify] Warning: empty locus file given, could not annotate alignments with loci." << endl; - } - - // find the non-nested loci - vector non_nested_loci; - for (auto& name : locus_names) { - // is it nested? - auto& positions = locus_to_pos[name]; - int min_loci = 0; - for (auto& pos : positions) { - auto& loci = pos_to_loci[pos]; - min_loci = (min_loci == 0 ? 
(int)loci.size() : min(min_loci, (int)loci.size())); - } - if (min_loci == 1) { - // not fully contained in any other locus - non_nested_loci.push_back(name); - } - } - - // filter out the non-best alleles - if (n_best) { - // find the n-best - for (auto& supp : locus_allele_support) { - auto& name = supp.first; - auto& alleles = supp.second; - map ranked; - for (auto& allele : alleles) { - ranked[allele.second] = allele.first; - } - auto& to_keep = locus_to_keep[name]; - for (auto r = ranked.rbegin(); r != ranked.rend(); ++r) { - to_keep.insert(r->second); - if (to_keep.size() == n_best) { - break; - } - } - } - // filter out non-n-best from the alignments - for (auto& a : alignments_with_loci) { - auto& aln = a.second; - vector kept; - for (int i = 0; i < aln.locus_size(); ++i) { - auto& allele = aln.locus(i).allele(0); - if (locus_to_keep[aln.locus(i).name()].count(atoi(allele.name().c_str()))) { - kept.push_back(aln.locus(i)); - } - } - aln.clear_locus(); - for (auto& l : kept) { - *aln.add_locus() = l; - } - } - } - - if (n_best && !loci_out.empty()) { - // filter out non-n-best from the loci - if (!loci_file.empty()){ - ofstream outloci(loci_out); - vector buffer; - std::function lambda = [&](Locus& l){ - // remove the alleles which are to filter - //map > locus_allele_names; - auto& allele_names = locus_allele_names[l.name()]; - auto& to_keep = locus_to_keep[l.name()]; - vector alleles_to_keep; - for (int i = 0; i < l.allele_size(); ++i) { - auto allele = l.allele(i); - string s; allele.SerializeToString(&s); - auto& name = allele_names[s]; - if (to_keep.count(name)) { - allele.set_name(vg::convert(name)); - alleles_to_keep.push_back(allele); - } - } - l.clear_allele(); - for (auto& allele : alleles_to_keep) { - *l.add_allele() = allele; - } - buffer.push_back(l); - stream::write_buffered(outloci, buffer, 100); - }; - ifstream ifi(loci_file); - stream::for_each(ifi, lambda); - stream::write_buffered(outloci, buffer, 0); - outloci.close(); - } else { - cerr << "[vg locify] Warning: empty locus file given, could not update loci." << endl; - } - } - - // sort them using... ? ids? 
- sort(non_nested_loci.begin(), non_nested_loci.end(), - [&locus_to_pos](const string& s1, const string& s2) { - return *locus_to_pos[s1].begin() < *locus_to_pos[s2].begin(); - }); - - if (!sorted_loci.empty()) { - ofstream outsorted(sorted_loci); - for (auto& name : non_nested_loci) { - outsorted << name << endl; - } - outsorted.close(); - } - - vector output_buf; - for (auto& aln : alignments_with_loci) { - // TODO order the loci by their order in the alignments - if (forwardize) { - if (aln.second.path().mapping_size() && aln.second.path().mapping(0).position().is_reverse()) { - output_buf.push_back(reverse_complement_alignment(aln.second, - [&xgidx](int64_t id) { return xgidx.node_length(id); })); - } else { - output_buf.push_back(aln.second); - } - } else { - output_buf.push_back(aln.second); - } - stream::write_buffered(cout, output_buf, 100); - } - stream::write_buffered(cout, output_buf, 0); - - return 0; -} - -// Register subcommand -static Subcommand vg_locify("locify", "find loci", main_locify); - diff --git a/src/subcommand/map_main.cpp b/src/subcommand/map_main.cpp index abb8bb90898..ea269644d5f 100644 --- a/src/subcommand/map_main.cpp +++ b/src/subcommand/map_main.cpp @@ -1,12 +1,16 @@ #include "subcommand.hpp" #include "../vg.hpp" +#include "../xg.hpp" #include "../utility.hpp" #include "../mapper.hpp" -#include "../surjector.hpp" -#include "../stream.hpp" +#include "../hts_alignment_emitter.hpp" +#include +#include +#include #include #include +#include using namespace vg; using namespace vg::subcommand; @@ -17,7 +21,7 @@ void help_map(char** argv) { << endl << "graph/index:" << endl << " -d, --base-name BASE use BASE.xg and BASE.gcsa as the input index pair" << endl - << " -x, --xg-name FILE use this xg index (defaults to .vg.xg)" << endl + << " -x, --xg-name FILE use this xg index or graph (defaults to .vg.xg)" << endl << " -g, --gcsa-name FILE use this GCSA2 index (defaults to " << gcsa::GCSA::EXTENSION << ")" << endl << " -1, --gbwt-name FILE use this GBWT haplotype index (defaults to "<= FLOAT [0]" << endl << " -H, --max-target-x N skip cluster subgraphs with length > N*read_length [100]" << endl - << " -m, --acyclic-graph improves runtime when the graph is acyclic" << endl << " -w, --band-width INT band width for long read alignment [256]" << endl << " -O, --band-overlap INT band overlap for long read alignment [{-w}/8]" << endl << " -J, --band-jump INT the maximum number of bands of insertion we consider in the alignment chain model [128]" << endl @@ -56,7 +59,7 @@ void help_map(char** argv) { << "scoring:" << endl << " -q, --match INT use this match score [1]" << endl << " -z, --mismatch INT use this mismatch penalty [4]" << endl - << " --score-matrix FILE read a 5x5 integer substitution scoring matrix from a file" << endl + << " --score-matrix FILE read a 4x4 integer substitution scoring matrix from a file" << endl << " -o, --gap-open INT use this gap open penalty [6]" << endl << " -y, --gap-extend INT use this gap extension penalty [1]" << endl << " -L, --full-l-bonus INT the full-length alignment bonus [5]" << endl @@ -64,6 +67,9 @@ void help_map(char** argv) { << " -a, --hap-exp FLOAT the exponent for haplotype consistency likelihood in alignment score [1]" << endl << " --recombination-penalty FLOAT use this log recombination penalty for GBWT haplotype scoring [20.7]" << endl << " -A, --qual-adjust perform base quality adjusted alignments (requires base quality input)" << endl + << "preset:" << endl + << " -m, --alignment-model STR use a preset alignment scoring 
model, either \"short\" (default) or \"long\" (for ONT/PacBio)" << endl + << " \"long\" is equivalent to `-u 2 -L 63 -q 1 -z 2 -o 2 -y 1 -w 128 -O 32`" << endl << "input:" << endl << " -s, --sequence STR align a string to the graph in graph.vg using partial order alignment" << endl << " -V, --seq-name STR name the sequence using this value (for graph modification with new named paths)" << endl @@ -77,19 +83,25 @@ void help_map(char** argv) { << " -R, --read-group NAME for --reads input, add this read group" << endl << "output:" << endl << " -j, --output-json output JSON rather than an alignment stream (helpful for debugging)" << endl + << " -%, --gaf output alignments in GAF format" << endl << " --surject-to TYPE surject the output into the graph's paths, writing TYPE := bam |sam | cram" << endl + << " --ref-paths FILE ordered list of paths in the graph, one per line or HTSlib .dict, for HTSLib @SQ headers" << endl << " --buffer-size INT buffer this many alignments together before outputting in GAM [512]" << endl << " -X, --compare realign GAM input (-G), writing alignment with \"correct\" field set to overlap with input" << endl << " -v, --refpos-table for efficient testing output a table of name, chr, pos, mq, score" << endl << " -K, --keep-secondary produce alignments for secondary input alignments in addition to primary ones" << endl << " -M, --max-multimaps INT produce up to INT alignments for each read [1]" << endl << " -Q, --mq-max INT cap the mapping quality at INT [60]" << endl - << " -D, --debug print debugging information about alignment to stderr" << endl; + << " --exclude-unaligned exclude reads with no alignment" << endl + << " -D, --debug print debugging information about alignment to stderr" << endl + << " --log-time print runtime to stderr" << endl; } int main_map(int argc, char** argv) { + std::chrono::time_point launch = std::chrono::system_clock::now(); + if (argc == 2) { help_map(argv); return 1; @@ -97,6 +109,8 @@ int main_map(int argc, char** argv) { #define OPT_SCORE_MATRIX 1000 #define OPT_RECOMBINATION_PENALTY 1001 + #define OPT_EXCLUDE_UNALIGNED 1002 + #define OPT_REF_PATHS 1003 string matrix_file_name; string seq; string qual; @@ -112,8 +126,9 @@ int main_map(int argc, char** argv) { int hit_max = 2048; int max_multimaps = 1; int thread_count = 1; - bool output_json = false; - string surject_type; + string output_format = "GAM"; + string ref_paths_name; + bool exclude_unaligned = false; bool debug = false; float min_score = 0; string sample_name; @@ -168,12 +183,12 @@ int main_map(int argc, char** argv) { bool print_fragment_model = false; int fragment_model_update = 10; bool acyclic_graph = false; - bool refpos_table = false; bool patch_alignments = true; int min_banded_mq = 0; int max_sub_mem_recursion_depth = 2; bool xdrop_alignment = false; uint32_t max_gap_length = 40; + bool log_time = false; int c; optind = 2; // force optind past command positional argument @@ -199,6 +214,7 @@ int main_map(int argc, char** argv) { {"output-json", no_argument, 0, 'j'}, {"hts-input", required_argument, 0, 'b'}, {"keep-secondary", no_argument, 0, 'K'}, + {"exclude-unaligned", no_argument, 0, OPT_EXCLUDE_UNALIGNED}, {"fastq", required_argument, 0, 'f'}, {"fasta", required_argument, 0, 'F'}, {"interleaved", no_argument, 0, 'i'}, @@ -229,7 +245,7 @@ int main_map(int argc, char** argv) { {"full-l-bonus", required_argument, 0, 'L'}, {"hap-exp", required_argument, 0, 'a'}, {"recombination-penalty", required_argument, 0, OPT_RECOMBINATION_PENALTY}, - {"acyclic-graph", no_argument, 
0, 'm'}, + {"alignment-model", required_argument, 0, 'm'}, {"mem-chance", required_argument, 0, 'e'}, {"drop-chain", required_argument, 0, 'C'}, {"mq-overlap", required_argument, 0, 'n'}, @@ -243,16 +259,19 @@ int main_map(int argc, char** argv) { {"id-mq-weight", required_argument, 0, '7'}, {"refpos-table", no_argument, 0, 'v'}, {"surject-to", required_argument, 0, '5'}, + {"ref-paths", required_argument, 0, OPT_REF_PATHS}, {"no-patch-aln", no_argument, 0, '8'}, {"drop-full-l-bonus", no_argument, 0, '2'}, {"unpaired-cost", required_argument, 0, 'S'}, {"max-gap-length", required_argument, 0, 1}, {"xdrop-alignment", no_argument, 0, 2}, + {"gaf", no_argument, 0, '%'}, + {"log-time", no_argument, 0, '^'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "s:J:Q:d:x:g:1:T:N:R:c:M:t:G:jb:Kf:iw:P:Dk:Y:r:W:6H:Z:q:z:o:y:Au:B:I:S:l:e:C:V:O:L:a:n:E:X:UpF:m7:v5:824:3:9:0:", + c = getopt_long (argc, argv, "s:J:Q:d:x:g:1:T:N:R:c:M:t:G:jb:Kf:iw:P:Dk:Y:r:W:6H:Z:q:z:o:y:Au:B:I:S:l:e:C:V:O:L:a:n:E:X:UpF:m:7:v5:824:3:9:0:%^", long_options, &option_index); @@ -323,7 +342,16 @@ int main_map(int argc, char** argv) { break; case 'm': - acyclic_graph = true; + if (string(optarg) == "long") { + extra_multimaps = 2; + full_length_bonus = 63; + match = 1; + mismatch = 2; + gap_open = 2; + gap_extend = 1; + band_width = 128; + band_overlap = 32; + } break; case 'T': @@ -346,6 +374,10 @@ int main_map(int argc, char** argv) { keep_secondary = true; break; + case OPT_EXCLUDE_UNALIGNED: + exclude_unaligned = true; + break; + case 'f': if (fastq1.empty()) fastq1 = optarg; else if (fastq2.empty()) fastq2 = optarg; @@ -389,7 +421,11 @@ int main_map(int argc, char** argv) { break; case 'j': - output_json = true; + output_format = "JSON"; + break; + + case '%': + output_format = "GAF"; break; case 'w': @@ -475,15 +511,27 @@ int main_map(int argc, char** argv) { case 'X': compare_gam = true; - output_json = true; + output_format = "JSON"; break; case 'v': - refpos_table = true; + output_format = "TSV"; break; case '5': - surject_type = optarg; + output_format = optarg; + for (auto& c: output_format) { + // Convert to upper case + c = toupper(c); + } + if (output_format != "SAM" && output_format != "BAM" && output_format != "CRAM") { + cerr << "error [vg map] illegal surjection type " << optarg << endl; + return 1; + } + break; + + case OPT_REF_PATHS: + ref_paths_name = optarg; break; case '8': @@ -538,6 +586,10 @@ int main_map(int argc, char** argv) { xdrop_alignment = true; break; + case '^': + log_time = true; + break; + case 'h': case '?': /* getopt_long already printed an error message. */ @@ -552,6 +604,14 @@ int main_map(int argc, char** argv) { } } + // Decide if we are outputting to an htslib format + bool hts_output = (output_format == "SAM" || output_format == "BAM" || output_format == "CRAM"); + + if (!ref_paths_name.empty() && !hts_output) { + cerr << "warning:[vg map] Reference path file (--ref-paths) is only used when output format (--surject-to) is SAM, BAM, or CRAM." << endl; + ref_paths_name = ""; + } + if (seq.empty() && read_file.empty() && hts_file.empty() && fastq1.empty() && gam_input.empty() && fasta_file.empty()) { cerr << "error:[vg map] A sequence or read file is required when mapping." 
<< endl; return 1; @@ -570,7 +630,7 @@ int main_map(int argc, char** argv) { return 1; } // note: still possible that hts file types don't have quality, but have to check the file to know - + MappingQualityMethod mapping_quality_method = Approx; string file_name; @@ -602,33 +662,37 @@ int main_map(int argc, char** argv) { // Configure GCSA2 verbosity so it doesn't spit out loads of extra info gcsa::Verbosity::set(gcsa::Verbosity::SILENT); - - // Configure its temp directory to the system temp directory - gcsa::TempFile::setDirectory(temp_file::get_dir()); // Load up our indexes. - xg::XG* xgidx = nullptr; - gcsa::GCSA* gcsa = nullptr; - gcsa::LCPArray* lcp = nullptr; - gbwt::GBWT* gbwt = nullptr; + PathPositionHandleGraph* xgidx = nullptr; + unique_ptr gcsa; + unique_ptr lcp; + unique_ptr gbwt; + // Used only for memory management: + unique_ptr path_handle_graph; + bdsg::PathPositionVectorizableOverlayHelper overlay_helper; // One of them may be used to provide haplotype scores haplo::ScoreProvider* haplo_score_provider = nullptr; - - // We try opening the file, and then see if it worked - ifstream xg_stream(xg_name); - - if(xg_stream) { + + if(!xg_name.empty()) { // We have an xg index! + + // We try opening the file, and then see if it worked + ifstream xg_stream(xg_name); + if (!xg_stream) { + cerr << "Error[vg map]: Unable to open xg file \"" << xg_name << "\"" << endl; + exit(1); + } + xg_stream.close(); // TODO: tell when the user asked for an XG vs. when we guessed one, // and error when the user asked for one and we can't find it. if(debug) { cerr << "Loading xg index " << xg_name << "..." << endl; } - xgidx = new xg::XG(xg_stream); - - // TODO: Support haplo::XGScoreProvider? + path_handle_graph = vg::io::VPKG::load_one(xg_name); + xgidx = dynamic_cast(overlay_helper.apply(path_handle_graph.get())); } ifstream gcsa_stream(gcsa_name); @@ -637,18 +701,16 @@ int main_map(int argc, char** argv) { if(debug) { cerr << "Loading GCSA2 index " << gcsa_name << "..." << endl; } - gcsa = new gcsa::GCSA(); - gcsa->load(gcsa_stream); + gcsa = vg::io::VPKG::load_one(gcsa_stream); } string lcp_name = gcsa_name + ".lcp"; ifstream lcp_stream(lcp_name); if (lcp_stream) { if(debug) { - cerr << "Loading LCP index " << gcsa_name << "..." << endl; + cerr << "Loading LCP index " << lcp_name << "..." << endl; } - lcp = new gcsa::LCPArray(); - lcp->load(lcp_stream); + lcp = vg::io::VPKG::load_one(lcp_stream); } ifstream gbwt_stream(gbwt_name); @@ -657,8 +719,8 @@ int main_map(int argc, char** argv) { if(debug) { cerr << "Loading GBWT haplotype index " << gbwt_name << "..." << endl; } - gbwt = new gbwt::GBWT(); - gbwt->load(gbwt_stream); + + gbwt = vg::io::VPKG::load_one(gbwt_stream); // We want to use this for haplotype scoring haplo_score_provider = new haplo::GBWTScoreProvider(*gbwt); @@ -673,278 +735,55 @@ int main_map(int argc, char** argv) { } } - thread_count = get_thread_count(); + thread_count = vg::get_thread_count(); + // TODO: We need a Mapper for every thread because the Mapper's fragment + // length distribution isn't yet thread safe. 
vector mapper; mapper.resize(thread_count); - vector > output_buffer; - output_buffer.resize(thread_count); - vector empty_alns; - // If we need to do surjection - Surjector surjector(xgidx); - - // bam/sam/cram output - samFile* sam_out = 0; - int buffer_limit = 100; - bam_hdr_t* hdr = nullptr; - int compress_level = 9; // hard coded - map rg_sample; - string sam_header; - - vector surjectors; - if (!surject_type.empty()) { - surjectors.resize(thread_count); - for (int i = 0; i < surjectors.size(); i++) { - surjectors[i] = new Surjector(xgidx); - } - } - - // if no paths were given take all of those in the index - set path_names; - if (!surject_type.empty() && path_names.empty()) { - for (size_t i = 1; i <= xgidx->path_count; ++i) { - path_names.insert(xgidx->path_name(i)); - } + // When outputting single-ended alignments, we need an empty vector to pass around + vector empty_alns; + + // Look up all the paths we might need to surject to. + vector> paths; + if (hts_output) { + paths = get_sequence_dictionary(ref_paths_name, {}, *xgidx); } + + // Set up output to an emitter that will handle serialization and surjection + unique_ptr alignment_emitter = get_alignment_emitter("-", output_format, paths, thread_count, xgidx); - // for SAM header generation - auto setup_sam_header = [&hdr, &sam_out, &surject_type, &compress_level, &xgidx, &rg_sample, &sam_header] (void) { -#pragma omp critical (hts_header) - if (!hdr) { - char out_mode[5]; - string out_format = ""; - strcpy(out_mode, "w"); - if (surject_type == "bam") { out_format = "b"; } - else if (surject_type == "cram") { out_format = "c"; } - else { out_format = ""; } - strcat(out_mode, out_format.c_str()); - if (compress_level >= 0) { - char tmp[2]; - tmp[0] = compress_level + '0'; tmp[1] = '\0'; - strcat(out_mode, tmp); - } - map path_length; - int num_paths = xgidx->max_path_rank(); - for (int i = 1; i <= num_paths; ++i) { - auto name = xgidx->path_name(i); - path_length[name] = xgidx->path_length(name); - } - hdr = hts_string_header(sam_header, path_length, rg_sample); - if ((sam_out = sam_open("-", out_mode)) == 0) { - cerr << "[vg map] failed to open stdout for writing HTS output" << endl; - exit(1); - } else { - // write the header - if (sam_hdr_write(sam_out, hdr) != 0) { - cerr << "[vg map] error: failed to write the SAM header" << endl; - } - } - } - }; - - // TODO: Refactor the surjection code out of surject_main and intto somewhere where we can just use it here! 
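/*
 * Illustrative sketch (not part of the patch): all of vg map's output formats now go
 * through a single AlignmentEmitter instead of per-format buffers and ad-hoc surjection.
 * The calls below appear in the surrounding diff; the element type of `paths` is not
 * visible there, and `alignments`, `alns1`, `alns2`, and `tlen_limit` stand for whatever
 * the calling thread has produced.
 */
// For SAM/BAM/CRAM output we need an @SQ dictionary built from the graph's paths.
auto paths = get_sequence_dictionary(ref_paths_name, {}, *xgidx);
// "-" means stdout; output_format is e.g. "GAM", "GAF", or "BAM".
auto alignment_emitter = get_alignment_emitter("-", output_format, paths, thread_count, xgidx);
// Worker threads then hand finished alignments to the emitter:
alignment_emitter->emit_mapped_single(std::move(alignments));
alignment_emitter->emit_mapped_pair(std::move(alns1), std::move(alns2), tlen_limit);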
- - auto surject_alignments = [&hdr, &sam_header, &mapper, &rg_sample, &setup_sam_header, &path_names, &sam_out, &xgidx, &surjectors] (const vector& alns1, const vector& alns2) { - - if (alns1.empty()) return; - setup_sam_header(); - vector > surjects1, surjects2; - int tid = omp_get_thread_num(); - for (auto& aln : alns1) { - // Surject each alignment of the first read in the pair - string path_name; - int64_t path_pos = -1; - bool path_reverse = false; - - auto surj = surjectors[omp_get_thread_num()]->path_anchored_surject(aln, path_names, path_name, path_pos, path_reverse); - surjects1.push_back(make_tuple(path_name, path_pos, path_reverse, surj)); - - // hack: if we haven't established the header, we look at the reads to guess which read groups to put in it - if (!hdr && !surj.read_group().empty() && !surj.sample_name().empty()) { -#pragma omp critical (hts_header) - rg_sample[surj.read_group()] = surj.sample_name(); - } - } - - for (auto& aln : alns2) { - // Surject each alignment of the second read in the pair, if any - string path_name; - int64_t path_pos = -1; - bool path_reverse = false; - - auto surj = surjectors[omp_get_thread_num()]->path_anchored_surject(aln, path_names, path_name, path_pos, path_reverse); - surjects2.push_back(make_tuple(path_name, path_pos, path_reverse, surj)); - - // Don't try and populate the header; it should have happened already - } - - if (surjects2.empty()) { - // Write out surjected single-end reads - - for (auto& s : surjects1) { - auto& path_name = get<0>(s); - auto& path_pos = get<1>(s); - auto& path_reverse = get<2>(s); - auto& surj = get<3>(s); - - size_t path_len = 0; - if (path_name != "") { - path_len = xgidx->path_length(path_name); - } - string cigar = cigar_against_path(surj, path_reverse, path_pos, path_len, 0); - bam1_t* b = alignment_to_bam(sam_header, - surj, - path_name, - path_pos, - path_reverse, - cigar); - int r = 0; -#pragma omp critical (cout) - r = sam_write1(sam_out, hdr, b); - if (r == 0) { cerr << "[vg map] error: writing to stdout failed" << endl; exit(1); } - bam_destroy1(b); - } + // We have one function to dump alignments into + auto output_alignments = [&](vector& alns1, vector& alns2) { + if (alns2.empty()) { + // Single-ended read + alignment_emitter->emit_mapped_single(std::move(alns1)); } else { - // Write out surjected paired-end reads - - // Paired-end reads come in corresponding pairs, allowing duplicate reads. - assert(surjects1.size() == surjects2.size()); - - for (size_t i = 0; i < surjects1.size(); i++) { - // For each corresponding pair - auto& s1 = surjects1[i]; - auto& s2 = surjects2[i]; - - // Unpack each read - auto& path_name1 = get<0>(s1); - auto& path_pos1 = get<1>(s1); - auto& path_reverse1 = get<2>(s1); - auto& surj1 = get<3>(s1); - - auto& path_name2 = get<0>(s2); - auto& path_pos2 = get<1>(s2); - auto& path_reverse2 = get<2>(s2); - auto& surj2 = get<3>(s2); - - // Compute CIGARs - size_t path_len1, path_len2; - if (path_name1 != "") { - path_len1 = xgidx->path_length(path_name1); - } - if (path_name2 != "") { - path_len2 = xgidx->path_length(path_name2); - } - string cigar1 = cigar_against_path(surj1, path_reverse1, path_pos1, path_len1, 0); - string cigar2 = cigar_against_path(surj2, path_reverse2, path_pos2, path_len2, 0); + // Paired reads + if (hts_output) { + // We need a tlen limit for flags - // TODO: compute template length based on - // pair distance and alignment content. 
- int template_length = 0; + + // Look up the paired end distribution stats for deciding if reads are properly paired + auto& stats = mapper[omp_get_thread_num()]->frag_stats; + // Put a proper pair bound at 6 std devs. + // If distribution hasn't been computed yet, this comes out 0 and no bound is applied. + int64_t tlen_limit = stats.cached_fragment_length_mean + 6 * stats.cached_fragment_length_stdev; - // Make BAM records - bam1_t* b1 = alignment_to_bam(sam_header, - surj1, - path_name1, - path_pos1, - path_reverse1, - cigar1, - path_name2, - path_pos2, - template_length); - bam1_t* b2 = alignment_to_bam(sam_header, - surj2, - path_name2, - path_pos2, - path_reverse2, - cigar2, - path_name1, - path_pos1, - template_length); - - // Write the records - int r = 0; -#pragma omp critical (cout) - r = sam_write1(sam_out, hdr, b1); - if (r == 0) { cerr << "[vg map] error: writing to stdout failed" << endl; exit(1); } - bam_destroy1(b1); - r = 0; -#pragma omp critical (cout) - r = sam_write1(sam_out, hdr, b2); - if (r == 0) { cerr << "[vg map] error: writing to stdout failed" << endl; exit(1); } - bam_destroy1(b2); - } - - - } - }; - - auto write_json = [](const vector& alns) { - for(auto& alignment : alns) { - string json = pb2json(alignment); - cout << json << "\n"; - } - }; - - auto write_refpos = [](const vector& alns) { - for(auto& alignment : alns) { - Position refpos; - if (alignment.refpos_size()) { - refpos = alignment.refpos(0); - } - cout << alignment.name() << "\t" - << refpos.name() << "\t" - << refpos.offset() << "\t" - << alignment.mapping_quality() << "\t" - << alignment.score() << "\n"; - } - }; - - // We have one function to dump alignments into - // Make sure to flush the buffer at the end of the program! - auto output_alignments = [&output_buffer, - &output_json, - &surject_type, - &surject_alignments, - &buffer_size, - &refpos_table, - &write_json, - &write_refpos](const vector& alns1, const vector& alns2) { - if (output_json) { - // If we want to convert to JSON, convert them all to JSON and dump them to cout.
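/*
 * Illustrative example with made-up numbers (not part of the patch): the proper-pair
 * bound added above is mean + 6 * stdev of the learned fragment length distribution.
 * With a cached mean of 400 bp and a standard deviation of 50 bp, pairs whose template
 * length exceeds 700 bp would not be flagged as properly paired; before the distribution
 * has been estimated both cached values are 0, so no bound is applied.
 */
int64_t tlen_limit = 400 + 6 * 50;   // = 700 bp under the hypothetical stats above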
-#pragma omp critical (cout) - { - write_json(alns1); - write_json(alns2); - } - } else if (refpos_table) { - // keep multi alignments ordered appropriately -#pragma omp critical (cout) - { - write_refpos(alns1); - write_refpos(alns2); + // Send the tlen limit when emitting + alignment_emitter->emit_mapped_pair(std::move(alns1), std::move(alns2), tlen_limit); + } else { + // No need for a tlen limit + alignment_emitter->emit_mapped_pair(std::move(alns1), std::move(alns2)); } - } else if (!surject_type.empty()) { - // surject - surject_alignments(alns1, alns2); - } else { - // Otherwise write them through the buffer for our thread - int tid = omp_get_thread_num(); - auto& output_buf = output_buffer[tid]; - - // Copy all the alignments over to the output buffer - copy(alns1.begin(), alns1.end(), back_inserter(output_buf)); - copy(alns2.begin(), alns2.end(), back_inserter(output_buf)); - - stream::write_buffered(cout, output_buf, buffer_size); } }; for (int i = 0; i < thread_count; ++i) { Mapper* m = nullptr; - if(xgidx && gcsa && lcp) { + if(xgidx && gcsa.get() && lcp.get()) { // We have the xg and GCSA indexes, so use them - m = new Mapper(xgidx, gcsa, lcp, haplo_score_provider); + m = new Mapper(xgidx, gcsa.get(), lcp.get(), haplo_score_provider); } else { // Can't continue with null throw runtime_error("Need XG, GCSA, and LCP to create a Mapper"); @@ -955,6 +794,7 @@ int main_map(int argc, char** argv) { m->band_multimaps = band_multimaps; m->min_banded_mq = min_banded_mq; m->maybe_mq_threshold = maybe_mq_threshold; + m->exclude_unaligned = exclude_unaligned; m->debug = debug; m->min_identity = min_score; m->drop_chain = drop_chain; @@ -971,9 +811,16 @@ int main_map(int argc, char** argv) { m->fast_reseed = use_fast_reseed; m->max_sub_mem_recursion_depth = max_sub_mem_recursion_depth; m->max_target_factor = max_target_factor; - m->set_alignment_scores(match, mismatch, gap_open, gap_extend, full_length_bonus, haplotype_consistency_exponent, max_gap_length); - if(matrix_stream.is_open()) m->load_scoring_matrix(matrix_stream); + if (matrix_stream.is_open()) { + m->set_alignment_scores(matrix_stream, gap_open, gap_extend, full_length_bonus, haplotype_consistency_exponent); + // reset the stream for the next Mapper + matrix_stream.seekg(0); + } + else { + m->set_alignment_scores(match, mismatch, gap_open, gap_extend, full_length_bonus, haplotype_consistency_exponent); + } m->strip_bonuses = strip_bonuses; + m->max_xdrop_gap_length = max_gap_length; m->adjust_alignments_for_base_quality = qual_adjust_alignments; m->extra_multimaps = extra_multimaps; m->mapping_quality_method = mapping_quality_method; @@ -999,6 +846,9 @@ int main_map(int argc, char** argv) { m->patch_alignments = patch_alignments; mapper[i] = m; } + vector reads_mapped_by_thread(thread_count, 0); + + std::chrono::time_point init = std::chrono::system_clock::now(); if (!seq.empty()) { int tid = omp_get_thread_num(); @@ -1009,9 +859,16 @@ int main_map(int argc, char** argv) { if (!qual.empty()) { unaligned.set_quality(qual); } - - vector alignments = mapper[tid]->align_multi(unaligned, kmer_size, kmer_stride, max_mem_length, band_width, band_overlap, xdrop_alignment); - if(alignments.size() == 0) { + + vector alignments = mapper[tid]->align_multi(unaligned, + kmer_size, + kmer_stride, + max_mem_length, + band_width, + band_overlap, + xdrop_alignment); + + if(alignments.size() == 0 && !exclude_unaligned) { // If we didn't have any alignments, report the unaligned alignment alignments.push_back(unaligned); } @@ -1023,8 +880,9 @@ int 
main_map(int argc, char** argv) { if (!seq_name.empty()) alignment.set_name(seq_name); } - // Output the alignments in JSON or protobuf as appropriate. + // Output the alignments in the correct format, possibly surjecting. output_alignments(alignments, empty_alns); + reads_mapped_by_thread[tid] += 1; } if (!read_file.empty()) { @@ -1044,8 +902,14 @@ int main_map(int argc, char** argv) { // Make an alignment Alignment unaligned; unaligned.set_sequence(line); - - vector alignments = mapper[tid]->align_multi(unaligned, kmer_size, kmer_stride, max_mem_length, band_width, band_overlap, xdrop_alignment); + vector alignments = mapper[tid]->align_multi(unaligned, + kmer_size, + kmer_stride, + max_mem_length, + band_width, + band_overlap, + xdrop_alignment); + for(auto& alignment : alignments) { // Set the alignment metadata @@ -1054,9 +918,10 @@ int main_map(int argc, char** argv) { } - // Output the alignments in JSON or protobuf as appropriate. + // Output the alignments in the correct format, possibly surjecting. output_alignments(alignments, empty_alns); } + reads_mapped_by_thread[tid] += 1; } } } @@ -1071,49 +936,60 @@ int main_map(int argc, char** argv) { unaligned.set_sequence(seq); unaligned.set_name(name); int tid = omp_get_thread_num(); - vector alignments = mapper[tid]->align_multi(unaligned, kmer_size, kmer_stride, max_mem_length, band_width, band_overlap, xdrop_alignment); + vector alignments = mapper[tid]->align_multi(unaligned, + kmer_size, + kmer_stride, + max_mem_length, + band_width, + band_overlap, + xdrop_alignment); + for(auto& alignment : alignments) { // Set the alignment metadata if (!sample_name.empty()) alignment.set_sample_name(sample_name); if (!read_group.empty()) alignment.set_read_group(read_group); } - // Output the alignments in JSON or protobuf as appropriate. + // Output the alignments in the correct format, possibly surjecting. output_alignments(alignments, empty_alns); + + reads_mapped_by_thread[tid] += 1; } }; #pragma omp parallel for for (size_t i = 0; i < ref.index->sequenceNames.size(); ++i) { auto& name = ref.index->sequenceNames[i]; - string seq = nonATGCNtoN(toUppercase(ref.getSequence(name))); + string seq = vg::nonATGCNtoN(vg::toUppercase(ref.getSequence(name))); align_seq(name, seq); } } if (!hts_file.empty()) { - function lambda = - [&mapper, - &output_alignments, - &keep_secondary, - &kmer_size, - &kmer_stride, - &max_mem_length, - &band_width, - &band_overlap, - &empty_alns, - &xdrop_alignment] - (Alignment& alignment) { - - if(alignment.is_secondary() && !keep_secondary) { - // Skip over secondary alignments in the input; we don't want several output mappings for each input *mapping*. - return; - } + function lambda = [&](Alignment& alignment) { + if(alignment.is_secondary() && !keep_secondary) { + // Skip over secondary alignments in the input; we don't want several output mappings for each input *mapping*. + return; + } - int tid = omp_get_thread_num(); - vector alignments = mapper[tid]->align_multi(alignment, kmer_size, kmer_stride, max_mem_length, band_width, band_overlap, xdrop_alignment); + int tid = omp_get_thread_num(); + vector alignments = mapper[tid]->align_multi(alignment, + kmer_size, + kmer_stride, + max_mem_length, + band_width, + band_overlap, + xdrop_alignment); + + for(auto& alignment : alignments) { + // Set the alignment metadata + if (!sample_name.empty()) alignment.set_sample_name(sample_name); + if (!read_group.empty()) alignment.set_read_group(read_group); + } - // Output the alignments in JSON or protobuf as appropriate. 
- output_alignments(alignments, empty_alns); - }; + // Output the alignments in JSON or protobuf as appropriate. + output_alignments(alignments, empty_alns); + + reads_mapped_by_thread[tid] += 1; + }; // run hts_for_each_parallel(hts_file, lambda); } @@ -1121,34 +997,26 @@ int main_map(int argc, char** argv) { if (!fastq1.empty()) { if (interleaved_input) { // paired interleaved - auto output_func = [&output_alignments, - &compare_gam, - &print_fragment_model] - (Alignment& aln1, - Alignment& aln2, - pair, vector>& alnp) { + auto output_func = [&](Alignment& aln1, + Alignment& aln2, + pair, vector>& alnp) { + if (!print_fragment_model) { // Output the alignments in JSON or protobuf as appropriate. output_alignments(alnp.first, alnp.second); } }; - function lambda = - [&mapper, - &output_alignments, - &keep_secondary, - &kmer_size, - &kmer_stride, - &max_mem_length, - &band_width, - &band_overlap, - &pair_window, - &top_pairs_only, - &print_fragment_model, - &output_func, - &xdrop_alignment](Alignment& aln1, Alignment& aln2) { + + function lambda = [&](Alignment& aln1, Alignment& aln2) { auto our_mapper = mapper[omp_get_thread_num()]; bool queued_resolve_later = false; - auto alnp = our_mapper->align_paired_multi(aln1, aln2, queued_resolve_later, max_mem_length, top_pairs_only, false, xdrop_alignment); + auto alnp = our_mapper->align_paired_multi(aln1, + aln2, + queued_resolve_later, + max_mem_length, + top_pairs_only, + false, + xdrop_alignment); if (!queued_resolve_later) { output_func(aln1, aln2, alnp); // check if we should try to align the queued alignments @@ -1167,6 +1035,8 @@ int main_map(int argc, char** argv) { our_mapper->imperfect_pairs_to_retry.clear(); } } + + reads_mapped_by_thread[omp_get_thread_num()] += 2; }; fastq_paired_interleaved_for_each_parallel(fastq1, lambda); #pragma omp parallel @@ -1188,54 +1058,41 @@ int main_map(int argc, char** argv) { } } else if (fastq2.empty()) { // single - function lambda = - [&mapper, - &output_alignments, - &kmer_size, - &kmer_stride, - &max_mem_length, - &band_width, - &band_overlap, - &empty_alns, - &xdrop_alignment] - (Alignment& alignment) { - + function lambda = [&](Alignment& alignment) { int tid = omp_get_thread_num(); - vector alignments = mapper[tid]->align_multi(alignment, kmer_size, kmer_stride, max_mem_length, band_width, band_overlap, xdrop_alignment); + vector alignments = mapper[tid]->align_multi(alignment, + kmer_size, + kmer_stride, + max_mem_length, + band_width, + band_overlap, + xdrop_alignment); //cerr << "This is just before output_alignments" << alignment.DebugString() << endl; output_alignments(alignments, empty_alns); + reads_mapped_by_thread[tid] += 1; }; fastq_unpaired_for_each_parallel(fastq1, lambda); } else { // paired two-file - auto output_func = [&output_alignments, - &print_fragment_model] - (Alignment& aln1, - Alignment& aln2, - pair, vector>& alnp) { + auto output_func = [&](Alignment& aln1, + Alignment& aln2, + pair, vector>& alnp) { // Make sure we have unaligned "alignments" for things that don't align. // Output the alignments in JSON or protobuf as appropriate. 
if (!print_fragment_model) { output_alignments(alnp.first, alnp.second); } }; - function lambda = - [&mapper, - &output_alignments, - &keep_secondary, - &kmer_size, - &kmer_stride, - &max_mem_length, - &band_width, - &band_overlap, - &pair_window, - &top_pairs_only, - &print_fragment_model, - &output_func, - &xdrop_alignment](Alignment& aln1, Alignment& aln2) { + function lambda = [&](Alignment& aln1, Alignment& aln2) { auto our_mapper = mapper[omp_get_thread_num()]; bool queued_resolve_later = false; - auto alnp = our_mapper->align_paired_multi(aln1, aln2, queued_resolve_later, max_mem_length, top_pairs_only, false, xdrop_alignment); + auto alnp = our_mapper->align_paired_multi(aln1, + aln2, + queued_resolve_later, + max_mem_length, + top_pairs_only, + false, + xdrop_alignment); if (!queued_resolve_later) { output_func(aln1, aln2, alnp); // check if we should try to align the queued alignments @@ -1254,6 +1111,8 @@ int main_map(int argc, char** argv) { our_mapper->imperfect_pairs_to_retry.clear(); } } + + reads_mapped_by_thread[omp_get_thread_num()] += 2; }; fastq_paired_two_files_for_each_parallel(fastq1, fastq2, lambda); #pragma omp parallel @@ -1271,6 +1130,8 @@ int main_map(int argc, char** argv) { output_func(p.first, p.second, alnp); } our_mapper->imperfect_pairs_to_retry.clear(); + + reads_mapped_by_thread[omp_get_thread_num()] += 2; } } } @@ -1278,12 +1139,10 @@ int main_map(int argc, char** argv) { if (!gam_input.empty()) { ifstream gam_in(gam_input); if (interleaved_input) { - auto output_func = [&output_alignments, - &compare_gam, - &print_fragment_model] - (Alignment& aln1, - Alignment& aln2, - pair, vector>& alnp) { + // Paired-end GAM input + auto output_func = [&] (Alignment& aln1, + Alignment& aln2, + pair, vector>& alnp) { if (print_fragment_model) { // do nothing } else { @@ -1297,24 +1156,16 @@ int main_map(int argc, char** argv) { output_alignments(alnp.first, alnp.second); } }; - function lambda = - [&mapper, - &output_alignments, - &keep_secondary, - &kmer_size, - &kmer_stride, - &max_mem_length, - &band_width, - &band_overlap, - &compare_gam, - &pair_window, - &top_pairs_only, - &print_fragment_model, - &output_func, - &xdrop_alignment](Alignment& aln1, Alignment& aln2) { + function lambda = [&](Alignment& aln1, Alignment& aln2) { auto our_mapper = mapper[omp_get_thread_num()]; bool queued_resolve_later = false; - auto alnp = our_mapper->align_paired_multi(aln1, aln2, queued_resolve_later, max_mem_length, top_pairs_only, false, xdrop_alignment); + auto alnp = our_mapper->align_paired_multi(aln1, + aln2, + queued_resolve_later, + max_mem_length, + top_pairs_only, + false, + xdrop_alignment); if (!queued_resolve_later) { output_func(aln1, aln2, alnp); // check if we should try to align the queued alignments @@ -1333,8 +1184,9 @@ int main_map(int argc, char** argv) { our_mapper->imperfect_pairs_to_retry.clear(); } } + reads_mapped_by_thread[omp_get_thread_num()] += 2; }; - stream::for_each_interleaved_pair_parallel(gam_in, lambda); + vg::io::for_each_interleaved_pair_parallel(gam_in, lambda); #pragma omp parallel { auto our_mapper = mapper[omp_get_thread_num()]; @@ -1352,31 +1204,25 @@ int main_map(int argc, char** argv) { our_mapper->imperfect_pairs_to_retry.clear(); } } else { - function lambda = - [&mapper, - &output_alignments, - &keep_secondary, - &kmer_size, - &kmer_stride, - &max_mem_length, - &band_width, - &band_overlap, - &compare_gam, - &empty_alns, - &xdrop_alignment](Alignment& alignment) { + // Processing single-end GAM input + function lambda = 
[&](Alignment& alignment) { int tid = omp_get_thread_num(); - std::chrono::time_point start = std::chrono::system_clock::now(); - vector alignments = mapper[tid]->align_multi(alignment, kmer_size, kmer_stride, max_mem_length, band_width, band_overlap, xdrop_alignment); - std::chrono::time_point end = std::chrono::system_clock::now(); - std::chrono::duration elapsed_seconds = end-start; - // Output the alignments in JSON or protobuf as appropriate. + vector alignments = mapper[tid]->align_multi(alignment, + kmer_size, + kmer_stride, + max_mem_length, + band_width, + band_overlap, + xdrop_alignment); if (compare_gam) { + // Compare against true input at mapping time alignments.front().set_correct(overlap(alignment.path(), alignments.front().path())); alignment_set_distance_to_correct(alignments.front(), alignment); } output_alignments(alignments, empty_alns); + reads_mapped_by_thread[tid] += 1; }; - stream::for_each_parallel(gam_in, lambda); + vg::io::for_each_parallel(gam_in, lambda); } gam_in.close(); } @@ -1390,51 +1236,36 @@ int main_map(int argc, char** argv) { } } - // clean up - for (int i = 0; i < thread_count; ++i) { - delete mapper[i]; - auto& output_buf = output_buffer[i]; - if (!output_json && !refpos_table && surject_type.empty()) { - stream::write_buffered(cout, output_buf, 0); - } - } - - // special cleanup for htslib outputs - if (!surject_type.empty()) { - if (hdr != nullptr) bam_hdr_destroy(hdr); - sam_close(sam_out); - cout.flush(); - } - if (haplo_score_provider) { delete haplo_score_provider; haplo_score_provider = nullptr; } - if (gbwt) { - delete gbwt; - gbwt = nullptr; - } - if (lcp) { - delete lcp; - lcp = nullptr; - } - if(gcsa) { - delete gcsa; - gcsa = nullptr; - } - if(xgidx) { - delete xgidx; - xgidx = nullptr; - } + std::chrono::time_point end = std::chrono::system_clock::now(); + std::chrono::duration mapping_seconds = end - init; + std::chrono::duration index_load_seconds = init - launch; + + if (log_time){ + + size_t total_reads_mapped = 0; + for (auto& reads_mapped : reads_mapped_by_thread) { + total_reads_mapped += reads_mapped; + } - for (Surjector* surjector : surjectors) { - delete surjector; + double reads_per_second_per_thread = total_reads_mapped / (mapping_seconds.count() * thread_count); + cerr << "Index load time: " << index_load_seconds.count() << endl; + cerr << "Mapped " << total_reads_mapped << " reads" << endl; + cerr << "Mapping speed: " << reads_per_second_per_thread << " reads per second per thread" << endl; } - + cout.flush(); + // clean up our mappers + for (uint64_t i = 0; i < mapper.size(); ++i) { + delete mapper[i]; + } + return 0; } -static Subcommand vg_map("map", "MEM-based read alignment", PIPELINE, 3, main_map); +static Subcommand vg_map("map", "MEM-based read alignment", PIPELINE, 5, main_map); diff --git a/src/subcommand/mcmc_main.cpp b/src/subcommand/mcmc_main.cpp new file mode 100644 index 00000000000..c111ce330a1 --- /dev/null +++ b/src/subcommand/mcmc_main.cpp @@ -0,0 +1,259 @@ +/** + * \file mcmc_main.cpp: GFA (Graph Alignment Format) Fast Emitter: a new mapper that will be *extremely* fast once we actually write it + */ + +#include +#include +#include +#include +#include +#include +#include +#include "subcommand.hpp" +#include +#include "../mcmc_genotyper.hpp" +#include "../vg.hpp" +#include "../multipath_alignment.hpp" +#include "../mcmc_caller.hpp" +#include "../graph_caller.hpp" +#include +#include +#include +#include + +using namespace std; +using namespace vg; +using namespace vg::subcommand; + +void help_mcmc(char** 
argv) { + cerr + << "usage: " << argv[0] << " mcmc [options] multipath_alns.mgam graph.vg sites.snarls > graph_with_paths.vg" << endl + << "Finds haplotypes based on reads using MCMC methods" << endl + << endl + << "basic options:" << endl + << " -i, --iteration-number INT tells us the number of iterations to run mcmc_genotyper with" < ref_paths; + vector ref_path_offsets; + vector ref_path_lengths; + + string vcf_out; + int burn_in; + int gamma_freq; + + if (argc < 7) { + help_mcmc(argv); + return 1; + } + + // initialize parameters with their default options + int n_iterations = 1000; + int seed = std::chrono::system_clock::now().time_since_epoch().count(); + string sample_name = "SAMPLE"; + + + int c; + optind = 2; // force optind past command positional argument + while (true) { + static struct option long_options[] = + { + {"help", no_argument, 0, 'h'}, + {"iteration-number", required_argument, 0, 'i'}, + {"seed", required_argument, 0, 'r'}, + {"sample", required_argument, 0, 's'}, + {"ref-path", required_argument, 0, 'p'}, + {"ref-offset", required_argument, 0, 'o'}, + {"ref-length", required_argument, 0, 'l'}, + {"vcf-out", required_argument, 0, 'v'}, + {"burn-in", required_argument, 0, 'b'}, + {"gamma-freq", required_argument, 0, 'g'}, + {0, 0, 0, 0} + }; + + int option_index = 0; + c = getopt_long (argc, argv, "hi:s:p:o:l:r:v:b:g:", + long_options, &option_index); + + + // Detect the end of the options. + if (c == -1) + break; + + switch (c) + { + case 'i': + n_iterations = parse(optarg); + break; + case 'r': + seed = parse(optarg); + break; + case 'p': + ref_paths.push_back(optarg); + break; + case 'o': + ref_path_offsets.push_back(parse(optarg)); + break; + case 'l': + ref_path_lengths.push_back(parse(optarg)); + break; + case 's': + sample_name = optarg; + break; + case 'v': + vcf_out = optarg; + break; + case 'b': + burn_in = parse(optarg); + break; + case 'g': + gamma_freq = parse(optarg); + break; + case 'h': + case '?': + default: + help_mcmc(argv); + exit(1); + break; + } + } + + string multipath_file = get_input_file_name(optind, argc, argv); + string graph_file = get_input_file_name(optind, argc, argv); + string snarls_file = get_input_file_name(optind, argc, argv); + + unique_ptr snarls = (vg::io::VPKG::load_one(snarls_file)); + + // // create a PathHandleGraph + unique_ptr path_hgraph; + bdsg::PathPositionOverlayHelper overlay_helper; + path_hgraph = vg::io::VPKG::load_one(graph_file); + + // Some stuff below here needs a vg graph. + VG* vg_graph = dynamic_cast(path_hgraph.get()); + + // Call this to populate the vg_graph if it isn't populated. + auto ensure_vg = [&]() -> vg::VG* { + if (vg_graph == nullptr) { + // Copy instead. + vg_graph = new vg::VG(); + handlealgs::copy_path_handle_graph(path_hgraph.get(), vg_graph); + // Give the unique_ptr ownership and delete the graph we loaded. 
+ path_hgraph.reset(vg_graph); + // Make sure the paths are all synced up + vg_graph->paths.to_graph(vg_graph->graph); + } + return vg_graph; + }; + + //convert to VG graph if needed + ensure_vg(); + + if(vg_graph == nullptr || vg_graph == 0){ + cerr << "Graph is NULL" <has_path(ref_path)) { + cerr << "error [vg mcmc]: Reference path \"" << ref_path << "\" not found in graph" << endl; + return 1; + } + } + + // Check our offsets + if (ref_path_offsets.size() != 0 && ref_path_offsets.size() != ref_paths.size()) { + cerr << "error [vg mcmc]: when using -o, the same number of paths must be given with -p" << endl; + return 1; + } + // Check our ref lengths + if (ref_path_lengths.size() != 0 && ref_path_lengths.size() != ref_paths.size()) { + cerr << "error [vg mcmc]: when using -l, the same number of paths must be given with -p" << endl; + return 1; + } + + // No paths specified: use them all + if (ref_paths.empty()) { + graph->for_each_path_handle([&](path_handle_t path_handle) { + const string& name = graph->get_path_name(path_handle); + if (!Paths::is_alt(name)) { + ref_paths.push_back(name); + } + }); + + } + + // Check if VCF output file is specified + ofstream vcf_file_out; + if(!vcf_out.empty()){ + vcf_file_out.open(vcf_out, ios::out); + } + + /* + *######################################################################################## + * GENOTYPING + *######################################################################################## + **/ + + vector reads; + get_input_file(multipath_file, [&] (istream& open_file){ + io::ProtobufIterator iter (open_file); + while(iter.has_current()){ + reads.emplace_back(); + from_proto_multipath_alignment(*iter, reads.back()); + // vg::view_multipath_alignment_as_dot(cerr,*iter,true); + ++iter; + } + }); + double log_base = gssw_dna_recover_log_base(1,4,.5,1e-12); + // invoke run genotyper + MCMCGenotyper mcmc_genotyper(*snarls, *vg_graph, n_iterations, seed, burn_in, gamma_freq); + unique_ptr genome = mcmc_genotyper.run_genotype(reads, log_base); + + // genome->print_phased_genome(); + + /* + *######################################################################################## + * VCF OUTPUT + *######################################################################################## + **/ + + // Create MCMC_Caller object + MCMCCaller mcmc_caller(graph, *genome, *snarls, sample_name, ref_paths, ref_path_offsets, ref_path_lengths, cout); + + // Write header to ofstream + vcf_file_out << mcmc_caller.vcf_header(*graph, ref_paths, ref_path_lengths); + + // The current implementation writes a VCF record after each variant is processed + mcmc_caller.call_top_level_snarls(); + + // mcmc_caller.write_variants(cerr); + mcmc_caller.write_variants(vcf_file_out); + + //close the vcf file + vcf_file_out.close(); + + // will output a graph w/ embedded paths + vg_graph->serialize_to_ostream(std::cout); + + return 0; +} + +// Register subcommand +static Subcommand vg_mcmc("mcmc", "Finds haplotypes based on reads using MCMC methods", DEVELOPMENT, main_mcmc); + + diff --git a/src/subcommand/minimizer_main.cpp b/src/subcommand/minimizer_main.cpp new file mode 100644 index 00000000000..86f0039cd25 --- /dev/null +++ b/src/subcommand/minimizer_main.cpp @@ -0,0 +1,444 @@ +/** \file minimizer_main.cpp + * + * Defines the "vg minimizer" subcommand, which builds the minimizer index. + * + * The index contains the lexicographically smallest kmer in a window of w + * successive kmers and their reverse complements.
If the kmer contains + * characters other than A, C, G, and T, it will not be indexed. + * + * The index contains either all or haplotype-consistent minimizers. Indexing all + * minimizers from complex graph regions can take a long time (e.g. tens of hours + * vs 5-10 minutes for 1000GP), because many windows have the same minimizer. + * As the total number of minimizers is manageable (e.g. 1.5x more for 1000GP) + * it should be possible to develop a better algorithm for finding the minimizers. + * + * A quick idea for indexing the entire graph: + * - For each node v, extract the subgraph for the windows starting in v. + * - Extract all k'-mers from the subgraph and use them to determine where the + * minimizers can start. + */ + +#include "subcommand.hpp" + +#include + +#include +#include +#include + +#include +#include + +#include "../gbwtgraph_helper.hpp" +#include "../gbwt_helper.hpp" +#include "../index_registry.hpp" +#include "../utility.hpp" +#include "../handle.hpp" +#include "../snarl_distance_index.hpp" + +#include + +using namespace vg; + +// Using too many threads just wastes CPU time without speeding up the construction. +constexpr int DEFAULT_MAX_THREADS = 16; + +// For weighted minimizers. +constexpr size_t DEFAULT_THRESHOLD = 500; // This should be Giraffe hard hit cap. +constexpr size_t DEFAULT_ITERATIONS = 3; +constexpr size_t MAX_ITERATIONS = gbwtgraph::MinimizerHeader::FLAG_WEIGHT_MASK >> gbwtgraph::MinimizerHeader::FLAG_WEIGHT_OFFSET; +constexpr size_t HASH_TABLE_MIN_WIDTH = 10; +constexpr size_t HASH_TABLE_MAX_WIDTH = 36; + +int get_default_threads() { + return std::min(omp_get_max_threads(), DEFAULT_MAX_THREADS); +} + +size_t estimate_hash_table_size(const gbwtgraph::GBZ& gbz, bool progress); + +void help_minimizer(char** argv) { + std::cerr << "usage: " << argv[0] << " minimizer [options] -d graph.dist -o graph.min graph" << std::endl; + std::cerr << std::endl; + std::cerr << "Builds a (w, k)-minimizer index or a (k, s)-syncmer index of the threads in the GBWT" << std::endl; + std::cerr << "index. The graph can be any HandleGraph, which will be transformed into a GBWTGraph." << std::endl; + std::cerr << "The transformation can be avoided by providing a GBWTGraph or a GBZ graph." 
<< std::endl; + std::cerr << std::endl; + std::cerr << "Required options:" << std::endl; + std::cerr << " -d, --distance-index X annotate the hits with positions in this distance index" << std::endl; + std::cerr << " -o, --output-name X store the index to file X" << std::endl; + std::cerr << std::endl; + std::cerr << "Minimizer options:" << std::endl; + std::cerr << " -k, --kmer-length N length of the kmers in the index (default " << IndexingParameters::minimizer_k << ", max " << gbwtgraph::DefaultMinimizerIndex::key_type::KMER_MAX_LENGTH << ")" << std::endl; + std::cerr << " -w, --window-length N choose the minimizer from a window of N kmers (default " << IndexingParameters::minimizer_w << ")" << std::endl; + std::cerr << " -c, --closed-syncmers index closed syncmers instead of minimizers" << std::endl; + std::cerr << " -s, --smer-length N use smers of length N in closed syncmers (default " << IndexingParameters::minimizer_s << ")" << std::endl; + std::cerr << std::endl; + std::cerr << "Weighted minimizers:" << std::endl; + std::cerr << " -W, --weighted use weighted minimizers" << std::endl; + std::cerr << " --threshold N downweight kmers with more than N hits (default " << DEFAULT_THRESHOLD << ")" << std::endl; + std::cerr << " --iterations N downweight frequent kmers by N iterations (default " << DEFAULT_ITERATIONS << ")" << std::endl; + std::cerr << " --fast-counting use the fast kmer counting algorithm (default)" << std::endl; + std::cerr << " --save-memory use the space-efficient kmer counting algorithm" << std::endl; + std::cerr << " --hash-table N use 2^N-cell hash tables for kmer counting (default: guess)" << std::endl; + std::cerr << std::endl; + std::cerr << "Other options:" << std::endl; + std::cerr << " -l, --load-index X load the index from file X and insert the new kmers into it" << std::endl; + std::cerr << " (overrides minimizer / weighted minimizer options)" << std::endl; + std::cerr << " -g, --gbwt-name X use the GBWT index in file X (required with a non-GBZ graph)" << std::endl; + std::cerr << " -p, --progress show progress information" << std::endl; + std::cerr << " -t, --threads N use N threads for index construction (default " << get_default_threads() << ")" << std::endl; + std::cerr << " (using more than " << DEFAULT_MAX_THREADS << " threads rarely helps)" << std::endl; + std::cerr << " --no-dist build the index without distance index annotations (not recommended)" << std::endl; + std::cerr << std::endl; +} + +int main_minimizer(int argc, char** argv) { + + if (argc <= 5) { + help_minimizer(argv); + return 1; + } + + // Command-line options. 
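// [Editor's note, illustrative only -- not part of the patch] Based on the help text above, a
// typical invocation would look something like:
//     vg minimizer -t 16 -p -d graph.dist -o graph.min graph.gbz
// Here graph.gbz is assumed to be a GBZ that already bundles the GBWT, so -g is not needed;
// with a plain HandleGraph input, -g graph.gbwt would be required as well.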
+ std::string output_name, distance_name, load_index, gbwt_name, graph_name; + bool use_syncmers = false; + bool weighted = false, space_efficient_counting = false; + size_t threshold = DEFAULT_THRESHOLD, iterations = DEFAULT_ITERATIONS, hash_table_size = 0; + bool progress = false; + int threads = get_default_threads(); + bool require_distance_index = true; + + constexpr int OPT_THRESHOLD = 1001; + constexpr int OPT_ITERATIONS = 1002; + constexpr int OPT_FAST_COUNTING = 1003; + constexpr int OPT_SAVE_MEMORY = 1004; + constexpr int OPT_HASH_TABLE = 1005; + constexpr int OPT_NO_DIST = 1100; + + int c; + optind = 2; // force optind past command positional argument + while (true) { + static struct option long_options[] = + { + { "gbwt-name", required_argument, 0, 'g' }, + { "distance-index", required_argument, 0, 'd' }, + { "output-name", required_argument, 0, 'o' }, + { "index-name", required_argument, 0, 'i' }, // deprecated + { "kmer-length", required_argument, 0, 'k' }, + { "window-length", required_argument, 0, 'w' }, + { "bounded-syncmers", no_argument, 0, 'b' }, // deprecated + { "closed-syncmers", no_argument, 0, 'c' }, + { "smer-length", required_argument, 0, 's' }, + { "weighted", no_argument, 0, 'W' }, + { "threshold", required_argument, 0, OPT_THRESHOLD }, + { "iterations", required_argument, 0, OPT_ITERATIONS }, + { "fast-counting", no_argument, 0, OPT_FAST_COUNTING }, + { "save-memory", no_argument, 0, OPT_SAVE_MEMORY }, + { "hash-table", required_argument, 0, OPT_HASH_TABLE }, + { "load-index", required_argument, 0, 'l' }, + { "gbwt-graph", no_argument, 0, 'G' }, // deprecated + { "progress", no_argument, 0, 'p' }, + { "threads", required_argument, 0, 't' }, + { "no-dist", no_argument, 0, OPT_NO_DIST }, + { 0, 0, 0, 0 } + }; + + int option_index = 0; + c = getopt_long(argc, argv, "g:d:o:i:k:w:bcs:Wl:Gpt:h", long_options, &option_index); + if (c == -1) { break; } // End of options. 
+ + switch (c) + { + case 'g': + gbwt_name = optarg; + break; + case 'd': + distance_name = optarg; + break; + case 'o': + output_name = optarg; + break; + case 'i': + std::cerr << "[vg minimizer] warning: --index-name is deprecated, use --output-name instead" << std::endl; + output_name = optarg; + break; + + case 'k': + IndexingParameters::minimizer_k = parse(optarg); + break; + case 'w': + IndexingParameters::minimizer_w = parse(optarg); + break; + case 'b': + std::cerr << "[vg minimizer] warning: --bounded-syncmers is deprecated, use --closed-syncmers instead" << std::endl; + use_syncmers = true; + break; + case 'c': + use_syncmers = true; + break; + case 's': + IndexingParameters::minimizer_s = parse(optarg); + break; + + case 'W': + weighted = true; + break; + case OPT_THRESHOLD: + threshold = parse(optarg); + break; + case OPT_ITERATIONS: + iterations = parse(optarg); + iterations = std::max(iterations, size_t(1)); + iterations = std::min(iterations, MAX_ITERATIONS); + break; + case OPT_FAST_COUNTING: + space_efficient_counting = false; + break; + case OPT_SAVE_MEMORY: + space_efficient_counting = true; + break; + case OPT_HASH_TABLE: + { + size_t width = parse(optarg); + width = std::max(width, HASH_TABLE_MIN_WIDTH); + width = std::min(width, HASH_TABLE_MAX_WIDTH); + hash_table_size = size_t(1) << width; + } + break; + + case 'l': + load_index = optarg; + break; + case 'G': + std::cerr << "[vg minimizer] warning: --gbwt-graph is deprecated, graph format is now autodetected" << std::endl; + break; + case 'p': + progress = true; + break; + case 't': + threads = parse(optarg); + threads = std::min(threads, omp_get_max_threads()); + threads = std::max(threads, 1); + break; + case OPT_NO_DIST: + require_distance_index = false; + break; + + case 'h': + case '?': + help_minimizer(argv); + return 1; + default: + std::abort(); + } + } + if (output_name.empty()) { + std::cerr << "[vg minimizer] error: option --output-name is required" << std::endl; + return 1; + } + if (optind + 1 != argc) { + help_minimizer(argv); + return 1; + } + graph_name = argv[optind]; + if (require_distance_index && distance_name.empty()) { + std::cerr << "[vg minimizer] error: one of options --distance-index and --no-dist is required" << std::endl; + return 1; + } + if (!load_index.empty() || use_syncmers) { + weighted = false; + } + omp_set_num_threads(threads); + + + double start = gbwt::readTimer(); + + // We use GBWT and GBWTGraph in this GBZ wrapper. 
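// [Editor's note, illustrative only -- not part of the patch] The loading code below tries the
// input in order of preference: a GBZ (which already bundles the GBWT and GBWTGraph), then a
// bare GBWTGraph that must be paired with a GBWT given via --gbwt-name, and finally any other
// HandleGraph, from which a GBWTGraph is built against the --gbwt-name GBWT.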
+ unique_ptr gbz; + + // Load whatever the graph argument is + if (progress) { + std::cerr << "Loading input graph from " << graph_name << std::endl; + } + auto input = vg::io::VPKG::try_load_first(graph_name); + if (get<0>(input)) { + // We loaded a GBZ directly + gbz = std::move(get<0>(input)); + } else if (get<1>(input)) { + // We loaded a GBWTGraph and need to pair it with a GBWT + gbz.reset(new gbwtgraph::GBZ()); + gbz->graph = std::move(*get<1>(input)); + + if (gbwt_name.empty()) { + std::cerr << "[vg minimizer] error: option --gbwt-name is required when using a GBWTGraph" << std::endl; + return 1; + } + + // Go get the GBWT + load_gbwt(gbz->index, gbwt_name, progress); + // And attach them together + gbz->graph.set_gbwt(gbz->index); + } else if (get<2>(input)) { + // We got a normal HandleGraph + + if (gbwt_name.empty()) { + std::cerr << "[vg minimizer] error: option --gbwt-name is required when using a HandleGraph" << std::endl; + return 1; + } + + if (progress) { + std::cerr << "Loading GBWT from " << gbwt_name << std::endl; + } + std::unique_ptr gbwt_index(vg::io::VPKG::load_one(gbwt_name)); + if (progress) { + std::cerr << "Building GBWTGraph" << std::endl; + } + gbz.reset(new gbwtgraph::GBZ(gbwt_index, *get<2>(input))); + } else { + std::cerr << "[vg minimizer] error: input graph is not a GBZ, GBWTGraph, or HandleGraph." << std::endl; + return 1; + } + + // Find frequent kmers. + std::vector frequent_kmers; + if (weighted) { + double checkpoint = gbwt::readTimer(); + if (progress) { + std::string algorithm = (space_efficient_counting ? "space-efficient" : "fast"); + std::cerr << "Finding frequent kmers using the " << algorithm << " algorithm" << std::endl; + } + if (hash_table_size == 0) { + hash_table_size = estimate_hash_table_size(*gbz, progress); + } + frequent_kmers = gbwtgraph::frequent_kmers( + gbz->graph, IndexingParameters::minimizer_k, threshold, space_efficient_counting, hash_table_size + ); + if (progress) { + double seconds = gbwt::readTimer() - start; + std::cerr << "Found " << frequent_kmers.size() << " kmers with more than " << threshold << " hits in " << seconds << " seconds" << std::endl; + } + } + + // Minimizer index. + std::unique_ptr index; + if (load_index.empty()) { + index = std::make_unique(IndexingParameters::minimizer_k, + (use_syncmers ? IndexingParameters::minimizer_s : IndexingParameters::minimizer_w), + use_syncmers); + if (weighted && !frequent_kmers.empty()) { + index->add_frequent_kmers(frequent_kmers, iterations); + } + } else { + if (progress) { + std::cerr << "Loading MinimizerIndex from " << load_index << std::endl; + } + index = vg::io::VPKG::load_one(load_index); + } + + // Distance index. + std::unique_ptr distance_index; + if (!distance_name.empty()) { + // new distance index + if (progress) { + std::cerr << "Loading SnarlDistanceIndex from " << distance_name << std::endl; + } + distance_index = vg::io::VPKG::load_one(distance_name); + distance_index->preload(true); + } + + // Build the index. 
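// [Editor's note, illustrative only -- not part of the patch] index_haplotypes() enumerates the
// minimizers over the GBWT haplotypes, and the callback chooses the payload stored with each hit:
// MIPayload::NO_CODE when --no-dist is used, otherwise the encoded distance-index position of the hit.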
+ if (progress) { + std::cerr << "Building MinimizerIndex with k = " << index->k(); + if (index->uses_syncmers()) { + std::cerr << ", s = " << index->s(); + } else { + std::cerr << ", w = " << index->w(); + } + std::cerr << std::endl; + } + if (distance_name.empty()) { + gbwtgraph::index_haplotypes(gbz->graph, *index, [](const pos_t&) -> gbwtgraph::Payload { + return MIPayload::NO_CODE; + }); + } else { + gbwtgraph::index_haplotypes(gbz->graph, *index, [&](const pos_t& pos) -> gbwtgraph::Payload { + return MIPayload::encode(get_minimizer_distances(*distance_index,pos)); + }); + } + + // Index statistics. + if (progress) { + std::cerr << index->size() << " keys (" << index->unique_keys() << " unique)" << std::endl; + std::cerr << "Minimizer occurrences: " << index->number_of_values() << std::endl; + std::cerr << "Load factor: " << index->load_factor() << std::endl; + double seconds = gbwt::readTimer() - start; + std::cerr << "Construction so far: " << seconds << " seconds" << std::endl; + } + + // Serialize the index. + save_minimizer(*index, output_name); + + if (progress) { + double seconds = gbwt::readTimer() - start; + std::cerr << "Time usage: " << seconds << " seconds" << std::endl; + std::cerr << "Memory usage: " << gbwt::inGigabytes(gbwt::memoryUsage()) << " GiB" << std::endl; + } + + return 0; +} + +//------------------------------------------------------------------------------ + +size_t trailing_zeros(size_t value) { + size_t result = 0; + if (value == 0) { + return result; + } + while ((value & 1) == 0) { + value >>= 1; + result++; + } + return result; +} + +size_t estimate_hash_table_size(const gbwtgraph::GBZ& gbz, bool progress) { + if (progress) { + std::cerr << "Estimating genome size" << std::endl; + } + size_t genome_size = 0; + + if (gbz.graph.get_path_count() > 0) { + gbz.graph.for_each_path_handle([&](const path_handle_t& path_handle) { + gbz.graph.for_each_step_in_path(path_handle, [&](const step_handle_t& step_handle) { + handle_t handle = gbz.graph.get_handle_of_step(step_handle); + genome_size += gbz.graph.get_length(handle); + }); + }); + if (progress) { + std::cerr << "Estimated size based on reference / generic paths: " << genome_size << std::endl; + } + } + + if (genome_size == 0) { + gbz.graph.for_each_handle([&](const handle_t& handle) { + genome_size += gbz.graph.get_length(handle); + }); + if (progress) { + std::cerr << "Estimated size based on total sequence length: " << genome_size << std::endl; + } + } + + // Genome size / 2 should be a reasonably tight upper bound for the number of kmers + // with any specific base in the middle position. 
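// [Editor's sketch, hypothetical numbers -- not part of the patch] For a ~3 Gbp genome,
// genome_size / 2 is roughly 1.5e9; if minimum_size() rounds that up to the next power of two,
// the table gets 2^31 cells, and trailing_zeros() (defined above) recovers the exponent 31
// for the progress message printed below.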
+ size_t hash_table_size = gbwtgraph::KmerIndex::minimum_size(genome_size / 2); + if (progress) { + std::cerr << "Estimated hash table size: 2^" << trailing_zeros(hash_table_size) << std::endl; + } + + return hash_table_size; +} + +//------------------------------------------------------------------------------ + +// Register subcommand +static vg::subcommand::Subcommand vg_minimizer("minimizer", "build a minimizer index or a syncmer index", vg::subcommand::TOOLKIT, main_minimizer); diff --git a/src/subcommand/mod_main.cpp b/src/subcommand/mod_main.cpp index f3887473f87..a0a23ef9b4c 100644 --- a/src/subcommand/mod_main.cpp +++ b/src/subcommand/mod_main.cpp @@ -12,10 +12,15 @@ #include "../vg.hpp" #include "../cactus.hpp" -#include "../stream.hpp" +#include +#include +#include +#include "../handle.hpp" #include "../utility.hpp" -#include "../algorithms/topological_sort.hpp" -#include "../algorithms/remove_high_degree.hpp" +#include "../algorithms/simplify_siblings.hpp" +#include "../algorithms/normalize.hpp" +#include "../algorithms/prune.hpp" +#include "../io/save_handle_graph.hpp" using namespace std; using namespace vg; @@ -26,40 +31,30 @@ void help_mod(char** argv) { << "Modifies graph, outputs modified on stdout." << endl << endl << "options:" << endl - << " -i, --include-aln FILE merge the paths implied by alignments into the graph" << endl - << " -q, --include-loci FILE merge all alleles in loci into the graph" << endl - << " -Q, --include-gt FILE merge only the alleles in called genotypes into the graph" << endl - << " -Z, --translation FILE write the translation generated by editing with -i to FILE" << endl << " -P, --label-paths don't edit with -i alignments, just use them for labeling the graph" << endl << " -c, --compact-ids should we sort and compact the id space? 
(default false)" << endl - << " -C, --compact-ranks compact mapping ranks in paths" << endl - << " -z, --sort sort the graph using an approximate topological sort" << endl << " -b, --break-cycles use an approximate topological sort to break cycles in the graph" << endl << " -n, --normalize normalize the graph so that edges are always non-redundant" << endl << " (nodes have unique starting and ending bases relative to neighbors," << endl << " and edges that do not introduce new paths are removed and neighboring" << endl << " nodes are merged)" << endl << " -U, --until-normal N iterate normalization until convergence, or at most N times" << endl + << " -z, --nomerge-pre STR do not let normalize (-n, -U) zip up any pair of nodes that both belong to path with prefix STR" << endl << " -E, --unreverse-edges flip doubly-reversing edges so that they are represented on the" << endl << " forward strand of the graph" << endl << " -s, --simplify remove redundancy from the graph that will not change its path space" << endl - << " -T, --strong-connect outputs the strongly-connected components of the graph" << endl << " -d, --dagify-step N copy strongly connected components of the graph N times, forwarding" << endl << " edges from old to new copies to convert the graph into a DAG" << endl << " -w, --dagify-to N copy strongly connected components of the graph forwarding" << endl << " edges from old to new copies to convert the graph into a DAG" << endl << " until the shortest path through each SCC is N bases long" << endl << " -L, --dagify-len-max N stop a dagification step if the unrolling component has this much sequence" << endl - << " -f, --unfold N represent inversions accesible up to N from the forward" << endl + << " -f, --unfold N represent inversions accessible up to N from the forward" << endl << " component of the graph" << endl << " -O, --orient-forward orient the nodes in the graph forward" << endl - << " -D, --drop-paths remove the paths of the graph" << endl - << " -r, --retain-path NAME remove any path not specified for retention" << endl - << " -I, --retain-complement keep only paths NOT specified with -r" << endl - << " -k, --keep-path NAME keep only nodes and edges in the path" << endl << " -N, --remove-non-path keep only nodes and edges which are part of paths" << endl << " -A, --remove-path keep only nodes and edges which are not part of any path" << endl - << " -o, --remove-orphans remove orphan edges from graph (edge specified but node missing)" << endl + << " -k, --keep-path NAME keep only nodes and edges in the path" << endl << " -R, --remove-null removes nodes that have no sequence, forwarding their edges" << endl << " -g, --subgraph ID gets the subgraph rooted at node ID, multiple allowed" << endl << " -x, --context N steps the subgraph out by N steps (default: 1)" << endl @@ -70,13 +65,11 @@ void help_mod(char** argv) { << " -X, --chop N chop nodes in the graph so they are not more than N bp long" << endl << " -u, --unchop where two nodes are only connected to each other and by one edge" << endl << " replace the pair with a single node that is the concatenation of their labels" << endl - << " -K, --kill-labels delete the labels from the graph, resulting in empty nodes" << endl << " -e, --edge-max N only consider paths which make edge choices at <= this many points" << endl << " -M, --max-degree N unlink nodes that have edge degree greater than N" << endl << " -m, --markers join all head and tails nodes to marker nodes" << endl << " ('###' starts and '$$$' ends) of --length, for 
debugging" << endl << " -y, --destroy-node ID remove node with given id" << endl - << " -B, --bluntify bluntify the graph, making nodes for duplicated sequences in overlaps" << endl << " -a, --cactus convert to cactus graph representation" << endl << " -v, --sample-vcf FILE for a graph with allele paths, compute the sample graph from the given VCF" << endl << " -G, --sample-graph FILE subset an augmented graph to a sample graph using a Locus file" << endl @@ -91,10 +84,6 @@ int main_mod(int argc, char** argv) { } string path_name; - bool remove_orphans = false; - string aln_file; - string loci_file; - bool called_genotypes_only = false; bool label_paths = false; bool compact_ids = false; bool prune_complex = false; @@ -103,36 +92,29 @@ int main_mod(int argc, char** argv) { int chop_to = 0; bool add_start_and_end_markers = false; bool prune_subgraphs = false; - bool kill_labels = false; bool simplify_graph = false; bool unchop = false; bool normalize_graph = false; - bool sort_graph = false; bool remove_non_path = false; bool remove_path = false; bool compact_ranks = false; - bool drop_paths = false; - set paths_to_retain; - bool retain_complement = false; - vector root_nodes; + vector root_nodes; int32_t context_steps; - bool remove_null; - bool strong_connect = false; + bool remove_null = false; uint32_t unfold_to = 0; bool break_cycles = false; uint32_t dagify_steps = 0; uint32_t dagify_to = 0; uint32_t dagify_component_length_max = 0; bool orient_forward = false; - int64_t destroy_node_id = 0; - bool bluntify = false; + nid_t destroy_node_id = 0; int until_normal_iter = 0; - string translation_file; bool flip_doubly_reversed_edges = false; bool cactus = false; string vcf_filename; string loci_filename; int max_degree = 0; + string nomerge_prefix; int c; optind = 2; // force optind past command positional argument @@ -146,7 +128,6 @@ int main_mod(int argc, char** argv) { {"include-gt", required_argument, 0, 'Q'}, {"compact-ids", no_argument, 0, 'c'}, {"compact-ranks", no_argument, 0, 'C'}, - {"drop-paths", no_argument, 0, 'D'}, {"keep-path", required_argument, 0, 'k'}, {"remove-orphans", no_argument, 0, 'o'}, {"prune-complex", no_argument, 0, 'p'}, @@ -154,7 +135,6 @@ int main_mod(int argc, char** argv) { {"length", required_argument, 0, 'l'}, {"edge-max", required_argument, 0, 'e'}, {"chop", required_argument, 0, 'X'}, - {"kill-labels", no_argument, 0, 'K'}, {"markers", no_argument, 0, 'm'}, {"threads", no_argument, 0, 't'}, {"label-paths", no_argument, 0, 'P'}, @@ -162,22 +142,18 @@ int main_mod(int argc, char** argv) { {"unchop", no_argument, 0, 'u'}, {"normalize", no_argument, 0, 'n'}, {"until-normal", required_argument, 0, 'U'}, - {"sort", no_argument, 0, 'z'}, + {"nomerge-pre", required_argument, 0, 'z'}, {"remove-non-path", no_argument, 0, 'N'}, {"remove-path", no_argument, 0, 'A'}, {"orient-forward", no_argument, 0, 'O'}, {"unfold", required_argument, 0, 'f'}, - {"retain-path", required_argument, 0, 'r'}, {"subgraph", required_argument, 0, 'g'}, {"context", required_argument, 0, 'x'}, {"remove-null", no_argument, 0, 'R'}, - {"strong-connect", no_argument, 0, 'T'}, {"dagify-steps", required_argument, 0, 'd'}, {"dagify-to", required_argument, 0, 'w'}, {"dagify-len-max", required_argument, 0, 'L'}, - {"bluntify", no_argument, 0, 'B'}, {"break-cycles", no_argument, 0, 'b'}, - {"orient-forward", no_argument, 0, 'O'}, {"destroy-node", required_argument, 0, 'y'}, {"translation", required_argument, 0, 'Z'}, {"unreverse-edges", required_argument, 0, 'E'}, @@ -185,11 +161,14 @@ int 
main_mod(int argc, char** argv) { {"sample-vcf", required_argument, 0, 'v'}, {"sample-graph", required_argument, 0, 'G'}, {"max-degree", required_argument, 0, 'M'}, + {"drop-paths", no_argument, 0, 'D'}, + {"retain-path", required_argument, 0, 'r'}, + {"retain-complement", no_argument, 0, 'I'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hk:oi:q:Q:cpl:e:mt:SX:KPsunzNAf:CDr:Ig:x:RTU:Bbd:Ow:L:y:Z:Eav:G:M:", + c = getopt_long (argc, argv, "hk:oi:q:Q:cpl:e:mt:SX:KPsunz:NAf:Cg:x:RTU:Bbd:Ow:L:y:Z:Eav:G:M:Dr:I", long_options, &option_index); @@ -201,44 +180,48 @@ int main_mod(int argc, char** argv) { { case 'i': - aln_file = optarg; - break; + cerr << "[vg mod] error: vg mod -i is deprecated. please switch to vg augment" << endl; + exit(1); case 'q': - loci_file = optarg; - break; + cerr << "[vg mod] error: vg mod -q is deprecated. please switch to vg augment -l" << endl; + exit(1); case 'Q': - loci_file = optarg; - called_genotypes_only = true; + cerr << "[vg mod] error: vg mod -Q is deprecated. please switch to vg augment -L" << endl; + exit(1); break; case 'Z': - translation_file = optarg; + cerr << "[vg mod] error: vg mod -Z is deprecated. please switch to vg augment -Z" << endl; + exit(1); break; - case 'c': - compact_ids = true; + case 'D': + cerr << "[vg mod] error: vg mod -D is deprecated. please switch to vg paths -d" << endl; + exit(1); break; - case 'C': - compact_ranks = true; + case 'r': + cerr << "[vg mod] error: vg mod -r is deprecated. please switch to vg paths -r" << endl; + exit(1); break; - case 'k': - path_name = optarg; + case 'I': + cerr << "[vg mod] error: vg mod -I is deprecated. please switch to vg paths -d" << endl; + exit(1); break; - case 'r': - paths_to_retain.insert(optarg); + case 'c': + compact_ids = true; break; - - case 'I': - retain_complement = true; + + case 'k': + path_name = optarg; break; case 'o': - remove_orphans = true; + cerr << "warning[vg mod]: -o is deprecated. Dangling edges are now automatically removed." << endl; break; case 'p': @@ -265,10 +248,6 @@ int main_mod(int argc, char** argv) { flip_doubly_reversed_edges = true; break; - case 'K': - kill_labels = true; - break; - case 'e': edge_max = parse(optarg); break; @@ -290,13 +269,10 @@ int main_mod(int argc, char** argv) { break; case 'P': + cerr << "[vg mod] warning: vg mod -P is deprecated and will soon be removed. 
please switch to vg augment -B" << endl; label_paths = true; break; - case 'D': - drop_paths = true; - break; - case 's': simplify_graph = true; break; @@ -305,6 +281,10 @@ int main_mod(int argc, char** argv) { normalize_graph = true; break; + case 'z': + nomerge_prefix = optarg; + break; + case 'N': remove_non_path = true; break; @@ -313,10 +293,6 @@ int main_mod(int argc, char** argv) { remove_path = true; break; - case 'T': - strong_connect = true; - break; - case 'U': until_normal_iter = parse(optarg); break; @@ -333,14 +309,6 @@ int main_mod(int argc, char** argv) { dagify_component_length_max = parse(optarg); break; - case 'B': - bluntify = true; - break; - - case 'z': - sort_graph = true; - break; - case 'b': break_cycles = true; break; @@ -388,33 +356,17 @@ int main_mod(int argc, char** argv) { } } - VG* graph; - get_input_file(optind, argc, argv, [&](istream& in) { - graph = new VG(in); - }); - - if (retain_complement) { - // Compute the actual paths to retain - set complement; - graph->paths.for_each_name([&](const string& name) { - if (!paths_to_retain.count(name)) { - // Complement the set the user specified by putting in all the - // paths they didn't mention. - complement.insert(name); - } - }); - - // Retain the complement of what we were asking for. - paths_to_retain = complement; - } + unique_ptr graph; + string graph_filename = get_input_file_name(optind, argc, argv); + graph = vg::io::VPKG::load_one(graph_filename); if (!vcf_filename.empty()) { // We need to throw out the parts of the graph that are on alt paths, // but not on alt paths for alts used by the first sample in the VCF. - // This is matched against the entire path name string to detect alt + // This is called with the entire path name string to detect alt // paths. - const regex& is_alt = Paths::is_alt; + const function& is_alt = Paths::is_alt; // This holds the VCF file we read the variants from. It needs to be the // same one used to construct the graph. @@ -435,17 +387,17 @@ int main_mod(int argc, char** argv) { // This will hold the IDs of all nodes visited by alt paths that aren't used. set alt_path_ids; - graph->paths.for_each_name([&](const string& alt_path_name) { + graph->for_each_path_handle([&](const path_handle_t& p) { + auto name = graph->get_path_name(p); // For every path name in the graph - if(regex_match(alt_path_name, is_alt)) { - // If it's an alt path - - for(auto& mapping : graph->paths.get_path(alt_path_name)) { + if(is_alt(name)) { + // If it's an alt path, walk it + + graph->for_each_step_in_path(p, [&](const step_handle_t& s) { // Mark all nodes that are part of it as on alt paths - alt_path_ids.insert(mapping.node_id()); - } - + alt_path_ids.insert(graph->get_id(graph->get_handle_of_step(s))); + }); } }); @@ -461,13 +413,7 @@ int main_mod(int argc, char** argv) { // Grab its id, or make one by hashing stuff if it doesn't // have an ID. string var_name = make_variant_id(variant); - - if(!graph->paths.has_path("_alt_" + var_name + "_0")) { - // There isn't a reference alt path for this variant. Someone messed up. - cerr << variant << endl; - throw runtime_error("Reference alt for " + var_name + " not in graph!"); - } - + // For now always work on sample 0. TODO: let the user specify a // name and find it. 
int sample_number = 0; @@ -494,14 +440,14 @@ int main_mod(int argc, char** argv) { allele_number = stoi(it->str()); } - - // Make the name for its alt path string alt_path_name = "_alt_" + var_name + "_" + to_string(allele_number); - - for(auto& mapping : graph->paths.get_path(alt_path_name)) { - // Un-mark all nodes that are on this alt path, since it is used by the sample. - alt_path_ids.erase(mapping.node_id()); + if (graph->has_path(alt_path_name)) { + // This alt path exists and may be nonempty. + graph->for_each_step_in_path(graph->get_path_handle(alt_path_name), [&](const step_handle_t& s) { + // Un-mark all nodes that are on this alt path, since it is used by the sample. + alt_path_ids.erase(graph->get_id(graph->get_handle_of_step(s))); + }); } } @@ -530,19 +476,31 @@ int main_mod(int argc, char** argv) { for(auto& node_id : alt_path_ids) { // And delete all the nodes that were used by alt paths that weren't // in the genotype of the first sample. - - for(auto& path_name : graph->paths.of_node(node_id)) { + + // TODO: keep handles instead maybe? + handle_t node = graph->get_handle(node_id); + + // We need to destroy all paths that touch this node. + // But we can't do it while iterating (that's asking a lot of the handle graph implementation). + // So we collect path handles and then destroy them. + // They need to be a set so we can deduplicate multiple visits of a path. + unordered_set paths_to_remove; + graph->for_each_step_on_handle(node, [&](const step_handle_t& s) { // For every path that touches the node we're destroying, // destroy the path. We can't leave it because it won't be the // same path without this node. - graph->paths.remove_path(path_name); + paths_to_remove.emplace(graph->get_path_handle_of_step(s)); #ifdef debug - cerr << "Node " << node_id << " was on path " << path_name << endl; + cerr << "Node " << node_id << " was on path " << graph->get_path_name(graph->get_path_handle_of_step(s)) << endl; #endif + }); + + for(auto& path : paths_to_remove) { + graph->destroy_path(path); } // Actually get rid of the node once its paths are gone. - graph->destroy_node(node_id); + graph->destroy_handle(node); } } @@ -553,8 +511,8 @@ int main_mod(int argc, char** argv) { assert(loci_file.is_open()); // What nodes and edges are called as present by the loci? - set called_nodes; - set called_edges; + set called_nodes; + set called_edges; function lambda = [&](Locus& locus) { // For each locus @@ -578,244 +536,187 @@ int main_mod(int argc, char** argv) { const Mapping& m = allele.mapping(i); // Remember to keep this node - called_nodes.insert(graph->get_node(m.position().node_id())); + called_nodes.insert(graph->get_handle(m.position().node_id())); if (i + 1 < allele.mapping_size()) { // Look at the next mapping, which exists const Mapping& m2 = allele.mapping(i + 1); // Find the edge from the last Mapping's node to this one and mark it as used - called_edges.insert(graph->get_edge(NodeSide(m.position().node_id(), !m.position().is_reverse()), - NodeSide(m2.position().node_id(), m2.position().is_reverse()))); + called_edges.insert(graph->edge_handle(graph->get_handle(m.position().node_id(), m.position().is_reverse()), + graph->get_handle(m2.position().node_id(), m2.position().is_reverse()))); } } } }; - stream::for_each(loci_file, lambda); + vg::io::for_each(loci_file, lambda); // Collect all the unused nodes and edges (so we don't try to delete // while iterating...)
- set unused_nodes; - set unused_edges; + set unused_nodes; + set unused_edges; - graph->for_each_node([&](Node* n) { + graph->for_each_handle([&](const handle_t& n) { if (!called_nodes.count(n)) { unused_nodes.insert(n); } }); - graph->for_each_edge([&](Edge* e) { + graph->for_each_edge([&](const edge_t& e) { if (!called_edges.count(e)) { unused_edges.insert(e); } }); - - // Destroy all the extra edges (in case they use extra nodes) - for (auto* e : unused_edges) { + for (auto& e : unused_edges) { graph->destroy_edge(e); } - for (auto* n : unused_nodes) { - graph->destroy_node(n); + for (auto& n : unused_nodes) { + graph->destroy_handle(n); } } - - if (bluntify) { - graph->bluntify(); - } - + + // Some stuff below here needs a vg graph. + VG* vg_graph = dynamic_cast(graph.get()); + + // Call this to populate the vg_graph if it isn't populated. + auto ensure_vg = [&]() -> vg::VG* { + if (vg_graph == nullptr) { + // Copy instead. + vg_graph = new vg::VG(); + handlealgs::copy_path_handle_graph(graph.get(), vg_graph); + // Give the unique_ptr ownership and delete the graph we loaded. + graph.reset(vg_graph); + // Make sure the paths are all synced up + vg_graph->paths.to_graph(vg_graph->graph); + } + return vg_graph; + }; + if (!path_name.empty()) { - graph->keep_path(path_name); - } - - if (!paths_to_retain.empty() || retain_complement) { - graph->paths.keep_paths(paths_to_retain); - } - - if (drop_paths) { - graph->paths.clear(); - } - - if (remove_orphans) { - graph->remove_orphan_edges(); + // TODO: turn into an algorithm or reimplement + ensure_vg(); + vg_graph->keep_path(path_name); } if (unchop) { - graph->unchop(); + handlealgs::unchop(*graph); } if (simplify_graph) { - graph->simplify_siblings(); + // Run at up to twice to try and get both ends of nodes. + // This could be a loop until everything that can simplify does. 
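// [Editor's sketch, illustrative only -- not part of the patch] The convergence loop alluded to
// in the comment above would look like:
//     while (algorithms::simplify_siblings(graph.get())) {
//         // keep looping while the pass reports that it simplified something
//     }
// This assumes simplify_siblings() returns true whenever it changed the graph, which is what the
// short-circuiting && on the next line already relies on.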
+ algorithms::simplify_siblings(graph.get()) && algorithms::simplify_siblings(graph.get()); + } + + // check if a handle is contained within a path whose name has nomerge_prefix + function check_prefix = [&nomerge_prefix, &graph](const handle_t& handle) { + bool has_prefix = false; + graph->for_each_step_on_handle(handle, [&nomerge_prefix, &graph, &has_prefix](const step_handle_t& step_handle) { + string path_name = graph->get_path_name(graph->get_path_handle_of_step(step_handle)); + if (path_name.compare(0, nomerge_prefix.length(), nomerge_prefix) == 0) { + has_prefix = true; + } + return !has_prefix; + }); + return has_prefix; + }; + function can_merge = nullptr; + if (!nomerge_prefix.empty()) { + can_merge = [&nomerge_prefix, &graph, &check_prefix](const handle_t& h1, const handle_t& h2) { + return !check_prefix(h1) || !check_prefix(h2); + }; } - + if (normalize_graph) { - graph->normalize(); + algorithms::normalize(graph.get(), 1, false, can_merge); } if (until_normal_iter) { - graph->normalize(until_normal_iter); - } - - if (strong_connect) { - graph->keep_multinode_strongly_connected_components(); + // TODO: This doesn't work with vg::VG due to its paths needing re-syncing + assert(vg_graph == nullptr); + algorithms::normalize(graph.get(), until_normal_iter, true, can_merge); } if (remove_non_path) { - graph->remove_non_path(); + // TODO: turn into an algorithm + ensure_vg()->remove_non_path(); } if (remove_path) { - graph->remove_path(); + // TODO: turn into an algorithm + ensure_vg()->remove_path(); } if (orient_forward) { - algorithms::orient_nodes_forward(graph); + handlealgs::apply_orientations(graph.get(), handlealgs::topological_order(graph.get())); } if (flip_doubly_reversed_edges) { - graph->flip_doubly_reversed_edges(); + // TODO: turn into an algorithm + ensure_vg()->flip_doubly_reversed_edges(); } if (dagify_steps) { - unordered_map > node_translation; - *graph = graph->dagify(dagify_steps, node_translation, 0, dagify_component_length_max); + unordered_map > node_translation; + // TODO: turn into an algorithm + ensure_vg(); + *vg_graph = vg_graph->dagify(dagify_steps, node_translation, 0, dagify_component_length_max); } if (dagify_to) { - unordered_map > node_translation; + unordered_map > node_translation; // use the walk as our maximum number of steps; it's the worst case - *graph = graph->dagify(dagify_to, node_translation, dagify_to, dagify_component_length_max); + // TODO: turn into an algorithm + ensure_vg(); + *vg_graph = vg_graph->dagify(dagify_to, node_translation, dagify_to, dagify_component_length_max); } if (unfold_to) { - unordered_map > node_translation; - *graph = graph->unfold(unfold_to, node_translation); + unordered_map > node_translation; + // TODO: turn into an algorithm + ensure_vg(); + *vg_graph = vg_graph->unfold(unfold_to, node_translation); } if (remove_null) { - graph->remove_null_nodes_forwarding_edges(); - } - - if (sort_graph) { - algorithms::sort(graph); + // TODO: turn into an algorithm + ensure_vg()->remove_null_nodes_forwarding_edges(); } if (break_cycles) { - graph->break_cycles(); + // TODO: turn into an algorithm + ensure_vg()->break_cycles(); } // to subset the graph if (!root_nodes.empty()) { VG g; + // TODO: turn into an algorithm + ensure_vg(); for (auto root : root_nodes) { - graph->nonoverlapping_node_context_without_paths(graph->get_node(root), g); - graph->expand_context(g, max(context_steps, 1)); + vg_graph->nonoverlapping_node_context_without_paths(vg_graph->get_node(root), g); + vg_graph->expand_context(g, max(context_steps, 
1)); g.remove_orphan_edges(); } - *graph = g; - } - - if (!aln_file.empty()) { - // read in the alignments and save their paths, concatenating them in order where they have the same name - map paths_map; - function lambda = [&graph, &paths_map](Alignment& aln) { - Path path = simplify(aln.path()); - path.set_name(aln.name()); - auto f = paths_map.find(path.name()); - if (f != paths_map.end()) { - paths_map[path.name()] = concat_paths(f->second, path); - } else { - paths_map[path.name()] = path; - } - }; - if (aln_file == "-") { - stream::for_each(std::cin, lambda); - } else { - ifstream in; - in.open(aln_file.c_str()); - stream::for_each(in, lambda); - } - vector paths; - for (auto& p : paths_map) { - paths.push_back(p.second); - } - paths_map.clear(); - if (!label_paths) { - // execute the edits - auto translation = graph->edit(paths, true); - if (!translation_file.empty()) { - ofstream out(translation_file); - stream::write_buffered(out, translation, 0); - out.close(); - } - } else { - // just add the path labels to the graph - for (auto& path : paths) { - graph->paths.extend(path); - } - } - } - - if (!loci_file.empty()) { - // read in the alignments and save their paths - vector paths; - function lambda = [&graph, &paths, &called_genotypes_only](Locus& locus) { - // if we are only doing called genotypes, record so we can filter alleles - set alleles_in_genotype; - if (called_genotypes_only) { - for (int i = 0; i < locus.genotype_size(); ++i) { - for (int j = 0; j < locus.genotype(i).allele_size(); ++j) { - alleles_in_genotype.insert(locus.genotype(i).allele(j)); - } - } - } - for (int i = 0; i < locus.allele_size(); ++i) { - // skip alleles not in the genotype if using only called genotypes - if (!alleles_in_genotype.empty()) { - if (!alleles_in_genotype.count(i)) continue; - } - Path path = simplify(locus.allele(i)); - stringstream name; - name << locus.name() << ":" << i; - path.set_name(name.str()); - paths.push_back(path); - } - }; - if (loci_file == "-") { - stream::for_each(std::cin, lambda); - } else { - ifstream in; - in.open(loci_file.c_str()); - stream::for_each(in, lambda); - } - // execute the edits and produce the translation if requested. - // Make sure to break at node ends, but don't add any paths because they're just loci alleles and not real paths. - auto translation = graph->edit(paths, false, false, true); - if (!translation_file.empty()) { - ofstream out(translation_file); - stream::write_buffered(out, translation, 0); - out.close(); - } + *vg_graph = g; } // and optionally compact ids if (compact_ids) { - algorithms::sort(graph); - graph->compact_ids(); - } - - if (compact_ranks) { - graph->paths.compact_ranks(); + // Sort and compact IDs. + // TODO: This differs from vg ids! Make an alforithm. 
+ graph->apply_ordering(handlealgs::topological_order(graph.get()), true); } if (prune_complex) { if (!(path_length > 0 && edge_max > 0)) { - cerr << "[vg mod]: when pruning complex regions you must specify a --path-length and --edge-max" << endl; + cerr << "[vg mod]: when pruning complex regions you must specify a --length and --edge-max" << endl; return 1; } - graph->prune_complex_with_head_tail(path_length, edge_max); + algorithms::prune_complex_with_head_tail(*graph, path_length, edge_max); } if (max_degree) { @@ -823,44 +724,50 @@ int main_mod(int argc, char** argv) { } if (prune_subgraphs) { - graph->prune_short_subgraphs(path_length); + algorithms::prune_short_subgraphs(*graph, path_length); } if (chop_to) { - graph->dice_nodes(chop_to); - graph->paths.compact_ranks(); - } - - if (kill_labels) { - graph->for_each_node([](Node* n) { n->clear_sequence(); }); + MutablePathDeletableHandleGraph* chop_graph = graph.get(); + if (vg_graph != nullptr) { + chop_graph = vg_graph; + } + + handlealgs::chop(*chop_graph, chop_to); + + if (chop_graph == vg_graph) { + vg_graph->paths.compact_ranks(); + } } if (add_start_and_end_markers) { if (!(path_length > 0)) { - cerr << "[vg mod]: when adding start and end markers you must provide a --path-length" << endl; + cerr << "[vg mod]: when adding start and end markers you must provide a --length" << endl; return 1; } + // TODO: replace this with the SourceSinkOverlay, accounting somehow for its immutability. Node* head_node = NULL; Node* tail_node = NULL; vg::id_t head_id = 0, tail_id = 0; - graph->add_start_end_markers(path_length, '#', '$', head_node, tail_node, head_id, tail_id); + ensure_vg()->add_start_end_markers(path_length, '#', '$', head_node, tail_node, head_id, tail_id); } if (destroy_node_id > 0) { - graph->destroy_node(destroy_node_id); + graph->destroy_handle(graph->get_handle(destroy_node_id)); } if (cactus) { + // TODO: turn into an algorithm + ensure_vg(); // ensure we're sorted - algorithms::sort(graph); - *graph = cactusify(*graph); + vg_graph->sort(); + *vg_graph = cactusify(*vg_graph); // no paths survive, make sure they are erased - graph->paths = Paths(); + vg_graph->paths = Paths(); } - graph->serialize_to_ostream(std::cout); - - delete graph; + // Save the modified graph + vg::io::save_handle_graph(graph.get(), std::cout); return 0; } diff --git a/src/subcommand/mpmap_main.cpp b/src/subcommand/mpmap_main.cpp index 7da90291a9d..0833f80c840 100644 --- a/src/subcommand/mpmap_main.cpp +++ b/src/subcommand/mpmap_main.cpp @@ -1,15 +1,30 @@ - /** +/** * \file mpmap_main.cpp: multipath mapping of reads to a graph */ #include #include +#include #include +#include +#include +#include +#include #include "subcommand.hpp" +#include +#include "../algorithms/component.hpp" #include "../multipath_mapper.hpp" +#include "../mem_accelerator.hpp" +#include "../surjector.hpp" +#include "../multipath_alignment_emitter.hpp" #include "../path.hpp" +#include "../watchdog.hpp" +#include +#include +#include +#include //#define record_read_run_times @@ -19,150 +34,297 @@ #include #endif +#ifdef mpmap_instrument_mem_statistics +#define MEM_STATS_FILE "_mem_statistics.tsv" +#endif + using namespace std; using namespace vg; using namespace vg::subcommand; +pair, vector>> parse_intron_distr_file(ifstream& strm) { + + auto bail = [&]() { + cerr << "error:[vg mpmap] Could not parse intron length distribution file." 
<< endl; + exit(1); + }; + + string line; + getline(strm, line); + size_t parse_len; + int num_comps = stoi(line, &parse_len); + if (parse_len != line.size()) { + bail(); + } + + vector weights; + vector> params; + for (int i = 0; i < 3 * num_comps; ++i) { + + if (!strm) { + bail(); + } + line.clear(); + getline(strm, line); + + double param = stod(line, &parse_len); + if (parse_len != line.size()) { + bail(); + } + if (i < num_comps) { + weights.push_back(param); + } + else if ((i - num_comps) % 2 == 0) { + // have to switch the order relative to the script's output + params.emplace_back(0.0, param); + } + else { + params.back().first = param; + } + } + return make_pair(weights, params); +} + void help_mpmap(char** argv) { cerr - << "usage: " << argv[0] << " mpmap [options] -x index.xg -g index.gcsa [-f reads1.fq [-f reads2.fq] | -G reads.gam] > aln.gamp" << endl + << "usage: " << argv[0] << " mpmap [options] -x graph.xg -g index.gcsa [-f reads1.fq [-f reads2.fq] | -G reads.gam] > aln.gamp" << endl << "Multipath align reads to a graph." << endl << endl << "basic options:" << endl << "graph/index:" << endl - << " -x, --xg-name FILE use this xg index (required)" << endl - << " -g, --gcsa-name FILE use this GCSA2/LCP index pair (required; both FILE and FILE.lcp)" << endl - << " -H, --gbwt-name FILE use this GBWT haplotype index for population-based MAPQs" << endl - << " --linear-index FILE use this sublinear Li and Stephens index file for population-based MAPQs" << endl - << " --linear-path PATH use the given path name as the path that the linear index is against" << endl + << " -x, --graph-name FILE graph (required; XG format recommended but other formats are valid, see `vg convert`) " << endl + << " -g, --gcsa-name FILE use this GCSA2/LCP index pair for MEMs (required; both FILE and FILE.lcp, see `vg index`)" << endl + //<< " -H, --gbwt-name FILE use this GBWT haplotype index for population-based MAPQs" << endl + << " -d, --dist-name FILE use this snarl distance index for clustering (recommended, see `vg index`)" << endl + //<< " --linear-index FILE use this sublinear Li and Stephens index file for population-based MAPQs" << endl + //<< " --linear-path PATH use the given path name as the path that the linear index is against" << endl + << " -s, --snarls FILE align to alternate paths in these snarls (unnecessary if providing -d, see `vg snarls`)" << endl << "input:" << endl - << " -f, --fastq FILE input FASTQ (possibly compressed), can be given twice for paired ends (for stdin use -)" << endl - << " -G, --gam-input FILE input GAM (for stdin, use -)" << endl - << " -i, --interleaved FASTQ or GAM contains interleaved paired ends" << endl - << " -N, --sample NAME add this sample name to output GAMP" << endl - << " -R, --read-group NAME add this read group to output GAMP" << endl - << " -e, --same-strand read pairs are from the same strand of the DNA molecule" << endl - << "algorithm:" << endl - << " -S, --single-path-mode produce single-path alignments (GAM) instead of multipath alignments (GAMP) (ignores -sua)" << endl - << " -s, --snarls FILE align to alternate paths in these snarls" << endl - << "scoring:" << endl - << " -A, --no-qual-adjust do not perform base quality adjusted alignments (required if input does not have base qualities)" << endl - << " -E, --long-read-scoring set alignment scores to long-read defaults: -q1 -z1 -o1 -y1 -L0 (can be overridden)" << endl + << " -f, --fastq FILE input FASTQ (possibly gzipped), can be given twice for paired ends (for stdin use -)" << endl + << " 
-i, --interleaved input contains interleaved paired ends" << endl + << "algorithm presets:" << endl + << " -n, --nt-type TYPE sequence type preset: 'DNA' for genomic data, 'RNA' for transcriptomic data [RNA]" << endl + << " -l, --read-length TYPE read length preset: 'very-short', 'short', or 'long' (approx. <50bp, 50-500bp, and >500bp) [short]" << endl + << " -e, --error-rate TYPE error rate preset: 'low' or 'high' (approx. PHRED >20 and <20) [low]" << endl + << "output:" << endl + << " -F, --output-fmt TYPE format to output alignments in: 'GAMP for' multipath alignments, 'GAM' or 'GAF' for single-path" << endl + << " alignments, 'SAM', 'BAM', or 'CRAM' for linear reference alignments (may also require -S) [GAMP]" << endl + << " -S, --ref-paths FILE paths in the graph either 1) one per line in a text file, or 2) in an HTSlib .dict, to treat as" << endl + << " reference sequences for HTSlib formats (see -F) [all paths]" << endl + << " -N, --sample NAME add this sample name to output" << endl + << " -R, --read-group NAME add this read group to output" << endl + << " -p, --suppress-progress do not report progress to stderr" << endl + //<< "algorithm:" << endl + //<< " --min-dist-cluster use the minimum distance based clusterer (requires a distance index from -d)" << endl +// << "scoring:" << endl +// << " -E, --long-read-scoring set alignment scores to long-read defaults: -q1 -z1 -o1 -y1 -L0 (can be overridden)" << endl + << "computational parameters:" << endl + << " -t, --threads INT number of compute threads to use [all available]" << endl << endl << "advanced options:" << endl << "algorithm:" << endl - << " -X, --snarl-max-cut INT do not align to alternate paths in a snarl if an exact match is at least this long (0 for no limit) [5]" << endl - << " -a, --alt-paths INT align to (up to) this many alternate paths in between MEMs or in snarls [4]" << endl - << " -n, --unstranded use lazy strand consistency when clustering MEMs" << endl - << " -b, --frag-sample INT look for this many unambiguous mappings to estimate the fragment length distribution [1000]" << endl - << " -I, --frag-mean mean for fixed fragment length distribution" << endl - << " -D, --frag-stddev standard deviation for fixed fragment length distribution" << endl - << " -B, --no-calibrate do not auto-calibrate mismapping dectection" << endl - << " -P, --max-p-val FLOAT background model p value must be less than this to avoid mismapping detection [0.00001]" << endl - << " -v, --mq-method OPT mapping quality method: 0 - none, 1 - fast approximation, 2 - adaptive, 3 - exact [2]" << endl - << " -Q, --mq-max INT cap mapping quality estimates at this much [60]" << endl - << " -p, --padding-mult FLOAT pad dynamic programming bands in inter-MEM alignment FLOAT * sqrt(read length) [1.0]" << endl - << " -u, --map-attempts INT perform (up to) this many mappings per read (0 for no limit) [24 paired / 64 unpaired]" << endl - << " -O, --max-paths INT consider (up to) this many paths per alignment for population consistency scoring, 0 to disable [10]" << endl - << " -M, --max-multimaps INT report (up to) this many mappings per read [1]" << endl - << " -r, --reseed-length INT reseed SMEMs for internal MEMs if they are at least this long (0 for no reseeding) [28]" << endl - << " -W, --reseed-diff FLOAT require internal MEMs to have length within this much of the SMEM's length [0.45]" << endl - << " -k, --min-mem-length INT minimum MEM length to anchor multipath alignments [1]" << endl - << " -K, --clust-length INT minimum MEM length form 
clusters [automatic]" << endl - << " -c, --hit-max INT use at most this many hits for any MEM (0 for no limit) [1024]" << endl - << " -d, --max-dist-error INT maximum typical deviation between distance on a reference path and distance in graph [8]" << endl - << " -w, --approx-exp FLOAT let the approximate likelihood miscalculate likelihood ratios by this power [10.0]" << endl - << " --recombination-penalty FLOAT use this log recombination penalty for GBWT haplotype scoring [20.7]" << endl - << " --always-check-population always try o population-score reads, even if there is only a single mapping" << endl - << " -C, --drop-subgraph FLOAT drop alignment subgraphs whose MEMs cover this fraction less of the read than the best subgraph [0.2]" << endl - << " -U, --prune-exp FLOAT prune MEM anchors if their approximate likelihood is this root less than the optimal anchors [1.25]" << endl + //<< " -v, --tvs-clusterer use the target value search-based clusterer (requires a distance index from -d)" << endl + //<< " -a, --alt-paths INT align to (up to) this many alternate paths in snarls [10]" << endl + //<< " --suppress-tail-anchors don't produce extra anchors when aligning to alternate paths in snarls" << endl + //<< " -T, --same-strand read pairs are from the same strand of the DNA/RNA molecule" << endl + << " -X, --not-spliced do not form spliced alignments, even if aligning with --nt-type 'rna'" << endl + << " -M, --max-multimaps INT report (up to) this many mappings per read [10 rna / 1 dna]" << endl + << " -a, --agglomerate-alns combine separate multipath alignments into one (possibly disconnected) alignment" << endl + << " -r, --intron-distr FILE intron length distribution (from scripts/intron_length_distribution.py)" << endl + << " -Q, --mq-max INT cap mapping quality estimates at this much [60]" << endl + << " -b, --frag-sample INT look for this many unambiguous mappings to estimate the fragment length distribution [1000]" << endl + << " -I, --frag-mean FLOAT mean for a pre-determined fragment length distribution (also requires -D)" << endl + << " -D, --frag-stddev FLOAT standard deviation for a pre-determined fragment length distribution (also requires -I)" << endl + //<< " -B, --no-calibrate do not auto-calibrate mismapping dectection" << endl + << " -G, --gam-input FILE input GAM (for stdin, use -)" << endl + //<< " -P, --max-p-val FLOAT background model p-value must be less than this to avoid mismapping detection [0.0001]" << endl + //<< " -U, --report-group-mapq add an annotation for the collective mapping quality of all reported alignments" << endl + //<< " --padding-mult FLOAT pad dynamic programming bands in inter-MEM alignment FLOAT * sqrt(read length) [1.0]" << endl + << " -u, --map-attempts INT perform (up to) this many mappings per read (0 for no limit) [24 paired / 64 unpaired]" << endl + //<< " --max-paths INT consider (up to) this many paths per alignment for population consistency scoring, 0 to disable [10]" << endl + //<< " --top-tracebacks consider paths for each alignment based only on alignment score and not based on haplotypes" << endl + //<< " -r, --reseed-length INT reseed SMEMs for internal MEMs if they are at least this long (0 for no reseeding) [28]" << endl + //<< " -W, --reseed-diff FLOAT require internal MEMs to have length within this much of the SMEM's length [0.45]" << endl + //<< " -K, --clust-length INT minimum MEM length used in clustering [automatic]" << endl + //<< " -F, --stripped-match use stripped match algorithm instead of MEMs" << endl + << " -c, 
--hit-max INT use at most this many hits for any match seeds (0 for no limit) [1024 DNA / 100 RNA]" << endl + //<< " --approx-exp FLOAT let the approximate likelihood miscalculate likelihood ratios by this power [10.0 DNA / 5.0 RNA]" << endl + //<< " --recombination-penalty FLOAT use this log recombination penalty for GBWT haplotype scoring [20.7]" << endl + //<< " --always-check-population always try to population-score reads, even if there is only a single mapping" << endl + //<< " --delay-population do not apply population scoring at intermediate stages of the mapping algorithm" << endl + //<< " --force-haplotype-count INT assume that INT haplotypes ought to run through each fixed part of the graph, if nonzero [0]" << endl + //<< " -C, --drop-subgraph FLOAT drop alignment subgraphs whose MEMs cover this fraction less of the read than the best subgraph [0.2]" << endl + //<< " --prune-exp FLOAT prune MEM anchors if their approximate likelihood is this root less than the optimal anchors [1.25]" << endl << "scoring:" << endl - << " -q, --match INT use this match score [1]" << endl - << " -z, --mismatch INT use this mismatch penalty [4]" << endl - << " --score-matrix FILE read a 5x5 integer substitution scoring matrix from a file" << endl - << " -o, --gap-open INT use this gap open penalty [6]" << endl - << " -y, --gap-extend INT use this gap extension penalty [1]" << endl - << " -L, --full-l-bonus INT add this score to alignments that use the full length of the read [5]" << endl - << " -m, --remove-bonuses remove full length alignment bonuses in reported scores" << endl - << "computational parameters:" << endl - << " -t, --threads INT number of compute threads to use" << endl - << " -Z, --buffer-size INT buffer this many alignments together (per compute thread) before outputting to stdout [100]" << endl; - + << " -A, --no-qual-adjust do not perform base quality adjusted alignments even when base qualities are available" << endl + << " -q, --match INT use this match score [1]" << endl + << " -z, --mismatch INT use this mismatch penalty [4 low error, 1 high error]" << endl + << " -o, --gap-open INT use this gap open penalty [6 low error, 1 high error]" << endl + << " -y, --gap-extend INT use this gap extension penalty [1]" << endl + << " -L, --full-l-bonus INT add this score to alignments that align each end of the read [mismatch+1 short, 0 long]" << endl + << " -w, --score-matrix FILE read a 4x4 integer substitution scoring matrix from a file (in the order ACGT)" << endl + << " -m, --remove-bonuses remove full length alignment bonuses in reported scores" << endl; + //<< "computational parameters:" << endl + //<< " -Z, --buffer-size INT buffer this many alignments together (per compute thread) before outputting to stdout [200]" << endl; } -int main_mpmap(int argc, char** argv) { + +int main_mpmap(int argc, char** argv) { + if (argc == 2) { help_mpmap(argv); return 1; } // initialize parameters with their default options - #define OPT_SCORE_MATRIX 1000 + #define OPT_PRUNE_EXP 1000 #define OPT_RECOMBINATION_PENALTY 1001 #define OPT_ALWAYS_CHECK_POPULATION 1002 + #define OPT_FORCE_HAPLOTYPE_COUNT 1004 + #define OPT_SUPPRESS_TAIL_ANCHORS 1005 + #define OPT_TOP_TRACEBACKS 1006 + #define OPT_MIN_DIST_CLUSTER 1007 + #define OPT_APPROX_EXP 1008 + #define OPT_MAX_PATHS 1009 + #define OPT_GREEDY_MIN_DIST 1010 + #define OPT_COMPONENT_MIN_DIST 1011 + #define OPT_BAND_PADDING_MULTIPLIER 1012 + #define OPT_HARD_HIT_MAX_MULTIPLIER 1013 + #define OPT_MAX_RESCUE_ATTEMPTS 1014 + #define OPT_STRIP_LENGTH 1015 + 
#define OPT_STRIP_COUNT 1016 + #define OPT_SECONDARY_RESCUE_ATTEMPTS 1017 + #define OPT_SECONDARY_MAX_DIFF 1018 + #define OPT_NO_CLUSTER 1019 + #define OPT_NO_GREEDY_MEM_RESTARTS 1020 + #define OPT_GREEDY_MEM_RESTART_MAX_LCP 1021 + #define OPT_SHORT_MEM_FILTER_FACTOR 1022 + #define OPT_NO_OUTPUT 1023 + #define OPT_STRIPPED_MATCH 1024 + #define OPT_FAN_OUT_QUAL 1025 + #define OPT_MAX_FANS_OUT 1026 + #define OPT_FAN_OUT_DIFF 1027 + #define OPT_PATH_RESCUE_GRAPH 1028 + #define OPT_MAX_RESCUE_P_VALUE 1029 + #define OPT_ALT_PATHS 1030 + #define OPT_SUPPRESS_SUPPRESSION 1031 + #define OPT_SNARL_MAX_CUT 1032 + #define OPT_SPLICE_ODDS 1033 + #define OPT_REPORT_ALLELIC_MAPQ 1034 + #define OPT_RESEED_LENGTH 1035 + #define OPT_MAX_MOTIF_PAIRS 1036 + #define OPT_SUPPRESS_MISMAPPING_DETECTION 1037 string matrix_file_name; - string xg_name; + string graph_name; string gcsa_name; string gbwt_name; string sublinearLS_name; string sublinearLS_ref_path; string snarls_name; + string distance_index_name; string fastq_name_1; string fastq_name_2; string gam_file_name; + string ref_paths_name; + string intron_distr_name; int match_score = default_match; int mismatch_score = default_mismatch; int gap_open_score = default_gap_open; int gap_extension_score = default_gap_extension; int full_length_bonus = default_full_length_bonus; bool interleaved_input = false; - int snarl_cut_size = 5; + int default_snarl_cut_size = 5; + int snarl_cut_size = default_snarl_cut_size; + int max_branch_trim_length = 5; + bool synthesize_tail_anchors = false; int max_paired_end_map_attempts = 24; - int max_single_end_mappings_for_rescue = 64; int max_single_end_map_attempts = 64; + int max_single_end_map_attempts_very_short = 16; + int max_single_end_mappings_for_rescue = max_single_end_map_attempts; int max_rescue_attempts = 10; + double rescue_graph_std_devs = 6.0; + bool get_rescue_graph_from_paths = false; int population_max_paths = 10; + int population_paths_hard_cap = 1000; + bool top_tracebacks = false; // How many distinct single path alignments should we look for in a multipath, for MAPQ? // TODO: create an option. int localization_max_paths = 5; - int max_num_mappings = 1; - int buffer_size = 100; + int max_num_mappings = 0; + int default_dna_num_mappings = 1; + int default_rna_num_mappings = 10; int hit_max = 1024; + int hit_max_arg = numeric_limits::min(); + int hard_hit_max_muliplier = 3; int min_mem_length = 1; int min_clustering_mem_length = 0; + int min_clustering_mem_length_arg = numeric_limits::min(); + bool use_stripped_match_alg = false; + int default_strip_length = 10; + int stripped_match_alg_strip_length = default_strip_length; + int stripped_match_alg_max_length = 0; // no maximum yet + int default_strip_count = 10; + int stripped_match_alg_target_count = default_strip_count; + bool use_fanout_match_alg = false; + int max_fanout_base_quality = 20; + int max_fans_out = 3; + int fanout_pruning_diff = 3; + bool use_greedy_mem_restarts = true; + // TODO: it would be best if these parameters responded to the size of the graph... 
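Many of the tunables declared in this block come in pairs: a working value that the presets may adjust (`hit_max`, `reseed_length`, `secondary_rescue_attempts`, ...) and a `*_arg` twin initialized to a sentinel (`numeric_limits<...>::min()`, `lowest()`, or `max()`) that records whether the flag was passed explicitly; after the presets run, any non-sentinel `*_arg` overrides the preset value. The greedy-restart defaults continue right after this note. A toy, self-contained illustration of the pattern (the names here are illustrative, not the patch's):

```cpp
// Toy sketch of the "sentinel argument" pattern: a preset may adjust the
// working default, but an explicitly supplied value always overrides it.
#include <cstdlib>
#include <iostream>
#include <limits>

int main(int argc, char** argv) {
    int hit_max = 1024;                                  // working default
    int hit_max_arg = std::numeric_limits<int>::min();   // sentinel: "flag not given"

    if (argc > 1) {
        hit_max_arg = std::atoi(argv[1]);                // user passed a value
    }

    bool rna_preset = true;                              // pretend an RNA preset was chosen
    if (rna_preset) {
        hit_max = 100;                                   // preset lowers the default
    }
    if (hit_max_arg != std::numeric_limits<int>::min()) {
        hit_max = hit_max_arg;                           // explicit flag beats the preset
    }

    std::cout << "effective hit max: " << hit_max << std::endl;
    return 0;
}
```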
+ int greedy_restart_min_length = 30; + int greedy_restart_max_lcp = 25; + int greedy_restart_max_count = 2; + bool greedy_restart_assume_substitution = false; + bool filter_short_mems = false; + double short_mem_filter_factor = 0.45; int reseed_length = 28; + int reseed_length_arg = numeric_limits::min(); double reseed_diff = 0.45; + double reseed_diff_arg = numeric_limits::lowest(); double reseed_exp = 0.065; bool use_adaptive_reseed = true; double cluster_ratio = 0.2; + bool use_tvs_clusterer = false; + bool use_min_dist_clusterer = false; + bool greedy_min_dist = false; + bool component_min_dist = true; + bool no_clustering = false; bool qual_adjusted = true; bool strip_full_length_bonus = false; - MappingQualityMethod mapq_method = Adaptive; + MappingQualityMethod mapq_method = Exact; + bool report_group_mapq = false; + bool report_allelic_mapq = false; double band_padding_multiplier = 1.0; int max_dist_error = 12; - int num_alt_alns = 4; + int default_num_alt_alns = 16; + int num_alt_alns = default_num_alt_alns; + bool agglomerate_multipath_alns = false; double suboptimal_path_exponent = 1.25; double likelihood_approx_exp = 10.0; + double likelihood_approx_exp_arg = numeric_limits::lowest(); double recombination_penalty = 20.7; bool always_check_population = false; - bool single_path_alignment_mode = false; + size_t force_haplotype_count = 0; int max_mapq = 60; + double mapq_scaling_factor = 1.0; size_t frag_length_sample_size = 1000; double frag_length_robustness_fraction = 0.95; double frag_length_mean = NAN; double frag_length_stddev = NAN; bool same_strand = false; + bool suppress_mismapping_detection = false; bool auto_calibrate_mismapping_detection = true; - double max_mapping_p_value = 0.00001; - size_t num_calibration_simulations = 250; - size_t calibration_read_length = 150; - bool unstranded_clustering = false; + double max_mapping_p_value = 0.0001; + double max_rescue_p_value = 0.03; + size_t num_calibration_simulations = 100; + vector calibration_read_lengths{50, 100, 150, 250, 450}; size_t order_length_repeat_hit_max = 3000; size_t sub_mem_count_thinning = 4; - size_t sub_mem_thinning_burn_in = 16; + size_t sub_mem_thinning_burn_in_diff = 1; double secondary_rescue_score_diff = 0.8; size_t secondary_rescue_attempts = 4; + size_t secondary_rescue_attempts_arg = numeric_limits::max(); size_t rescue_only_min = numeric_limits::max(); // disabling this for now size_t rescue_only_anchor_max = 16; string sample_name = ""; @@ -174,15 +336,45 @@ int main_mpmap(int argc, char** argv) { int secondary_rescue_subopt_diff = 35; int min_median_mem_coverage_for_split = 0; bool suppress_cluster_merging = false; + bool suppress_suppression = false; + bool suppress_multicomponent_splitting = false; bool dynamic_max_alt_alns = true; bool simplify_topologies = true; - bool long_read_scoring = false; + int max_alignment_gap = 5000; + bool use_pessimistic_tail_alignment = false; + double pessimistic_gap_multiplier = 3.0; + bool restrained_graph_extraction = false; + bool do_spliced_alignment = false; + int max_softclip_overlap = 8; + int max_splice_overhang = 2 * max_softclip_overlap; + double no_splice_log_odds = 2.0; + double splice_rescue_graph_std_devs = 3.0; + bool override_spliced_alignment = false; + int max_motif_pairs = 200; + // the TruSeq adapters, which seem to be what mostly gets used for RNA-seq + // (this info is only used during spliced alignment, so that should be all + // that matters) + string read_1_adapter = "AGATCGGAAGAG"; + string read_2_adapter = "AGATCGGAAGAG"; int 
match_score_arg = std::numeric_limits::min(); int mismatch_score_arg = std::numeric_limits::min(); int gap_open_score_arg = std::numeric_limits::min(); int gap_extension_score_arg = std::numeric_limits::min(); int full_length_bonus_arg = std::numeric_limits::min(); + int reversing_walk_length = 1; + int min_splice_length = 20; + int mem_accelerator_length = 12; + bool no_output = false; + string out_format = "GAMP"; + + // default presets + string nt_type = "rna"; + string read_length = "short"; + string error_rate = "low"; + // logging and warning + bool suppress_progress = false; + int fragment_length_warning_factor = 25; int c; optind = 2; // force optind past command positional argument @@ -190,9 +382,10 @@ int main_mpmap(int argc, char** argv) { static struct option long_options[] = { {"help", no_argument, 0, 'h'}, - {"xg-name", required_argument, 0, 'x'}, + {"graph-name", required_argument, 0, 'x'}, {"gcsa-name", required_argument, 0, 'g'}, {"gbwt-name", required_argument, 0, 'H'}, + {"dist-name", required_argument, 0, 'd'}, {"linear-index", required_argument, 0, 1}, {"linear-path", required_argument, 0, 2}, {"fastq", required_argument, 0, 'f'}, @@ -200,50 +393,82 @@ int main_mpmap(int argc, char** argv) { {"sample", required_argument, 0, 'N'}, {"read-group", required_argument, 0, 'R'}, {"interleaved", no_argument, 0, 'i'}, - {"same-strand", no_argument, 0, 'e'}, - {"single-path-mode", no_argument, 0, 'S'}, + {"same-strand", no_argument, 0, 'T'}, + {"ref-paths", required_argument, 0, 'S'}, + {"output-fmt", required_argument, 0, 'F'}, {"snarls", required_argument, 0, 's'}, - {"snarl-max-cut", required_argument, 0, 'X'}, - {"alt-paths", required_argument, 0, 'a'}, - {"unstranded", no_argument, 0, 'n'}, + {"synth-tail-anchors", no_argument, 0, OPT_SUPPRESS_TAIL_ANCHORS}, + {"suppress-suppression", no_argument, 0, OPT_SUPPRESS_SUPPRESSION}, + {"tvs-clusterer", no_argument, 0, 'v'}, + {"snarl-max-cut", required_argument, 0, OPT_SNARL_MAX_CUT}, + {"alt-paths", required_argument, 0, OPT_ALT_PATHS}, {"frag-sample", required_argument, 0, 'b'}, {"frag-mean", required_argument, 0, 'I'}, {"frag-stddev", required_argument, 0, 'D'}, + {"max-rescues", required_argument, 0, OPT_MAX_RESCUE_ATTEMPTS}, + {"max-secondary-rescues", required_argument, 0, OPT_SECONDARY_RESCUE_ATTEMPTS}, + {"secondary-diff", required_argument, 0, OPT_SECONDARY_MAX_DIFF}, + {"path-rescue-graph", no_argument, 0, OPT_PATH_RESCUE_GRAPH}, {"no-calibrate", no_argument, 0, 'B'}, {"max-p-val", required_argument, 0, 'P'}, - {"mq-method", required_argument, 0, 'v'}, + {"max-rescue-p-val", required_argument, 0, OPT_MAX_RESCUE_P_VALUE}, {"mq-max", required_argument, 0, 'Q'}, - {"padding-mult", required_argument, 0, 'p'}, + {"agglomerate-alns", no_argument, 0, 'a'}, + {"report-group-mapq", no_argument, 0, 'U'}, + {"report-allelic-mapq", no_argument, 0, OPT_REPORT_ALLELIC_MAPQ}, + {"suppress-mismapping", no_argument, 0, OPT_SUPPRESS_MISMAPPING_DETECTION}, + {"padding-mult", required_argument, 0, OPT_BAND_PADDING_MULTIPLIER}, {"map-attempts", required_argument, 0, 'u'}, - {"max-paths", required_argument, 0, 'O'}, + {"max-paths", required_argument, 0, OPT_MAX_PATHS}, + {"top-tracebacks", no_argument, 0, OPT_TOP_TRACEBACKS}, {"max-multimaps", required_argument, 0, 'M'}, - {"reseed-length", required_argument, 0, 'r'}, + {"reseed-length", required_argument, 0, OPT_RESEED_LENGTH}, {"reseed-diff", required_argument, 0, 'W'}, - {"min-mem-length", required_argument, 0, 'k'}, {"clustlength", required_argument, 0, 'K'}, + {"stripped-match", 
no_argument, 0, OPT_STRIPPED_MATCH}, + {"strip-length", no_argument, 0, OPT_STRIP_LENGTH}, + {"strip-count", no_argument, 0, OPT_STRIP_COUNT}, + {"no-greedy-restart", no_argument, 0, OPT_NO_GREEDY_MEM_RESTARTS}, + {"greedy-max-lcp", required_argument, 0, OPT_GREEDY_MEM_RESTART_MAX_LCP}, + {"filter-factor", required_argument, 0, OPT_SHORT_MEM_FILTER_FACTOR}, + {"fan-out-qual", required_argument, 0, OPT_FAN_OUT_QUAL}, + {"max-fans-out", required_argument, 0, OPT_MAX_FANS_OUT}, + {"fan-out-diff", required_argument, 0, OPT_FAN_OUT_DIFF}, {"hit-max", required_argument, 0, 'c'}, - {"max-dist-error", required_argument, 0, 'd'}, - {"approx-exp", required_argument, 0, 'w'}, + {"hard-hit-mult", required_argument, 0, OPT_HARD_HIT_MAX_MULTIPLIER}, + {"approx-exp", required_argument, 0, OPT_APPROX_EXP}, {"recombination-penalty", required_argument, 0, OPT_RECOMBINATION_PENALTY}, {"always-check-population", no_argument, 0, OPT_ALWAYS_CHECK_POPULATION}, + {"force-haplotype-count", required_argument, 0, OPT_FORCE_HAPLOTYPE_COUNT}, + {"min-dist-cluster", no_argument, 0, OPT_MIN_DIST_CLUSTER}, + {"greedy-min-dist", no_argument, 0, OPT_GREEDY_MIN_DIST}, + {"component-min-dist", no_argument, 0, OPT_COMPONENT_MIN_DIST}, + {"no-cluster", no_argument, 0, OPT_NO_CLUSTER}, {"drop-subgraph", required_argument, 0, 'C'}, - {"prune-exp", required_argument, 0, 'U'}, + {"prune-exp", required_argument, 0, OPT_PRUNE_EXP}, {"long-read-scoring", no_argument, 0, 'E'}, + {"not-spliced", no_argument, 0, 'X'}, + {"splice-odds", required_argument, 0, OPT_SPLICE_ODDS}, + {"intron-distr", required_argument, 0, 'r'}, + {"max-motif-pairs", required_argument, 0, OPT_MAX_MOTIF_PAIRS}, + {"read-length", required_argument, 0, 'l'}, + {"nt-type", required_argument, 0, 'n'}, + {"error-rate", required_argument, 0, 'e'}, {"match", required_argument, 0, 'q'}, {"mismatch", required_argument, 0, 'z'}, - {"score-matrix", required_argument, 0, OPT_SCORE_MATRIX}, + {"score-matrix", required_argument, 0, 'w'}, {"gap-open", required_argument, 0, 'o'}, {"gap-extend", required_argument, 0, 'y'}, {"full-l-bonus", required_argument, 0, 'L'}, {"remove-bonuses", no_argument, 0, 'm'}, {"no-qual-adjust", no_argument, 0, 'A'}, {"threads", required_argument, 0, 't'}, - {"buffer-size", required_argument, 0, 'Z'}, + {"no-output", no_argument, 0, OPT_NO_OUTPUT}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hx:g:H:f:G:N:R:ieSs:u:O:a:nb:I:D:BP:v:Q:p:M:r:W:k:K:c:d:w:C:R:Eq:z:o:y:L:mAt:Z:", + c = getopt_long (argc, argv, "hx:g:H:d:f:G:N:R:iS:s:vXu:b:I:D:BP:Q:UpM:r:W:K:F:c:C:R:En:l:e:q:z:w:o:y:L:mAt:a", long_options, &option_index); @@ -254,9 +479,9 @@ int main_mpmap(int argc, char** argv) { switch (c) { case 'x': - xg_name = optarg; - if (xg_name.empty()) { - cerr << "error:[vg mpmap] Must provide XG file with -x." << endl; + graph_name = optarg; + if (graph_name.empty()) { + cerr << "error:[vg mpmap] Must provide Graph file with -x." 
<< endl; exit(1); } break; @@ -271,6 +496,37 @@ int main_mpmap(int argc, char** argv) { case 'H': gbwt_name = optarg; + if (gbwt_name.empty()) { + cerr << "error:[vg mpmap] Must provide GBWT index file with -H" << endl; + exit(1); + } + break; + + case 'd': + distance_index_name = optarg; + if (distance_index_name.empty()) { + cerr << "error:[vg mpmap] Must provide distance index file with -d" << endl; + exit(1); + } + if (!use_tvs_clusterer) { + use_min_dist_clusterer = true; + } + break; + + case OPT_MAX_RESCUE_ATTEMPTS: + max_rescue_attempts = parse(optarg); + break; + + case OPT_SECONDARY_RESCUE_ATTEMPTS: + secondary_rescue_attempts_arg = parse(optarg); + break; + + case OPT_SECONDARY_MAX_DIFF: + secondary_rescue_score_diff = parse(optarg); + break; + + case OPT_PATH_RESCUE_GRAPH: + get_rescue_graph_from_paths = true;; break; case 1: // --linear-index @@ -313,7 +569,7 @@ int main_mpmap(int argc, char** argv) { case 'N': sample_name = optarg; if (sample_name.empty()) { - cerr << "error:[vg mpmap] Must provide sample name file with -N." << endl; + cerr << "error:[vg mpmap] Must provide sample name with -N." << endl; exit(1); } break; @@ -330,12 +586,16 @@ int main_mpmap(int argc, char** argv) { interleaved_input = true; break; - case 'e': + case 'T': same_strand = true; break; + case 'p': + suppress_progress = true; + break; + case 'S': - single_path_alignment_mode = true; + ref_paths_name = optarg; break; case 's': @@ -346,16 +606,29 @@ int main_mpmap(int argc, char** argv) { } break; - case 'X': - snarl_cut_size = parse(optarg); + case OPT_SUPPRESS_TAIL_ANCHORS: + synthesize_tail_anchors = true; break; - case 'a': - num_alt_alns = parse(optarg); + case OPT_SUPPRESS_SUPPRESSION: + suppress_suppression = true; break; - case 'n': - unstranded_clustering = true; + case OPT_SUPPRESS_MISMAPPING_DETECTION: + suppress_mismapping_detection = true; + break; + + case 'v': + use_tvs_clusterer = true; + use_min_dist_clusterer = false; + break; + + case OPT_SNARL_MAX_CUT: + snarl_cut_size = parse(optarg); + break; + + case OPT_ALT_PATHS: + num_alt_alns = parse(optarg); break; case 'b': @@ -378,33 +651,27 @@ int main_mpmap(int argc, char** argv) { max_mapping_p_value = parse(optarg); break; - case 'v': - { - int mapq_arg = parse(optarg); - if (mapq_arg == 0) { - mapq_method = None; - } - else if (mapq_arg == 1) { - mapq_method = Approx; - } - else if (mapq_arg == 2) { - mapq_method = Adaptive; - } - else if (mapq_arg == 3) { - mapq_method = Exact; - } - else { - cerr << "error:[vg mpmap] Unrecognized mapping quality (-v) option: " << mapq_arg << ". Choose from {0, 1, 2, 3}." 
<< endl; - exit(1); - } - } + case OPT_MAX_RESCUE_P_VALUE: + max_rescue_p_value = parse(optarg); break; case 'Q': max_mapq = parse(optarg); break; - case 'p': + case 'a': + agglomerate_multipath_alns = true; + break; + + case 'U': + report_group_mapq = true; + break; + + case OPT_REPORT_ALLELIC_MAPQ: + report_allelic_mapq = true; + break; + + case OPT_BAND_PADDING_MULTIPLIER: band_padding_multiplier = parse(optarg); break; @@ -416,40 +683,81 @@ int main_mpmap(int argc, char** argv) { } break; - case 'O': + case OPT_MAX_PATHS: population_max_paths = parse(optarg); break; + case OPT_TOP_TRACEBACKS: + top_tracebacks = true; + break; + case 'M': max_num_mappings = parse(optarg); break; - case 'r': - reseed_length = parse(optarg); + case OPT_RESEED_LENGTH: + reseed_length_arg = parse(optarg); break; case 'W': - reseed_diff = parse(optarg); + reseed_diff_arg = parse(optarg); break; - case 'k': - min_mem_length = parse(optarg); + case 'K': + min_clustering_mem_length_arg = parse(optarg); break; - case 'K': - min_clustering_mem_length = parse(optarg); + case 'F': + out_format = optarg; + break; + + case OPT_STRIPPED_MATCH: + use_stripped_match_alg = true; + break; + + case OPT_STRIP_LENGTH: + stripped_match_alg_strip_length = parse(optarg); + break; + + case OPT_STRIP_COUNT: + stripped_match_alg_target_count = parse(optarg); + break; + + case OPT_NO_GREEDY_MEM_RESTARTS: + use_greedy_mem_restarts = false; + break; + + case OPT_SHORT_MEM_FILTER_FACTOR: + short_mem_filter_factor = parse(optarg); + filter_short_mems = true; + break; + + case OPT_GREEDY_MEM_RESTART_MAX_LCP: + greedy_restart_max_lcp = parse(optarg); + break; + + case OPT_MAX_FANS_OUT: + max_fans_out = parse(optarg); + break; + + case OPT_FAN_OUT_QUAL: + max_fanout_base_quality = parse(optarg); + break; + + case OPT_FAN_OUT_DIFF: + fanout_pruning_diff = parse(optarg); break; case 'c': - hit_max = parse(optarg); + hit_max_arg = parse(optarg); break; - case 'd': - max_dist_error = parse(optarg); + case OPT_HARD_HIT_MAX_MULTIPLIER: + hard_hit_max_muliplier = parse(optarg); break; - case 'w': - likelihood_approx_exp = parse(optarg); + case OPT_APPROX_EXP: + likelihood_approx_exp_arg = parse(optarg); break; case OPT_RECOMBINATION_PENALTY: @@ -460,16 +768,68 @@ int main_mpmap(int argc, char** argv) { always_check_population = true; break; + case OPT_FORCE_HAPLOTYPE_COUNT: + force_haplotype_count = parse(optarg); + break; + + case OPT_MIN_DIST_CLUSTER: + // This the default behavior + //use_min_dist_clusterer = true; + break; + + case OPT_GREEDY_MIN_DIST: + greedy_min_dist = true; + component_min_dist = false; + break; + + case OPT_COMPONENT_MIN_DIST: + // the default now + component_min_dist = true; + break; + + case OPT_NO_CLUSTER: + no_clustering = true; + break; + case 'C': cluster_ratio = parse(optarg); break; - case 'U': + case OPT_PRUNE_EXP: suboptimal_path_exponent = parse(optarg); break; case 'E': - long_read_scoring = true; + cerr << "warning:[vg mpmap] Long read scoring option (--long-read-scoring) is deprecated. Instead, use read length preset (--read-length)." 
<< endl; + read_length = "long"; + break; + + case 'X': + override_spliced_alignment = true; + break; + + case OPT_SPLICE_ODDS: + no_splice_log_odds = parse(optarg); + break; + + case OPT_MAX_MOTIF_PAIRS: + max_motif_pairs = parse(optarg); + break; + + case 'r': + intron_distr_name = optarg; + break; + + case 'l': + read_length = optarg; + break; + + case 'e': + error_rate = optarg; + break; + + case 'n': + nt_type = optarg; break; case 'q': @@ -480,7 +840,7 @@ int main_mpmap(int argc, char** argv) { mismatch_score_arg = parse(optarg); break; - case OPT_SCORE_MATRIX: + case 'w': matrix_file_name = optarg; if (matrix_file_name.empty()) { cerr << "error:[vg mpmap] Must provide matrix file with --matrix-file." << endl; @@ -519,8 +879,8 @@ int main_mpmap(int argc, char** argv) { } break; - case 'Z': - buffer_size = parse(optarg); + case OPT_NO_OUTPUT: + no_output = true; break; case 'h': @@ -532,6 +892,262 @@ int main_mpmap(int argc, char** argv) { } } + if (optind != argc) { + cerr << "error:[vg mpmap] Unused positional argument(s):"; + for (int i = optind; i < argc; ++i) { + cerr << " " << argv[i]; + } + cerr << endl; + exit(1); + } + + // normalize capitalization on preset options + if (read_length == "Long" || read_length == "LONG") { + read_length = "long"; + } + else if (read_length == "Very-Short" || read_length == "Very-short" || read_length == "VERY-SHORT") { + read_length = "very-short"; + } + else if (read_length == "Short" || read_length == "SHORT") { + read_length = "short"; + } + + if (nt_type == "RNA" || nt_type == "Rna") { + nt_type = "rna"; + } + else if (nt_type == "DNA" || nt_type == "Dna") { + nt_type = "dna"; + } + + if (error_rate == "Low" || error_rate == "LOW") { + error_rate = "low"; + } + else if (error_rate == "High" || error_rate == "HIGH") { + error_rate = "high"; + } + + if (out_format == "gamp" || out_format == "Gamp") { + out_format = "GAMP"; + } + else if (out_format == "gam" || out_format == "Gam") { + out_format = "GAM"; + } + else if (out_format == "gaf" || out_format == "Gaf") { + out_format = "GAF"; + } + else if (out_format == "sam" || out_format == "Sam") { + out_format = "SAM"; + } + else if (out_format == "bam" || out_format == "Bam") { + out_format = "BAM"; + } + else if (out_format == "cram" || out_format == "Cram") { + out_format = "CRAM"; + } + + bool hts_output = (out_format == "SAM" || out_format == "BAM" || out_format == "CRAM"); + bool transcriptomic = (nt_type == "rna"); + bool single_path_alignment_mode = (out_format != "GAMP"); + + // set baseline parameters according to presets + + if (error_rate == "high") { + // alignment scores that don't penalize gaps or mismatches as much + mismatch_score = 1; + gap_open_score = 1; + // do less DP on tails (having a presumption that long tails with no seeds + // will probably be soft-clipped) + use_pessimistic_tail_alignment = true; + // quality scores don't express errors well for these reads and they slow down dozeu + qual_adjusted = false; + // we generate many short MEMs that slow us down on high error reads, so we need + // to filter them down to stay performant + filter_short_mems = true; + } + + if (read_length == "long") { + // we don't care so much about soft-clips on long reads + full_length_bonus = 0; + // we don't want to extract huge graphs every time there's an error in the read + restrained_graph_extraction = true; + } + else if (read_length == "very-short") { + // clustering is unlikely to improve accuracy in very short data + no_clustering = true; // might this actually be important? 
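The preset handling a little above normalizes a fixed set of spellings ("RNA"/"Rna", "Long"/"LONG", "gamp"/"Gamp", ...) case by case before comparing them and choosing defaults such as the `no_clustering` setting here. An equivalent, more general alternative (not what the patch does) is to fold the whole string to lower case once:

```cpp
// Alternative sketch (not the patch's approach): lowercase a preset string so
// "RNA", "Rna", and "rna" all compare equal to the canonical spelling.
#include <algorithm>
#include <cctype>
#include <iostream>
#include <string>

std::string to_lower(std::string s) {
    std::transform(s.begin(), s.end(), s.begin(),
                   [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
    return s;
}

int main() {
    for (std::string nt_type : {"RNA", "Rna", "dna"}) {
        std::cout << nt_type << " -> "
                  << (to_lower(nt_type) == "rna" ? "rna preset" : "dna preset")
                  << std::endl;
    }
    return 0;
}
```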
+ // we don't want to throw away short matches a priori in very short data + min_clustering_mem_length = 1; + // we don't want to automatically distrust short mappings + suppress_mismapping_detection = true; + // we want to look for short MEMs even on small reads + reseed_length = 22; + reseed_diff = 0.8; + // but actually only use this other MEM algorithm if we have base qualities + use_fanout_match_alg = true; + + // removing too many bases of matches distorts the multipath alignment + // graph's pruning algorithms for very short reads + max_branch_trim_length = 1; + snarl_cut_size = 2; + suboptimal_path_exponent = 1.5; + } + else if (read_length != "short") { + // short is the default + cerr << "error:[vg mpmap] Cannot identify read length preset (-l): " << read_length << endl; + exit(1); + } + + if (nt_type == "rna") { + // RNA preset + if (distance_index_name.empty()) { + cerr << "warning:[vg mpmap] It is HIGHLY recommended to use a distance index (-d) for clustering on splice graphs. Both accuracy and speed will suffer without one." << endl; + } + + // we'll assume that there might be spliced alignments + do_spliced_alignment = true; + + // seed finding, cluster pruning, and rescue parameters tuned for a lower repeat content + secondary_rescue_attempts = 1; + max_single_end_mappings_for_rescue = 32; + hit_max = 100; + if (read_length != "very-short") { + reseed_diff = 0.6; + } + likelihood_approx_exp = 3.5; + mapq_scaling_factor = 0.5; + if (read_length == "very-short" && !suppress_suppression) { + // we'll allow multicomponent alignments so that the two sides of a shRNA + // can be one alignment + suppress_multicomponent_splitting = true; + } + if (max_num_mappings == 0) { + max_num_mappings = default_rna_num_mappings; + } + } + else if (nt_type == "dna") { + if (max_num_mappings == 0) { + max_num_mappings = default_dna_num_mappings; + } + } + else { + // DNA is the default + cerr << "error:[vg mpmap] Cannot identify sequencing type preset (-n): " << nt_type << endl; + exit(1); + } + + if (single_path_alignment_mode && read_length != "long") { + // we get better performance by splitting up clusters a bit more when we're forcing alignments to go to only one place + // long reads on the other hand display underclustering behavior with some parameter settings + min_median_mem_coverage_for_split = 2; + suppress_cluster_merging = true; + } + + if (read_length == "long" && !single_path_alignment_mode) { + // we sometimes need to synthesize anchors for the tails to get good multipath alignments on long reads + synthesize_tail_anchors = true; + } + + if (single_path_alignment_mode) { + // simplifying topologies is redundant work if we're just going to take the maximum weight path anyway + simplify_topologies = false; + } + + // TODO: i think it should be possible to trip the splice site variant realignment bug in the + // the spliced surject algorithm sometimes by having better multipath alignments, but i should + // revisit this at some point + if (single_path_alignment_mode && + (population_max_paths == 0 || (sublinearLS_name.empty() && gbwt_name.empty())) && + !(hts_output && transcriptomic)) { + // adjust parameters that produce irrelevant extra work single path mode + if (!snarls_name.empty()) { + cerr << "warning:[vg mpmap] Snarl file (-s) is ignored for single path alignment formats (-F) without multipath population scoring (--max-paths)." 
<< endl; + } + + if (snarl_cut_size != default_snarl_cut_size) { + cerr << "warning:[vg mpmap] Snarl cut limit (-X) is ignored for single path alignment formats (-F) without multipath population scoring (--max-paths)." << endl; + } + + if (num_alt_alns != default_num_alt_alns) { + cerr << "warning:[vg mpmap] Number of alternate alignments (-a) for ignored in single path alignment formats (-F) without multipath population scoring (--max-paths)." << endl; + } + + // don't cut inside snarls or load the snarl manager + snarl_cut_size = 0; + snarls_name = ""; + + // only get 1 traceback for an inter-MEM or tail alignment + dynamic_max_alt_alns = false; + num_alt_alns = 1; + } + + if (override_spliced_alignment) { + do_spliced_alignment = false; + } + + // set the overrides to preset-controlled parameters + if (hit_max_arg != numeric_limits::min()) { + hit_max = hit_max_arg; + } + if (reseed_length_arg != numeric_limits::min()) { + reseed_length = reseed_length_arg; + } + if (reseed_diff_arg != numeric_limits::lowest()) { + reseed_diff = reseed_diff_arg; + } + if (min_clustering_mem_length_arg != numeric_limits::min()) { + min_clustering_mem_length = min_clustering_mem_length_arg; + } + if (secondary_rescue_attempts_arg != numeric_limits::max()) { + secondary_rescue_attempts = secondary_rescue_attempts_arg; + } + if (likelihood_approx_exp_arg != numeric_limits::lowest()) { + likelihood_approx_exp = likelihood_approx_exp_arg; + } + if (match_score_arg != std::numeric_limits::min()) { + match_score = match_score_arg; + } + if (mismatch_score_arg != std::numeric_limits::min()) { + mismatch_score = mismatch_score_arg; + } + if (gap_open_score_arg != std::numeric_limits::min()) { + gap_open_score = gap_open_score_arg; + } + if (gap_extension_score_arg != std::numeric_limits::min()) { + gap_extension_score = gap_extension_score_arg; + } + if (full_length_bonus_arg != std::numeric_limits::min()) { + full_length_bonus = full_length_bonus_arg; + } + else if (read_length != "long") { + // TODO: not so elegant + // the full length bonus should override a mismatch unless we're in long read mode + full_length_bonus = min(mismatch_score + 1, std::numeric_limits::max()); + } + + // choose either the user supplied max or the default for paired/unpaired + int max_map_attempts = 0; + if (max_map_attempts_arg) { + max_map_attempts = max_map_attempts_arg; + max_single_end_mappings_for_rescue = max_map_attempts_arg; + } + else if (interleaved_input || !fastq_name_2.empty()) { + max_map_attempts = max_paired_end_map_attempts; + } + else if (read_length == "very-short") { + max_map_attempts = max_single_end_map_attempts_very_short; + } + else { + max_map_attempts = max_single_end_map_attempts; + } + + // hits that are much more frequent than the number of hits we sample are unlikely to produce high MAPQs, so + // we can usually ignore them + int hard_hit_max = hard_hit_max_muliplier * hit_max; + + // don't report secondaries if we're agglomerating + if (agglomerate_multipath_alns && max_num_mappings != 1) { + max_num_mappings = 1; + } + // check for valid parameters if (std::isnan(frag_length_mean) != std::isnan(frag_length_stddev)) { @@ -554,6 +1170,7 @@ int main_mpmap(int argc, char** argv) { exit(1); } + if (!fastq_name_1.empty() && !gam_file_name.empty()) { cerr << "error:[vg mpmap] Cannot designate both FASTQ input (-f) and GAM input (-G) in same run." 
<< endl; exit(1); @@ -565,7 +1182,12 @@ int main_mpmap(int argc, char** argv) { } if (!interleaved_input && fastq_name_2.empty() && same_strand) { - cerr << "warning:[vg mpmap] Ignoring same strand parameter (-d) because no paired end input provided." << endl; + cerr << "warning:[vg mpmap] Ignoring same strand parameter (-e) because no paired end input provided." << endl; + } + + if (!ref_paths_name.empty() && !hts_output) { + cerr << "warning:[vg mpmap] Reference path file (-S) is only used when output format (-F) is SAM, BAM, or CRAM." << endl; + ref_paths_name = ""; } if (num_alt_alns <= 0) { @@ -579,11 +1201,26 @@ int main_mpmap(int argc, char** argv) { } if (snarl_cut_size < 0) { - cerr << "error:[vg mpmap] Max snarl cut size (-U) set to " << snarl_cut_size << ", must set to a positive integer or 0 for no maximum." << endl; + cerr << "error:[vg mpmap] Max snarl cut size (-X) set to " << snarl_cut_size << ", must set to a positive integer or 0 for no maximum." << endl; + exit(1); + } + + if (max_mapping_p_value <= 0.0) { + cerr << "error:[vg mpmap] Max mapping p-value (-P) set to " << max_mapping_p_value << ", must set to a positive number." << endl; + exit(1); + } + + if (max_rescue_p_value <= 0.0) { + cerr << "error:[vg mpmap] Max mapping p-value (--max-rescue-p-val) set to " << max_rescue_p_value << ", must set to a positive number." << endl; exit(1); } - if (max_mapq <= 0 && mapq_method != None) { + if (mapq_method == None) { + cerr << "error:[vg mpmap] The mapping quality method 'None' is no longer supported." << endl; + exit(1); + } + + if (max_mapq <= 0) { cerr << "error:[vg mpmap] Maximum mapping quality (-Q) set to " << max_mapq << ", must set to a positive integer." << endl; exit(1); } @@ -599,14 +1236,14 @@ int main_mpmap(int argc, char** argv) { } if (population_max_paths < 0) { - cerr << "error:[vg mpmap] Maximum number of paths per alignment for population scoring (-O) set to " << population_max_paths << ", must set to a nonnegative integer." << endl; + cerr << "error:[vg mpmap] Maximum number of paths per alignment for population scoring (--max-paths) set to " << population_max_paths << ", must set to a nonnegative integer." << endl; exit(1); } if (population_max_paths != 10 && population_max_paths != 0 && gbwt_name.empty() && sublinearLS_name.empty()) { // Don't allow anything but the default or the "disabled" setting without an index. // TODO: This restriction makes neat auto-generation of command line options for different conditions hard. - cerr << "error:[vg mpmap] Maximum number of paths per alignment for population scoring (-O) is specified but population database (-H or --linear-index) was not provided." << endl; + cerr << "error:[vg mpmap] Maximum number of paths per alignment for population scoring (--max-paths) is specified but population database (-H or --linear-index) was not provided." << endl; exit(1); } @@ -615,6 +1252,10 @@ int main_mpmap(int argc, char** argv) { exit(1); } + if (force_haplotype_count != 0 && gbwt_name.empty() && sublinearLS_name.empty()) { + cerr << "warning:[vg mpmap] Cannot --force-haplotype-count if no population database (-H or --linear-index) is provided. Ignoring option." << endl; + } + if (!sublinearLS_name.empty() && !gbwt_name.empty()) { cerr << "error:[vg mpmap] GBWT index (-H) and linear haplotype index (--linear-index) both specified. Only one can be used." 
<< endl; exit(1); @@ -630,15 +1271,31 @@ int main_mpmap(int argc, char** argv) { exit(1); } - // choose either the user supplied max or the default for paired/unpaired - int max_map_attempts = max_map_attempts_arg ? max_map_attempts_arg : ((interleaved_input || !fastq_name_2.empty()) ? - max_paired_end_map_attempts : max_single_end_map_attempts); if (max_num_mappings > max_map_attempts && max_map_attempts != 0) { cerr << "warning:[vg mpmap] Reporting up to " << max_num_mappings << " mappings, but only computing up to " << max_map_attempts << " mappings." << endl; } - if (max_num_mappings <= 0) { - cerr << "error:[vg mpmap] Maximum number of mappings per read (-M) set to " << max_num_mappings << ", must set to a positive integer." << endl; + if (max_rescue_attempts < 0) { + cerr << "error:[vg mpmap] Maximum number of rescue attempts (--max-rescues) set to " << max_rescue_attempts << ", must set to a non-negative integer (0 for no rescue)." << endl; + exit(1); + } + + if (max_rescue_attempts > max_single_end_mappings_for_rescue) { + cerr << "warning:[vg mpmap] Maximum number of rescue attempts (--max-rescues) of " << max_rescue_attempts << " is greater than number of mapping attempts for rescue " << max_single_end_mappings_for_rescue << endl; + } + + if (secondary_rescue_attempts < 0) { + cerr << "error:[vg mpmap] Maximum number of rescue attempts for secondary mappings (--max-secondary-rescues) set to " << secondary_rescue_attempts << ", must set to a non-negative integer (0 for no rescue)." << endl; + exit(1); + } + + if (secondary_rescue_score_diff < 0.0) { + cerr << "error:[vg mpmap] Max score difference for candidates clusters for secondary rescue (--secondary-diff) set to " << secondary_rescue_score_diff << ", must set to a non-negative number." << endl; + exit(1); + } + + if (max_num_mappings <= 0) { + cerr << "error:[vg mpmap] Maximum number of mappings per read (-M) set to " << max_num_mappings << ", must set to a positive integer." << endl; exit(1); } @@ -657,18 +1314,62 @@ int main_mpmap(int argc, char** argv) { exit(1); } + if (hard_hit_max_muliplier < 0) { + cerr << "error:[vg mpmap] Hard MEM hit max multipler (--hard-hit-mult) set to " << hard_hit_max_muliplier << ", must set to a positive integer or 0 for no maximum." << endl; + exit(1); + } + if (hit_max < 0) { cerr << "error:[vg mpmap] MEM hit max (-c) set to " << hit_max << ", must set to a positive integer or 0 for no maximum." << endl; exit(1); } - if (max_dist_error < 0) { - cerr << "error:[vg mpmap] Maximum distance approximation error (-d) set to " << max_dist_error << ", must set to a nonnegative integer." << endl; + if (hard_hit_max < hit_max && hit_max && hard_hit_max) { + cerr << "warning:[vg mpmap] MEM hit query limit (-c) set to " << hit_max << ", which is higher than the threshold to ignore a MEM (" << hard_hit_max << ")." << endl; + } + + if (min_mem_length < 0) { + cerr << "error:[vg mpmap] Minimum MEM length set to " << min_mem_length << ", must set to a positive integer or 0 for no maximum." << endl; + exit(1); + } + + if (single_path_alignment_mode && agglomerate_multipath_alns) { + // this could probably be just a warning, but it will really mess up the MAPQs + cerr << "error:[vg mpmap] Disconnected alignments cannot be agglomerated (-a) for single path alignment formats (-F)." 
<< endl; + exit(1); + } + + if (stripped_match_alg_strip_length <= 0) { + cerr << "error:[vg mpmap] Match strip length (--strip-length) set to " << stripped_match_alg_strip_length << ", must set to a positive integer or 0 for no maximum." << endl; + exit(1); + } + + if (stripped_match_alg_max_length < 0) { + cerr << "error:[vg mpmap] Maximum seed match length set to " << stripped_match_alg_max_length << ", must set to a positive integer or 0 for no maximum." << endl; + exit(1); + } + + if (stripped_match_alg_target_count < 0) { + cerr << "error:[vg mpmap] Target seed count (--strip-count) set to " << stripped_match_alg_target_count << ", must set to a positive integer or 0 for no maximum." << endl; exit(1); } + if (stripped_match_alg_target_count != default_strip_count && !use_stripped_match_alg) { + cerr << "warning:[vg mpmap] Target stripped match count (--strip-count) set to " << stripped_match_alg_target_count << ", but stripped algorithm (--stripped-match) was not selected. Ignoring strip count." << endl; + } + + if (stripped_match_alg_strip_length != default_strip_length && !use_stripped_match_alg) { + cerr << "warning:[vg mpmap] Strip length (--strip-length) set to " << stripped_match_alg_strip_length << ", but stripped algorithm (--stripped-match) was not selected. Ignoring strip length." << endl; + } + + // people shouldn't really be setting these anyway, but there may be combinations of presets that do this +// if (use_fanout_match_alg && use_stripped_match_alg) { +// cerr << "error:[vg mpmap] Cannot perform both stripped and fan-out match algorithms." << endl; +// exit(1); +// } + if (likelihood_approx_exp < 1.0) { - cerr << "error:[vg mpmap] Likelihood approximation exponent (-w) set to " << likelihood_approx_exp << ", must set to at least 1.0." << endl; + cerr << "error:[vg mpmap] Likelihood approximation exponent (--approx-exp) set to " << likelihood_approx_exp << ", must set to at least 1.0." << endl; exit(1); } @@ -677,110 +1378,99 @@ int main_mpmap(int argc, char** argv) { exit(1); } - if (suboptimal_path_exponent < 1.0) { - cerr << "error:[vg mpmap] Suboptimal path likelihood root (-R) set to " << suboptimal_path_exponent << ", must set to at least 1.0." << endl; + if (use_tvs_clusterer && distance_index_name.empty()) { + cerr << "error:[vg mpmap] The Target Value Search clusterer (-v) requires a distance index (-d)." << endl; exit(1); } - if (min_mem_length <= 0) { - cerr << "error:[vg mpmap] Minimum MEM length (-k) set to " << min_mem_length << ", must set to a positive integer." << endl; + if (use_min_dist_clusterer && distance_index_name.empty()) { + cerr << "error:[vg mpmap] The minimum distance clusterer (--min-dist-cluster) requires a distance index (-d)." << endl; exit(1); } - if ((match_score_arg != std::numeric_limits::min() || mismatch_score_arg != std::numeric_limits::min()) && !matrix_file_name.empty()) { - cerr << "error:[vg mpmap] Cannot choose custom scoring matrix (--score-matrix) and custom match/mismatch score (-q/-z) simultaneously." << endl; + if (use_min_dist_clusterer && use_tvs_clusterer) { + cerr << "error:[vg mpmap] Cannot perform both minimum distance clustering (--min-dist-cluster) and target value clustering (-v)." 
<< endl; exit(1); } - if (long_read_scoring) { - // defaults for long read scoring - match_score = 1; - mismatch_score = 1; - gap_open_score = 1; - gap_extension_score = 1; - full_length_bonus = 0; + if (greedy_min_dist && !use_min_dist_clusterer) { + cerr << "warning:[vg mpmap] greedy minimum distance clustering (--greedy-min-dist) is ignored if not using minimum distance clustering (-d)" << endl; } - // if we indicated any other scores, apply those, possibly overriding - if (match_score_arg != std::numeric_limits::min()) { - match_score = match_score_arg; - } - if (mismatch_score_arg != std::numeric_limits::min()) { - mismatch_score = mismatch_score_arg; - } - if (gap_open_score_arg != std::numeric_limits::min()) { - gap_open_score = gap_open_score_arg; + if (greedy_min_dist && component_min_dist) { + cerr << "error:[vg mpmap] cannot simultaneously use greedy (--greedy-min-dist) and component (--component-min-dist) clustering" << endl; + exit(1); } - if (gap_extension_score_arg != std::numeric_limits::min()) { - gap_extension_score = gap_extension_score_arg; + + if (no_clustering && !distance_index_name.empty() && !snarls_name.empty()) { + cerr << "warning:[vg mpmap] No clustering option (--no-cluster) causes distance index (-d) to be ignored when snarls (-s) are provided. This option is activated by default for 'very-short' read lengths (-l)." << endl; } - if (full_length_bonus_arg != std::numeric_limits::min()) { - full_length_bonus = full_length_bonus_arg; + + if (suboptimal_path_exponent < 1.0) { + cerr << "error:[vg mpmap] Suboptimal path likelihood root (--prune-exp) set to " << suboptimal_path_exponent << ", must set to at least 1.0." << endl; + exit(1); } - if (match_score > std::numeric_limits::max() || mismatch_score > std::numeric_limits::max() - || gap_open_score > std::numeric_limits::max() || gap_extension_score > std::numeric_limits::max() - || full_length_bonus > std::numeric_limits::max() || match_score < 0 || mismatch_score < 0 - || gap_open_score < 0 || gap_extension_score < 0 || full_length_bonus < 0) { - cerr << "error:[vg mpmap] All alignment scoring parameters (-qzoyL) must be between 0 and " << (int) std::numeric_limits::max() << endl; + if (max_alignment_gap < 0) { + cerr << "error:[vg mpmap] Max alignment grap set to " << max_alignment_gap << ", must set to a non-negative integer." << endl; exit(1); } - if (buffer_size <= 0) { - cerr << "error:[vg mpmap] Buffer size (-Z) set to " << buffer_size << ", must set to a positive integer." << endl; + if (filter_short_mems && (short_mem_filter_factor < 0.0 || short_mem_filter_factor > 1.0)) { + cerr << "error:[vg mpmap] Short MEM filtraction factor (--filter-factor) set to " << short_mem_filter_factor << ", must set to a number between 0.0 and 1.0." << endl; exit(1); } - // adjust parameters that produce irrelevant extra work or bad behavior single path mode + if (no_splice_log_odds <= 0.0) { + cerr << "warning:[vg mpmap] Log odds against splicing (--splice-odds) set to " << no_splice_log_odds << ", non-positive values can lead to spurious identification of spliced alignments." << endl; + } - if (single_path_alignment_mode && population_max_paths == 0) { - // TODO: I don't like having these constants floating around in two different places, but it's not very risky, just a warning - if (!snarls_name.empty()) { - cerr << "warning:[vg mpmap] Snarl file (-s) is ignored in single path mode (-S) without multipath population scoring (-O)." << endl; - // TODO: Not true! 
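The flattened diff has dropped template arguments throughout (`std::numeric_limits::max()` and similar); in the scoring-parameter bounds check a few lines below, the `(int)` cast used when printing the maximum suggests the cap is that of a narrow integer score type, plausibly `int8_t`. A generic sketch of that kind of range check before narrowing, with the 8-bit type stated as an assumption:

```cpp
// Illustrative sketch only: reject values that would not fit an 8-bit score
// type before narrowing. int8_t is an assumption about the underlying type,
// not something stated in the patch.
#include <cstdint>
#include <iostream>
#include <limits>

bool fits_int8_score(int value) {
    return value >= 0 && value <= std::numeric_limits<int8_t>::max();
}

int main() {
    for (int score : {1, 6, 127, 128}) {
        std::cout << score << ": "
                  << (fits_int8_score(score) ? "ok" : "out of range") << std::endl;
    }
    return 0;
}
```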
- } - - if (snarl_cut_size != 5) { - cerr << "warning:[vg mpmap] Snarl cut limit (-u) is ignored in single path mode (-S) without multipath population scoring (-O)." << endl; - } - - if (num_alt_alns != 4) { - cerr << "warning:[vg mpmap] Number of alternate alignments (-a) is ignored in single path mode (-S) without multipath population scoring (-O)." << endl; - } - - num_alt_alns = 1; + if (max_motif_pairs < 0) { + cerr << "error:[vg mpmap] Maximum attempted splice motif pairs (--max-motif-pairs) set to " << max_motif_pairs << ", must set to a non-negative number." << endl; + exit(1); } - if (single_path_alignment_mode && !long_read_scoring) { - // we get better performance by splitting up clusters a bit more when we're forcing alignments to go to only one place - min_median_mem_coverage_for_split = 2; - suppress_cluster_merging = true; + if ((match_score_arg != std::numeric_limits::min() || mismatch_score_arg != std::numeric_limits::min()) && !matrix_file_name.empty()) { + cerr << "error:[vg mpmap] Cannot choose custom scoring matrix (-w) and custom match/mismatch score (-q/-z) simultaneously." << endl; + exit(1); } - if (single_path_alignment_mode) { - // simplifying topologies is redundant work if we're just going to take the maximum weight path anyway - simplify_topologies = false; + if (match_score > std::numeric_limits::max() || mismatch_score > std::numeric_limits::max() + || gap_open_score > std::numeric_limits::max() || gap_extension_score > std::numeric_limits::max() + || full_length_bonus > std::numeric_limits::max() || match_score < 0 || mismatch_score < 0 + || gap_open_score < 0 || gap_extension_score < 0 || full_length_bonus < 0) { + cerr << "error:[vg mpmap] All alignment scoring parameters (-qzoyL) must be between 0 and " << (int) std::numeric_limits::max() << endl; + exit(1); } // ensure required parameters are provided - if (xg_name.empty()) { - cerr << "error:[vg mpmap] Multipath mapping requires an XG index, must provide XG file (-x)" << endl; + if (graph_name.empty()) { + cerr << "error:[vg mpmap] Multipath mapping requires a graph (-x)" << endl; exit(1); } if (gcsa_name.empty()) { - cerr << "error:[vg mpmap] Multipath mapping requires a GCSA2 index, must provide GCSA2 file (-g)" << endl; + cerr << "error:[vg mpmap] Multipath mapping requires a GCSA2 index (-g)" << endl; exit(1); } + +#ifdef mpmap_instrument_mem_statistics + if (auto_calibrate_mismapping_detection) { + cerr << "error:[vg mpmap] set calibration off when profiling MEM statistics" << endl; + exit(1); + } +#endif + // create in-memory objects - ifstream xg_stream(xg_name); - if (!xg_stream) { - cerr << "error:[vg mpmap] Cannot open XG file " << xg_name << endl; + ifstream graph_stream(graph_name); + if (!graph_stream) { + cerr << "error:[vg mpmap] Cannot open graph file " << graph_name << endl; exit(1); } + graph_stream.close(); ifstream gcsa_stream(gcsa_name); if (!gcsa_stream) { @@ -797,70 +1487,417 @@ int main_mpmap(int argc, char** argv) { ifstream matrix_stream; if (!matrix_file_name.empty()) { - matrix_stream.open(matrix_file_name); - if (!matrix_stream) { - cerr << "error:[vg mpmap] Cannot open scoring matrix file " << matrix_file_name << endl; - exit(1); - } + matrix_stream.open(matrix_file_name); + if (!matrix_stream) { + cerr << "error:[vg mpmap] Cannot open scoring matrix file " << matrix_file_name << endl; + exit(1); + } } - + + ifstream intron_distr_stream; + if (!intron_distr_name.empty()) { + intron_distr_stream.open(intron_distr_name); + if (!intron_distr_stream) { + cerr << "error:[vg 
mpmap] Cannot open intron length distribution file " << intron_distr_name << endl; + exit(1); + } + } + + ifstream distance_index_stream; + if (!distance_index_name.empty() && !(no_clustering && !snarls_name.empty())) { + distance_index_stream.open(distance_index_name); + if (!distance_index_stream) { + cerr << "error:[vg mpmap] Cannot open distance index file " << distance_index_name << endl; + exit(1); + } + } + + ifstream snarl_stream; + if (!snarls_name.empty()) { + if (distance_index_name.empty() || no_clustering) { + snarl_stream.open(snarls_name); + if (!snarl_stream) { + cerr << "error:[vg mpmap] Cannot open Snarls file " << snarls_name << endl; + exit(1); + } + } + else { + cerr << "warning:[vg mpmap] Snarls file (-s) is unnecessary and will be ignored when the distance index (-d) is provided." << endl; + } + } + + ifstream gbwt_stream; + ifstream ls_stream; + if (!gbwt_name.empty()) { + gbwt_stream.open(gbwt_name); + if (!gbwt_stream) { + cerr << "error:[vg mpmap] Cannot open GBWT file " << gbwt_name << endl; + exit(1); + } + } + else if (!sublinearLS_name.empty()) { + // We want to use sublinear Li and Stephens as our haplotype scoring approach + ls_stream.open(sublinearLS_name); + if (!ls_stream) { + cerr << "error:[vg mpmap] Cannot open sublinear Li & Stephens file " << sublinearLS_name << endl; + exit(1); + } + } + + // check to make sure we can open the reads + for (string reads_name : {fastq_name_1, fastq_name_2, gam_file_name}) { + if (!reads_name.empty() && reads_name != "-") { + ifstream test_read_stream(reads_name); + if (!test_read_stream) { + cerr << "error:[vg mpmap] Cannot open reads file " << reads_name << endl; + exit(1); + } + } + } + + // Count our threads + int thread_count = vg::get_thread_count(); + + // a convenience function to preface a stderr log with an indicator of the command + // and the time elapsed + bool clock_init = false; + time_t time_start; + mutex progress_mutex; + auto log_progress = [&](const string progress) { + if (!suppress_progress) { + progress_mutex.lock(); + stringstream strm; + strm << fixed; + strm.precision(0); + if (!clock_init) { + time(&time_start); + strm << 0.0 << " s"; + clock_init = true; + } + else { + time_t time_now; + time(&time_now); + double secs = (double) difftime(time_now, time_start); + if (secs <= 60.0) { + strm << secs << " s"; + } + else { + strm.precision(1); + double mins = secs / 60.0; + if (mins <= 60.0) { + strm << mins << " m"; + } + else { + double hrs = mins / 60.0; + if (hrs <= 24.0) { + strm << hrs << " h"; + } + else { + strm << (hrs / 24.0) << " d"; + } + } + } + } + cerr << "[vg mpmap] elapsed time " << strm.str() << ": " << progress << endl; + progress_mutex.unlock(); + } + }; + + { + stringstream strm; + strm << "Executing command:"; + for (size_t i = 0; i < argc; ++i) { + strm << " " << argv[i]; + } + log_progress(strm.str()); + } + + vector intron_mixture_weights; + vector> intron_component_params; + if (!intron_distr_name.empty()) { + tie(intron_mixture_weights, intron_component_params) = parse_intron_distr_file(intron_distr_stream); + } + // Configure GCSA2 verbosity so it doesn't spit out loads of extra info gcsa::Verbosity::set(gcsa::Verbosity::SILENT); - // Configure its temp directory to the system temp directory - gcsa::TempFile::setDirectory(temp_file::get_dir()); + // Load required indexes + log_progress("Loading graph from " + graph_name); + unique_ptr path_handle_graph = vg::io::VPKG::load_one(graph_name); + log_progress("Completed loading graph"); + + if (!suppress_progress) { + //
let's be a friendly guide to selecting a graph + + // get the graphs magic number if it has one + uint32_t magic_num = 0; + { + SerializableHandleGraph* serializable = dynamic_cast(path_handle_graph.get()); + if (serializable) { + magic_num = serializable->get_magic_number(); + } + } + + // compare to known magic numbers + string type; + if (magic_num == xg::XG().get_magic_number()) { + type = "XG"; + } + else if (magic_num == bdsg::PackedGraph().get_magic_number()) { + type = "PackedGraph"; + } + else if (magic_num == bdsg::HashGraph().get_magic_number()) { + type = "HashGraph"; + } + + stringstream strm; + if (!type.empty()) { + // we found the type, give an appropriate message about it + strm << "Graph is in " + type + " format. "; + + if (type == "XG") { + strm << "XG is a good graph format for most mapping use cases. PackedGraph may be selected if memory usage is too high. "; + } + else if (type == "HashGraph") { + strm << "HashGraph can have high memory usage. "; + } + else if (type == "PackedGraph") { + strm << "PackedGraph is memory efficient, but has some slow queries. "; + } + } + else { + // probably a VG graph + strm << "Graph is not in XG format. "; + } + + // are they using a graph combo that I don't recommend? + if (type != "XG" && (!use_min_dist_clusterer || type == "HashGraph" || type.empty())) { + // min dist clustering alleviates the issues with slow path queries because we don't need to do + // so many, but I want to dissuade people from using HashGraph and VG for mapping regardless + strm << "XG format is recommended for most mapping tasks. "; + } + + strm << "See `vg convert` if you want to change graph formats."; + log_progress(strm.str()); + } + + if (path_handle_graph->get_path_count() == 0 && distance_index_name.empty()) { + cerr << "warning:[vg mpmap] Using a distance index (-d) for clustering is highly recommended for graphs that lack embedded paths. Speed and accuracy are likely to suffer severely without one." << endl; + } + else if (path_handle_graph->get_path_count() == 0 + && get_rescue_graph_from_paths + && (interleaved_input || !fastq_name_2.empty())) { + cerr << "warning:[vg mpmap] Identifying rescue subgraphs using embedded paths (--path-rescue-graph) is impossible on graphs that lack embedded paths. Pair rescue will not be used on this graph, potentially hurting accuracy." << endl; + } + + bdsg::ReferencePathOverlayHelper overlay_helper; + PathPositionHandleGraph* path_position_handle_graph = overlay_helper.apply(path_handle_graph.get()); + + // identify these before loading later data structures to reduce peak memory use + // (the bit vector used to detect which nodes have been visited is not exactly small) + unordered_set ref_path_handles; + if (do_spliced_alignment) { + // TODO: could let IO continue while doing this, but it risks increasing peak memory for some graphs... 
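The graph-format guidance above keys off each implementation's serialization magic number. As an illustrative aside (not part of the patch; header paths are approximate, but the xg/bdsg types are the same ones the code compares against), the probe could be factored into a small helper:

    #include <cstdint>
    #include <string>
    #include <handlegraph/handle_graph.hpp>
    #include <handlegraph/serializable_handle_graph.hpp>
    #include <bdsg/packed_graph.hpp>
    #include <bdsg/hash_graph.hpp>
    #include <xg.hpp>

    // Guess the serialized format of a loaded graph from its magic number.
    // Returns "" for formats without one (e.g. old Protobuf VG graphs).
    std::string guess_graph_format(const handlegraph::HandleGraph* graph) {
        uint32_t magic_num = 0;
        if (auto* serializable = dynamic_cast<const handlegraph::SerializableHandleGraph*>(graph)) {
            magic_num = serializable->get_magic_number();
        }
        if (magic_num == xg::XG().get_magic_number()) return "XG";
        if (magic_num == bdsg::PackedGraph().get_magic_number()) return "PackedGraph";
        if (magic_num == bdsg::HashGraph().get_magic_number()) return "HashGraph";
        return "";
    }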
+ log_progress("Identifying reference paths"); + vector> component_path_sets = vg::algorithms::component_paths_parallel(*path_position_handle_graph); + for (const auto& path_set : component_path_sets) { + // remove dependency on system hash ordering + vector ordered_path_set(path_set.begin(), path_set.end()); + std::sort(ordered_path_set.begin(), ordered_path_set.end()); + + int64_t max_length = 0; + path_handle_t max_handle; + for (path_handle_t path_handle : ordered_path_set) { + int64_t length = path_position_handle_graph->get_path_length(path_handle); + if (length >= max_length) { + max_length = length; + max_handle = path_handle; + } + } + ref_path_handles.insert(max_handle); + } + } + + // start at 1 for the main thread + atomic threads_active(1); + list background_processes; + + // for the indexes whose loading involves non-trivial computation, do them in the + // background to maximize IO + + unique_ptr snarl_manager; + if (!snarls_name.empty() && (distance_index_name.empty() || no_clustering)) { + // try to add an active thread + int curr_thread_active = threads_active++; + if (curr_thread_active >= thread_count) { + // take back the increment and don't let it go multithreaded + --threads_active; + log_progress("Loading snarls from " + snarls_name); + snarl_manager = vg::io::VPKG::load_one(snarl_stream); + log_progress("Completed loading snarls"); + } + else { + // do the process in a background thread + background_processes.emplace_back([&]() { + log_progress("Loading snarls from " + snarls_name + " (in background)"); + snarl_manager = vg::io::VPKG::load_one(snarl_stream); + --threads_active; + log_progress("Completed loading snarls"); + }); + } + } + + unique_ptr distance_index; + if (!distance_index_name.empty() && !(no_clustering && !snarls_name.empty())) { + // try to add an active thread + int curr_thread_active = threads_active++; + if (curr_thread_active >= thread_count) { + // take back the increment and don't let it go multithreaded + --threads_active; + log_progress("Loading distance index from " + distance_index_name); + distance_index = vg::io::VPKG::load_one(distance_index_stream); + log_progress("Completed loading distance index"); + } + else { + // do the process in a background thread + background_processes.emplace_back([&]() { + log_progress("Loading distance index from " + distance_index_name + " (in background)"); + distance_index = vg::io::VPKG::load_one(distance_index_stream); + --threads_active; + log_progress("Completed loading distance index"); + }); + } + } + + // compute this once in case the backing graph doesn't have an efficient implementation + size_t total_seq_length = path_position_handle_graph->get_total_length(); + + log_progress("Loading GCSA2 from " + gcsa_name); + unique_ptr gcsa_index = vg::io::VPKG::load_one(gcsa_stream); + log_progress("Completed loading GCSA2"); + + unique_ptr mem_accelerator; + unique_ptr lcp_array; + if (!use_stripped_match_alg) { + // don't make a huge table for a small graph + mem_accelerator_length = min(mem_accelerator_length, round(log(total_seq_length) / log(4.0))); + // try to add an active thread + int curr_thread_active = threads_active++; + if (curr_thread_active >= thread_count) { + // take back the increment and don't let it go multithreaded + --threads_active; + log_progress("Memoizing GCSA2 queries"); + mem_accelerator = unique_ptr(new MEMAccelerator(*gcsa_index, mem_accelerator_length)); + log_progress("Completed memoizing GCSA2 queries"); + } + else { + // do the process in a background thread + 
background_processes.emplace_back([&]() { + log_progress("Memoizing GCSA2 queries (in background)"); + mem_accelerator = unique_ptr(new MEMAccelerator(*gcsa_index, mem_accelerator_length)); + --threads_active; + log_progress("Completed memoizing GCSA2 queries"); + }); + } + + // The stripped algorithm doesn't use the LCP, but we aren't doing it + log_progress("Loading LCP from " + lcp_name); + lcp_array = vg::io::VPKG::load_one(lcp_stream); + log_progress("Completed loading LCP"); + } - xg::XG xg_index(xg_stream); - gcsa::GCSA gcsa_index; - gcsa_index.load(gcsa_stream); - gcsa::LCPArray lcp_array; - lcp_array.load(lcp_stream); + // Load optional indexes - gbwt::GBWT* gbwt = nullptr; + unique_ptr gbwt; haplo::linear_haplo_structure* sublinearLS = nullptr; haplo::ScoreProvider* haplo_score_provider = nullptr; if (!gbwt_name.empty()) { - ifstream gbwt_stream(gbwt_name); - if (!gbwt_stream) { - cerr << "error:[vg mpmap] Cannot open GBWT file " << gbwt_name << endl; + log_progress("Loading GBWT from " + gbwt_name); + // Load the GBWT from its container + gbwt = vg::io::VPKG::load_one(gbwt_stream); + log_progress("Completed loading GBWT"); + + if (gbwt.get() == nullptr) { + // Complain if we couldn't. + cerr << "error:[vg mpmap] unable to load gbwt index file" << endl; exit(1); } - gbwt = new gbwt::GBWT(); - gbwt->load(gbwt_stream); - + // We have the GBWT available for scoring haplotypes haplo_score_provider = new haplo::GBWTScoreProvider(*gbwt); - } else if (!sublinearLS_name.empty()) { - // We want to use sublinear Li and Stephens as our haplotype scoring approach - ifstream ls_stream(sublinearLS_name); + } + else if (!sublinearLS_name.empty()) { + log_progress("Loading LS index from " + sublinearLS_name); // TODO: we only support a single ref contig, and we use these // hardcoded mutation and recombination likelihoods - // What is the rank of our one and only reference path - auto xg_ref_rank = xg_index.path_rank(sublinearLS_ref_path); - - sublinearLS = new linear_haplo_structure(ls_stream, -9 * 2.3, -6 * 2.3, xg_index, xg_ref_rank); + sublinearLS = new linear_haplo_structure(ls_stream, -9 * 2.3, -6 * 2.3, *path_position_handle_graph, + path_position_handle_graph->get_path_handle(sublinearLS_ref_path)); haplo_score_provider = new haplo::LinearScoreProvider(*sublinearLS); + log_progress("Completed loading LS index"); + } + + // Load structures that we need for HTS lib outputs + unordered_set surjection_paths; + vector> path_names_and_length; + unique_ptr surjector(nullptr); + if (hts_output) { + // init the data structures + surjector = unique_ptr(new Surjector(path_position_handle_graph)); + surjector->min_splice_length = transcriptomic ? min_splice_length : numeric_limits::max(); + surjector->adjust_alignments_for_base_quality = qual_adjusted; + if (transcriptomic) { + // FIXME: replicating the behavior in surject_main + surjector->max_subgraph_bases = 16 * 1024 * 1024; + } + + if (!ref_paths_name.empty()) { + log_progress("Choosing reference paths from " + ref_paths_name); + } else { + log_progress("No reference path file given. Interpreting all non-alt-allele paths in graph as reference sequences."); + } + + // Load all the paths in the right order + vector> paths = get_sequence_dictionary(ref_paths_name, {}, *path_position_handle_graph); + // Make them into a set for directing surjection. 
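The snarl, distance index, and MEM accelerator loads above all use the same trick: claim a slot from the atomic threads_active counter and only spawn a background thread when a slot is free, otherwise fall back to loading inline. A minimal standalone sketch of that pattern (names are mine, not from the patch):

    #include <atomic>
    #include <functional>
    #include <list>
    #include <thread>

    // Run 'work' on a background thread if a slot is free, otherwise inline.
    // The caller must join everything in 'background' before using the results,
    // which is what the barrier-sync loop over background_processes does later.
    void run_maybe_background(std::atomic<int>& threads_active, int thread_count,
                              std::list<std::thread>& background,
                              std::function<void()> work) {
        if (threads_active++ >= thread_count) {
            --threads_active;  // no free slot: give the claim back and run synchronously
            work();
        }
        else {
            background.emplace_back([&threads_active, work]() {
                work();
                --threads_active;  // release the slot when the background job finishes
            });
        }
    }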
+ for (const auto& path_info : paths) { + surjection_paths.insert(get<0>(path_info)); + } + // Copy out the metadata for making the emitter later + path_names_and_length = extract_path_metadata(paths, *path_position_handle_graph).first; } - // TODO: Allow using haplo::XGScoreProvider? + // barrier sync the background threads + for (auto& process : background_processes) { + process.join(); + } + background_processes.clear(); - SnarlManager* snarl_manager = nullptr; - if (!snarls_name.empty()) { - ifstream snarl_stream(snarls_name); - if (!snarl_stream) { - cerr << "error:[vg mpmap] Cannot open Snarls file " << snarls_name << endl; - exit(1); - } - snarl_manager = new SnarlManager(snarl_stream); + // this also takes a while inside the MultipathMapper constructor, but it will only activate if we don't + // have a distance index available for oriented distance calculations + if (distance_index_name.empty() && path_handle_graph->get_path_count() > 0) { + log_progress("Labeling embedded paths by their connected component"); + } + + MultipathMapper multipath_mapper(path_position_handle_graph, gcsa_index.get(), lcp_array.get(), haplo_score_provider, + snarl_manager.get(), distance_index.get()); + // give it the MEMAccelerator + if (mem_accelerator.get() != nullptr) { + multipath_mapper.accelerator = mem_accelerator.get(); } - - MultipathMapper multipath_mapper(&xg_index, &gcsa_index, &lcp_array, haplo_score_provider, snarl_manager); // set alignment parameters - multipath_mapper.set_alignment_scores(match_score, mismatch_score, gap_open_score, gap_extension_score, full_length_bonus); - if(matrix_stream.is_open()) multipath_mapper.load_scoring_matrix(matrix_stream); + if (matrix_stream.is_open()) { + multipath_mapper.set_alignment_scores(matrix_stream, gap_open_score, gap_extension_score, full_length_bonus); + } + else if (match_score != default_match + || mismatch_score != default_mismatch + || gap_open_score != default_gap_open + || gap_extension_score != default_gap_extension + || full_length_bonus != default_full_length_bonus) { + multipath_mapper.set_alignment_scores(match_score, mismatch_score, gap_open_score, gap_extension_score, full_length_bonus); + } multipath_mapper.adjust_alignments_for_base_quality = qual_adjusted; multipath_mapper.strip_bonuses = strip_full_length_bonus; multipath_mapper.band_padding_multiplier = band_padding_multiplier; @@ -868,13 +1905,29 @@ int main_mpmap(int argc, char** argv) { // set mem finding parameters multipath_mapper.hit_max = hit_max; + multipath_mapper.hard_hit_max = hard_hit_max; multipath_mapper.mem_reseed_length = reseed_length; multipath_mapper.fast_reseed = true; multipath_mapper.fast_reseed_length_diff = reseed_diff; multipath_mapper.sub_mem_count_thinning = sub_mem_count_thinning; - multipath_mapper.sub_mem_thinning_burn_in = sub_mem_thinning_burn_in; + multipath_mapper.sub_mem_thinning_burn_in = int(ceil(log(total_seq_length) / log(4.0))) + sub_mem_thinning_burn_in_diff; multipath_mapper.order_length_repeat_hit_max = order_length_repeat_hit_max; multipath_mapper.min_mem_length = min_mem_length; + multipath_mapper.stripped_match_alg_strip_length = stripped_match_alg_strip_length; + multipath_mapper.stripped_match_alg_max_length = stripped_match_alg_max_length; + multipath_mapper.stripped_match_alg_target_count = stripped_match_alg_target_count; + multipath_mapper.use_greedy_mem_restarts = use_greedy_mem_restarts; + multipath_mapper.greedy_restart_min_length = greedy_restart_min_length; + multipath_mapper.greedy_restart_max_count = 
greedy_restart_max_count; + multipath_mapper.greedy_restart_max_lcp = greedy_restart_max_lcp; + multipath_mapper.greedy_restart_assume_substitution = greedy_restart_assume_substitution; + multipath_mapper.use_stripped_match_alg = use_stripped_match_alg; + multipath_mapper.filter_short_mems = filter_short_mems; + multipath_mapper.short_mem_filter_factor = short_mem_filter_factor; + multipath_mapper.use_fanout_match_alg = use_fanout_match_alg; + multipath_mapper.max_fanout_base_quality = max_fanout_base_quality; + multipath_mapper.max_fans_out = max_fans_out; + multipath_mapper.fanout_length_threshold = int(ceil(log(total_seq_length) / log(4.0))) + fanout_pruning_diff; multipath_mapper.adaptive_reseed_diff = use_adaptive_reseed; multipath_mapper.adaptive_diff_exponent = reseed_exp; multipath_mapper.use_approx_sub_mem_count = false; @@ -882,6 +1935,8 @@ int main_mpmap(int argc, char** argv) { multipath_mapper.precollapse_order_length_hits = precollapse_order_length_hits; multipath_mapper.max_sub_mem_recursion_depth = max_sub_mem_recursion_depth; multipath_mapper.max_mapping_p_value = max_mapping_p_value; + multipath_mapper.max_rescue_p_value = max_rescue_p_value; + multipath_mapper.suppress_mismapping_detection = suppress_mismapping_detection; if (min_clustering_mem_length) { multipath_mapper.min_clustering_mem_length = min_clustering_mem_length; } @@ -892,52 +1947,100 @@ int main_mpmap(int argc, char** argv) { // set mapping quality parameters multipath_mapper.mapping_quality_method = mapq_method; multipath_mapper.max_mapping_quality = max_mapq; + multipath_mapper.mapq_scaling_factor = mapq_scaling_factor; + // always report group MAPQ when we're reporting multimapped reads + multipath_mapper.report_group_mapq = report_group_mapq || (max_num_mappings > 1 && !agglomerate_multipath_alns); + multipath_mapper.report_allelic_mapq = report_allelic_mapq; // Use population MAPQs when we have the right option combination to make that sensible. 
multipath_mapper.use_population_mapqs = (haplo_score_provider != nullptr && population_max_paths > 0); multipath_mapper.population_max_paths = population_max_paths; + multipath_mapper.population_paths_hard_cap = population_paths_hard_cap; + multipath_mapper.top_tracebacks = top_tracebacks; multipath_mapper.recombination_penalty = recombination_penalty; multipath_mapper.always_check_population = always_check_population; + multipath_mapper.force_haplotype_count = force_haplotype_count; // set pruning and clustering parameters + multipath_mapper.no_clustering = no_clustering; + multipath_mapper.use_tvs_clusterer = use_tvs_clusterer; + multipath_mapper.use_min_dist_clusterer = use_min_dist_clusterer; + multipath_mapper.greedy_min_dist = greedy_min_dist; + multipath_mapper.component_min_dist = component_min_dist; multipath_mapper.max_expected_dist_approx_error = max_dist_error; multipath_mapper.mem_coverage_min_ratio = cluster_ratio; multipath_mapper.log_likelihood_approx_factor = likelihood_approx_exp; multipath_mapper.num_mapping_attempts = max_map_attempts; - multipath_mapper.unstranded_clustering = unstranded_clustering; multipath_mapper.min_median_mem_coverage_for_split = min_median_mem_coverage_for_split; multipath_mapper.suppress_cluster_merging = suppress_cluster_merging; + multipath_mapper.suppress_multicomponent_splitting = suppress_multicomponent_splitting; + multipath_mapper.use_tvs_clusterer = use_tvs_clusterer; + multipath_mapper.reversing_walk_length = reversing_walk_length; + multipath_mapper.max_alt_mappings = max_num_mappings; + multipath_mapper.max_alignment_gap = max_alignment_gap; + multipath_mapper.use_pessimistic_tail_alignment = use_pessimistic_tail_alignment; + multipath_mapper.pessimistic_gap_multiplier = pessimistic_gap_multiplier; + multipath_mapper.restrained_graph_extraction = restrained_graph_extraction; // set pair rescue parameters multipath_mapper.max_rescue_attempts = max_rescue_attempts; - multipath_mapper.max_single_end_mappings_for_rescue = max(max(max_single_end_mappings_for_rescue, max_rescue_attempts), max_num_mappings); + multipath_mapper.max_single_end_mappings_for_rescue = max(max_single_end_mappings_for_rescue, max_rescue_attempts); multipath_mapper.secondary_rescue_subopt_diff = secondary_rescue_subopt_diff; multipath_mapper.secondary_rescue_score_diff = secondary_rescue_score_diff; multipath_mapper.secondary_rescue_attempts = secondary_rescue_attempts; multipath_mapper.rescue_only_min = rescue_only_min; multipath_mapper.rescue_only_anchor_max = rescue_only_anchor_max; + multipath_mapper.fragment_length_warning_factor = fragment_length_warning_factor; + multipath_mapper.get_rescue_graph_from_paths = get_rescue_graph_from_paths; + multipath_mapper.rescue_graph_std_devs = rescue_graph_std_devs; // set multipath alignment topology parameters multipath_mapper.max_snarl_cut_size = snarl_cut_size; + multipath_mapper.max_branch_trim_length = max_branch_trim_length; + multipath_mapper.suppress_tail_anchors = !synthesize_tail_anchors; multipath_mapper.num_alt_alns = num_alt_alns; multipath_mapper.dynamic_max_alt_alns = dynamic_max_alt_alns; multipath_mapper.simplify_topologies = simplify_topologies; multipath_mapper.max_suboptimal_path_score_ratio = suboptimal_path_exponent; + multipath_mapper.agglomerate_multipath_alns = agglomerate_multipath_alns; + + // splicing parameters + int64_t min_softclip_length_for_splice = max(int(ceil(log(total_seq_length) / log(4.0)) - max_softclip_overlap) , 1); + 
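The statement just above, like the MEM accelerator cap, sub_mem_thinning_burn_in, and fanout_length_threshold earlier, scales with log(total_seq_length) / log(4.0): roughly the match length at which a random exact match is expected to occur about once in a graph with that much sequence. A worked example with hypothetical numbers (illustrative only; the accelerator uses round rather than ceil):

    #include <cmath>
    #include <cstdio>

    int main() {
        double total_seq_length = 3.1e9;  // hypothetical human-scale graph
        // log4(3.1e9) ~= 15.8, so these length scales land at about 16 bp
        double k = std::ceil(std::log(total_seq_length) / std::log(4.0));
        std::printf("log4 length scale: %.0f bp\n", k);
        // with a hypothetical max_softclip_overlap of 8, the minimum softclip that can
        // trigger a spliced-alignment attempt would be max(16 - 8, 1) = 8 bp
        return 0;
    }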
multipath_mapper.set_min_softclip_length_for_splice(min_softclip_length_for_splice); + multipath_mapper.set_log_odds_against_splice(no_splice_log_odds); + multipath_mapper.max_softclip_overlap = max_softclip_overlap; + multipath_mapper.max_splice_overhang = max_splice_overhang; + multipath_mapper.splice_rescue_graph_std_devs = splice_rescue_graph_std_devs; + multipath_mapper.ref_path_handles = move(ref_path_handles); + multipath_mapper.max_motif_pairs = max_motif_pairs; + if (!intron_distr_name.empty()) { + multipath_mapper.set_intron_length_distribution(intron_mixture_weights, intron_component_params); + } + multipath_mapper.set_read_1_adapter(read_1_adapter); + multipath_mapper.set_read_2_adapter(read_2_adapter); + +#ifdef mpmap_instrument_mem_statistics + multipath_mapper._mem_stats.open(MEM_STATS_FILE); +#endif + + // we don't want to do spliced alignment while calibrating + multipath_mapper.do_spliced_alignment = false; // if directed to, auto calibrate the mismapping detection to the graph - if (auto_calibrate_mismapping_detection) { - multipath_mapper.calibrate_mismapping_detection(num_calibration_simulations, calibration_read_length); + if (auto_calibrate_mismapping_detection && !suppress_mismapping_detection) { + log_progress("Building null model to calibrate mismapping detection"); + multipath_mapper.calibrate_mismapping_detection(num_calibration_simulations, calibration_read_lengths); } - // set computational paramters - int thread_count = get_thread_count(); - multipath_mapper.set_alignment_threads(thread_count); + // now we can start doing spliced alignment + multipath_mapper.do_spliced_alignment = do_spliced_alignment; + + // Establish a watchdog to find reads that take too long to map. + // If we see any, we will issue a warning. + unique_ptr watchdog(new Watchdog(thread_count, chrono::minutes(read_length == "long" ? 40 : 5))); // are we doing paired ends? if (interleaved_input || !fastq_name_2.empty()) { // make sure buffer size is even (ensures that output will be interleaved) - if (buffer_size % 2 == 1) { - buffer_size++; - } if (!std::isnan(frag_length_mean) && !std::isnan(frag_length_stddev)) { // Force a fragment length distribution @@ -955,189 +2058,103 @@ int main_mpmap(int argc, char** argv) { ofstream read_time_file(READ_TIME_FILE); #endif + // a probably over-engineered way to report progress across threads with minimal contention + const uint64_t progress_frequency = read_length == "long" ? 250000 : 5000000; + const uint64_t thread_progress_frequency = 1000; + assert(progress_frequency % thread_progress_frequency == 0); + uint64_t num_reads_mapped = 0; + vector thread_num_reads_mapped(thread_count, 0); + + function register_mapping = [&](int thread_num) { + if (!suppress_progress) { + uint64_t num_mapped = ++thread_num_reads_mapped[thread_num]; + if (num_mapped == thread_progress_frequency) { + uint64_t n; +#pragma omp atomic capture + n = num_reads_mapped += num_mapped; + if (n % progress_frequency == 0) { + log_progress("Mapped " + to_string(n) + (!interleaved_input && fastq_name_2.empty() ? 
" reads" : " read pairs")); + } + thread_num_reads_mapped[thread_num] = 0; + } + } + }; + + // init a writer for the output + MultipathAlignmentEmitter* emitter = new MultipathAlignmentEmitter("-", thread_count, out_format, + path_position_handle_graph, + &path_names_and_length); + emitter->set_read_group(read_group); + emitter->set_sample_name(sample_name); + if (transcriptomic) { + emitter->set_min_splice_length(min_splice_length); + } + // a buffer to hold read pairs that can't be unambiguously mapped before the fragment length distribution // is estimated // note: sufficient to have only one buffer because multithreading code enforces single threaded mode // during distribution estimation vector> ambiguous_pair_buffer; - vector > single_path_output_buffer(thread_count); - vector > multipath_output_buffer(thread_count); - - // write unpaired multipath alignments to stdout buffer - auto output_multipath_alignments = [&](vector& mp_alns) { - auto& output_buf = multipath_output_buffer[omp_get_thread_num()]; + // do unpaired multipath alignment and write to buffer + function do_unpaired_alignments = [&](Alignment& alignment) { +#ifdef record_read_run_times + clock_t start = clock(); +#endif + + auto thread_num = omp_get_thread_num(); + + if (watchdog) { + watchdog->check_in(thread_num, alignment.name()); + } - // move all the alignments over to the output buffer - for (MultipathAlignment& mp_aln : mp_alns) { - output_buf.emplace_back(move(mp_aln)); - - // label with read group and sample name - if (!read_group.empty()) { - output_buf.back().set_read_group(read_group); - } - if (!sample_name.empty()) { - output_buf.back().set_sample_name(sample_name); - } + toUppercaseInPlace(*alignment.mutable_sequence()); + + bool is_rna = uses_Us(alignment); + if (is_rna) { + convert_Us_to_Ts(alignment); } + + vector mp_alns; + multipath_mapper.multipath_map(alignment, mp_alns); - stream::write_buffered(cout, output_buf, buffer_size); - }; - - // convert to unpaired single path alignments and write stdout buffer - auto output_single_path_alignments = [&](vector& mp_alns) { - auto& output_buf = single_path_output_buffer[omp_get_thread_num()]; - // add optimal alignments to the output buffer - for (MultipathAlignment& mp_aln : mp_alns) { - // For each multipath alignment, get the greedy nonoverlapping - // single-path alignments from the top k optimal single-path - // alignments. - vector options; - multipath_mapper.reduce_to_single_path(mp_aln, options, localization_max_paths); - - // There will always be at least one result. Use the optimal alignment. 
- output_buf.emplace_back(std::move(options.front())); - - // compute the Alignment identity to make vg call happy - output_buf.back().set_identity(identity(output_buf.back().path())); - - if (mp_aln.has_annotation()) { - // Move over annotations - output_buf.back().set_allocated_annotation(mp_aln.release_annotation()); - } - - // label with read group and sample name - if (!read_group.empty()) { - output_buf.back().set_read_group(read_group); - } - if (!sample_name.empty()) { - output_buf.back().set_sample_name(sample_name); + vector> path_positions; + if (hts_output) { + // we need to surject and compute path positions + path_positions.resize(mp_alns.size()); + for (size_t i = 0; i < mp_alns.size(); ++i) { + auto& path_pos = path_positions[i]; + mp_alns[i] = surjector->surject(mp_alns[i], surjection_paths, + get<0>(path_pos), get<2>(path_pos), get<1>(path_pos), + true, transcriptomic); } } - stream::write_buffered(cout, output_buf, buffer_size); - }; - - // write paired multipath alignments to stdout buffer - auto output_multipath_paired_alignments = [&](vector>& mp_aln_pairs) { - auto& output_buf = multipath_output_buffer[omp_get_thread_num()]; - - // move all the alignments over to the output buffer - for (pair& mp_aln_pair : mp_aln_pairs) { - output_buf.emplace_back(move(mp_aln_pair.first)); - - // label with read group and sample name - if (!read_group.empty()) { - output_buf.back().set_read_group(read_group); - } - if (!sample_name.empty()) { - output_buf.back().set_sample_name(sample_name); - } - - // switch second read back to the opposite strand if necessary - if (same_strand) { - output_buf.emplace_back(move(mp_aln_pair.second)); - } - else { - output_buf.emplace_back(); - rev_comp_multipath_alignment(mp_aln_pair.second, - [&](vg::id_t node_id) { return xg_index.node_length(node_id); }, - output_buf.back()); - } - - // label with read group and sample name - if (!read_group.empty()) { - output_buf.back().set_read_group(read_group); - } - if (!sample_name.empty()) { - output_buf.back().set_sample_name(sample_name); + if (is_rna) { + for (multipath_alignment_t& mp_aln : mp_alns) { + convert_Ts_to_Us(mp_aln); } } - stream::write_buffered(cout, output_buf, buffer_size); - }; - - // convert to paired single path alignments and write stdout buffer - auto output_single_path_paired_alignments = [&](vector>& mp_aln_pairs) { - auto& output_buf = single_path_output_buffer[omp_get_thread_num()]; - - // add optimal alignments to the output buffer - for (pair& mp_aln_pair : mp_aln_pairs) { - - // Compute nonoverlapping single path alignments for each multipath alignment - vector options; - multipath_mapper.reduce_to_single_path(mp_aln_pair.first, options, localization_max_paths); - - // There will always be at least one result. Use the optimal alignment. 
- output_buf.emplace_back(std::move(options.front())); - - if (mp_aln_pair.first.has_annotation()) { - // Move over annotations - output_buf.back().set_allocated_annotation(mp_aln_pair.first.release_annotation()); - } - - // compute the Alignment identity to make vg call happy - output_buf.back().set_identity(identity(output_buf.back().path())); - - // label with read group and sample name - if (!read_group.empty()) { - output_buf.back().set_read_group(read_group); - } - if (!sample_name.empty()) { - output_buf.back().set_sample_name(sample_name); + if (!no_output) { + if (!hts_output) { + emitter->emit_singles(alignment.name(), move(mp_alns)); } - // arbitrarily decide that this is the "previous" fragment - output_buf.back().mutable_fragment_next()->set_name(mp_aln_pair.second.name()); - - // Now do the second read - options.clear(); - multipath_mapper.reduce_to_single_path(mp_aln_pair.second, options, localization_max_paths); - output_buf.emplace_back(std::move(options.front())); - - if (mp_aln_pair.second.has_annotation()) { - // Move over annotations - output_buf.back().set_allocated_annotation(mp_aln_pair.second.release_annotation()); - } - - // compute identity again - output_buf.back().set_identity(identity(output_buf.back().path())); - - // switch second read back to the opposite strand if necessary - if (!same_strand) { - reverse_complement_alignment_in_place(&output_buf.back(), - [&](vg::id_t node_id) { return xg_index.node_length(node_id); }); - } - - // label with read group and sample name - if (!read_group.empty()) { - output_buf.back().set_read_group(read_group); - } - if (!sample_name.empty()) { - output_buf.back().set_sample_name(sample_name); + else { + emitter->emit_singles(alignment.name(), move(mp_alns), &path_positions); } - // arbitrarily decide that this is the "next" fragment - output_buf.back().mutable_fragment_prev()->set_name(mp_aln_pair.first.name()); } - stream::write_buffered(cout, output_buf, buffer_size); - }; - - // do unpaired multipath alignment and write to buffer - function do_unpaired_alignments = [&](Alignment& alignment) { -#ifdef record_read_run_times - clock_t start = clock(); -#endif - vector mp_alns; - multipath_mapper.multipath_map(alignment, mp_alns, max_num_mappings); - if (single_path_alignment_mode) { - output_single_path_alignments(mp_alns); - } - else { - output_multipath_alignments(mp_alns); + + if (watchdog) { + watchdog->check_out(thread_num); } + + register_mapping(thread_num); + #ifdef record_read_run_times clock_t finish = clock(); #pragma omp critical - read_time_file << alignment.name() << "\t" << double(finish - start) / CLOCKS_PER_SEC << endl; + read_time_file << alignment.name() << "\t" << alignment.sequence().size() << "\t" << double(finish - start) / CLOCKS_PER_SEC << endl; #endif }; @@ -1145,27 +2162,98 @@ int main_mpmap(int argc, char** argv) { function do_paired_alignments = [&](Alignment& alignment_1, Alignment& alignment_2) { // get reads on the same strand so that oriented distance estimation works correctly // but if we're clearing the ambiguous buffer we already RC'd these on the first pass + + auto thread_num = omp_get_thread_num(); + #ifdef record_read_run_times clock_t start = clock(); #endif + + if (watchdog) { + watchdog->check_in(thread_num, alignment_1.name()); + } + + toUppercaseInPlace(*alignment_1.mutable_sequence()); + toUppercaseInPlace(*alignment_2.mutable_sequence()); + + bool is_rna = (uses_Us(alignment_1) || uses_Us(alignment_2)); + if (is_rna) { + convert_Us_to_Ts(alignment_1); + 
convert_Us_to_Ts(alignment_2); + } + if (!same_strand) { // remove the path so we won't try to RC it (the path may not refer to this graph) alignment_2.clear_path(); - reverse_complement_alignment_in_place(&alignment_2, [&](vg::id_t node_id) { return xg_index.node_length(node_id); }); + reverse_complement_alignment_in_place(&alignment_2, [&](vg::id_t node_id) { + return path_position_handle_graph->get_length(path_position_handle_graph->get_handle(node_id)); + }); } - - vector> mp_aln_pairs; - multipath_mapper.multipath_map_paired(alignment_1, alignment_2, mp_aln_pairs, ambiguous_pair_buffer, max_num_mappings); - if (single_path_alignment_mode) { - output_single_path_paired_alignments(mp_aln_pairs); + + size_t num_buffered = ambiguous_pair_buffer.size(); + + vector> mp_aln_pairs; + bool proper_paired = multipath_mapper.multipath_map_paired(alignment_1, alignment_2, mp_aln_pairs, ambiguous_pair_buffer); + + + if (!same_strand) { + for (auto& mp_aln_pair : mp_aln_pairs) { + rev_comp_multipath_alignment_in_place(&mp_aln_pair.second, [&](vg::id_t node_id) { return path_position_handle_graph->get_length(path_position_handle_graph->get_handle(node_id)); + }); + } } - else { - output_multipath_paired_alignments(mp_aln_pairs); + + vector, tuple>> path_positions; + vector tlen_limits; + if (hts_output) { + // we need to surject and compute path positions + path_positions.resize(mp_aln_pairs.size()); + // hackily either give no limit or an unattainable limit to communicate pairedness + tlen_limits.resize(mp_aln_pairs.size(), + proper_paired ? numeric_limits::max() : -1); + + for (size_t i = 0; i < mp_aln_pairs.size(); ++i) { + auto& path_pos_1 = path_positions[i].first; + auto& path_pos_2 = path_positions[i].second; + mp_aln_pairs[i].first = surjector->surject(mp_aln_pairs[i].first, surjection_paths, + get<0>(path_pos_1), get<2>(path_pos_1), get<1>(path_pos_1), + true, transcriptomic); + mp_aln_pairs[i].second = surjector->surject(mp_aln_pairs[i].second, surjection_paths, + get<0>(path_pos_2), get<2>(path_pos_2), get<1>(path_pos_2), + true, transcriptomic); + } + } + + if (is_rna) { + for (pair& mp_aln_pair : mp_aln_pairs) { + convert_Ts_to_Us(mp_aln_pair.first); + convert_Ts_to_Us(mp_aln_pair.second); + } + } + + if (!no_output) { + if (!hts_output) { + emitter->emit_pairs(alignment_1.name(), alignment_2.name(), move(mp_aln_pairs)); + } + else { + emitter->emit_pairs(alignment_1.name(), alignment_2.name(), move(mp_aln_pairs), + &path_positions, &tlen_limits); + } + } + + if (watchdog) { + watchdog->check_out(thread_num); + } + + if (num_buffered == ambiguous_pair_buffer.size()) { + // the read didn't get buffered during the frag length estimation phase + register_mapping(thread_num); } + #ifdef record_read_run_times clock_t finish = clock(); #pragma omp critical - read_time_file << alignment_1.name() << "\t" << alignment_2.name() << "\t" << double(finish - start) / CLOCKS_PER_SEC << endl; + read_time_file << alignment_1.name() << "\t" << alignment_2.name() << "\t" << alignment_1.sequence().size() << "\t" << alignment_2.sequence().size() << "\t" << double(finish - start) / CLOCKS_PER_SEC << endl; #endif }; @@ -1173,40 +2261,88 @@ int main_mpmap(int argc, char** argv) { function do_independent_paired_alignments = [&](Alignment& alignment_1, Alignment& alignment_2) { // get reads on the same strand so that oriented distance estimation works correctly // but if we're clearing the ambiguous buffer we already RC'd these on the first pass + + auto thread_num = omp_get_thread_num(); + #ifdef 
record_read_run_times clock_t start = clock(); #endif - if (!same_strand) { - // TODO: the output functions undo this transformation, so we have to do it here. + + if (watchdog) { + watchdog->check_in(thread_num, alignment_1.name()); + } - // remove the path so we won't try to RC it (the path may not refer to this graph) - alignment_2.clear_path(); - reverse_complement_alignment_in_place(&alignment_2, [&](vg::id_t node_id) { return xg_index.node_length(node_id); }); + bool is_rna = (uses_Us(alignment_1) || uses_Us(alignment_2)); + if (is_rna) { + convert_Us_to_Ts(alignment_1); + convert_Us_to_Ts(alignment_2); } // Align independently - vector mp_alns_1, mp_alns_2; - multipath_mapper.multipath_map(alignment_1, mp_alns_1, max_num_mappings); - multipath_mapper.multipath_map(alignment_2, mp_alns_2, max_num_mappings); - - vector> mp_aln_pairs; - for (size_t i = 0; i < mp_alns_1.size() && i < mp_alns_2.size(); i++) { - // Pair arbitrarily. Stop when one side runs out of alignments. - mp_aln_pairs.emplace_back(mp_alns_1[i], mp_alns_2[i]); + vector mp_alns_1, mp_alns_2; + multipath_mapper.multipath_map(alignment_1, mp_alns_1); + multipath_mapper.multipath_map(alignment_2, mp_alns_2); + + if (is_rna) { + for (multipath_alignment_t& mp_aln : mp_alns_1) { + convert_Ts_to_Us(mp_aln); + } + for (multipath_alignment_t& mp_aln : mp_alns_2) { + convert_Ts_to_Us(mp_aln); + } } + + // keep an equal number to protect interleaving + mp_alns_1.resize(min(mp_alns_1.size(), mp_alns_2.size())); + mp_alns_2.resize(min(mp_alns_1.size(), mp_alns_2.size())); - // TODO: Set a flag or annotation or something to say we don't really believe the pairing - - if (single_path_alignment_mode) { - output_single_path_paired_alignments(mp_aln_pairs); + vector, tuple>> path_positions; + vector tlen_limits; + if (hts_output) { + // we need to surject and compute path positions + path_positions.resize(mp_alns_1.size()); + // hackily give unattainable limit to indicate no proper pairing + tlen_limits.resize(mp_alns_1.size(), -1); + + for (size_t i = 0; i < mp_alns_1.size(); ++i) { + auto& path_pos_1 = path_positions[i].first; + auto& path_pos_2 = path_positions[i].second; + mp_alns_1[i] = surjector->surject(mp_alns_1[i], surjection_paths, + get<0>(path_pos_1), get<2>(path_pos_1), get<1>(path_pos_1), + true, transcriptomic); + mp_alns_2[i] = surjector->surject(mp_alns_2[i], surjection_paths, + get<0>(path_pos_2), get<2>(path_pos_2), get<1>(path_pos_2), + true, transcriptomic); + } } - else { - output_multipath_paired_alignments(mp_aln_pairs); + + if (!no_output) { + // reorganize into pairs + vector> mp_aln_pairs; + mp_aln_pairs.reserve(mp_alns_1.size()); + for (size_t i = 0; i < mp_alns_1.size(); ++i) { + mp_aln_pairs.emplace_back(move(mp_alns_1[i]), move(mp_alns_2[i])); + } + + if (!hts_output) { + emitter->emit_pairs(alignment_1.name(), alignment_2.name(), move(mp_aln_pairs)); + } + else { + emitter->emit_pairs(alignment_1.name(), alignment_2.name(), move(mp_aln_pairs), + &path_positions, &tlen_limits); + } } + + if (watchdog) { + watchdog->check_out(thread_num); + } + + register_mapping(thread_num); + #ifdef record_read_run_times clock_t finish = clock(); #pragma omp critical - read_time_file << alignment_1.name() << "\t" << alignment_2.name() << "\t" << double(finish - start) / CLOCKS_PER_SEC << endl; + read_time_file << alignment_1.name() << "\t" << alignment_2.name() << "\t" << alignment_1.sequence().size() << "\t" << alignment_2.sequence().size() << "\t" << double(finish - start) / CLOCKS_PER_SEC << endl; #endif }; @@ -1215,9 
+2351,10 @@ int main_mpmap(int argc, char** argv) { return multipath_mapper.has_fixed_fragment_length_distr(); }; - // FASTQ input if (!fastq_name_1.empty()) { + log_progress("Mapping reads from " + (fastq_name_1 == "-" ? string("STDIN") : fastq_name_1) + (fastq_name_2.empty() ? "" : " and " + (fastq_name_2 == "-" ? "STDIN" : fastq_name_2)) + " using " + to_string(thread_count) + " thread" + (thread_count > 1 ? "s" : "")); + if (interleaved_input) { fastq_paired_interleaved_for_each_parallel_after_wait(fastq_name_1, do_paired_alignments, multi_threaded_condition); @@ -1233,17 +2370,20 @@ int main_mpmap(int argc, char** argv) { // GAM input if (!gam_file_name.empty()) { + log_progress("Mapping reads from " + (gam_file_name == "-" ? string("STDIN") : gam_file_name) + " using " + to_string(thread_count) + " thread" + (thread_count > 1 ? "s" : "")); + function execute = [&](istream& gam_in) { if (!gam_in) { cerr << "error:[vg mpmap] Cannot open GAM file " << gam_file_name << endl; exit(1); } + if (interleaved_input) { - stream::for_each_interleaved_pair_parallel_after_wait(gam_in, do_paired_alignments, + vg::io::for_each_interleaved_pair_parallel_after_wait(gam_in, do_paired_alignments, multi_threaded_condition); } else { - stream::for_each_parallel(gam_in, do_unpaired_alignments); + vg::io::for_each_parallel(gam_in, do_unpaired_alignments); } }; get_input_file(gam_file_name, execute); @@ -1261,13 +2401,15 @@ int main_mpmap(int argc, char** argv) { // TODO: slightly wasteful, inelegant if (!same_strand) { reverse_complement_alignment_in_place(&aln_pair.second, - [&](vg::id_t node_id) { return xg_index.node_length(node_id); }); + [&](vg::id_t node_id) { + return path_position_handle_graph->get_length(path_position_handle_graph->get_handle(node_id)); + }); } do_paired_alignments(aln_pair.first, aln_pair.second); } } else { - cerr << "warning:[vg mpmap] Could not find " << frag_length_sample_size << " unambiguous read pair mappings to estimate fragment length ditribution. Mapping read pairs as independent single-ended reads. Consider decreasing sample size (-b)." << endl; + cerr << "warning:[vg mpmap] Could not find " << frag_length_sample_size << " (-b) unambiguous read pair mappings to estimate fragment length ditribution. This can happen due to data issues (e.g. unpaired reads being mapped as pairs) or because the sample size is too large for the read set. Mapping read pairs as independent single-ended reads." << endl; #pragma omp parallel for for (size_t i = 0; i < ambiguous_pair_buffer.size(); i++) { @@ -1277,23 +2419,26 @@ int main_mpmap(int argc, char** argv) { // TODO: slightly wasteful, inelegant if (!same_strand) { reverse_complement_alignment_in_place(&aln_pair.second, - [&](vg::id_t node_id) { return xg_index.node_length(node_id); }); + [&](vg::id_t node_id) { + return path_position_handle_graph->get_length(path_position_handle_graph->get_handle(node_id)); + }); } do_independent_paired_alignments(aln_pair.first, aln_pair.second); } } } - // flush output buffers - for (int i = 0; i < thread_count; i++) { - vector& single_path_buffer = single_path_output_buffer[i]; - stream::write_buffered(cout, single_path_buffer, 0); - - vector& multipath_buffer = multipath_output_buffer[i]; - stream::write_buffered(cout, multipath_buffer, 0); - } + // flush output + delete emitter; cout.flush(); + if (!suppress_progress) { + for (auto uncounted_mappings : thread_num_reads_mapped) { + num_reads_mapped += uncounted_mappings; + } + log_progress("Mapping finished. 
Mapped " + to_string(num_reads_mapped) + " " + (fastq_name_2.empty() && !interleaved_input ? "reads" : "read pairs") + "."); + } + #ifdef record_read_run_times read_time_file.close(); #endif @@ -1304,10 +2449,6 @@ int main_mpmap(int argc, char** argv) { //cerr << "attempted to split " << OrientedDistanceClusterer::SPLIT_ATTEMPT_COUNTER << " of " << OrientedDistanceClusterer::PRE_SPLIT_CLUSTER_COUNTER << " clusters with " << OrientedDistanceClusterer::SUCCESSFUL_SPLIT_ATTEMPT_COUNTER << " splits successful (" << 100.0 * double(OrientedDistanceClusterer::SUCCESSFUL_SPLIT_ATTEMPT_COUNTER) / OrientedDistanceClusterer::SPLIT_ATTEMPT_COUNTER << "%) resulting in " << OrientedDistanceClusterer::POST_SPLIT_CLUSTER_COUNTER << " total clusters (" << OrientedDistanceClusterer::POST_SPLIT_CLUSTER_COUNTER - OrientedDistanceClusterer::PRE_SPLIT_CLUSTER_COUNTER << " new)" << endl; //cerr << "entered secondary rescue " << MultipathMapper::SECONDARY_RESCUE_TOTAL << " times with " << MultipathMapper::SECONDARY_RESCUE_COUNT << " actually attempting rescues, totaling " << MultipathMapper::SECONDARY_RESCUE_ATTEMPT << " rescues (" << double(MultipathMapper::SECONDARY_RESCUE_ATTEMPT) / MultipathMapper::SECONDARY_RESCUE_COUNT << " average per attempt)" << endl; - if (snarl_manager != nullptr) { - delete snarl_manager; - } - if (haplo_score_provider != nullptr) { delete haplo_score_provider; } @@ -1316,14 +2457,10 @@ int main_mpmap(int argc, char** argv) { delete sublinearLS; } - if (gbwt != nullptr) { - delete gbwt; - } - return 0; } // Register subcommand -static Subcommand vg_mpmap("mpmap", "multipath alignments of reads to a graph", main_mpmap); +static Subcommand vg_mpmap("mpmap", "splice-aware multipath alignment of short reads", PIPELINE, 7, main_mpmap); diff --git a/src/subcommand/msga_main.cpp b/src/subcommand/msga_main.cpp index 22620b59505..ebb64b9b4b9 100644 --- a/src/subcommand/msga_main.cpp +++ b/src/subcommand/msga_main.cpp @@ -2,10 +2,15 @@ #include "../vg.hpp" #include "../utility.hpp" #include "../mapper.hpp" -#include "../stream.hpp" +#include +#include #include "../kmer.hpp" #include "../build_index.hpp" -#include "../algorithms/topological_sort.hpp" +#include "../algorithms/normalize.hpp" +#include "../algorithms/prune.hpp" +#include "../algorithms/path_string.hpp" +#include "../chunker.hpp" +#include "xg.hpp" #include #include @@ -28,6 +33,8 @@ void help_msga(char** argv) { << " -s, --seq SEQUENCE literally include this sequence" << endl << " -g, --graph FILE include this graph" << endl << " -a, --fasta-order build the graph in the order the sequences are seen in the FASTA (default: bigger first)" << endl + << " -R, --position-bed FILE BED file mapping sequence names (col 4) to positions on reference path (cols 1-3)" << endl + << " -T, --context STEPS expand context around BED regions (-R) by this many steps [50]" << endl << "alignment:" << endl << " -k, --min-mem INT minimum MEM length (if 0 estimate via -e) [0]" << endl << " -e, --mem-chance FLOAT set {-k} such that this fraction of {-k} length hits will by chance [5e-4]" << endl @@ -35,16 +42,16 @@ void help_msga(char** argv) { << " -Y, --max-mem INT ignore mems longer than this length (unset if 0) [0]" << endl << " -r, --reseed-x FLOAT look for internal seeds inside a seed longer than {-W} * FLOAT [1.5]" << endl << " -l, --try-at-least INT attempt to align up to the INT best candidate chains of seeds [1]" << endl - << " -u, --try-up-to INT attempt to align up to the INT best candidate chains of seeds [128]" << endl + << " -u, --try-up-to 
INT attempt to trace back up to this number of chains of bands (assuming we will band) [4]" << endl << " -W, --min-chain INT discard a chain if seeded bases shorter than INT [0]" << endl << " -C, --drop-chain FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [0.45]" << endl << " -P, --min-ident FLOAT accept alignment only if the alignment identity is >= FLOAT [0]" << endl << " -F, --min-band-mq INT require mapping quality for each band to be at least this [0]" << endl << " -H, --max-target-x N skip cluster subgraphs with length > N*read_length [100]" << endl - << " -w, --band-width INT band width for long read alignment [256]" << endl - << " -O, --band-overlap INT band overlap for long read alignment [{-w}/8]" << endl + << " -w, --band-width INT band/chunk width for long read alignment [128]" << endl + << " -O, --band-overlap INT band overlap for long read alignment [{-w}*3/4]" << endl << " -J, --band-jump INT the maximum number of bands of insertion we consider in the alignment chain model [128]" << endl - << " -B, --band-multi INT consider this many alignments of each band in banded alignment [16]" << endl + << " -B, --band-multi INT consider this many alignments of each band in banded alignment (overrides -u for bands) [16]" << endl << " -M, --max-multimaps INT consider this many alternate alignments for the entire sequence [1]" << endl << " --no-patch-aln do not patch banded alignments by locally aligning unaligned regions" << endl << "local alignment parameters:" << endl @@ -52,7 +59,7 @@ void help_msga(char** argv) { << " -z, --mismatch INT use this mismatch penalty [4]" << endl << " -o, --gap-open INT use this gap open penalty [6]" << endl << " -y, --gap-extend INT use this gap extension penalty [1]" << endl - << " -L, --full-l-bonus INT the full-length alignment bonus [5]" << endl + << " -L, --full-l-bonus INT the full-length alignment bonus [32]" << endl << " --xdrop-alignment use X-drop heuristic (much faster for long-read alignment)" << endl << " --max-gap-length maximum gap length allowed in each contiguous alignment (for X-drop alignment) [40]" << endl << "index generation:" << endl @@ -79,6 +86,12 @@ void help_msga(char** argv) { int main_msga(int argc, char** argv) { + cerr << "!!!" << endl; + cerr << "WARNING" << endl; + cerr << "!!!" << endl; + cerr << "vg msga was an early prototype for constructing genome graphs from multiple sequence alignments, but it is no longer state-of-the-art or even actively maintained. VG team members have developed improved graph construction algorithms in Cactus and PGGB, and several other tools have been developed by other groups." 
<< endl << endl; + + if (argc == 2) { help_msga(argv); return 1; @@ -89,13 +102,15 @@ int main_msga(int argc, char** argv) { vector sequences; vector graph_files; string base_seq_name; + string position_bed_file; + int context_steps = 50; int idx_kmer_size = 16; int hit_max = 2048; // if we set this above 1, we use a dynamic programming process to determine the // optimal alignment through a series of bands based on a proximity metric int max_multimaps = 1; float min_identity = 0.0; - int band_width = 256; + int band_width = 128; int band_overlap = -1; int max_band_jump = 128; int band_multimaps = 16; @@ -103,7 +118,6 @@ int main_msga(int argc, char** argv) { bool debug = false; bool debug_align = false; size_t node_max = 0; - int alignment_threads = get_thread_count(); int edge_max = 3; int subgraph_prune = 0; bool normalize = false; @@ -116,13 +130,13 @@ int main_msga(int argc, char** argv) { int mismatch = 4; int gap_open = 6; int gap_extend = 1; - int full_length_bonus = 5; + int full_length_bonus = 32; bool circularize = false; float chance_match = 5e-4; int mem_reseed_length = -1; int min_cluster_length = 0; float mem_reseed_factor = 1.5; - int extra_multimaps = 128; + int extra_multimaps = 4; int min_multimaps = 1; float drop_chain = 0.45; int max_mapping_quality = 60; @@ -130,7 +144,6 @@ int main_msga(int argc, char** argv) { int maybe_mq_threshold = 0; int min_banded_mq = 0; bool use_fast_reseed = true; - bool show_align_progress = false; bool bigger_first = true; bool patch_alignments = true; int max_sub_mem_recursion_depth = 2; @@ -148,6 +161,9 @@ int main_msga(int argc, char** argv) { {"name", required_argument, 0, 'n'}, {"seq", required_argument, 0, 's'}, {"graph", required_argument, 0, 'g'}, + {"fasta-order", no_argument, 0, 'a'}, + {"position-bed", required_argument, 0, 'R'}, + {"context", required_argument, 0, 'T'}, {"base", required_argument, 0, 'b'}, {"idx-kmer-size", required_argument, 0, 'K'}, {"idx-doublings", required_argument, 0, 'X'}, @@ -182,7 +198,6 @@ int main_msga(int argc, char** argv) { {"try-up-to", required_argument, 0, 'u'}, {"try-at-least", required_argument, 0, 'l'}, {"drop-chain", required_argument, 0, 'C'}, - {"align-progress", no_argument, 0, 'S'}, {"bigger-first", no_argument, 0, 'a'}, {"no-patch-aln", no_argument, 0, '8'}, {"max-gap-length", required_argument, 0, 1}, @@ -191,7 +206,7 @@ int main_msga(int argc, char** argv) { }; int option_index = 0; - c = getopt_long (argc, argv, "hf:n:s:g:b:K:X:w:DAc:P:E:Q:NY:H:t:m:M:q:O:I:i:o:y:ZW:z:k:L:e:r:u:l:C:F:SJ:B:a8", + c = getopt_long (argc, argv, "hf:n:s:g:b:K:X:w:DAc:P:E:Q:NY:H:t:m:M:q:O:I:i:o:y:ZW:z:k:L:e:r:u:l:C:F:J:B:a8R:T:", long_options, &option_index); // Detect the end of the options. 
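On the retuned msga banding defaults above: the default band width drops from 256 to 128, and the default overlap changes from one eighth of a band to three quarters, so adjacent bands now share most of their sequence. A quick check of the arithmetic (illustrative only):

    #include <cstdio>

    int main() {
        int band_width = 128;                   // new -w default; the old default was 256
        int band_overlap = 3 * band_width / 4;  // 96; the old rule was band_width / 8 (32 at the old width)
        std::printf("band width %d, band overlap %d\n", band_width, band_overlap);
        return 0;
    }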
@@ -286,10 +301,6 @@ int main_msga(int argc, char** argv) { debug_align = true; break; - case 'S': - show_align_progress = true; - break; - case 'X': doubling_steps = parse(optarg); break; @@ -328,7 +339,6 @@ int main_msga(int argc, char** argv) { case 't': omp_set_num_threads(parse(optarg)); - alignment_threads = parse(optarg); break; case 'Q': @@ -363,6 +373,14 @@ int main_msga(int argc, char** argv) { bigger_first = false; break; + case 'R': + position_bed_file = optarg; + break; + + case 'T': + context_steps = parse(optarg); + break; + case '8': patch_alignments = false; break; @@ -401,16 +419,26 @@ int main_msga(int argc, char** argv) { } if (band_overlap == -1) { - band_overlap = band_width/8; + band_overlap = 3*band_width/4; } // build the graph or read it in from input VG* graph; if (graph_files.size() == 1) { string file_name = graph_files.front(); - get_input_file(file_name, [&](istream& in) { - graph = new VG(in); - }); + + // Load the graph from the file + unique_ptr loaded = vg::io::VPKG::load_one(file_name); + + // Make it be in VG format + graph = dynamic_cast(loaded.get()); + if (graph == nullptr) { + // Copy instead. + graph = new vg::VG(); + handlealgs::copy_path_handle_graph(loaded.get(), graph); + // Make sure the paths are all synced up + graph->paths.to_graph(graph->graph); + } } else { graph = new VG; } @@ -439,12 +467,23 @@ int main_msga(int argc, char** argv) { cerr << "[vg msga] Warning: sequence " << name << " is seen multiple times in input, ignoring all but the first instance" << endl; continue; } - strings[name] = nonATGCNtoN(ref.getSequence(name)); + strings[name] = vg::nonATGCNtoN(ref.getSequence(name)); names_in_order.push_back(name); seen_seq_names.insert(name); } } + // read in our bed file of positions for the input sequences + unordered_map position_hints; + if (!position_bed_file.empty()) { + vector regions; + vector region_names; + parse_bed_regions(position_bed_file, regions, ®ion_names); + for (size_t i = 0; i < regions.size(); ++i) { + position_hints[region_names[i]] = regions[i]; + } + } + // give a label to sequences passed on the command line // use the sha1sum, take the head // collision avoidance with nonce ensures we get the same names for the same sequences across multiple runs @@ -456,7 +495,7 @@ int main_msga(int argc, char** argv) { ss << s << ++nonce; name = sha1head(ss.str(), 8); } - strings[name] = nonATGCNtoN(s); + strings[name] = vg::nonATGCNtoN(s); names_in_order.push_back(name); } @@ -479,8 +518,8 @@ int main_msga(int argc, char** argv) { if (graph->empty()) { auto build_graph = [&graph,&node_max](const string& seq, const string& name) { graph->create_node(seq); - graph->dice_nodes(node_max); - algorithms::sort(graph); + handlealgs::chop(*graph, node_max); + graph->sort(); graph->compact_ids(); // the graph will have a single embedded path in it Path& path = *graph->graph.add_path(); @@ -516,32 +555,58 @@ int main_msga(int argc, char** argv) { gcsa::LCPArray* lcpidx = nullptr; xg::XG* xgidx = nullptr; size_t iter = 0; - - // Configure GCSA temp directory to the system temp directory - gcsa::TempFile::setDirectory(temp_file::get_dir()); - - auto rebuild = [&](VG* graph) { - if (mapper) delete mapper; - if (xgidx) delete xgidx; - if (gcsaidx) delete gcsaidx; - if (lcpidx) delete lcpidx; + + auto rebuild = [&](VG* graph, int name_idx) { + delete mapper; + mapper = nullptr; + delete xgidx; + xgidx = nullptr; + delete gcsaidx; + gcsaidx = nullptr; + delete lcpidx; + lcpidx = nullptr; //stringstream s; s << iter++ << ".vg"; - 
algorithms::sort(graph); + graph->sort(); graph->sync_paths(); graph->graph.clear_path(); graph->paths.to_graph(graph->graph); graph->rebuild_indexes(); + if (name_idx >= names_in_order.size()) { + // nothing to align to next, so don't bother making mapping indexes + return; + } + if (debug) cerr << "building xg index" << endl; - xgidx = new xg::XG(graph->graph); + xgidx = new xg::XG(); + xgidx->from_path_handle_graph(*graph); if (debug) cerr << "building GCSA2 index" << endl; // Configure GCSA2 verbosity so it doesn't spit out loads of extra info if(!debug) gcsa::Verbosity::set(gcsa::Verbosity::SILENT); - - // Configure its temp directory to the system temp directory - gcsa::TempFile::setDirectory(temp_file::get_dir()); + + // Replace "graph" with a subsetted graph, and use it below when creating + // the GCSA index. + VG* region_graph = nullptr; + if (name_idx < names_in_order.size() && position_hints.count(names_in_order[name_idx])) { + Region region = position_hints[names_in_order[name_idx]]; + if (!xgidx->has_path(region.seq) || xgidx->get_path_length(xgidx->get_path_handle(region.seq)) <= + region.end) { + stringstream err_msg; + err_msg << "[vg msga] Error: Target region for \"" << names_in_order[name_idx] << "\" (" + << region.seq << ":" << region.start << "-" << region.end << ") not found in graph." << endl; + throw runtime_error(err_msg.str()); + } + region_graph = new VG(); + Region out_region; + PathChunker chunker(xgidx); + if (debug) cerr << "Subsetting graph to " << region.seq << ":" << region.start << "-" << region.end + << " for sequence " << names_in_order[name_idx] << " using " << context_steps + << " context steps." << endl; + chunker.extract_subgraph(region, context_steps, 0, false, *region_graph, out_region); + graph = region_graph; + } if (idx_path_only) { // make the index from only the kmers in the embedded paths @@ -551,7 +616,9 @@ int main_msga(int argc, char** argv) { vg::id_t tail_id = head_id+1; graph->paths.for_each_name([&](const string& name) { VG path_graph = *graph; - if (edge_max) path_graph.prune_complex_with_head_tail(idx_kmer_size, edge_max); + if (edge_max){ + vg::algorithms::prune_complex_with_head_tail(path_graph, idx_kmer_size, edge_max); + } path_graph.keep_path(name); size_t limit = ~(size_t)0; tmpfiles.push_back( @@ -572,14 +639,20 @@ int main_msga(int argc, char** argv) { } else if (edge_max) { VG gcsa_graph = *graph; // copy the graph // remove complex components - gcsa_graph.prune_complex_with_head_tail(idx_kmer_size, edge_max); - if (subgraph_prune) gcsa_graph.prune_short_subgraphs(subgraph_prune); + vg::algorithms::prune_complex_with_head_tail(gcsa_graph, idx_kmer_size, edge_max); + if (subgraph_prune){ + vg::algorithms::prune_short_subgraphs(gcsa_graph, subgraph_prune); + } // then index build_gcsa_lcp(gcsa_graph, gcsaidx, lcpidx, idx_kmer_size, doubling_steps); } else { // if no complexity reduction is requested, just build the index build_gcsa_lcp(*graph, gcsaidx, lcpidx, idx_kmer_size, doubling_steps); } + + delete region_graph; + graph = nullptr; + mapper = new Mapper(xgidx, gcsaidx, lcpidx); { // set mapper variables mapper->hit_max = hit_max; @@ -609,15 +682,13 @@ int main_msga(int argc, char** argv) { mapper->extra_multimaps = extra_multimaps; mapper->mapping_quality_method = mapping_quality_method; mapper->max_mapping_quality = max_mapping_quality; - // set up the multi-threaded alignment interface - mapper->set_alignment_threads(alignment_threads); - mapper->show_progress = show_align_progress; mapper->patch_alignments = 
patch_alignments; + mapper->max_xdrop_gap_length = default_xdrop_max_gap_length; } }; // set up the graph for mapping - rebuild(graph); + rebuild(graph, 0); // todo restructure so that we are trying to map everything // add alignment score/bp bounds to catch when we get a good alignment @@ -651,14 +722,14 @@ int main_msga(int argc, char** argv) { Alignment aln = mapper->align(seq, 0, 0, 0, band_width, band_overlap, xdrop_alignment); aln.set_name(name); if (aln.path().mapping_size()) { - auto aln_seq = graph->path_string(aln.path()); + auto aln_seq = vg::algorithms::path_string(*graph, aln.path()); if (aln_seq != seq) { cerr << "[vg msga] alignment corrupted, failed to obtain correct banded alignment (alignment seq != input seq)" << endl; cerr << "expected " << seq << endl; cerr << "got " << aln_seq << endl; ofstream f(name + "-failed-alignment-" + convert(j) + ".gam"); - stream::write(f, 1, (std::function)([&aln](size_t n) { return aln; })); - stream::finish(f); + vg::io::write(f, 1, (std::function)([&aln](size_t n) { return aln; })); + vg::io::finish(f); f.close(); graph->serialize_to_file(name + "-corrupted-alignment.vg"); exit(1); @@ -674,8 +745,8 @@ int main_msga(int argc, char** argv) { /* ofstream f(name + "-pre-edit-" + convert(j) + ".gam"); - stream::write(f, 1, (std::function)([&aln](size_t n) { return aln; })); - stream::finish(f); + vg::io::write(f, 1, (std::function)([&aln](size_t n) { return aln; })); + vg::io::finish(f); f.close(); */ @@ -685,15 +756,15 @@ int main_msga(int argc, char** argv) { if (debug) cerr << name << ": editing graph" << endl; //graph->serialize_to_file(name + "-pre-edit.vg"); // Modify graph and embed paths - graph->edit(paths, true); + graph->edit(paths, nullptr, true); //if (!graph->is_valid()) cerr << "invalid after edit" << endl; //graph->serialize_to_file(name + "-immed-post-edit.vg"); - if (normalize) graph->normalize(10, debug); - graph->dice_nodes(node_max); + if (normalize) vg::algorithms::normalize(graph, 10, debug); + handlealgs::chop(*graph, node_max); //if (!graph->is_valid()) cerr << "invalid after dice" << endl; //graph->serialize_to_file(name + "-post-dice.vg"); if (debug) cerr << name << ": sorting and compacting ids" << endl; - algorithms::sort(graph); + graph->sort(); //if (!graph->is_valid()) cerr << "invalid after sort" << endl; graph->compact_ids(); // xg can't work unless IDs are compacted. 
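To make the check in the loop above easier to see at a glance: an alignment is only accepted if spelling its path back out over the graph reproduces the input sequence, and the same comparison is repeated after the edit to confirm the path was embedded intact. A compact sketch of that invariant, using only the Mapper::align and vg::algorithms::path_string calls already visible in this hunk (an illustration, not a helper that exists in the codebase):

// True when the banded alignment's path spells the query exactly.
static bool alignment_round_trips(vg::Mapper& mapper, vg::VG& graph, const std::string& seq,
                                  int band_width, int band_overlap, bool xdrop_alignment) {
    vg::Alignment aln = mapper.align(seq, 0, 0, 0, band_width, band_overlap, xdrop_alignment);
    if (aln.path().mapping_size() == 0) {
        return false;   // nothing aligned, nothing to verify
    }
    return vg::algorithms::path_string(graph, aln.path()) == seq;
}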
//if (!graph->is_valid()) cerr << "invalid after compact" << endl; @@ -712,12 +783,12 @@ int main_msga(int argc, char** argv) { graph->graph.clear_path(); graph->paths.to_graph(graph->graph); // and rebuild the indexes - rebuild(graph); + rebuild(graph, i); //graph->serialize_to_file(convert(i) + "-" + name + "-post.vg"); // verfy validity of path bool is_valid = graph->is_valid(); - auto path_seq = graph->path_string(graph->paths.path(name)); + auto path_seq = vg::algorithms::path_string(*graph, graph->paths.path(name)); incomplete = !(path_seq == seq) || !is_valid; if (incomplete) { cerr << "[vg msga] failed to include alignment, retrying " << endl @@ -727,8 +798,8 @@ int main_msga(int argc, char** argv) { << pb2json(graph->paths.path(name)) << endl; graph->serialize_to_file(name + "-post-edit.vg"); ofstream f(name + "-failed-alignment-" + convert(j) + ".gam"); - stream::write(f, 1, (std::function)([&aln](size_t n) { return aln; })); - stream::finish(f); + vg::io::write(f, 1, (std::function)([&aln](size_t n) { return aln; })); + vg::io::finish(f); f.close(); } } @@ -739,6 +810,11 @@ int main_msga(int argc, char** argv) { } } + delete mapper; + delete xgidx; + delete gcsaidx; + delete lcpidx; + // auto include_paths = [&mapper, // kmer_size, // kmer_stride, @@ -765,10 +841,14 @@ int main_msga(int argc, char** argv) { if (normalize) { if (debug) cerr << "normalizing graph" << endl; - graph->remove_non_path(); - graph->normalize(); - graph->dice_nodes(node_max); - algorithms::sort(graph); + if (graph_files.empty()) { + // shouldn't be any reason to do this, but if we are going to do it, + // only try if graph was made entirely of msga'd sequences. + graph->remove_non_path(); + } + vg::algorithms::normalize(graph); + handlealgs::chop(*graph, node_max); + graph->sort(); graph->compact_ids(); if (!graph->is_valid()) { cerr << "[vg msga] warning! graph is not valid after normalization" << endl; @@ -780,7 +860,7 @@ int main_msga(int argc, char** argv) { for (auto& sp : strings) { auto& name = sp.first; auto& seq = sp.second; - if (seq != graph->path_string(graph->paths.path(name))) { + if (seq != vg::algorithms::path_string(*graph, graph->paths.path(name))) { /* cerr << "failed inclusion" << endl << "expected " << graph->path_string(graph->paths.path(name)) << endl @@ -834,4 +914,4 @@ int main_msga(int argc, char** argv) { return 0; } -static Subcommand vg_msga("msga", "multiple sequence graph alignment", main_msga); +static Subcommand vg_msga("msga", "multiple sequence graph alignment", DEPRECATED, main_msga); diff --git a/src/subcommand/options.cpp b/src/subcommand/options.cpp new file mode 100644 index 00000000000..53f21d59920 --- /dev/null +++ b/src/subcommand/options.cpp @@ -0,0 +1,218 @@ +/** + *\file + * options.cpp: option parser system implementation + */ + +#include "options.hpp" + +namespace vg { +namespace subcommand { + +void TickChainLink::reset_chain() { + reset_chain_parent(); +} + +bool TickChainLink::tick_chain() { + return tick_chain_parent(); +} + +TickChainLink& TickChainLink::chain(TickChainLink& next) { + // Attach next to us + next.reset_chain_parent = [&]() { + this->reset_chain(); + }; + next.tick_chain_parent = [&]() { + return this->tick_chain(); + }; + + // And return it for a nice chain of chain calls. 
+ return next; +} + +std::function&)> TickChainLink::get_iterator() { + return [&](const std::function& iteratee) { + // Start + reset_chain(); + + do { + // Run iteratee + iteratee(); + // And tick the whole chain before running again + } while(tick_chain()); + }; +} + +int get_option_id() { + static int id = 10000; + return id++; +} + +template<> +const char* get_metavar() { + return "INT"; +} + +template<> +const char* get_metavar() { + return "INT"; +} + +template<> +const char* get_metavar() { + return "INT"; +} + +template<> +const char* get_metavar() { + return "BOOL"; +} + +template<> +const char* get_metavar() { + return "FLOAT"; +} + +template<> +const char* get_metavar() { + return "NAME"; +} + +BaseValuation::BaseValuation(const std::string& option) : option(option) { + // Nothing to do! +} + +const ValidatorFunction double_is_positive = [](const double& d) { + if (d <= 0) { + throw std::domain_error("must be strictly positive"); + } +}; + +const ValidatorFunction double_is_nonnegative = [](const double& d) { + if (d < 0) { + throw std::domain_error("cannot be negative"); + } +}; + +const ValidatorFunction size_t_is_nonzero = [](const size_t& s) { + if (s == 0) { + throw std::domain_error("cannot be zero"); + } +}; + +const ValidatorFunction int_is_nonnegative = [](const int& i) { + if (i < 0) { + throw std::domain_error("cannot be negative"); + } +}; + +TickChainLink& GroupedOptionGroup::chain(TickChainLink& next) { + if (subgroups.empty()) { + // Just chain through + return TickChainLink::chain(next); + } else { + // Chain us to first subgroup, and last subgroup to next. + TickChainLink::chain(*subgroups.front()); + subgroups.back()->chain(next); + return next; + } +} + +bool GroupedOptionGroup::parse(int option_id, const char* optarg) { + for (auto& group : subgroups) { + if (group->parse(option_id, optarg)) { + // If any of our groups wants this option, we do too. + return true; + } + } + return false; +} + +bool GroupedOptionGroup::preset(const BaseValuation& entry) { + for (auto& group : subgroups) { + if (group->preset(entry)) { + // If any of our groups wants this option, we do too. + return true; + } + } + return false; +} + +bool GroupedOptionGroup::set(const BaseValuation& entry) { + for (auto& group : subgroups) { + if (group->set(entry)) { + // If any of our groups wants this option, we do too. + return true; + } + } + return false; +} + +bool GroupedOptionGroup::query(BaseValuation& entry) const { + for (auto& group : subgroups) { + if (group->query(entry)) { + // If any of our groups wants this option, we do too. 
+ return true; + } + } + return false; +} + +void GroupedOptionGroup::print_options(ostream& out, bool slug) const { + for (auto& group : subgroups) { + // Print options from all groups in order + group->print_options(out, slug); + } +} + +std::vector> GroupedOptionGroup::get_help() const { + std::vector> helps; + for (auto& group : subgroups) { + // Get helps from all subgroups + auto subgroup_helps = group->get_help(); + // And put each collection on the end + std::copy(subgroup_helps.begin(), subgroup_helps.end(), std::back_inserter(helps)); + } + return helps; +} + +void GroupedOptionGroup::make_long_options(std::vector& dest) const { + for (auto& group : subgroups) { + group->make_long_options(dest); + } +} + +void GroupedOptionGroup::make_short_options(std::string& dest) const { + for (auto& group : subgroups) { + group->make_short_options(dest); + } +} + +void print_table(const std::vector>& rows, ostream& out) { + // Work out the max length of anything in the first column + size_t max_length = 0; + for (auto& r : rows) { + max_length = std::max(max_length, r.first.size()); + } + for (auto& r : rows) { + if (r.first.empty()) { + // It's a heading + out << r.second << std::endl; + } else { + // Print leading indent + out << " "; + // Print column 1 + out << r.first; + for (size_t i = 0; i < max_length - r.first.size(); i++) { + // Print padding to make all items the max length + out << " "; + } + // Print separator + out << " "; + // Print column 2 + out << r.second << std::endl; + } + } +} + +} +} diff --git a/src/subcommand/options.hpp b/src/subcommand/options.hpp new file mode 100644 index 00000000000..0c311db4cce --- /dev/null +++ b/src/subcommand/options.hpp @@ -0,0 +1,1090 @@ +#ifndef VG_SUBCOMMAND_OPTIONS_HPP_INCLUDED +#define VG_SUBCOMMAND_OPTIONS_HPP_INCLUDED + +/** + *\file + * options.hpp: option parser system. + * + * Make a BaseOptionGroup, and use add_group(heading) to add + * subgroups that apply parsed options to instances of a given class. + * + * Set up each option in the group with add_option(), add_range() (for an + * option that can be cycled through a range of values for a grid search), or + * add_flag(). Every option always has a long option name; short option + * character is optional and comes after it. Options take a pointer-to-member + * into the group's type (where the value will be ultimately written) and a + * default value, a help string, and an optional "validator function" which + * gets to check the parsed value and should raise std::domain_error with a + * complaint if the value isn't acceptable. + * + * Get the help with get_help() on the root group, and use the free + * print_table() function to print it to a stream with headings and automatic + * indentation. + * + * To parse options, use make_long_options() and make_short_options() to adjust + * some (possibly already partly populated) getopt_long() inputs, + * null-terminate the long options array, and call getopt_long() as normal. + * Show every option number and optarg value to the root group's parse(), and + * then continue with your own parsing if it returns false because it does not + * consume that option. + * + * To apply presets before parsing, make a Preset and fill it in with + * add_entry(option, value). Then call apply() on the preset with the root + * group. + * + * To read and write option values manually, use the get_option_value() and + * set_option_value() methods on the root option group.
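An illustrative aside on the three mechanisms the paragraphs above describe (presets, manual reads and manual writes), reusing the "count" option and the get_parser() sketch from the EXAMPLE further down; the values are hypothetical, not code from the repository:

auto parser = get_parser();              // as in the EXAMPLE below

// A preset sits under anything parsed from the command line but above the built-in default.
vg::subcommand::Preset quick;
quick.add_entry("count", 10);
quick.apply(parser);                     // count is now 10 unless --count was already parsed

// Manual access goes by long option name and is checked at runtime.
parser.set_option_value("count", 3);
int count = parser.get_option_value<int>("count");   // 3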
+ * + * To log option values, there is a print_options() method on the root option + * group. + * + * EXAMPLE + * + * struct ThingDoer { + * static constexpr int default_count = 5; + * int count = default_count; + * }; + * + * vg::subcommand::BaseOptionGroup get_parser() { + * vg::subcommand::BaseOptionGroup parser; + * auto thing_doer_opts = parser.add_group("thing doer configuration"); + * thing_doer_opts.add_option( + * "count", 'c', + * &ThingDoer::count, + * ThingDoer::default_count, + * "number of things to do", + * vg::subcommand::int_is_nonnegative + * ); + * return parser; + * } + * + * int main(int argc, char** argv) { + * auto parser = get_parser(); + * std::vector long_options; + * parser.make_long_options(long_options); + * long_options.push_back({0, 0, 0, 0}); + * std::string short_options; + * parser.make_short_options(short_options); + * int c; + * while (true) { + * int option_index = 0; + * int option_id = getopt_long (argc, argv, short_options.c_str(), + * &long_options[0], &option_index); + * if (option_id == -1) break; + * if (parser.parse(option_id, optarg)) continue; + * switch (option_id) { + * default: + * vg::subcommand::print_table(parser.get_help(), std::cerr); + * return 1; + * } + * } + * ThingDoer thing_doer; + * parser.apply(thing_doer); + * std::cout << "Doing " << thing_doer.count << " things" << std::endl; + * return 0; + * } + */ + +#include "../utility.hpp" + +#include +#include +#include +#include + +#include + +namespace vg { +namespace subcommand { + +/** + * Interface for things that form a "chain" that can be "ticked". + * + * Each link in the chain works like a digit place in a number, and ticking increments the number. + * This lets us do grid search over a bunch of values of different types without a bunch of nested loops. + */ +struct TickChainLink { + /// This will be called when we want to reset_chain what we are chained onto. + std::function reset_chain_parent = []() { + }; + /// This will be called when we need to tick_chain our parent + std::function tick_chain_parent = []() { + return false; + }; + + /// Reset the chain to its initial values. + virtual void reset_chain(); + + /// Tick the chain. Return true if there's still a value for the chain, and + /// false if the chain is out of values. + virtual bool tick_chain(); + + /// Add a thing to the chain after us. + /// Return that thing. + virtual TickChainLink& chain(TickChainLink& next); + + /// Get a function that runs another function for each combination of + /// values for this Range and all Ranges it has been chained onto. + virtual std::function&)> get_iterator(); +}; + +} +} + +namespace vg { +// TODO: If Range isn't in the vg namespace, then vg::parse<>'s specialization +// for it doesn't actually get treated as a specialization, and we don't +// instantiate the template when we need to, and we get linker errors because +// the linker can't find the instantiation of the template. Someday we should +// work out why and fix this. + +/** + * Tickable link that represents a single value or a range of values. + * Range runs from start to <=end, going up by step. + * You can set the range to a single value or to a full range, and when you read it you see the current value.
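Before the struct itself, a small illustration of the behaviour described above, using only members declared in this header (start/end/step, is_valid(), reset(), tick() and the conversion to the underlying number type); it is a fragment for orientation, assuming the usual iostream and cassert includes, not repository code:

vg::Range<int> grid;                             // one option value that can sweep a range
grid.start = 1; grid.end = 7; grid.step = 2;     // what parsing "1:7:2" would fill in
assert(grid.is_valid());
grid.reset();
do {
    int value = grid;                            // read the current grid point
    std::cout << value << std::endl;             // prints 1, 3, 5, 7
} while (grid.tick());

Chaining a second Range onto this one with chain() and driving both through get_iterator() is what turns the same mechanism into the grid search mentioned above.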
+ */ +template +struct Range : public subcommand::TickChainLink { + + // Expose the thing we are a range of + using type = Number; + + /// Represents the start of the range + Number start = 0; + /// Represents the inclusive end of the range + Number end = 0; + /// Represents the step to move by each tick + Number step = 1; + + /// Represents the current value the range is at + Number here = 0; + /// Determines if we are running or not (i.e. is here valid) + bool running = false; + + /// Default constructor + Range() { + // Nothing to do! + } + + /// Construct from a single value + Range(const Number& val): start(val), end(val) { + // Nothing to do! + } + + /// Copy, preserving destination links + Range(const Range& other): start(other.start), end(other.end), step(other.step) { + // Nothing to do + } + + /// Move, preserving destination links + Range(Range&& other): start(other.start), end(other.end), step(other.step) { + // Nothing to do + } + + /// Copy assignment, preserving destination links + Range& operator=(const Range& other) { + start = other.start; + end = other.end; + step = other.step; + return *this; + } + + /// Move assignment, preserving destination links + Range& operator=(Range&& other) { + start = other.start; + end = other.end; + step = other.step; + return *this; + } + + /// Check the range for usefulness + inline bool is_valid() { + if (start != end && step == 0) { + // We'll never make it + cerr << "Invalid range (no movement): " << start << " to " << end << " step " << step << endl; + return false; + } + + if (start > end && step > 0) { + // We're going the wrong way + cerr << "Invalid range (need to go down): " << start << " to " << end << " step " << step << endl; + return false; + } + + if (start < end && step < 0) { + // We're going the other wrong way + cerr << "Invalid range (need to go up): " << start << " to " << end << " step " << step << endl; + return false; + } + + return true; + } + + /// Convert to Number with the current value + operator Number() const { + if (running) { + return here; + } else { + return start; + } + } + + /// Start at our start value + void reset() { + here = start; + running = true; + } + + /// Start us and all the things we are chained onto at their start values + void reset_chain() { + reset(); + reset_chain_parent(); + } + + /// Increment our value. + /// Returns true if the new value needs processing, and false if we have left or would leave the range. + bool tick() { + if (here == end) { + // We are at the end + return false; + } + + here += step; + if ((step > 0 && here > end) || (step < 0 && here < end)) { + // We have passed the end (for things like double) + return false; + } + + return true; + } + + /// Increment our value. + /// If it overflows, tick_chain whatever we are chained onto, and reset and succeed if that succeeds. + bool tick_chain() { + if (tick()) { + // We could change + return true; + } else { + // We couldn't change. + if (tick_chain_parent()) { + // We have a parent we could advance. + reset(); + return true; + } else { + // Our parent couldn't advance either. + return false; + } + } + } +}; + +} + +namespace vg { + +// Define a way to test if a type is an instantiation of a template on a type +// See https://stackoverflow.com/a/25803794 + +// In general, things aren't instantiations of things +template class Predicate> +struct is_instantiation_of : std::false_type { +}; + +// Except things that are instantiations of things with some arguments +template class Predicate, class... 
PredicateArgs> +struct is_instantiation_of, Predicate> : std::true_type { +}; + +/// Parse a range as start[:end[:step]] +template +inline bool parse(const string& arg, typename enable_if::value, Result>::type& dest) { + + auto colon1 = arg.find(':'); + + if (colon1 == string::npos) { + // No colons here. Parse one number. + if (!parse(arg, dest.start)) { + return false; + } + dest.end = dest.start; + dest.step = 0; + return dest.is_valid(); + } else if (colon1 == arg.size()) { + // Can't end in a colon + return false; + } else { + // Look for another colon + auto colon2 = arg.find(':', colon1 + 1); + if (colon2 == string::npos) { + // Just a range of two things + if (!parse(arg.substr(0, colon1), dest.start)) { + return false; + } + if (!parse(arg.substr(colon1 + 1), dest.end)) { + return false; + } + dest.step = 1; + return dest.is_valid(); + } else if (colon2 == arg.size()) { + // Can't end in a colon + return false; + } else { + // We have 3 numbers + if (!parse(arg.substr(0, colon1), dest.start)) { + return false; + } + if (!parse(arg.substr(colon1 + 1, colon2 - colon1 - 1), dest.end)) { + return false; + } + if (!parse(arg.substr(colon2 + 1), dest.step)) { + return false; + } + + return dest.is_valid(); + } + } +} + +} + +namespace vg { +namespace subcommand { + +/// Get a new unique option ID. +int get_option_id(); + +/** + * Get a string "metavar" placeholder for a command line option, appropriate to its type. + */ +template +const char* get_metavar(); + +template<> +const char* get_metavar(); + +template<> +const char* get_metavar(); + +template<> +const char* get_metavar(); + +template<> +const char* get_metavar(); + +template<> +const char* get_metavar(); + +template<> +const char* get_metavar(); + +/** + * Represents an option being set to a value. Base interface. + */ +struct BaseValuation { + /// Make a new BaseValuation for the given option + BaseValuation(const std::string& option); + virtual ~BaseValuation() = default; + + /// Long option to give a value to + std::string option; +}; + +/** + * Represents an option being set to a value. Actually has the value. + */ +template +struct Valuation : public BaseValuation { + /// Make a preset entry that sets the given long option to the given value. + Valuation(const std::string& option, const T& value) : BaseValuation(option), value(value) { + // Nothing to do + } + + virtual ~Valuation() = default; + + /// Value for the option + T value; +}; + +/// Function type used to validate arguments. Throw std::domain_error if not allowed, explaining why. +template +using ValidatorFunction = std::function; + +/// Validate that a double is positive, or throw std::domain_error +extern const ValidatorFunction double_is_positive; + +/// Validate that a double is not negative, or throw std::domain_error +extern const ValidatorFunction double_is_nonnegative; + +/// Validate that a size_t is not zero, or throw std::domain_error +extern const ValidatorFunction size_t_is_nonzero; + +/// Validate that an int is not negative, or throw std::domain_error; +extern const ValidatorFunction int_is_nonnegative; + +/** + * Interface for a command-line argument that goes into a field on an object of + * the given type. + */ +template +struct BaseArgSpec : public TickChainLink { + /// Make an option with a long and short option name + BaseArgSpec(const std::string& option, char short_option, const std::string& help) : option(option), help(help), short_option(short_option), option_id(short_option != '\0' ? 
short_option : get_option_id()) { + // Nothing to do + } + /// Make an option with a long option name only + BaseArgSpec(const std::string& option, const std::string& help) : BaseArgSpec(option, '\0', help) { + // Nothing to do + } + virtual ~BaseArgSpec() = default; + + /// Parse the argument's value from the command line. + /// Throws std::domain_error if validation fails. + virtual void parse(const char* optarg) = 0; + /// Apply a preset item, or fail if it doesn't match. + /// The preset value will sit under any parsed value but above the default. + virtual void preset(const BaseValuation& entry) = 0; + /// Apply a valuation, or fail if it doesn't match. + /// The value will replace any parsed value! + /// Validation will not be run! + virtual void set(const BaseValuation& entry) = 0; + /// Put our current effective value into the given BaseValuation, which + /// must be for the right option and have the right type. + virtual void query(BaseValuation& entry) const = 0; + /// Apply the value to the right field of the given object. + virtual void apply(Receiver& receiver) const = 0; + /// Print value to the given stream after the given separator. + virtual void print_value(ostream& out, const char* sep = "") const = 0; + /// Print value metavar placeholder to the given stream after the given separator. + virtual void print_metavar(ostream& out, const char* sep = "") const = 0; + /// Print default value to the given stream, if appropriate. + virtual void print_default(ostream& out) const = 0; + /// Print option and value to the given stream, without newlines, between the given separators. + /// If slug is set, use short option if available and don't include spaces. + virtual void print(ostream& out, const char* sep = "", const char* after = "", bool slug = false) const { + out << sep; + if (slug && short_option != '\0') { + out << "-" << short_option; + } else { + out << "--" << option; + } + this->print_value(out, slug ? "" : " "); + out << after; + } + /// Get the getopt structure for this option. Option must outlive it and not move. + virtual struct option get_option_struct() const = 0; + + /// Name of the option (long opt) + std::string option; + /// Help for the option + std::string help; + /// Character of the option (short opt), or 0 + char short_option; + /// Int value to represent the option + int option_id; +}; + +/** + * Interface for a command-line argument that corresponds to a value of a given type. + * Storage method is left to be implemented by inheritor. + */ +template +struct ArgSpec : public BaseArgSpec { + /// Make an option with a long and short option name + ArgSpec(const std::string& option, char short_option, T Receiver::*dest, const T& default_value, const std::string& help, const ValidatorFunction& validator) : BaseArgSpec(option, short_option, help), dest(dest), default_value(default_value), validator(validator) { + // Nothing to do! + } + /// Make an option with a long option name only + ArgSpec(const std::string& option, T Receiver::*dest, const T& default_value, const std::string& help, const ValidatorFunction& validator) : ArgSpec(option, '\0', dest, default_value, help, validator) { + // Nothing to do! + } + + virtual ~ArgSpec() = default; + + /// Allow setting our stored value + virtual void set_value(const T& value) = 0; + /// And getting our current effective value + virtual T get_value() const = 0; + /// Return true if a value has been set from parsing or a preset. 
+ virtual bool was_set() const = 0; + + virtual void preset(const BaseValuation& entry) { + // Needs to be a preset for the right option + assert(entry.option == this->option); + const Valuation* as_typed = dynamic_cast*>(&entry); + if (as_typed) { + if (!this->was_set()) { + // Apply the preset value, if nothing is set yet. + this->set_value(as_typed->value); + } + } else { + throw std::runtime_error("Could not cast valuation for " + this->option + " from " + typeid(&entry).name() + " to " + typeid(Valuation*).name()); + } + } + + virtual void set(const BaseValuation& entry) { + // Needs to be for the right option + assert(entry.option == this->option); + const Valuation* as_typed = dynamic_cast*>(&entry); + if (as_typed) { + // Apply the value + this->set_value(as_typed->value); + } else { + throw std::runtime_error("Could not cast valuation for " + this->option + " from " + typeid(&entry).name() + " to " + typeid(Valuation*).name()); + } + } + + virtual void query(BaseValuation& entry) const { + // Needs to be a valuation for the right option + assert(entry.option == this->option); + Valuation* as_typed = dynamic_cast*>(&entry); + if (as_typed) { + // Put our value in there. + as_typed->value = this->get_value(); + } else { + throw std::runtime_error("Could not cast valuation for " + this->option + " from " + typeid(&entry).name() + " to " + typeid(Valuation*).name()); + } + } + + /// Field in the receiving type we set. + T Receiver::*dest; + /// Original default value. + T default_value; + /// Function to check value with + ValidatorFunction validator; +}; + +/** + * Definition structure for normal value-having options. Lets you specify + * storage type for the actual value. + */ +template +struct ValueArgSpec : public ArgSpec { + /// Make an option with a long and short option name + ValueArgSpec(const std::string& option, char short_option, T Receiver::*dest, const T& default_value, const std::string& help, const ValidatorFunction& validator) : ArgSpec(option, short_option, dest, default_value, help, validator), value(default_value) { + // Nothing to do + } + /// Make an option with a long option name only + ValueArgSpec(const std::string& option, T Receiver::*dest, const T& default_value, const std::string& help, const ValidatorFunction& validator) : ValueArgSpec(option, '\0', dest, default_value, help, validator) { + // Nothing to do + } + virtual ~ValueArgSpec() = default; + + virtual void set_value(const T& replacement) { + // We assume the holder supports assignment. + this->value = replacement; + // Remember we got a value applied. Presets shouldn't clobber it. 
+ this->value_set = true; + } + + virtual T get_value() const { + return this->value; + } + + virtual bool was_set() const { + return this->value_set; + } + + virtual void parse(const char* optarg) { + try { + if (!optarg) { + // Protect against nulls + throw std::domain_error("requires a value"); + } + + this->value = vg::parse(optarg); + this->validator(this->value); + this->value_set = true; + } catch (std::domain_error& e) { + cerr << "error: option "; + if (this->short_option) { + cerr << "-" << this->short_option << "/"; + } + cerr << "--" << this->option << " "; + cerr << e.what() << endl; + exit(1); + } + } + + virtual void apply(Receiver& receiver) const { + receiver.*(this->dest) = value; + } + virtual void print_metavar(ostream& out, const char* sep = "") const { + out << sep << get_metavar(); + } + virtual void print_value(ostream& out, const char* sep = "") const { + out << sep; + if (std::is_integral::value) { + // Looks like a char, so print it as a number. + // See + out << +value; + } else { + out << value; + } + } + virtual void print_default(ostream& out) const { + out << " ["; + if (std::is_integral::value) { + // Looks like a char, so print it as a number. + // See + out << +(this->default_value); + } else { + out << this->default_value; + } + out << "]"; + } + virtual struct option get_option_struct() const { + return {this->option.c_str(), required_argument, 0, this->option_id}; + } + + + Holder value; + bool value_set = false; +}; + +/** + * Definition structure for value-having options that can run through a range. + */ +template +struct RangeArgSpec : public ValueArgSpec> { + using Holder = Range; + + using ValueArgSpec>::ValueArgSpec; + virtual ~RangeArgSpec() = default; + + virtual TickChainLink& chain(TickChainLink& next) { + // Wire our value range into the chain. + TickChainLink::chain(this->value); + this->value.chain(next); + return next; + } +}; + +/** + * Definition structure for flag options that flip a default value. + */ +template +struct FlagArgSpec : public ValueArgSpec { + using T = bool; + using Holder = T; + + using ValueArgSpec::ValueArgSpec; + virtual ~FlagArgSpec() = default; + + virtual void parse(const char* optarg) { + // When parsing, flip stored default. + this->set_value(!this->default_value); + } + virtual void print_metavar(ostream& out, const char* sep = "") const { + // Don't do anything + } + virtual void print_value(ostream& out, const char* sep = "") const { + // Don't do anything + } + virtual void print_default(ostream& out) const { + // Don't do anything + } + virtual void print(ostream& out, const char* sep = "", const char* after = "", bool slug = false) const { + // Override print to just print the flag when used + if (this->value != this->default_value) { + out << sep; + if (slug && this->short_option != '\0') { + out << "-" << this->short_option; + } else { + out << "--" << this->option; + } + out << after; + } + } + virtual struct option get_option_struct() const { + return {this->option.c_str(), no_argument, 0, this->option_id}; + } +}; + +/** + * Represents a set of command-line options. + */ +struct BaseOptionGroup : public TickChainLink { + + virtual ~BaseOptionGroup() = default; + + /// Parse the given option ID, with the given value if needed. + /// Return true if we matched the ID, and false otherwise. + virtual bool parse(int option_id, const char* optarg) = 0; + + /// Apply a preset value to its option. Returns true if it was found, and + /// false otherwies. 
+ virtual bool preset(const BaseValuation& entry) = 0; + + /// Apply a value to its option. Returns true if it was found, and false + /// otherwise. + virtual bool set(const BaseValuation& entry) = 0; + + /// Fill in entry with the value of the corresponding option, if we have + /// that option. If so, return true. + virtual bool query(BaseValuation& entry) const = 0; + + /// Print all options set. + /// By default, prints one option per line. + /// If slug is set, prints short options, all on one line. + virtual void print_options(ostream& out, bool slug = false) const = 0; + + /// Get help, in the form of pairs of options and descriptions. + /// Headings are descriptions without options. + virtual std::vector> get_help() const = 0; + + /// Add options to non-null-terminated input for getopt_long + virtual void make_long_options(std::vector& dest) const = 0; + + /// Add options to string input for getopt_long + virtual void make_short_options(std::string& dest) const = 0; + + /// Allow the user to query an option value by name. + /// Would be simpler if we could override template methods but we can't. + template + T get_option_value(const std::string& option) const { + Valuation question(option, T()); + bool found = this->query(question); + if (!found) { + throw std::runtime_error("Undefined option: " + option); + } + return question.value; + } + + /// Allow the user to manually set an option value + template + void set_option_value(const std::string& option, const T& value) { + Valuation setter(option, value); + bool found = this->set(setter); + if (!found) { + throw std::runtime_error("Undefined option: " + option); + } + } +}; + +/** + * Represents a set of command-line options that can be applied to an object. + * Internal values can be ranges that can be ticked. + * Comes with a heading. + */ +template +struct OptionGroup : public BaseOptionGroup { + + virtual ~OptionGroup() = default; + + /// Make an option group with the given heading. + OptionGroup(const std::string& heading) : heading(heading) { + // Nothing to do! + } + + /// Chain through all options + virtual TickChainLink& chain(TickChainLink& next) { + if (args.empty()) { + // Just chain through + return TickChainLink::chain(next); + } else { + // Chain us to first arg, and last arg to next. + TickChainLink::chain(*args.front()); + args.back()->chain(next); + return next; + } + } + + // We need to take default_value by value, and not by reference, because we + // often want to pass stuff that is constexpr and trying to use a reference + // will make us try to link against it. + // TODO: C++17 fixes this, so fix when we use that. + + /// Add a new option that goes to the given field, with the given default. + template> + void add_option(const std::string& name, char short_option, T Receiver::*dest, T default_value, const std::string& help, const ValidatorFunction& validator = [](const T& ignored) {}) { + args.emplace_back(new Spec(name, short_option, dest, default_value, help, validator)); + if (args.size() > 1) { + // Chain onto previous option + args[args.size() - 2]->chain(*args[args.size() - 1]); + } + // Index it by option ID + id_to_index.emplace(args[args.size() - 1]->option_id, args.size() - 1); + // And option name + option_to_index.emplace(args[args.size() - 1]->option, args.size() - 1); + } + + /// Add a new option that goes to the given field, with the given default.
+ template> + void add_option(const std::string& name, T Receiver::*dest, T default_value, const std::string& help, const ValidatorFunction& validator = [](const T& ignored) {}) { + add_option(name, '\0', dest, default_value, help, validator); + } + + /// Add a new option that handles range values + template + void add_range(const std::string& name, char short_option, T Receiver::*dest, T default_value, const std::string& help, const ValidatorFunction& validator = [](const T& ignored) {}) { + add_option>(name, short_option, dest, default_value, help, validator); + } + /// Add a new option that handles range values + template + void add_range(const std::string& name, T Receiver::*dest, T default_value, const std::string& help, const ValidatorFunction& validator = [](const T& ignored) {}) { + add_range(name, '\0', dest, default_value, help, validator); + } + + /// Add a new option that is a boolean flag + void add_flag(const std::string& name, char short_option, bool Receiver::*dest, bool default_value, const std::string& help, const ValidatorFunction& validator = [](const bool& ignored) {}) { + add_option>(name, short_option, dest, default_value, help, validator); + } + /// Add a new option that is a boolean flag + void add_flag(const std::string& name, bool Receiver::*dest, bool default_value, const std::string& help, const ValidatorFunction& validator = [](const bool& ignored) {}) { + add_flag(name, '\0', dest, default_value, help, validator); + } + + /// Parse the given option ID, with the given value if needed. + /// Return true if we matched the ID, and false otherwise. + virtual bool parse(int option_id, const char* optarg) { + auto found = id_to_index.find(option_id); + if (found != id_to_index.end()) { + // We have this option, so parse. + args.at(found->second)->parse(optarg); + return true; + } else { + // We don't have this option, maybe someone else does. + return false; + } + } + + virtual bool preset(const BaseValuation& entry) { + auto found = option_to_index.find(entry.option); + if (found != option_to_index.end()) { + // We have this option, so assign the preset. + args.at(found->second)->preset(entry); + return true; + } else { + // We don't have this option, maybe someone else does. + return false; + } + } + + virtual bool set(const BaseValuation& entry) { + auto found = option_to_index.find(entry.option); + if (found != option_to_index.end()) { + // We have this option, so assign the preset. + args.at(found->second)->set(entry); + return true; + } else { + // We don't have this option, maybe someone else does. + return false; + } + } + + virtual bool query(BaseValuation& entry) const { + auto found = option_to_index.find(entry.option); + if (found != option_to_index.end()) { + // We have this option, so get its value. + args.at(found->second)->query(entry); + return true; + } else { + // We don't have this option, maybe someone else does. + return false; + } + } + + /// Print all options set, one per line + virtual void print_options(ostream& out, bool slug = false) const { + if (slug) { + for (auto& arg : args) { + // Print unseparated short options + arg->print(out, "", "", true); + } + } else { + for (auto& arg : args) { + // Print long options, one per line + arg->print(out, "", "\n"); + } + } + } + + /// Apply all flags to the receiver + void apply(Receiver& receiver) const { + for (auto& arg : args) { + arg->apply(receiver); + } + } + + /// Get help, in the form of pairs of options and descriptions. + /// Headings are descriptions without options. 
+ virtual std::vector> get_help() const { + std::vector> to_return; + to_return.reserve(args.size() + 1); + + // Put the heading + to_return.emplace_back("", heading + ":"); + + for (auto& arg : args) { + // Show the option + std::stringstream opt_stream; + if (arg->short_option) { + opt_stream << "-" << arg->short_option << ", "; + } + opt_stream << "--" << arg->option; + arg->print_metavar(opt_stream, " "); + + // Show the help text with default value + std::stringstream help_stream; + help_stream << arg->help; + arg->print_default(help_stream); + + to_return.emplace_back(opt_stream.str(), help_stream.str()); + } + + return to_return; + } + + /// Add to non-null-terminated input for getopt_long + virtual void make_long_options(std::vector& dest) const { + dest.reserve(dest.size() + args.size()); + for (auto& arg : args) { + // Collect them from all the options + dest.emplace_back(arg->get_option_struct()); + } + } + + /// Add to string input for getopt_long + virtual void make_short_options(std::string& dest) const { + for (auto& arg : args) { + struct option long_spec = arg->get_option_struct(); + if (long_spec.val < std::numeric_limits::max()) { + // This has a short option. Encode the short option string. + dest.push_back(long_spec.val); + switch (long_spec.has_arg) { + case optional_argument: + dest.push_back(':'); + // Fall-through + case required_argument: + dest.push_back(':'); + // Fall-through + case no_argument: + break; + } + } + } + } + + /// Heading we will appear under in the help. + std::string heading; + /// Holds the argument definitions and parsing destinations + std::vector>> args; + /// Map from option ID to option index + std::unordered_map id_to_index; + /// Map from long option to option index, to allow applying presets. + std::unordered_map option_to_index; +}; + +/** + * Represents a group of groups of options. + * + * Also doubles as the main parser type; you can make one of these and populate + * it with subgroups and options, and then use get_help() and print_table() to + * do help, and make_long_options(), make_short_options(), and parse() to parse + * with getopt_long(), and then you can apply() the options to objects they + * eventually belong in. + */ +struct GroupedOptionGroup : public BaseOptionGroup { + + // We can't copy because we contain unique_ptr values + GroupedOptionGroup() = default; + GroupedOptionGroup(const GroupedOptionGroup& other) = delete; + GroupedOptionGroup& operator=(GroupedOptionGroup& other) = delete; + GroupedOptionGroup(GroupedOptionGroup&& other) = default; + GroupedOptionGroup& operator=(GroupedOptionGroup&& other) = default; + virtual ~GroupedOptionGroup() = default; + + /// Create a new child group with a new heading, which we can add options + /// to. + template + OptionGroup& add_group(const std::string& heading) { + OptionGroup* new_group = new OptionGroup(heading); + subgroups.emplace_back(new_group); + if (subgroups.size() > 1) { + // Chain the groups + subgroups[subgroups.size() - 2]->chain(*subgroups[subgroups.size() - 1]); + } + return *new_group; + } + + /// Apply all options that go on an object of this type to the given object. 
+ template + void apply(Receiver& receiver) { + for (auto& group : subgroups) { + OptionGroup* as_relevant_leaf = dynamic_cast*>(group.get()); + if (as_relevant_leaf) { + // This is a group that cares about this type + as_relevant_leaf->apply(receiver); + } + GroupedOptionGroup* as_internal_node = dynamic_cast(group.get()); + if (as_internal_node) { + // This is a group that has child groups + as_internal_node->apply(receiver); + } + } + } + + /// Chain through all subgroups + virtual TickChainLink& chain(TickChainLink& next); + + virtual bool parse(int option_id, const char* optarg); + + virtual bool preset(const BaseValuation& entry); + + virtual bool set(const BaseValuation& entry); + + virtual bool query(BaseValuation& entry) const; + + virtual void print_options(ostream& out, bool slug = false) const; + + virtual std::vector> get_help() const; + + virtual void make_long_options(std::vector& dest) const; + + virtual void make_short_options(std::string& dest) const; + + /// Holds all the child groups of options + std::vector> subgroups; +}; + +/** + * Represents a named preset of command-line option default overrides. Options + * are organized by long option name. + * + * Make one of these, and use add_entry() to add values to it, and then use + * apply(root_option_group) to apply it. + */ +struct Preset { + /// As part of this preset, set the given option to the given value. + template + Preset& add_entry(const std::string& option, const T& value) { + Valuation* entry = new Valuation(option, value); + entries.emplace_back(entry); + return *this; + } + + /// Apply stored presets to the given parser + void apply(BaseOptionGroup& parser) const { + for (auto& entry : entries) { + // Apply the entry + bool applied = parser.preset(*entry); + // Make sure it worked + assert(applied); + } + } + + std::vector> entries; +}; + +/** + * Print a table of rows, with each column starting at the same character on the line. + * + * Prints the help from get_help() on an option parsing group in a nice way. 
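As the implementation earlier in this diff shows, a row whose first member is empty is treated as a heading and printed flush left, while every other row is indented and padded so the descriptions line up. A short illustrative call with made-up rows:

std::vector<std::pair<std::string, std::string>> rows = {
    {"", "thing doer configuration:"},                      // heading row: empty first column
    {"-c, --count INT", "number of things to do [5]"},
    {"--dry-run", "plan but do not act"}
};
vg::subcommand::print_table(rows, std::cerr);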
+ */ +void print_table(const std::vector>& rows, ostream& out); + + +} +} + +#endif diff --git a/src/subcommand/pack_main.cpp b/src/subcommand/pack_main.cpp index 452aad14238..bc0162ab748 100644 --- a/src/subcommand/pack_main.cpp +++ b/src/subcommand/pack_main.cpp @@ -1,8 +1,12 @@ #include "subcommand.hpp" #include "../vg.hpp" +#include "../xg.hpp" #include "../utility.hpp" #include "../packer.hpp" -#include "../stream.hpp" +#include +#include +#include +#include #include #include @@ -13,26 +17,43 @@ using namespace vg::subcommand; void help_pack(char** argv) { cerr << "usage: " << argv[0] << " pack [options]" << endl << "options:" << endl - << " -x, --xg FILE use this basis graph" << endl + << " -x, --xg FILE use this basis graph (any format accepted, does not have to be xg)" << endl << " -o, --packs-out FILE write compressed coverage packs to this output file" << endl << " -i, --packs-in FILE begin by summing coverage packs from each provided FILE" << endl - << " -g, --gam FILE read alignments from this file (could be '-' for stdin)" << endl + << " -g, --gam FILE read alignments from this GAM file (could be '-' for stdin)" << endl + << " -a, --gaf FILE read alignments from this GAF file (could be '-' for stdin)" << endl << " -d, --as-table write table on stdout representing packs" << endl + << " -D, --as-edge-table write table on stdout representing edge coverage" << endl + << " -u, --as-qual-table write table on stdout representing average node mapqs" << endl << " -e, --with-edits record and write edits rather than only recording graph-matching coverage" << endl << " -b, --bin-size N number of sequence bases per CSA bin [default: inf]" << endl + << " -n, --node ID write table for only specified node(s)" << endl + << " -N, --node-list FILE a white space or line delimited list of nodes to collect" << endl + << " -Q, --min-mapq N ignore reads with MAPQ < N and positions with base quality < N [default: 0]" << endl + << " -c, --expected-cov N expected coverage. used only for memory tuning [default : 128]" << endl + << " -s, --trim-ends N ignore the first and last N bases of each read" << endl << " -t, --threads N use N threads (defaults to numCPUs)" << endl; } + int main_pack(int argc, char** argv) { string xg_name; vector packs_in; string packs_out; string gam_in; + string gaf_in; bool write_table = false; - int thread_count = 1; + bool write_edge_table = false; + bool write_qual_table = false; bool record_edits = false; size_t bin_size = 0; + vector node_ids; + string node_list_file; + int min_mapq = 0; + int min_baseq = 0; + size_t expected_coverage = 128; + int trim_ends = 0; if (argc == 2) { help_pack(argv); @@ -49,15 +70,23 @@ int main_pack(int argc, char** argv) { {"packs-out", required_argument,0, 'o'}, {"count-in", required_argument, 0, 'i'}, {"gam", required_argument, 0, 'g'}, + {"gaf", required_argument, 0, 'a'}, {"as-table", no_argument, 0, 'd'}, + {"as-edge-table", no_argument, 0, 'D'}, + {"as-qual-table", no_argument, 0, 'u'}, {"threads", required_argument, 0, 't'}, {"with-edits", no_argument, 0, 'e'}, + {"node", required_argument, 0, 'n'}, + {"node-list", required_argument, 0, 'N'}, {"bin-size", required_argument, 0, 'b'}, + {"min-mapq", required_argument, 0, 'Q'}, + {"expected-cov", required_argument, 0, 'c'}, + {"trim-ends", required_argument, 0, 's'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hx:o:i:g:dt:eb:", + c = getopt_long (argc, argv, "hx:o:i:g:a:dDut:eb:n:N:Q:c:s:", long_options, &option_index); // Detect the end of the options. 
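A note on the new -c/--expected-cov option introduced in the help text above (default 128): it only affects memory layout. A later hunk derives a per-position data width from it, and its comment describes the heuristic as counting the bits needed to store double the expected coverage. The actual formula lives in Packer::estimate_data_width and is not shown in this diff; the sketch below only works through that description for the default value and should not be read as the implementation:

#include <cstddef>

// Bits needed to represent values up to twice the expected coverage,
// per the comment accompanying the estimate_data_width call below.
static size_t bits_for_double_coverage(size_t expected_coverage) {
    size_t limit = 2 * expected_coverage;   // 256 for the default of 128
    size_t bits = 0;
    while (limit > 0) {
        ++bits;
        limit >>= 1;
    }
    return bits;                            // 9 for the default
}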
@@ -83,84 +112,161 @@ int main_pack(int argc, char** argv) { case 'g': gam_in = optarg; break; + case 'a': + gaf_in = optarg; + break; case 'd': write_table = true; break; + case 'D': + write_edge_table = true; + break; + case 'u': + write_qual_table = true; + break; case 'e': record_edits = true; break; case 'b': bin_size = atoll(optarg); - break; + break; case 't': - thread_count = parse(optarg); + { + int num_threads = parse(optarg); + if (num_threads <= 0) { + cerr << "error:[vg pack] Thread count (-t) set to " << num_threads << ", must set to a positive integer." << endl; + exit(1); + } + omp_set_num_threads(num_threads); + break; + } + case 'n': + node_ids.push_back(parse(optarg)); + break; + case 'N': + node_list_file = optarg; + break; + case 'Q': + min_mapq = parse(optarg); + min_baseq = min_mapq; + break; + case 'c': + expected_coverage = parse(optarg); + break; + case 's': + trim_ends = parse(optarg); break; - default: abort(); } } - omp_set_num_threads(thread_count); - - xg::XG xgidx; + unique_ptr handle_graph; + HandleGraph* graph = nullptr; if (xg_name.empty()) { - cerr << "No XG index given. An XG index must be provided." << endl; + cerr << "error [vg pack]: No basis graph given. One must be provided with -x." << endl; exit(1); } else { - ifstream in(xg_name.c_str()); - xgidx.load(in); + handle_graph = vg::io::VPKG::load_one(xg_name); } + bdsg::VectorizableOverlayHelper overlay_helper; + graph = dynamic_cast(overlay_helper.apply(handle_graph.get())); - // todo one packer per thread and merge + if (gam_in.empty() && packs_in.empty() && gaf_in.empty()) { + cerr << "error [vg pack]: Input must be provided with -g, -a or -i" << endl; + exit(1); + } - vg::Packer packer(&xgidx, bin_size); + if (!gam_in.empty() && !gaf_in.empty()) { + cerr << "error [vg pack]: -g cannot be used with -a" << endl; + exit(1); + } + + if (packs_out.empty() && write_table == false && write_edge_table == false && write_qual_table == false) { + cerr << "error [vg pack]: Output must be selected with -o, -d or -D" << endl; + exit(1); + } + + // process input node list + if (!node_list_file.empty()) { + ifstream nli; + nli.open(node_list_file); + if (!nli.good()){ + cerr << "[vg pack] error, unable to open the node list input file." 
<< endl; + exit(1); + } + string line; + while (getline(nli, line)){ + for (auto& idstr : split_delims(line, " \t")) { + node_ids.push_back(parse(idstr.c_str())); + } + } + nli.close(); + } + + // get a data width from our expected coverage, using simple heuristic of counting + // bits needed to store double the coverage + size_t data_width = Packer::estimate_data_width(expected_coverage); + + // use some naive heuristics to come up with bin count and batch size based on thread count + // more bins: finer grained parallelism at cost of more mutexes and allocations + // bigger batch size: more robustness to sorted input at cost of less parallelism + size_t num_threads = get_thread_count(); + size_t batch_size = Packer::estimate_batch_size(num_threads); + size_t bin_count = Packer::estimate_bin_count(num_threads); + + // create our packer + Packer packer(graph, true, true, record_edits, true, bin_size, bin_count, data_width); + + // todo one packer per thread and merge if (packs_in.size() == 1) { packer.load_from_file(packs_in.front()); } else if (packs_in.size() > 1) { packer.merge_from_files(packs_in); } + std::function lambda = [&packer,&min_mapq,&min_baseq,&trim_ends](Alignment& aln) { + packer.add(aln, min_mapq, min_baseq, trim_ends); + }; + if (!gam_in.empty()) { - vector packers; - if (thread_count == 1) { - packers.push_back(&packer); - } else { - for (size_t i = 0; i < thread_count; ++i) { - packers.push_back(new Packer(&xgidx, bin_size)); - } - } - std::function lambda = [&packer,&record_edits,&packers](Alignment& aln) { - packers[omp_get_thread_num()]->add(aln, record_edits); + get_input_file(gam_in, [&](istream& in) { + vg::io::for_each_parallel(in, lambda, batch_size); + }); + } else if (!gaf_in.empty()) { + // we use this interface so we can ignore sequence, which takes a lot of time to parse + // and is unused by pack + function node_to_length = [&graph](nid_t node_id) { + return graph->get_length(graph->get_handle(node_id)); }; - if (gam_in == "-") { - stream::for_each_parallel(std::cin, lambda); - } else { - ifstream gam_stream(gam_in); - stream::for_each_parallel(gam_stream, lambda); - gam_stream.close(); - } - if (thread_count == 1) { - packers.clear(); - } else { - packer.merge_from_dynamic(packers); - for (auto& p : packers) { - delete p; - } - packers.clear(); - } + function node_to_sequence = [&graph](nid_t node_id, bool is_reversed) { + return graph->get_sequence(graph->get_handle(node_id, is_reversed)); + }; + + // computed batch size was tuned for GAM performance. some small tests show that + // gaf benefits from a slightly larger one. + vg::io::gaf_unpaired_for_each_parallel(node_to_length, record_edits ? 
node_to_sequence : nullptr, + gaf_in, lambda, batch_size * 4); } if (!packs_out.empty()) { packer.save_to_file(packs_out); } - if (write_table) { + if (write_table || write_edge_table || write_qual_table) { packer.make_compact(); - packer.as_table(cout, record_edits); + if (write_table) { + packer.as_table(cout, record_edits, node_ids); + } + if (write_edge_table) { + packer.as_edge_table(cout, node_ids); + } + if (write_qual_table) { + packer.as_quality_table(cout, node_ids); + } } return 0; } // Register subcommand -static Subcommand vg_pack("pack", "convert alignments to a compact coverage, edit, and path index", main_pack); +static Subcommand vg_pack("pack", "convert alignments to a compact coverage index", PIPELINE, 9, main_pack); diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index cf4651a6fbf..dd8589947a1 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -8,14 +8,18 @@ #include #include +#include #include #include "subcommand.hpp" #include "../vg.hpp" #include "../xg.hpp" -#include "../stream.hpp" -#include +#include "../gbwt_helper.hpp" +#include +#include +#include +#include using namespace std; using namespace vg; @@ -25,18 +29,62 @@ void help_paths(char** argv) { cerr << "usage: " << argv[0] << " paths [options]" << endl << "options:" << endl << " input:" << endl - << " -v, --vg FILE use the graph in this vg FILE" << endl - << " -x, --xg FILE use the graph in the XG index FILE" << endl - << " -g, --gbwt FILE use the GBWT index in FILE" << endl - << " inspection:" << endl - << " -X, --extract-gam return (as GAM alignments) the stored paths in the graph" << endl - << " -V, --extract-vg return (as path-only .vg) the queried paths (requires -x -g and -q or -Q)" << endl - << " -L, --list return (as a list of names, one per line) the path (or thread) names" << endl - << " -T, --threads operate on threads instead of paths (requires GBWT)" << endl - << " -q, --threads-by STR operate on threads with the given prefix instead of paths (requires GBWT)" << endl - << " -Q, --paths-by STR return the paths with the given prefix" << endl; + << " -x, --xg FILE use the paths and haplotypes in this graph FILE. Supports GBZ haplotypes." <& graph_emitter) { + size_t chunk_size = 10000; + + for (size_t start = 0; start < path.mapping_size(); start += chunk_size) { + // Make sure to chunk. + // TODO: Can we avoild a copy here somehow? 
+ Path chunk; + chunk.set_name(path.name()); + chunk.set_is_circular(path.is_circular()); + + for (size_t i = 0; i < chunk_size && start + i < path.mapping_size(); i++) { + // Copy over this batch of mappings + *chunk.add_mapping() = path.mapping(start + i); + } + + // Emit a graph chunk containing htis part of the path + Graph g; + *(g.add_path()) = std::move(chunk); + graph_emitter.write(std::move(g)); + } +} + +// TODO: promote this to libhandlegraph +unordered_map SENSE_TO_STRING { + {PathSense::REFERENCE, "REFERENCE"}, + {PathSense::GENERIC, "GENERIC"}, + {PathSense::HAPLOTYPE, "HAPLOTYPE"} +}; + int main_paths(int argc, char** argv) { if (argc == 2) { @@ -45,14 +93,31 @@ int main_paths(int argc, char** argv) { } bool extract_as_gam = false; + bool extract_as_gaf = false; bool extract_as_vg = false; bool list_names = false; - string xg_file; - string vg_file; + bool extract_as_fasta = false; + bool drop_paths = false; + bool retain_paths = false; + string graph_file; string gbwt_file; - string thread_prefix; string path_prefix; - bool extract_threads = false; + string sample_name; + string path_file; + bool select_alt_paths = false; + // What kinds of paths are we interested in? + unordered_set path_senses { + PathSense::REFERENCE, + PathSense::GENERIC, + PathSense::HAPLOTYPE + }; + bool list_lengths = false; + bool list_metadata = false; + bool list_cyclicity = false; + size_t output_formats = 0, selection_criteria = 0; + size_t input_formats = 0; + bool coverage = false; + const size_t coverage_bins = 10; int c; optind = 2; // force optind past command positional argument @@ -63,18 +128,32 @@ int main_paths(int argc, char** argv) { {"vg", required_argument, 0, 'v'}, {"xg", required_argument, 0, 'x'}, {"gbwt", required_argument, 0, 'g'}, - {"extract-gam", no_argument, 0, 'X'}, {"extract-vg", no_argument, 0, 'V'}, + {"drop-paths", no_argument, 0, 'd'}, + {"retain-paths", no_argument, 0, 'r'}, + {"extract-gam", no_argument, 0, 'X'}, + {"extract-gaf", no_argument, 0, 'A'}, {"list", no_argument, 0, 'L'}, - {"max-length", required_argument, 0, 'l'}, - {"threads-by", required_argument, 0, 'q'}, + {"lengths", no_argument, 0, 'E'}, + {"metadata", no_argument, 0, 'M'}, + {"cyclicity", no_argument, 0, 'C'}, + {"extract-fasta", no_argument, 0, 'F'}, + {"paths-file", required_argument, 0, 'p'}, {"paths-by", required_argument, 0, 'Q'}, + {"sample", required_argument, 0, 'S'}, + {"variant-paths", no_argument, 0, 'a'}, + {"generic-paths", no_argument, 0, 'G'}, + {"coverage", no_argument, 0, 'c'}, + + // Hidden options for backward compatibility. {"threads", no_argument, 0, 'T'}, + {"threads-by", required_argument, 0, 'q'}, + {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hLXv:x:g:q:Q:VT", + c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:draGp:c", long_options, &option_index); // Detect the end of the options. 
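// Note on the option handling that follows: every output option (-X, -A, -V, -d,
// -r, -L, -E, -M, -C, -F, -c) increments output_formats and every selection
// option (-Q, -S, -a, -G, -p) increments selection_criteria, so the validation
// later in main_paths can simply require
//     output_formats == 1      (exactly one output format)
//     selection_criteria <= 1  (at most one way of selecting paths)
// instead of checking each pair of flags separately, as the old code did.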
@@ -84,40 +163,111 @@ int main_paths(int argc, char** argv) { switch (c) { - case 'v': - vg_file = optarg; - break; - + case 'v': // Fall through case 'x': - xg_file = optarg; + graph_file = optarg; + ++input_formats; break; case 'g': gbwt_file = optarg; + ++input_formats; + break; + + case 'V': + extract_as_vg = true; + output_formats++; + break; + + case 'd': + drop_paths = true; + output_formats++; break; + case 'r': + retain_paths = true; + output_formats++; + break; + case 'X': extract_as_gam = true; + output_formats++; break; - case 'V': - extract_as_vg = true; + case 'A': + extract_as_gaf = true; + output_formats++; break; case 'L': list_names = true; + output_formats++; break; - case 'q': - thread_prefix = optarg; + case 'E': + list_names = true; + list_lengths = true; + output_formats++; + break; + + case 'M': + list_names = true; + list_metadata = true; + output_formats++; + break; + + case 'C': + list_names = true; + list_cyclicity = true; + output_formats++; + break; + + case 'F': + extract_as_fasta = true; + output_formats++; + break; + + case 'p': + path_file = optarg; + selection_criteria++; break; case 'Q': path_prefix = optarg; + selection_criteria++; + break; + + case 'S': + sample_name = optarg; + // We only care about things with references now. + path_senses = {PathSense::REFERENCE, PathSense::HAPLOTYPE}; + selection_criteria++; + break; + + case 'a': + select_alt_paths = true; + selection_criteria++; + break; + + case 'G': + // We only care about generic paths now. + path_senses = {PathSense::GENERIC}; + selection_criteria++; + break; + + case 'c': + coverage = true; + output_formats++; break; case 'T': - extract_threads = true; + std::cerr << "warning: [vg paths] option --threads is obsolete and unnecessary" << std::endl; + break; + + case 'q': + std::cerr << "warning: [vg paths] option --threads-by is deprecated; please use --paths-by" << std::endl; + path_prefix = optarg; + selection_criteria++; break; case 'h': @@ -131,167 +281,473 @@ int main_paths(int argc, char** argv) { } } - if (!vg_file.empty() && !xg_file.empty()) { - cerr << "[vg paths] Error: both vg and xg index given" << endl; - exit(1); + if (input_formats != 1 && input_formats != 2) { + std::cerr << "error: [vg paths] at least one input format (-x, -g) must be specified" << std::endl; + std::exit(EXIT_FAILURE); + } + if (!gbwt_file.empty()) { + bool need_graph = (extract_as_gam || extract_as_gaf || extract_as_vg || drop_paths || retain_paths || extract_as_fasta || list_lengths); + if (need_graph && graph_file.empty()) { + std::cerr << "error: [vg paths] a graph is needed for extracting threads in -X, -A, -V, -d, -r, -E or -F format" << std::endl; + std::exit(EXIT_FAILURE); + } + if (!need_graph && !graph_file.empty()) { + // TODO: This should be an error, but we display a warning instead for backward compatibility. 
+ //std::cerr << "error: [vg paths] cannot read input from multiple sources" << std::endl; + //std::exit(EXIT_FAILURE); + std::cerr << "warning: [vg paths] graph unnecessary for listing GBWT threads" << std::endl; + } + } + if (output_formats != 1) { + std::cerr << "error: [vg paths] one output format (-X, -A, -V, -d, -r, -L, -F, -E, -C or -c) must be specified" << std::endl; + std::exit(EXIT_FAILURE); + } + if (selection_criteria > 1) { + std::cerr << "error: [vg paths] multiple selection criteria (-Q, -S, -a, -G, -p) cannot be used" << std::endl; + std::exit(EXIT_FAILURE); + } + if (select_alt_paths && !gbwt_file.empty()) { + std::cerr << "error: [vg paths] selecting variant allele paths is not compatible with a GBWT index" << std::endl; + std::exit(EXIT_FAILURE); + } + if (list_metadata && !gbwt_file.empty()) { + std::cerr << "error: [vg paths] listing path metadata is not compatible with a GBWT index" << std::endl; + std::exit(EXIT_FAILURE); + } + if ((drop_paths || retain_paths) && !gbwt_file.empty()) { + std::cerr << "error: [vg paths] dropping or retaining paths only works on embedded graph paths, not GBWT threads" << std::endl; + std::exit(EXIT_FAILURE); + } + if (coverage && !gbwt_file.empty()) { + std::cerr << "error: [vg paths] coverage option -c only works on embedded graph paths, not GBWT threads" << std::endl; + std::exit(EXIT_FAILURE); } - if (!thread_prefix.empty() && extract_threads) { - cerr << "[vg paths] Error: cannot extract all threads (-T) and also prefixed threads (-q)" << endl; - exit(1); + if (select_alt_paths) { + // alt paths all have a specific prefix + path_prefix = "_alt_"; + // And are all generic sense. + path_senses = {PathSense::GENERIC}; } // Load whatever indexes we were given - unique_ptr graph; - if (!vg_file.empty()) { - // We want a vg - graph = unique_ptr(new VG()); - // Load the vg - get_input_file(vg_file, [&](istream& in) { - graph->from_istream(in); - }); - } - unique_ptr xg_index; - if (!xg_file.empty()) { - // We want an xg - xg_index = unique_ptr(new xg::XG()); - // Load the xg - get_input_file(xg_file, [&](istream& in) { - xg_index->load(in); - }); + // Note: during handlifiction, distinction between -v and -x options disappeared. + unique_ptr graph; + if (!graph_file.empty()) { + // Load the graph + graph = vg::io::VPKG::load_one(graph_file); } unique_ptr gbwt_index; if (!gbwt_file.empty()) { // We want a gbwt - gbwt_index = unique_ptr(new gbwt::GBWT()); - // Load the gbwt (TODO: support streams) - sdsl::load_from_file(*gbwt_index, gbwt_file); + + // Load the GBWT from its container + gbwt_index = vg::io::VPKG::load_one(gbwt_file); + + if (gbwt_index.get() == nullptr) { + // Complain if we couldn't. 
+ cerr << "error: [vg paths] unable to load gbwt index file" << endl; + exit(1); + } } - if (!thread_prefix.empty() || extract_threads) { - // We are looking for threads, so we need the GBWT and the xg (which holds the thread name metadata) - - if (xg_index.get() == nullptr) { - cerr << "[vg paths] Error: thread extraction requires an XG for thread metadata" << endl; - exit(1); + + set path_names; + if (!path_file.empty()) { + ifstream path_stream(path_file); + if (!path_stream) { + cerr << "error: cannot open path name file " << path_file << endl; + exit(EXIT_FAILURE); } - if (gbwt_index.get() == nullptr) { - cerr << "[vg paths] Error: thread extraction requires a GBWT" << endl; - exit(1); + + string line; + while (getline(path_stream, line)) { + path_names.emplace(move(line)); } - if (extract_as_gam == extract_as_vg && extract_as_vg == list_names) { - cerr << "[vg paths] Error: thread extraction requires -V, -X, or -L to specifiy output format" << endl; - exit(1); + } + + // We may need to emit a stream of Alignments + unique_ptr aln_emitter; + + // Or we might need to emit a stream of VG Graph objects + unique_ptr> graph_emitter; + if (extract_as_gam || extract_as_gaf) { + // Open up a GAM/GAF output stream + aln_emitter = vg::io::get_non_hts_alignment_emitter("-", extract_as_gaf ? "GAF" : "GAM", {}, get_thread_count(), + graph.get()); + } else if (extract_as_vg) { + // Open up a VG Graph chunk output stream + graph_emitter = unique_ptr>(new vg::io::ProtobufEmitter(cout)); + } + + if (gbwt_index) { + // We want to operate on a GBWT instead of the graph. + + if (!(gbwt_index->hasMetadata() && gbwt_index->metadata.hasPathNames())) { + std::cerr << "warning: [vg paths] the GBWT index does not contain thread names" << std::endl; + std::exit(EXIT_SUCCESS); } - vector thread_ids; - if (extract_threads) { - for (gbwt::size_type id = 1; id <= gbwt_index->sequences()/2; id += 1) { - thread_ids.push_back(id); + + // Pre-parse some metadata + auto gbwt_reference_samples = gbwtgraph::parse_reference_samples_tag(*gbwt_index); + + // Select the threads we are interested in. 
+ std::vector thread_ids; + if (!sample_name.empty()) { + thread_ids = threads_for_sample(*gbwt_index, sample_name); + } else if(!path_prefix.empty()) { + for (size_t i = 0; i < gbwt_index->metadata.paths(); i++) { + PathSense sense = gbwtgraph::get_path_sense(*gbwt_index, i, gbwt_reference_samples); + std::string name = gbwtgraph::compose_path_name(*gbwt_index, i, sense); + if (name.length() >= path_prefix.length() && std::equal(path_prefix.begin(), path_prefix.end(), name.begin())) { + thread_ids.push_back(i); + } + } + } else if (!path_file.empty()) { + // TODO: there doesn't seem to be a look-up by name in the GBWT, so we check all of them + thread_ids.reserve(path_names.size()); + for (size_t i = 0; i < gbwt_index->metadata.paths(); i++) { + PathSense sense = gbwtgraph::get_path_sense(*gbwt_index, i, gbwt_reference_samples); + std::string name = gbwtgraph::compose_path_name(*gbwt_index, i, sense); + if (path_names.count(name)) { + thread_ids.push_back(i); + } + } + if (thread_ids.size() != path_names.size()) { + std::cerr << "error: [vg paths] could not find all path names from file in GBWT index" << std::endl; + std::exit(EXIT_FAILURE); + } + } else { + thread_ids.reserve(gbwt_index->metadata.paths()); + for (size_t i = 0; i < gbwt_index->metadata.paths(); i++) { + thread_ids.push_back(i); } - } else if (!thread_prefix.empty()) { - thread_ids = xg_index->threads_named_starting(thread_prefix); } - for (auto& id : thread_ids) { - // For each matching thread - - // Get its name - auto thread_name = xg_index->thread_name(id); - - if (list_names) { - // We are only interested in the name - cout << thread_name << endl; + // Process the threads. + for (gbwt::size_type id : thread_ids) { + PathSense sense = gbwtgraph::get_path_sense(*gbwt_index, id, gbwt_reference_samples); + std::string name = gbwtgraph::compose_path_name(*gbwt_index, id, sense); + + // We are only interested in the name + // TODO: do we need to consult list_cyclicity or list_metadata here? + if (list_names && !list_lengths) { + std::cout << name << endl; continue; } + // TODO: implement list_metadata for GBWT threads? + // Otherwise we need the actual thread data - gbwt::vector_type sequence = gbwt_index->extract(gbwt::Path::encode(id-1, false)); - Path path; - path.set_name(thread_name); - size_t rank = 1; - for (auto node : sequence) { - Mapping* m = path.add_mapping(); - Position* p = m->mutable_position(); - p->set_node_id(gbwt::Node::id(node)); - p->set_is_reverse(gbwt::Node::is_reverse(node)); - Edit* e = m->add_edit(); - size_t len = xg_index->node_length(p->node_id()); - e->set_to_length(len); - e->set_from_length(len); - m->set_rank(rank++); - } - if (extract_as_gam) { - vector alns; - alns.emplace_back(xg_index->path_as_alignment(path)); - write_alignments(cout, alns); - stream::finish(cout); + Path path = extract_gbwt_path(*graph, *gbwt_index, id); + if (extract_as_gam || extract_as_gaf) { + // Write as an Alignment. Must contain the whole path. 
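+                // (Unlike the -V output below, which goes through chunk_to_emitter
+                // and splits long paths into 10000-mapping Graph chunks, a GAM/GAF
+                // record carries the entire thread as a single Alignment.)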
+ aln_emitter->emit_singles({alignment_from_path(*graph, path)}); } else if (extract_as_vg) { - Graph g; - *(g.add_path()) = path; - vector gb = { g }; - stream::write_buffered(cout, gb, 0); + // Write as a Path in a VG + chunk_to_emitter(path, *graph_emitter); + } else if (extract_as_fasta) { + write_fasta_sequence(name, path_sequence(*graph, path), cout); + } + if (list_lengths) { + cout << path.name() << "\t" << path_to_length(path) << endl; + } + if (list_cyclicity) { + bool cyclic = false; + unordered_set> visits; + for (size_t i = 0; i < path.mapping_size() && !cyclic; ++i) { + const Mapping& mapping = path.mapping(i); + pair>::iterator, bool> ret = + visits.insert(make_pair(mapping.position().node_id(), mapping.position().is_reverse())); + if (ret.second == false) { + cyclic = true; + } + } + cout << path.name() << "\t" << (cyclic ? "cyclic" : "acyclic") << endl; } } - } else if (graph.get() != nullptr) { - // Handle non-thread queries from vg + } else if (graph) { - if (!path_prefix.empty()) { - cerr << "[vg paths] Error: path prefix not supported for extracting from vg, only for extracting from xg" << endl; - exit(1); - } + // Handle queries from the graph - if (list_names) { - graph->paths.for_each_name([&](const string& name) { - cout << name << endl; + // Make a helper to loop over the selected paths in the graph + auto for_each_selected_path = [&](const std::function& iteratee) { + if (!path_file.empty()) { + // We only want paths with full names from the file. Just look them all up. + for (auto& name : path_names) { + if (graph->has_path(name)) { + auto path_handle = graph->get_path_handle(name); + if (path_senses.count(graph->get_sense(path_handle))) { + // But only take those with senses we want. + iteratee(path_handle); + } + } + } + } else { + // We have only prefixes or other criteria. + // We need to do a scan. + + // We may restrict to some exact locus names. + unordered_set* locus_name_filter = nullptr; + // We may restrict to some exact sample names + unordered_set sample_name_set; + unordered_set* sample_name_filter = nullptr; + if (!sample_name.empty()) { + // We only want paths with this exact sample name + sample_name_set.insert(sample_name); + sample_name_filter = &sample_name_set; + } + + graph->for_each_path_matching(&path_senses, sample_name_filter, locus_name_filter, + [&](const path_handle_t& path_handle) { + + // We got a path of appropriate sense, locus, and sample. + if (!path_prefix.empty()) { + // Filter by name prefix + std::string path_name = graph->get_path_name(path_handle); + + if (std::mismatch(path_name.begin(), path_name.end(), + path_prefix.begin(), path_prefix.end()).second != path_prefix.end()) { + // The path does not match the prefix. Skip it. + return; + } + } + + // It didn't fail a prefix check, so use it. 
+ iteratee(path_handle); }); - } else if (extract_as_gam) { - vector alns = graph->paths_as_alignments(); - write_alignments(cout, alns); - stream::finish(cout); - } else if (extract_as_vg) { - cerr << "[vg paths] Error: vg extraction is only defined for prefix queries against a XG/GBWT index pair" << endl; - exit(1); - } else { - cerr << "[vg paths] Error: specify an operation to perform" << endl; + } + + }; + // Make a helper to loop over the complement set of un-selected paths in the graph + auto for_each_unselected_path = [&](const std::function& iteratee) { + // Get all the selected paths + unordered_set selected; + for_each_selected_path([&](const path_handle_t& path_handle) { + selected.insert(path_handle); + }); + + unordered_set all_senses { + PathSense::REFERENCE, + PathSense::GENERIC, + PathSense::HAPLOTYPE + }; + + graph->for_each_path_matching(&all_senses, nullptr, nullptr, [&](const path_handle_t& path_handle) { + // And then, for each path of any sense + if (!selected.count(path_handle)) { + // If it isn't selected, yield it. + iteratee(path_handle); + } + }); + + + }; + + if (drop_paths || retain_paths) { + MutablePathMutableHandleGraph* mutable_graph = dynamic_cast(graph.get()); + if (!mutable_graph) { + std::cerr << "error[vg paths]: graph cannot be modified" << std::endl; + exit(1); + } + SerializableHandleGraph* serializable_graph = dynamic_cast(graph.get()); + if (!serializable_graph) { + std::cerr << "error[vg paths]: graph cannot be saved after modification" << std::endl; + exit(1); + } + + vector to_destroy; + if (drop_paths) { + for_each_selected_path([&](const path_handle_t& path_handle) { + string name = graph->get_path_name(path_handle); + to_destroy.push_back(name); + }); + } else { + for_each_unselected_path([&](const path_handle_t& path_handle) { + string name = graph->get_path_name(path_handle); + to_destroy.push_back(name); + }); + } + for (string& path_name : to_destroy) { + mutable_graph->destroy_path(graph->get_path_handle(path_name)); + } + + // output the graph + serializable_graph->serialize(cout); } - } else if (xg_index.get() != nullptr) { - // Handle non-thread queries from xg - if (list_names) { - // We aren't looking for threads, but we are looking for names. - size_t max_path = xg_index->max_path_rank(); - for (size_t i = 1; i <= max_path; ++i) { - cout << xg_index->path_name(i) << endl; + else if (coverage) { + // for every node, count the number of unique paths. 
then add the coverage count to each one + // (we're doing the whole graph here, which could be inefficient in the case the user is selecting + // a small path) + unordered_map> coverage_map; + // big speedup + unordered_map path_to_name; + size_t max_coverage = 0; + graph->for_each_handle([&](handle_t handle) { + vector steps; + for (auto& sense : path_senses) { + graph->for_each_step_of_sense(handle, sense, [&](const step_handle_t& step) { + // For every step on this handle of any sense we care about, remember it + steps.push_back(step); + }); + } + unordered_set unique_names; + unordered_set unique_paths; + for (auto step_handle : steps) { + path_handle_t step_path_handle = graph->get_path_handle_of_step(step_handle); + auto it = path_to_name.find(step_path_handle); + if (it == path_to_name.end()) { + string step_path_name = graph->get_path_name(step_path_handle); + // disregard subpath tags when counting (but not displaying) + it = path_to_name.insert(make_pair(step_path_handle, Paths::strip_subrange(step_path_name))).first; + } + unique_names.insert(it->second); + unique_paths.insert(step_path_handle); + } + for (auto path : unique_paths) { + vector& cov = coverage_map[path]; + if (cov.size() < unique_paths.size()) { + cov.resize(unique_paths.size(), 0); + } + cov[unique_paths.size() - 1] += graph->get_length(graph->get_handle_of_step(steps[0])); + max_coverage = std::max(max_coverage, unique_names.size() - 1); + } + }); + // figure out the bin size + int64_t bin_size = 1; + if (max_coverage > coverage_bins) { + // reserve the first 2 bins for coverage = 0 and 1 no matter 1 + bin_size = (max_coverage - 2) / (coverage_bins - 2); + if ((max_coverage - 2) % (coverage_bins - 2)) { + ++bin_size; + } } - } else if (extract_as_gam) { - auto alns = xg_index->paths_as_alignments(); - write_alignments(cout, alns); - stream::finish(cout); - } else if (!path_prefix.empty()) { - vector got = xg_index->paths_by_prefix(path_prefix); - if (extract_as_gam) { - vector alns; - for (auto& path : got) { - alns.emplace_back(xg_index->path_as_alignment(path)); + // compute cumulative coverage + for (auto& path_cov : coverage_map) { + int64_t cum_cov = 0; + vector& cov = path_cov.second; + cov.resize(max_coverage + 1, 0); + // bin it up if necessary + if (cov.size() > coverage_bins) { + vector binned_cov(coverage_bins, 0); + // reserve the first 2 bins for coverage = 0 and 1 no matter 1 + binned_cov[0] = cov[0]; + binned_cov[1] = cov[1]; + // remaining bins + for (size_t bin = 0; bin < coverage_bins - 2; ++bin) { + for (size_t i = 0; i < bin_size && (2 + bin * bin_size + i < cov.size()); ++i) { + binned_cov[2 + bin] += cov[2 + bin * bin_size + i]; + } + } + swap(cov, binned_cov); } - write_alignments(cout, alns); - stream::finish(cout); - } else if (extract_as_vg) { - for(auto& path : got) { - Graph g; - *(g.add_path()) = xg_index->path(path.name()); - vector gb = { g }; - stream::write_buffered(cout, gb, 0); + // accumulate + for (auto cov_it = path_cov.second.rbegin(); cov_it != path_cov.second.rend(); ++cov_it) { + cum_cov += *cov_it; + *cov_it = cum_cov; } } + cout << "PathName"; + for (size_t cov = 0; cov <= min(max_coverage, coverage_bins); ++cov) { + cout << "\t"; + if (cov < 2 || bin_size == 1) { + cout << cov << "-" << cov; + } else { + cout << (2 + (cov - 2) * bin_size) << "-" << (2 + (cov - 2) * bin_size + bin_size - 1); + } + } + cout << endl; + for_each_selected_path([&](path_handle_t path_handle) { + string path_name = graph->get_path_name(path_handle); + cout << path_name; + auto& path_cov = 
coverage_map[path_handle]; + for (size_t cov = 0; cov < path_cov.size(); ++cov) { + cout << "\t" << path_cov[cov]; + } + cout << endl; + }); } else { - cerr << "[vg paths] Error: specify an operation to perform" << endl; + if (list_metadata) { + // Add a header + cout << "#NAME"; + if (list_lengths) { + cout << "\tLENGTH"; + } + cout << "\tSENSE"; + cout << "\tSAMPLE"; + cout << "\tHAPLOTYPE"; + cout << "\tLOCUS"; + cout << "\tPHASE_BLOCK"; + cout << "\tSUBRANGE"; + if (list_cyclicity) { + cout << "\tCYCLICITY"; + } + cout << endl; + } + for_each_selected_path([&](path_handle_t path_handle) { + if (list_names) { + cout << graph->get_path_name(path_handle); + if (list_lengths) { + size_t path_length = 0; + for (handle_t handle : graph->scan_path(path_handle)) { + path_length += graph->get_length(handle); + } + cout << "\t" << path_length; + } + if (list_metadata) { + // Dump fields for all the metadata + cout << "\t" << SENSE_TO_STRING.at(graph->get_sense(path_handle)); + auto sample = graph->get_sample_name(path_handle); + cout << "\t" << (sample == PathMetadata::NO_SAMPLE_NAME ? "NO_SAMPLE_NAME" : sample); + auto haplotype = graph->get_haplotype(path_handle); + cout << "\t" << (haplotype == PathMetadata::NO_HAPLOTYPE ? "NO_HAPLOTYPE" : std::to_string(haplotype)); + auto locus = graph->get_locus_name(path_handle); + cout << "\t" << (locus == PathMetadata::NO_LOCUS_NAME ? "NO_LOCUS_NAME" : locus); + auto phase_block = graph->get_phase_block(path_handle); + cout << "\t" << (phase_block == PathMetadata::NO_PHASE_BLOCK ? "NO_PHASE_BLOCK" : std::to_string(phase_block)); + auto subrange = graph->get_subrange(path_handle); + cout << "\t"; + if (subrange == PathMetadata::NO_SUBRANGE) { + cout << "NO_SUBRANGE"; + } else if (subrange.second == PathMetadata::NO_END_POSITION) { + cout << subrange.first; + } else { + cout << subrange.first << "-" << subrange.second; + } + } + if (list_cyclicity) { + bool directed_cyclic = false; // same node visited twice in same orientation + bool undirected_cyclic = false; // same not visited twice in any orientation + unordered_set visits; + graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { + handle_t handle = graph->get_handle_of_step(step_handle); + if (visits.count(handle)) { + directed_cyclic = true; + undirected_cyclic = false; + } else if (visits.count(graph->flip(handle))) { + undirected_cyclic = true; + } + visits.insert(handle); + return !directed_cyclic || !undirected_cyclic; + }); + cout << "\t" << (directed_cyclic ? "directed-cyclic" : "directed-acyclic") + << "\t" << (undirected_cyclic ? "undirected-cyclic" : "undirected-acyclic"); + } + cout << endl; + } else { + Path path = path_from_path_handle(*graph, path_handle); + if (extract_as_gam || extract_as_gaf) { + aln_emitter->emit_singles({alignment_from_path(*graph, path)}); + } else if (extract_as_vg) { + chunk_to_emitter(path, *graph_emitter); + } else if (extract_as_fasta) { + write_fasta_sequence(graph->get_path_name(path_handle), path_sequence(*graph, path), cout); + } + } + }); } - } else { - cerr << "[vg paths] Error: an xg (-x) or vg (-v) file is required" << endl; - exit(1); } - return 0; } diff --git a/src/subcommand/prune_main.cpp b/src/subcommand/prune_main.cpp index 642c6b59024..e52562ed446 100644 --- a/src/subcommand/prune_main.cpp +++ b/src/subcommand/prune_main.cpp @@ -8,6 +8,9 @@ * regions shorter than --subgraph_min are also removed. Pruning also removes * all embedded paths. 
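+ * With the default parameters (-k 24 -e 3 -s 33) this means: edges used by 24 bp
+ * kmers that make more than 3 edge choices are removed, and any remaining
+ * subgraph shorter than 33 bases is dropped as well.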
* + * For very complex graphs, there is an option to remove high-degree nodes + * before pruning. Otherwise enumerating the k bp paths would take too long. + * * With --restore-paths, the nodes and edges on non-alt paths are added back * after pruning. * @@ -18,14 +21,17 @@ */ #include "../phase_unfolder.hpp" +#include #include "subcommand.hpp" +#include "xg.hpp" +#include "../algorithms/prune.hpp" +#include "../io/save_handle_graph.hpp" #include #include #include #include -#include #include #include @@ -43,11 +49,13 @@ struct PruningParameters static std::map kmer_length; static std::map edge_max; static std::map subgraph_min; + static std::map max_degree; }; std::map PruningParameters::kmer_length { { mode_prune, 24 }, { mode_restore, 24 }, { mode_unfold, 24 } }; std::map PruningParameters::edge_max { { mode_prune, 3 }, { mode_restore, 3 }, { mode_unfold, 3 } }; std::map PruningParameters::subgraph_min { { mode_prune, 33 }, { mode_restore, 33 }, { mode_unfold, 33 } }; +std::map PruningParameters::max_degree { { mode_prune, 0 }, { mode_restore, 0 }, { mode_unfold, 0 } }; std::string mode_name(PruningMode mode) { std::string result = "(unknown)"; @@ -97,29 +105,37 @@ void print_defaults(const std::map& defaults) { void help_prune(char** argv) { std::cerr << "usage: " << argv[0] << " prune [options] >[output.vg]" << std::endl; + std::cerr << std::endl; std::cerr << "Prunes the complex regions of the graph for GCSA2 indexing. Pruning the graph" << std::endl; std::cerr << "removes embedded paths." << std::endl; - std::cerr << "pruning parameters:" << std::endl; + std::cerr << std::endl; + std::cerr << "Pruning parameters:" << std::endl; std::cerr << " -k, --kmer-length N kmer length used for pruning" << std::endl; std::cerr << " "; print_defaults(PruningParameters::kmer_length); std::cerr << " -e, --edge-max N remove the edges on kmers making > N edge choices" << std::endl; std::cerr << " "; print_defaults(PruningParameters::edge_max); std::cerr << " -s, --subgraph-min N remove subgraphs of < N bases" << std::endl; std::cerr << " "; print_defaults(PruningParameters::subgraph_min); - std::cerr << "pruning modes (-P, -r, and -u are mutually exclusive):" << std::endl; + std::cerr << " -M, --max-degree N if N > 0, remove nodes with degree > N before pruning" << std::endl; + std::cerr << " "; print_defaults(PruningParameters::max_degree); + std::cerr << std::endl; + std::cerr << "Pruning modes (-P, -r, and -u are mutually exclusive):" << std::endl; std::cerr << " -P, --prune simply prune the graph (default)" << std::endl; std::cerr << " -r, --restore-paths restore the edges on non-alt paths" << std::endl; std::cerr << " -u, --unfold-paths unfold non-alt paths and GBWT threads" << std::endl; std::cerr << " -v, --verify-paths verify that the paths exist after pruning" << std::endl; std::cerr << " (potentially very slow)" << std::endl; - std::cerr << "unfolding options:" << std::endl; + std::cerr << std::endl; + std::cerr << "Unfolding options:" << std::endl; std::cerr << " -g, --gbwt-name FILE unfold the threads from this GBWT index" << std::endl; - std::cerr << " -m, --mapping FILE store the node mapping for duplicates in this file" << std::endl; - std::cerr << " -a, --append-mapping append to the existing node mapping (requires -m)" << std::endl; - std::cerr << "other options:" << std::endl; + std::cerr << " -m, --mapping FILE store the node mapping for duplicates in this file (required with -u)" << std::endl; + std::cerr << " -a, --append-mapping append to the existing node mapping" << std::endl; 
+ std::cerr << std::endl; + std::cerr << "Other options:" << std::endl; std::cerr << " -p, --progress show progress" << std::endl; std::cerr << " -t, --threads N use N threads (default: " << omp_get_max_threads() << ")" << std::endl; - std::cerr << " -d, --dry-run determine the validity of the parameter combination" << std::endl; + std::cerr << " -d, --dry-run determine the validity of the combination of options" << std::endl; + std::cerr << std::endl; } int main_prune(int argc, char** argv) { @@ -133,13 +149,14 @@ int main_prune(int argc, char** argv) { int kmer_length = 0; int edge_max = 0; size_t subgraph_min = 0; + int max_degree = 0; PruningMode mode = mode_prune; int threads = omp_get_max_threads(); bool verify_paths = false, append_mapping = false, show_progress = false, dry_run = false; std::string vg_name, gbwt_name, mapping_name; // Derived variables. - bool kmer_length_set = false, edge_max_set = false, subgraph_min_set = false; + bool kmer_length_set = false, edge_max_set = false, subgraph_min_set = false, max_degree_set = false; int c; optind = 2; // force optind past command positional argument @@ -149,6 +166,7 @@ int main_prune(int argc, char** argv) { { "kmer-length", required_argument, 0, 'k' }, { "edge-max", required_argument, 0, 'e' }, { "subgraph-min", required_argument, 0, 's' }, + { "max-degree", required_argument, 0, 'M' }, { "prune", no_argument, 0, 'P' }, { "restore-paths", no_argument, 0, 'r' }, { "unfold-paths", no_argument, 0, 'u' }, @@ -165,7 +183,7 @@ int main_prune(int argc, char** argv) { }; int option_index = 0; - c = getopt_long(argc, argv, "k:e:s:Pruvx:g:m:apt:dh", long_options, &option_index); + c = getopt_long(argc, argv, "k:e:s:M:Pruvx:g:m:apt:dh", long_options, &option_index); if (c == -1) { break; } // End of options. switch (c) @@ -182,6 +200,10 @@ int main_prune(int argc, char** argv) { subgraph_min = parse(optarg); subgraph_min_set = true; break; + case 'M': + max_degree = parse(optarg); + max_degree_set = true; + break; case 'P': mode = mode_prune; break; @@ -195,7 +217,7 @@ int main_prune(int argc, char** argv) { verify_paths = true; break; case 'x': // no longer needed - std::cerr << "[vg prune]: option --xg-name is no longer needed" << std::endl; + std::cerr << "warning: [vg prune] option --xg-name is no longer needed" << std::endl; break; case 'g': gbwt_name = optarg; @@ -227,7 +249,15 @@ int main_prune(int argc, char** argv) { std::abort(); } } - vg_name = (optind >= argc ? "(stdin)" : argv[optind]); + + if (optind < argc) { + // There's an input file specified. + vg_name = get_input_file_name(optind, argc, argv); + } else { + // Assume they want stdin + vg_name = "-"; + } + if (!kmer_length_set) { kmer_length = PruningParameters::kmer_length[mode]; } @@ -237,40 +267,42 @@ int main_prune(int argc, char** argv) { if (!subgraph_min_set) { subgraph_min = PruningParameters::subgraph_min[mode]; } - if (!(kmer_length > 0 && edge_max > 0)) { - std::cerr << "[vg prune]: --kmer-length and --edge-max must be positive" << std::endl; - return 1; + if (!max_degree_set) { + max_degree = PruningParameters::max_degree[mode]; } - if (append_mapping && mapping_name.empty()) { - std::cerr << "[vg prune]: parameter --append-mapping requires --mapping" << std::endl; + if (!(kmer_length > 0 && edge_max > 0)) { + std::cerr << "error: [vg prune] --kmer-length and --edge-max must be positive" << std::endl; return 1; } // Mode-specific checks. 
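    // In brief: --prune and --restore-paths must not be given extra files,
    // --unfold-paths requires --mapping and may also take --gbwt-name, and
    // --verify-paths is rejected in plain prune mode because nothing is
    // restored that could be verified.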
if (mode == mode_prune) { if (verify_paths) { - std::cerr << "[vg prune]: mode " << mode_name(mode) << " does not have paths to verify" << std::endl; + std::cerr << "error: [vg prune] mode " << mode_name(mode) << " does not have paths to verify" << std::endl; return 1; } if (!(gbwt_name.empty() && mapping_name.empty())) { - std::cerr << "[vg prune]: mode " << mode_name(mode) << " does not use additional files" << std::endl; + std::cerr << "error: [vg prune] mode " << mode_name(mode) << " does not use additional files" << std::endl; return 1; } } if (mode == mode_restore) { if (!(gbwt_name.empty() && mapping_name.empty())) { - std::cerr << "[vg prune]: mode " << mode_name(mode) << " does not use additional files" << std::endl; + std::cerr << "error: [vg prune] mode " << mode_name(mode) << " does not use additional files" << std::endl; return 1; } } if (mode == mode_unfold) { - // Nothing here + if (mapping_name.empty()) { + std::cerr << "error: [vg prune] mode --unfold requires a node mapping file specified with --mapping" << std::endl; + return 1; + } } // Dry run. if (dry_run) { std::cerr << "Pruning mode: " << mode_name(mode) << std::endl; - std::cerr << "Parameters: --kmer-length " << kmer_length << " --edge-max " << edge_max << " --subgraph-min " << subgraph_min << std::endl; + std::cerr << "Parameters: --kmer-length " << kmer_length << " --edge-max " << edge_max << " --subgraph-min " << subgraph_min << " --max-degree " << max_degree << std::endl; std::cerr << "Options: --threads " << omp_get_max_threads(); if (verify_paths) { std::cerr << " --verify-paths"; @@ -286,7 +318,7 @@ int main_prune(int argc, char** argv) { } std::cerr << std::endl; if (!vg_name.empty()) { - std::cerr << "VG: " << vg_name << std::endl; + std::cerr << "VG: " << (vg_name == "-" ? "(stdin)" : vg_name) << std::endl; } if (!gbwt_name.empty()) { std::cerr << "GBWT: " << gbwt_name << std::endl; @@ -298,51 +330,78 @@ int main_prune(int argc, char** argv) { } // Handle the input. - VG* graph; + std::unique_ptr graph; + graph = vg::io::VPKG::load_one(vg_name); xg::XG xg_index; - gbwt::GBWT gbwt_index; - get_input_file(optind, argc, argv, [&](std::istream& in) { - graph = new VG(in); - }); + std::unique_ptr gbwt_index; + + vg::id_t max_node_id = graph->max_node_id(); if (show_progress) { - std::cerr << "Original graph " << vg_name << ": " << graph->node_count() << " nodes, " << graph->edge_count() << " edges" << std::endl; + std::cerr << "Original graph " << vg_name << ": " << graph->get_node_count() << " nodes, " << graph->get_edge_count() << " edges" << std::endl; } // Remove the paths and build an XG index if needed. 
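    // The XG snapshot is built after dropping the alt (variant allele) paths but
    // before any pruning, so it still holds the non-alt paths and node sequences
    // that PhaseUnfolder needs to restore or unfold them once pruning has removed
    // every embedded path from the working graph.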
if (mode == mode_restore || mode == mode_unfold) { - remove_paths(graph->graph, Paths::is_alt, nullptr); - xg_index.from_graph(graph->graph); + vector alt_path_handles; + graph->for_each_path_handle([&](path_handle_t path_handle) { + if (Paths::is_alt(graph->get_path_name(path_handle))) { + alt_path_handles.push_back(path_handle); + } + }); + for (auto& alt_path_handle : alt_path_handles) { + graph->destroy_path(alt_path_handle); + } + xg_index.from_path_handle_graph(*graph); if (show_progress) { std::cerr << "Built a temporary XG index" << std::endl; } } - graph->graph.clear_path(); - graph->paths.clear(); + + // Destroy all remaining paths + vector path_handles; + graph->for_each_path_handle([&](path_handle_t path_handle) { + path_handles.push_back(path_handle); + }); + for (auto path_handle : path_handles) { + graph->destroy_path(path_handle); + } + if (show_progress) { std::cerr << "Removed all paths" << std::endl; } + // Remove high-degree nodes. + if (max_degree > 0) { + algorithms::remove_high_degree_nodes(*graph, max_degree); + if (show_progress) { + std::cerr << "Removed high-degree nodes: " + << graph->get_node_count() << " nodes, " << graph->get_edge_count() << " edges" << std::endl; + } + } + // Prune the graph. - graph->prune_complex_with_head_tail(kmer_length, edge_max); + algorithms::prune_complex_with_head_tail(*graph, kmer_length, edge_max); if (show_progress) { std::cerr << "Pruned complex regions: " - << graph->node_count() << " nodes, " << graph->edge_count() << " edges" << std::endl; + << graph->get_node_count() << " nodes, " << graph->get_edge_count() << " edges" << std::endl; } - graph->prune_short_subgraphs(subgraph_min); + algorithms::prune_short_subgraphs(*graph, subgraph_min); if (show_progress) { std::cerr << "Removed small subgraphs: " - << graph->node_count() << " nodes, " << graph->edge_count() << " edges" << std::endl; + << graph->get_node_count() << " nodes, " << graph->get_edge_count() << " edges" << std::endl; } // Restore the non-alt paths. if (mode == mode_restore) { - PhaseUnfolder unfolder(xg_index, gbwt_index, max_node_id + 1); + // Make an empty GBWT index to pass along + gbwt::GBWT empty_gbwt; + PhaseUnfolder unfolder(xg_index, empty_gbwt, max_node_id + 1); unfolder.restore_paths(*graph, show_progress); if (verify_paths) { size_t failures = unfolder.verify_paths(*graph, show_progress); if (failures > 0) { - std::cerr << "[vg prune]: verification failed for " << failures << " paths" << std::endl; + std::cerr << "warning: [vg prune] verification failed for " << failures << " paths" << std::endl; } } } @@ -351,10 +410,18 @@ int main_prune(int argc, char** argv) { if (mode == mode_unfold) { if (!gbwt_name.empty()) { get_input_file(gbwt_name, [&](std::istream& in) { - gbwt_index.load(in); + gbwt_index = vg::io::VPKG::load_one(in); + if (gbwt_index.get() == nullptr) { + std::cerr << "[vg prune]: could not load GBWT" << std::endl; + exit(1); + } }); + } else { + // The PhaseUnfolder can't deal with having no GBWT at all; we need to give it an empty one. + gbwt_index = unique_ptr(new gbwt::GBWT()); + // TODO: Let us pass in null pointers instead. 
} - PhaseUnfolder unfolder(xg_index, gbwt_index, max_node_id + 1); + PhaseUnfolder unfolder(xg_index, *gbwt_index, max_node_id + 1); if (append_mapping) { unfolder.read_mapping(mapping_name); } @@ -365,19 +432,19 @@ int main_prune(int argc, char** argv) { if (verify_paths) { size_t failures = unfolder.verify_paths(*graph, show_progress); if (failures > 0) { - std::cerr << "[vg prune]: verification failed for " << failures << " paths" << std::endl; + std::cerr << "warning: [vg prune] verification failed for " << failures << " paths" << std::endl; } } } // Serialize. - graph->serialize_to_ostream(std::cout); + + vg::io::save_handle_graph(graph.get(), std::cout); if (show_progress) { std::cerr << "Serialized the graph: " - << graph->node_count() << " nodes, " << graph->edge_count() << " edges" << std::endl; + << graph->get_node_count() << " nodes, " << graph->get_edge_count() << " edges" << std::endl; } - delete graph; graph = nullptr; return 0; } diff --git a/src/subcommand/recalibrate_main.cpp b/src/subcommand/recalibrate_main.cpp deleted file mode 100644 index d5e323ab2fc..00000000000 --- a/src/subcommand/recalibrate_main.cpp +++ /dev/null @@ -1,274 +0,0 @@ -// recalibrate_main.cpp: mapping quality recalibration for GAM files - -#include -#include -#include - -#include -#include - -#include - -#include "../alignment.hpp" -#include "../vg.hpp" -#include "../stream.hpp" - -#include - -using namespace std; -using namespace vg; -using namespace vg::subcommand; - -void help_recalibrate(char** argv) { - cerr << "usage: " << argv[0] << " recalibrate [options] --model learned.model mapped.gam > recalibrated.gam" << endl - << " " << argv[0] << " recalibrate [options] --model learned.model --train compared.gam" << endl - << endl - << "options:" << endl - << " -T, --train read the input GAM file, and use the mapped_correctly flags from vg gamcompare to train a model" << endl - << " -m, --model FILE load/save the model to/from the given file" << endl - << " -t, --threads N number of threads to use" << endl; -} - -/// Turn an Alignment into a Vowpal Wabbit format example line. -/// If train is true, give it a label so that VW will train on it. -/// If train is false, do not label the data. -string alignment_to_example_string(const Alignment& aln, bool train) { - // We will dump it to a string stream - stringstream s; - - if (train) { - // First is the class; 1 for correct or -1 for wrong - s << (aln.correctly_mapped() ? 
"1 " : "-1 "); - } - - // Drop all the features into the mepty-string namespace - s << "| "; - - // Original MAPQ is a feature - s << "origMapq:" << to_string(aln.mapping_quality()) << " "; - - // As is score - s << "score:" << to_string(aln.score()) << " "; - - // And the top secondary alignment score - double secondary_score = 0; - if (aln.secondary_score_size() > 0) { - secondary_score = aln.secondary_score(0); - } - s << "secondaryScore:" << to_string(secondary_score) << " "; - - // Count the secondary alignments - s << "secondaryCount:" << aln.secondary_score_size() << " "; - - // Also do the identity - s << "identity:" << aln.identity() << " "; - - // TODO: more features - return s.str(); -} - -int main_recalibrate(int argc, char** argv) { - - if (argc == 2) { - help_recalibrate(argv); - exit(1); - } - - int threads = 1; - bool train = false; - string model_filename; - - int c; - optind = 2; - while (true) { - static struct option long_options[] = - { - {"help", no_argument, 0, 'h'}, - {"train", no_argument, 0, 'T'}, - {"model", required_argument, 0, 'm'}, - {"threads", required_argument, 0, 't'}, - {0, 0, 0, 0} - }; - - int option_index = 0; - c = getopt_long (argc, argv, "hTm:t:", - long_options, &option_index); - - // Detect the end of the options. - if (c == -1) break; - - switch (c) - { - - case 'T': - train = true; - break; - - case 'm': - model_filename = optarg; - break; - - case 't': - threads = parse(optarg); - omp_set_num_threads(threads); - break; - - case 'h': - case '?': - help_recalibrate(argv); - exit(1); - break; - - default: - abort (); - } - } - - get_input_file(optind, argc, argv, [&](istream& gam_stream) { - // With the GAM input - - if (train) { - // We want to train a model. - - // Get a VW model. - // Most of the parameters are passed as a command-line option string. - // We must always pass --no_stdin because - // - // says so. - string vw_args = "--no_stdin"; - // We need the logistic stuff to make the predictor predict probabilities - vw_args += " --link=logistic --loss_function=logistic"; - // We need this to do quadradic interaction features (kernel) - vw_args += " -q ::"; - // Add L2 regularization - vw_args += " --l2 0.000001"; - - - // We also apparently have to decide now what file name we want output to go to and use -f to send it there. - if (!model_filename.empty()) { - // Save to the given model - vw_args += " -f " + model_filename; - -#ifdef debug - // Also dump a human-readable version where feature names aren't hashed. - vw_args += " --invert_hash " + model_filename + ".inv"; -#endif - - } - - // TODO: what do any of the other parameters do? - // TODO: Note that vw defines a VW namespace but dumps its vw type into the global namespace. - vw* model = VW::initialize(vw_args); - - function train_on = [&](Alignment& aln) { - - // Turn each Alignment into a VW-format string - string example_string = alignment_to_example_string(aln, true); - - // Load an example for each Alignment. - // You can apparently only have so many examples at a time because they live in a ring buffer of unspecified size. - // TODO: There are non-string-parsing-based ways to do this too. - // TODO: Why link against vw at all if we're just going to shuffle strings around? We could pipe to it. - // TODO: vw alo dumps "example" into the global namespace... - example* example = VW::read_example(*model, example_string); - - // Now we call the learn method, defined in vowpalwabbit/global_data.h. - // It's not clear what it does but it is probably training. 
- // If we didn't label the data, this would just predict instead. - model->learn(example); - - // Clean up the example - VW::finish_example(*model, example); - }; - - // TODO: We have to go in serial because vw isn't thread safe I think. - stream::for_each(gam_stream, train_on); - - // Now we want to output the model. - // TODO: We had to specify that already. I think it is magic? - - // Clean up the VW model - VW::finish(*model); - - } else { - // We are in run mode - - string vw_args = "--no_stdin"; - - if (!model_filename.empty()) { - // Load from the given model - vw_args += " -i " + model_filename; - } - - // Make the model - vw* model = VW::initialize(vw_args); - - // Define a buffer for alignments to print - vector buf; - - // Specify how to recalibrate an alignment - function recalibrate = [&](Alignment& aln) { - - // Turn each Alignment into a VW-format string - string example_string = alignment_to_example_string(aln, false); - - // Load an example for each Alignment. - example* example = VW::read_example(*model, example_string); - - // Now we call the learn method, defined in vowpalwabbit/global_data.h. - // It's not clear what it does but it is probably training. - // If we didn't label the data, this would just predict instead. - model->learn(example); - - // Get the correctness prediction from -1 to 1 - double prob = example->pred.prob; - // Convert into a real MAPQ estimate. - double guess = prob_to_phred(1.0 - prob); - // Clamp to 0 to 60 - double clamped = max(0.0, min(60.0, guess)); - -#ifdef debug - cerr << example_string << " -> " << prob << " -> " << guess << " -> " << clamped << endl; -#endif - - // Set the MAPQ to output. - aln.set_mapping_quality(clamped); - - // Clean up the example - VW::finish_example(*model, example); - - - -#pragma omp critical (buf) - { - // Save to the buffer - buf.push_back(aln); - if (buf.size() > 1000) { - // And output if buffer is full - write_alignments(cout, buf); - buf.clear(); - } - } - }; - - // For each read, recalibrate and buffer and maybe print it. - // TODO: It would be nice if this could be parallel... - stream::for_each(gam_stream, recalibrate); - - VW::finish(*model); - - // Flush the buffer - write_alignments(cout, buf); - buf.clear(); - // Finish the stream with an EOF marker - stream::finish(cout); - cout.flush(); - } - - }); - - return 0; -} - -// Register subcommand -static Subcommand vg_recalibrate("recalibrate", "recalibrate mapping qualities", main_recalibrate); diff --git a/src/subcommand/rna_main.cpp b/src/subcommand/rna_main.cpp new file mode 100644 index 00000000000..77bbb4981c4 --- /dev/null +++ b/src/subcommand/rna_main.cpp @@ -0,0 +1,518 @@ +/** \file rna_main.cpp + * + * Defines the "vg rna" subcommand. 
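+ *
+ * The subcommand reads a graph together with transcript (GTF/GFF) and/or intron
+ * (BED) files, adds the implied exon boundaries and splice junctions to the graph,
+ * optionally projects the transcripts onto GBWT haplotypes or embedded paths, and
+ * writes the resulting splicing graph to stdout. The pantranscriptome can also be
+ * written out as a GBWT index, a FASTA file, and a transcript info table.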
+ */ + +#include +#include +#include + +#include "subcommand.hpp" + +#include "../transcriptome.hpp" +#include +#include +#include "../gbwt_helper.hpp" +#include "bdsg/packed_graph.hpp" +#include +#include + +using namespace std; +using namespace vg; +using namespace vg::subcommand; + + +void help_rna(char** argv) { + cerr << "\nusage: " << argv[0] << " rna [options] graph.[vg|pg|hg|gbz] > splicing_graph.[vg|pg|hg]" << endl + + << "\nGeneral options:" << endl + + << " -t, --threads INT number of compute threads to use [1]" << endl + << " -p, --progress show progress" << endl + << " -h, --help print help message" << endl + + << "\nInput options:" << endl + + << " -n, --transcripts FILE transcript file(s) in gtf/gff format; may repeat" << endl + << " -m, --introns FILE intron file(s) in bed format; may repeat" << endl + << " -y, --feature-type NAME parse only this feature type in the gtf/gff (parses all if empty) [exon]" << endl + << " -s, --transcript-tag NAME use this attribute tag in the gtf/gff file(s) as id [transcript_id]" << endl + << " -l, --haplotypes FILE project transcripts onto haplotypes in GBWT index file" << endl + << " -z, --gbz-format input graph is in GBZ format (contains both a graph and haplotypes (GBWT index))" << endl + + << "\nConstruction options:" << endl + + << " -j, --use-hap-ref use haplotype paths in GBWT index as reference sequences (disables projection)" << endl + << " -e, --proj-embed-paths project transcripts onto embedded haplotype paths" << endl + << " -c, --path-collapse TYPE collapse identical transcript paths across no|haplotype|all paths [haplotype]" << endl + << " -k, --max-node-length INT chop nodes longer than maximum node length (0 disables chopping) [0]" << endl + << " -d, --remove-non-gene remove intergenic and intronic regions (deletes all paths in the graph)" << endl + << " -o, --do-not-sort do not topological sort and compact the graph" << endl + << " -r, --add-ref-paths add reference transcripts as embedded paths in the graph" << endl + << " -a, --add-hap-paths add projected transcripts as embedded paths in the graph" << endl + + << "\nOutput options:" << endl + + << " -b, --write-gbwt FILE write pantranscriptome transcript paths as GBWT index file" << endl + << " -f, --write-fasta FILE write pantranscriptome transcript sequences as fasta file" << endl + << " -i, --write-info FILE write pantranscriptome transcript info table as tsv file" << endl + << " -q, --out-exclude-ref exclude reference transcripts from pantranscriptome output" << endl + << " -g, --gbwt-bidirectional use bidirectional paths in GBWT index construction" << endl + + << endl; +} + +int32_t main_rna(int32_t argc, char** argv) { + + if (argc == 2) { + help_rna(argv); + return 1; + } + + vector transcript_filenames; + vector intron_filenames; + string feature_type = "exon"; + string transcript_tag = "transcript_id"; + string haplotypes_filename; + bool gbz_format = false; + bool use_hap_ref = false; + bool proj_emded_paths = false; + string path_collapse_type = "haplotype"; + uint32_t max_node_length = 0; + bool remove_non_transcribed_nodes = false; + bool sort_collapse_graph = true; + bool add_reference_transcript_paths = false; + bool add_projected_transcript_paths = false; + bool exclude_reference_transcripts = false; + string gbwt_out_filename = ""; + bool gbwt_add_bidirectional = false; + string fasta_out_filename = ""; + string info_out_filename = ""; + int32_t num_threads = 1; + bool show_progress = false; + + int32_t c; + optind = 2; + + while (true) { + static struct 
option long_options[] = + { + {"transcripts", required_argument, 0, 'n'}, + {"introns", required_argument, 0, 'm'}, + {"feature-type", required_argument, 0, 'y'}, + {"transcript-tag", required_argument, 0, 's'}, + {"haplotypes", required_argument, 0, 'l'}, + {"gbz-format", no_argument, 0, 'z'}, + {"use-hap-ref", no_argument, 0, 'j'}, + {"proj-embed-paths", no_argument, 0, 'e'}, + {"path-collapse", required_argument, 0, 'c'}, + {"max-node-length", required_argument, 0, 'k'}, + {"remove-non-gene", no_argument, 0, 'd'}, + {"do-not-sort", no_argument, 0, 'o'}, + {"add-ref-paths", no_argument, 0, 'r'}, + {"add-hap-paths", no_argument, 0, 'a'}, + {"write-gbwt", required_argument, 0, 'b'}, + {"write-fasta", required_argument, 0, 'f'}, + {"write-info", required_argument, 0, 'i'}, + {"out-ref-paths", no_argument, 0, 'u'}, + {"out-exclude-ref", no_argument, 0, 'q'}, + {"gbwt-bidirectional", no_argument, 0, 'g'}, + {"threads", required_argument, 0, 't'}, + {"progress", no_argument, 0, 'p'}, + {"help", no_argument, 0, 'h'}, + {0, 0, 0, 0} + }; + + int32_t option_index = 0; + c = getopt_long(argc, argv, "n:m:y:s:l:zjec:k:dorab:f:i:uqgt:ph?", long_options, &option_index); + + /* Detect the end of the options. */ + if (c == -1) + break; + + switch (c) + { + + case 'n': + transcript_filenames.push_back(optarg); + break; + + case 'm': + intron_filenames.push_back(optarg); + break; + + case 'y': + feature_type = optarg; + break; + + case 's': + transcript_tag = optarg; + break; + + case 'l': + haplotypes_filename = optarg; + break; + + case 'z': + gbz_format = true; + break; + + case 'j': + use_hap_ref = true; + break; + + case 'e': + proj_emded_paths = true; + break; + + case 'c': + path_collapse_type = optarg; + break; + + case 'k': + max_node_length = stoi(optarg); + break; + + case 'd': + remove_non_transcribed_nodes = true; + break; + + case 'o': + sort_collapse_graph = false; + break; + + case 'r': + add_reference_transcript_paths = true; + break; + + case 'a': + add_projected_transcript_paths = true; + break; + + case 'b': + gbwt_out_filename = optarg; + break; + + case 'f': + fasta_out_filename = optarg; + break; + + case 'i': + info_out_filename = optarg; + break; + + case 'u': + exclude_reference_transcripts = false; + break; + + case 'q': + exclude_reference_transcripts = true; + break; + + case 'g': + gbwt_add_bidirectional = true; + break; + + case 't': + num_threads = stoi(optarg); + break; + + case 'p': + show_progress = true; + break; + + case 'h': + case '?': + help_rna(argv); + exit(1); + break; + + default: + abort(); + } + } + + if (argc < optind + 1) { + help_rna(argv); + return 1; + } + + if (transcript_filenames.empty() && intron_filenames.empty()) { + + cerr << "[vg rna] ERROR: No transcripts or introns were given. Use --transcripts FILE and/or --introns FILE." << endl; + return 1; + } + + if (!haplotypes_filename.empty() && gbz_format) { + + cerr << "[vg rna] ERROR: Only one set of haplotypes can be provided (GBZ file contains both a graph and haplotypes). Use either --haplotypes or --gbz-format." << endl; + return 1; + } + + if (remove_non_transcribed_nodes && !add_reference_transcript_paths && !add_projected_transcript_paths) { + + cerr << "[vg rna] WARNING: Reference paths are deleted when removing intergenic and intronic regions. Consider adding transcripts as embedded paths using --add-ref-paths and/or --add-hap-paths." 
<< endl; + } + + if (path_collapse_type != "no" && path_collapse_type != "haplotype" && path_collapse_type != "all") { + + cerr << "[vg rna] ERROR: Path collapse type (--path-collapse) provided not supported. Options: no, haplotype or all." << endl; + return 1; + } + + double time_parsing_start = gcsa::readTimer(); + if (show_progress) { cerr << "[vg rna] Parsing graph file ..." << endl; } + + string graph_filename = get_input_file_name(optind, argc, argv); + + unique_ptr graph(nullptr); + unique_ptr haplotype_index; + + if (!gbz_format) { + + // Load pangenome graph. + graph = move(vg::io::VPKG::load_one(graph_filename)); + + if (!haplotypes_filename.empty()) { + + // Load haplotype GBWT index. + if (show_progress) { cerr << "[vg rna] Parsing haplotype GBWT index file ..." << endl; } + haplotype_index = vg::io::VPKG::load_one(haplotypes_filename); + assert(haplotype_index->bidirectional()); + + } else { + + // Construct empty GBWT index if non is given. + haplotype_index = unique_ptr(new gbwt::GBWT()); + } + + } else { + + graph = unique_ptr(new bdsg::PackedGraph()); + + // Load GBZ file + unique_ptr gbz = vg::io::VPKG::load_one(graph_filename); + + if (show_progress) { cerr << "[vg rna] Converting graph format ..." << endl; } + + // Convert GBWTGraph to mutable graph type (PackedGraph). + graph->set_id_increment(gbz->graph.min_node_id()); + handlealgs::copy_handle_graph(&(gbz->graph), graph.get()); + + // Copy reference and generic paths to new graph. + gbz->graph.for_each_path_matching({PathSense::GENERIC, PathSense::REFERENCE}, {}, {}, [&](const path_handle_t& path) { + + handlegraph::algorithms::copy_path(&(gbz->graph), path, graph.get()); + }); + + haplotype_index = make_unique(gbz->index); + } + + if (graph == nullptr) { + cerr << "[transcriptome] ERROR: Could not load graph." << endl; + exit(1); + } + + // Construct transcriptome and parse graph. + Transcriptome transcriptome(move(graph)); + assert(graph == nullptr); + + transcriptome.show_progress = show_progress; + transcriptome.num_threads = num_threads; + transcriptome.feature_type = feature_type; + transcriptome.transcript_tag = transcript_tag; + transcriptome.path_collapse_type = path_collapse_type; + + if (show_progress) { cerr << "[vg rna] Graph " << ((!haplotype_index->empty()) ? "and GBWT index " : "") << "parsed in " << gcsa::readTimer() - time_parsing_start << " seconds, " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; }; + + + if (!intron_filenames.empty()) { + + double time_intron_start = gcsa::readTimer(); + if (show_progress) { cerr << "[vg rna] Adding intron splice-junctions to graph ..." << endl; } + + vector intron_streams; + intron_streams.reserve(intron_filenames.size()); + + for (auto & filename: intron_filenames) { + + auto intron_stream = new ifstream(filename); + intron_streams.emplace_back(intron_stream); + } + + // Add introns as novel splice-junctions to graph. + transcriptome.add_intron_splice_junctions(intron_streams, haplotype_index, true); + + for (auto & intron_stream: intron_streams) { + + delete intron_stream; + } + + if (show_progress) { cerr << "[vg rna] Introns parsed and graph updated in " << gcsa::readTimer() - time_intron_start << " seconds, " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; }; + } + + + vector transcript_streams; + + if (!transcript_filenames.empty()) { + + double time_transcript_start = gcsa::readTimer(); + if (show_progress) { cerr << "[vg rna] Adding transcript splice-junctions and exon boundaries to graph ..." 
<< endl; } + + transcript_streams.reserve(transcript_filenames.size()); + + for (auto & filename: transcript_filenames) { + + auto transcript_stream = new ifstream(filename); + transcript_streams.emplace_back(transcript_stream); + } + + // Add transcripts as novel exon boundaries and splice-junctions to graph. + transcriptome.add_reference_transcripts(transcript_streams, haplotype_index, use_hap_ref, !use_hap_ref); + + if (show_progress) { cerr << "[vg rna] Transcripts parsed and graph updated in " << gcsa::readTimer() - time_transcript_start << " seconds, " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; }; + } + + + if (!transcript_streams.empty() && (!haplotype_index->empty() || proj_emded_paths) && !use_hap_ref) { + + double time_project_start = gcsa::readTimer(); + if (show_progress) { cerr << "[vg rna] Projecting transcripts to haplotypes ..." << endl; } + + for (auto & transcript_stream: transcript_streams) { + + // Reset transcript file streams. + transcript_stream->clear(); + transcript_stream->seekg(0); + } + + // Add transcripts to transcriptome by projecting them onto embedded paths + // in a graph and/or haplotypes in a GBWT index. + transcriptome.add_haplotype_transcripts(transcript_streams, *haplotype_index, proj_emded_paths); + + if (show_progress) { cerr << "[vg rna] Haplotype-specific transcripts constructed in " << gcsa::readTimer() - time_project_start << " seconds, " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; }; + } + + for (auto & transcript_stream: transcript_streams) { + + delete transcript_stream; + } + + + if (remove_non_transcribed_nodes) { + + double time_remove_start = gcsa::readTimer(); + if (show_progress) { cerr << "[vg rna] Removing non-transcribed regions ..." << endl; } + + transcriptome.remove_non_transcribed_nodes(); + + if (show_progress) { cerr << "[vg rna] Regions removed in " << gcsa::readTimer() - time_remove_start << " seconds, " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; }; + } + + + if (max_node_length > 0) { + + double time_chop_start = gcsa::readTimer(); + if (show_progress) { cerr << "[vg rna] Chopping long nodes ..." << endl; } + + transcriptome.chop_nodes(max_node_length); + + if (show_progress) { cerr << "[vg rna] Nodes chopped in " << gcsa::readTimer() - time_chop_start << " seconds, " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; }; + } + + + if (sort_collapse_graph) { + + double time_sort_start = gcsa::readTimer(); + if (show_progress) { cerr << "[vg rna] Topological sorting graph and compacting node ids ..." << endl; } + + if (transcriptome.sort_compact_nodes()) { + + if (show_progress) { cerr << "[vg rna] Graph sorted and compacted in " << gcsa::readTimer() - time_sort_start << " seconds, " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; }; + + } else { + + if (show_progress) { cerr << "[vg rna] WARNING: Can only sort and compact node ids for a graph in the PackedGraph format" << endl; }; + } + } + + + if (add_reference_transcript_paths || add_projected_transcript_paths) { + + double time_add_start = gcsa::readTimer(); + + if (add_reference_transcript_paths && add_projected_transcript_paths) { + + if (show_progress) { cerr << "[vg rna] Adding reference and projected transcripts as embedded paths in the graph ..." << endl; } + + } else { + + if (show_progress) { cerr << "[vg rna] Adding " << ((add_reference_transcript_paths) ? "reference" : "projected") << " transcripts as embedded paths in the graph ..." 
<< endl; } + } + + transcriptome.embed_transcript_paths(add_reference_transcript_paths, add_projected_transcript_paths); + + if (show_progress) { cerr << "[vg rna] Transcript paths added in " << gcsa::readTimer() - time_add_start << " seconds, " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; }; + } + + + double time_writing_start = gcsa::readTimer(); + + bool write_pantranscriptome = (!gbwt_out_filename.empty() || !fasta_out_filename.empty() || !info_out_filename.empty()); + + if (write_pantranscriptome) { + + if (show_progress) { cerr << "[vg rna] Writing pantranscriptome transcripts to file(s) ..." << endl; } + } + + // Write transcript paths in transcriptome as GBWT index. + if (!gbwt_out_filename.empty()) { + + // Silence GBWT index construction. + gbwt::Verbosity::set(gbwt::Verbosity::SILENT); + gbwt::GBWTBuilder gbwt_builder(gbwt::bit_length(gbwt::Node::encode(transcriptome.graph().max_node_id(), true)), gbwt::DynamicGBWT::INSERT_BATCH_SIZE, gbwt::DynamicGBWT::SAMPLE_INTERVAL); + + transcriptome.add_transcripts_to_gbwt(&gbwt_builder, gbwt_add_bidirectional, exclude_reference_transcripts); + + assert(gbwt_builder.index.hasMetadata()); + + // Finish contruction and recode index. + gbwt_builder.finish(); + save_gbwt(gbwt_builder.index, gbwt_out_filename); + } + + // Write transcript sequences in transcriptome as fasta file. + if (!fasta_out_filename.empty()) { + + ofstream fasta_ostream; + fasta_ostream.open(fasta_out_filename); + + transcriptome.write_transcript_sequences(&fasta_ostream, exclude_reference_transcripts); + + fasta_ostream.close(); + } + + // Write transcript info in transcriptome as tsv file. + if (!info_out_filename.empty()) { + + ofstream info_ostream; + info_ostream.open(info_out_filename); + + transcriptome.write_transcript_info(&info_ostream, *haplotype_index, exclude_reference_transcripts); + + info_ostream.close(); + } + + if (show_progress) { cerr << "[vg rna] Writing splicing graph to stdout ..." << endl; } + + // Write splicing graph to stdout + transcriptome.write_graph(&cout); + + if (show_progress) { cerr << "[vg rna] Graph " << (write_pantranscriptome ? "and pantranscriptome " : "") << "written in " << gcsa::readTimer() - time_writing_start << " seconds, " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; }; + + return 0; +} + +// Register subcommand +static Subcommand vg_rna("rna", "construct splicing graphs and pantranscriptomes", PIPELINE, 3, main_rna); + diff --git a/src/subcommand/sift_main.cpp b/src/subcommand/sift_main.cpp index 808210319ab..c3424bb4a7a 100644 --- a/src/subcommand/sift_main.cpp +++ b/src/subcommand/sift_main.cpp @@ -7,10 +7,10 @@ #include #include "subcommand.hpp" #include -#include "../stream.hpp" -#include "../json2pb.h" +#include +#include "vg/io/json2pb.h" #include "../vg.hpp" -#include "vg.pb.h" +#include #include "../filter.hpp" #include "../alignment.hpp" @@ -30,7 +30,6 @@ void help_sift(char** argv){ << "General Options: " << endl << " -t / --threads number of OMP threads (not all algorithms are parallelized)." << endl //<< " -v / --inverse return the inverse of a query (like grep -v)" << endl - << " -x / --xg An XG index (for realignment of split reads)" << endl << " -p / --paired Input reads are paired-end" << endl << " -R / --remap Remap (locally) any soft-clipped, split, or discordant read pairs." 
<< endl << " -o / --output " << endl @@ -117,7 +116,7 @@ int main_sift(int argc, char** argv){ }; int option_index = 0; - c = getopt_long (argc, argv, "hut:vx:gG:pRo:I:W:OCDc:s:q:d:i:aw:r1", + c = getopt_long (argc, argv, "hut:vgG:pRo:I:W:OCDc:s:q:d:i:aw:r1", long_options, &option_index); // Detect the end of the options. @@ -489,15 +488,15 @@ int main_sift(int argc, char** argv){ } - stream::write_buffered(unmapped_stream, unmapped_selected, 100); - stream::write_buffered(discordant_stream, discordant_selected, 100); - stream::write_buffered(oea_stream, one_end_anchored, 100); - stream::write_buffered(insert_stream, insert_selected, 100); - stream::write_buffered(split_stream, split_selected, 100); - stream::write_buffered(clipped_stream, clipped_selected, 100); - stream::write_buffered(clean_stream, clean, 100); - stream::write_buffered(reversing_stream, reversing_selected, 100); - stream::write_buffered(perfect_stream, perfect, 100); + vg::io::write_buffered(unmapped_stream, unmapped_selected, 100); + vg::io::write_buffered(discordant_stream, discordant_selected, 100); + vg::io::write_buffered(oea_stream, one_end_anchored, 100); + vg::io::write_buffered(insert_stream, insert_selected, 100); + vg::io::write_buffered(split_stream, split_selected, 100); + vg::io::write_buffered(clipped_stream, clipped_selected, 100); + vg::io::write_buffered(clean_stream, clean, 100); + vg::io::write_buffered(reversing_stream, reversing_selected, 100); + vg::io::write_buffered(perfect_stream, perfect, 100); }; std::function single_filters = [&](Alignment& aln){ @@ -511,7 +510,7 @@ int main_sift(int argc, char** argv){ if (ff.soft_clip_filter(aln)){ clipped_selected.push_back(aln); } - //stream::write_buffered(clipped_stream, clipped_selected, 1000); + //vg::io::write_buffered(clipped_stream, clipped_selected, 1000); } if (do_quality){ @@ -542,7 +541,7 @@ int main_sift(int argc, char** argv){ if (alignment_file == "-"){ - stream::for_each_interleaved_pair_parallel(cin, pair_filters); + vg::io::for_each_interleaved_pair_parallel(cin, pair_filters); } else{ ifstream in; @@ -550,16 +549,16 @@ else{ if (in.good()){ // if (just_calc_insert){ - // stream::for_each_interleaved_pair_parallel(in, calc_insert); + // vg::io::for_each_interleaved_pair_parallel(in, calc_insert); // exit(0); // } if (is_paired){ cerr << "Processing..." 
<< endl; - stream::for_each_interleaved_pair_parallel(in, pair_filters); + vg::io::for_each_interleaved_pair_parallel(in, pair_filters); } else{ - stream::for_each_parallel(in, single_filters); + vg::io::for_each_parallel(in, single_filters); } } @@ -568,15 +567,15 @@ else{ help_sift(argv); } } - stream::write_buffered(unmapped_stream, unmapped_selected, 0); - stream::write_buffered(discordant_stream, discordant_selected, 0); - stream::write_buffered(oea_stream, one_end_anchored, 0); - stream::write_buffered(insert_stream, insert_selected, 0); - stream::write_buffered(split_stream, split_selected, 0); - stream::write_buffered(clipped_stream, clipped_selected, 0); - stream::write_buffered(clean_stream, clean, 0); - stream::write_buffered(reversing_stream, reversing_selected, 0); - stream::write_buffered(perfect_stream, perfect, 0); + vg::io::write_buffered(unmapped_stream, unmapped_selected, 0); + vg::io::write_buffered(discordant_stream, discordant_selected, 0); + vg::io::write_buffered(oea_stream, one_end_anchored, 0); + vg::io::write_buffered(insert_stream, insert_selected, 0); + vg::io::write_buffered(split_stream, split_selected, 0); + vg::io::write_buffered(clipped_stream, clipped_selected, 0); + vg::io::write_buffered(clean_stream, clean, 0); + vg::io::write_buffered(reversing_stream, reversing_selected, 0); + vg::io::write_buffered(perfect_stream, perfect, 0); buffer.clear(); @@ -586,6 +585,6 @@ else{ return 0; } -static Subcommand vg_sift("sift", "Filter Alignments by various metrics related to variant calling.", main_sift); +static Subcommand vg_sift("sift", "Filter Alignments by various metrics related to variant calling.", DEPRECATED, main_sift); diff --git a/src/subcommand/sim_main.cpp b/src/subcommand/sim_main.cpp index 83a4a32d8a8..212c1407a2e 100644 --- a/src/subcommand/sim_main.cpp +++ b/src/subcommand/sim_main.cpp @@ -10,29 +10,94 @@ #include #include +#include +#include #include "subcommand.hpp" #include "../vg.hpp" -#include "../mapper.hpp" +#include "../aligner.hpp" +#include "../gbwt_helper.hpp" +#include "vg/io/alignment_emitter.hpp" #include "../sampler.hpp" -#include "../stream.hpp" +#include +#include +#include +#include using namespace std; using namespace vg; using namespace vg::subcommand; +using namespace vg::io; + +// Gets the transcript IDs and TPM values from an RSEM output .tsv file +vector> parse_rsem_expression_file(istream& rsem_in) { + vector> return_val; + string line; + // skip the header line + getline(rsem_in, line); + line.clear(); + while (getline(rsem_in, line)) { + vector tokens; + stringstream strm(line); + string token; + while (getline(strm, token, '\t')) { + tokens.push_back(move(token)); + token.clear(); + } + if (tokens.size() != 8) { + cerr << "[vg sim] error: Cannot parse transcription file. Expected 8-column TSV file as produced by RSEM, got " << tokens.size() << " columns." 
<< endl; + exit(1); + } + return_val.emplace_back(tokens[0], parse(tokens[5])); + line.clear(); + } + return return_val; +} + +// Gets the trancript path name, the original transcript name, and the haplotype count from the vg rna -i file +vector> parse_haplotype_transcript_file(istream& haplo_tx_in) { + vector> return_val; + string line; + // skip the header line + getline(haplo_tx_in, line); + line.clear(); + while (getline(haplo_tx_in, line)) { + vector tokens; + stringstream strm(line); + string token; + while (getline(strm, token, '\t')) { + tokens.push_back(move(token)); + token.clear(); + } + if (tokens.size() != 5) { + cerr << "[vg sim] error: Cannot parse haplotype transcript file. Expected 5-column TSV file as produced by vg rna -i, got " << tokens.size() << " columns." << endl; + exit(1); + } + // contributing haplotypes are separeted by commas + size_t haplo_count = 1 + std::count(tokens[4].begin(), tokens[4].end(), ','); + return_val.emplace_back(tokens[0], tokens[2], haplo_count); + line.clear(); + } + return return_val; +} void help_sim(char** argv) { cerr << "usage: " << argv[0] << " sim [options]" << endl << "Samples sequences from the xg-indexed graph." << endl << endl - << "options:" << endl - << " -x, --xg-name FILE use the xg index in FILE" << endl - << " -F, --fastq FILE superpose errors matching the error profile of NGS reads in FILE (ignores -l,-f)" << endl - << " -I, --interleaved reads in FASTQ (-F) are interleaved read pairs" << endl - << " -P, --path PATH simulate from the given names path (multiple allowed)" << endl - << " -l, --read-length N write reads of length N" << endl + << "basic options:" << endl + << " -x, --xg-name FILE use the graph in FILE (required)" << endl << " -n, --num-reads N simulate N reads or read pairs" << endl + << " -l, --read-length N simulate reads of length N" << endl + << " -r, --progress show progress information" << endl + << "output options:" << endl + << " -a, --align-out write alignments in GAM-format" << endl + << " -J, --json-out write alignments in json" << endl + << " --multi-position annotate alignments with multiple reference positions" << endl + << "simulation parameters:" << endl + << " -F, --fastq FILE match the error profile of NGS reads in FILE, repeat for paired reads (ignores -l,-f)" << endl + << " -I, --interleaved reads in FASTQ (-F) are interleaved read pairs" << endl << " -s, --random-seed N use this specific seed for the PRNG" << endl << " -e, --sub-rate FLOAT base substitution rate (default 0.0)" << endl << " -i, --indel-rate FLOAT indel rate (default 0.0)" << endl @@ -42,8 +107,20 @@ void help_sim(char** argv) { << " -p, --frag-len N make paired end reads with given fragment length N" << endl << " -v, --frag-std-dev FLOAT use this standard deviation for fragment length estimation" << endl << " -N, --allow-Ns allow reads to be sampled from the graph with Ns in them" << endl - << " -a, --align-out generate true alignments on stdout rather than reads" << endl - << " -J, --json-out write alignments in json" << endl; + << " --max-tries N attempt sampling operations up to N times before giving up [100]" << endl + << " -t, --threads number of compute threads (only when using FASTQ with -F) [1]" << endl + << "simulate from paths:" << endl + << " -P, --path PATH simulate from this path (may repeat; cannot also give -T)" << endl + << " -A, --any-path simulate from any path (overrides -P)" << endl + << " -m, --sample-name NAME simulate from this sample (may repeat; requires -g)" << endl + << " -R, --ploidy-regex RULES 
use the given comma-separated list of colon-delimited REGEX:PLOIDY rules to assign" << endl + << " ploidies to contigs not visited by the selected samples, or to all contigs simulated" << endl + << " from if no samples are used. Unmatched contigs get ploidy 2." << endl + << " -g, --gbwt-name FILE use samples from this GBWT index" << endl + << " -T, --tx-expr-file FILE simulate from an expression profile formatted as RSEM output (cannot also give -P)" << endl + << " -H, --haplo-tx-file FILE transcript origin info table from vg rna -i (required for -T on haplotype transcripts)" << endl + << " -u, --unsheared sample from unsheared fragments" << endl + << " -E, --path-pos-file FILE output a TSV with sampled position on path of each read (requires -F)" << endl; } int main_sim(int argc, char** argv) { @@ -53,26 +130,58 @@ int main_sim(int argc, char** argv) { return 1; } - int read_length = 100; + #define OPT_MULTI_POSITION 1000 + #define OPT_MAX_TRIES 1001 + + string xg_name; int num_reads = 1; + int read_length = 100; + bool progress = false; + int threads = 1; + int seed_val = time(NULL); double base_error = 0; double indel_error = 0; bool forward_only = false; bool align_out = false; bool json_out = false; + bool multi_position_annotations = false; int fragment_length = 0; double fragment_std_dev = 0; bool reads_may_contain_Ns = false; - string xg_name; + size_t max_tries = 100; bool strip_bonuses = false; bool interleaved = false; + bool unsheared_fragments = false; double indel_prop = 0.0; double error_scale_factor = 1.0; string fastq_name; + string fastq_2_name; + string path_pos_filename; + // What path should we sample from? Empty string = the whole graph. vector path_names; + bool any_path = false; + // Sample from GBWT threads. + std::vector sample_names; + std::string gbwt_name; + + // When sampling from paths or GBWT threads, what ploidy should we assign to each path? + // Represented as a list of regexes (to match the whole path name) and ploidies. + // The first rule to match wins. + // When using GBWT threads, only applies to contigs with no threads in any sample. + // Each thread that does exist is ploidy 1. + std::vector> ploidy_rules; + + // Alternatively, which transcripts with how much expression? 
+ string rsem_file_name; + vector> transcript_expressions; + // If we made haplotype trancripts, we'll need a translation layer onto the + // expression profile + string haplotype_transcript_file_name; + vector> haplotype_transcripts; + int c; optind = 2; // force optind past command positional argument while (true) { @@ -80,27 +189,39 @@ int main_sim(int argc, char** argv) { { {"help", no_argument, 0, 'h'}, {"xg-name", required_argument, 0, 'x'}, + {"progress", no_argument, 0, 'r'}, {"fastq", required_argument, 0, 'F'}, {"interleaved", no_argument, 0, 'I'}, {"path", required_argument, 0, 'P'}, + {"any-path", no_argument, 0, 'A'}, + {"sample-name", required_argument, 0, 'm'}, + {"ploidy-regex", required_argument, 0, 'R'}, + {"gbwt-name", required_argument, 0, 'g'}, + {"tx-expr-file", required_argument, 0, 'T'}, + {"haplo-tx-file", required_argument, 0, 'H'}, {"read-length", required_argument, 0, 'l'}, {"num-reads", required_argument, 0, 'n'}, {"random-seed", required_argument, 0, 's'}, {"forward-only", no_argument, 0, 'f'}, {"align-out", no_argument, 0, 'a'}, {"json-out", no_argument, 0, 'J'}, + {"multi-position", no_argument, 0, OPT_MULTI_POSITION}, {"allow-Ns", no_argument, 0, 'N'}, + {"max-tries", required_argument, 0, OPT_MAX_TRIES}, + {"unsheared", no_argument, 0, 'u'}, {"sub-rate", required_argument, 0, 'e'}, {"indel-rate", required_argument, 0, 'i'}, {"indel-err-prop", required_argument, 0, 'd'}, {"scale-err", required_argument, 0, 'S'}, {"frag-len", required_argument, 0, 'p'}, {"frag-std-dev", required_argument, 0, 'v'}, + {"threads", required_argument, 0, 't'}, + {"path-usage", required_argument, 0, 'E'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hl:n:s:e:i:fax:Jp:v:Nd:F:P:S:I", + c = getopt_long (argc, argv, "hrl:n:s:e:i:fax:Jp:v:Nud:F:P:Am:R:g:T:H:S:It:E:", long_options, &option_index); // Detect the end of the options. 
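The expanded `vg sim` help text and option table above add several ways to choose what to simulate from: the whole graph, named paths, samples in a GBWT index, or an RSEM expression profile over (haplotype-specific) transcripts. As a rough illustration of how the new flags combine, here are two hypothetical invocations; the graph, GBWT, FASTQ and TSV file names, the sample name, and the contig regexes are placeholders, not anything shipped in this changeset:

```
# Simulate 1000 read pairs from the haplotypes of one GBWT sample, writing GAM
# to stdout, and use --ploidy-regex to give ploidy 1 to chrX and 0 to chrY on
# contigs that no selected sample visits (everything else defaults to ploidy 2).
vg sim -x graph.xg -g haplotypes.gbwt -m SAMPLE1 \
    -R "chrX:1,chrY:0" -n 1000 -l 150 -p 500 -v 50 -a > sim.gam

# Simulate single-end reads with an error profile trained on real FASTQ data,
# drawing reads from an RSEM expression profile over haplotype-specific
# transcripts already embedded as paths, using the vg rna -i table to map them.
vg sim -x graph.xg -F real_reads.fq -T rsem_output.tsv -H haplo_tx_info.tsv \
    -n 1000 -a > sim.gam
```

Note that, per the option checks added further down, `--sample-name` only makes sense together with `--gbwt-name`, and `--gbwt-name` with `--tx-expr-file` expects the haplotype-specific transcripts to be in the GBWT itself rather than supplied via `--haplo-tx-file`.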
@@ -114,8 +235,21 @@ int main_sim(int argc, char** argv) { xg_name = optarg; break; + case 'r': + progress = true; + break; + case 'F': - fastq_name = optarg; + if (fastq_name.empty()) { + fastq_name = optarg; + } + else if (fastq_2_name.empty()) { + fastq_2_name = optarg; + } + else { + cerr << "error: cannot provide more than 2 FASTQs to train simulator" << endl; + exit(1); + } break; case 'I': @@ -126,6 +260,48 @@ int main_sim(int argc, char** argv) { path_names.push_back(optarg); break; + case 'A': + any_path = true; + break; + + case 'm': + sample_names.push_back(optarg); + break; + + case 'R': + for (auto& rule : split_delims(optarg, ",")) { + // For each comma-separated rule + auto parts = split_delims(rule, ":"); + if (parts.size() != 2) { + cerr << "error: ploidy rules must be REGEX:PLOIDY" << endl; + exit(1); + } + try { + // Parse the regex + std::regex match(parts[0]); + double weight = parse(parts[1]); + // Save the rule + ploidy_rules.emplace_back(match, weight); + } catch (const std::regex_error& e) { + // This is not a good regex + cerr << "error: unacceptable regular expression \"" << parts[0] << "\": " << e.what() << endl; + exit(1); + } + } + break; + + case 'g': + gbwt_name = optarg; + break; + + case 'T': + rsem_file_name = optarg; + break; + + case 'H': + haplotype_transcript_file_name = optarg; + break; + case 'l': read_length = parse(optarg); break; @@ -171,10 +347,22 @@ int main_sim(int argc, char** argv) { json_out = true; align_out = true; break; + + case OPT_MULTI_POSITION: + multi_position_annotations = true; + break; case 'N': reads_may_contain_Ns = true; break; + + case OPT_MAX_TRIES: + max_tries = parse(optarg); + break; + + case 'u': + unsheared_fragments = true; + break; case 'p': fragment_length = parse(optarg); @@ -183,6 +371,14 @@ int main_sim(int argc, char** argv) { case 'v': fragment_std_dev = parse(optarg); break; + + case 't': + threads = parse(optarg); + break; + + case 'E': + path_pos_filename = optarg; + break; case 'h': case '?': @@ -194,55 +390,435 @@ int main_sim(int argc, char** argv) { abort (); } } + + omp_set_num_threads(threads); + + // We'll fill this in with ploidies for each path in path_names + std::vector path_ploidies; + // When we need to consult the ploidy rules about a contig nemr we call this function. + auto consult_ploidy_rules = [&](const std::string& name) { + for (auto& rule : ploidy_rules) { + if (std::regex_match(name, rule.first)) { + // This rule should apply to this contig + return rule.second; + } + } + // Unmatched contigs get ploidy 2. + // 1 makes no sense in the context of a genomic reference. + // 0 makes no sense for --all-paths which consults the rules for all the names. 
+ return 2.0; + }; if (xg_name.empty()) { - cerr << "[vg sim] error: we need an xg index to sample reads from" << endl; + cerr << "[vg sim] error: we need a graph to sample reads from" << endl; return 1; } - - xg::XG* xgidx = nullptr; - ifstream xg_stream(xg_name); - if(xg_stream) { - xgidx = new xg::XG(xg_stream); + if (!gbwt_name.empty() && sample_names.empty() && rsem_file_name.empty()) { + cerr << "[vg sim] error: --gbwt-name requires --sample-name or --tx-expr-file" << endl; + return 1; } - if (!xg_stream || xgidx == nullptr) { - cerr << "[vg sim] error: could not open xg index" << endl; + if (!sample_names.empty() && gbwt_name.empty()) { + cerr << "[vg sim] error: --sample-name must be used with --gbwt-name" << endl; + return 1; + } + if (!gbwt_name.empty() && !rsem_file_name.empty() && !haplotype_transcript_file_name.empty()) { + cerr << "[vg sim] error: using --gbwt-name requires that HSTs be included --tx-expr-file, combination with --haplo-tx-file is not implemented" << endl; return 1; } - for (auto& path_name : path_names) { - if (xgidx->path_rank(path_name) == 0) { - cerr << "[vg sim] error: path \""<< path_name << "\" not found in index" << endl; + if (!rsem_file_name.empty()) { + if (progress) { + std::cerr << "Reading transcription profile from " << rsem_file_name << std::endl; + } + ifstream rsem_in(rsem_file_name); + if (!rsem_in) { + cerr << "[vg sim] error: could not open transcription profile file " << rsem_file_name << endl; + return 1; + } + transcript_expressions = parse_rsem_expression_file(rsem_in); + } + + if (!haplotype_transcript_file_name.empty()) { + if (progress) { + std::cerr << "Reading haplotype transcript file " << haplotype_transcript_file_name << std::endl; + } + ifstream haplo_tx_in(haplotype_transcript_file_name); + if (!haplo_tx_in) { + cerr << "[vg sim] error: could not open haplotype transcript file " << haplotype_transcript_file_name << endl; return 1; } + haplotype_transcripts = parse_haplotype_transcript_file(haplo_tx_in); } - // Make a Mapper to score reads, with the default parameters - Mapper rescorer(xgidx, nullptr, nullptr); - // We define a function to score a generated alignment under the mapper - auto rescore = [&] (Alignment& aln) { - // Score using exact distance. - aln.set_score(rescorer.score_alignment(aln, false)); - }; + if (progress) { + std::cerr << "Loading graph " << xg_name << std::endl; + } + unique_ptr path_handle_graph = vg::io::VPKG::load_one(xg_name); + + if (!path_pos_filename.empty() && fastq_name.empty()) { + cerr << "[vg sim] error: path usage table is not available unless using trained simulation (-F)" << endl; + exit(1); + } + + if (fastq_name.empty() && unsheared_fragments) { + cerr << "[vg sim] error: unsheared fragment option only available when simulating from FASTQ-trained errors" << endl; + exit(1); + } + + // Deal with path names. Do this before we create paths to represent threads. 
+ if (any_path) { + if (progress) { + std::cerr << "Selecting all " << path_handle_graph->get_path_count() << " paths" << std::endl; + } + if (path_handle_graph->get_path_count() == 0) { + cerr << "[vg sim] error: the graph does not contain paths" << endl; + return 1; + } + path_names.clear(); + path_handle_graph->for_each_path_handle([&](const path_handle_t& handle) { + // For each path in the graph + auto name = path_handle_graph->get_path_name(handle); + // Simulate from it + path_names.push_back(name); + // At ploidy defined by the rules (default 2) + path_ploidies.push_back(consult_ploidy_rules(name)); + }); + } else if (!path_names.empty()) { + if (progress) { + std::cerr << "Checking " << path_names.size() << " selected paths" << std::endl; + } + for (auto& path_name : path_names) { + if (path_handle_graph->has_path(path_name) == false) { + cerr << "[vg sim] error: path \""<< path_name << "\" not found in index" << endl; + return 1; + } + // Synthesize ploidies for explicitly specified paths + path_ploidies.push_back(consult_ploidy_rules(path_name)); + } + } + + // We may add some paths to our graph. If so, we need to ignore them when + // annotating with path positions, because they will be useless. + unordered_set inserted_path_names; + // Deal with GBWT threads + if (!gbwt_name.empty()) { + + // We need to track the contigs that have not had any threads in any sample + std::unordered_set unvisited_contigs; + + if (progress) { + std::cerr << "Loading GBWT index " << gbwt_name << std::endl; + } + std::unique_ptr gbwt_index = vg::io::VPKG::load_one(gbwt_name); + if (!(gbwt_index->hasMetadata()) || !(gbwt_index->metadata.hasSampleNames()) || !(gbwt_index->metadata.hasPathNames())) { + std::cerr << "[vg sim] error: GBWT index does not contain sufficient metadata" << std::endl; + return 1; + } + + // we will add these threads to the graph as named paths and index them for easy look up + hash_map sample_id_to_idx; + + if (!sample_names.empty()) { + // we're consulting the provided sample names to determine which threads to include + + // We need to track the contigs that have not had any threads in any sample + if (!ploidy_rules.empty()) { + // We actually want to visit them, so we have to find them + if (progress) { + std::cerr << "Inventorying contigs" << std::endl; + } + path_handle_graph->for_each_path_handle([&](const path_handle_t& handle) { + // For each path in the graph + auto name = path_handle_graph->get_path_name(handle); + if (!Paths::is_alt(name)) { + // TODO: We assume that if it isn't an alt path it represents a contig! + // TODO: We may need to change this when working with graphs with multiple sets of primary paths, or other extra paths. 
+ unvisited_contigs.insert(name); + } + }); + } + + if (progress) { + std::cerr << "Checking " << sample_names.size() << " samples" << std::endl; + } + + for (std::string& sample_name : sample_names) { + gbwt::size_type id = gbwt_index->metadata.sample(sample_name); + if (id >= gbwt_index->metadata.samples()) { + std::cerr << "[vg sim] error: sample \"" << sample_name << "\" not found in the GBWT index" << std::endl; + return 1; + } + auto idx = sample_id_to_idx.size(); + sample_id_to_idx[id] = idx; + } + } + else { + // we are consulting the transcript expression table to decide which threads to include + for (const auto& transcript_expression : transcript_expressions) { + gbwt::size_type id = gbwt_index->metadata.sample(transcript_expression.first); + if (id >= gbwt_index->metadata.samples()) { + std::cerr << "[vg sim] error: haplotype-specific transcript \"" << transcript_expression.first << "\" not found in the GBWT index" << std::endl; + return 1; + } + auto idx = sample_id_to_idx.size(); + sample_id_to_idx[id] = idx; + } + } + + MutablePathMutableHandleGraph* mutable_graph = dynamic_cast(path_handle_graph.get()); + if (mutable_graph == nullptr) { + if (progress) { + std::cerr << "Converting the graph into HashGraph" << std::endl; + } + mutable_graph = new bdsg::HashGraph(); + handlealgs::copy_path_handle_graph(path_handle_graph.get(), mutable_graph); + path_handle_graph.reset(mutable_graph); + } + if (progress) { + std::cerr << "Inserting " << sample_id_to_idx.size() << " GBWT threads into the graph" << std::endl; + } + + for (gbwt::size_type i = 0; i < gbwt_index->metadata.paths(); i++) { + auto& path = gbwt_index->metadata.path(i); + auto it = sample_id_to_idx.find(path.sample); + if (it != sample_id_to_idx.end()) { + std::string path_name = insert_gbwt_path(*mutable_graph, *gbwt_index, i); + if (!path_name.empty()) { + // path was successfully added + if (!sample_names.empty()) { + // assign this haplotype a ploidy of 1 + + // We managed to make a path for this thread + path_names.push_back(path_name); + // It should have ploidy 1 + path_ploidies.push_back(1.0); + + if (!unvisited_contigs.empty()) { + // Remember that the contig this path is on is visited + auto contig_name = gbwt_index->metadata.contig(path.contig); + unvisited_contigs.erase(contig_name); + } + } + else { + // update the transcript name so we can assign it expression + // later down + transcript_expressions[it->second].first = path_name; + } + // Remember we inserted a path + inserted_path_names.insert(path_name); + } + } + } + if (progress) { + std::cerr << "Inserted " << inserted_path_names.size() << " paths" << std::endl; + } + if (!unvisited_contigs.empty()) { + // There are unvisited contigs we want to sample from too + for (auto& name : unvisited_contigs) { + // Sample from each + path_names.push_back(name); + // With the rule-determined ploidy + path_ploidies.push_back(consult_ploidy_rules(name)); + } + if (progress) { + std::cerr << "Also sampling from " << unvisited_contigs.size() << " paths representing unvisited contigs" << std::endl; + } + } + } + + if (haplotype_transcript_file_name.empty()) { + if (!transcript_expressions.empty()) { + if (progress) { + std::cerr << "Checking " << transcript_expressions.size() << " transcripts" << std::endl; + } + for (auto& transcript_expression : transcript_expressions) { + if (!path_handle_graph->has_path(transcript_expression.first)) { + cerr << "[vg sim] error: transcript path for \""<< transcript_expression.first << "\" not found in index" << endl; + cerr << 
"if you embedded haplotype-specific transcripts in the graph, you may need the haplotype transcript file from vg rna -i" << endl; + return 1; + } + } + } + } + else { + if (progress) { + std::cerr << "Checking " << haplotype_transcripts.size() << " haplotype transcripts" << std::endl; + } + for (auto& haplotype_transcript : haplotype_transcripts) { + if (!path_handle_graph->has_path(get<0>(haplotype_transcript))) { + cerr << "[vg sim] error: transcript path for \""<< get<0>(haplotype_transcript) << "\" not found in index" << endl; + return 1; + } + } + } + + if (progress) { + std::cerr << "Creating path position overlay" << std::endl; + } + + bdsg::ReferencePathVectorizableOverlayHelper overlay_helper; + PathPositionHandleGraph* xgidx = dynamic_cast(overlay_helper.apply(path_handle_graph.get())); + + // We want to store the inserted paths as a set of handles, which are + // easier to hash than strings for lookup. + unordered_set inserted_path_handles; + if (!inserted_path_names.empty()) { + if (progress) { + std::cerr << "Finding inserted paths" << std::endl; + } + for (auto& name : inserted_path_names) { + inserted_path_handles.insert(xgidx->get_path_handle(name)); + } + } + + unique_ptr alignment_emitter; + if (align_out) { + // We're writing in an alignment format + alignment_emitter = get_non_hts_alignment_emitter("-", json_out ? "JSON" : "GAM", + map(), get_thread_count()); + } + // Otherwise we're just dumping sequence strings; leave it null. + + if (progress) { + std::cerr << "Simulating " << (fragment_length > 0 ? "read pairs" : "reads") << std::endl; + std::cerr << "--num-reads " << num_reads << std::endl; + std::cerr << "--read-length " << read_length << std::endl; + if (align_out) { + std::cerr << "--align-out" << std::endl; + } + if (json_out) { + std::cerr << "--json-out" << std::endl; + } + if (!fastq_name.empty()) { + std::cerr << "--fastq " << fastq_name << std::endl; + if (!fastq_2_name.empty()) { + std::cerr << "--fastq " << fastq_2_name << std::endl; + } + if (interleaved) { + std::cerr << "--interleaved" << std::endl; + } + } else { + if (base_error > 0.0) { + std::cerr << "--sub-rate " << base_error << std::endl; + } + } + if (indel_error > 0.0) { + std::cerr << "--indel-rate " << indel_error << std::endl; + } + if (!fastq_name.empty()) { + if (indel_prop > 0.0) { + std::cerr << "--indel-err-prop " << indel_prop << std::endl; + } + if (error_scale_factor != 1.0) { + std::cerr << "--scale-err " << error_scale_factor << std::endl; + } + } + if (forward_only) { + std::cerr << "--forward-only" << std::endl; + } + if (fragment_length > 0) { + std::cerr << "--frag-len " << fragment_length << std::endl; + if (fragment_std_dev > 0.0) { + std::cerr << "--frag-std-dev " << fragment_std_dev << std::endl; + } + } + if (reads_may_contain_Ns) { + std::cerr << "--allow-Ns" << std::endl; + } + if (max_tries != 100) { + std::cerr << "--max-tries" << max_tries << std::endl; + } + } + + unique_ptr sampler; if (fastq_name.empty()) { // Use the fixed error rate sampler - // Make a sample to sample reads with - Sampler sampler(xgidx, seed_val, forward_only, reads_may_contain_Ns, path_names); + if (unsheared_fragments) { + cerr << "warning: Unsheared fragment option only available when simulating from FASTQ-trained errors" << endl; + } - // Make a Mapper to score reads, with the default parameters - Mapper rescorer(xgidx, nullptr, nullptr); - // Override the "default" full length bonus, just like every other subcommand that uses a mapper ends up doing. 
- // TODO: is it safe to change the default? - rescorer.set_alignment_scores(default_match, default_mismatch, default_gap_open, default_gap_extension, default_full_length_bonus); - // Include the full length bonuses if requested. - rescorer.strip_bonuses = strip_bonuses; - // We define a function to score a generated alignment under the mapper - auto rescore = [&] (Alignment& aln) { - // Score using exact distance. - aln.set_score(rescorer.score_alignment(aln, false)); + sampler.reset(new Sampler(xgidx, seed_val, forward_only, reads_may_contain_Ns, path_names, path_ploidies, transcript_expressions, haplotype_transcripts)); + } else { + // Use the FASTQ-trained sampler + sampler.reset(new NGSSimulator(*xgidx, + fastq_name, + fastq_2_name, + interleaved, + path_names, + path_ploidies, + transcript_expressions, + haplotype_transcripts, + base_error, + indel_error, + indel_prop, + fragment_length ? fragment_length : std::numeric_limits::max(), // suppresses warnings about fragment length + fragment_std_dev ? fragment_std_dev : 0.000001, // eliminates errors from having 0 as stddev without substantial difference + error_scale_factor, + !reads_may_contain_Ns, + unsheared_fragments, + seed_val)); + } + + // Do common configuration + sampler->multi_position_annotations = multi_position_annotations; + sampler->max_tries = max_tries; + if (!inserted_path_handles.empty()) { + // Skip paths that we have added ourselves when annotating, so we search + // further for base-graph path. + std::function annotation_path_filter = [&inserted_path_handles](const path_handle_t& path) { + return !inserted_path_handles.count(path); }; + sampler->annotation_path_filter = std::make_unique>(std::move(annotation_path_filter)); + } + + // Generate an Aligner for rescoring + Aligner aligner(default_score_matrix, default_gap_open, default_gap_extension, + default_full_length_bonus, vg::default_gc_content); + + // We define a function to score a using the aligner + auto rescore = [&] (Alignment& aln) { + // Score using exact distance. + aln.set_score(aligner.score_contiguous_alignment(aln, strip_bonuses)); + }; + + // And a function to emit either single or paired reads, while recomputing scores. + auto emit = [&] (Alignment* r1, Alignment* r2) { + // write the alignment or its string + if (align_out) { + // write it out as requested + + // We will need scores + rescore(*r1); + if (r2) { + // And we have a paired read + rescore(*r2); + alignment_emitter->emit_pair(std::move(*r1), std::move(*r2)); + } else { + // We have just one read. + alignment_emitter->emit_single(std::move(*r1)); + } + } else { + // Print the sequences of the reads we have. + #pragma omp critical + { + cout << r1->sequence(); + if (r2) { + cout << "\t" << r2->sequence(); + } + cout << endl; + } + } + }; + + // The rest of the process has to split up by the type of sampler in use. + // TODO: Actually refactor to a common sampling interface. + + if (dynamic_cast(sampler.get())) { + // Not everything is bound to the new interface yet, so we need to do + // actual sampling through this typed pointer. 
+ Sampler* basic_sampler = dynamic_cast(sampler.get()); size_t max_iter = 1000; int nonce = 1; @@ -250,8 +826,8 @@ int main_sim(int argc, char** argv) { // For each read we are going to generate if (fragment_length) { - // fragment_lenght is nonzero so make it two paired reads - auto alns = sampler.alignment_pair(read_length, fragment_length, fragment_std_dev, base_error, indel_error); + // fragment_length is nonzero so make it two paired reads + auto alns = basic_sampler->alignment_pair(read_length, fragment_length, fragment_std_dev, base_error, indel_error); size_t iter = 0; while (iter++ < max_iter) { @@ -259,38 +835,22 @@ int main_sim(int argc, char** argv) { if (alns.front().sequence().size() < read_length || alns.back().sequence().size() < read_length) { // If our read was too short, try again - alns = sampler.alignment_pair(read_length, fragment_length, fragment_std_dev, base_error, indel_error); + alns = basic_sampler->alignment_pair(read_length, fragment_length, fragment_std_dev, base_error, indel_error); } } // write the alignment or its string - if (align_out) { - // write it out as requested - - // We will need scores - rescore(alns.front()); - rescore(alns.back()); - - if (json_out) { - cout << pb2json(alns.front()) << endl; - cout << pb2json(alns.back()) << endl; - } else { - function lambda = [&alns](size_t n) { return alns[n]; }; - stream::write(cout, 2, lambda); - } - } else { - cout << alns.front().sequence() << "\t" << alns.back().sequence() << endl; - } + emit(&alns.front(), &alns.back()); } else { // Do single-end reads - auto aln = sampler.alignment_with_error(read_length, base_error, indel_error); + auto aln = basic_sampler->alignment_with_error(read_length, base_error, indel_error); size_t iter = 0; while (iter++ < max_iter) { // For up to max_iter iterations if (aln.sequence().size() < read_length) { // If our read is too short, try again - auto aln_prime = sampler.alignment_with_error(read_length, base_error, indel_error); + auto aln_prime = basic_sampler->alignment_with_error(read_length, base_error, indel_error); if (aln_prime.sequence().size() > aln.sequence().size()) { // But only keep the new try if it is longer aln = aln_prime; @@ -298,95 +858,41 @@ int main_sim(int argc, char** argv) { } } - // write the alignment or its string - if (align_out) { - // write it out as requested - - // We will need scores - rescore(aln); - - if (json_out) { - cout << pb2json(aln) << endl; - } else { - function lambda = [&aln](size_t n) { return aln; }; - stream::write(cout, 1, lambda); - } - } else { - cout << aln.sequence() << endl; - } + // Emit the unpaired alignment + emit(&aln, nullptr); } } - } - else { + } else if (dynamic_cast(sampler.get())) { // Use the trained error rate - Aligner aligner(default_match, default_mismatch, default_gap_open, default_gap_extension, 5); + // Not everything is bound to the new interface yet, so we need to do + // actual sampling through this typed pointer. + NGSSimulator* ngs_sampler = dynamic_cast(sampler.get()); - NGSSimulator sampler(*xgidx, - fastq_name, - interleaved, - path_names, - base_error, - indel_error, - indel_prop, - fragment_length ? fragment_length : std::numeric_limits::max(), // suppresses warnings about fragment length - fragment_std_dev ? 
fragment_std_dev : 0.000001, // eliminates errors from having 0 as stddev without substantial difference - error_scale_factor, - !reads_may_contain_Ns, - seed_val); + if (!path_pos_filename.empty()) { + ngs_sampler->connect_to_position_file(path_pos_filename); + } - if (fragment_length) { - for (size_t i = 0; i < num_reads; i++) { - pair read_pair = sampler.sample_read_pair(); - read_pair.first.set_score(aligner.score_ungapped_alignment(read_pair.first, strip_bonuses)); - read_pair.second.set_score(aligner.score_ungapped_alignment(read_pair.second, strip_bonuses)); - - if (align_out) { - if (json_out) { - cout << pb2json(read_pair.first) << endl; - cout << pb2json(read_pair.second) << endl; - } - else { - function lambda = [&read_pair](size_t n) { - return n % 2 == 0 ? read_pair.first : read_pair.second; - }; - stream::write(cout, 2, lambda); - } - } - else { - cout << read_pair.first.sequence() << "\t" << read_pair.second.sequence() << endl; - } + // static scheduling could produce some degradation in speed, but i think it should make + // the output deterministic (except for ordering) +#pragma omp parallel for schedule(static) + for (size_t i = 0; i < num_reads; i++) { + if (fragment_length) { + pair read_pair = ngs_sampler->sample_read_pair(); + emit(&read_pair.first, &read_pair.second); } - } - else { - for (size_t i = 0; i < num_reads; i++) { - Alignment read = sampler.sample_read(); - read.set_score(aligner.score_ungapped_alignment(read, strip_bonuses)); - - if (align_out) { - if (json_out) { - cout << pb2json(read) << endl; - } - else { - function lambda = [&read](size_t n) { - return read; - }; - stream::write(cout, 1, lambda); - } - } - else { - cout << read.sequence() << endl; - } + else { + Alignment read = ngs_sampler->sample_read(); + emit(&read, nullptr); } } + } else { + // We don't know about this sampler type. + // TODO: Define a real sampler interface that lets you sample. 
+ throw std::logic_error("Attempted to use sampler type for which sampling is not implemented!"); } - if (align_out && !json_out) { - // We wrote alignment data, so write an EOF - stream::finish(cout); - } - return 0; } diff --git a/src/subcommand/simplify_main.cpp b/src/subcommand/simplify_main.cpp index 963ee5fe50c..22500642819 100644 --- a/src/subcommand/simplify_main.cpp +++ b/src/subcommand/simplify_main.cpp @@ -10,8 +10,8 @@ #include "subcommand.hpp" #include "../vg.hpp" -#include "../simplifier.hpp" - +#include "../small_snarl_simplifier.hpp" +#include "../rare_variant_simplifier.hpp" @@ -21,13 +21,19 @@ using namespace vg::subcommand; void help_simplify(char** argv) { cerr << "usage: " << argv[0] << " simplify [options] old.vg >new.vg" << endl - << "options:" << endl - << " -m, --min-size N remove leaf sites with fewer than N bases involved (default: 10)" << endl - << " -i, --max-iterations N perform up to N iterations of simplification (default: 10)" << endl + << "general options:" << endl + << " -a, --algorithm NAME simplify using the given algorithm (small, rare; default: small)" << endl + << " -t, --threads N use N threads to construct graph (defaults to numCPUs)" << endl << " -p, --progress show progress" << endl << " -b, --bed-in read in the given BED file in the cordinates of the original paths" << endl << " -B, --bed-out output transformed features in the coordinates of the new paths" << endl - << " -t, --threads N use N threads to construct graph (defaults to numCPUs)" << endl; + << "small snarl simplifier options:" << endl + << " -m, --min-size N remove leaf sites with fewer than N bases involved (default: 10)" << endl + << " -i, --max-iterations N perform up to N iterations of simplification (default: 10)" << endl + << "rare variant simplifier options:" << endl + << " -v, --vcf FILE use the given VCF file to determine variant frequency (required)" << endl + << " -f, --min-freq FLOAT remove variants with total alt frequency under FLOAT (default: 0)" << endl + << " -c, --min-count N remove variants with total alt occurrence count under N (default: 0)" << endl; } int main_simplify(int argc, char** argv) { @@ -37,32 +43,44 @@ int main_simplify(int argc, char** argv) { return 1; } + // What algorithm should we use for simplification ("small" or "rare"). + string algorithm = "small"; - // TODO: The simplifier needs the graph when we make it, so we can't store - // our settings in it directly. 
- size_t min_size = 10; - size_t max_iterations = 10; + // General options string bed_in_filename; string bed_out_filename; bool show_progress = false; + // For simplifying small variants + size_t min_size = 10; + size_t max_iterations = 10; + + // For simplifying rare variants + string vcf_filename; + double min_frequency = 0; + size_t min_count = 0; + int c; optind = 2; // force optind past command positional argument while (true) { static struct option long_options[] = { - {"min-size", required_argument, 0, 'm'}, - {"max-iterations", required_argument, 0, 'i'}, + {"algorithm", required_argument, 0, 'a'}, {"progress", no_argument, 0, 'p'}, + {"threads", required_argument, 0, 't'}, {"bed-in", required_argument, 0, 'b'}, {"bed-out", required_argument, 0, 'B'}, - {"threads", required_argument, 0, 't'}, + {"min-size", required_argument, 0, 'm'}, + {"max-iterations", required_argument, 0, 'i'}, + {"vcf", required_argument, 0, 'v'}, + {"min-freq", required_argument, 0, 'f'}, + {"min-count", required_argument, 0, 'c'}, {"help", no_argument, 0, 'h'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "m:i:pb:B:t:h?", + c = getopt_long (argc, argv, "a:pt:b:B:m:i:v:f:c:h?", long_options, &option_index); /* Detect the end of the options. */ @@ -72,17 +90,17 @@ int main_simplify(int argc, char** argv) { switch (c) { - case 'm': - min_size = parse(optarg); - break; - - case 'i': - max_iterations = parse(optarg); + case 'a': + algorithm = optarg; break; case 'p': show_progress = true; break; + + case 't': + omp_set_num_threads(parse(optarg)); + break; case 'b': bed_in_filename = optarg; @@ -92,8 +110,24 @@ int main_simplify(int argc, char** argv) { bed_out_filename = optarg; break; - case 't': - omp_set_num_threads(parse(optarg)); + case 'm': + min_size = parse(optarg); + break; + + case 'i': + max_iterations = parse(optarg); + break; + + case 'v': + vcf_filename = optarg; + break; + + case 'f': + min_frequency = parse(optarg); + break; + + case 'c': + min_count = parse(optarg); break; case 'h': @@ -108,55 +142,93 @@ int main_simplify(int argc, char** argv) { } } - - // TODO: move all this to a simplifier object + + // Do preliminary options checks + if (!bed_out_filename.empty() && bed_in_filename.empty()) { + // Don't allow writing out a BED without reading one + cerr << "error[vg simplify]: Cannot output a bed (-B) unless a BED is read in first (-b)" << endl; + exit(1); + } // Load the graph - VG* graph; + unique_ptr graph; get_input_file(optind, argc, argv, [&](istream& in) { - graph = new VG(in, show_progress); + graph = unique_ptr(new VG(in, show_progress)); }); if (graph == nullptr) { cerr << "error:[vg simplify]: Could not load graph" << endl; exit(1); } - - { - // Make a Simplifier for the graph and copy over settings. Need sto be - // in a block so that the graph doesn't get deleted before the - // simplifier goes out of scope. - Simplifier simplifier(*graph); + + // This will hold BED features if we are tracking those + unique_ptr features; + if (!bed_in_filename.empty()) { + // Go and ,oad up the BED features + get_input_file(bed_in_filename, [&](istream& bed_stream) { + features = unique_ptr(new FeatureSet()); + features->load_bed(bed_stream); + }); + } + + if (algorithm == "small") { + if (!vcf_filename.empty()) { + cerr << "error[vg simplify]: A VCF file (-v) cannot be used with small snarl simplification" << endl; + exit(1); + } + + // Make a SmallSnarlSimplifier for the graph and copy over settings. 
+ SmallSnarlSimplifier simplifier(*graph); simplifier.show_progress = show_progress; simplifier.max_iterations = max_iterations; simplifier.min_size = min_size; - - if (!bed_in_filename.empty()) { - // Load BED features - ifstream bed_stream(bed_in_filename.c_str()); - simplifier.features.load_bed(bed_stream); - } - + simplifier.features = features.get(); + // Do the simplification simplifier.simplify(); - - // Serialize the graph - graph->serialize_to_ostream(std::cout); - - if (!bed_out_filename.empty()) { - // Save BED features - ofstream bed_stream(bed_out_filename.c_str()); - simplifier.features.save_bed(bed_stream); + } else if (algorithm == "rare") { + // We are going to remove rare variants as noted in a VCF + if (vcf_filename.empty()) { + cerr << "error[vg simplify]: \"rare\" simplification algorithm requires a VCF (-v)" << endl; + exit(1); } + + // Load the VCF + vcflib::VariantCallFile variant_file; + variant_file.parseSamples = false; // Major speedup if there are many samples. + variant_file.open(vcf_filename); + if (!variant_file.is_open()) { + cerr << "error:[vg simplify] could not open" << vcf_filename << endl; + exit(1); + } + + // Buffer it + VcfBuffer buffer(&variant_file); + + // Make a RareVariantSimplifier for the graph + RareVariantSimplifier simplifier(*graph, buffer); + + // Set its settings + simplifier.min_frequency_to_keep = min_frequency; + simplifier.min_count_to_keep = min_count; + + // Run it + simplifier.simplify(); + } else { + cerr << "error[vg simplify]: Unknown algorithm \"" << algorithm << "\"; use \"small\" or \"rare\"." << endl; + exit(1); + } + + // Serialize the graph + graph->serialize_to_ostream(std::cout); + if (!bed_out_filename.empty()) { + // Save BED features + assert(features.get() != nullptr); + ofstream bed_stream(bed_out_filename.c_str()); + features->save_bed(bed_stream); } - delete graph; - - // NB: If you worry about "still reachable but possibly lost" warnings in valgrind, - // this would free all the memory used by protobuf: - //ShutdownProtobufLibrary(); - return 0; } diff --git a/src/subcommand/snarls_main.cpp b/src/subcommand/snarls_main.cpp index 6a9969d9c0c..8587e93d2d2 100644 --- a/src/subcommand/snarls_main.cpp +++ b/src/subcommand/snarls_main.cpp @@ -11,26 +11,41 @@ #include "subcommand.hpp" #include "../vg.hpp" -#include "vg.pb.h" +#include #include "../traversal_finder.hpp" -#include "../stream.hpp" +#include "../cactus_snarl_finder.hpp" +#include "../integrated_snarl_finder.hpp" +#include "../gbwtgraph_helper.hpp" +#include "../algorithms/find_translation.hpp" +#include "../algorithms/back_translate.hpp" +#include +#include +//#define debug using namespace std; using namespace vg; using namespace vg::subcommand; void help_snarl(char** argv) { - cerr << "usage: " << argv[0] << " snarls [options] graph.vg > snarls.pb" << endl + cerr << "usage: " << argv[0] << " snarls [options] graph > snarls.pb" << endl << " By default, a list of protobuf Snarls is written" << endl << "options:" << endl - << " -p, --pathnames output variant paths as SnarlTraversals to STDOUT" << endl - << " -r, --traversals FILE output SnarlTraversals for ultrabubbles." << endl - << " -l, --leaf-only restrict traversals to leaf ultrabubbles." 
<< endl - << " -o, --top-level restrict traversals to top level ultrabubbles" << endl - << " -m, --max-nodes N only compute traversals for snarls with <= N nodes [10]" << endl - << " -t, --include-trivial report snarls that consist of a single edge" << endl - << " -s, --sort-snarls return snarls in sorted order by node ID (for topologically ordered graphs)" << endl; + << " -A, --algorithm NAME compute snarls using 'cactus' or 'integrated' algorithms (default: integrated)" << endl + << " -p, --pathnames output variant paths as SnarlTraversals to STDOUT" << endl + << " -r, --traversals FILE output SnarlTraversals for ultrabubbles." << endl + << " -e, --path-traversals only consider traversals that correspond to paths in the graph. (-m ignored)" << endl + << " -l, --leaf-only restrict traversals to leaf ultrabubbles." << endl + << " -o, --top-level restrict traversals to top level ultrabubbles" << endl + << " -a, --any-snarl-type compute traversals for any snarl type (not limiting to ultrabubbles)" << endl + << " -m, --max-nodes N only compute traversals for snarls with <= N nodes (with degree > 1) [10]" << endl + << " -n, --named-coordinates produce snarl and traversal outputs in named-segment (GFA) space" << endl + << " -T, --include-trivial report snarls that consist of a single edge" << endl + << " -s, --sort-snarls return snarls in sorted order by node ID (for topologically ordered graphs)" << endl + << " -v, --vcf FILE use vcf-based instead of exhaustive traversal finder with -r" << endl + << " -f --fasta FILE reference in FASTA format (required for SVs by -v)" << endl + << " -i --ins-fasta FILE insertion sequences in FASTA format (required for SVs by -v)" << endl + << " -t, --threads N number of threads to use [all available]" << endl; } int main_snarl(int argc, char** argv) { @@ -42,32 +57,47 @@ int main_snarl(int argc, char** argv) { static const int buffer_size = 100; + string algorithm = "integrated"; string traversal_file; bool leaf_only = false; bool top_level_only = false; + bool ultrabubble_only = true; int max_nodes = 10; + bool named_coordinates = false; bool filter_trivial_snarls = true; bool sort_snarls = false; bool fill_path_names = false; - + string vcf_filename; + string ref_fasta_filename; + string ins_fasta_filename; + bool path_traversals = false; + int c; optind = 2; // force optind past command positional argument while (true) { static struct option long_options[] = { + {"algorithm", required_argument, 0, 'A'}, {"traversals", required_argument, 0, 'r'}, - {"pathnames", no_argument, 0, 'p'}, + {"pathnames", no_argument, 0, 'p'}, {"leaf-only", no_argument, 0, 'l'}, {"top-level", no_argument, 0, 'o'}, + {"any-snarl-type", no_argument, 0, 'a'}, {"max-nodes", required_argument, 0, 'm'}, - {"include-trivial", no_argument, 0, 't'}, + {"named-coordinates", no_argument, 0, 'n'}, + {"include-trivial", no_argument, 0, 'T'}, {"sort-snarls", no_argument, 0, 's'}, + {"vcf", required_argument, 0, 'v'}, + {"fasta", required_argument, 0, 'f'}, + {"ins-fasta", required_argument, 0, 'i'}, + {"path-traversals", no_argument, 0, 'e'}, + {"threads", required_argument, 0, 't'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "sr:ltopm:h?", + c = getopt_long (argc, argv, "A:sr:laTopm:nv:f:i:eh?t:", long_options, &option_index); /* Detect the end of the options. 
*/ @@ -76,7 +106,11 @@ int main_snarl(int argc, char** argv) { switch (c) { - + + case 'A': + algorithm = optarg; + break; + case 'r': traversal_file = optarg; break; @@ -89,11 +123,19 @@ int main_snarl(int argc, char** argv) { top_level_only = true; break; + case 'a': + ultrabubble_only = false; + break; + case 'm': max_nodes = parse(optarg); break; - case 't': + case 'n': + named_coordinates = true; + break; + + case 'T': filter_trivial_snarls = false; break; @@ -103,7 +145,29 @@ int main_snarl(int argc, char** argv) { case 'p': fill_path_names = true; break; - + case 'v': + vcf_filename = optarg; + break; + case 'f': + ref_fasta_filename = optarg; + break; + case 'i': + ins_fasta_filename = optarg; + break; + case 'e': + path_traversals = true; + break; + + case 't': + { + int num_threads = parse(optarg); + if (num_threads <= 0) { + cerr << "error:[vg snarls] Thread count (-t) set to " << num_threads << ", must set to a positive integer." << endl; + exit(1); + } + omp_set_num_threads(num_threads); + break; + } case 'h': case '?': /* getopt_long already printed an error message. */ @@ -122,31 +186,96 @@ int main_snarl(int argc, char** argv) { if (!traversal_file.empty()) { trav_stream.open(traversal_file); if (!trav_stream) { - cerr << "error:[vg snarl]: Could not open \"" << traversal_file + cerr << "error: [vg snarls] Could not open \"" << traversal_file << "\" for writing" << endl; return 1; } } - // Read the graph - VG* graph; - get_input_file(optind, argc, argv, [&](istream& in) { - graph = new VG(in); - }); + // Read the graph into a PathHandleGraph. + string graph_filename = get_input_file_name(optind, argc, argv); + unique_ptr graph = vg::io::VPKG::load_one(graph_filename); - if (graph == nullptr) { - cerr << "error:[vg snarl]: Could not load graph" << endl; - exit(1); + // Determine what translation we should apply, if any, to our output coordinates. + const NamedNodeBackTranslation* translation = nullptr; + if (named_coordinates) { + translation = vg::algorithms::find_translation(graph.get()); + if (!translation) { + cerr << "error:[vg snarls] Named coordinate output (-n) was requested, but the graph does not come with a named coordinate space." << endl; + return 1; + } + } + + // TODO: Everything but Cactus and the path-related options can work with a + // non-path HandleGraph, but we don't really have any of those implemented + // anymore, so we don't bother supporting them. 
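The option handling above wires in a choice of snarl-finding algorithm (`-A cactus` or the new default `-A integrated`) plus several traversal-finder modes. As a sketch of how the flags combine, two hypothetical command lines follow; the graph, VCF, and FASTA file names are placeholders. The mutual restrictions (both `-e` and `-v` require `-r`, and they exclude each other) are enforced just below.

```
# Compute snarls with the integrated snarl finder and also write traversals,
# restricted to those corresponding to embedded paths, to a separate file.
vg snarls -A integrated -r traversals.pb -e graph.vg > snarls.pb

# Use the Cactus snarl finder with the VCF-based traversal finder, supplying
# reference and insertion FASTAs so structural variants can be resolved.
vg snarls -A cactus -r traversals.pb -v variants.vcf.gz \
    -f reference.fa -i insertions.fa graph.vg > snarls.pb
```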
+ + // Pick a SnarlFinder + unique_ptr<SnarlFinder> snarl_finder; + + if (algorithm == "cactus") { + snarl_finder.reset(new CactusSnarlFinder(*graph)); + } else if (algorithm == "integrated") { + snarl_finder.reset(new IntegratedSnarlFinder(*graph)); + } else { + cerr << "error:[vg snarls]: Algorithm must be 'cactus' or 'integrated', not '" << algorithm << "'" << endl; + return 1; + } + if (!vcf_filename.empty() && path_traversals) { + cerr << "error:[vg snarls]: -v cannot be used with -e" << endl; + return 1; + } + if (path_traversals && traversal_file.empty()) { + cerr << "error:[vg snarls]: -e requires -r" << endl; + return 1; + } + if (!vcf_filename.empty() && traversal_file.empty()) { + cerr << "error:[vg snarls]: -v requires -r" << endl; + return 1; } - // The only implemented snarl finder: - SnarlFinder* snarl_finder = new CactusSnarlFinder(*graph); + unique_ptr<TraversalFinder> trav_finder; + vcflib::VariantCallFile variant_file; + unique_ptr<FastaReference> ref_fasta; + unique_ptr<FastaReference> ins_fasta; + + if (!vcf_filename.empty()) { + variant_file.parseSamples = false; + variant_file.open(vcf_filename); + if (!variant_file.is_open()) { + cerr << "error: [vg snarls] could not open " << vcf_filename << endl; + return 1; + } + + // load up the fasta + if (!ref_fasta_filename.empty()) { + ref_fasta = unique_ptr<FastaReference>(new FastaReference); + ref_fasta->open(ref_fasta_filename); + } + if (!ins_fasta_filename.empty()) { + ins_fasta = unique_ptr<FastaReference>(new FastaReference); + ins_fasta->open(ins_fasta_filename); + } + } // Load up all the snarls - SnarlManager snarl_manager = snarl_finder->find_snarls(); - vector<const Snarl*> snarl_roots = snarl_manager.top_level_snarls(); + SnarlManager snarl_manager = snarl_finder->find_snarls_parallel(); + + // Get all snarls in top level chains, in chain order + vector<const Snarl*> snarl_roots; + + snarl_manager.for_each_top_level_chain([&](const Chain* chain) { + // For each top level chain of 1 or more snarls + for(auto here = chain_begin(*chain); here != chain_end(*chain); ++here) { + // For each snarl in the chain in order + // Remember to visit it (ignoring chain-relative orientation) + snarl_roots.push_back(here->first); + } + }); + if (fill_path_names){ - TraversalFinder* trav_finder = new PathBasedTraversalFinder(*graph, snarl_manager); + // This finder needs a vg::VG + trav_finder.reset(new PathBasedTraversalFinder(*graph, snarl_manager)); for (const Snarl* snarl : snarl_roots ){ if (filter_trivial_snarls) { auto contents = snarl_manager.shallow_contents(snarl, *graph, false); @@ -156,18 +285,39 @@ int main_snarl(int argc, char** argv) { } } vector<SnarlTraversal> travs = trav_finder->find_traversals(*snarl); - stream::write_buffered(cout, travs, 0); + if (translation) { + for (auto& trav : travs) { + // Bring all the output traversals into named segment space. + algorithms::back_translate_in_place(translation, trav); + } + } + vg::io::write_buffered(cout, travs, 0); } - delete trav_finder; - delete snarl_finder; - delete graph; - exit(0); } - - TraversalFinder* trav_finder = new ExhaustiveTraversalFinder(*graph, snarl_manager); + if (path_traversals) { + // Limit traversals to embedded paths + trav_finder.reset(new PathTraversalFinder(*graph, snarl_manager)); + } else if (vcf_filename.empty()) { + // This finder works with any backing graph + trav_finder.reset(new ExhaustiveTraversalFinder(*graph, snarl_manager)); + } else { + // This should effectively be the same as above, and is included in this tool + // for testing purposes. 
The VCFTraversalFinder differs from Exhaustive in that + // it's easier to limit traversals using read support, and it takes care of + // mapping back to the VCF via the alt paths. + vector ref_paths; + graph->for_each_path_handle([&](path_handle_t path_handle) { + const string& name = graph->get_path_name(path_handle); + if (!Paths::is_alt(name)) { + ref_paths.push_back(name); + } + }); + trav_finder.reset(new VCFTraversalFinder(*graph, snarl_manager, variant_file, ref_paths, + ref_fasta.get(), ins_fasta.get())); + } // Sort the top level Snarls if (sort_snarls) { @@ -192,9 +342,11 @@ int main_snarl(int argc, char** argv) { return snarl_1->start().node_id() < snarl_2->end().node_id(); }); } - + // Now we have to output stuff. + // TODO: remove extra features and just use SnarlManager::serialize() + // Protobuf output buffers vector snarl_buffer; vector traversal_buffer; @@ -219,13 +371,32 @@ int main_snarl(int argc, char** argv) { // Write our snarl tree snarl_buffer.push_back(*snarl); - stream::write_buffered(cout, snarl_buffer, buffer_size); - + if (translation) { + // Bring all the output snarls into named segment space. + algorithms::back_translate_in_place(translation, snarl_buffer.back()); + } + vg::io::write_buffered(cout, snarl_buffer, buffer_size); + + auto check_max_nodes = [&graph, &max_nodes](const unordered_set& nodeset) { + int node_count = 0; + for (auto node_id : nodeset) { + handle_t node = graph->get_handle(node_id); + if (graph->get_degree(node, false) > 1 || graph->get_degree(node, true) > 1) { + ++node_count; + if (node_count > max_nodes) { + return false; + } + } + } + return true; + }; + // Optionally write our traversals - if (!traversal_file.empty() && snarl->type() == ULTRABUBBLE && + if (!traversal_file.empty() && + (!ultrabubble_only || snarl->type() == ULTRABUBBLE) && (!leaf_only || snarl_manager.is_leaf(snarl)) && (!top_level_only || snarl_manager.is_root(snarl)) && - (snarl_manager.deep_contents(snarl, *graph, true).first.size() <= max_nodes)) { + (path_traversals || check_max_nodes(snarl_manager.deep_contents(snarl, *graph, true).first))) { #ifdef debug cerr << "Look for traversals of " << pb2json(*snarl) << endl; @@ -235,12 +406,19 @@ int main_snarl(int argc, char** argv) { cerr << "Found " << travs.size() << endl; #endif + if (translation) { + for (auto& trav : travs) { + // Bring all the output traversals into named segment space. + algorithms::back_translate_in_place(translation, trav); + } + } + traversal_buffer.insert(traversal_buffer.end(), travs.begin(), travs.end()); - stream::write_buffered(trav_stream, traversal_buffer, buffer_size); + vg::io::write_buffered(trav_stream, traversal_buffer, buffer_size); } - // Sort the child snarls by node ID? 
if (sort_snarls) { + // Sort the child snarls by node ID vector children = snarl_manager.children_of(snarl); std::sort(children.begin(), children.end(), [](const Snarl* snarl_1, const Snarl* snarl_2) { return snarl_1->start().node_id() < snarl_2->end().node_id(); @@ -251,27 +429,27 @@ int main_snarl(int argc, char** argv) { } } else { - for (const Snarl* child_snarl : snarl_manager.children_of(snarl)) { - stack.push_back(child_snarl); + // Visit the child chains in contiguous blocks + for (auto chain : snarl_manager.chains_of(snarl)) { + // For every child chain + for (auto here = chain_rbegin(chain); here != chain_rend(chain); ++here) { + // Stack up its child snarls in reverse order, so we visit them in forward order + stack.push_back(here->first); + } } } } - } // flush - stream::write_buffered(cout, snarl_buffer, 0); + vg::io::write_buffered(cout, snarl_buffer, 0); if (!traversal_file.empty()) { - stream::write_buffered(trav_stream, traversal_buffer, 0); + vg::io::write_buffered(trav_stream, traversal_buffer, 0); } - delete snarl_finder; - delete trav_finder; - delete graph; - return 0; } // Register subcommand -static Subcommand vg_snarl("snarls", "compute snarls and their traversals", main_snarl); +static Subcommand vg_snarl("snarls", "compute snarls and their traversals", TOOLKIT, main_snarl); diff --git a/src/subcommand/sort_main.cpp b/src/subcommand/sort_main.cpp index 77c5ec0281a..c86b3476fa8 100644 --- a/src/subcommand/sort_main.cpp +++ b/src/subcommand/sort_main.cpp @@ -13,47 +13,54 @@ #include "subcommand.hpp" #include "../vg.hpp" -#include "../gfa.hpp" +#include "../stream_index.hpp" #include "../flow_sort.hpp" - +#include "../algorithms/gfa_to_handle.hpp" +#include "../algorithms/id_sort.hpp" +#include using namespace std; using namespace vg; using namespace vg::subcommand; void help_sort(char** argv){ - cerr << "usage: " << argv[0] << " sort [options] -i -r > sorted.vg " << endl + cerr << "usage: " << argv[0] << " sort [options] > sorted.vg " << endl << "options: " << endl - << " -g, --gfa input in GFA format" << endl - << " -i, --in input file" << endl - << " -r, --ref reference name" << endl - << " -w, --without-grooming no grooming mode" << endl - << " -f, --fast sort using Eades algorithm, otherwise max-flow sorting is used" << endl + << " -a, --algorithm NAME sort by the given algorithm (eades, max-flow, id, or topo; default id)" << endl + << " -g, --gfa input in GFA format" << endl + << " -r, --ref reference name, for eades and max-flow algorithms; makes -a default to max-flow" << endl + << " -w, --without-grooming no grooming mode for eades" << endl + << " -I, --index-to FILE produce an index of an id-sorted vg file to the given filename" << endl << endl; } int main_sort(int argc, char *argv[]) { - //default input format is vg + // What should we sort the graph by? 
+ string algorithm; + + // Default input format is vg, but we can also read GFA bool gfa_input = false; - string file_name = ""; - string reference_name = ""; + + string reference_name; bool without_grooming = false; - bool use_fast_algorithm = false; + string sorted_index_filename; + int c; + optind = 2; // force optind past command positional argument while (true) { static struct option long_options[] = { + {"algorithm", required_argument, 0, 'a'}, {"gfa", no_argument, 0, 'g'}, - {"in", required_argument, 0, 'i'}, {"ref", required_argument, 0, 'r'}, {"without-grooming", no_argument, 0, 'w'}, - {"fast", no_argument, 0, 'f'}, + {"index-to", required_argument, 0, 'I'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "i:r:gwf", + c = getopt_long (argc, argv, "a:gr:wI:", long_options, &option_index); /* Detect the end of the options. */ @@ -62,20 +69,23 @@ int main_sort(int argc, char *argv[]) { switch (c) { + case 'a': + algorithm = optarg; + break; case 'g': gfa_input = true; break; case 'r': reference_name = optarg; - break; - case 'i': - file_name = optarg; + if (algorithm.empty()) { + algorithm = "max-flow"; + } break; case 'w': without_grooming = true; break; - case 'f': - use_fast_algorithm = true; + case 'I': + sorted_index_filename = optarg; break; case 'h': case '?': @@ -84,41 +94,141 @@ int main_sort(int argc, char *argv[]) { exit(1); break; default: - abort (); + abort(); + } + } + + if (algorithm.empty()) { + // Set the default algorithm + algorithm = "id"; + } + + // Validate the algorithm selection and option combination + if (algorithm == "id" || algorithm == "topo") { + if (!reference_name.empty()) { + cerr << "error[vg sort]: Reference name not used with " << algorithm << " sort algorithm" << endl; + exit(1); + } + if (without_grooming) { + cerr << "error[vg sort]: Not sensible to turn off grooming with " << algorithm << " sort algorithm" << endl; + exit(1); } + } else if (algorithm == "max-flow" || algorithm == "eades") { + if (reference_name.empty()) { + cerr << "error[vg sort]: Reference name required with " << algorithm << " sort algorithm" << endl; + exit(1); + } + } else { + cerr << "error[vg sort]: Unrecognized sort algorithm " << algorithm << endl; + exit(1); } - - if (reference_name.empty() || file_name.empty()) { - help_sort(argv); + if (!sorted_index_filename.empty() && algorithm != "id") { + cerr << "error[vg sort]: Sorted VG index can only be produced when sorting by ID" << endl; exit(1); } - ifstream in; + // With the input graph file + string filename = get_input_file_name(optind, argc, argv); + + // We will load it into this graph std::unique_ptr<VG> graph; - { - in.open(file_name.c_str()); - if (gfa_input) { - graph.reset(new VG()); - if (!gfa_to_graph(in, graph.get())) { - // GFA loading has failed because the file is invalid - exit(1); - } + + if (gfa_input) { + // Read as GFA + graph.reset(new VG()); + try { + algorithms::gfa_to_path_handle_graph(filename, graph.get()); + } catch(algorithms::GFAFormatError& e) { + // GFA loading has failed because the file is invalid + cerr << e.what() << endl; + exit(1); + } catch(ios_base::failure& e) { + // GFA loading has failed because the file couldn't be read + cerr << e.what() << endl; + exit(1); + } + } else { + // Read as Handle Graph and copy into VG + unique_ptr<PathHandleGraph> handle_graph = + vg::io::VPKG::load_one<PathHandleGraph>(filename); + VG* vg_graph = dynamic_cast<VG*>(handle_graph.get()); + if (vg_graph != nullptr) { + graph.reset(vg_graph); + handle_graph.release(); } else { - graph.reset(new VG(in)); + // Copy instead. 
+ vg_graph = new vg::VG(); + handlealgs::copy_path_handle_graph(handle_graph.get(), vg_graph); + // Give the unique_ptr ownership and delete the graph we loaded. + graph.reset(vg_graph); + // Make sure the paths are all synced up + vg_graph->paths.to_graph(vg_graph->graph); } } - FlowSort flow_sort(*graph.get()); - if (use_fast_algorithm) { - flow_sort.fast_linear_sort(reference_name, !without_grooming); + + // Now sort the graph + + if (algorithm == "max-flow" || algorithm == "eades") { + // Do max flow sort or Eades algorithm sort + FlowSort flow_sort(*graph.get()); + if (algorithm == "eades") { + // Use Eades algorithm + flow_sort.fast_linear_sort(reference_name, !without_grooming); + } else { + // Use max flow + flow_sort.max_flow_sort(reference_name); + } + } else if (algorithm == "id") { + // Sort by ID + graph.get()->id_sort(); + } else if (algorithm == "topo") { + // Sort topologically + graph.get()->sort(); } else { - flow_sort.max_flow_sort(reference_name); + throw runtime_error("Unimplemented sort algorithm: " + algorithm); + } + + // We have an optional index, which will outlive our emitter + unique_ptr> index; + if (!sorted_index_filename.empty()) { + // Make an index we can use later for graph random access + index = unique_ptr>(new StreamIndex()); } - graph->serialize_to_ostream(std::cout); - in.close(); + // Maintain our own group buffer at a higher scope than the emitter. + vector group_buffer; + + { + // Make our own emitter for serialization + vg::io::ProtobufEmitter emitter(std::cout); + + if (index) { + emitter.on_message([&](const Graph& g) { + // Copy every graph that is emitted. + // TODO: Just compute indexing stats instead. + group_buffer.push_back(g); + }); + + emitter.on_group([&](int64_t start_vo, int64_t past_end_vo) { + // On every group, tell the index to record the group stats, and clear the buffer. 
+ index->add_group(group_buffer, start_vo, past_end_vo); + group_buffer.clear(); + }); + } + + // Save the sorted graph to the emitter + graph->serialize_to_emitter(emitter); + } + + if (index) { + // Now save out the index + ofstream index_out(sorted_index_filename); + index->save(index_out); + } + return 0; } // Register subcommand -static Subcommand vg_sort("sort", "sort variant graph using max flow algorithm or Eades fast heuristic algorithm", main_sort); +static Subcommand vg_sort("sort", "sort variant graph by various algorithms", DEPRECATED, main_sort); diff --git a/src/subcommand/srpe_main.cpp b/src/subcommand/srpe_main.cpp deleted file mode 100644 index 44881851155..00000000000 --- a/src/subcommand/srpe_main.cpp +++ /dev/null @@ -1,198 +0,0 @@ -#include -#include -#include -#include -#include -#include "subcommand.hpp" -#include "srpe.hpp" -#include "stream.hpp" -#include "index.hpp" -#include "position.hpp" -#include "vg.pb.h" -#include "path.hpp" -#include "genotypekit.hpp" -#include "genotyper.hpp" -#include "path_index.hpp" -#include "vg.hpp" -#include -#include "srpe.hpp" -#include "filter.hpp" -#include "utility.hpp" -#include "Variant.h" -#include "translator.hpp" -#include "Fasta.h" -#include "IntervalTree.h" - -using namespace std; -using namespace vg; -using namespace vg::subcommand; - - -void help_srpe(char** argv){ - cerr << "Usage: " << argv[0] << " srpe [options] " << endl - << "Options: " << endl - << " -p / --ref-path" << endl - << " -x / --xg" << endl - << " -g / --gcsa" << endl - << endl; -} - - - -int main_srpe(int argc, char** argv){ - string alignment_file = ""; - string gam_index_name = ""; - string graph_name = ""; - string xg_name = ""; - string gcsa_name = ""; - string lcp_name = ""; - - string spec_vcf = ""; - string ref_fasta = ""; - string ins_fasta = ""; - - string augmented_graph_name = ""; - bool augment_paths = true; - - string ref_path = ""; - - int max_iter = 2; - int max_frag_len = 10000; - int min_soft_clip = 20; - bool remap = false; - - bool do_all = false; - - vector search_types; - search_types.push_back("DEL"); - - int threads = 1; - - if (argc <= 2) { - help_srpe(argv); - return 1; - } - - int c; - optind = 2; // force optind past command positional argument - while (true) { - static struct option long_options[] = - { - {"max-iter", required_argument, 0, 'm'}, - {"xg-index", required_argument, 0, 'x'}, - {"augmented", required_argument, 0, 'a'}, - {"help", no_argument, 0, 'h'}, - {"gcsa-index", required_argument, 0, 'g'}, - {"specific", required_argument, 0, 'S'}, - {"recall", no_argument, 0, 'R'}, - {"insertions", required_argument, 0, 'I'}, - {"reference", required_argument, 0, 'r'}, - {"threads", required_argument, 0, 't'}, - {"ref-path", required_argument, 0, 'p'}, - {"remap", no_argument, 0, 'z'}, - {0, 0, 0, 0} - - }; - int option_index = 0; - c = getopt_long (argc, argv, "hzx:g:m:S:RI:r:t:a:wp:", - long_options, &option_index); - - // Detect the end of the options. 
- if (c == -1) - break; - - switch (c) - { - case 'a': - augmented_graph_name = optarg; - break; - case 'z': - remap = true; - break; - case 'm': - max_iter = parse(optarg); - break; - - case 't': - threads = parse(optarg); - break; - - case 'R': - do_all = true; - break; - case 'x': - xg_name = optarg; - break; - case 'g': - gcsa_name = optarg; - break; - case 'S': - spec_vcf = optarg; - break; - case 'r': - ref_fasta = optarg; - break; - case 'I': - ins_fasta = optarg; - break; - case 'p': - ref_path = optarg; - break; - case 'h': - case '?': - default: - help_srpe(argv); - abort(); - } - - } - - omp_set_num_threads(threads); - - - SRPE srpe; - - - - - alignment_file = argv[optind]; - //gam_index_name = argv[++optind]; - graph_name = argv[++optind]; - - xg::XG* xg_ind = new xg::XG(); - Index gamind; - - vg::VG* graph; - - if (!xg_name.empty()){ - ifstream in(xg_name); - xg_ind->load(in); - srpe.ff.set_my_xg_idx(xg_ind); - } - // Set GCSA indexes - if (!gcsa_name.empty()){ - ifstream gcsa_stream(gcsa_name); - srpe.ff.gcsa_ind = new gcsa::GCSA(); - srpe.ff.gcsa_ind->load(gcsa_stream); - string lcp_name = gcsa_name + ".lcp"; - ifstream lcp_stream(lcp_name); - srpe.ff.lcp_ind = new gcsa::LCPArray(); - srpe.ff.lcp_ind->load(lcp_stream); - } - if (!xg_name.empty()){ - ifstream xgstream(xg_name); - xg_ind->load(xgstream); - srpe.ff.set_my_xg_idx(xg_ind); - } - srpe.ff.init_mapper(); - // else{ - - // } - - - - return 0; -} - -static Subcommand vg_srpe ("srpe", "graph-external SV detection", main_srpe); - diff --git a/src/subcommand/stats_main.cpp b/src/subcommand/stats_main.cpp index 4c26b8e0bef..cfe6754aa3b 100644 --- a/src/subcommand/stats_main.cpp +++ b/src/subcommand/stats_main.cpp @@ -11,43 +11,64 @@ #include #include +#include +#include + #include "subcommand.hpp" #include "../algorithms/distance_to_head.hpp" #include "../algorithms/distance_to_tail.hpp" +#include "../handle.hpp" +#include "../integrated_snarl_finder.hpp" +#include "../annotation.hpp" +#include "../snarl_distance_index.hpp" -#include "../vg.hpp" -#include "../distributions.hpp" +#include "../path.hpp" +#include "../statistics.hpp" #include "../genotypekit.hpp" +#include "xg.hpp" +#include "bdsg/packed_graph.hpp" +#include "bdsg/hash_graph.hpp" +#include +#include "../io/converted_hash_graph.hpp" +#include "../io/save_handle_graph.hpp" +#include "../gbzgraph.hpp" + using namespace std; using namespace vg; using namespace vg::subcommand; using namespace vg::algorithms; void help_stats(char** argv) { - cerr << "usage: " << argv[0] << " stats [options] " << endl + cerr << "usage: " << argv[0] << " stats [options] []" << endl << "options:" << endl - << " -z, --size size of graph" << endl - << " -N, --node-count number of nodes in graph" << endl - << " -E, --edge-count number of edges in graph" << endl - << " -l, --length length of sequences in graph" << endl - << " -s, --subgraphs describe subgraphs of graph" << endl - << " -H, --heads list the head nodes of the graph" << endl - << " -T, --tails list the tail nodes of the graph" << endl - << " -S, --siblings describe the siblings of each node" << endl - << " -c, --components print the strongly connected components of the graph" << endl - << " -A, --is-acyclic print if the graph is acyclic or not" << endl - << " -n, --node ID consider node with the given id" << endl - << " -d, --to-head show distance to head for each provided node" << endl - << " -t, --to-tail show distance to head for each provided node" << endl - << " -a, --alignments FILE compute stats for reads aligned to 
the graph" << endl - << " -r, --node-id-range X:Y where X and Y are the smallest and largest " + << " -z, --size size of graph" << endl + << " -N, --node-count number of nodes in graph" << endl + << " -E, --edge-count number of edges in graph" << endl + << " -l, --length length of sequences in graph" << endl + << " -L, --self-loops number of self-loops" << endl + << " -s, --subgraphs describe subgraphs of graph" << endl + << " -H, --heads list the head nodes of the graph" << endl + << " -T, --tails list the tail nodes of the graph" << endl + << " -e, --nondeterm list the nondeterministic edge sets" << endl + << " -c, --components print the strongly connected components of the graph" << endl + << " -A, --is-acyclic print if the graph is acyclic or not" << endl + << " -n, --node ID consider node with the given id" << endl + << " -d, --to-head show distance to head for each provided node" << endl + << " -t, --to-tail show distance to head for each provided node" << endl + << " -a, --alignments FILE compute stats for reads aligned to the graph" << endl + << " -r, --node-id-range X:Y where X and Y are the smallest and largest " "node id in the graph, respectively" << endl << " -o, --overlap PATH for each overlapping path mapping in the graph write a table:" << endl << " PATH, other_path, rank1, rank2" << endl << " multiple allowed; limit comparison to those provided" << endl << " -O, --overlap-all print overlap table for the cartesian product of paths" << endl << " -R, --snarls print statistics for each snarl" << endl + << " -F, --format graph format from {VG-Protobuf, PackedGraph, HashGraph, XG}. " << + "Can't detect Protobuf if graph read from stdin" << endl + << " -D, --degree-dist print degree distribution of the graph." << endl + << " -b, --dist-snarls FILE print the sizes and depths of the snarls in a given distance index." 
<< endl + << " -p, --threads N number of threads to use [all available]" << endl << " -v, --verbose output longer reports" << endl; } @@ -60,9 +81,11 @@ int main_stats(int argc, char** argv) { bool stats_size = false; bool stats_length = false; + bool stats_self_loops = false; bool stats_subgraphs = false; bool stats_heads = false; bool stats_tails = false; + bool stats_nondeterm = false; bool show_sibs = false; bool show_components = false; bool head_distance = false; @@ -79,6 +102,9 @@ int main_stats(int argc, char** argv) { vector paths_to_overlap; bool overlap_all_paths = false; bool snarl_stats = false; + bool format = false; + bool degree_dist = false; + string distance_index_filename; int c; optind = 2; // force optind past command positional argument @@ -89,11 +115,12 @@ int main_stats(int argc, char** argv) { {"node-count", no_argument, 0, 'N'}, {"edge-count", no_argument, 0, 'E'}, {"length", no_argument, 0, 'l'}, + {"self-loops", no_argument, 0, 'L'}, {"subgraphs", no_argument, 0, 's'}, {"heads", no_argument, 0, 'H'}, {"tails", no_argument, 0, 'T'}, + {"nondeterm", no_argument, 0, 'e'}, {"help", no_argument, 0, 'h'}, - {"siblings", no_argument, 0, 'S'}, {"components", no_argument, 0, 'c'}, {"to-head", no_argument, 0, 'd'}, {"to-tail", no_argument, 0, 't'}, @@ -105,11 +132,15 @@ int main_stats(int argc, char** argv) { {"overlap", no_argument, 0, 'o'}, {"overlap-all", no_argument, 0, 'O'}, {"snarls", no_argument, 0, 'R'}, + {"format", no_argument, 0, 'F'}, + {"degree-dist", no_argument, 0, 'D'}, + {"dist-snarls", required_argument, 0, 'b'}, + {"threads", required_argument, 0, 'p'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hzlsHTScdtn:NEa:vAro:OR", + c = getopt_long (argc, argv, "hzlLsHTecdtn:NEa:vAro:ORFDb:p:", long_options, &option_index); // Detect the end of the options. @@ -134,6 +165,10 @@ int main_stats(int argc, char** argv) { stats_length = true; break; + case 'L': + stats_self_loops = true; + break; + case 's': stats_subgraphs = true; break; @@ -146,6 +181,10 @@ int main_stats(int argc, char** argv) { stats_tails = true; break; + case 'e': + stats_nondeterm = true; + break; + case 'S': show_sibs = true; break; @@ -194,6 +233,27 @@ int main_stats(int argc, char** argv) { verbose = true; break; + case 'F': + format = true; + break; + + case 'D': + degree_dist = true; + break; + case 'b': + distance_index_filename = optarg; + break; + case 'p': + { + int num_threads = parse(optarg); + if (num_threads <= 0) { + cerr << "error:[vg stats] Thread count (-t) set to " << num_threads << ", must set to a positive integer." << endl; + exit(1); + } + omp_set_num_threads(num_threads); + break; + } + case 'h': case '?': help_stats(argv); @@ -205,81 +265,157 @@ int main_stats(int argc, char** argv) { } } - VG graph; - get_input_file(optind, argc, argv, [&](istream& in) { - graph.from_istream(in); - }); + bdsg::ReferencePathOverlayHelper overlay_helper; + unique_ptr path_handle_graph; + PathHandleGraph* graph = nullptr; + string graph_file_name; + if (have_input_file(optind, argc, argv)) { + // We have an (optional, because we can just process alignments) graph input file. 
+ // TODO: we can load any PathHandleGraph, but some operations still require a VG + // In those cases, we convert back to vg::VG + graph_file_name = get_input_file_name(optind, argc, argv); + path_handle_graph = vg::io::VPKG::load_one(graph_file_name); + if (dynamic_cast(path_handle_graph.get()) != nullptr && !alignments_filename.empty()) { + // GBZ paths on handle lookups too slow without the overlay + graph = overlay_helper.apply(path_handle_graph.get()); + } else { + graph = path_handle_graph.get(); + } + } + + // We have function to make sure the graph was passed and complain if not + auto require_graph = [&graph]() { + if (graph == nullptr) { + cerr << "error[vg stats]: The selected operation requires passing a graph file to work on" << endl; + exit(1); + } + }; + if (stats_size) { - cout << "nodes" << "\t" << graph.node_count() << endl - << "edges" << "\t" << graph.edge_count() << endl; + require_graph(); + cout << "nodes" << "\t" << graph->get_node_count() << endl + << "edges" << "\t" << graph->get_edge_count() << endl; } if (node_count) { - cout << graph.node_count() << endl; + require_graph(); + cout << graph->get_node_count() << endl; } if (edge_count) { - cout << graph.edge_count() << endl; + require_graph(); + cout << graph->get_edge_count() << endl; } if (stats_length) { - cout << "length" << "\t" << graph.total_length_of_nodes() << endl; + require_graph(); + cout << "length" << "\t" << graph->get_total_length() << endl; + } + + if (stats_self_loops) { + require_graph(); + size_t total = 0; + graph->for_each_edge([&](const edge_t& edge) { + if (graph->get_id(edge.first) == graph->get_id(edge.second)) { + total++; + } + }); + cout << "self-loops" << "\t" << total << endl; } if (stats_heads) { - vector heads; - graph.head_nodes(heads); + require_graph(); + vector heads = handlealgs::head_nodes(graph); cout << "heads" << "\t"; - for (vector::iterator h = heads.begin(); h != heads.end(); ++h) { - cout << (*h)->id() << " "; + for (auto& h : heads) { + cout << graph->get_id(h) << " "; } cout << endl; } if (stats_tails) { - vector tails; - graph.tail_nodes(tails); + require_graph(); + vector tails = handlealgs::tail_nodes(graph); cout << "tails" << "\t"; - for (vector::iterator t = tails.begin(); t != tails.end(); ++t) { - cout << (*t)->id() << " "; + for (auto& t : tails) { + cout << graph->get_id(t) << " "; } cout << endl; } + if (stats_nondeterm) { + require_graph(); + graph->for_each_handle([&](const handle_t& handle) { + nid_t id = graph->get_id(handle); + for (bool is_reverse : { false, true }) { + std::map> edges; + graph->follow_edges(graph->get_handle(id, is_reverse), false, [&](const handle_t& to) { + edges[graph->get_base(to, 0)].push_back(to); + }); + for (auto iter = edges.begin(); iter != edges.end(); ++iter) { + if (iter->second.size() > 1) { + std::cout << "nondeterministic\t" << id << (is_reverse ? "-" : "+"); + for (const handle_t& to : iter->second) { + std::cout << "\t" << graph->get_id(to) << (graph->get_is_reverse(to) ? 
"-" : "+"); + } + std::cout << std::endl; + } + } + } + }); + } + if (stats_subgraphs) { - list subgraphs; - graph.disjoint_subgraphs(subgraphs); - // these are topologically-sorted - for (list::iterator s = subgraphs.begin(); s != subgraphs.end(); ++s) { - VG& subgraph = *s; - vector heads; - subgraph.head_nodes(heads); - int64_t length = subgraph.total_length_of_nodes(); - for (vector::iterator h = heads.begin(); h != heads.end(); ++h) { - cout << (h==heads.begin()?"":",") << (*h)->id(); + require_graph(); + + // TODO: Pretty sure "subgraphs" means "weakly connected components", + // but this isn't really explained. + + vector, vector>> subgraphs_with_tips = + handlealgs::weakly_connected_components_with_tips(graph); + + for (auto& subgraph_and_tips : subgraphs_with_tips) { + // For each subgraph set and its inward tip handles + auto& subgraph = subgraph_and_tips.first; + auto& tips = subgraph_and_tips.second; + + // Decide if we need a comma before us or not + bool first = true; + for (handle_t& tip : tips) { + // Print all the IDs of heads + if (graph->get_is_reverse(tip)) { + // Heads are locally forward, so this isn't one. + // TODO: subgraphs with only tails get no identification. + continue; + } + if (!first) { + cout << ","; + } else { + first = false; + } + cout << graph->get_id(tip); } - cout << "\t" << length << endl; + cout << "\t"; + + // Now we need the total length. TODO: can we do a batch lookup? + size_t total_length = 0; + for (auto& id : subgraph) { + total_length += graph->get_length(graph->get_handle(id)); + } + + cout << total_length << endl; } } if (stats_range) { - cout << "node-id-range\t" << graph.min_node_id() << ":" << graph.max_node_id() << endl; - } - - if (show_sibs) { - graph.for_each_node([&graph](Node* n) { - for (auto trav : graph.full_siblings_to(NodeTraversal(n, false))) { - cout << n->id() << "\t" << "to-sib" << "\t" << trav.node->id() << endl; - } - for (auto trav : graph.full_siblings_from(NodeTraversal(n, false))) { - cout << n->id() << "\t" << "from-sib" << "\t" << trav.node->id() << endl; - } - }); + require_graph(); + cout << "node-id-range\t" << graph->min_node_id() << ":" << graph->max_node_id() << endl; } if (show_components) { - for (auto& c : graph.strongly_connected_components()) { + require_graph(); + for (auto& c : handlealgs::strongly_connected_components(graph)) { for (auto& id : c) { cout << id << ", "; } @@ -288,7 +424,8 @@ int main_stats(int argc, char** argv) { } if (is_acyclic) { - if (graph.is_acyclic()) { + require_graph(); + if (handlealgs::is_acyclic(graph)) { cout << "acyclic" << endl; } else { cout << "cyclic" << endl; @@ -296,22 +433,85 @@ int main_stats(int argc, char** argv) { } if (head_distance) { + require_graph(); for (auto id : ids) { - auto n = graph.get_handle(id, false); + auto n = graph->get_handle(id, false); cout << id << " to head:\t" - << distance_to_head(n, 1000, &graph) << endl; + << distance_to_head(n, 1000, graph) << endl; } } if (tail_distance) { + require_graph(); for (auto id : ids) { - auto n = graph.get_handle(id, false); + auto n = graph->get_handle(id, false); cout << id << " to tail:\t" - << distance_to_tail(n, 1000, &graph) << endl; + << distance_to_tail(n, 1000, graph) << endl; + } + } + + if (format) { + require_graph(); + string format_string; + if (dynamic_cast(graph) != nullptr) { + format_string = "XG"; + } else if (dynamic_cast(graph) != nullptr) { + // important this check comes before PackedGraph + format_string = "GFA"; + } else if (dynamic_cast(graph) != nullptr) { + format_string = 
"PackedGraph"; + } else if (dynamic_cast(graph) != nullptr) { + // Was Protobuf but we're using a HashGraph internally + format_string = "VG-Protobuf"; + } else if (dynamic_cast(graph) != nullptr) { + format_string = "HashGraph"; + } else if (dynamic_cast(graph) != nullptr) { + format_string = "GBZ"; + } else { + format_string = "Unknown"; + } + cout << "format: " << format_string << endl; + } + + if (degree_dist) { + require_graph(); + // compute degrees + map> degree_to_count; + graph->for_each_handle([°ree_to_count, &graph](handle_t handle) { + size_t left_degree = graph->get_degree(handle, true); + size_t right_degree = graph->get_degree(handle, false); + // update sides count + ++get<0>(degree_to_count[left_degree]); + ++get<0>(degree_to_count[right_degree]); + // update min count + ++get<1>(degree_to_count[std::min(left_degree, right_degree)]); + // update max count + ++get<2>(degree_to_count[std::max(left_degree, right_degree)]); + // update total count + ++get<3>(degree_to_count[left_degree + right_degree]); + }); + // print degrees + cout << "Degree\tSides\tNodes(min)\tNodes(max)\tNodes(total)" << endl; + for (const auto& dg : degree_to_count) { + cout << dg.first << "\t" << get<0>(dg.second) << "\t" <(dg.second) << "\t" << get<2>(dg.second) << "\t" << get<3>(dg.second) << endl; } } if (!paths_to_overlap.empty() || overlap_all_paths) { + require_graph(); + + VG* vg_graph = dynamic_cast(graph); + if (vg_graph == nullptr) { + // TODO: This path overlap code can be handle-ified, and should be. + vg_graph = new vg::VG(); + handlealgs::copy_path_handle_graph(graph, vg_graph); + // Give the unique_ptr ownership and delete the graph we loaded. + path_handle_graph.reset(vg_graph); + graph = path_handle_graph.get(); + // Make sure the paths are all synced up + vg_graph->paths.to_graph(vg_graph->graph); + } + auto cb = [&](const Path& p1, const Path& p2) { // sparse storage of the correspondence matrix // map from ranks in first to ranks in second @@ -363,17 +563,17 @@ int main_stats(int argc, char** argv) { cout << "comparison" << "\t" << "x" << "\t" << "y" << endl; vector path_names; if (overlap_all_paths) { - path_names = graph.paths.all_path_names(); + path_names = vg_graph->paths.all_path_names(); } else { path_names = paths_to_overlap; } for (auto& p1_name : path_names) { - Path p1 = graph.paths.path(p1_name); + Path p1 = vg_graph->paths.path(p1_name); for (auto& p2_name : path_names) { if (p1_name == p2_name) { continue; } - Path p2 = graph.paths.path(p2_name); + Path p2 = vg_graph->paths.path(p2_name); cb(p1, p2); } } @@ -385,16 +585,6 @@ int main_stats(int argc, char** argv) { // We need some allele parsing functions - // This one decided if a path is really an allele path - auto path_name_is_allele = [](const string path_name) -> bool { - string prefix = "_alt_"; - // It needs to start with "_alt_" and have another separating - // underscore between site name and allele number - return(prefix.size() < path_name.size() && - count(path_name.begin(), path_name.end(), '_') >= 3 && - equal(prefix.begin(), prefix.end(), path_name.begin())); - }; - // This one gets the site name from an allele path name auto path_name_to_site = [](const string& path_name) -> string { auto last_underscore = path_name.rfind('_'); @@ -408,54 +598,156 @@ int main_stats(int argc, char** argv) { assert(last_underscore != string::npos); return path_name.substr(last_underscore + 1); }; + + // In order to do stats across multiple threads, we define these add-able bundles of stats. 
+ struct ReadStats { + // These are the general stats we will compute. + size_t total_alignments = 0; + size_t total_aligned = 0; + size_t total_primary = 0; + size_t total_secondary = 0; + size_t total_perfect = 0; // Number of reads with no indels or substitutions relative to their paths + size_t total_gapless = 0; // Number of reads with no indels relative to their paths + + // These are for tracking which nodes are covered and which are not + map node_visit_counts; + + // And for counting indels + // Inserted bases also counts softclips + size_t total_insertions = 0; + size_t total_inserted_bases = 0; + size_t total_deletions = 0; + size_t total_deleted_bases = 0; + // And substitutions + size_t total_substitutions = 0; + size_t total_substituted_bases = 0; + // And softclips + size_t total_softclips = 0; + size_t total_softclipped_bases = 0; + // And pairing + size_t total_paired = 0; + size_t total_proper_paired = 0; + + // Alignment and mapping quality score distributions. + std::map alignment_scores; + std::map mapping_qualities; + + // In verbose mode we want to report details of insertions, deletions, + // and substitutions, and soft clips. + vector> insertions; + vector> deletions; + vector> substitutions; + vector> softclips; + + // This is going to be indexed by site + // ("_alt_f6d951572f9c664d5d388375aa8b018492224533") and then by allele + // ("0"). A read only counts if it visits a node that's on one allele + // and not any others in that site. + map> reads_on_allele; + + double total_time_seconds = 0.0; + + inline ReadStats& operator+=(const ReadStats& other) { + total_alignments += other.total_alignments; + total_aligned += other.total_aligned; + total_primary += other.total_primary; + total_secondary += other.total_secondary; + total_perfect += other.total_perfect; + total_gapless += other.total_gapless; + + for (auto& kv : other.node_visit_counts) { + node_visit_counts[kv.first] += kv.second; + } + + total_insertions += other.total_insertions; + total_inserted_bases += other.total_inserted_bases; + total_deletions += other.total_deletions; + total_deleted_bases += other.total_deleted_bases; + total_substitutions += other.total_substitutions; + total_substituted_bases += other.total_substituted_bases; + total_softclips += other.total_softclips; + total_softclipped_bases += other.total_softclipped_bases; + total_paired += other.total_paired; + total_proper_paired += other.total_proper_paired; + + for (auto iter = other.alignment_scores.begin(); iter != other.alignment_scores.end(); ++iter) { + this->alignment_scores[iter->first] += iter->second; + } + for (auto iter = other.mapping_qualities.begin(); iter != other.mapping_qualities.end(); ++iter) { + this->mapping_qualities[iter->first] += iter->second; + } + + std::copy(other.insertions.begin(), other.insertions.end(), std::back_inserter(insertions)); + std::copy(other.deletions.begin(), other.deletions.end(), std::back_inserter(deletions)); + std::copy(other.substitutions.begin(), other.substitutions.end(), std::back_inserter(substitutions)); + std::copy(other.softclips.begin(), other.softclips.end(), std::back_inserter(softclips)); + + for (auto& kv : other.reads_on_allele) { + auto& dest = reads_on_allele[kv.first]; + for (auto& kv2 : kv.second) { + dest[kv2.first] += kv2.second; + } + } + + total_time_seconds += other.total_time_seconds; + + return *this; + } + }; // Before we go over the reads, we need to make a map that tells us what // nodes are unique to what allele paths. 
Stores site and allele parts // separately. map> allele_path_for_node; - // This is what we really care about: for each pair of allele paths in - // the graph, we need to find out whether the coverage imbalance between - // them among primary alignments is statistically significant. For this, - // we need to track how many reads overlap the distinct parts of allele - // paths. - - // This is going to be indexed by site - // ("_alt_f6d951572f9c664d5d388375aa8b018492224533") and then by allele - // ("0"). A read only counts if it visits a node that's on one allele - // and not any others in that site. - - // We need to pre-populate it with 0s so we know which sites actually - // have 2 alleles and which only have 1 in the graph. - map> reads_on_allele; - - graph.for_each_node_parallel([&](Node* node) { - // For every node - - if(!graph.paths.has_node_mapping(node)) { - // No paths to go over. If we try and get them we'll be - // modifying the paths in parallel, which will explode. - return; - } - - // We want an allele path on it - string allele_path; - for(auto& name_and_mappings : graph.paths.get_node_mapping_by_path_name(node)) { - // For each path on it - if(path_name_is_allele(name_and_mappings.first)) { - // If it's an allele path - if(allele_path.empty()) { - // It's the first. Take it. - allele_path = name_and_mappings.first; - } else { - // It's a subsequent one. This node is not uniquely part - // of any allele path. - return; + // Create a combined ReadStats accumulator. We need to pre-populate its + // reads_on_allele with 0s when we look at the alleles so we know which + // sites actually have 2 alleles and which only have 1 in the graph. + ReadStats combined; + + if (graph != nullptr) { + // We have a graph to work on + + // For each pair of allele paths in the graph, we need to find out + // whether the coverage imbalance between them among primary alignments + // is statistically significant. For this, we need to track how many + // reads overlap the distinct parts of allele paths. + + graph->for_each_handle([&](handle_t node) { + // For every node in parallel + + // We want a unique allele path on it + string allele_path; + + graph->for_each_step_on_handle(node, [&](const step_handle_t& step) -> bool { + // Get the name of every patht hat goes here (some may repeat) + auto path_name = graph->get_path_name(graph->get_path_handle_of_step(step)); + + if(Paths::is_alt(path_name) && path_name != allele_path) { + // If it's a new/distinct allele path + if(allele_path.empty()) { + // It's the first. Take it. + allele_path = path_name; + // Check for more overlappin alt paths + return true; + } else { + // It's a subsequent one. This node is not uniquely part + // of any allele path. So we want to skip the node. + allele_path.clear(); + return false; + } } + + // If not an alt, keep going + return true; + }); + + if (allele_path.empty()) { + // We did not find a unique overlapping allele path. + // Skip the node. + return; } - } - - if(!allele_path.empty()) { + // We found an allele path for this node // Get its site and allele so we can count it as a biallelic @@ -467,74 +759,59 @@ int main_stats(int argc, char** argv) { #pragma omp critical (allele_path_for_node) - allele_path_for_node[node->id()] = make_pair(site, allele); + allele_path_for_node[graph->get_id(node)] = make_pair(site, allele); #pragma omp critical (reads_on_allele) - reads_on_allele[site][allele] = 0; - } - }); - - - // These are the general stats we will compute. 
- size_t total_alignments = 0; - size_t total_aligned = 0; - size_t total_primary = 0; - size_t total_secondary = 0; - - // These are for counting significantly allele-biased hets - size_t total_hets = 0; - size_t significantly_biased_hets = 0; + combined.reads_on_allele[site][allele] = 0; + }, true); + } - // These are for tracking which nodes are covered and which are not - map node_visit_counts; - - // And for counting indels - // Inserted bases also counts softclips - size_t total_insertions = 0; - size_t total_inserted_bases = 0; - size_t total_deletions = 0; - size_t total_deleted_bases = 0; - // And substitutions - size_t total_substitutions = 0; - size_t total_substituted_bases = 0; - // And softclips - size_t total_softclips = 0; - size_t total_softclipped_bases = 0; - - // In verbose mode we want to report details of insertions, deletions, - // and substitutions, and soft clips. - vector> insertions; - vector> deletions; - vector> substitutions; - vector> softclips; + // Allocate per-thread storage for stats + size_t thread_count = vg::get_thread_count(); + vector read_stats; + read_stats.resize(thread_count); + // when we get each read, process it into the current thread's stats function lambda = [&](Alignment& aln) { int tid = omp_get_thread_num(); - + auto& stats = read_stats.at(tid); // We ought to be able to do many stats on the alignments. // Now do all the non-mapping stats - #pragma omp critical (total_alignments) - total_alignments++; - if(aln.is_secondary()) { - #pragma omp critical (total_secondary) - total_secondary++; + stats.total_alignments++; + if(aln.is_secondary() || (has_annotation(aln, "secondary") && get_annotation(aln, "secondary"))) { + stats.total_secondary++; } else { - #pragma omp critical (total_primary) - total_primary++; - if(aln.score() > 0) { + stats.total_primary++; + bool has_alignment = aln.score() > 0; + if (has_alignment) { // We only count aligned primary reads in "total aligned"; // the primary can't be unaligned if the secondary is // aligned. - #pragma omp critical (total_aligned) - total_aligned++; + stats.total_aligned++; + stats.alignment_scores[aln.score()]++; + stats.mapping_qualities[aln.mapping_quality()]++; } + + if (aln.has_fragment_next() || aln.has_fragment_prev() || has_annotation(aln, "proper_pair")) { + stats.total_paired++; + if (has_annotation(aln, "proper_pair") && get_annotation(aln, "proper_pair")) { + stats.total_proper_paired++; + } + } + + // Record the number of thread-seconds used. Time is only counted on the primaries. + stats.total_time_seconds += aln.time_used(); // Which sites and alleles does this read support. TODO: if we hit // unique nodes from multiple alleles of the same site, we should... // do something. Discard the read? Not just count it on both sides // like we do now. set> alleles_supported; + + // We check if the read has non-softclip indels, or any edits at all. + bool has_non_match_edits = false; + bool has_non_softclip_indel_edits = false; for(size_t i = 0; i < aln.path().mapping_size(); i++) { // For every mapping... @@ -549,60 +826,61 @@ int main_stats(int argc, char** argv) { } // Record that there was a visit to this node. - #pragma omp critical (node_visit_counts) - node_visit_counts[node_id]++; + stats.node_visit_counts[node_id]++; for(size_t j = 0; j < mapping.edit_size(); j++) { // Go through edits and look for each type. 
auto& edit = mapping.edit(j); if(edit.to_length() > edit.from_length()) { + // This is an insert or softclip and not a match + has_non_match_edits = true; if((j == 0 && i == 0) || (j == mapping.edit_size() - 1 && i == aln.path().mapping_size() - 1)) { // We're at the very end of the path, so this is a soft clip. - #pragma omp critical (total_softclipped_bases) - total_softclipped_bases += edit.to_length() - edit.from_length(); - #pragma omp critical (total_softclips) - total_softclips++; + stats.total_softclipped_bases += edit.to_length() - edit.from_length(); + stats.total_softclips++; if(verbose) { // Record the actual insertion - #pragma omp critical (softclips) - softclips.push_back(make_pair(node_id, edit)); + stats.softclips.push_back(make_pair(node_id, edit)); } } else { + // This is not a softclip + has_non_softclip_indel_edits = true; + // Record this insertion - #pragma omp critical (total_inserted_bases) - total_inserted_bases += edit.to_length() - edit.from_length(); - #pragma omp critical (total_insertions) - total_insertions++; + stats.total_inserted_bases += edit.to_length() - edit.from_length(); + stats.total_insertions++; if(verbose) { // Record the actual insertion - #pragma omp critical (insertions) - insertions.push_back(make_pair(node_id, edit)); + stats.insertions.push_back(make_pair(node_id, edit)); } } } else if(edit.from_length() > edit.to_length()) { + // This is a deletion and not a match + has_non_match_edits = true; + + // This is not a softclip either + has_non_softclip_indel_edits = true; + // Record this deletion - #pragma omp critical (total_deleted_bases) - total_deleted_bases += edit.from_length() - edit.to_length(); - #pragma omp critical (total_deletions) - total_deletions++; + stats.total_deleted_bases += edit.from_length() - edit.to_length(); + stats.total_deletions++; if(verbose) { // Record the actual deletion - #pragma omp critical (deletions) - deletions.push_back(make_pair(node_id, edit)); + stats.deletions.push_back(make_pair(node_id, edit)); } } else if(!edit.sequence().empty()) { + // This is a substitution and not a match + has_non_match_edits = true; + // Record this substitution // TODO: a substitution might also occur as part of a deletion/insertion above! - #pragma omp critical (total_substituted_bases) - total_substituted_bases += edit.from_length(); - #pragma omp critical (total_substitutions) - total_substitutions++; + stats.total_substituted_bases += edit.from_length(); + stats.total_substitutions++; if(verbose) { // Record the actual substitution - #pragma omp critical (substitutions) - substitutions.push_back(make_pair(node_id, edit)); + stats.substitutions.push_back(make_pair(node_id, edit)); } } @@ -612,55 +890,26 @@ int main_stats(int argc, char** argv) { for(auto& site_and_allele : alleles_supported) { // This read is informative for an allele of a site. // Up the reads on that allele of that site. - #pragma omp critical (reads_on_allele) - reads_on_allele[site_and_allele.first][site_and_allele.second]++; + stats.reads_on_allele[site_and_allele.first][site_and_allele.second]++; } + + // If there's no non-match edits, call it a perfect alignment + stats.total_perfect += !has_non_match_edits && has_alignment; + + // If there's no non-softclip indel edits, the alignment is gapless + stats.total_gapless += !has_non_softclip_indel_edits && has_alignment; } }; // Actually go through all the reads and count stuff up. 
- stream::for_each_parallel(alignment_stream, lambda); - - // Calculate stats about the reads per allele data - for(auto& site_and_alleles : reads_on_allele) { - // For every site - if(site_and_alleles.second.size() == 2) { - // If it actually has 2 alleles with unique nodes in the - // graph (so we can use the binomial) - - // We'll fill this with the counts for the two present alleles. - vector counts; - - for(auto& allele_and_count : site_and_alleles.second) { - // Collect all the counts - counts.push_back(allele_and_count.second); - } - - if(counts[0] > counts[1]) { - // We have a 50% underlying probability so we can just put - // the rarer allele first. - swap(counts[0], counts[1]); - } - - // What's the log prob for the smaller tail? - auto tail_logprob = binomial_cmf_ln(prob_to_logprob(0.5), counts[1] + counts[0], counts[0]); - - // Double it to get the two-tailed test - tail_logprob += prob_to_logprob(2); - -#ifdef debug - cerr << "Site " << site_and_alleles.first << " has " << counts[0] - << " and " << counts[1] << " p=" << logprob_to_prob(tail_logprob) << endl; -#endif - - if(tail_logprob < prob_to_logprob(0.05)) { - significantly_biased_hets++; - } - total_hets++; - - } + vg::io::for_each_parallel(alignment_stream, lambda); + + // Now combine into a single ReadStats object (for which we pre-populated reads_on_allele with 0s). + for (auto& per_thread : read_stats) { + combined += per_thread; } + read_stats.clear(); // Go through all the nodes again and sum up unvisited nodes size_t unvisited_nodes = 0; @@ -679,86 +928,163 @@ int main_stats(int argc, char** argv) { // as many times as their nodes are touched. Also note that we ignore // edge effects and a read that stops before the end of a node will // visit the whole node. - graph.for_each_node_parallel([&](Node* node) { - // For every node - if(!node_visit_counts.count(node->id()) || node_visit_counts.at(node->id()) == 0) { - // If we never visited it with a read, count it. - #pragma omp critical (unvisited_nodes) - unvisited_nodes++; - #pragma omp critical (unvisited_node_bases) - unvisited_node_bases += node->sequence().size(); - if(verbose) { - #pragma omp critical (unvisited_ids) - unvisited_ids.insert(node->id()); - } - } else if(node_visit_counts.at(node->id()) == 1) { - // If we visited it with only one read, count it. - #pragma omp critical (single_visited_nodes) - single_visited_nodes++; - #pragma omp critical (single_visited_node_bases) - single_visited_node_bases += node->sequence().size(); - if(verbose) { - #pragma omp critical (single_visited_ids) - single_visited_ids.insert(node->id()); + + // These are for counting significantly allele-biased hets + size_t total_hets = 0; + size_t significantly_biased_hets = 0; + + if (graph != nullptr) { + + // Calculate stats about the reads per allele data + for(auto& site_and_alleles : combined.reads_on_allele) { + // For every site + if(site_and_alleles.second.size() == 2) { + // If it actually has 2 alleles with unique nodes in the + // graph (so we can use the binomial) + + // We'll fill this with the counts for the two present alleles. + vector counts; + + for(auto& allele_and_count : site_and_alleles.second) { + // Collect all the counts + counts.push_back(allele_and_count.second); + } + + if(counts[0] > counts[1]) { + // We have a 50% underlying probability so we can just put + // the rarer allele first. + swap(counts[0], counts[1]); + } + + // What's the log prob for the smaller tail? 
+ auto tail_logprob = binomial_cmf_ln(prob_to_logprob(0.5), counts[1] + counts[0], counts[0]); + + // Double it to get the two-tailed test + tail_logprob += prob_to_logprob(2); + +#ifdef debug + cerr << "Site " << site_and_alleles.first << " has " << counts[0] + << " and " << counts[1] << " p=" << logprob_to_prob(tail_logprob) << endl; +#endif + + if(tail_logprob < prob_to_logprob(0.05)) { + significantly_biased_hets++; + } + total_hets++; + } } - }); - cout << "Total alignments: " << total_alignments << endl; - cout << "Total primary: " << total_primary << endl; - cout << "Total secondary: " << total_secondary << endl; - cout << "Total aligned: " << total_aligned << endl; + graph->for_each_handle([&](handle_t node) { + // For every node + + // Look up its stats + nid_t id = graph->get_id(node); + size_t length = graph->get_length(node); + + if(!combined.node_visit_counts.count(id) || combined.node_visit_counts.at(id) == 0) { + // If we never visited it with a read, count it. + #pragma omp critical (unvisited_nodes) + unvisited_nodes++; + #pragma omp critical (unvisited_node_bases) + unvisited_node_bases += length; + if(verbose) { + #pragma omp critical (unvisited_ids) + unvisited_ids.insert(id); + } + } else if(combined.node_visit_counts.at(id) == 1) { + // If we visited it with only one read, count it. + #pragma omp critical (single_visited_nodes) + single_visited_nodes++; + #pragma omp critical (single_visited_node_bases) + single_visited_node_bases += length; + if(verbose) { + #pragma omp critical (single_visited_ids) + single_visited_ids.insert(id); + } + } + }); + + } - cout << "Insertions: " << total_inserted_bases << " bp in " << total_insertions << " read events" << endl; + cout << "Total alignments: " << combined.total_alignments << endl; + cout << "Total primary: " << combined.total_primary << endl; + cout << "Total secondary: " << combined.total_secondary << endl; + cout << "Total aligned: " << combined.total_aligned << endl; + cout << "Total perfect: " << combined.total_perfect << endl; + cout << "Total gapless (softclips allowed): " << combined.total_gapless << endl; + cout << "Total paired: " << combined.total_paired << endl; + cout << "Total properly paired: " << combined.total_proper_paired << endl; + + SummaryStatistics score_stats = summary_statistics(combined.alignment_scores); + cout << "Alignment score: mean " << score_stats.mean + << ", median " << score_stats.median + << ", stdev " << score_stats.stdev + << ", max " << score_stats.max_value << " (" << score_stats.count_of_max << " reads)" << endl; + SummaryStatistics mapq_stats = summary_statistics(combined.mapping_qualities); + cout << "Mapping quality: mean " << mapq_stats.mean + << ", median " << mapq_stats.median + << ", stdev " << mapq_stats.stdev + << ", max " << mapq_stats.max_value << " (" << mapq_stats.count_of_max << " reads)" << endl; + + cout << "Insertions: " << combined.total_inserted_bases << " bp in " << combined.total_insertions << " read events" << endl; if(verbose) { - for(auto& id_and_edit : insertions) { + for(auto& id_and_edit : combined.insertions) { cout << "\t" << id_and_edit.second.from_length() << " -> " << id_and_edit.second.sequence() << " on " << id_and_edit.first << endl; } } - cout << "Deletions: " << total_deleted_bases << " bp in " << total_deletions << " read events" << endl; + cout << "Deletions: " << combined.total_deleted_bases << " bp in " << combined.total_deletions << " read events" << endl; if(verbose) { - for(auto& id_and_edit : deletions) { + for(auto& id_and_edit : 
combined.deletions) { cout << "\t" << id_and_edit.second.from_length() << " -> " << id_and_edit.second.to_length() << " on " << id_and_edit.first << endl; } } - cout << "Substitutions: " << total_substituted_bases << " bp in " << total_substitutions << " read events" << endl; + cout << "Substitutions: " << combined.total_substituted_bases << " bp in " << combined.total_substitutions << " read events" << endl; if(verbose) { - for(auto& id_and_edit : substitutions) { + for(auto& id_and_edit : combined.substitutions) { cout << "\t" << id_and_edit.second.from_length() << " -> " << id_and_edit.second.sequence() << " on " << id_and_edit.first << endl; } } - cout << "Softclips: " << total_softclipped_bases << " bp in " << total_softclips << " read events" << endl; + cout << "Softclips: " << combined.total_softclipped_bases << " bp in " << combined.total_softclips << " read events" << endl; if(verbose) { - for(auto& id_and_edit : softclips) { + for(auto& id_and_edit : combined.softclips) { cout << "\t" << id_and_edit.second.from_length() << " -> " << id_and_edit.second.sequence() << " on " << id_and_edit.first << endl; } } - - cout << "Unvisited nodes: " << unvisited_nodes << "/" << graph.node_count() - << " (" << unvisited_node_bases << " bp)" << endl; - if(verbose) { - for(auto& id : unvisited_ids) { - cout << "\t" << id << endl; - } + + if (combined.total_time_seconds > 0.0) { + // Time was recorded + cout << "Total time: " << combined.total_time_seconds << " seconds" << endl; + cout << "Speed: " << (combined.total_primary / combined.total_time_seconds) << " reads/second" << endl; } + + if (graph != nullptr) { + cout << "Unvisited nodes: " << unvisited_nodes << "/" << graph->get_node_count() + << " (" << unvisited_node_bases << " bp)" << endl; + if(verbose) { + for(auto& id : unvisited_ids) { + cout << "\t" << id << endl; + } + } - cout << "Single-visited nodes: " << single_visited_nodes << "/" << graph.node_count() - << " (" << single_visited_node_bases << " bp)" << endl; - if(verbose) { - for(auto& id : single_visited_ids) { - cout << "\t" << id << endl; + cout << "Single-visited nodes: " << single_visited_nodes << "/" << graph->get_node_count() + << " (" << single_visited_node_bases << " bp)" << endl; + if(verbose) { + for(auto& id : single_visited_ids) { + cout << "\t" << id << endl; + } } - } - cout << "Significantly biased heterozygous sites: " << significantly_biased_hets << "/" << total_hets; - if(total_hets > 0) { - cout << " (" << (double)significantly_biased_hets / total_hets * 100 << "%)"; + cout << "Significantly biased heterozygous sites: " << significantly_biased_hets << "/" << total_hets; + if(total_hets > 0) { + cout << " (" << (double)significantly_biased_hets / total_hets * 100 << "%)"; + } + cout << endl; } - cout << endl; } @@ -766,18 +1092,45 @@ int main_stats(int argc, char** argv) { if (snarl_stats) { // We will go through all the snarls and compute stats. + require_graph(); + // First compute the snarls - auto manager = CactusSnarlFinder(graph).find_snarls(); + auto manager = IntegratedSnarlFinder(*graph).find_snarls_parallel(); // We will track depth for each snarl unordered_map depth; + + // TSV header + cout << "Start\tStart-Reversed\tEnd\tEnd-Reversed\tUltrabubble\tUnary\tShallow-Nodes\tShallow-Edges\tShallow-bases\tDeep-Nodes\tDeep-Edges\tDeep-Bases\tDepth\tChildren\tChains\tChains-Children\tNet-Graph-Size\n"; manager.for_each_snarl_preorder([&](const Snarl* snarl) { // Loop over all the snarls and print stats. 
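As an aside on the read-stats section above: the per-site allele-bias check boils down to a two-tailed binomial test carried out in log space. A minimal standalone sketch of that test follows; it only reuses `binomial_cmf_ln`, `prob_to_logprob`, and the 0.05 cutoff as they are used in this patch, and is illustrative rather than part of the change.

```cpp
// Sketch only: the allele-bias test applied per heterozygous site above,
// written as a standalone helper. binomial_cmf_ln and prob_to_logprob are
// assumed to behave as they are used in the stats code in this patch.
bool significantly_biased(size_t count_a, size_t count_b, double alpha = 0.05) {
    if (count_a > count_b) {
        // Under the 50% null hypothesis we can just put the rarer allele first.
        std::swap(count_a, count_b);
    }
    // Log-probability of the smaller tail...
    double tail_logprob = binomial_cmf_ln(prob_to_logprob(0.5), count_a + count_b, count_a);
    // ...doubled to make the test two-tailed.
    tail_logprob += prob_to_logprob(2);
    return tail_logprob < prob_to_logprob(alpha);
}
```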
+ + // snarl + cout << snarl->start().node_id() << "\t" << snarl->start().backward() << "\t"; + cout << snarl->end().node_id() << "\t" << snarl->end().backward() << "\t"; // Snarl metadata - cout << "ultrabubble\t" << (snarl->type() == ULTRABUBBLE) << endl; - cout << "unary\t" << (snarl->type() == UNARY) << endl; + cout << (snarl->type() == ULTRABUBBLE) << "\t"; + cout << (snarl->type() == UNARY) << "\t"; + + // Snarl size not including boundary nodes + pair, unordered_set > contents = manager.shallow_contents(snarl, *graph, false); + size_t num_bases = 0; + for (vg::id_t node_id : contents.first) { + num_bases += graph->get_length(graph->get_handle(node_id)); + } + cout << contents.first.size() << "\t"; + cout << contents.second.size() << "\t"; + cout << num_bases << "\t"; + contents = manager.deep_contents(snarl, *graph, false); + num_bases = 0; + for (vg::id_t node_id : contents.first) { + num_bases += graph->get_length(graph->get_handle(node_id)); + } + cout << contents.first.size() << "\t"; + cout << contents.second.size() << "\t"; + cout << num_bases << "\t"; // Compute depth auto parent = manager.parent_of(snarl); @@ -787,34 +1140,46 @@ int main_stats(int argc, char** argv) { } else { depth[snarl] = depth[parent] + 1; } - cout << "depth\t" << depth[snarl] << endl; + cout << depth[snarl] << "\t"; // Number of children (looking inside chains) - cout << "children\t" << manager.children_of(snarl).size() << endl; + cout << manager.children_of(snarl).size() << "\t"; // Number of chains (including unary child snarls) // Will be 0 for leaves auto chains = manager.chains_of(snarl); - cout << "chains\t" << chains.size() << endl; - - for (auto& chain : chains) { + cout << chains.size() << "\t"; + + for (size_t i = 0; i < chains.size(); ++i) { // Number of children in each chain - cout << "chain-size\t" << chain.size() << endl; + cout << chains[i].size(); + if (i < chains.size() - 1) { + cout << ","; + } + } + if (chains.empty()) { + cout << "0"; } + cout << "\t"; // Net graph info // Internal connectivity not important, we just want the size. 
- auto netGraph = manager.net_graph_of(snarl, &graph, false); - cout << "net-graph-size\t" << netGraph.node_size() << endl; - + auto netGraph = manager.net_graph_of(snarl, graph, false); + cout << netGraph.get_node_count() << endl; }); } + if (!distance_index_filename.empty()) { + //Print snarl stats from a distance index + auto distance_index = vg::io::VPKG::load_one(distance_index_filename); + distance_index->print_snarl_stats(); + } + return 0; } // Register subcommand -static Subcommand vg_stats("stats", "metrics describing graph properties", TOOLKIT, main_stats); +static Subcommand vg_stats("stats", "metrics describing graph and alignment properties", TOOLKIT, main_stats); diff --git a/src/subcommand/subcommand.cpp b/src/subcommand/subcommand.cpp index 956f0e53b5b..76c6478acf0 100644 --- a/src/subcommand/subcommand.cpp +++ b/src/subcommand/subcommand.cpp @@ -24,6 +24,9 @@ std::ostream& operator<<(std::ostream& out, const CommandCategory& category) { case DEVELOPMENT: out << "developer commands"; break; + case DEPRECATED: + // we don't show these + break; } return out; diff --git a/src/subcommand/subcommand.hpp b/src/subcommand/subcommand.hpp index 5a58b670bfe..65a55046e2d 100644 --- a/src/subcommand/subcommand.hpp +++ b/src/subcommand/subcommand.hpp @@ -59,7 +59,9 @@ enum CommandCategory { /// Some commands are less important but potentially useful widgets that let you do a thing you might need WIDGET, /// Some commands are useful really only for developers - DEVELOPMENT + DEVELOPMENT, + /// Some commands we're trying to move away from + DEPRECATED }; /// Define a way to print the titles of the different categories diff --git a/src/subcommand/surject_main.cpp b/src/subcommand/surject_main.cpp index 0cf3ecc7e41..8121ababf47 100644 --- a/src/subcommand/surject_main.cpp +++ b/src/subcommand/surject_main.cpp @@ -10,10 +10,21 @@ #include "subcommand.hpp" +#include +#include +#include + #include "../vg.hpp" -#include "../stream.hpp" +#include "../xg.hpp" +#include +#include #include "../utility.hpp" #include "../surjector.hpp" +#include "../hts_alignment_emitter.hpp" +#include "../multipath_alignment_emitter.hpp" +#include "../crash.hpp" +#include "../watchdog.hpp" + using namespace std; using namespace vg; @@ -24,33 +35,73 @@ void help_surject(char** argv) { << "Transforms alignments to be relative to particular paths." 
<< endl << endl << "options:" << endl - << " -x, --xg-name FILE use the graph in this xg index" << endl - << " -t, --threads N number of threads to use" << endl - << " -p, --into-path NAME surject into this path (many allowed, default: all in xg)" << endl - << " -F, --into-paths FILE surject into nonoverlapping path names listed in FILE (one per line)" << endl - << " -i, --interleaved GAM is interleaved paired-ended, so when outputting HTS formats, pair reads" << endl - << " -c, --cram-output write CRAM to stdout" << endl - << " -b, --bam-output write BAM to stdout" << endl - << " -s, --sam-output write SAM to stdout" << endl - << " -C, --compression N level for compression [0-9]" << endl; + << " -x, --xg-name FILE use this graph or xg index (required)" << endl + << " -t, --threads N number of threads to use" << endl + << " -p, --into-path NAME surject into this path or its subpaths (many allowed, default: reference, then non-alt generic)" << endl + << " -F, --into-paths FILE surject into path names listed in HTSlib sequence dictionary or path list FILE" << endl + << " -i, --interleaved GAM is interleaved paired-ended, so when outputting HTS formats, pair reads" << endl + << " -M, --multimap include secondary alignments to all overlapping paths instead of just primary" << endl + << " -G, --gaf-input input file is GAF instead of GAM" << endl + << " -m, --gamp-input input file is GAMP instead of GAM" << endl + << " -c, --cram-output write CRAM to stdout" << endl + << " -b, --bam-output write BAM to stdout" << endl + << " -s, --sam-output write SAM to stdout" << endl + << " -l, --subpath-local let the multipath mapping surjection produce local (rather than global) alignments" << endl + << " -P, --prune-low-cplx prune short and low complexity anchors during realignment" << endl + << " -a, --max-anchors N use no more than N anchors per target path (default: 200)" << endl + << " -S, --spliced interpret long deletions against paths as spliced alignments" << endl + << " -A, --qual-adj adjust scoring for base qualities, if they are available" << endl + << " -N, --sample NAME set this sample name for all reads" << endl + << " -R, --read-group NAME set this read group for all reads" << endl + << " -f, --max-frag-len N reads with fragment lengths greater than N will not be marked properly paired in SAM/BAM/CRAM" << endl + << " -L, --list-all-paths annotate SAM records with a list of all attempted re-alignments to paths in SS tag" << endl + << " -C, --compression N level for compression [0-9]" << endl + << " -V, --no-validate skip checking whether alignments plausibly are against the provided graph" << endl + << " -w, --watchdog-timeout N warn when reads take more than the given number of seconds to surject" << endl; } -int main_surject(int argc, char** argv) { +/// If the given alignment doesn't make sense against the given graph (i.e. +/// doesn't agree with the nodes in the graph), print a message and stop the +/// program. Is thread-safe. +static void ensure_alignment_is_for_graph(const Alignment& aln, const HandleGraph& graph) { + AlignmentValidity validity = alignment_is_valid(aln, &graph); + if (!validity) { + #pragma omp critical (cerr) + { + std::cerr << "error:[vg surject] Alignment " << aln.name() << " cannot be interpreted against this graph: " << validity.message << std::endl; + std::cerr << "Make sure that you are using the same graph that the reads were mapped to!" 
<< std::endl; + } + exit(1); + } +} +int main_surject(int argc, char** argv) { + if (argc == 2) { help_surject(argv); return 1; } - + string xg_name; - set path_names; - string path_prefix; + vector path_names; string path_file; - string output_type = "gam"; - string input_type = "gam"; + string output_format = "GAM"; + string input_format = "GAM"; + bool spliced = false; bool interleaved = false; - string header_file; + string sample_name; + string read_group; + int32_t max_frag_len = 0; int compress_level = 9; + int min_splice_length = 20; + size_t watchdog_timeout = 10; + bool subpath_global = true; // force full length alignments in mpmap resolution + bool qual_adj = false; + bool prune_anchors = false; + size_t max_anchors = 200; + bool annotate_with_all_path_scores = false; + bool multimap = false; + bool validate = true; int c; optind = 2; // force optind past command positional argument @@ -58,22 +109,35 @@ int main_surject(int argc, char** argv) { static struct option long_options[] = { {"help", no_argument, 0, 'h'}, - {"xb-name", required_argument, 0, 'x'}, + {"xg-name", required_argument, 0, 'x'}, {"threads", required_argument, 0, 't'}, {"into-path", required_argument, 0, 'p'}, {"into-paths", required_argument, 0, 'F'}, - {"into-prefix", required_argument, 0, 'P'}, + {"ref-paths", required_argument, 0, 'F'}, // Now an alias for --into-paths + {"subpath-local", required_argument, 0, 'l'}, {"interleaved", no_argument, 0, 'i'}, + {"multimap", no_argument, 0, 'M'}, + {"gaf-input", no_argument, 0, 'G'}, + {"gamp-input", no_argument, 0, 'm'}, {"cram-output", no_argument, 0, 'c'}, {"bam-output", no_argument, 0, 'b'}, {"sam-output", no_argument, 0, 's'}, - {"header-from", required_argument, 0, 'H'}, + {"spliced", no_argument, 0, 'S'}, + {"prune-low-cplx", no_argument, 0, 'P'}, + {"max-anchors", required_argument, 0, 'a'}, + {"qual-adj", no_argument, 0, 'A'}, + {"sample", required_argument, 0, 'N'}, + {"read-group", required_argument, 0, 'R'}, + {"max-frag-len", required_argument, 0, 'f'}, + {"list-all-paths", no_argument, 0, 'L'}, {"compress", required_argument, 0, 'C'}, + {"no-validate", required_argument, 0, 'V'}, + {"watchdog-timeout", required_argument, 0, 'w'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hx:p:F:P:icbsH:C:t:", + c = getopt_long (argc, argv, "hx:p:F:liGmcbsN:R:f:C:t:SPa:ALMVw:", long_options, &option_index); // Detect the end of the options. 
@@ -88,45 +152,93 @@ int main_surject(int argc, char** argv) { break; case 'p': - path_names.insert(optarg); + path_names.push_back(optarg); break; case 'F': path_file = optarg; break; - - case 'P': - path_prefix = optarg; - break; - - case 'H': - header_file = optarg; + + case 'l': + subpath_global = false; break; case 'i': interleaved = true; break; + + case 'M': + multimap = true; + break; + + case 'G': + input_format = "GAF"; + break; + + case 'm': + input_format = "GAMP"; + break; case 'c': - output_type = "cram"; + output_format = "CRAM"; break; case 'b': - output_type = "bam"; + output_format = "BAM"; break; case 's': compress_level = -1; - output_type = "sam"; + output_format = "SAM"; + break; + + case 'S': + spliced = true; + break; + + case 'P': + prune_anchors = true; + break; + + case 'a': + max_anchors = parse(optarg); + break; + + case 'A': + qual_adj = true; break; - case 't': - omp_set_num_threads(parse(optarg)); + case 'N': + sample_name = optarg; + break; + + case 'R': + read_group = optarg; + break; + + case 'f': + max_frag_len = parse(optarg); break; case 'C': compress_level = parse(optarg); break; + + case 'V': + validate = false; + break; + + case 'w': + watchdog_timeout = parse(optarg); + break; + + case 't': + omp_set_num_threads(parse(optarg)); + break; + + case 'L': + annotate_with_all_path_scores = true; + break; case 'h': case '?': @@ -139,383 +251,445 @@ int main_surject(int argc, char** argv) { } } - string file_name = get_input_file_name(optind, argc, argv); - - if (!path_file.empty()){ - // open the file - ifstream in(path_file); - string line; - while (std::getline(in,line)) { - path_names.insert(line); + // Create a preprocessor to apply read group and sample name overrides in place + auto set_metadata = [&](Alignment& update) { + if (!sample_name.empty()) { + update.set_sample_name(sample_name); } - } + if (!read_group.empty()) { + update.set_read_group(read_group); + } + }; - xg::XG* xgidx = nullptr; - ifstream xg_stream(xg_name); - if(xg_stream) { - xgidx = new xg::XG(xg_stream); + string file_name = get_input_file_name(optind, argc, argv); + + PathPositionHandleGraph* xgidx = nullptr; + unique_ptr path_handle_graph; + // If we add an overlay for path position queries, use one optimized for + // use with reference paths. + bdsg::ReferencePathOverlayHelper overlay_helper; + if (!xg_name.empty()) { + path_handle_graph = vg::io::VPKG::load_one(xg_name); + xgidx = overlay_helper.apply(path_handle_graph.get()); + } else { + // We need an XG index for the rest of the algorithm + cerr << "error[vg surject] XG index (-x) is required for surjection" << endl; + exit(1); } - if (!xg_stream || xgidx == nullptr) { - cerr << "[vg surject] error: could not open xg index" << endl; - return 1; + + // Get the paths to surject into and their length information, either from + // the given file, or from the provided list, or from sniffing the graph. + vector> sequence_dictionary = get_sequence_dictionary(path_file, path_names, *xgidx); + // Clear out path_names so we don't accidentally use it + path_names.clear(); + + // Convert to a set for membership testing + unordered_set paths; + paths.reserve(sequence_dictionary.size()); + for (auto& entry : sequence_dictionary) { + paths.insert(get<0>(entry)); } - - // if no paths were given take all of those in the index - if (path_names.empty()) { - for (size_t i = 1; i <= xgidx->path_count; ++i) { - path_names.insert(xgidx->path_name(i)); - } + + // Make a single thread-safe Surjector. 
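For orientation, the graph loading above follows vg's load-then-overlay pattern: deserialize whatever graph type the file contains, then wrap it so reference path position queries are available. A small sketch of that pattern is below; the template arguments are reconstructed here (they are elided in the listing above), and `xg_name` is the input filename as in the surrounding code.

```cpp
// Sketch of the load-then-overlay pattern used by the new surject code
// (headers as added at the top of surject_main.cpp). The overlay helper owns
// any temporary index it builds, so both the unique_ptr and the helper must
// outlive every use of the raw PathPositionHandleGraph pointer.
std::unique_ptr<PathHandleGraph> base = vg::io::VPKG::load_one<PathHandleGraph>(xg_name);
bdsg::ReferencePathOverlayHelper overlay_helper;
PathPositionHandleGraph* graph = overlay_helper.apply(base.get());
```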
+ Surjector surjector(xgidx); + surjector.adjust_alignments_for_base_quality = qual_adj; + surjector.prune_suspicious_anchors = prune_anchors; + surjector.max_anchors = max_anchors; + if (spliced) { + surjector.min_splice_length = min_splice_length; + // we have to bump this up to be sure to align most splice junctions + surjector.max_subgraph_bases = 16 * 1024 * 1024; } - - map path_length; - int num_paths = xgidx->max_path_rank(); - for (int i = 1; i <= num_paths; ++i) { - auto name = xgidx->path_name(i); - path_length[name] = xgidx->path_length(name); + else { + surjector.min_splice_length = numeric_limits::max(); } + surjector.annotate_with_all_path_scores = annotate_with_all_path_scores; - int thread_count = get_thread_count(); - vector surjectors(thread_count); - for (int i = 0; i < surjectors.size(); i++) { - surjectors[i] = new Surjector(xgidx); - } - - if (input_type == "gam") { - if (output_type == "gam") { - vector > buffer; - buffer.resize(thread_count); - function lambda = [&xgidx, &path_names, &buffer, &surjectors](Alignment& src) { - int tid = omp_get_thread_num(); - // Since we're outputting full GAM, we ignore all this info - // about where on the path the alignment falls. But we need to - // provide the space to the surject call anyway. - string path_name; - int64_t path_pos; - bool path_reverse; - buffer[tid].push_back(surjectors[omp_get_thread_num()]->path_anchored_surject(src, - path_names, - path_name, - path_pos, - path_reverse)); - stream::write_buffered(cout, buffer[tid], 100); - }; - get_input_file(file_name, [&](istream& in) { - stream::for_each_parallel(in, lambda); - }); - for (int i = 0; i < thread_count; ++i) { - stream::write_buffered(cout, buffer[i], 0); // flush - } - } else { - char out_mode[5]; - string out_format = ""; - strcpy(out_mode, "w"); - if (output_type == "bam") { out_format = "b"; } - else if (output_type == "cram") { out_format = "c"; } - else { out_format = ""; } - strcat(out_mode, out_format.c_str()); - if (compress_level >= 0) { - char tmp[2]; - tmp[0] = compress_level + '0'; tmp[1] = '\0'; - strcat(out_mode, tmp); - } - - int thread_count = get_thread_count(); - - // bam/sam/cram output - - // Define a string to hold the SAM header, to be generated later. - string header; - // To generate the header, we need to know the read group for each sample name. - map rg_sample; - - samFile* out = nullptr; - int buffer_limit = 100; - - bam_hdr_t* hdr = nullptr; - int64_t count = 0; - // TODO: What good is this lock if we continue without getting it if the buffer is overfull??? - omp_lock_t output_lock; - omp_init_lock(&output_lock); - - // We define a type to represent a surjected alignment, ready for - // HTSlib output. It consists of surjected path name (or ""), - // surjected position (or -1), surjected orientation, and the - // actual Alignment. - using surjected_t = tuple; - // You make one with make_tuple() - - // We define a basic surject function, which also fills in the read group info we need to make the header - auto surject_alignment = [&](const Alignment& src) { - - // Set out some variables to populate with the linear position - // info we need for SAM/BAM/CRAM - string path_name; - // Make sure to initialize pos to -1 since it may not be set by - // surject_alignment if the read is unmapped, and unmapped - // reads need to come out with a 0 1-based position. 
- int64_t path_pos = -1; - bool path_reverse = false; - auto surj = surjectors[omp_get_thread_num()]->path_anchored_surject(src, - path_names, - path_name, - path_pos, - path_reverse); - // Always use the surjected alignment, even if it surjects to unmapped. - - if (!hdr && !surj.read_group().empty() && !surj.sample_name().empty()) { - // There's no header yet (although we race its - // construction) and we have a sample and a read group. - - // Record the read group for the sample that this read - // represents, so that when we build the header we list it. -#pragma omp critical (hts_header) - rg_sample[surj.read_group()] = surj.sample_name(); - } - - return make_tuple(path_name, path_pos, path_reverse, surj); - }; - - // We also define a function to emit the header if it hasn't been made already. - // Note that the header will only list the samples and read groups in reads we have encountered so far! - auto ensure_header = [&]() { -#pragma omp critical (hts_header) + // Count our threads + int thread_count = vg::get_thread_count(); + + // Prepare the watchdog + unique_ptr watchdog(new Watchdog(thread_count, chrono::seconds(watchdog_timeout))); + + if (input_format == "GAM" || input_format == "GAF") { + + // Give helpful warning if someone tries to surject an un-surjectable GAF + auto check_gaf_aln = [&](const Alignment& src) { + if (src.has_path() && src.sequence().empty()) { +#pragma omp critical { - if (!hdr) { - hdr = hts_string_header(header, path_length, rg_sample); - if ((out = sam_open("-", out_mode)) == 0) { + cerr << "error:[surject] Read " << src.name() << " is aligned but does not have a sequence and therefore cannot be surjected. Was it derived from a GAF without a base-level alignment? Or a GAF with a CIGAR string in the 'cg' tag (which does not provide enough information to reconstruct the sequence)?" << endl; + exit(1); + } + } + }; + + // Set up output to an emitter that will handle serialization. + // It should process output raw, without any surjection, and it should + // respect our parameter for whether to think with splicing. + unique_ptr alignment_emitter = get_alignment_emitter("-", + output_format, sequence_dictionary, thread_count, xgidx, + ALIGNMENT_EMITTER_FLAG_HTS_RAW | (spliced * ALIGNMENT_EMITTER_FLAG_HTS_SPLICED)); + + if (interleaved) { + // GAM input is paired, and for HTS output reads need to know their pair partners' mapping locations. + // TODO: We don't preserve order relationships (like primary/secondary) beyond the interleaving. + function lambda = [&](Alignment& src1, Alignment& src2) { + try { + set_crash_context(src1.name() + ", " + src2.name()); + size_t thread_num = omp_get_thread_num(); + if (watchdog) { + watchdog->check_in(thread_num, src1.name() + ", " + src2.name()); + } + // Make sure that the alignments are actually paired with each other + // (proper fragment_prev/fragment_next). We want to catch people giving us + // un-interleaved GAMs as interleaved. + // TODO: Integrate into for_each_interleaved_pair_parallel when running on Alignments. 
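One detail of the emitter setup above worth spelling out: because `spliced` is a bool, `spliced * ALIGNMENT_EMITTER_FLAG_HTS_SPLICED` contributes the flag only when splicing is enabled. A more explicit equivalent, using the flag names from this patch, would be:

```cpp
// Equivalent, more explicit construction of the alignment emitter flags.
int flags = ALIGNMENT_EMITTER_FLAG_HTS_RAW;
if (spliced) {
    flags |= ALIGNMENT_EMITTER_FLAG_HTS_SPLICED;
}
```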
+ if (src1.has_fragment_next()) { + // Alignment 1 comes first in fragment + if (src1.fragment_next().name() != src2.name() || + !src2.has_fragment_prev() || + src2.fragment_prev().name() != src1.name()) { + #pragma omp critical (cerr) - cerr << "[vg surject] error: failed to open stdout for writing HTS output" << endl; + cerr << "[vg surject] error: alignments " << src1.name() + << " and " << src2.name() << " are adjacent but not paired" << endl; + exit(1); - } else { - // write the header - if (sam_hdr_write(out, hdr) != 0) { + + } + } else if (src2.has_fragment_next()) { + // Alignment 2 comes first in fragment + if (src2.fragment_next().name() != src1.name() || + !src1.has_fragment_prev() || + src1.fragment_prev().name() != src2.name()) { + #pragma omp critical (cerr) - cerr << "[vg surject] error: failed to write the SAM header" << endl; + cerr << "[vg surject] error: alignments " << src1.name() + << " and " << src2.name() << " are adjacent but not paired" << endl; + + exit(1); + + } + } else { + // Alignments aren't paired up at all +#pragma omp critical (cerr) + cerr << "[vg surject] error: alignments " << src1.name() + << " and " << src2.name() << " are adjacent but not paired" << endl; + + exit(1); + } + + if (validate) { + ensure_alignment_is_for_graph(src1, *xgidx); + ensure_alignment_is_for_graph(src2, *xgidx); + } + + // Preprocess read to set metadata before surjection + set_metadata(src1); + set_metadata(src2); + + // Surject and emit. + if (multimap) { + + auto surjected1 = surjector.multi_surject(src1, paths, subpath_global, spliced); + auto surjected2 = surjector.multi_surject(src2, paths, subpath_global, spliced); + + // we have to pair these up manually + unordered_map, size_t> strand_idx1, strand_idx2; + for (size_t i = 0; i < surjected1.size(); ++i) { + const auto& pos = surjected1[i].refpos(0); + strand_idx1[make_pair(pos.name(), pos.is_reverse())] = i; + } + for (size_t i = 0; i < surjected2.size(); ++i) { + const auto& pos = surjected2[i].refpos(0); + strand_idx2[make_pair(pos.name(), pos.is_reverse())] = i; + } + + for (size_t i = 0; i < surjected1.size(); ++i) { + const auto& pos = surjected1[i].refpos(0); + auto it = strand_idx2.find(make_pair(pos.name(), !pos.is_reverse())); + if (it != strand_idx2.end()) { + // the alignments are paired on this strand + alignment_emitter->emit_pair(move(surjected1[i]), move(surjected2[it->second]), max_frag_len); + } + else { + // this strand's surjection is unpaired + alignment_emitter->emit_single(move(surjected1[i])); } } + for (size_t i = 0; i < surjected2.size(); ++i) { + const auto& pos = surjected2[i].refpos(0); + if (!strand_idx1.count(make_pair(pos.name(), !pos.is_reverse()))) { + // this strand's surjection is unpaired + alignment_emitter->emit_single(move(surjected2[i])); + } + } + } + else { + // FIXME: these aren't forced to be on the same path, which could be fucky + alignment_emitter->emit_pair(surjector.surject(src1, paths, subpath_global, spliced), + surjector.surject(src2, paths, subpath_global, spliced), + max_frag_len); } + if (watchdog) { + watchdog->check_out(thread_num); + } + clear_crash_context(); + } catch (const std::exception& ex) { + report_exception(ex); } }; - - // Finally, we have a little widget function to write a BAM record and check for errors. - // Consumes the passed record. 
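To summarize the `--multimap` pairing logic just shown in isolation: each read is surjected onto every overlapping path, a surjection of read 1 is paired with the surjection of read 2 that landed on the same path but the opposite strand, and any surjection without a partner is emitted unpaired. A minimal sketch of the matching step (using `std::map` instead of a hashable pair key, and the `refpos` fields as used above):

```cpp
// Sketch: pair surjections of two reads by (path name, opposite strand).
std::map<std::pair<std::string, bool>, size_t> strand_index2;
for (size_t j = 0; j < surjected2.size(); ++j) {
    const auto& pos = surjected2[j].refpos(0);
    strand_index2[{pos.name(), pos.is_reverse()}] = j;
}
for (size_t i = 0; i < surjected1.size(); ++i) {
    const auto& pos = surjected1[i].refpos(0);
    auto it = strand_index2.find({pos.name(), !pos.is_reverse()});
    if (it != strand_index2.end()) {
        // Paired: emit surjected1[i] together with surjected2[it->second].
    } else {
        // Unpaired on this strand: emit surjected1[i] on its own.
    }
}
```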
- auto write_bam_record = [&](bam1_t* b) { - assert(out != nullptr); - int r = 0; -#pragma omp critical (cout) - r = sam_write1(out, hdr, b); - if (r == 0) { -#pragma omp critical (cerr) - cerr << "[vg surject] error: writing to stdout failed" << endl; - exit(1); + if (input_format == "GAM") { + get_input_file(file_name, [&](istream& in) { + vg::io::for_each_interleaved_pair_parallel(in, lambda); + }); + } else { + auto gaf_checking_lambda = [&](Alignment& src1, Alignment& src2) { + check_gaf_aln(src1); + check_gaf_aln(src2); + return lambda(src1, src2); + }; + vg::io::gaf_paired_interleaved_for_each_parallel(*xgidx, file_name, gaf_checking_lambda); + } + } else { + // We can just surject each Alignment by itself. + // TODO: We don't preserve order relationships (like primary/secondary). + function lambda = [&](Alignment& src) { + try { + set_crash_context(src.name()); + size_t thread_num = omp_get_thread_num(); + if (watchdog) { + watchdog->check_in(thread_num, src.name()); + } + if (validate) { + ensure_alignment_is_for_graph(src, *xgidx); + } + + // Preprocess read to set metadata before surjection + set_metadata(src); + + // Surject and emit the single read. + if (multimap) { + alignment_emitter->emit_singles(surjector.multi_surject(src, paths, subpath_global, spliced)); + } + else { + alignment_emitter->emit_single(surjector.surject(src, paths, subpath_global, spliced)); + } + if (watchdog) { + watchdog->check_out(thread_num); + } + clear_crash_context(); + } catch (const std::exception& ex) { + report_exception(ex); } - bam_destroy1(b); }; - + if (input_format == "GAM") { + get_input_file(file_name, [&](istream& in) { + vg::io::for_each_parallel(in,lambda); + }); + } else { + auto gaf_checking_lambda = [&](Alignment& src) { + check_gaf_aln(src); + return lambda(src); + }; + vg::io::gaf_unpaired_for_each_parallel(*xgidx, file_name, gaf_checking_lambda); + } + } + } else if (input_format == "GAMP") { + // Working on multipath alignments. We need to set the emitter up ourselves. + auto path_order_and_length = extract_path_metadata(sequence_dictionary, *xgidx).first; + MultipathAlignmentEmitter mp_alignment_emitter("-", thread_count, output_format, xgidx, &path_order_and_length); + mp_alignment_emitter.set_read_group(read_group); + mp_alignment_emitter.set_sample_name(sample_name); + mp_alignment_emitter.set_min_splice_length(spliced ? min_splice_length : numeric_limits::max()); + + // TODO: largely repetitive with GAM + get_input_file(file_name, [&](istream& in) { if (interleaved) { - // GAM input is paired, and for HTS output reads need to know their pair partners' mapping locations - // We keep a buffer, one per thread, of pairs of surjected alignments - vector>> buffer; - buffer.resize(thread_count); - - // Define a function that handles buffers, possibly opening the - // output file if we're on the first record - auto handle_buffer = [&](vector>& buf) { - if (buf.size() >= buffer_limit) { - // We have enough data to start the file. + // GAMP input is paired, and for HTS output reads need to know their pair partners' mapping locations. + // TODO: We don't preserve order relationships (like primary/secondary) beyond the interleaving. 
+ vg::io::for_each_interleaved_pair_parallel(in, [&](MultipathAlignment& src1, MultipathAlignment& src2) { + try { + set_crash_context(src1.name() + ", " + src2.name()); + size_t thread_num = omp_get_thread_num(); + if (watchdog) { + watchdog->check_in(thread_num, src1.name() + ", " + src2.name()); + } + + // Make sure that the alignments are actually paired with each other + // (proper fragment_prev/fragment_next). We want to catch people giving us + // un-interleaved GAMs as interleaved. + // TODO: Integrate into for_each_interleaved_pair_parallel when running on Alignments. + if (src1.paired_read_name() != src2.name() || src2.paired_read_name() != src1.name()) { - // Make sure we have emitted the header - ensure_header(); - - // try to get a lock, and force things if we've built up a huge buffer waiting - // TODO: Is continuing without the lock safe? And if so why do we have the lock in the first place? - if (omp_test_lock(&output_lock) || buf.size() > 10*buffer_limit) { - for (auto& surjected_pair : buf) { - // For each pair of surjected reads - - // Unpack the first read - auto& name1 = get<0>(surjected_pair.first); - auto& pos1 = get<1>(surjected_pair.first); - auto& reverse1 = get<2>(surjected_pair.first); - auto& surj1 = get<3>(surjected_pair.first); - - // Unpack the second read - auto& name2 = get<0>(surjected_pair.second); - auto& pos2 = get<1>(surjected_pair.second); - auto& reverse2 = get<2>(surjected_pair.second); - auto& surj2 = get<3>(surjected_pair.second); +#pragma omp critical (cerr) + cerr << "[vg surject] error: alignments " << src1.name() + << " and " << src2.name() << " are adjacent but not paired" << endl; + + exit(1); + + } + else if (src1.paired_read_name().empty() || src2.paired_read_name().empty()) { + // Alignments aren't paired up at all +#pragma omp critical (cerr) + cerr << "[vg surject] error: alignments " << src1.name() + << " and " << src2.name() << " are adjacent but not paired" << endl; + + exit(1); + } + + // convert out of protobuf + multipath_alignment_t mp_src1, mp_src2; + from_proto_multipath_alignment(src1, mp_src1); + from_proto_multipath_alignment(src2, mp_src2); + + + vector, tuple>> positions; + vector> surjected; + + vector> positions_unpaired1, positions_unpaired2; + vector surjected_unpaired1, surjected_unpaired2; + + // surject and record path positions + if (multimap) { + + // TODO: highly repetitive with the version above for Alignments + + vector> positions1, positions2; + auto surjected1 = surjector.multi_surject(mp_src1, paths, positions1, subpath_global, spliced); + auto surjected2 = surjector.multi_surject(mp_src2, paths, positions2, subpath_global, spliced); + + // we have to pair these up manually + unordered_map, size_t> strand_idx1, strand_idx2; + for (size_t i = 0; i < surjected1.size(); ++i) { + strand_idx1[make_pair(get<0>(positions1[i]), get<2>(positions1[i]))] = i; + } + for (size_t i = 0; i < surjected2.size(); ++i) { + strand_idx2[make_pair(get<0>(positions2[i]), get<2>(positions2[i]))] = i; + } + + for (size_t i = 0; i < surjected1.size(); ++i) { + auto it = strand_idx2.find(make_pair(get<0>(positions1[i]), !get<2>(positions1[i]))); + if (it != strand_idx2.end()) { + // the alignments are paired on this strand + size_t j = it->second; + surjected.emplace_back(move(surjected1[i]), move(surjected2[j])); - // Compute CIGAR strings if actually surjected - string cigar1 = "", cigar2 = ""; - if (name1 != "") { - size_t path_len1 = xgidx->path_length(name1); - cigar1 = cigar_against_path(surj1, reverse1, pos1, path_len1, 0); - 
} - if (name2 != "") { - size_t path_len2 = xgidx->path_length(name2); - cigar2 = cigar_against_path(surj2, reverse2, pos2, path_len2, 0); - } + // reorder the positions to deal with the mismatch in the interfaces + positions.emplace_back(); + get<0>(positions.back().first) = get<0>(positions1[i]); + get<1>(positions.back().first) = get<2>(positions1[i]); + get<2>(positions.back().first) = get<1>(positions1[i]); + get<0>(positions.back().second) = get<0>(positions2[j]); + get<1>(positions.back().second) = get<2>(positions2[j]); + get<2>(positions.back().second) = get<1>(positions2[j]); + } + else { + // this strand's surjection is unpaired + surjected_unpaired1.emplace_back(move(surjected1[i])); - // TODO: compute template length based on - // pair distance and alignment content. - int template_length = 0; + // reorder the position to deal with the mismatch in the interfaces + positions_unpaired1.emplace_back(); + get<0>(positions_unpaired1.back()) = move(get<0>(positions1[i])); + get<1>(positions_unpaired1.back()) = get<2>(positions1[i]); + get<2>(positions_unpaired1.back()) = get<1>(positions1[i]); + } + } + for (size_t i = 0; i < surjected2.size(); ++i) { + if (!strand_idx1.count(make_pair(get<0>(positions2[i]), !get<2>(positions2[i])))) { + // this strand's surjection is unpaired + surjected_unpaired2.emplace_back(move(surjected2[i])); - // Create and write paired BAM records referencing each other - write_bam_record(alignment_to_bam(header, surj1, name1, pos1, reverse1, cigar1, - name2, pos2, template_length)); - write_bam_record(alignment_to_bam(header, surj2, name2, pos2, reverse2, cigar2, - name1, pos1, template_length)); - + // reorder the position to deal with the mismatch in the interfaces + positions_unpaired2.emplace_back(); + get<0>(positions_unpaired2.back()) = move(get<0>(positions2[i])); + get<1>(positions_unpaired2.back()) = get<2>(positions2[i]); + get<2>(positions_unpaired2.back()) = get<1>(positions2[i]); } - - omp_unset_lock(&output_lock); - buf.clear(); } } - }; - - // Define a function to surject the pair and fill in the - // HTSlib-required crossreferences - function lambda = [&](Alignment& aln1, Alignment& aln2) { - // Make sure that the alignments being surjected are - // actually paired with each other (proper - // fragment_prev/fragment_next). We want to catch people - // giving us un-interleaved GAMs as interleaved. - if (aln1.has_fragment_next()) { - // Alignment 1 comes first in fragment - if (aln1.fragment_next().name() != aln2.name() || - !aln2.has_fragment_prev() || - aln2.fragment_prev().name() != aln1.name()) { - -#pragma omp critical (cerr) - cerr << "[vg surject] error: alignments " << aln1.name() - << " and " << aln2.name() << " are adjacent but not paired" << endl; - - exit(1); - + else { + + // FIXME: these aren't required to be on the same path... 
+ positions.emplace_back(); + surjected.emplace_back(surjector.surject(mp_src1, paths, get<0>(positions.front().first), + get<2>(positions.front().first), get<1>(positions.front().first), + subpath_global, spliced), + surjector.surject(mp_src2, paths, get<0>(positions.front().second), + get<2>(positions.front().second), get<1>(positions.front().second), + subpath_global, spliced)); } - } else if (aln2.has_fragment_next()) { - // Alignment 2 comes first in fragment - if (aln2.fragment_next().name() != aln1.name() || - !aln1.has_fragment_prev() || - aln1.fragment_prev().name() != aln2.name()) { - -#pragma omp critical (cerr) - cerr << "[vg surject] error: alignments " << aln1.name() - << " and " << aln2.name() << " are adjacent but not paired" << endl; - - exit(1); + + // write to output + vector tlen_limits(surjected.size(), max_frag_len); + mp_alignment_emitter.emit_pairs(src1.name(), src2.name(), move(surjected), &positions, &tlen_limits); + mp_alignment_emitter.emit_singles(src1.name(), move(surjected_unpaired1), &positions_unpaired1); + mp_alignment_emitter.emit_singles(src2.name(), move(surjected_unpaired2), &positions_unpaired2); + if (watchdog) { + watchdog->check_out(thread_num); } - } else { - // Alignments aren't paired up at all -#pragma omp critical (cerr) - cerr << "[vg surject] error: alignments " << aln1.name() - << " and " << aln2.name() << " are adjacent but not paired" << endl; - - exit(1); + clear_crash_context(); + } catch (const std::exception& ex) { + report_exception(ex); } - - // Find our buffer - auto& thread_buffer = buffer[omp_get_thread_num()]; - // Surject each of the pair and buffer the surjected pair - thread_buffer.emplace_back(surject_alignment(aln1), surject_alignment(aln2)); - // Spit out the buffer if (over)full - handle_buffer(thread_buffer); - }; - - // now apply the alignment processor to the stream - get_input_file(file_name, [&](istream& in) { - stream::for_each_interleaved_pair_parallel(in, lambda); }); - - // Spit out any remaining data - buffer_limit = 0; - for (auto& buf : buffer) { - handle_buffer(buf); - } - } else { - // GAM input is single-ended, so each read can be surjected - // independently - - // We keep a buffer, one per thread, of surjected alignments. - vector> buffer; - buffer.resize(thread_count); + // TODO: We don't preserve order relationships (like primary/secondary). + vg::io::for_each_parallel(in, [&](MultipathAlignment& src) { + try { + set_crash_context(src.name()); + size_t thread_num = omp_get_thread_num(); + if (watchdog) { + watchdog->check_in(thread_num, src.name()); + } - // Define a function that handles buffers, possibly opening the - // output file if we're on the first record - auto handle_buffer = [&](vector& buf) { - if (buf.size() >= buffer_limit) { - // We have enough data to start the file. + multipath_alignment_t mp_src; + from_proto_multipath_alignment(src, mp_src); - // Make sure we have emitted the header - ensure_header(); - - // try to get a lock, and force things if we've built up a huge buffer waiting - // TODO: Is continuing without the lock safe? And if so why do we have the lock in the first place? 
- if (omp_test_lock(&output_lock) || buf.size() > 10*buffer_limit) { - for (auto& s : buf) { - // For each alignment in the buffer - - // Unpack it - auto& name = get<0>(s); - auto& pos = get<1>(s); - auto& reverse = get<2>(s); - auto& surj = get<3>(s); - - // Generate a CIGAR string for it - string cigar = ""; - if (name != "") { - size_t path_len = xgidx->path_length(name); - cigar = cigar_against_path(surj, reverse, pos, path_len, 0); - } - - // Create and write a single unpaired BAM record - write_bam_record(alignment_to_bam(header, surj, name, pos, reverse, cigar)); - - } + // surject and record path positions + vector> positions; + vector surjected; + + if (multimap) { + + vector> multi_positions; + surjected = surjector.multi_surject(mp_src, paths, multi_positions, subpath_global, spliced); - omp_unset_lock(&output_lock); - buf.clear(); + // positions are in different orders in these two interfaces + for (auto& position : multi_positions) { + positions.emplace_back(move(get<0>(position)), get<2>(position), get<1>(position)); + } + } + else { + positions.emplace_back(); + surjected.emplace_back(surjector.surject(mp_src, paths, get<0>(positions.front()), + get<2>(positions.front()), get<1>(positions.front()), + subpath_global, spliced)); } + + // write to output + mp_alignment_emitter.emit_singles(src.name(), move(surjected), &positions); + + if (watchdog) { + watchdog->check_out(thread_num); + } + clear_crash_context(); + } catch (const std::exception& ex) { + report_exception(ex); } - }; - - function lambda = [&](Alignment& src) { - auto& thread_buffer = buffer[omp_get_thread_num()]; - thread_buffer.push_back(surject_alignment(src)); - handle_buffer(thread_buffer); - }; - - - // now apply the alignment processor to the stream - get_input_file(file_name, [&](istream& in) { - stream::for_each_parallel(in, lambda); }); - buffer_limit = 0; - for (auto& buf : buffer) { - handle_buffer(buf); - } - } - - - if (hdr != nullptr) { - bam_hdr_destroy(hdr); - } - assert(out != nullptr); - sam_close(out); - omp_destroy_lock(&output_lock); - } + }); + } else { + cerr << "[vg surject] Unimplemented input format " << input_format << endl; + exit(1); } + cout.flush(); - for (Surjector* surjector : surjectors) { - delete surjector; - } - return 0; } diff --git a/src/subcommand/test_main.cpp b/src/subcommand/test_main.cpp index 3be32231a12..7949f21bffa 100644 --- a/src/subcommand/test_main.cpp +++ b/src/subcommand/test_main.cpp @@ -12,7 +12,8 @@ #include "subcommand.hpp" -#include "../unittest/driver.hpp" +#define CATCH_CONFIG_RUNNER +#include "unittest/catch.hpp" using namespace std; using namespace vg; @@ -21,9 +22,51 @@ using namespace vg::subcommand; // No help_test is necessary because the unit testing library takes care of // complaining about missing options. +/** + * Take the original argc and argv from a `vg unittest` command-line call and + * run the unit tests. We keep this in its own CPP/HPP to keep our unit test + * library from being a dependency of main.o and other real application code. + * + * Passes the args along to the unit test system. + * + * Returns exit code 0 on success, other codes on failure. + */ +static int run_unit_tests(int argc, char** argv) { + // argc and argv are going to have command and subcommand, which I don't + // think argv speaks. 
+ + assert(argc >= 2); + + // writing to session.configData() or session.Config() here + // overrides command line args + // only do this if you know you need to + + // We're going to trick it by making a fake program name with a space in it + auto new_program_name = string(argv[0]) + " " + string(argv[1]); + + // Delete an argument + int fixed_argc = argc - 1; + // Advance the pointer to the next char* + char** fixed_argv = argv + 1; + fixed_argv[0] = &new_program_name[0]; + + // Make a Catch session + Catch::Session session; + + int return_code = session.applyCommandLine(fixed_argc, fixed_argv); + if(return_code != 0) { + // Complain the user didn't specify good arguments + return return_code; + } + + // Actually run the tests + return session.run(); + +} + int main_test(int argc, char** argv){ // Forward arguments along to the main unit test driver - return vg::unittest::run_unit_tests(argc, argv); + return run_unit_tests(argc, argv); } // Register subcommand diff --git a/src/subcommand/trace_main.cpp b/src/subcommand/trace_main.cpp index 68bade93c64..c95d2897747 100644 --- a/src/subcommand/trace_main.cpp +++ b/src/subcommand/trace_main.cpp @@ -6,21 +6,22 @@ #include "subcommand.hpp" #include "../vg.hpp" +#include #include "../haplotype_extracter.hpp" +#include "../algorithms/find_gbwt.hpp" +#include using namespace vg; using namespace std; using namespace vg::subcommand; -using thread_t = vector; - void help_trace(char** argv) { cerr << "usage: " << argv[0] << " trace [options]" << endl << "Trace and extract haplotypes from an index" << endl << endl << "options:" << endl - << " -x, --index FILE use this xg index" << endl - << " -G, --gbwt-name FILE use this GBWT haplotype index instead of the xg's embedded gPBWT" << endl + << " -x, --index FILE use this xg index or graph" << endl + << " -G, --gbwt-name FILE use this GBWT haplotype index instead of any in the graph" << endl << " -n, --start-node INT start at this node" << endl //TODO: implement backwards iteration over graph // << " -b, --backwards iterate backwards over graph" << endl @@ -117,31 +118,24 @@ int main_trace(int argc, char** argv) { cerr << "error:[vg trace] start node must be specified with -n" << endl; return 1; } - xg::XG xindex; - ifstream in(xg_name.c_str()); - xindex.load(in); + unique_ptr path_handle_graph = vg::io::VPKG::load_one(xg_name); + bdsg::PathPositionOverlayHelper overlay_helper; + PathPositionHandleGraph* xindex = overlay_helper.apply(path_handle_graph.get()); // Now load the haplotype data - unique_ptr gbwt_index; - if (!gbwt_name.empty()) { - // We are tracing haplotypes, and we want to use the GBWT instead of the old gPBWT. - gbwt_index = unique_ptr(new gbwt::GBWT()); - - // Open up the index - ifstream in(gbwt_name.c_str()); - if (!in) { - cerr << "error:[vg trace] unable to load gbwt index file" << endl; - return 1; - } + unique_ptr gbwt_index_holder; + const gbwt::GBWT* gbwt_index = vg::algorithms::find_gbwt(path_handle_graph.get(), gbwt_index_holder, gbwt_name); - // And load it - gbwt_index->load(in); + if (gbwt_index == nullptr) { + // Complain if we couldn't. 
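The GBWT lookup used by `vg trace` above replaces explicit file loading with `vg::algorithms::find_gbwt`, which either finds a GBWT associated with the graph or loads one from the named file into a caller-provided holder. A sketch with the elided template argument restored (ownership note is an assumption based on how the holder is used here):

```cpp
// Sketch of the GBWT lookup pattern above. The returned raw pointer refers
// either to an index attached to the graph or to the one loaded into the
// holder, so the holder must stay in scope while the pointer is in use.
std::unique_ptr<gbwt::GBWT> gbwt_index_holder;
const gbwt::GBWT* gbwt_index =
    vg::algorithms::find_gbwt(path_handle_graph.get(), gbwt_index_holder, gbwt_name);
```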
+ cerr << "error:[vg trace] unable to find gbwt index in graph or separate file" << endl; + exit(1); } - + // trace out our graph and paths from the start node Graph trace_graph; map haplotype_frequences; - trace_haplotypes_and_paths(xindex, gbwt_index.get(), start_node, extend_distance, + trace_haplotypes_and_paths(*xindex, *gbwt_index, start_node, extend_distance, trace_graph, haplotype_frequences); // dump our graph to stdout diff --git a/src/subcommand/translate_main.cpp b/src/subcommand/translate_main.cpp index 900fa85006a..9fd8b9d4515 100644 --- a/src/subcommand/translate_main.cpp +++ b/src/subcommand/translate_main.cpp @@ -14,7 +14,7 @@ #include "../vg.hpp" #include "../translator.hpp" -#include "../stream.hpp" +#include using namespace std; using namespace vg; @@ -130,40 +130,40 @@ int main_translate(int argc, char** argv) { vector buffer; function lambda = [&](Path& path) { buffer.push_back(translator->translate(path)); - stream::write_buffered(cout, buffer, 100); + vg::io::write_buffered(cout, buffer, 100); }; ifstream path_in(path_file); - stream::for_each(path_in, lambda); - stream::write_buffered(cout, buffer, 0); + vg::io::for_each(path_in, lambda); + vg::io::write_buffered(cout, buffer, 0); } else if (!aln_file.empty()) { vector buffer; function lambda = [&](Alignment& aln) { buffer.push_back(translator->translate(aln)); - stream::write_buffered(cout, buffer, 100); + vg::io::write_buffered(cout, buffer, 100); }; ifstream aln_in(aln_file); - stream::for_each(aln_in, lambda); - stream::write_buffered(cout, buffer, 0); + vg::io::for_each(aln_in, lambda); + vg::io::write_buffered(cout, buffer, 0); } else if (!loci_file.empty()) { vector buffer; function lambda = [&](Locus& locus) { buffer.push_back(translator->translate(locus)); - stream::write_buffered(cout, buffer, 100); + vg::io::write_buffered(cout, buffer, 100); }; ifstream loci_in(loci_file); - stream::for_each(loci_in, lambda); - stream::write_buffered(cout, buffer, 0); + vg::io::for_each(loci_in, lambda); + vg::io::write_buffered(cout, buffer, 0); } if (!overlay_file.empty()) { vector buffer; function lambda = [&](Translation& trans) { buffer.push_back(translator->overlay(trans)); - stream::write_buffered(cout, buffer, 100); + vg::io::write_buffered(cout, buffer, 100); }; ifstream overlay_in(overlay_file); - stream::for_each(overlay_in, lambda); - stream::write_buffered(cout, buffer, 0); + vg::io::for_each(overlay_in, lambda); + vg::io::write_buffered(cout, buffer, 0); } return 0; @@ -171,5 +171,5 @@ int main_translate(int argc, char** argv) { // Register subcommand -static Subcommand vg_version("translate", "project alignments and paths through a graph translation", main_translate); +static Subcommand vg_version("translate", "project alignments and paths through a graph translation", DEPRECATED, main_translate); diff --git a/src/subcommand/validate_main.cpp b/src/subcommand/validate_main.cpp index c0050bd7aab..12125c6218c 100644 --- a/src/subcommand/validate_main.cpp +++ b/src/subcommand/validate_main.cpp @@ -13,21 +13,22 @@ #include "subcommand.hpp" #include "../vg.hpp" +#include "../alignment.hpp" +#include using namespace std; using namespace vg; using namespace vg::subcommand; void help_validate(char** argv) { - cerr << "usage: " << argv[0] << " validate [options] graph" << endl - << "Validate the graph." 
<< endl - << endl - << "options:" << endl - << " default: check all aspects of the graph, if options are specified do only those" << endl - << " -n, --nodes verify that we have the expected number of nodes" << endl - << " -e, --edges verify that the graph contains all nodes that are referred to by edges" << endl - << " -p, --paths verify that contiguous path segments are connected by edges" << endl - << " -o, --orphans verify that all nodes have edges" << endl; + cerr << "usage: " << argv[0] << " validate [options] [graph]" << endl + << "Validate the graph." << endl + << endl + << "options:" << endl + << " default: check all aspects of the graph, if options are specified do only those" << endl + << " -o, --orphans verify that all nodes have edges" << endl + << " -a, --gam FILE verify that edits in the alignment fit on nodes in the graph" << endl + << " -A, --gam-only do not verify the graph itself, only the alignment" << endl; } int main_validate(int argc, char** argv) { @@ -37,10 +38,9 @@ int main_validate(int argc, char** argv) { return 1; } - bool check_nodes = false; - bool check_edges = false; bool check_orphans = false; - bool check_paths = false; + string gam_path; + bool gam_only = false; int c; optind = 2; // force optind past command positional argument @@ -48,15 +48,14 @@ int main_validate(int argc, char** argv) { static struct option long_options[] = { {"help", no_argument, 0, 'h'}, - {"nodes", no_argument, 0, 'n'}, - {"edges", no_argument, 0, 'e'}, - {"paths", no_argument, 0, 'o'}, - {"orphans", no_argument, 0, 'p'}, + {"orphans", no_argument, 0, 'o'}, + {"gam", required_argument, 0, 'a'}, + {"gam-only", no_argument, 0, 'A'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hneop", + c = getopt_long (argc, argv, "hoa:A", long_options, &option_index); // Detect the end of the options. @@ -65,21 +64,16 @@ int main_validate(int argc, char** argv) { switch (c) { - - case 'n': - check_nodes = true; - break; - - case 'e': - check_edges = true; - break; - case 'o': check_orphans = true; break; - case 'p': - check_paths = true; + case 'a': + gam_path = optarg; + break; + + case 'A': + gam_only = true; break; case 'h': @@ -93,26 +87,107 @@ int main_validate(int argc, char** argv) { } } - VG* graph; - get_input_file(optind, argc, argv, [&](istream& in) { - graph = new VG(in); - }); - - // if we chose a specific subset, do just them - if (check_nodes || check_edges || check_orphans || check_paths) { - if (graph->is_valid(check_nodes, check_edges, check_orphans, check_paths)) { - return 0; - } else { - return 1; + // load the graph + unique_ptr graph; + string graph_filename = get_input_file_name(optind, argc, argv); + graph = vg::io::VPKG::load_one(graph_filename); + + // validate the alignment if given + bool valid_aln = true; + if (!gam_path.empty()) { + get_input_file(gam_path, [&](istream& in) { + vg::io::for_each(in, [&](Alignment& aln) { + AlignmentValidity validity = alignment_is_valid(aln, graph.get()); + if (!validity) { + // Complain about this alignment + cerr << "Invalid Alignment:\n" << pb2json(aln) << "\n" << validity.message; + if (validity.problem == AlignmentValidity::NODE_TOO_SHORT) { + // If a node is too short, report the whole mapping again. 
+ cerr << ":\n" << pb2json(aln.path().mapping(validity.bad_mapping_index)); + } + cerr << endl; + valid_aln = false; + } + }); + }); + } + + // VG's a little less structured, so try its own logic + bool valid_graph = true; + + if (!gam_only) { + VG* vg_graph = dynamic_cast(graph.get()); + if (vg_graph != nullptr) { + if (!vg_graph->is_valid(true, true, check_orphans, true)) { + valid_graph = false; + } + } + } + + if (!gam_only && valid_graph) { + // I don't think this is possible with any libbdsg implementations, but check edges just in case + graph->for_each_edge([&](const edge_t& edge) { + if (!graph->has_node(graph->get_id(edge.first))) { + cerr << "graph invalid: source node missing for edge " + << graph->get_id(edge.first) << ":" << graph->get_is_reverse(edge.first) << " -> " + << graph->get_id(edge.second) << ":" << graph->get_is_reverse(edge.second) << endl; + valid_graph = false; + } + if (!graph->has_node(graph->get_id(edge.second))) { + cerr << "graph invalid: sink node missing for edge " + << graph->get_id(edge.first) << ":" << graph->get_is_reverse(edge.first) << " -> " + << graph->get_id(edge.second) << ":" << graph->get_is_reverse(edge.second) << endl; + valid_graph = false; + } + }); + + graph->for_each_path_handle([&](path_handle_t path_handle) { + size_t i = 0; + handle_t prev; + graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { + handle_t handle = graph->get_handle_of_step(step_handle); + if (i > 0) { + if (!graph->has_edge(prev, handle)) { + cerr << "graph invalid: missing edge between " << (i-1) << "th step (" + << graph->get_id(prev) << ":" << graph->get_is_reverse(prev) << ") and " + << i << "th step (" << graph->get_id(handle) << ":" << graph->get_is_reverse(handle) + << ") of path " << graph->get_path_name(path_handle) << endl; + valid_graph = false; + } + if (!graph->has_edge(graph->flip(handle), graph->flip(prev))) { + cerr << "graph invalid: missing edge between " << (i) << "th step (" + << graph->get_id(handle) << ":" << !graph->get_is_reverse(handle) << ") and " + << (i-1) << "th step (" << graph->get_id(prev) << ":" << graph->get_is_reverse(prev) + << ") of path " << graph->get_path_name(path_handle) << endl; + valid_graph = false; + } + } + ++i; + prev = handle; + }); + }); + + if (check_orphans) { + graph->for_each_handle([&](handle_t handle) { + if (graph->get_degree(handle, true) + graph->get_degree(handle, false) == 0) { + cerr << "graph invalid: orphan node found: " << graph->get_id(handle) << endl; + valid_graph = false; + } + }); + } - // otherwise do everything - } else if (graph->is_valid()) { - return 0; - } else { - return 1; } + + if (!gam_path.empty()) { + cerr << "alignment: " << (valid_aln ? "valid" : "invalid") << endl; + } + if (!gam_only) { + cerr << "graph: " << (valid_graph ? "valid" : "invalid") << endl; + } + + return valid_aln && valid_graph ? 
0 : 1; } // Register subcommand -static Subcommand vg_validate("validate", "validate the semantics of a graph", DEVELOPMENT, main_validate); +static Subcommand vg_validate("validate", "validate the semantics of a graph or gam", DEVELOPMENT, main_validate); diff --git a/src/subcommand/vectorize_main.cpp b/src/subcommand/vectorize_main.cpp index fb0b4254697..1693f5875b2 100644 --- a/src/subcommand/vectorize_main.cpp +++ b/src/subcommand/vectorize_main.cpp @@ -16,7 +16,9 @@ #include "../vg.hpp" #include "../vectorizer.hpp" #include "../mapper.hpp" -#include "../stream.hpp" +#include +#include +#include using namespace std; using namespace vg; @@ -28,7 +30,7 @@ void help_vectorize(char** argv){ << "Vectorize a set of alignments to a variety of vector formats." << endl << endl << "options: " << endl - << " -x --xg FILE An xg index for the graph of interest" << endl + << " -x --xg FILE An xg index or graph of interest" << endl << " -g --gcsa FILE A gcsa2 index to use if generating MEM sketches" << endl << " -l --aln-label LABEL Rename every alignment to LABEL when outputting alignment name." << endl << " -f --format Tab-delimit output so it can be used in R." << endl @@ -179,10 +181,12 @@ int main_vectorize(int argc, char** argv){ } } - xg::XG* xg_index; + PathPositionHandleGraph* xg_index = nullptr; + unique_ptr path_handle_graph; + bdsg::PathPositionOverlayHelper overlay_helper; if (!xg_name.empty()) { - ifstream in(xg_name); - xg_index = new xg::XG(in); + path_handle_graph = vg::io::VPKG::load_one(xg_name); + xg_index = overlay_helper.apply(path_handle_graph.get()); } else{ cerr << "No XG index given. An XG index must be provided." << endl; @@ -191,19 +195,15 @@ int main_vectorize(int argc, char** argv){ // Configure GCSA2 verbosity so it doesn't spit out loads of extra info gcsa::Verbosity::set(gcsa::Verbosity::SILENT); - - // Configure its temp directory to the system temp directory - gcsa::TempFile::setDirectory(temp_file::get_dir()); - gcsa::GCSA gcsa_index; - gcsa::LCPArray lcp_index; + unique_ptr gcsa_index; + unique_ptr lcp_index; if (!gcsa_name.empty()) { - ifstream in_gcsa(gcsa_name.c_str()); - gcsa_index.load(in_gcsa); + gcsa_index = vg::io::VPKG::load_one(gcsa_name); + // default LCP is the gcsa base name +.lcp - string lcp_in = gcsa_name + ".lcp"; - ifstream in_lcp(lcp_in.c_str()); - lcp_index.load(in_lcp); + string lcp_name = gcsa_name + ".lcp"; + lcp_index = vg::io::VPKG::load_one(lcp_name); } Mapper* mapper = nullptr; @@ -212,7 +212,7 @@ int main_vectorize(int argc, char** argv){ cerr << "[vg vectorize] error : an xg index and gcsa index are required when making MEM sketches" << endl; return 1; } else { - mapper = new Mapper(xg_index, &gcsa_index, &lcp_index); + mapper = new Mapper(xg_index, gcsa_index.get(), lcp_index.get()); } if (mem_hit_max) { mapper->hit_max = mem_hit_max; @@ -224,9 +224,9 @@ int main_vectorize(int argc, char** argv){ // write the header if needed if (format) { cout << "aln.name"; - for (size_t i = 1; i <= xg_index->max_node_rank(); ++i) { - cout << "\tnode." << xg_index->rank_to_id(i); - } + xg_index->for_each_handle([&](handle_t handle) { + cout << "\tnode." 
<< xg_index->get_id(handle); + }); cout << endl; } @@ -298,7 +298,7 @@ int main_vectorize(int argc, char** argv){ }; get_input_file(optind, argc, argv, [&](istream& in) { - stream::for_each(in, lambda); + vg::io::for_each(in, lambda); }); string mapping_str = vz.output_wabbit_map(); diff --git a/src/subcommand/view_main.cpp b/src/subcommand/view_main.cpp index 4cdf5a229b9..4e3d87f26db 100644 --- a/src/subcommand/view_main.cpp +++ b/src/subcommand/view_main.cpp @@ -15,13 +15,21 @@ #include "../multipath_alignment.hpp" #include "../vg.hpp" +#include "../snarl_distance_index.hpp" #include "../gfa.hpp" -#include "../json_stream_helper.hpp" +#include "../io/json_stream_helper.hpp" +#include "../handle.hpp" +#include "../algorithms/gfa_to_handle.hpp" + +#include +#include +#include + using namespace std; using namespace vg; using namespace vg::subcommand; - +using namespace vg::io; void help_view(char** argv) { cerr << "usage: " << argv[0] << " view [options] [ | | | [] ]" << endl @@ -29,16 +37,15 @@ void help_view(char** argv) { << " -g, --gfa output GFA format (default)" << endl << " -F, --gfa-in input GFA format, reducing overlaps if they occur" << endl - << " -v, --vg output VG format" << endl - << " -V, --vg-in input VG format (default)" << endl + << " -v, --vg output VG format [DEPRECATED, use vg convert instead]" << endl + << " -V, --vg-in input VG format only" << endl << " -j, --json output JSON format" << endl << " -J, --json-in input JSON format" << endl << " -c, --json-stream streaming conversion of a VG format graph in line delimited JSON format" << endl << " (this cannot be loaded directly via -J)" << endl - << " -G, --gam output GAM format (vg alignment format: Graph " << endl - << " Alignment/Map)" << endl + << " -G, --gam output GAM format (vg alignment format: Graph Alignment/Map)" << endl << " -Z, --translation-in input is a graph translation description" << endl << " -t, --turtle output RDF/turtle format (can not be loaded by VG)" << endl @@ -54,6 +61,7 @@ void help_view(char** argv) { << " -d, --dot output dot format" << endl << " -S, --simple-dot simplify the dot output; remove node labels, simplify alignments" << endl + << " -u, --noseq-dot shows size information instead of sequence in the dot output" << endl << " -e, --ascii-labels use labels for paths or superbubbles with char/colors rather than emoji" << endl << " -Y, --ultra-label label nodes with emoji/colors that correspond to ultrabubbles" << endl << " -m, --skip-missing skip mappings to nodes not in the graph when drawing alignments" << endl @@ -72,14 +80,17 @@ void help_view(char** argv) { << " -X, --fastq-out output fastq (input defaults to GAM)" << endl << " -i, --interleaved fastq is interleaved paired-ended" << endl - << " -L, --pileup ouput VG Pileup format" << endl + << " -L, --pileup output VG Pileup format" << endl << " -l, --pileup-in input VG Pileup format" << endl + << " -B, --distance-in input distance index" << endl << " -R, --snarl-in input VG Snarl format" << endl << " -E, --snarl-traversal-in input VG SnarlTraversal format" << endl << " -K, --multipath-in input VG MultipathAlignment format (GAMP)" << endl << " -k, --multipath output VG MultipathAlignment format (GAMP)" << endl << " -D, --expect-duplicates don't warn if encountering the same node or edge multiple times" << endl + << " -x, --extract-tag TAG extract and concatenate messages with the given tag" << endl + << " --verbose explain the file being read with --extract-tag" << endl << " --threads N for parallel operations use this many threads 
[1]" << endl; // TODO: Can we regularize the option names for input and output types? @@ -94,19 +105,21 @@ int main_view(int argc, char** argv) { } // Supported conversions: - // TO vg json gfa gam bam fastq dot + // TO hg vg json gfa gam bam fastq dot // FROM - // vg Y Y Y N N N Y - // json Y Y Y N N N Y - // gfa Y Y Y N N N Y - // gam N Y N N N N N - // bam N N N Y N N N - // fastq N N N Y N N N - // dot N N N N N N N + // hg N Y Y Y N N N Y + // vg N Y Y Y N N N Y + // json N Y Y Y N N N Y + // gfa N Y Y Y N N N Y + // gam N N Y N N N N N + // bam N N N N Y N N N + // fastq N N N N Y N N N + // dot N N N N N N N N // // and json-gam -> gam // json-pileup -> pileup + bool which_read_in_pair = true; string output_type; string input_type; string rdf_base_uri; @@ -121,13 +134,18 @@ int main_view(int argc, char** argv) { bool invert_edge_ports_in_dot = false; bool show_mappings_in_dot = false; bool simple_dot = false; + bool noseq_dot = false; int seed_val = time(NULL); bool color_variants = false; bool ultrabubble_labeling = false; bool skip_missing_nodes = false; bool expect_duplicates = false; + string extract_tag; + bool verbose; bool ascii_labels = false; omp_set_num_threads(1); // default to 1 thread + + #define OPT_VERBOSE 1000 int c; optind = 2; // force optind past "view" argument @@ -139,7 +157,7 @@ int main_view(int argc, char** argv) { {"dot", no_argument, 0, 'd'}, {"gfa", no_argument, 0, 'g'}, {"turtle", no_argument, 0, 't'}, - {"rdf-base-uri", no_argument, 0, 'r'}, + {"rdf-base-uri", required_argument, 0, 'r'}, {"gfa-in", no_argument, 0, 'F'}, {"json", no_argument, 0, 'j'}, {"json-in", no_argument, 0, 'J'}, @@ -163,6 +181,7 @@ int main_view(int argc, char** argv) { {"invert-ports", no_argument, 0, 'I'}, {"show-mappings", no_argument, 0, 'M'}, {"simple-dot", no_argument, 0, 'S'}, + {"noseq-dot", no_argument, 0, 'u'}, {"color", no_argument, 0, 'C'}, {"translation-in", no_argument, 0, 'Z'}, {"ultra-label", no_argument, 0, 'Y'}, @@ -170,9 +189,12 @@ int main_view(int argc, char** argv) { {"locus-in", no_argument, 0, 'q'}, {"loci", no_argument, 0, 'Q'}, {"locus-out", no_argument, 0, 'z'}, + {"distance-in", no_argument, 0, 'B'}, {"snarl-in", no_argument, 0, 'R'}, {"snarl-traversal-in", no_argument, 0, 'E'}, {"expect-duplicates", no_argument, 0, 'D'}, + {"extract-tag", required_argument, 0, 'x'}, + {"verbose", no_argument, 0, OPT_VERBOSE}, {"multipath", no_argument, 0, 'k'}, {"multipath-in", no_argument, 0, 'K'}, {"ascii-labels", no_argument, 0, 'e'}, @@ -181,7 +203,7 @@ int main_view(int argc, char** argv) { }; int option_index = 0; - c = getopt_long (argc, argv, "dgFjJhvVpaGbifA:s:wnlLIMcTtr:SCZYmqQ:zXREDkKe7:", + c = getopt_long (argc, argv, "dgFjJhvVpaGbifA:s:wnlLIMcTtr:SuCZYmqQ:zXBREDx:kKe7:", long_options, &option_index); /* Detect the end of the options. 
*/ @@ -202,6 +224,10 @@ int main_view(int argc, char** argv) { simple_dot = true; break; + case 'u': + noseq_dot = true; + break; + case 'Y': ultrabubble_labeling = true; break; @@ -359,6 +385,14 @@ int main_view(int argc, char** argv) { loci_file = optarg; break; + case 'B': + input_type = "distance"; + if (output_type.empty()) { + // Default to DistanceIndex -> JSON + output_type = "json"; + } + break; + case 'R': input_type = "snarls"; if (output_type.empty()) { @@ -386,6 +420,14 @@ int main_view(int argc, char** argv) { case 'D': expect_duplicates = true; break; + + case 'x': + extract_tag = optarg; + break; + + case OPT_VERBOSE: + verbose = true; + break; case '7': omp_set_num_threads(parse(optarg)); @@ -403,9 +445,9 @@ int main_view(int argc, char** argv) { } } - // If the user specified nothing else, we default to VG in and GFA out. + // If the user specified nothing else, we default to handle graph in and GFA out. if (input_type.empty()) { - input_type = "vg"; + input_type = "handlegraph"; } if (output_type.empty()) { output_type = "gfa"; @@ -418,56 +460,115 @@ int main_view(int argc, char** argv) { function lambda = [&alns](Alignment& aln) { alns.push_back(aln); }; ifstream in; in.open(alignments.c_str()); - stream::for_each(in, lambda); + vg::io::for_each(in, lambda); } vector loci; if (!loci_file.empty()) { function lambda = [&loci](Locus& locus) { loci.push_back(locus); }; ifstream in; in.open(loci_file.c_str()); - stream::for_each(in, lambda); + vg::io::for_each(in, lambda); } - - VG* graph = nullptr; + if (optind >= argc) { cerr << "[vg view] error: no filename given" << endl; exit(1); } + if (output_type == "vg") { + cerr << "[vg view] warning: vg-protobuf output (-v / --v) is deprecated. please use vg convert instead." << endl; + } + string file_name = get_input_file_name(optind, argc, argv); + + // Tag extraction has to be handled specially. + // TODO: We can't dump just untagged messages. + if (!extract_tag.empty()) { + get_input_file(file_name, [&](istream& in) { + // Iterate over the input as tagged messages. + vg::io::MessageIterator it(in, verbose); + while(it.has_current()) { + if ((*it).first == extract_tag && (*it).second.get() != nullptr) { + // We match the tag, so dump this message. 
+ if (verbose) { + cerr << "Message of " << (*it).second->size() << " bytes in matches tag to extract" << endl; + } + cout << *((*it).second.get()); + } else { + if (verbose) { + cerr << "Message of " << (*it).second->size() << " bytes does not match tag; skip" << endl; + } + } + ++it; + } + if (verbose) { + cerr << "Iterator no longer has messages" << endl; + } + }); + return 0; + } + + + unique_ptr graph; + if (input_type == "vg") { if (output_type == "stream") { function lambda = [&](Graph& g) { cout << pb2json(g) << endl; }; get_input_file(file_name, [&](istream& in) { - stream::for_each(in, lambda); + vg::io::for_each(in, lambda); }); return 0; } else { get_input_file(file_name, [&](istream& in) { - graph = new VG(in, false, !expect_duplicates); + graph = make_unique(in, false, !expect_duplicates); }); } // VG can convert to any of the graph formats, so keep going + } else if (input_type == "handlegraph") { + if (output_type == "stream") { + cerr << "[vg view] error: Cannot stream a generic HandleGraph to JSON" << endl; + exit(1); + } else { + graph = vg::io::VPKG::load_one(file_name); + } } else if (input_type == "gfa") { - get_input_file(file_name, [&](istream& in) { - graph = new VG; - if (!gfa_to_graph(in, graph)) { - // GFA loading has failed because the file is invalid - exit(1); - } - }); + graph = make_unique(); + + try { + // Use the disk-backed GFA loader that `vg convert` also uses. + vg::algorithms::gfa_to_path_handle_graph(file_name, + dynamic_cast(graph.get()), + nullptr, + 0); // set rgfa path rank to 0 to be consistent with vg convert's default logic + } catch (vg::algorithms::GFAFormatError& e) { + cerr << "error:[vg view] Input GFA is not acceptable." << endl; + cerr << e.what() << endl; + exit(1); + } catch (std::ios_base::failure& e) { + cerr << "error:[vg view] IO error processing input GFA." << endl; + cerr << e.what() << endl; + exit(1); + } + // GFA can convert to any of the graph formats, so keep going } else if(input_type == "json") { assert(input_json); - stream::JSONStreamHelper json_helper(file_name); + vg::io::JSONStreamHelper json_helper(file_name); function get_next_graph = json_helper.get_read_fn(); - graph = new VG(get_next_graph, false, !expect_duplicates); + // TODO: This is less inversion of control and more putting control in the middle. 
+ graph = make_unique([&](const function use_graph) { + Graph g; + while (get_next_graph(g)) { + use_graph(g); + } + }, false, !expect_duplicates); } else if(input_type == "turtle-in") { - graph = new VG; + // TODO: Only vg::VG can load Turtle right now + graph = make_unique(); bool pre_compress=color_variants; if (file_name == "-") { - graph->from_turtle("/dev/stdin", rdf_base_uri); + dynamic_cast(graph.get())->from_turtle("/dev/stdin", rdf_base_uri); } else { - graph->from_turtle(file_name, rdf_base_uri); + dynamic_cast(graph.get())->from_turtle(file_name, rdf_base_uri); } } else if (input_type == "gam") { if (!input_json) { @@ -483,7 +584,7 @@ int main_view(int argc, char** argv) { cout << pb2json(a) << "\n"; }; get_input_file(file_name, [&](istream& in) { - stream::for_each(in, lambda); + vg::io::for_each(in, lambda); }); } else if (output_type == "fastq") { function lambda = [](Alignment& a) { @@ -497,20 +598,23 @@ int main_view(int argc, char** argv) { } }; get_input_file(file_name, [&](istream& in) { - stream::for_each(in, lambda); + vg::io::for_each(in, lambda); }); } else if (output_type == "multipath") { vector buf; function lambda = [&buf](Alignment& aln) { + multipath_alignment_t mp_aln; + to_multipath_alignment(aln, mp_aln); buf.emplace_back(); - to_multipath_alignment(aln, buf.back()); - stream::write_buffered(cout, buf, 1000); + to_proto_multipath_alignment(mp_aln, buf.back()); + transfer_proto_metadata(aln, buf.back()); + vg::io::write_buffered(cout, buf, 1000); }; get_input_file(file_name, [&](istream& in) { - stream::for_each(in, lambda); + vg::io::for_each(in, lambda); }); - stream::write_buffered(cout, buf, 0); + vg::io::write_buffered(cout, buf, 0); } else { // todo @@ -518,7 +622,7 @@ int main_view(int argc, char** argv) { return 1; } } else { - stream::JSONStreamHelper json_helper(file_name); + vg::io::JSONStreamHelper json_helper(file_name); if (output_type == "json" || output_type == "gam") { json_helper.write(cout, output_type == "json"); } @@ -526,11 +630,13 @@ int main_view(int argc, char** argv) { vector buf; Alignment aln; while (json_helper.get_read_fn()(aln)) { + multipath_alignment_t mp_aln; + to_multipath_alignment(aln, mp_aln); buf.emplace_back(); - to_multipath_alignment(aln, buf.back()); - stream::write_buffered(std::cout, buf, 1000); + to_proto_multipath_alignment(mp_aln, buf.back()); + vg::io::write_buffered(std::cout, buf, 1000); } - stream::write_buffered(cout, buf, 0); + vg::io::write_buffered(cout, buf, 0); } else { cerr << "[vg view] error: JSON GAM can only be converted to GAM, GAMP, or JSON" << endl; @@ -541,22 +647,11 @@ int main_view(int argc, char** argv) { return 0; } else if (input_type == "bam") { if (output_type == "gam") { - //function& lambda) { - // todo write buffering procedure in alignment.cpp - vector buf; + vg::io::ProtobufEmitter buf(std::cout); function lambda = [&buf](Alignment& aln) { - buf.push_back(aln); - if (buf.size() > 1000) { - write_alignments(std::cout, buf); - buf.clear(); - } + buf.write(std::move(aln)); }; hts_for_each(file_name, lambda); - write_alignments(std::cout, buf); - buf.clear(); - // Finish the stream with an EOF marker - stream::finish(std::cout); - cout.flush(); return 0; } else if (output_type == "json") { // todo @@ -568,7 +663,7 @@ int main_view(int argc, char** argv) { } } else if (input_type == "multipath") { if (input_json) { - stream::JSONStreamHelper json_helper(file_name); + vg::io::JSONStreamHelper json_helper(file_name); if (output_type == "multipath") { json_helper.write(cout, false); } @@ 
-588,17 +683,38 @@ int main_view(int argc, char** argv) { } else if (output_type == "gam") { vector buf; - MultipathAlignment mp_aln; - while (json_helper.get_read_fn()(mp_aln)) { + MultipathAlignment proto_mp_aln; + while (json_helper.get_read_fn()(proto_mp_aln)) { + multipath_alignment_t mp_aln; + from_proto_multipath_alignment(proto_mp_aln, mp_aln); buf.emplace_back(); optimal_alignment(mp_aln, buf.back()); - stream::write_buffered(std::cout, buf, 1000); + transfer_proto_metadata(proto_mp_aln, buf.back()); + if (!proto_mp_aln.paired_read_name().empty()) { + // alternate using next/prev + if (which_read_in_pair) { + buf.back().mutable_fragment_next()->set_name(proto_mp_aln.paired_read_name()); + } + else { + buf.back().mutable_fragment_prev()->set_name(proto_mp_aln.paired_read_name()); + } + } + which_read_in_pair = !which_read_in_pair; + vg::io::write_buffered(std::cout, buf, 1000); } - stream::write_buffered(cout, buf, 0); + vg::io::write_buffered(cout, buf, 0); } else if (output_type == "json") { json_helper.write(cout, true); } + else if (output_type == "dot") { + MultipathAlignment proto_mp_aln; + while (json_helper.get_read_fn()(proto_mp_aln)) { + multipath_alignment_t mp_aln; + from_proto_multipath_alignment(proto_mp_aln, mp_aln); + view_multipath_alignment_as_dot(std::cout, mp_aln, true); + } + } else { cerr << "[vg view] error: Unrecognized output format for MultipathAlignment (GAMP)" << endl; return 1; @@ -610,12 +726,12 @@ int main_view(int argc, char** argv) { vector buf; function lambda = [&buf](MultipathAlignment& mp_aln) { buf.push_back(mp_aln); - stream::write_buffered(cout, buf, 1000); + vg::io::write_buffered(cout, buf, 1000); }; get_input_file(file_name, [&](istream& in) { - stream::for_each(in, lambda); + vg::io::for_each(in, lambda); }); - stream::write_buffered(std::cout, buf, 0); + vg::io::write_buffered(std::cout, buf, 0); } else if (output_type == "fastq") { function lambda = [](MultipathAlignment& mp_aln) { @@ -629,27 +745,40 @@ int main_view(int argc, char** argv) { } }; get_input_file(file_name, [&](istream& in) { - stream::for_each(in, lambda); + vg::io::for_each(in, lambda); }); } else if (output_type == "gam") { vector buf; - function lambda = [&buf](MultipathAlignment& mp_aln) { + function lambda = [&buf](MultipathAlignment& proto_mp_aln) { + multipath_alignment_t mp_aln; + from_proto_multipath_alignment(proto_mp_aln, mp_aln); buf.emplace_back(); optimal_alignment(mp_aln, buf.back()); - stream::write_buffered(cout, buf, 1000); + transfer_proto_metadata(proto_mp_aln, buf.back()); + vg::io::write_buffered(cout, buf, 1000); }; get_input_file(file_name, [&](istream& in) { - stream::for_each(in, lambda); + vg::io::for_each(in, lambda); }); - stream::write_buffered(std::cout, buf, 0); + vg::io::write_buffered(std::cout, buf, 0); } else if (output_type == "json") { function lambda = [&](MultipathAlignment& mp_aln) { - cout << pb2json(mp_aln) << endl; + cout << pb2json(mp_aln) << '\n'; + }; + get_input_file(file_name, [&](istream& in) { + vg::io::for_each(in, lambda); + }); + } + else if (output_type == "dot") { + function lambda = [&](MultipathAlignment& proto_mp_aln) { + multipath_alignment_t mp_aln; + from_proto_multipath_alignment(proto_mp_aln, mp_aln); + view_multipath_alignment_as_dot(std::cout, mp_aln, true); }; get_input_file(file_name, [&](istream& in) { - stream::for_each(in, lambda); + vg::io::for_each(in, lambda); }); } else { @@ -666,41 +795,25 @@ int main_view(int argc, char** argv) { fastq2 = get_input_file_name(optind, argc, argv); } if 
(output_type == "gam") { - vector buf; + vg::io::ProtobufEmitter buf(std::cout); if (!interleaved_fastq && fastq2.empty()) { function lambda = [&buf](Alignment& aln) { - buf.push_back(aln); - if (buf.size() > 1000) { - write_alignments(std::cout, buf); - buf.clear(); - } + buf.write(std::move(aln)); }; fastq_unpaired_for_each(fastq1, lambda); } else if (interleaved_fastq && fastq2.empty()) { function lambda = [&buf](Alignment& aln1, Alignment& aln2) { - buf.push_back(aln1); - buf.push_back(aln2); - if (buf.size() > 1000) { - write_alignments(std::cout, buf); - buf.clear(); - } + buf.write(std::move(aln1)); + buf.write(std::move(aln2)); }; fastq_paired_interleaved_for_each(fastq1, lambda); } else if (!fastq2.empty()) { function lambda = [&buf](Alignment& aln1, Alignment& aln2) { - buf.push_back(aln1); - buf.push_back(aln2); - if (buf.size() > 1000) { - write_alignments(std::cout, buf); - buf.clear(); - } + buf.write(std::move(aln1)); + buf.write(std::move(aln2)); }; fastq_paired_two_files_for_each(fastq1, fastq2, lambda); } - write_alignments(std::cout, buf); - buf.clear(); - // Finish the stream with an EOF marker - stream::finish(std::cout); } else { // We can't convert fastq to the other graph formats cerr << "[vg view] error: FASTQ can only be converted to GAM" << endl; @@ -716,7 +829,7 @@ int main_view(int argc, char** argv) { cout << pb2json(p) << "\n"; }; get_input_file(file_name, [&](istream& in) { - stream::for_each(in, lambda); + vg::io::for_each(in, lambda); }); } else { // todo @@ -725,7 +838,7 @@ int main_view(int argc, char** argv) { } } else { if (output_type == "json" || output_type == "pileup") { - stream::JSONStreamHelper json_helper(file_name); + vg::io::JSONStreamHelper json_helper(file_name); json_helper.write(cout, output_type == "json"); } else { cerr << "[vg view] error: JSON Pileup can only be converted to Pileup or JSON" << endl; @@ -740,7 +853,7 @@ int main_view(int argc, char** argv) { cout << pb2json(t) << "\n"; }; get_input_file(file_name, [&](istream& in) { - stream::for_each(in, lambda); + vg::io::for_each(in, lambda); }); } else { cerr << "[vg view] error: (binary) Translation can only be converted to JSON" << endl; @@ -755,7 +868,7 @@ int main_view(int argc, char** argv) { cout << pb2json(l) << "\n"; }; get_input_file(file_name, [&](istream& in) { - stream::for_each(in, lambda); + vg::io::for_each(in, lambda); }); } else { // todo @@ -764,7 +877,7 @@ int main_view(int argc, char** argv) { } } else { if (output_type == "json" || output_type == "locus") { - stream::JSONStreamHelper json_helper(file_name); + vg::io::JSONStreamHelper json_helper(file_name); json_helper.write(cout, output_type == "json"); } else { cerr << "[vg view] error: JSON Locus can only be converted to Locus or JSON" << endl; @@ -773,13 +886,24 @@ int main_view(int argc, char** argv) { } cout.flush(); return 0; + } else if (input_type == "distance") { + if (output_type == "json") { + get_input_file(file_name, [&](istream& in) { + auto distance_index = vg::io::VPKG::load_one(in); + distance_index->write_snarls_to_json(); + }); + } else { + cerr << "[vg view] error: (binary) Distance index can only be converted to JSON" << endl; + return 1; + } + return 0; } else if (input_type == "snarls") { if (output_type == "json") { function lambda = [](Snarl& s) { cout << pb2json(s) << "\n"; }; get_input_file(file_name, [&](istream& in) { - stream::for_each(in, lambda); + vg::io::for_each(in, lambda); }); } else { cerr << "[vg view] error: (binary) Snarls can only be converted to JSON" << endl; @@ 
-792,7 +916,7 @@ int main_view(int argc, char** argv) { cout << pb2json(s) << "\n"; }; get_input_file(file_name, [&](istream& in) { - stream::for_each(in, lambda); + vg::io::for_each(in, lambda); }); } else { cerr << "[vg view] error: (binary) SnarlTraversals can only be converted to JSON" << endl; @@ -801,53 +925,87 @@ int main_view(int argc, char** argv) { return 0; } - if(graph == nullptr) { + if(!graph) { // Make sure we didn't forget to implement an input format. cerr << "[vg view] error: cannot load graph in " << input_type << " format" << endl; return 1; } - if(!graph->is_valid()) { - // If we're converting the graph, we might as well make sure it's valid. - // This is especially useful for JSON import. - cerr << "[vg view] warning: graph is invalid!" << endl; - } + if (output_type == "gfa") { + graph_to_gfa(graph.get(), std::cout); + return 0; + } // Now we know graph was filled in from the input format. Spit it out in the // requested output format. - + + // TODO: for now, all our output formats require a copy through vg:VG. + // Look at the graph as a vg if possible + VG* vg_graph = dynamic_cast(graph.get()); + + if (vg_graph == nullptr) { + // Copy instead. Should be fine because we on;y ever want to run this on small graphs anyway. + vg_graph = new vg::VG(); + handlealgs::copy_path_handle_graph(graph.get(), vg_graph); + + // Make sure the new VG has its Proto right + // TODO: if we didn't reach into vg.graph we wouldn't need to do this. + vg_graph->paths.to_graph(vg_graph->graph); + +#ifdef debug + cerr << "Paths before conversion: " << endl; + graph->for_each_path_handle([&](const path_handle_t& p) { + cerr << graph->get_path_name(p) << endl; + }); + + cerr << "Paths after conversion: " << endl; + vg_graph->for_each_path_handle([&](const path_handle_t& p) { + cerr << vg_graph->get_path_name(p) << endl; + }); + + cerr << "VG Protobuf paths:" << endl; + for (auto& p : vg_graph->graph.path()) { + cerr << p.name() << endl; + } +#endif + + // Give the unique_ptr ownership and delete the graph we loaded. + graph.reset(vg_graph); + } + + if(!vg_graph->is_valid()) { + // If we're converting the graph via VG, we might as well make sure it's valid. + // This is especially useful for JSON import. + cerr << "[vg view] warning: graph is invalid!" << endl; + } if (output_type == "dot") { - graph->to_dot(std::cout, - alns, - loci, - show_paths_in_dot, - walk_paths_in_dot, - annotate_paths_in_dot, - show_mappings_in_dot, - simple_dot, - invert_edge_ports_in_dot, - color_variants, - ultrabubble_labeling, - skip_missing_nodes, - ascii_labels, - seed_val); + vg_graph->to_dot(std::cout, + alns, + loci, + show_paths_in_dot, + walk_paths_in_dot, + annotate_paths_in_dot, + show_mappings_in_dot, + simple_dot, + noseq_dot, + invert_edge_ports_in_dot, + color_variants, + ultrabubble_labeling, + skip_missing_nodes, + ascii_labels, + seed_val); } else if (output_type == "json") { - cout << pb2json(graph->graph) << endl; - } else if (output_type == "gfa") { - graph_to_gfa(graph, std::cout); + cout << pb2json(vg_graph->graph) << endl; } else if (output_type == "turtle") { - graph->to_turtle(std::cout, rdf_base_uri, color_variants); + vg_graph->to_turtle(std::cout, rdf_base_uri, color_variants); } else if (output_type == "vg") { - graph->serialize_to_ostream(cout); - stream::finish(cout); - } else if (output_type == "locus") { - - } else { + vg_graph->serialize_to_ostream(cout); + } else if (output_type != "gfa") { // We somehow got here with a bad output format. 
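The hunk above routes every remaining output format through an in-memory `vg::VG`, copying the loaded handle graph when it is not already one. A minimal sketch of that pattern in isolation, assuming the same helpers the hunk itself uses (`vg::io::VPKG::load_one`, `handlealgs::copy_path_handle_graph`); the header paths and the function name are illustrative, not part of the patch:

```cpp
// Sketch only: make sure we have a vg::VG before using VG-only functionality,
// copying the graph if it was loaded as some other handle graph implementation.
// Include paths are approximate; they mirror what view_main.cpp already pulls in.
#include "vg.hpp"
#include "handle.hpp"
#include <vg/io/vpkg.hpp>
#include <memory>
#include <string>

void write_as_protobuf_vg(const std::string& input_file, std::ostream& out) {
    // Load whatever PathHandleGraph implementation the file contains.
    std::unique_ptr<handlegraph::PathHandleGraph> graph =
        vg::io::VPKG::load_one<handlegraph::PathHandleGraph>(input_file);

    vg::VG* vg_graph = dynamic_cast<vg::VG*>(graph.get());
    if (vg_graph == nullptr) {
        // Not already a vg::VG: copy nodes, edges, and paths into one.
        vg_graph = new vg::VG();
        handlealgs::copy_path_handle_graph(graph.get(), vg_graph);
        // Keep the embedded Protobuf path listing in sync, as the hunk above does.
        vg_graph->paths.to_graph(vg_graph->graph);
        // Hand ownership to the unique_ptr so the originally loaded graph is freed.
        graph.reset(vg_graph);
    }

    vg_graph->serialize_to_ostream(out);
}
```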
cerr << "[vg view] error: cannot save a graph in " << output_type << " format" << endl; return 1; } - + // We made it to the end and nothing broke. return 0; } diff --git a/src/subcommand/viz_main.cpp b/src/subcommand/viz_main.cpp index 31fa4944416..0b32007b8df 100644 --- a/src/subcommand/viz_main.cpp +++ b/src/subcommand/viz_main.cpp @@ -1,7 +1,10 @@ #include "subcommand.hpp" #include "../utility.hpp" #include "../viz.hpp" -#include "../stream.hpp" +#include "../xg.hpp" +#include +#include +#include #include #include @@ -109,13 +112,18 @@ int main_viz(int argc, char** argv) { } } - xg::XG xgidx; + PathPositionHandleGraph* xgidx = nullptr; + unique_ptr path_handle_graph; + bdsg::PathPositionVectorizableOverlayHelper overlay_helper; if (xg_name.empty()) { - cerr << "No XG index given. An XG index must be provided." << endl; + cerr << "No input graph given. An input graph (-x) must be provided." << endl; exit(1); } else { - ifstream in(xg_name.c_str()); - xgidx.load(in); + path_handle_graph = vg::io::VPKG::load_one(xg_name); + // We know the PathPositionVectorizableOverlayHelper produces a PathPositionVectorizableOverlay which implements PathPositionHandleGraph. + // TODO: Make the types actually work out here. + xgidx = dynamic_cast(overlay_helper.apply(path_handle_graph.get())); + assert(xgidx != nullptr); } // todo one packer per thread and merge @@ -131,8 +139,10 @@ int main_viz(int argc, char** argv) { pack_names = packs_in; } - Viz viz(&xgidx, &packs, pack_names, image_out, image_width, image_height, show_cnv, show_dna, show_paths); - viz.draw(); + { + Viz viz(xgidx, &packs, pack_names, image_out, image_width, image_height, show_cnv, show_dna, show_paths); + viz.draw(); + } return 0; } diff --git a/src/subcommand/xg_main.cpp b/src/subcommand/xg_main.cpp deleted file mode 100644 index 8fcf7e3c3ba..00000000000 --- a/src/subcommand/xg_main.cpp +++ /dev/null @@ -1,476 +0,0 @@ -/** \file version_main.cpp - * - * Defines the "vg version" subcommand, which evaluates graphs and alignments. 
- */ - - -#include -#include -#include -#include -#include - - -#include "subcommand.hpp" - -#include "sdsl/bit_vectors.hpp" -#include "../version.hpp" -#include "../stream.hpp" -#include "../cpp/vg.pb.h" -#include "../xg.hpp" -#include "../region.hpp" -#include "../handle_to_vg.hpp" - -using namespace std; -using namespace vg; -using namespace vg::subcommand; -using namespace xg; - -void help_xg(char** argv) { - cerr << "usage: " << argv[0] << " xg [options]" << endl - << "Manipluate succinct representations of queryable sequence graphs" << endl - << endl - << "options:" << endl - << " -v, --vg FILE compress graph in vg FILE" << endl - << " -V, --validate validate compression" << endl - << " -o, --out FILE serialize graph to FILE in xg format" << endl - << " -i, --in FILE use index in FILE" << endl - << " -X, --extract-vg FILE serialize graph to FILE in vg format" << endl - << " -n, --node ID graph neighborhood around node with ID" << endl - << " -c, --context N steps of context to extract when building neighborhood" << endl - << " -s, --node-seq ID provide node sequence for ID" << endl - << " -P, --char POS give the character at a given position in the graph" << endl - << " -F, --substr POS:LEN extract the substr of LEN on the node at the position" << endl - << " -f, --edges-from ID list edges from node with ID" << endl - << " -t, --edges-to ID list edges to node with ID" << endl - << " -O, --edges-of ID list all edges related to node with ID" << endl - << " -S, --edges-on-start ID list all edges on start of node with ID" << endl - << " -E, --edges-on-end ID list all edges on start of node with ID" << endl - << " -p, --path TARGET gets the region of the graph @ TARGET (chr:start-end)" << endl - << " -x, --extract-threads extract succinct threads as paths" << endl - << " -r, --store-threads store perfect match paths as succinct threads" << endl - << " -d, --is-sorted-dag graph is a sorted dag; use fast thread insert" << endl - << " -R, --report FILE save an HTML space usage report to FILE when serializing" << endl - << " -D, --debug show debugging output" << endl - << " -T, --text-output write text instead of vg protobuf" << endl - << " -b, --dump-bs FILE dump the gPBWT to the given file" << endl - << " -h, --help this text" << endl; -} - -int main_xg(int argc, char** argv) { - - if (argc == 2) { - help_xg(argv); - return 1; - } - - string vg_in; - string vg_out; - string out_name; - string in_name; - int64_t node_id; - bool edges_from = false; - bool edges_to = false; - bool edges_of = false; - bool edges_on_start = false; - bool edges_on_end = false; - bool node_sequence = false; - string pos_for_char; - string pos_for_substr; - int context_steps = 0; - bool node_context = false; - string target; - bool print_graph = false; - bool text_output = false; - bool validate_graph = false; - bool extract_threads = false; - bool store_threads = false; - bool is_sorted_dag = false; - string report_name; - string b_array_name; - - int c; - optind = 2; // force optind past "xg" positional argument - while (true) { - static struct option long_options[] = - { - {"help", no_argument, 0, 'h'}, - {"vg", required_argument, 0, 'v'}, - {"out", required_argument, 0, 'o'}, - {"in", required_argument, 0, 'i'}, - {"extract-vg", required_argument, 0, 'X'}, - {"node", required_argument, 0, 'n'}, - {"char", required_argument, 0, 'P'}, - {"substr", required_argument, 0, 'F'}, - //{"range", required_argument, 0, 'r'}, - {"context", required_argument, 0, 'c'}, - {"edges-from", required_argument, 0, 'f'}, - 
{"edges-to", required_argument, 0, 't'}, - {"edges-of", required_argument, 0, 'O'}, - {"edges-on-start", required_argument, 0, 'S'}, - {"edges-on-end", required_argument, 0, 'E'}, - {"node-seq", required_argument, 0, 's'}, - {"path", required_argument, 0, 'p'}, - {"extract-threads", no_argument, 0, 'x'}, - {"store-threads", no_argument, 0, 'r'}, - {"is-sorted-dag", no_argument, 0, 'd'}, - {"report", required_argument, 0, 'R'}, - {"debug", no_argument, 0, 'D'}, - {"text-output", no_argument, 0, 'T'}, - {"validate", no_argument, 0, 'V'}, - {"dump-bs", required_argument, 0, 'b'}, - {0, 0, 0, 0} - }; - - int option_index = 0; - c = getopt_long (argc, argv, "hv:o:i:X:f:t:s:c:n:p:DxrdTO:S:E:VR:P:F:b:", - long_options, &option_index); - - // Detect the end of the options. - if (c == -1) - break; - - switch (c) - { - - case 'v': - vg_in = optarg; - break; - - case 'V': - validate_graph = true; - break; - - case 'o': - out_name = optarg; - break; - - case 'D': - print_graph = true; - break; - - case 'T': - text_output = true; - break; - - case 'x': - extract_threads = true; - break; - - case 'r': - store_threads = true; - break; - - case 'd': - is_sorted_dag = true; - break; - - case 'i': - in_name = optarg; - break; - - case 'X': - vg_out = optarg; - break; - - case 'n': - node_id = parse(optarg); - node_context = true; - break; - - case 'c': - context_steps = parse(optarg); - break; - - case 'f': - node_id = parse(optarg); - edges_from = true; - break; - - case 't': - node_id = parse(optarg); - edges_to = true; - break; - - case 'O': - node_id = parse(optarg); - edges_of = true; - break; - - case 'S': - node_id = parse(optarg); - edges_on_start = true; - break; - - case 'E': - node_id = parse(optarg); - edges_on_end = true; - break; - - case 's': - node_id = parse(optarg); - node_sequence = true; - break; - - case 'p': - target = optarg; - break; - - case 'P': - pos_for_char = optarg; - break; - - case 'F': - pos_for_substr = optarg; - break; - - case 'R': - report_name = optarg; - break; - - case 'b': - b_array_name = optarg; - break; - - case 'h': - case '?': - help_xg(argv); - exit(1); - break; - - default: - abort (); - } - } - - XG* graph = nullptr; - //string file_name = argv[optind]; - if (in_name.empty()) assert(!vg_in.empty()); - if (vg_in == "-") { - graph = new XG; - graph->from_stream(std::cin, validate_graph, print_graph, store_threads, is_sorted_dag); - } else if (vg_in.size()) { - ifstream in; - in.open(vg_in.c_str()); - graph = new XG; - graph->from_stream(in, validate_graph, print_graph, store_threads, is_sorted_dag); - } - - if (in_name.size()) { - graph = new XG; - if (in_name == "-") { - graph->load(std::cin); - } else { - ifstream in; - in.open(in_name.c_str()); - graph->load(in); - } - } - - // Prepare structure tree for serialization - unique_ptr structure; - - if (!report_name.empty()) { - // We need to make a report, so we need the structure. Make a real tree - // node. The unique_ptr handles deleting. - structure = unique_ptr(new sdsl::structure_tree_node("name", "type")); - } - - if(!vg_out.empty()) { - if (graph == nullptr) { - cerr << "error [vg xg] no xg graph exists to convert; Try: vg xg -i graph.xg -X graph.vg" << endl; - return 1; - } - - // Convert the xg graph to vg format - VG converted = handle_to_vg(graph); - - // TODO: The converter doesn't copy paths yet. When it does, we can - // remove all this path copying code. - - // Make a raw Proto Graph to hold Path objects - Graph path_graph; - - // Since paths are not copied, copy the paths. 
- for (size_t rank = 1; rank <= graph->max_path_rank(); rank++) { - // Extract each path into the path graph - *path_graph.add_path() = graph->path(graph->path_name(rank)); - } - - // Merge in all the paths - converted.extend(path_graph); - - if (vg_out == "-") { - converted.serialize_to_ostream(std::cout); - } else { - converted.serialize_to_file(vg_out); - } - } - - if (!out_name.empty()) { - if (out_name == "-") { - graph->serialize(std::cout, structure.get(), "xg"); - std::cout.flush(); - } else { - ofstream out; - out.open(out_name.c_str()); - graph->serialize(out, structure.get(), "xg"); - out.flush(); - } - } - - if (!report_name.empty()) { - // Save the report - ofstream out; - out.open(report_name.c_str()); - sdsl::write_structure_tree(structure.get(), out, 0); - } - - // queries - if (node_sequence) { - cout << node_id << ": " << graph->node_sequence(node_id) << endl; - } - if (!pos_for_char.empty()) { - // extract the position from the string - int64_t id; - bool is_rev; - size_t off; - extract_pos(pos_for_char, id, is_rev, off); - // then pick it up from the graph - cout << graph->pos_char(id, is_rev, off) << endl; - } - if (!pos_for_substr.empty()) { - int64_t id; - bool is_rev; - size_t off; - size_t len; - extract_pos_substr(pos_for_substr, id, is_rev, off, len); - cout << graph->pos_substr(id, is_rev, off, len) << endl; - } - - if (edges_from) { - vector edges = graph->edges_from(node_id); - for (auto& edge : edges) { - cout << edge.from() << (edge.from_start()?"-":"+") - << " -> " << edge.to() << (edge.to_end()?"-":"+") << endl; - } - } - if (edges_to) { - vector edges = graph->edges_to(node_id); - for (auto& edge : edges) { - cout << edge.from() << (edge.from_start()?"-":"+") - << " -> " << edge.to() << (edge.to_end()?"-":"+") << endl; - } - } - if (edges_of) { - vector edges = graph->edges_of(node_id); - for (auto& edge : edges) { - cout << edge.from() << (edge.from_start()?"-":"+") - << " -> " << edge.to() << (edge.to_end()?"-":"+") << endl; - } - } - if (edges_on_start) { - vector edges = graph->edges_on_start(node_id); - for (auto& edge : edges) { - cout << edge.from() << (edge.from_start()?"-":"+") - << " -> " << edge.to() << (edge.to_end()?"-":"+") << endl; - } - } - if (edges_on_end) { - vector edges = graph->edges_on_end(node_id); - for (auto& edge : edges) { - cout << edge.from() << (edge.from_start()?"-":"+") - << " -> " << edge.to() << (edge.to_end()?"-":"+") << endl; - } - } - - if (node_context) { - Graph g; - graph->neighborhood(node_id, context_steps, g); - if (text_output) { - to_text(cout, g); - } else { - vector gb = { g }; - stream::write_buffered(cout, gb, 0); - } - } - - if (!target.empty()) { - string name; - int64_t start, end; - Graph g; - parse_region(target, name, start, end); - graph->get_path_range(name, start, end, g); - graph->expand_context(g, context_steps); - if (text_output) { - to_text(cout, g); - } else { - vector gb = { g }; - stream::write_buffered(cout, gb, 0); - } - } - - if (extract_threads) { - list threads; - for (auto& p : graph->extract_threads(false)) { - for (auto& t : p.second) { - threads.push_back(t); - } - } - for (auto& p : graph->extract_threads(true)) { - for (auto& t : p.second) { - threads.push_back(t); - } - } - - size_t thread_number = 0; - for(XG::thread_t& thread : threads) { - // Convert to a Path - Path path; - for(XG::ThreadMapping& m : thread) { - // Convert all the mappings - Mapping mapping; - mapping.mutable_position()->set_node_id(m.node_id); - mapping.mutable_position()->set_is_reverse(m.is_reverse); - - 
*(path.add_mapping()) = mapping; - } - - - // Give each thread a name - path.set_name("_thread_" + to_string(thread_number++)); - - // We need a Graph for serialization purposes. We do one chunk per - // thread in case the threads are long. - Graph g; - - *(g.add_path()) = path; - - // Dump the graph with its mappings. TODO: can we restrict these to - // mappings to nodes we have already pulled out? Or pull out the - // whole compressed graph? - if (text_output) { - to_text(cout, g); - } else { - vector gb = { g }; - stream::write_buffered(cout, gb, 0); - } - - } - } - - if (!b_array_name.empty()) { - // Dump B array - ofstream out; - out.open(b_array_name.c_str()); - graph->bs_dump(out); - } - - // clean up - if (graph) delete graph; - - return 0; -} - -// Register subcommand -static Subcommand vg_xg("xg", "manipulate xg files", main_xg); diff --git a/src/subgraph.cpp b/src/subgraph.cpp new file mode 100644 index 00000000000..7806a8d1932 --- /dev/null +++ b/src/subgraph.cpp @@ -0,0 +1,119 @@ +/** + * \file subgraph.cpp: contains the implementation of SubHandleGraph + */ + + +#include "subgraph.hpp" + +#include + +namespace vg { + +using namespace std; + + SubHandleGraph::SubHandleGraph(const HandleGraph* super) : super(super) { + // nothing to do + } + + void SubHandleGraph::add_handle(const handle_t& handle) { + + id_t node_id = super->get_id(handle); + + min_id = min(node_id, min_id); + max_id = max(node_id, max_id); + + contents.insert(node_id); + } + + bool SubHandleGraph::has_node(id_t node_id) const { + return contents.count(node_id); + } + + handle_t SubHandleGraph::get_handle(const id_t& node_id, bool is_reverse) const { + if (!contents.count(node_id)) { + cerr << "error:[SubHandleGraph] subgraph does not contain node with ID " << node_id << endl; + exit(1); + } + return super->get_handle(node_id, is_reverse); + } + + id_t SubHandleGraph::get_id(const handle_t& handle) const { + return super->get_id(handle); + } + + bool SubHandleGraph::get_is_reverse(const handle_t& handle) const { + return super->get_is_reverse(handle); + } + + handle_t SubHandleGraph::flip(const handle_t& handle) const { + return super->flip(handle); + } + + size_t SubHandleGraph::get_length(const handle_t& handle) const { + return super->get_length(handle); + } + + string SubHandleGraph::get_sequence(const handle_t& handle) const { + return super->get_sequence(handle); + } + + bool SubHandleGraph::follow_edges_impl(const handle_t& handle, bool go_left, const function& iteratee) const { + // only let it travel along edges whose endpoints are in the subgraph + bool keep_going = true; + super->follow_edges(handle, go_left, [&](const handle_t& handle) { + if (contents.count(super->get_id(handle))) { + keep_going = iteratee(handle); + } + return keep_going; + }); + return keep_going; + } + + bool SubHandleGraph::for_each_handle_impl(const function& iteratee, bool parallel) const { + if (parallel) { + atomic keep_going(true); + // do parallelism taskwise inside the iteration +#pragma omp parallel + { +#pragma omp single + { + for(auto iter = contents.begin(); keep_going && iter != contents.end(); iter++) { +#pragma omp task + { + if (!iteratee(super->get_handle(*iter))) { + keep_going = false; + } + } + } + } + } + return keep_going; + } + else { + // non-parallel + for (id_t node_id : contents) { + if (!iteratee(super->get_handle(node_id))) { + return false; + } + } + return true; + } + } + + size_t SubHandleGraph::get_node_count() const { + return contents.size(); + } + + id_t SubHandleGraph::min_node_id() const { 
+ return min_id; + } + + id_t SubHandleGraph::max_node_id() const { + return max_id; + } + + handle_t SubHandleGraph::get_underlying_handle(const handle_t& handle) const { + return handle; + } +} + diff --git a/src/subgraph.hpp b/src/subgraph.hpp new file mode 100644 index 00000000000..1c9e2c50c36 --- /dev/null +++ b/src/subgraph.hpp @@ -0,0 +1,115 @@ +#ifndef VG_SUBGRAPH_HPP_INCLUDED +#define VG_SUBGRAPH_HPP_INCLUDED + +/** \file + * subgraph.hpp: defines a handle graph implementation of a subgraph + */ + +#include +#include "handle.hpp" + +namespace vg { + +using namespace std; + + /** + * A HandleGraph implementation that acts as a subgraph of some other HandleGraph + * using a layer of indirection. Only subsets based on nodes; all edges between + * the nodes in the super graph are considered part of the subgraph. Subgraph + * handles can also be used by the super graph. + */ + class SubHandleGraph : public ExpandingOverlayGraph { + public: + + /// Initialize with a super graph and nodes returned by iterators to handles + /// from the super graph + template + SubHandleGraph(const HandleGraph* super, HandleIter begin, HandleIter end); + + /// Initialize as empty subgraph of a super graph + SubHandleGraph(const HandleGraph* super); + + /// Add a node from the super graph to the subgraph. Must be a handle to the + /// super graph. No effect if the node is already included in the subgraph. + /// Generally invalidates the results of any previous algorithms. + void add_handle(const handle_t& handle); + + ////////////////////////// + /// HandleGraph interface + ////////////////////////// + + // Method to check if a node exists by ID + virtual bool has_node(id_t node_id) const; + + /// Look up the handle for the node with the given ID in the given orientation + virtual handle_t get_handle(const id_t& node_id, bool is_reverse = false) const; + + /// Get the ID from a handle + virtual id_t get_id(const handle_t& handle) const; + + /// Get the orientation of a handle + virtual bool get_is_reverse(const handle_t& handle) const; + + /// Invert the orientation of a handle (potentially without getting its ID) + virtual handle_t flip(const handle_t& handle) const; + + /// Get the length of a node + virtual size_t get_length(const handle_t& handle) const; + + /// Get the sequence of a node, presented in the handle's local forward + /// orientation. + virtual string get_sequence(const handle_t& handle) const; + + /// Loop over all the handles to next/previous (right/left) nodes. Passes + /// them to a callback which returns false to stop iterating and true to + /// continue. Returns true if we finished and false if we stopped early. + virtual bool follow_edges_impl(const handle_t& handle, bool go_left, const function& iteratee) const; + + /// Loop over all the nodes in the graph in their local forward + /// orientations, in their internal stored order. Stop if the iteratee + /// returns false. Can be told to run in parallel, in which case stopping + /// after a false return value is on a best-effort basis and iteration + /// order is not defined. + virtual bool for_each_handle_impl(const function& iteratee, bool parallel = false) const; + + /// Return the number of nodes in the graph + virtual size_t get_node_count() const; + + /// Return the smallest ID in the graph, or some smaller number if the + /// smallest ID is unavailable. Return value is unspecified if the graph is empty. 
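`SubHandleGraph` stores only node IDs and defers every other query to the super graph, so building one over a region of interest is cheap. A usage sketch based on the API declared in this header; the `bdsg::HashGraph` construction is illustrative and not part of the patch:

```cpp
// Sketch: restrict traversal to a hand-picked node set with SubHandleGraph.
#include "subgraph.hpp"
#include <bdsg/hash_graph.hpp>
#include <iostream>

int main() {
    bdsg::HashGraph super;
    handlegraph::handle_t a = super.create_handle("GAT");
    handlegraph::handle_t b = super.create_handle("TACA");
    handlegraph::handle_t c = super.create_handle("CC");
    super.create_edge(a, b);
    super.create_edge(b, c);

    // Keep only a and b; the a->b edge is inherited, b->c is hidden.
    vg::SubHandleGraph sub(&super);
    sub.add_handle(a);
    sub.add_handle(b);

    std::cout << "nodes in subgraph: " << sub.get_node_count() << "\n";
    sub.follow_edges(sub.get_handle(super.get_id(a)), false,
                     [&](const handlegraph::handle_t& next) {
        std::cout << "a -> node " << sub.get_id(next) << "\n";
        return true;
    });
    return 0;
}
```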
+ virtual id_t min_node_id() const; + + /// Return the largest ID in the graph, or some larger number if the + /// largest ID is unavailable. Return value is unspecified if the graph is empty. + virtual id_t max_node_id() const; + + /////////////////////////////////// + /// ExpandingOverlayGraph interface + /////////////////////////////////// + + /** + * Returns the handle in the underlying graph that corresponds to a handle in the + * overlay + */ + virtual handle_t get_underlying_handle(const handle_t& handle) const; + + private: + const HandleGraph* super = nullptr; + unordered_set contents; + // keep track of these separately rather than use an ordered set + id_t min_id = numeric_limits::max(); + id_t max_id = numeric_limits::min(); + + }; + + + // Template constructor + template + SubHandleGraph::SubHandleGraph(const HandleGraph* super, HandleIter begin, HandleIter end) : super(super) { + for (auto iter = begin; iter != end; ++iter) { + add_handle(*iter); + } + } +} + +#endif diff --git a/src/subgraph_overlay.cpp b/src/subgraph_overlay.cpp new file mode 100644 index 00000000000..1de37ce24ca --- /dev/null +++ b/src/subgraph_overlay.cpp @@ -0,0 +1,209 @@ +#include +#include "subgraph_overlay.hpp" + +#include + +namespace vg { + +using namespace std; +using namespace handlegraph; + +SubgraphOverlay::SubgraphOverlay(const HandleGraph* backing, const unordered_set* node_subset) : + backing_graph(backing), + node_subset(node_subset) { + if (!node_subset->empty()) { + auto minmax_nodes = std::minmax_element(node_subset->begin(), node_subset->end()); + min_node = *minmax_nodes.first; + max_node = *minmax_nodes.second; + } +} + +SubgraphOverlay::~SubgraphOverlay() { + +} + +bool SubgraphOverlay::has_node(nid_t node_id) const { + return node_subset->count(node_id); +} + +handle_t SubgraphOverlay::get_handle(const nid_t& node_id, bool is_reverse) const { + if (has_node(node_id)) { + return backing_graph->get_handle(node_id, is_reverse); + } else { + throw runtime_error("Node " + std::to_string(node_id) + " not in subgraph overlay"); + } +} + +nid_t SubgraphOverlay::get_id(const handle_t& handle) const { + return backing_graph->get_id(handle); +} + +bool SubgraphOverlay::get_is_reverse(const handle_t& handle) const { + return backing_graph->get_is_reverse(handle); +} + +handle_t SubgraphOverlay::flip(const handle_t& handle) const { + return backing_graph->flip(handle); +} + +size_t SubgraphOverlay::get_length(const handle_t& handle) const { + return backing_graph->get_length(handle); +} + +std::string SubgraphOverlay::get_sequence(const handle_t& handle) const { + return backing_graph->get_sequence(handle); +} + +size_t SubgraphOverlay::get_node_count() const { + return node_subset->size(); +} + +nid_t SubgraphOverlay::min_node_id() const { + return min_node; +} + +nid_t SubgraphOverlay::max_node_id() const { + return max_node; +} + +bool SubgraphOverlay::follow_edges_impl(const handle_t& handle, bool go_left, const std::function& iteratee) const { + std::function subgraph_iteratee = [&](const handle_t& handle) { + if (has_node(backing_graph->get_id(handle))) { + if (iteratee(handle) == false) { + return false; + } + } + return true; + }; + if (has_node(backing_graph->get_id(handle))) { + return backing_graph->follow_edges(handle, go_left, subgraph_iteratee); + } + return true; +} + +bool SubgraphOverlay::for_each_handle_impl(const std::function& iteratee, bool parallel) const { + + if (!parallel) { + bool keep_going = true; + for (auto node_it = node_subset->begin(); keep_going && node_it != 
node_subset->end(); ++node_it) { + keep_going = iteratee(get_handle(*node_it, false)); + } + return keep_going; + } else { + // copy them into something easy to iterate with omp + vector node_vec(node_subset->begin(), node_subset->end()); + std::atomic keep_going(true); +#pragma omp parallel for + for (size_t i = 0; i < node_vec.size(); ++i) { + keep_going = keep_going && iteratee(backing_graph->get_handle(node_vec[i])); + } + return keep_going; + } +} + +PathSubgraphOverlay::PathSubgraphOverlay(const PathHandleGraph* backing, const unordered_set* node_subset) : + SubgraphOverlay(backing, node_subset), + backing_path_graph(backing) { + + backing->for_each_path_handle([&](const path_handle_t& path_handle) { + bool fully_contained = true; + backing->for_each_step_in_path(path_handle, [&](const step_handle_t& step_handle) -> bool { + if (!has_node(backing->get_id(backing->get_handle_of_step(step_handle)))) { + fully_contained = false; + return false; + } + return true; + }); + if (fully_contained) { + path_subset.insert(path_handle); + } + }); +} + +PathSubgraphOverlay::~PathSubgraphOverlay() { +} + +size_t PathSubgraphOverlay::get_path_count() const { + return path_subset.size(); +} + +bool PathSubgraphOverlay::has_path(const std::string& path_name) const { + return backing_path_graph->has_path(path_name) && + path_subset.count(backing_path_graph->get_path_handle(path_name)); +} + +path_handle_t PathSubgraphOverlay::get_path_handle(const std::string& path_name) const { + if (!has_path(path_name)) { + throw runtime_error("Path " + path_name + " not in subgraph overlay"); + } else { + return backing_path_graph->get_path_handle(path_name); + } +} + +std::string PathSubgraphOverlay::get_path_name(const path_handle_t& path_handle) const { + return backing_path_graph->get_path_name(path_handle); +} + +bool PathSubgraphOverlay::get_is_circular(const path_handle_t& path_handle) const { + return backing_path_graph->get_is_circular(path_handle); +} + +size_t PathSubgraphOverlay::get_step_count(const path_handle_t& path_handle) const { + return backing_path_graph->get_step_count(path_handle); +} + +handle_t PathSubgraphOverlay::get_handle_of_step(const step_handle_t& step_handle) const { + return backing_path_graph->get_handle_of_step(step_handle); +} + +path_handle_t PathSubgraphOverlay::get_path_handle_of_step(const step_handle_t& step_handle) const { + return backing_path_graph->get_path_handle_of_step(step_handle); +} + +step_handle_t PathSubgraphOverlay::path_begin(const path_handle_t& path_handle) const { + return backing_path_graph->path_begin(path_handle); +} + +step_handle_t PathSubgraphOverlay::path_end(const path_handle_t& path_handle) const { + return backing_path_graph->path_end(path_handle); +} + +step_handle_t PathSubgraphOverlay::path_back(const path_handle_t& path_handle) const { + return backing_path_graph->path_back(path_handle); +} + +step_handle_t PathSubgraphOverlay::path_front_end(const path_handle_t& path_handle) const { + return backing_path_graph->path_front_end(path_handle); +} + +bool PathSubgraphOverlay::has_next_step(const step_handle_t& step_handle) const { + return backing_path_graph->has_next_step(step_handle); +} + +bool PathSubgraphOverlay::has_previous_step(const step_handle_t& step_handle) const { + return backing_path_graph->has_previous_step(step_handle); +} + +step_handle_t PathSubgraphOverlay::get_next_step(const step_handle_t& step_handle) const { + return backing_path_graph->get_next_step(step_handle); +} + +step_handle_t 
PathSubgraphOverlay::get_previous_step(const step_handle_t& step_handle) const { + return backing_path_graph->get_previous_step(step_handle); +} + +bool PathSubgraphOverlay::for_each_path_handle_impl(const std::function& iteratee) const { + bool keep_going = true; + for (auto path_it = path_subset.begin(); keep_going && path_it != path_subset.end(); ++path_it) { + keep_going = iteratee(*path_it); + } + + return keep_going; +} + +bool PathSubgraphOverlay::for_each_step_on_handle_impl(const handle_t& handle, + const std::function& iteratee) const { + return backing_path_graph->for_each_step_on_handle(handle, iteratee); +} + +} diff --git a/src/subgraph_overlay.hpp b/src/subgraph_overlay.hpp new file mode 100644 index 00000000000..ef6f5bd40a8 --- /dev/null +++ b/src/subgraph_overlay.hpp @@ -0,0 +1,215 @@ +#ifndef VG_SUBGRAPH_OVERLAY_HPP_INCLUDED +#define VG_SUBGRAPH_OVERLAY_HPP_INCLUDED + +/** + * \file subgraph_overlay.hpp + * + * Provides SourceSinkOverlay, a HandleGraph implementation that joins all the + * heads and tails of a backing graph to single source and sink nodes. + * + */ + + +#include "handle.hpp" + +#include + + +namespace vg { + +using namespace handlegraph; + +/** + * Present a HandleGraph that is a backing HandleGraph but restricted + * to a subset of nodes. It won't give handles to nodes not in the + * subset, but it's not bulletproof: handles from outside the subset + * won't undergo any special checks. + */ +class SubgraphOverlay : virtual public HandleGraph { + +public: + /** + * Make a new PathSubgraphOverlay. The backing graph must not be modified + * while the overlay exists. + * + */ + SubgraphOverlay(const HandleGraph* backing, const unordered_set* node_subset); + + virtual ~SubgraphOverlay(); + + //////////////////////////////////////////////////////////////////////////// + // Handle-based interface + //////////////////////////////////////////////////////////////////////////// + + /// Method to check if a node exists by ID + virtual bool has_node(nid_t node_id) const; + + /// Look up the handle for the node with the given ID in the given orientation + virtual handle_t get_handle(const nid_t& node_id, bool is_reverse = false) const; + + /// Get the ID from a handle + virtual nid_t get_id(const handle_t& handle) const; + + /// Get the orientation of a handle + virtual bool get_is_reverse(const handle_t& handle) const; + + /// Invert the orientation of a handle (potentially without getting its ID) + virtual handle_t flip(const handle_t& handle) const; + + /// Get the length of a node + virtual size_t get_length(const handle_t& handle) const; + + /// Get the sequence of a node, presented in the handle's local forward + /// orientation. + virtual std::string get_sequence(const handle_t& handle) const; + + /// Return the number of nodes in the graph + virtual size_t get_node_count() const; + + /// Return the smallest ID in the graph, or some smaller number if the + /// smallest ID is unavailable. Return value is unspecified if the graph is empty. + virtual nid_t min_node_id() const; + + /// Return the largest ID in the graph, or some larger number if the + /// largest ID is unavailable. Return value is unspecified if the graph is empty. + virtual nid_t max_node_id() const; + +protected: + + /// Loop over all the handles to next/previous (right/left) nodes. Passes + /// them to a callback which returns false to stop iterating and true to + /// continue. Returns true if we finished and false if we stopped early. 
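Unlike `SubHandleGraph`, `SubgraphOverlay` does not copy the node set: it keeps a borrowed pointer, so both the backing graph and the set must outlive the overlay, and handles from outside the subset get no special checks. A small sketch under those assumptions; the backing-graph setup is illustrative:

```cpp
// Sketch: view an existing graph through a borrowed node subset.
#include "subgraph_overlay.hpp"
#include <bdsg/hash_graph.hpp>
#include <unordered_set>
#include <iostream>

int main() {
    bdsg::HashGraph backing;
    auto a = backing.create_handle("A");
    auto b = backing.create_handle("C");
    auto c = backing.create_handle("G");
    backing.create_edge(a, b);
    backing.create_edge(b, c);

    // The subset is not owned by the overlay, so it must stay in scope.
    std::unordered_set<handlegraph::nid_t> keep = {backing.get_id(a), backing.get_id(b)};
    vg::SubgraphOverlay overlay(&backing, &keep);

    overlay.for_each_handle([&](const handlegraph::handle_t& h) {
        std::cout << "node " << overlay.get_id(h)
                  << " sequence " << overlay.get_sequence(h) << "\n";
        return true;
    });

    // follow_edges only reports neighbours inside the subset, so b -> c stays hidden.
    overlay.follow_edges(overlay.get_handle(backing.get_id(b)), false,
                         [&](const handlegraph::handle_t& next) {
        std::cout << "unexpected neighbour " << overlay.get_id(next) << "\n";
        return true;
    });
    return 0;
}
```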
+ virtual bool follow_edges_impl(const handle_t& handle, bool go_left, const std::function& iteratee) const; + + /// Loop over all the nodes in the graph in their local forward + /// orientations, in their internal stored order. Stop if the iteratee + /// returns false. Can be told to run in parallel, in which case stopping + /// after a false return value is on a best-effort basis and iteration + /// order is not defined. Returns true if we finished and false if we + /// stopped early. + virtual bool for_each_handle_impl(const std::function& iteratee, bool parallel = false) const; + +protected: + + /// the backing graph + const HandleGraph* backing_graph; + + /// the node subset. note, we don't own this so its up to client to keep in scope, + /// just like the backing graph + const unordered_set* node_subset; + + /// keep min_node_id() and max_node_id() constant + nid_t min_node = 0; + nid_t max_node = 0; +}; + +/** + * Present a PathHandleGraph that is a backing HandleGraph but restricted + * to a subset of nodes. + * + * Warning: we don't yet have a subgraph interface. So we only consider paths + * from the backing graph that are fully contained in the subgraph. + */ +class PathSubgraphOverlay : virtual public SubgraphOverlay, virtual public PathHandleGraph { + +public: + /** + * Make a new PathSubgraphOverlay. The backing graph must not be modified + * while the overlay exists. + * + */ + PathSubgraphOverlay(const PathHandleGraph* backing, const unordered_set* node_subset); + + virtual ~PathSubgraphOverlay(); + + //////////////////////////////////////////////////////////////////////////// + // Path handle interface + //////////////////////////////////////////////////////////////////////////// + + /// Returns the number of paths stored in the graph + virtual size_t get_path_count() const; + + /// Determine if a path name exists and is legal to get a path handle for. + virtual bool has_path(const std::string& path_name) const; + + /// Look up the path handle for the given path name. + /// The path with that name must exist. + virtual path_handle_t get_path_handle(const std::string& path_name) const; + + /// Look up the name of a path from a handle to it + virtual std::string get_path_name(const path_handle_t& path_handle) const; + + /// Look up whether a path is circular + virtual bool get_is_circular(const path_handle_t& path_handle) const; + + /// Returns the number of node steps in the path + virtual size_t get_step_count(const path_handle_t& path_handle) const; + + /// Get a node handle (node ID and orientation) from a handle to an step on a path + virtual handle_t get_handle_of_step(const step_handle_t& step_handle) const; + + /// Returns a handle to the path that an step is on + virtual path_handle_t get_path_handle_of_step(const step_handle_t& step_handle) const; + + /// Get a handle to the first step, which will be an arbitrary step in a circular path + /// that we consider "first" based on our construction of the path. If the path is empty, + /// then the implementation must return the same value as path_end(). + virtual step_handle_t path_begin(const path_handle_t& path_handle) const; + + /// Get a handle to a fictitious position past the end of a path. This position is + /// returned by get_next_step for the final step in a path in a non-circular path. + /// Note: get_next_step will *NEVER* return this value for a circular path. 
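`PathSubgraphOverlay` adds path support on top of the node restriction: as its constructor earlier in this diff shows, a backing path is exposed only when every one of its steps lands on a node in the subset. A hedged sketch; the VPKG loading, include path, and file name are illustrative:

```cpp
// Sketch: list only the backing paths that fall entirely inside the node subset.
#include "subgraph_overlay.hpp"
#include <vg/io/vpkg.hpp>
#include <unordered_set>
#include <memory>
#include <iostream>

void report_contained_paths(const std::string& graph_file,
                            const std::unordered_set<handlegraph::nid_t>& keep) {
    std::unique_ptr<handlegraph::PathHandleGraph> backing =
        vg::io::VPKG::load_one<handlegraph::PathHandleGraph>(graph_file);

    // Paths with any step outside `keep` are dropped by the constructor.
    vg::PathSubgraphOverlay overlay(backing.get(), &keep);

    std::cout << overlay.get_path_count() << " path(s) fully contained in the subset\n";
    overlay.for_each_path_handle([&](const handlegraph::path_handle_t& p) {
        std::cout << overlay.get_path_name(p)
                  << " with " << overlay.get_step_count(p) << " steps\n";
        return true;
    });
}
```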
diff --git a/src/support_caller.cpp b/src/support_caller.cpp deleted file mode 100644 index 55b359f266c..00000000000 --- a/src/support_caller.cpp +++ /dev/null @@ -1,2009 +0,0 @@
-// Call variants using an augmented graphs with annotated supports
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include "vg.hpp"
-#include "index.hpp"
-#include "Variant.h"
-#include "genotypekit.hpp"
-#include "snarls.hpp"
-#include "path.hpp"
-#include "path_index.hpp"
-#include "support_caller.hpp"
-#include "stream.hpp"
-#include "nested_traversal_finder.hpp"
-
-//#define debug
-
-namespace vg {
-
-// How many bases may we put in an allele in VCF if we expect GATK to be able to
-// parse it?
-// 0 means no maximum is enforced.
-const static int MAX_ALLELE_LENGTH = 0;
-
-// Minimum log likelihood
-const static double LOG_ZERO = (double)-1e100;
-
-// convert to string using stringstream (to replace to_string when we want sci.
notation) -template -string to_string_ss(T val) { - stringstream ss; - ss << val; - return ss.str(); -} - -/** - * We need to suppress overlapping variants, but interval trees are hard to - * write. This accomplishes the collision check with a massive bit vector. - */ -struct IntervalBitfield { - // Mark every position that's used in a variant - vector used; - - /** - * Make a new IntervalBitfield covering a region of the specified length. - */ - inline IntervalBitfield(size_t length) : used(length) { - // Nothing to do - } - - /** - * Scan for a collision (O(n) in interval length) - */ - inline bool collides(size_t start, size_t pastEnd) { - for(size_t i = start; i < pastEnd; i++) { - if(used[i]) { - return true; - } - } - return(false); - } - - /** - * Take up an interval. - */ - inline void add(size_t start, size_t pastEnd) { - for(size_t i = start; i < pastEnd; i++) { - used[i] = true; - } - } -}; - -/** - * Get the strand bias of a Support. - */ -double strand_bias(const Support& support) { - return max(support.forward(), support.reverse()) / (support.forward() + support.reverse()); -} - -/** - * Make a letter into a full string because apparently that's too fancy for the - * standard library. - */ -string char_to_string(const char& letter) { - string toReturn; - toReturn.push_back(letter); - return toReturn; -} - -/** - * Write a minimal VCF header for a file with the given samples, and the given - * contigs with the given lengths. - */ -void write_vcf_header(ostream& stream, const vector& sample_names, - const vector& contig_names, const vector& contig_sizes, - int min_mad_for_filter, int max_dp_for_filter, double max_dp_multiple_for_filter, - double max_local_dp_multiple_for_filter, double min_ad_log_likelihood_for_filter) { - - stream << "##fileformat=VCFv4.2" << endl; - stream << "##ALT=" << endl; - stream << "##INFO=" << endl; - stream << "##INFO=" << endl; - stream << "##INFO=" << endl; - stream << "##INFO=" << endl; - stream << "##FILTER=" <" <" <" <" <" << endl; - stream << "##FORMAT=" << endl; - stream << "##FORMAT=" << endl; - stream << "##FORMAT=" << endl; - stream << "##FORMAT=" << endl; - stream << "##FORMAT=" << endl; - // We need this field to stratify on for VCF comparison. The info is in SB but vcfeval can't pull it out - stream << "##FORMAT=" << endl; - stream << "##FORMAT=" << endl; - - for(size_t i = 0; i < contig_names.size(); i++) { - // Announce the contigs as well. - stream << "##contig=" << endl; - } - - // Now the column header line - stream << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"; - for (auto& sample_name : sample_names) { - // Append columns for all the samples - stream << "\t" << sample_name; - } - // End the column header line - stream << endl; -} - -/** - * Return true if a variant may be output, or false if this variant is valid but - * the GATK might choke on it. - * - * Mostly used to throw out variants with very long alleles, because GATK has an - * allele length limit. How alleles that really *are* 1 megabase deletions are - * to be specified to GATK is left as an exercise to the reader. 
- */ -bool can_write_alleles(vcflib::Variant& variant) { - - for(auto& allele : variant.alleles) { - if(MAX_ALLELE_LENGTH > 0 && allele.size() > MAX_ALLELE_LENGTH) { - return false; - } - } - return true; -} - - -/** - * Given a collection of pileups by original node ID, and a set of original node - * id:offset cross-references in both ref and alt categories, produce a VCF - * comment line giving the pileup for each of those positions on those nodes. - * Includes a trailing newline if nonempty. - * - * TODO: VCF comments aren't really a thing. - */ -string get_pileup_line(const map& node_pileups, - const set>& refCrossreferences, - const set>& altCrossreferences) { - // We'll make a stringstream to write to. - stringstream out; - - out << "#"; - - for(const auto& xref : refCrossreferences) { - // For every cross-reference - if(node_pileups.count(xref.first) && node_pileups.at(xref.first).base_pileup_size() > xref.second) { - // If we have that base pileup, grab it - auto basePileup = node_pileups.at(xref.first).base_pileup(xref.second); - - out << xref.first << ":" << xref.second << " (ref) " << basePileup.bases() << "\t"; - } - // Nodes with no pileups (either no pileups were provided or they didn't - // appear/weren't visited by reads) will not be mentioned on this line - } - - for(const auto& xref : altCrossreferences) { - // For every cross-reference - if(node_pileups.count(xref.first) && node_pileups.at(xref.first).base_pileup_size() > xref.second) { - // If we have that base pileup, grab it - auto basePileup = node_pileups.at(xref.first).base_pileup(xref.second); - - out << xref.first << ":" << xref.second << " (alt) " << basePileup.bases() << "\t"; - } - // Nodes with no pileups (either no pileups were provided or they didn't - // appear/weren't visited by reads) will not be mentioned on this line - } - // TODO: make these nearly-identical loops a loop or a lambda or something. - - if(out.str().size() > 1) { - // We actually found something. Send it out with a trailing newline - out << endl; - return out.str(); - } else { - // Give an empty string. - return ""; - } -} - -SupportCaller::PrimaryPath::PrimaryPath(SupportAugmentedGraph& augmented, const string& ref_path_name, size_t ref_bin_size): - ref_bin_size(ref_bin_size), index(augmented.graph, ref_path_name, true), name(ref_path_name) { - - // Follow the reference path and extract indexes we need: index by node ID, - // index by node start, and the reconstructed path sequence. - PathIndex index(augmented.graph, ref_path_name, true); - - if (index.sequence.size() == 0) { - // No empty reference paths allowed - throw runtime_error("Reference path cannot be empty"); - } - - // Store support binned along reference path; - // Last bin extended to include remainder - ref_bin_size = min(ref_bin_size, index.sequence.size()); - if (ref_bin_size <= 0) { - // No zero-sized bins allowed - throw runtime_error("Reference bin size must be 1 or larger"); - } - // Start out all the bins empty. - binned_support = vector(max(1, int(index.sequence.size() / ref_bin_size)), Support()); - - // Crunch the numbers on the reference and its read support. How much read - // support in total (node length * aligned reads) does the primary path get? - total_support = Support(); - for(auto& pointerAndSupport : augmented.node_supports) { - if(index.by_id.count(pointerAndSupport.first->id())) { - // This is a primary path node. 
Add in the total read bases supporting it - total_support += pointerAndSupport.first->sequence().size() * pointerAndSupport.second; - - // We also update the total for the appropriate bin - size_t bin = index.by_id[pointerAndSupport.first->id()].first / ref_bin_size; - if (bin == binned_support.size()) { - --bin; - } - binned_support[bin] = binned_support[bin] + - pointerAndSupport.first->sequence().size() * pointerAndSupport.second; - } - } - - // Average out the support bins too (in place) - min_bin = 0; - max_bin = 0; - for (int i = 0; i < binned_support.size(); ++i) { - // Compute the average over the bin's actual size - binned_support[i] = binned_support[i] / ( - i < binned_support.size() - 1 ? (double)ref_bin_size : - (double)(ref_bin_size + index.sequence.size() % ref_bin_size)); - - // See if it's a min or max - if (binned_support[i] < binned_support[min_bin]) { - min_bin = i; - } - if (binned_support[i] > binned_support[max_bin]) { - max_bin = i; - } - } - -} - -const Support& SupportCaller::PrimaryPath::get_support_at(size_t primary_path_offset) const { - return get_bin(get_bin_index(primary_path_offset)); -} - -size_t SupportCaller::PrimaryPath::get_bin_index(size_t primary_path_offset) const { - // Find which coordinate bin the position is in - size_t bin = primary_path_offset / ref_bin_size; - if (bin == get_total_bins()) { - --bin; - } - return bin; -} - -size_t SupportCaller::PrimaryPath::get_min_bin() const { - return min_bin; -} - -size_t SupportCaller::PrimaryPath::get_max_bin() const { - return max_bin; -} - -const Support& SupportCaller::PrimaryPath::get_bin(size_t bin) const { - return binned_support[bin]; -} - -size_t SupportCaller::PrimaryPath::get_total_bins() const { - return binned_support.size(); -} - -Support SupportCaller::PrimaryPath::get_average_support() const { - return get_total_support() / get_index().sequence.size(); -} - -Support SupportCaller::PrimaryPath::get_average_support(const map& paths) { - // Track the total support overall - Support total; - // And the total number of bases - size_t bases; - - for (auto& kv : paths) { - // Sum over all paths - total += kv.second.get_total_support(); - bases += kv.second.get_index().sequence.size(); - } - - // Then divide - return total / bases; -} - -Support SupportCaller::PrimaryPath::get_total_support() const { - return total_support; -} - -PathIndex& SupportCaller::PrimaryPath::get_index() { - return index; -} - -const PathIndex& SupportCaller::PrimaryPath::get_index() const { - return index; -} - -const string& SupportCaller::PrimaryPath::get_name() const { - return name; -} - -map::iterator SupportCaller::find_path(const Snarl& site, map& primary_paths) { - for(auto i = primary_paths.begin(); i != primary_paths.end(); ++i) { - // Scan the whole map with an iterator - - if (i->second.get_index().by_id.count(site.start().node_id()) && - i->second.get_index().by_id.count(site.end().node_id())) { - // This path threads through this site - return i; - } - } - // Otherwise we hit the end and found no path that this site can be strung - // on. - return primary_paths.end(); -} - -/** - * Trace out the given traversal, handling nodes, child snarls, and edges - * associated with particular visit numbers. 
- */ -void trace_traversal(const SnarlTraversal& traversal, const Snarl& site, function handle_node, - function handle_edge, function handle_child) { - - // Must at least have start and end - assert(traversal.visit_size() >= 2); - - // Look at the edge leading from the start (also handles deletion traversals) - handle_edge(0, to_right_side(traversal.visit(0)), to_left_side(traversal.visit(1))); - - for(int64_t i = 1; i < traversal.visit_size() - 1; i++) { - // For all the (internal) visits... - auto& visit = traversal.visit(i); - - if (visit.node_id() != 0) { - // This is a visit to a node - - // Find the node - handle_node(i - 1, visit.node_id()); - } else { - // This is a snarl - handle_child(i - 1, visit.snarl()); - } - - auto& next_visit = traversal.visit(i + 1); - - if (visit.node_id() == 0 && next_visit.node_id() == 0 && - to_right_side(visit).flip() == to_left_side(next_visit)) { - - // These are two back-to-back child snarl visits, which - // share a node and have no connecting edge. -#ifdef debug - cerr << "No edge needed for back-to-back child snarls" << endl; -#endif - - } - else { - // Do the edge to it - handle_edge(i - 1, to_right_side(visit), to_left_side(next_visit)); - } - } - -} - -/** - * Get the min support, total support, bp size (to divide total by for average - * support), and min likelihood for a traversal, optionally special-casing the - * material used by another traversal. Material used by another traversal only - * makes half its coverage available to this traversal. - */ -tuple get_traversal_support(SupportAugmentedGraph& augmented, - SnarlManager& snarl_manager, const Snarl& site, const SnarlTraversal& traversal, - const SnarlTraversal* already_used = nullptr) { - -#ifdef debug - cerr << "Evaluate traversal: " << endl; - for (size_t i = 0; i < traversal.visit_size(); i++) { - cerr << "\t" << pb2json(traversal.visit(i)) << endl; - } - if (already_used != nullptr) { - cerr << "Need to share: " << endl; - for (size_t i = 0; i < already_used->visit_size(); i++) { - cerr << "\t" << pb2json(already_used->visit(i)) << endl; - } - } -#endif - - // First work out the stuff we need to share - set shared_nodes; - set shared_children; - set shared_edges; - if (already_used != nullptr) { - // Mark all the nodes and edges that the other traverasl uses. - trace_traversal(*already_used, site, [&](size_t i, id_t node) { - shared_nodes.insert(node); - }, [&](size_t i, NodeSide end1, NodeSide end2) { - shared_edges.insert(augmented.graph.get_edge(end1, end2)); - }, [&](size_t i, Snarl child) { - shared_children.insert(child); - }); - } - - - // Compute min and total supports, and bp sizes, for all the visits by - // number. - size_t record_count = max(1, traversal.visit_size() - 2); - // What's the min support observed at every visit (inclusing edges)? - vector min_supports(record_count, make_support(INFINITY, INFINITY, INFINITY)); - // And the total support (ignoring edges)? - vector total_supports(record_count, Support()); - // And the bp size of each visit - vector visit_sizes(record_count, 0); - - // Don't count nodes shared between child snarls more than once. - set coverage_counted; - - trace_traversal(traversal, site, [&](size_t i, id_t node_id) { - // Find the node - Node* node = augmented.graph.get_node(node_id); - - // Grab this node's total support along its length - // Make sure to only use half the support if the node is shared - total_supports[i] += augmented.get_support(node) * node->sequence().size() * (shared_nodes.count(node_id) ? 
0.5 : 1.0); - // And its size - visit_sizes[i] += node->sequence().size(); - - // And update its min support - min_supports[i] = support_min(min_supports[i], augmented.get_support(node) * (shared_nodes.count(node_id) ? 0.5 : 1.0)); - - }, [&](size_t i, NodeSide end1, NodeSide end2) { - // This is an edge - Edge* edge = augmented.graph.get_edge(end1, end2); - assert(edge != nullptr); - - // Count as 1 base worth for the total/average support - // Make sure to only use half the support if the edge is shared - total_supports[i] += augmented.get_support(edge) * (shared_edges.count(edge) ? 0.5 : 1.0); - visit_sizes[i] += 1; - - // Min in its support - min_supports[i] = support_min(min_supports[i], augmented.get_support(edge) * (shared_edges.count(edge) ? 0.5 : 1.0)); - }, [&](size_t i, Snarl child) { - // This is a child snarl, so get its max support. - - Support child_max; - size_t child_size = 0; - for (Node* node : snarl_manager.deep_contents(snarl_manager.manage(child), - augmented.graph, true).first) { - // For every node in the child - - if (coverage_counted.count(node)) { - // Already used by another child snarl on this traversal - continue; - } - // Claim this node for this child. - coverage_counted.insert(node); - - // How many distinct reads must use the child, given the distinct reads on this node? - child_max = support_max(child_max, augmented.get_support(node)); - - // Add in the node's size to the child - child_size += node->sequence().size(); - -#ifdef debug - cerr << "From child snarl node " << node->id() << " get " - << augmented.get_support(node) << " for distinct " << child_max << endl; -#endif - } - - if (shared_children.count(child)) { - // Make sure to halve the support if the child is shared - child_max *= 0.5; - } - - // Smoosh support over the whole child - total_supports[i] += child_max * child_size; - visit_sizes[i] += child_size; - - if (child_size != 0) { - // We actually have some nodes to our name. - min_supports[i] = support_min(min_supports[i], child_max); - } - - }); - - // Now aggregate across visits and their edges - - // What's the total support for this traversal? - Support total_support; - for (auto& support : total_supports) { - total_support += support; - } - - // And the length over which we have it (for averaging) - size_t total_size = 0; - for (auto& size : visit_sizes) { - total_size += size; - } - - // And the min support? - Support min_support = make_support(INFINITY, INFINITY, INFINITY); - for (auto& support : min_supports) { - min_support = support_min(min_support, support); - } - - if (min_support.forward() == INFINITY || min_support.reverse() == INFINITY) { - // If we have actually no material, say we have actually no support - min_support = Support(); - } - - // Spit out the supports, the size in bases observed. - return tie(min_support, total_support, total_size); - -} - -/** Get the support for each traversal in a list, using average_support_switch_threshold - to decide if we use the minimum or average */ -tuple, vector > SupportCaller::get_traversal_supports_and_sizes( - SupportAugmentedGraph& augmented, SnarlManager& snarl_manager, const Snarl& site, - const vector& traversals, const SnarlTraversal* minus_traversal) { - - // How long is the longest traversal? - // Sort of approximate because of the way nested site sizes are estimated. - size_t longest_traversal_length = 0; - - // And the shortest one? - size_t shortest_traversal_length = numeric_limits::max(); - - // Calculate average and min support for all the traversals of this snarl. 
- vector min_supports; - vector average_supports; - vector sizes; - for(auto& traversal : traversals) { - // Go through all the SnarlTraversals for this Snarl - - // What's the total support for this traversal? - Support total_support; - // And the length over which we have it (for averaging) - size_t total_size; - // And the min support? - Support min_support; - // Trace the traversal and get its support - tie(min_support, total_support, total_size) = get_traversal_support( - augmented, snarl_manager, site, traversal, minus_traversal); - - // Add average and min supports to vectors. Note that average support - // ignores edges. - min_supports.push_back(min_support); - average_supports.push_back(total_size != 0 ? total_support / total_size : Support()); - -#ifdef debug - cerr << "Min: " << min_support << " Total: " << total_support << " Average: " << average_supports.back() << endl; -#endif - - // Remember a new longest traversal length - longest_traversal_length = max(longest_traversal_length, total_size); - // And a new shortest one - shortest_traversal_length = min(shortest_traversal_length, total_size); - // and the current size - sizes.push_back(total_size); - } - -#ifdef debug - cerr << "Min vs. average" << endl; -#endif - for (size_t i = 0; i < average_supports.size(); i++) { -#ifdef debug - cerr << "\t" << min_supports.at(i) << " vs. " << average_supports.at(i) << endl; -#endif - // We should always have a higher average support than minumum support - assert(support_val(average_supports.at(i)) >= support_val(min_supports.at(i))); - } - - return (longest_traversal_length > average_support_switch_threshold || use_average_support) ? - tie(average_supports, sizes) : - tie(min_supports, sizes); -} - -vector SupportCaller::find_best_traversals(SupportAugmentedGraph& augmented, - SnarlManager& snarl_manager, TraversalFinder* finder, const Snarl& site, - const Support& baseline_support, size_t copy_budget, function emit_locus) { - - // We need to be an ultrabubble for the traversal finder to work right. - // TODO: generalize it - assert(site.type() == ULTRABUBBLE); - - -#ifdef debug - cerr << "Site " << site << endl; -#endif - - - // Get traversals of this Snarl, with Visits to child Snarls. - // The 0th is always the reference traversal if we're on a primary path - vector here_traversals = finder->find_traversals(site); - - -#ifdef debug - cerr << "Found " << here_traversals.size() << " traversals" << endl; -#endif - - - // Make a Locus to hold all our stats for the different traversals - // available. The 0th will always be the ref traversal if we're on a primary - // path. - Locus locus; - - vector supports; - vector traversal_sizes; - // Calculate the support for all the traversals of this snarl. 
- tie(supports, traversal_sizes) = get_traversal_supports_and_sizes( - augmented, snarl_manager, site, here_traversals); - - for (auto& support : supports) { - // Blit supports over to the locus - *locus.add_support() = support; - } - - //////////////////////////////////////////////////////////////////////////// - - // look at all the paths for the site and pick the best one - function&, vector)> get_best_allele = [this]( - const vector& supports, vector skips) { - int best_allele = -1; - for(size_t i = 0; i < supports.size(); i++) { - if(std::find(skips.begin(), skips.end(), i) == skips.end() && ( - best_allele == -1 || support_val(supports[best_allele]) <= support_val(supports[i]))) { - best_allele = i; - } - } - return best_allele; - }; - - // Now look at all the paths for the site and pick the best one - int best_allele = get_best_allele(supports, {}); - - // We should always have a best allele; we may sometimes have a second best. - assert(best_allele != -1); - -#ifdef debug - cerr << "Choose best allele: " << best_allele << endl; -#endif - - // Then recalculate supports assuming we can't count anything shared with that best traversal - vector additional_supports; - tie(additional_supports, std::ignore) = get_traversal_supports_and_sizes( - augmented, snarl_manager, site, here_traversals, &here_traversals.at(best_allele)); - - // Then pick the second best one - int second_best_allele = get_best_allele(additional_supports, {best_allele}); - -#ifdef debug - cerr << "Choose second best allele: " << second_best_allele << endl; -#endif - - // Hack for special case where we want to call a multiallelic alt even if the reference - // has better support than one or both alts - vector tertiary_supports; - int third_best_allele = -1; - if (second_best_allele != -1) { - tie(tertiary_supports, std::ignore) = get_traversal_supports_and_sizes( - augmented, snarl_manager, site, here_traversals, &here_traversals.at(second_best_allele)); - third_best_allele = get_best_allele(tertiary_supports, {best_allele, second_best_allele}); - } - - // Decide if we're an indel by looking at the traversal sizes - bool is_indel = traversal_sizes[best_allele] != traversal_sizes[0] || - (second_best_allele != -1 && traversal_sizes[0] != traversal_sizes[second_best_allele]); - bool is_indel_ma_2 = (second_best_allele != -1 && traversal_sizes[0] != traversal_sizes[second_best_allele]); - bool is_indel_ma_3 = (third_best_allele != -1 && traversal_sizes[0] != traversal_sizes[third_best_allele]); - - //////////////////////////////////////////////////////////////////////////// - - // Now make a genotype call at this site, up to the allowed copy number - - // TODO: Work out how to detect indels when there are nested sites and - // enable the indel bias multiple again. - double bias_multiple = 1.0; - - // How much support do we have for the top two alleles? - Support site_support = supports.at(best_allele); - if(second_best_allele != -1) { - site_support += supports.at(second_best_allele); - } - - // Pull out the different supports. Some of them may be the same. - Support best_support = supports.at(best_allele); - Support second_best_support; // Defaults to 0 - if(second_best_allele != -1) { - second_best_support = supports.at(second_best_allele); - } - Support third_best_support; - if (third_best_allele != -1) { - third_best_support = supports.at(third_best_allele); - } - - // As we do the genotype, we also compute the likelihood. Holds - // likelihood log 10. Starts out at "completely wrong". 
- double gen_likelihood = -1 * INFINITY; - - // Minimum allele depth of called alleles - double min_site_support = 0; - - // This is where we'll put the genotype. We only actually add it to the - // Locus if we are confident enough to actually call. - Genotype genotype; - - // We're going to make some really bad calls at low depth. We can - // pull them out with a depth filter, but for now just elide them. - if (support_val(site_support) >= support_val(baseline_support) * min_fraction_for_call * ((double) copy_budget) / 2) { - // We have enough to emit a call here. - - // If best and second best are close enough to be het, we call het. - // Otherwise, we call hom best. - - double bias_limit; - if (best_allele == 0) { - // Use ref bias limit - - // We decide closeness differently depending on whether best is ref - // or not. In practice, we use this to slightly penalize homozygous - // ref calls (by setting max_ref_het_bias higher than max_het_bias) - // and rather make a less supported alt call instead. This boost - // max sensitivity, and because everything is homozygous ref by - // default in VCF, any downstream filters will effectively reset - // these calls back to homozygous ref. TODO: This shouldn't apply - // when off the primary path! - bias_limit = max_ref_het_bias; - } else if (is_indel) { - // This is an indel - // Use indel bias limit - bias_limit = max_indel_het_bias; - } else { - // Use normal het bias limit - bias_limit = max_het_bias; - } - -#ifdef debug - cerr << best_allele << ", " << best_support << " and " - << second_best_allele << ", " << second_best_support << endl; - - if (support_val(second_best_support) > 0) { - cerr << "Bias: (limit " << bias_limit * bias_multiple << "):" - << support_val(best_support)/support_val(second_best_support) << endl; - } - - cerr << bias_limit * bias_multiple * support_val(second_best_support) << " vs " - << support_val(best_support) << endl; - - cerr << total(second_best_support) << " vs " << min_total_support_for_call << endl; -#endif - - // Call 1/2 : REF-Alt1/Alt2 even if Alt2 has only third best support - if (copy_budget >= 2 && - best_allele == 0 && - third_best_allele > 0 && - is_indel_ma_3 && - max_indel_ma_bias * bias_multiple * support_val(third_best_support) >= support_val(best_support) && - total(second_best_support) >= min_total_support_for_call && - total(third_best_support) >= min_total_support_for_call) { - // There's a second best allele and third best allele, and it's not too biased to call, - // and both alleles exceed the minimum to call them present, and the - // second-best and third-best alleles have enough support that it won't torpedo the - // variant. 
- -#ifdef debug - cerr << "Call as second best/third best" << endl; -#endif - // Say both are present - genotype.add_allele(second_best_allele); - genotype.add_allele(third_best_allele); - - // Get minimum support for filter (not assuming it's second_best just to be sure) - min_site_support = min(total(second_best_support), total(third_best_support)); - - // Make the call - *locus.add_genotype() = genotype; - } - else if (copy_budget >= 2 && - second_best_allele != -1 && - bias_limit * bias_multiple * support_val(second_best_support) >= support_val(best_support) && - total(best_support) >= min_total_support_for_call && - total(second_best_support) >= min_total_support_for_call) { - // There's a second best allele, and it's not too biased to call, - // and both alleles exceed the minimum to call them present, and the - // second-best allele has enough support that it won't torpedo the - // variant. - -#ifdef debug - cerr << "Call as best/second best" << endl; -#endif - - // Say both are present - genotype.add_allele(best_allele); - genotype.add_allele(second_best_allele); - - // Get minimum support for filter (not assuming it's second_best just to be sure) - min_site_support = min(total(second_best_support), total(best_support)); - - // Make the call - *locus.add_genotype() = genotype; - - } else if (copy_budget >= 2 && total(best_support) >= min_total_support_for_call) { - // The second best allele isn't present or isn't good enough, - // but the best allele has enough coverage that we can just call - // two of it. - -#ifdef debug - cerr << "Call as best/best" << endl; -#endif - - // Say the best is present twice - genotype.add_allele(best_allele); - genotype.add_allele(best_allele); - - // Get minimum support for filter - min_site_support = total(best_support); - - // Make the call - *locus.add_genotype() = genotype; - - } else if (copy_budget >= 1 && total(best_support) >= min_total_support_for_call) { - // We're only supposed to have one copy, and the best allele is good enough to call - -#ifdef debug - cerr << "Call as best" << endl; -#endif - - // Say the best is present once - genotype.add_allele(best_allele); - - // Get minimum support for filter - min_site_support = total(best_support); - - // Make the call - *locus.add_genotype() = genotype; - } else { - // Either coverage is too low, or we aren't allowed any copies. - // We can't really call this as anything. - -#ifdef debug - cerr << "Do not call" << endl; -#endif - - // Don't add the genotype to the locus - } - } else { - // Depth too low. Say we have no idea. - // TODO: elide variant? - - // Don't add the genotype to the locus - } - - // Find the total support for the Locus across all alleles - Support locus_support; - for (auto& s : supports) { - // Sum up all the Supports form all alleles (even the non-best/second-best). - locus_support += s; - } - // Save support - *locus.mutable_overall_support() = locus_support; - - //////////////////////////////////////////////////////////////////////////// - - // Figure out what child snarls are touched by the paths we have called and - // how much copy number each should get. 
- map child_usage_counts; - for (size_t i = 0; i < genotype.allele_size(); i++) { - // For each copy we call as present, find the SnarlTraversal we're - // asserting - SnarlTraversal& traversal = here_traversals.at(genotype.allele(i)); - - for (size_t j = 1; j < traversal.visit_size() - 1; j++) { - // For each visit to a child snarl - auto& visit = traversal.visit(j); - if (visit.node_id() != 0) { - continue; - } - - // Find the child snarl pointer for the snarl we visit - const Snarl* child = snarl_manager.manage(visit.snarl()); - - // Say it's used one more time - child_usage_counts[child]++; - - } - } - - // Recurse and get traversals for children. We do this for all our children, - // even the ones called as CN 0, because we need the fully-specified - // traversals to build our Locus (which needs the alleles we rejected as - // having no copies). - unordered_map> child_traversals; - for (const Snarl* child : snarl_manager.children_of(&site)) { - // Recurse on each child, giving a copy number budget according to the - // usage count call at this site. This produces fully realized - // traversals with no Visits to Snarls. - // Holds ref traversal, best, and optional second best for each child. - child_traversals[child] = find_best_traversals(augmented, snarl_manager, - finder, *child, baseline_support, child_usage_counts[child], emit_locus); - } - - for (auto kv : child_traversals) { - // All children must have at least two traversals (a ref and a best). - // Off the primary paths, the ref is sort of arbitrary. - assert(kv.second.size() >= 2); - } - - // Put the best (or ref) traversal for each child in our traversals that - // visit it (even if that contradicts the calls on the child) - vector concrete_traversals; - for (size_t traversal_number = 0; traversal_number < here_traversals.size(); traversal_number++) { - // For every abstract traversal of this site, starting with the ref traversal... - auto& abstract_traversal = here_traversals[traversal_number]; -#ifdef debug - cerr << "Concretizing abstract traversal " << pb2json(abstract_traversal) << endl; -#endif - - // Make a "concrete", node-level traversal for every abstract, Snarl- - // visiting traversal. - concrete_traversals.emplace_back(); - auto& concrete_traversal = concrete_traversals.back(); - - for (size_t i = 0; i < abstract_traversal.visit_size(); i++) { - // Go through all the visits in the abstract traversal - auto& abstract_visit = abstract_traversal.visit(i); - - if (abstract_visit.node_id() != 0) { - // If they're fully realized, just take them - *concrete_traversal.add_visit() = abstract_visit; - } else { - // If they're visits to children, look up the child - const Snarl* child = snarl_manager.manage(abstract_visit.snarl()); - - // Then blit the child's path over. This will be the ref path if - // we are concrete-izing this snarl's ref traversal, and the - // best path for the child otherwise. Keep in mind that we may - // be going through the child backward. - auto& child_traversal = child_traversals.at(child).at(traversal_number == 0 ? 0 : 1); - -#ifdef debug - cerr << "Splicing in child traversal " << pb2json(child_traversal) << endl; -#endif - - size_t trav_transfer_start = 0; - if (i != 0) { - // There was a previous visit. It may have been a previous - // back-to-back snarl. - auto& last_visit = abstract_traversal.visit(i - 1); - if (last_visit.node_id() == 0 && to_right_side(last_visit).flip() == to_left_side(abstract_visit)) { - // It was indeed a previous back to back site. Don't add the entry node! 
-#ifdef debug - cerr << "Skip entry node for back-to-back sites" << endl; -#endif - trav_transfer_start++; - } - } - for (size_t j = trav_transfer_start; j < child_traversal.visit_size(); j++) { - // All the internal visits, in the correct order - *concrete_traversal.add_visit() = abstract_visit.backward() ? - reverse(child_traversal.visit(child_traversal.visit_size()- 1 - j)) : - child_traversal.visit(j); - } - } - } -#ifdef debug - cerr << "Finished concrete traversal " << pb2json(concrete_traversals.back()) << endl; -#endif - } - - for (auto& concrete_traversal : concrete_traversals) { - // Populate the Locus with those traversals by converting to paths - Path* converted = locus.add_allele(); - - for (size_t i = 0; i < concrete_traversal.visit_size(); i++) { - // Convert all the visits to Mappings and stick them in the Locus's Paths - *converted->add_mapping() = to_mapping(concrete_traversal.visit(i), augmented.graph); - } - } - - if (locus.genotype_size() > 0) { - // Emit the locus if we have a call - emit_locus(locus, &site); - } - - // Build the list of traversals to return as ref, best, second best, with - // possible repeats. - vector to_return{concrete_traversals[0], concrete_traversals[best_allele]}; - if (second_best_allele != -1) { - to_return.push_back(concrete_traversals[second_best_allele]); - } - - // Return those important traversals - return to_return; - -} - -// this was main() in glenn2vcf -void SupportCaller::call( - // Augmented graph - SupportAugmentedGraph& augmented, - // Should we load a pileup and print out pileup info as comments after - // variants? - string pileup_filename) { - - // Toggle support counter - support_val = use_support_count ? total : support_quality; - - // Set up the graph's paths properly after augmentation modified them. - augmented.graph.paths.sort_by_mapping_rank(); - augmented.graph.paths.rebuild_mapping_aux(); - - // Make a list of the specified or autodetected primary reference paths. - vector primary_path_names = ref_path_names; - if (primary_path_names.empty()) { - // Try and guess reference path names for VCF conversion or coverage measurement. - if (verbose) { - std:cerr << "Graph has " << augmented.graph.paths.size() << " paths to choose from." - << endl; - } - if(augmented.graph.paths.size() == 1) { - // Autodetect the reference path name as the name of the only path - primary_path_names.push_back((*augmented.graph.paths._paths.begin()).first); - } else if (augmented.graph.paths.has_path("ref")) { - // Take any "ref" path. - primary_path_names.push_back("ref"); - } - - if (verbose && !primary_path_names.empty()) { - cerr << "Guessed reference path name of " << primary_path_names.front() << endl; - } - - } - - // We'll fill this in with a PrimaryPath for every primary reference path - // that is specified or detected. - map primary_paths; - for (auto& name : primary_path_names) { - // Make a PrimaryPath for every primary path we have. - // Index the primary path and compute the binned supports. 
- primary_paths.emplace(std::piecewise_construct, - std::forward_as_tuple(name), - std::forward_as_tuple(augmented, name, ref_bin_size)); - - auto& primary_path = primary_paths.at(name); - - if (verbose) { - cerr << "Primary path " << name << " average/off-path assumed coverage: " - << primary_path.get_average_support() << endl; - cerr << "Mininimum binned average coverage: " << primary_path.get_bin(primary_path.get_min_bin()) << " (bin " - << (primary_path.get_min_bin() + 1) << " / " << primary_path.get_total_bins() << ")" << endl; - cerr << "Maxinimum binned average coverage: " << primary_path.get_bin(primary_path.get_max_bin()) << " (bin " - << (primary_path.get_max_bin() + 1) << " / " << primary_path.get_total_bins() << ")" << endl; - } - } - - // If applicable, load the pileup. - // This will hold pileup records by node ID. - map node_pileups; - - function handle_pileup = [&](Pileup& p) { - // Handle each pileup chunk - for(size_t i = 0; i < p.node_pileups_size(); i++) { - // Pull out every node pileup - auto& pileup = p.node_pileups(i); - // Save the pileup under its node's pointer. - node_pileups[pileup.node_id()] = pileup; - } - }; - if(!pileup_filename.empty()) { - // We have to load some pileups - ifstream in; - in.open(pileup_filename.c_str()); - stream::for_each(in, handle_pileup); - } - - // Make a VCF because we need it in scope later, if we are outputting VCF. - vcflib::VariantCallFile vcf; - - // We also might need to fillin this contig names by path name map - map contig_names_by_path_name; - - if (convert_to_vcf) { - // Do initial setup for VCF output - - // Decide on names and lengths for all the primary paths. - vector contig_lengths; - vector contig_names; - - for (size_t i = 0; i < primary_path_names.size(); i++) { - if (i < contig_name_overrides.size()) { - // Override this name - contig_names.push_back(contig_name_overrides.at(i)); - } else { - // Keep the path name from the graph - contig_names.push_back(primary_path_names.at(i)); - } - - // Allow looking up the assigned contig name later - contig_names_by_path_name[primary_path_names.at(i)] = contig_names.back(); - - if (i < length_overrides.size()) { - // Override this length - contig_lengths.push_back(length_overrides.at(i)); - } else { - // Grab the length from the index - contig_lengths.push_back(primary_paths.at(primary_path_names.at(i)).get_index().sequence.size()); - } - - // TODO: is this fall-through-style logic smart, or will we just - // neglect to warn people that they forgot options by parsing what - // they said when they provide too few overrides? - } - - // Generate a vcf header. We can't make Variant records without a - // VariantCallFile, because the variants need to know which of their - // available info fields or whatever are defined in the file's header, - // so they know what to output. 
- stringstream header_stream; - write_vcf_header(header_stream, {sample_name}, contig_names, contig_lengths, - min_mad_for_filter, max_dp_for_filter, max_dp_multiple_for_filter, max_local_dp_multiple_for_filter, - min_ad_log_likelihood_for_filter); - - // Load the headers into a the VCF file object - string header_string = header_stream.str(); - assert(vcf.openForOutput(header_string)); - - // Spit out the header - cout << header_stream.str(); - } - - // Find all the top-level sites - list site_queue; - - CactusSnarlFinder finder(augmented.graph); - SnarlManager site_manager = finder.find_snarls(); - - site_manager.for_each_top_level_snarl_parallel([&](const Snarl* site) { - // Stick all the sites in this vector. - #pragma omp critical (sites) - site_queue.emplace_back(site); - }); - - // We're going to run through all the top-level sites and keep just what we - // can use. If we're converting to VCF it's only stuff on a primary path, - // and we will break top-level sites to find things on a primary path. - // Otherwise it's everything. - vector sites; - - while(!site_queue.empty()) { - // Grab the first site - const Snarl* site = move(site_queue.front()); - site_queue.pop_front(); - - // If the site is strung on any of the primary paths, find the - // corresponding PrimaryPath object. Otherwise, leave this null. - PrimaryPath* primary_path = nullptr; - { - auto found = find_path(*site, primary_paths); - if (found != primary_paths.end()) { - primary_path = &found->second; - } - } - - - if (site->type() == ULTRABUBBLE && primary_path != nullptr) { - // This site is an ultrabubble on a primary path - - // Throw it in the final vector of sites we're going to process. - sites.push_back(site); - } else if (site->type() == ULTRABUBBLE && !convert_to_vcf) { - // This site is an ultrabubble and we can handle things off the - // primary path. - - // Throw it in the final vector of sites we're going to process. - sites.push_back(site); - - } else { - // The site is not on the primary path or isn't an ultrabubble, but - // maybe one of its children will meet our requirements. - - size_t child_count = site_manager.children_of(site).size(); - - for(const Snarl* child : site_manager.children_of(site)) { - // Dump all the children into the queue for separate - // processing. - site_queue.emplace_back(child); - } - - if (verbose) { - if (child_count) { - cerr << "Broke up off-reference site into " - << child_count << " children" << endl; - } else { - cerr << "Dropped off-reference site" << endl; - } - } - - } - } - - if (verbose) { - cerr << "Found " << sites.size() << " sites" << endl; - } - - // Now start looking for traversals of the sites. - RepresentativeTraversalFinder traversal_finder(augmented, site_manager, max_search_depth, max_search_width, - max_bubble_paths, [&] (const Snarl& site) -> PathIndex* { - - // When the TraversalFinder needs a primary path index for a site, it can look it up with this function. - auto found = find_path(site, primary_paths); - if (found != primary_paths.end()) { - // It's on a path - return &found->second.get_index(); - } else { - // It's not on a known primary path, so the TraversalFinder should make its own backbone path - return nullptr; - } - }); - - // We're going to remember what nodes and edges are covered by sites, so we - // will know which nodes/edges aren't in any sites and may need generic - // presence/absence calls. 
- set covered_nodes; - set covered_edges; - - // When we genotype the sites into Locus objects, we will use this buffer for outputting them. - vector locus_buffer; - - // How many sites result in output? - size_t called_loci = 0; - - for(const Snarl* site : sites) { - // For every site, we're going to make a bunch of Locus objects - - // See if the site is on a primary path, so we can use binned support. - map::iterator found_path = find_path(*site, primary_paths); - - // We need to figure out how much support a site ought to have. - // Within its local bin? - Support baseline_support; - // On its primary path? - Support global_baseline_support; - if (expected_coverage != 0.0) { - // Use the specified coverage override - baseline_support.set_forward(expected_coverage / 2); - baseline_support.set_reverse(expected_coverage / 2); - global_baseline_support = baseline_support; - } else if (found_path != primary_paths.end()) { - // We're on a primary path, so we can find the appropriate bin - - // Since the variable part of the site is after the first anchoring node, where does it start? - // Account for the site possibly being backward on the path. - size_t variation_start = min(found_path->second.get_index().by_id.at(site->start().node_id()).first - + augmented.graph.get_node(site->start().node_id())->sequence().size(), - found_path->second.get_index().by_id.at(site->end().node_id()).first - + augmented.graph.get_node(site->end().node_id())->sequence().size()); - - // Look in the bins for the primary path to get the support there. - baseline_support = found_path->second.get_support_at(variation_start); - - // And grab the path's overall support - global_baseline_support = found_path->second.get_average_support(); - - } else { - // Just use the primary paths' average support, which may be 0 if there are none. - // How much support is expected across all the primary paths? May be 0 if there are no primary paths. - global_baseline_support = PrimaryPath::get_average_support(primary_paths); - baseline_support = global_baseline_support; - } - - // This function emits the given variant on the given primary path, as - // VCF. It needs to take the site as an argument because it may be - // called for children of the site we're working on right now. - auto emit_variant = [&contig_names_by_path_name, &vcf, &augmented, - &baseline_support, &global_baseline_support, this]( - const Locus& locus, PrimaryPath& primary_path, const Snarl* site) { - - // Note that the locus paths will traverse our site forward, which - // may make them backward along the primary path. - bool site_backward = (primary_path.get_index().by_id.at(site->start().node_id()).first > - primary_path.get_index().by_id.at(site->end().node_id()).first); - - // Unpack the genotype back into best and second-best allele - auto& genotype = locus.genotype(0); - int best_allele = genotype.allele(0); - // If we called a single allele, we've lost the second-best allele info. But we won't need it, so we can just say -1. - int second_best_allele = (genotype.allele_size() >= 2 && genotype.allele(0) != genotype.allele(1)) ? - genotype.allele(1) : - -1; - - // Populate this with original node IDs, from before augmentation. - set original_nodes; - - // Calculate the ID and sequence strings for all the alleles. - // TODO: we only use some of these - vector sequences; - vector id_lists; - // Also the flags for whether alts are reference (i.e. 
known) - vector is_ref; - - for (size_t i = 0; i < locus.allele_size(); i++) { - - // For each allele path in the Locus - auto& path = locus.allele(i); -#ifdef debug - cerr << "Extracting allele " << i << ": " << pb2json(path) << endl; -#endif - // Make a stream for the sequence of the path - stringstream sequence_stream; - // And for the description of involved IDs - stringstream id_stream; - - for (size_t j = 0; j < path.mapping_size(); j++) { - // For each mapping along the path - auto& mapping = path.mapping(j); - - // Record the sequence - string node_sequence = augmented.graph.get_node(mapping.position().node_id())->sequence(); - if (mapping.position().is_reverse()) { - node_sequence = reverse_complement(node_sequence); - } - sequence_stream << node_sequence; -#ifdef debug - cerr << "\tMapping: " << pb2json(mapping) << ", sequence " << node_sequence << endl; -#endif - if (j != 0) { - // Add a separator - id_stream << "_"; - } - // Record the ID - id_stream << mapping.position().node_id(); - - if (augmented.translator.has_translation(mapping.position(), false)) { - // This node is derived from an original graph node. Remember it. - original_nodes.insert(augmented.translator.translate(mapping.position()).node_id()); - } - } - - // Remember the descriptions of the alleles - if (site_backward) { - sequences.push_back(reverse_complement(sequence_stream.str())); - } else { - sequences.push_back(sequence_stream.str()); - } -#ifdef debug - cerr << "Recorded allele sequence " << sequences.back() << endl; -#endif - id_lists.push_back(id_stream.str()); - // And whether they're reference or not - is_ref.push_back(is_reference(path, augmented)); - } - - // Start off declaring the variable part to start at the start of - // the first anchoring node. We'll clip it back later to just what's - // after the shared prefix. - size_t variation_start = min(primary_path.get_index().by_id.at(site->start().node_id()).first, - primary_path.get_index().by_id.at(site->end().node_id()).first); - - // Keep track of the alleles that actually need to go in the VCF: - // ref, best, and second-best (if any), some of which may overlap. - // This is the order they will show up in the variant. - vector used_alleles; - used_alleles.push_back(0); - if (best_allele != 0) { - used_alleles.push_back(best_allele); - } - if(second_best_allele != -1 && second_best_allele != 0) { - used_alleles.push_back(second_best_allele); - } - - // Rewrite the sequences and variation_start to just represent the - // actually variable part, by dropping any common prefix and common - // suffix. We just do the whole thing in place, modifying the used - // entries in sequences. - - auto shared_prefix_length = [&](bool backward) { - size_t shortest_prefix = std::numeric_limits::max(); - - auto here = used_alleles.begin(); - if (here == used_alleles.end()) { - // No strings. - // Say no prefix is in common... - return (size_t) 0; - } - auto next = here; - next++; - - if (next == used_alleles.end()) { - // Only one string. - // Say no prefix is in common... - return (size_t) 0; - } - - while (next != used_alleles.end()) { - // Consider each allele and the next one after it, as - // long as we have both. 
- - // Figure out the shorter and the longer string - string* shorter = &sequences.at(*here); - string* longer = &sequences.at(*next); - if (shorter->size() > longer->size()) { - swap(shorter, longer); - } - - // Calculate the match length for this pair - size_t match_length; - if (backward) { - // Find out how far in from the right the first mismatch is. - auto mismatch_places = std::mismatch(shorter->rbegin(), shorter->rend(), longer->rbegin()); - match_length = std::distance(shorter->rbegin(), mismatch_places.first); - } else { - // Find out how far in from the left the first mismatch is. - auto mismatch_places = std::mismatch(shorter->begin(), shorter->end(), longer->begin()); - match_length = std::distance(shorter->begin(), mismatch_places.first); - } - - // The shared prefix of these strings limits the longest - // prefix shared by all strings. - shortest_prefix = min(shortest_prefix, match_length); - - here = next; - ++next; - } - - // Return the shortest universally shared prefix - return shortest_prefix; - }; - // Trim off the shared prefix - size_t shared_prefix = shared_prefix_length(false); - for (auto allele : used_alleles) { - sequences[allele] = sequences[allele].substr(shared_prefix); - } - // Add it onto the start coordinate - variation_start += shared_prefix; - - // Then find and trim off the shared suffix - size_t shared_suffix = shared_prefix_length(true); - for (auto allele : used_alleles) { - sequences[allele] = sequences[allele].substr(0, sequences[allele].size() - shared_suffix); - } - - // Make a Variant - vcflib::Variant variant; - variant.sequenceName = contig_names_by_path_name.at(primary_path.get_name()); - variant.setVariantCallFile(vcf); - variant.quality = 0; - // Position should be 1-based and offset with our offset option. - variant.position = variation_start + 1 + variant_offset; - - // Set the ID based on the IDs of the involved nodes. Note that the best - // allele may have no nodes (because it's a pure edge) - variant.id = id_lists.at(best_allele); - if(second_best_allele != -1 && !id_lists.at(second_best_allele).empty()) { - // Add the second best allele's nodes in. - variant.id += "-" + id_lists.at(second_best_allele); - } - - - if(sequences.at(0).empty() || - (best_allele != -1 && sequences.at(best_allele).empty()) || - (second_best_allele != -1 && sequences.at(second_best_allele).empty())) { - - // Fix up the case where we have an empty allele. - - // We need to grab the character before the variable part of the - // site in the reference. - assert(variation_start > 0); - string extra_base = char_to_string(primary_path.get_index().sequence.at(variation_start - 1)); - - for(auto& seq : sequences) { - // Stick it on the front of all the allele sequences - seq = extra_base + seq; - } - - // Budge the variant left - variant.position--; - } - - // Make sure the ref allele is correct - { - string real_ref = primary_path.get_index().sequence.substr( - variant.position - variant_offset - 1, sequences.front().size()); - string got_ref = sequences.front(); - - if (real_ref != got_ref) { - cerr << "Error: Ref should be " << real_ref << " but is " << got_ref << " at " << variant.position << endl; - throw runtime_error("Reference mismatch at site " + pb2json(*site)); - } - - } - - // Add the ref allele to the variant - create_ref_allele(variant, sequences.front()); - - // Add the best allele - assert(best_allele != -1); - int best_alt = add_alt_allele(variant, sequences.at(best_allele)); - - int second_best_alt = (second_best_allele == -1) ? 
-1 : add_alt_allele(variant, sequences.at(second_best_allele)); - - // Say we're going to spit out the genotype for this sample. - variant.format.push_back("GT"); - auto& genotype_vector = variant.samples[sample_name]["GT"]; - - if (locus.genotype_size() > 0) { - // We actually made a call. Emit the first genotype, which is the call. - - // We need to rewrite the allele numbers to alt numbers, since - // we aren't keeping all the alleles in the VCF, so we can't use - // the natural conversion of Genotype to VCF genotype string. - - // Emit parts into this stream - stringstream stream; - for (size_t i = 0; i < genotype.allele_size(); i++) { - // For each allele called as present in the genotype - - // Convert from allele number to alt number - if (genotype.allele(i) == best_allele) { - stream << best_alt; - } else if (genotype.allele(i) == second_best_allele) { - stream << second_best_alt; - } else { - throw runtime_error("Allele " + to_string(genotype.allele(i)) + - " is not best or second-best and has no alt"); - } - - if (i + 1 != genotype.allele_size()) { - // Write a separator after all but the last one - stream << (genotype.is_phased() ? '|' : '/'); - } - } - // Save the finished genotype - genotype_vector.push_back(stream.str()); - } else { - // Say there's no call here - genotype_vector.push_back("./."); - } - - // Now fill in all the other variant info/format stuff - - if((best_allele != 0 && is_ref.at(best_allele)) || - (second_best_allele != 0 && second_best_allele != -1 && is_ref.at(second_best_allele))) { - // Flag the variant as reference if either of its two best alleles - // is known but not the primary path. Don't put in a false entry if - // it isn't known, because vcflib will spit out the flag anyway... - variant.infoFlags["XREF"] = true; - } - - for (auto id : original_nodes) { - // Add references to the relevant original nodes - variant.info["XSEE"].push_back(to_string(id)); - } - - for (size_t i = 1; i < variant.alleles.size(); i++) { - // Claculate the SVLEN for this non-reference allele - int64_t svlen = (int64_t) variant.alleles.at(i).size() - (int64_t) variant.alleles.at(0).size(); - - // Add it in - variant.info["SVLEN"].push_back(to_string(svlen)); - } - - // Set up the depth format field - variant.format.push_back("DP"); - // And expected depth - variant.format.push_back("XDP"); - // And allelic depth - variant.format.push_back("AD"); - // And the log likelihood from the assignment of reads among the - // present alleles - variant.format.push_back("XADL"); - // And strand bias - variant.format.push_back("SB"); - // Also the alt allele depth - variant.format.push_back("XAAD"); - - // Compute the total support for all the alts that will be appearing - Support total_support; - // And total alt allele depth for the alt alleles - Support alt_support; - - for (int allele : used_alleles) { - // For all the alleles we are using, look at the support. 
- auto& support = locus.support(allele); - - // Set up allele-specific stats for the allele - variant.samples[sample_name]["AD"].push_back(to_string((int64_t)round(total(support)))); - variant.samples[sample_name]["SB"].push_back(to_string((int64_t)round(support.forward()))); - variant.samples[sample_name]["SB"].push_back(to_string((int64_t)round(support.reverse()))); - - // Sum up into total depth - total_support += support; - - if (allele != 0) { - // It's not the primary reference allele - alt_support += support; - } - } - - // Find the min total support of anything called - double min_site_support = INFINITY; - double min_site_quality = INFINITY; - - for (size_t i = 0; i < genotype.allele_size(); i++) { - // Min all the total supports from the non-ref alleles called as present - min_site_support = min(min_site_support, total(locus.support(genotype.allele(i)))); - min_site_quality = min(min_site_quality, locus.support(genotype.allele(i)).quality()); - } - - // Find the binomial bias between the called alleles, if multiple were called. - double ad_log_likelihood = INFINITY; - if (second_best_allele != -1) { - // How many of the less common one do we have? - size_t successes = round(total(locus.support(second_best_allele))); - // Out of how many chances - size_t trials = successes + (size_t) round(total(locus.support(best_allele))); - - assert(trials >= successes); - - // How weird is that? - ad_log_likelihood = binomial_cmf_ln(prob_to_logprob((real_t) 0.5), trials, successes); - - assert(!std::isnan(ad_log_likelihood)); - - variant.samples[sample_name]["XADL"].push_back(to_string(ad_log_likelihood)); - } else { - // No need to assign reads between two alleles - variant.samples[sample_name]["XADL"].push_back("."); - } - - // Set the variant's total depth - string depth_string = to_string((int64_t)round(total(total_support))); - variant.info["DP"].push_back(depth_string); // We only have one sample, so variant depth = sample depth - - // And for the sample - variant.samples[sample_name]["DP"].push_back(depth_string); - - // Set the sample's local and global expected depth - variant.samples[sample_name]["XDP"].push_back(to_string((int64_t)round(total(baseline_support)))); - variant.samples[sample_name]["XDP"].push_back(to_string((int64_t)round(total(global_baseline_support)))); - - // And its depth of non-0 alleles - variant.samples[sample_name]["XAAD"].push_back(to_string((int64_t)round(total(alt_support)))); - - // Set the total support quality of the min allele as the variant quality - variant.quality = min_site_quality; - - // Now do the filters - variant.filter = "PASS"; - if (min_site_support < min_mad_for_filter) { - // Apply Min Allele Depth cutoff across all alleles (even ref) - variant.filter = "lowad"; - } else if (max_dp_for_filter != 0 && total(total_support) > max_dp_for_filter) { - // Apply the max depth cutoff - variant.filter = "highabsdp"; - } else if (max_dp_multiple_for_filter != 0 && - total(total_support) > max_dp_multiple_for_filter * total(global_baseline_support)) { - // Apply the max depth multiple cutoff - // TODO: Different standard for sites called as haploid - variant.filter = "highreldp"; - } else if (max_local_dp_multiple_for_filter != 0 && - total(total_support) > max_local_dp_multiple_for_filter * total(baseline_support)) { - // Apply the max local depth multiple cutoff - // TODO: Different standard for sites called as haoploid - variant.filter = "highlocaldp"; - } else if (min_ad_log_likelihood_for_filter != 0 && - ad_log_likelihood < 
min_ad_log_likelihood_for_filter) { - // We have a het, but the assignment of reads between the two branches is just too weird - variant.filter = "lowxadl"; - } - - // Don't bother with trivial calls - if (write_trivial_calls || - (genotype_vector.back() != "./." && genotype_vector.back() != ".|." && - genotype_vector.back() != "0/0" && genotype_vector.back() != "0|0")) { - - if(can_write_alleles(variant)) { - // No need to check for collisions because we assume sites are correctly found. - // Output the created VCF variant. - cout << variant << endl; - - } else { - if (verbose) { - cerr << "Variant is too large" << endl; - } - // TODO: track bases lost again - } - } - }; - - // Recursively type the site, using that support and an assumption of a diploid sample. - find_best_traversals(augmented, site_manager, &traversal_finder, *site, baseline_support, 2, - [&locus_buffer, &emit_variant, &site_manager, &called_loci, &primary_paths, &augmented, - &covered_nodes, &covered_edges, this](const Locus& locus, const Snarl* site) { - - // Now we have the Locus with call information, and the site (either - // the root snarl we passed in or a child snarl) that the call is - // for. We need to output the call. - - if (convert_to_vcf) { - // We want to emit VCF - - // Look up the path this child site lives on. (TODO: just capture and use the path the parent lives on?) - auto found_path = find_path(*site, primary_paths); - if(found_path != primary_paths.end()) { - // And this site is on a primary path - - // Emit the variant for this Locus - emit_variant(locus, found_path->second, site); - } - // Otherwise discard it as off-path - // TODO: update bases lost - } else { - // Emit the locus itself - locus_buffer.push_back(locus); - stream::write_buffered(cout, locus_buffer, locus_buffer_size); - } - - // We called a site - called_loci++; - - // Mark all the nodes and edges in the site as covered - auto contents = site_manager.deep_contents(site, augmented.graph, true); - for (auto* node : contents.first) { - covered_nodes.insert(node); - } - for (auto* edge : contents.second) { - covered_edges.insert(edge); - } - }); - } - - if (verbose) { - cerr << "Called " << called_loci << " loci" << endl; - } - - // OK now we have handled all the real sites. But there are still nodes and - // edges that we might want to call as present or absent. - - if (!convert_to_vcf) { - - size_t extra_loci = 0; - - if (call_other_by_coverage) { - // We should look at the coverage of things off the primary path and - // make calls on them. 
- - augmented.graph.for_each_edge([&](Edge* e) { - // We want to make calls on all the edges that aren't covered yet - if (covered_edges.count(e)) { - // Skip this edge - return; - } - - // Make a couple of fake Visits - Visit from_visit; - from_visit.set_node_id(e->from()); - from_visit.set_backward(e->from_start()); - Visit to_visit; - to_visit.set_node_id(e->to()); - to_visit.set_backward(e->to_end()); - - // Make a Locus for the edge - Locus locus; - - // Give it an allele - Path* path = locus.add_allele(); - - // Fill in - *path->add_mapping() = to_mapping(from_visit, augmented.graph); - *path->add_mapping() = to_mapping(to_visit, augmented.graph); - - // Set the support - *locus.add_support() = augmented.edge_supports[e]; - *locus.mutable_overall_support() = augmented.edge_supports[e]; - - // Decide on the genotype - Genotype gt; - - // TODO: use the coverage bins - if (support_val(locus.support(0)) > support_val(PrimaryPath::get_average_support(primary_paths)) * 0.25) { - // We're closer to 1 copy than 0 copies - gt.add_allele(0); - - if (support_val(locus.support(0)) > support_val(PrimaryPath::get_average_support(primary_paths)) * 0.75) { - // We're closer to 2 copies than 1 copy - gt.add_allele(0); - } - } - // Save the genotype with 0, 1, or 2 copies. - *locus.add_genotype() = gt; - - // Send out the locus - locus_buffer.push_back(locus); - stream::write_buffered(cout, locus_buffer, locus_buffer_size); - - extra_loci++; - - // TODO: look at average node coverages and do node loci (in - // case any nodes have no edges?) - - }); - } else { - // We should just assert the existence of the primary path edges - // that weren't in something. Everything not asserted will get - // subsetted out. - - for (auto& kv : primary_paths) { - // For every primary path in the graph - auto& primary_path = kv.second; - - // Remember the end of the previous ndoe - NodeSide previous_end; - - for (auto& offset_and_side : primary_path.get_index()) { - // For every NodeSide that happens in the primary path - - // Grab the side - NodeSide here = offset_and_side.second; - - if (previous_end == NodeSide()) { - // Skip the first node and remember its end - previous_end = here.flip(); - continue; - } - - // Find the edge we crossed - Edge* crossed = augmented.graph.get_edge(previous_end, here); - assert(crossed != nullptr); - - if (covered_edges.count(crossed)) { - // If the edge we crossed is covered by a snarl, don't - // emit anything. - previous_end = here.flip(); - continue; - } - - // If the edge we're crossing isn't covered, we should - // assert the primary path here. - - // Make a couple of fake Visits - Visit from_visit; - from_visit.set_node_id(crossed->from()); - from_visit.set_backward(crossed->from_start()); - Visit to_visit; - to_visit.set_node_id(crossed->to()); - to_visit.set_backward(crossed->to_end()); - - // Make a Locus for the edge - Locus locus; - - // Give it an allele - Path* path = locus.add_allele(); - - // Fill in - *path->add_mapping() = to_mapping(from_visit, augmented.graph); - *path->add_mapping() = to_mapping(to_visit, augmented.graph); - - // Set the support - *locus.add_support() = augmented.get_support(crossed); - *locus.mutable_overall_support() = augmented.get_support(crossed); - - // Decide on the genotype of hom ref. 
- Genotype* gt = locus.add_genotype(); - gt->add_allele(0); - gt->add_allele(0); - - // Send out the locus - locus_buffer.push_back(locus); - stream::write_buffered(cout, locus_buffer, locus_buffer_size); - - extra_loci++; - - // Make sure to remember the end of the node we just did, - // for looking at the next node. - previous_end = here.flip(); - - } - } - } - - // Flush the buffer of Locus objects we have to write - stream::write_buffered(cout, locus_buffer, 0); - - if (verbose) { - cerr << "Called " << extra_loci << " extra loci with copy number estimates" << endl; - } - - } - -} - -bool SupportCaller::is_reference(const SnarlTraversal& trav, AugmentedGraph& augmented) { - - // Keep track of the previous NodeSide - NodeSide previous; - - // We'll call this function with each visit in turn. - // If it ever returns false, the whole thing is nonreference. - auto experience_visit = [&](const Visit& visit) { - // TODO: handle nested sites - assert(visit.node_id()); - - if (previous.node != 0) { - // Consider the edge from the previous visit - Edge* edge = augmented.graph.get_edge(previous, to_left_side(visit)); - - if (augmented.is_novel_edge(edge)) { - // Found a novel edge! - return false; - } - } - - if (augmented.is_novel_node(augmented.graph.get_node(visit.node_id()))) { - // This node itself is novel - return false; - } - - // Remember we want an edge from this visit when we look at the next - // one. - previous = to_right_side(visit); - - // This visit is known. - return true; - }; - - // Experience the entire traversal from start to end - for (size_t i = 0; i < trav.visit_size(); i++) { - if (!experience_visit(trav.visit(i))) { - return false; - } - } - - // And if we make it through it's a reference traversal. - return true; - -} - -bool SupportCaller::is_reference(const Path& path, AugmentedGraph& augmented) { - - // The path can't be empty because it's not clear if an empty path should be - // reference or not. - assert(path.mapping_size() != 0); - - for (size_t i = 0; i < path.mapping_size(); i++) { - // Check each mapping - auto& mapping = path.mapping(i); - - if (augmented.is_novel_node(augmented.graph.get_node(mapping.position().node_id()))) { - // We use a novel node - return false; - } - - if (i + 1 < path.mapping_size()) { - // Also look at the next mapping - auto& next_mapping = path.mapping(i + 1); - - // And see about the edge to it - Edge* edge = augmented.graph.get_edge(to_right_side(to_visit(mapping)), to_left_side(to_visit(next_mapping))); - if (augmented.is_novel_edge(edge)) { - // We used a novel edge - return false; - } - } - } - - // If we get through everything it's reference. - return true; -} - -} diff --git a/src/support_caller.hpp b/src/support_caller.hpp deleted file mode 100644 index 7fd32a7415a..00000000000 --- a/src/support_caller.hpp +++ /dev/null @@ -1,346 +0,0 @@ -#ifndef VG_SUPPORT_CALLER_HPP_INCLUDED -#define VG_SUPPORT_CALLER_HPP_INCLUDED - -#include -#include -#include -#include -#include -#include -#include -#include "vg.pb.h" -#include "vg.hpp" -#include "hash_map.hpp" -#include "utility.hpp" -#include "pileup.hpp" -#include "path_index.hpp" -#include "genotypekit.hpp" -#include "option.hpp" -#include "traversal_finder.hpp" - -namespace vg { - -using namespace std; - -/** - * SupportCaller: take an augmented graph from a Caller and produce actual calls in a - * VCF. - */ -class SupportCaller : public Configurable { - -public: - - /** - * Set up to call with default parameters. 
- */ - SupportCaller() = default; - - /** - * We use this to represent a contig in the primary path, with its index and coverage info. - */ - class PrimaryPath { - public: - /** - * Index the given path in the given augmented graph, and compute all - * the coverage bin information with the given bin size. - */ - PrimaryPath(SupportAugmentedGraph& augmented, const string& ref_path_name, size_t ref_bin_size); - - /** - * Get the support at the bin appropriate for the given primary path - * offset. - */ - const Support& get_support_at(size_t primary_path_offset) const; - - /** - * Get the index of the bin that the given path position falls in. - */ - size_t get_bin_index(size_t primary_path_offset) const; - - /** - * Get the bin with minimal coverage. - */ - size_t get_min_bin() const; - - /** - * Get the bin with maximal coverage. - */ - size_t get_max_bin() const; - - /** - * Get the support in the given bin. - */ - const Support& get_bin(size_t bin) const; - - /** - * Get the total number of bins that the path is divided into. - */ - size_t get_total_bins() const; - - /** - * Get the average support over the path. - */ - Support get_average_support() const; - - /** - * Get the average support over a collection of paths. - */ - static Support get_average_support(const map& paths); - - /** - * Get the total support for the path. - */ - Support get_total_support() const; - - /** - * Get the PathIndex for this primary path. - */ - PathIndex& get_index(); - - /** - * Get the PathIndex for this primary path. - */ - const PathIndex& get_index() const; - - /** - * Gets the path name we are representing. - */ - const string& get_name() const; - - protected: - /// How wide is each coverage bin along the path? - size_t ref_bin_size; - - /// This holds the index for this path - PathIndex index; - - /// This holds the name of the path - string name; - - /// What's the expected in each bin along the path? Coverage gets split - /// evenly over both strands. - vector binned_support; - - /// Which bin has min support? - size_t min_bin; - /// Which bin has max support? - size_t max_bin; - - /// What's the total Support over every bin? - Support total_support; - }; - - - /** - * Produce calls for the given annotated augmented graph. If a - * pileup_filename is provided, the pileup is loaded again and used to add - * comments describing variants - */ - void call(SupportAugmentedGraph& augmented, string pileup_filename = ""); - - /** - * Get the support and size for each traversal in a list. Discount support - * of minus_traversal if it's specified. Use average_support_switch_threshold and - * use_average_support to decide whether to return min or avg supports - */ - tuple, vector > get_traversal_supports_and_sizes( - SupportAugmentedGraph& augmented, SnarlManager& snarl_manager, const Snarl& site, - const vector& traversals, - const SnarlTraversal* minus_traversal = NULL); - - - /** - * For the given snarl, find the reference traversal, the best traversal, - * and the second-best traversal, recursively, if any exist. These - * traversals will be fully filled in with nodes. - * - * Only snarls which are ultrabubbles can be called. - * - * Expects the given baseline support for a diploid call. - * - * Will not return more than 1 + copy_budget SnarlTraversals, and will - * return less if some copies are called as having the same traversal. - * - * Does not deduplicate agains the ref traversal; it may be the same as the - * best or second-best. 
- * - * Uses the given copy number allowance, and emits a Locus for this Snarl - * and any child Snarls. - * - * If no path through the Snarl can be found, emits no Locus and returns no - * SnarlTraversals. - */ - vector find_best_traversals(SupportAugmentedGraph& augmented, - SnarlManager& snarl_manager, TraversalFinder* finder, const Snarl& site, - const Support& baseline_support, size_t copy_budget, - function emit_locus); - - /** - * Decide if the given SnarlTraversal is included in the original base graph - * (true), or if it represents a novel variant (false). - * - * Looks at the nodes in the traversal, and sees if their calls are - * CALL_REFERENCE or not. - * - * Handles single-edge traversals. - * - */ - bool is_reference(const SnarlTraversal& trav, AugmentedGraph& augmented); - - /** - * Decide if the given Path is included in the original base graph (true) or - * if it represents a novel variant (false). - * - * Looks at the nodes, and sees if their calls are CALL_REFERENCE or not. - * - * The path can't be empty; it has to be anchored to something (probably the - * start and end of the snarl it came from). - */ - bool is_reference(const Path& path, AugmentedGraph& augmented); - - /** - * Find the primary path, if any, that the given site is threaded onto. - * - * TODO: can only work by brute-force search. - */ - map::iterator find_path(const Snarl& site, map& primary_paths); - - /** - * Get the amount of support. Can use this function to toggle between unweighted (total from genotypekit) - * and quality-weighted (support_quality below) in one place. - */ - function support_val; - - static double support_quality(const Support& support) { - return support.quality(); - } - - // Option variables - - /// Should we output in VCF (true) or Protobuf Locus (false) format? - Option convert_to_vcf{this, "no-vcf", "V", true, - "output variants in binary Loci format instead of text VCF format"}; - /// How big should our output buffer be? - size_t locus_buffer_size = 1000; - - /// What are the names of the reference paths, if any, in the graph? - Option> ref_path_names{this, "ref", "r", {}, - "use the path with the given name as a reference path (can repeat)"}; - /// What name should we give each contig in the VCF file? Autodetected from - /// path names if empty or too short. - Option> contig_name_overrides{this, "contig", "c", {}, - "use the given name as the VCF name for the corresponding reference path (can repeat)"}; - /// What should the total sequence length reported in the VCF header be for - /// each contig? Autodetected from path lengths if empty or too short. - Option> length_overrides{this, "length", "l", {}, - "override total sequence length in VCF for the corresponding reference path (can repeat)"}; - /// What name should we use for the sample in the VCF file? - Option sample_name{this, "sample", "S", "SAMPLE", - "name the sample in the VCF with the given name"}; - /// How far should we offset positions of variants? - Option variant_offset{this, "offset", "o", 0, - "offset variant positions by this amount in VCF"}; - /// How many nodes should we be willing to look at on our path back to the - /// primary path? Keep in mind we need to look at all valid paths (and all - /// combinations thereof) until we find a valid pair. - Option max_search_depth{this, "max-search-depth", "D", 1000, - "maximum depth for path search"}; - /// How many search states should we allow on the DFS stack when searching - /// for traversals? 
- Option max_search_width{this, "max-search-width", "wWmMsS", 1000, - "maximum width for path search"}; - - - /// What fraction of average coverage should be the minimum to call a - /// variant (or a single copy)? Default to 0 because vg call is still - /// applying depth thresholding - Option min_fraction_for_call{this, "min-cov-frac", "F", 0, - "min fraction of average coverage at which to call"}; - /// What fraction of the reads supporting an alt are we willing to discount? - /// At 2, if twice the reads support one allele as the other, we'll call - /// homozygous instead of heterozygous. At infinity, every call will be - /// heterozygous if even one read supports each allele. - Option max_het_bias{this, "max-het-bias", "H", 10, - "max imbalance factor to call heterozygous, alt major on SNPs"}; - /// Like above, but applied to ref / alt ratio (instead of alt / ref) - Option max_ref_het_bias{this, "max-ref-bias", "R", 4.5, - "max imbalance factor to call heterozygous, ref major"}; - /// Like the max het bias, but applies to novel indels. - Option max_indel_het_bias{this, "max-indel-het-bias", "I", 3, - "max imbalance factor to call heterozygous, alt major on indels"}; - /// Like the max het bias, but applies to multiallelic indels. - Option max_indel_ma_bias{this, "max-indel-ma-bias", "G", 6, - "max imbalance factor between ref and alt2 to call 1/2 double alt on indels"}; - /// What's the minimum integer number of reads that must support a call? We - /// don't necessarily want to call a SNP as het because we have a single - // supporting read, even if there are only 10 reads on the site. - Option min_total_support_for_call{this, "min-count", "n", 1, - "min total supporting read count to call a variant"}; - /// Bin size used for counting coverage along the reference path. The - /// bin coverage is used for computing the probability of an allele - /// of a certain depth - Option ref_bin_size{this, "bin-size", "B", 250, - "bin size used for counting coverage"}; - /// On some graphs, we can't get the coverage because it's split over - /// parallel paths. Allow overriding here - Option expected_coverage{this, "avg-coverage", "C", 0.0, - "specify expected coverage (instead of computing on reference)"}; - /// Should we use average support instead of minimum support for our - /// calculations? - Option use_average_support{this, "use-avg-support", "u", false, - "use average instead of minimum support"}; - /// Max traversal length threshold at which we switch from minimum support - /// to average support (so we don't use average support on pairs of adjacent - /// errors and miscall them, but we do use it on long runs of reference - /// inside a deletion where the min support might not be representative. - Option average_support_switch_threshold{this, "use-avg-support-above", "uUaAtT", 100, - "use average instead of minimum support for sites this long or longer"}; - - /// What's the maximum number of bubble path combinations we can explore - /// while finding one with maximum support? - size_t max_bubble_paths = 100; - /// what's the minimum ref or alt allele depth to give a PASS in the filter - /// column? Also used as a min actual support for a second-best allele call - Option min_mad_for_filter{this, "min-mad", "E", 5, - "min. 
ref/alt allele depth to PASS filter or be a second-best allele"}; - /// what's the maximum total depth to give a PASS in the filter column - Option max_dp_for_filter{this, "max-dp", "MmDdAaXxPp", 0, - "max depth to PASS filter (0 for unlimited)"}; - /// what's the maximum total depth to give a PASS in the filter column, as a - /// multiple of the global baseline coverage? - Option max_dp_multiple_for_filter{this, "max-dp-multiple", "MmDdAaXxPp", 0, - "max portion of global expected depth to PASS filter (0 for unlimited)"}; - /// what's the maximum total depth to give a PASS in the filter column, as a - /// multiple of the local baseline coverage? - Option max_local_dp_multiple_for_filter{this, "max-local-dp-multiple", "MmLlOoDdAaXxPp", 0, - "max portion of local expected depth to PASS filter (0 for unlimited)"}; - /// what's the min log likelihood for allele depth assignments to PASS? - Option min_ad_log_likelihood_for_filter{this, "min-ad-log-likelihood", "MmAaDdLliI", -9.0, - "min log likelihood for AD assignments to PASS filter (0 for unlimited)"}; - - Option write_trivial_calls{this, "trival", "ivtTIRV", false, - "write trivial vcf calls (ex 0/0 genotypes)"}; - - /// Should we call on nodes/edges outside of snarls by coverage (true), or - /// just assert that primary path things exist and off-path things don't - /// (false)? - Option call_other_by_coverage{this, "call-nodes-by-coverage", "cCoObB", false, - "make calls on nodes/edges outside snarls by coverage"}; - - /// Use total support count (true) instead of total support quality (false) when choosing - /// top alleles and deciding gentypes based on the biases. - Option use_support_count{this, "use-support-count", "T", false, - "use total support count instead of total support quality for selecting top alleles"}; - - /// Path of supports file generated from the PileupAugmenter (via vg augment) - Option support_file_name{this, "support-file", "s", {}, - "path of file containing supports generated by vg augment -P -s"}; - - /// print warnings etc. 
to stderr - bool verbose = false; - -}; - -} - -#endif diff --git a/src/surjecting_alignment_emitter.cpp b/src/surjecting_alignment_emitter.cpp new file mode 100644 index 00000000000..0f337a7633c --- /dev/null +++ b/src/surjecting_alignment_emitter.cpp @@ -0,0 +1,78 @@ +/** + * \file surjecting_alignment_emitter.cpp + * Implementation for SurjectingAlignmentEmitter + */ + + +#include "surjecting_alignment_emitter.hpp" +#include "hts_alignment_emitter.hpp" + +#include + +namespace vg { + +using namespace std; + +SurjectingAlignmentEmitter::SurjectingAlignmentEmitter(const PathPositionHandleGraph* graph, unordered_set paths, + unique_ptr&& backing, bool prune_suspicious_anchors) : surjector(graph), paths(paths), backing(std::move(backing)) { + + // Configure the surjector + surjector.prune_suspicious_anchors = prune_suspicious_anchors; + +} + +void SurjectingAlignmentEmitter::surject_alignments_in_place(vector& alns) const { + for (auto& aln : alns) { + // Surject each alignment and annotate with surjected path position + aln = surjector.surject(aln, paths, surject_subpath_global); + } +} + +void SurjectingAlignmentEmitter::emit_singles(vector&& aln_batch) { + // Intercept the batch on its way + vector aln_batch_caught(aln_batch); + // Surject it in place + surject_alignments_in_place(aln_batch_caught); + // Forward it along + backing->emit_singles(std::move(aln_batch_caught)); +} + +void SurjectingAlignmentEmitter::emit_mapped_singles(vector>&& alns_batch) { + // Intercept the batch on its way + vector> alns_batch_caught(alns_batch); + for (auto& mappings : alns_batch_caught) { + // Surject all mappings in place + surject_alignments_in_place(mappings); + } + // Forward it along + backing->emit_mapped_singles(std::move(alns_batch_caught)); +} + +void SurjectingAlignmentEmitter::emit_pairs(vector&& aln1_batch, vector&& aln2_batch, vector&& tlen_limit_batch) { + // Intercept the batch on its way + vector aln1_batch_caught(aln1_batch); + vector aln2_batch_caught(aln2_batch); + // Surject it in place + surject_alignments_in_place(aln1_batch_caught); + surject_alignments_in_place(aln2_batch_caught); + // Forward it along + backing->emit_pairs(std::move(aln1_batch_caught), std::move(aln2_batch_caught), std::move(tlen_limit_batch)); +} + +void SurjectingAlignmentEmitter::emit_mapped_pairs(vector>&& alns1_batch, vector>&& alns2_batch, vector&& tlen_limit_batch) { + // Intercept the batch on its way + vector> alns1_batch_caught(alns1_batch); + vector> alns2_batch_caught(alns2_batch); + for (auto& mappings : alns1_batch_caught) { + // Surject all mappings in place + surject_alignments_in_place(mappings); + } + for (auto& mappings : alns2_batch_caught) { + // Surject all mappings in place + surject_alignments_in_place(mappings); + } + // Forward it along + backing->emit_mapped_pairs(std::move(alns1_batch_caught), std::move(alns2_batch_caught), std::move(tlen_limit_batch)); +} + +} diff --git a/src/surjecting_alignment_emitter.hpp b/src/surjecting_alignment_emitter.hpp new file mode 100644 index 00000000000..a9a12a0e1fd --- /dev/null +++ b/src/surjecting_alignment_emitter.hpp @@ -0,0 +1,84 @@ +#ifndef VG_SURJECTING_ALIGNMENT_EMITTER_HPP_INCLUDED +#define VG_SURJECTING_ALIGNMENT_EMITTER_HPP_INCLUDED + +/** \file + * + * Holds a surjecting wrapper AlignmentEmitter. 
+ */ + + +#include "surjector.hpp" +#include "vg/io/alignment_emitter.hpp" +#include "handle.hpp" + +#include +#include +#include + +namespace vg { + +using namespace std; + +/** + * An AlignmentEmitter implementation that surjects alignments before emitting them via a backing AlignmentEmitter, which it owns. + */ +class SurjectingAlignmentEmitter : public vg::io::AlignmentEmitter { +public: + + /** + * Surject alignments using the given graph, into the given paths, and send them to the given AlignmentEmitter. + * Takes ownership of the AlignmentEmitter. + * Copies the set of paths. + * + * If prune_suspicious_anchors is set, prunes out repetitive-looking + * anchors when surjecting and lets those parts of reads be realigned. + */ + SurjectingAlignmentEmitter(const PathPositionHandleGraph* graph, + unordered_set paths, unique_ptr&& backing, + bool prune_suspicious_anchors = false); + + /// Force full length alignment in surjection resolution + bool surject_subpath_global = true; + + + /// Emit a batch of Alignments + virtual void emit_singles(vector&& aln_batch); + /// Emit a batch of Alignments with secondaries. All secondaries must have is_secondary set already. + virtual void emit_mapped_singles(vector>&& alns_batch); + /// Emit a batch of pairs of Alignments. The tlen_limit_batch, if + /// specified, is the maximum pairing distance for each pair to flag + /// properly paired, if the output format cares about such things. TODO: + /// Move to a properly paired annotation that runs with the Alignment. + virtual void emit_pairs(vector&& aln1_batch, vector&& aln2_batch, + vector&& tlen_limit_batch); + /// Emit the mappings of a batch of pairs of Alignments. All secondaries + /// must have is_secondary set already. The tlen_limit_batch, if specified, + /// is the maximum pairing distance for each pair to flag properly paired, + /// if the output format cares about such things. TODO: Move to a properly + /// paired annotation that runs with the Alignment. + /// + /// Both ends of each pair must have the same number of mappings. + virtual void emit_mapped_pairs(vector>&& alns1_batch, + vector>&& alns2_batch, vector&& tlen_limit_batch); + +protected: + /// Surjector used to do the surjection + Surjector surjector; + + /// Paths to surject into + unordered_set paths; + + /// AlignmentEmitter to emit to once done + unique_ptr backing; + + /// Surject alignments in place. 
+ void surject_alignments_in_place(vector& alns) const; + + + + +}; + +} + +#endif diff --git a/src/surjector.cpp b/src/surjector.cpp index 92ce013c3d1..0f1f407daab 100644 --- a/src/surjector.cpp +++ b/src/surjector.cpp @@ -3,859 +3,4203 @@ * surjector.cpp: implements a class that surjects alignments onto paths */ + #include "surjector.hpp" -//#define debug_surject +#include "sequence_complexity.hpp" +#include "alignment.hpp" +#include "utility.hpp" +#include "memoizing_graph.hpp" +#include "multipath_alignment_graph.hpp" +#include "reverse_graph.hpp" + +#include "algorithms/extract_connecting_graph.hpp" +#include "algorithms/prune_to_connecting_graph.hpp" +#include "algorithms/component.hpp" + +#include "bdsg/hash_graph.hpp" + +//#define debug_spliced_surject //#define debug_anchored_surject +//#define debug_multipath_surject +//#define debug_constrictions +//#define debug_prune_unconnectable +//#define debug_filter_paths //#define debug_validate_anchored_multipath_alignment +//#define debug_always_warn_on_too_long namespace vg { using namespace std; - Surjector::Surjector(xg::XG* xg_index) : Mapper(xg_index, nullptr, nullptr) { - + Surjector::Surjector(const PathPositionHandleGraph* graph) : graph(graph) { + if (!graph) { + cerr << "error:[Surjector] Failed to provide an graph to the Surjector" << endl; + } } - Surjector::~Surjector() { + Alignment Surjector::surject(const Alignment& source, const unordered_set& paths, + bool allow_negative_scores, bool preserve_deletions) const { + + // Allocate the annotation info + string path_name_out; + int64_t path_pos_out; + bool path_rev_out; + + // Do the surjection + Alignment surjected = surject(source, paths, path_name_out, path_pos_out, path_rev_out, allow_negative_scores, preserve_deletions); + // Pack all the info into the refpos field + surjected.clear_refpos(); + auto* pos = surjected.add_refpos(); + pos->set_name(path_name_out); + pos->set_offset(path_pos_out); + pos->set_is_reverse(path_rev_out); + + return surjected; } - Alignment Surjector::surject_classic(const Alignment& source, - const set& path_names, - string& path_name, - int64_t& path_pos, - bool& path_reverse) { - - Alignment surjection = source; - // Leave the original mapping quality in place (because that's the quality - // on the placement of this read in this region at all) - surjection.clear_mapping_quality(); - surjection.clear_score(); - surjection.clear_identity(); - surjection.clear_path(); - - int count_forward=0, count_reverse=0; - for (auto& mapping : source.path().mapping()) { - if (mapping.position().is_reverse()) { - ++count_reverse; - } else { - ++count_forward; - } - } - //cerr << "fwd " << count_forward << " rev " << count_reverse << endl; - - // here we assume that people will use this on DAGs - // require that we have an alignment with a score, and that it is on one strand - if (!source.has_path() || source.path().mapping_size() == 0 - || alignment_from_length(source) == 0 - || count_forward > 0 && count_reverse > 0) { -#ifdef debug_surject - -#pragma omp critical (cerr) - cerr << "Alignment " << source.name() << " is unmapped and cannot be surjected" << endl; - -#endif - return surjection; - } - - set nodes; - for (int i = 0; i < source.path().mapping_size(); ++ i) { - nodes.insert(source.path().mapping(i).position().node_id()); - } - VG graph; - for (auto& node : nodes) { - *graph.graph.add_node() = xindex->node(node); - } - xindex->expand_context(graph.graph, 3, true); // get connected edges and path - graph.paths.append(graph.graph); - 
graph.rebuild_indexes(); - VG base_graph = graph; - - // non-fiddly approach, rest on augmentation - // 0) remove softclips from the read - // 1) augment the graph with the read - // 2) keep the ref path and the aln path both in the graph - // 3) detach the nodes on the other sides of the aln path start and end from all other nodes - // 4) remove the non-path component - - Alignment trimmed_source = strip_from_end(strip_from_start(source, softclip_start(source)), softclip_end(source)); - // check if we'd fail - if (trimmed_source.sequence().size() == 0) { - return surjection; - } - - vector source_path; - source_path.push_back(trimmed_source.path()); - source_path.back().set_name(source.name()); - // Make sure to pass true here to embed the alignment - auto translation = graph.edit(source_path, true); //, true, true); - Translator translator(translation); - Path source_in_graph = graph.paths.path(source.name()); - Position start_pos = make_position(initial_position(source_in_graph)); - Position end_pos = make_position(final_position(source_in_graph)); - //cerr << "start and end pos " << pb2json(start_pos) << " " << pb2json(end_pos) << endl; - - //Position start_pos = make_position(initial_position(trimmed_source.path())); - //Position end_pos = make_position(final_position(trimmed_source.path())); - - // find then unlink the next and previous path nodes from the rest of the graph to isolate the path-specific component - handle_t start = graph.get_handle(start_pos.node_id(), start_pos.is_reverse()); - handle_t end = graph.get_handle(end_pos.node_id(), end_pos.is_reverse()); - handle_t cut; - bool found = false; - unordered_set curr; - unordered_set next; - auto find_path = [&](const handle_t& h) { - vector path_intersection; - set node_paths = graph.paths.of_node(graph.get_id(h)); - //cerr << "Node paths for " << graph.get_id(h) << " " << node_paths.size() << endl; - if (!node_paths.empty()) { - std::set_intersection(path_names.begin(), path_names.end(), - node_paths.begin(), node_paths.end(), - std::back_inserter(path_intersection)); - } - cut = h; - found = path_intersection.size() > 0; - //cerr << "path intersection size " << path_intersection.size() << endl; - next.insert(h); - return !found; - }; - found = false; - curr.insert(start); - //cerr << "going back" << endl; - while (!curr.empty()) { - bool finished = false; - //cerr << "cur has " << curr.size() << endl; - for (auto& h : curr) { - finished |= !graph.follow_edges(h, true, find_path); - if (finished) break; - } - if (finished) { - curr.clear(); - next.clear(); - break; - } else { - curr = next; - next.clear(); - } - } - handle_t cut_before = cut; - bool found_forward = found; - curr.insert(end); - //ncerr << "going forward" << endl; - while (!curr.empty()) { - bool finished = false; - //cerr << "cur has " << curr.size() << endl; - for (auto& h : curr) { - finished |= !graph.follow_edges(h, false, find_path); - if (finished) break; - } - if (finished) { - curr.clear(); - next.clear(); - break; - } else { - curr = next; - next.clear(); - } - } - handle_t cut_after = cut; - //cerr << "cut before " << graph.get_id(cut_before) << endl; - //cerr << "cut after " << graph.get_id(cut_after) << endl; - bool found_reverse = found; - //graph.serialize_to_file("before-" + source.name() + ".vg"); - //graph.serialize_to_file("before-" + graph.hash() + ".vg"); - - set kept_paths; - graph.keep_paths(path_names, kept_paths); - graph.remove_non_path(); - // by definition we have found path - if (found_forward && found_reverse && cut_before == 
cut_after) { - graph.destroy_handle(cut_before); - } else { - if (found_forward) graph.destroy_handle(cut_before); - if (found_reverse) graph.destroy_handle(cut_after); - } - //graph.serialize_to_file("after-" + source.name() + ".vg"); - //graph.serialize_to_file("after-" + graph.hash() + ".vg"); - -#ifdef debug_surject - cerr << "src " << pb2json(source) << endl; - cerr << "start " << pb2json(start_pos) << endl; - cerr << "end " << pb2json(end_pos) << endl; - cerr << "graph " << pb2json(graph.graph) << endl; -#endif - //Position end_pos = alignment_end(source); - // assume DAG - set target_ids; - for (auto& mapping : source_in_graph.mapping()) { - target_ids.insert(mapping.position().node_id()); - } - - // otherwise, two cuts - // remove the links in both cases - // we can clean up by removing - - // get only the subgraph that we want to align to - list subgraphs; - graph.disjoint_subgraphs(subgraphs); - - // Align the old alignment to the graph in both orientations. Apparently - // align only does a single oriantation, and we have no idea, even looking - // at the mappings, which of the orientations will correspond to the one the - // alignment is actually in. - - Graph subgraph; - for (auto& graph : subgraphs) { - //cerr << pb2json(graph.graph) << endl; - bool found = false; - graph.for_each_handle([&](const handle_t& h) { - if (!found && target_ids.count(graph.get_id(h))) { - found = true; - } - }); - if (found) { - subgraph = graph.graph; - break; - } - } - if (subgraph.node_size() == 0) { - // couldn't find subgraph, try the one we've got - subgraph = graph.graph; - } + vector Surjector::multi_surject(const Alignment& source, + const unordered_set& paths, + bool allow_negative_scores, + bool preserve_deletions) const { + vector surjected; + vector> positions; + surject_internal(&source, nullptr, &surjected, nullptr, paths, positions, + true, allow_negative_scores, preserve_deletions); - if (subgraph.node_size() == 0) { - return surjection; //empty graph, avoid further warnings + for (size_t i = 0; i < surjected.size(); ++i) { + surjected[i].clear_refpos(); + auto* pos = surjected[i].add_refpos(); + pos->set_name(get<0>(positions[i])); + pos->set_offset(get<1>(positions[i])); + pos->set_is_reverse(get<2>(positions[i])); } - // DAG assumption - sort_by_id_dedup_and_clean(subgraph); -#ifdef debug_surject - cerr << "sub " << pb2json(subgraph) << endl; -#endif - - // Flip the string and its quality around - Alignment surjection_fwd = surjection; - Alignment surjection_rev = surjection; - int start_softclip_length = softclip_start(source); - int end_softclip_length = softclip_end(source); - Alignment start_softclip_fwd, end_softclip_fwd; - Alignment start_softclip_rev, end_softclip_rev; - if (start_softclip_length) { - start_softclip_fwd = strip_from_end(surjection_fwd, surjection_fwd.sequence().size() - start_softclip_length); - end_softclip_rev = reverse_complement_alignment(start_softclip_fwd, [&](id_t id) { return xindex->node_length(id); }); - } - if (end_softclip_length) { - end_softclip_fwd = strip_from_start(surjection_fwd, surjection_fwd.sequence().size() - end_softclip_length); - start_softclip_rev = reverse_complement_alignment(end_softclip_fwd, [&](id_t id) { return xindex->node_length(id); }); - } - surjection_fwd = strip_from_end(strip_from_start(surjection_fwd, start_softclip_length), end_softclip_length); - surjection_rev = reverse_complement_alignment(surjection_fwd, [&](id_t id) { return xindex->node_length(id); }); - - // align to the graph with a big full len, and 
simplify without removal of internal deletions, as we'll need these for BAM reconstruction - Alignment surjection_forward, surjection_reverse; - int fwd_score = 0, rev_score = 0; - // override the full length bonus - int8_t full_length_bonus_override = 30; - int8_t saved_bonus = get_aligner(!surjection.quality().empty())->full_length_bonus; - get_aligner(!surjection.quality().empty())->full_length_bonus = full_length_bonus_override; - if (count_forward) { - surjection_forward = simplify(align_to_graph(surjection_fwd, graph.graph, max_query_graph_ratio, true, false, false, false, false, false), false); - fwd_score = surjection_forward.score(); - } - if (count_reverse) { - surjection_reverse = simplify(align_to_graph(surjection_rev, graph.graph, max_query_graph_ratio, true, false, false, false, false, false), false); - rev_score = surjection_reverse.score(); - } - // reset bonus because hacks - get_aligner(!surjection.quality().empty())->full_length_bonus = saved_bonus; + return surjected; + } + + Alignment Surjector::surject(const Alignment& source, const unordered_set& paths, string& path_name_out, + int64_t& path_pos_out, bool& path_rev_out, bool allow_negative_scores, + bool preserve_deletions) const { + vector surjected; + vector> position; + surject_internal(&source, nullptr, &surjected, nullptr, paths, position, + false, allow_negative_scores, preserve_deletions); + path_name_out = get<0>(position.front()); + path_pos_out = get<1>(position.front()); + path_rev_out = get<2>(position.front()); + return move(surjected.front()); + } + + vector Surjector::multi_surject(const Alignment& source, + const unordered_set& paths, + vector>& positions_out, + bool allow_negative_scores, + bool preserve_deletions) const { + vector surjected; + surject_internal(&source, nullptr, &surjected, nullptr, paths, positions_out, + true, allow_negative_scores, preserve_deletions); -#ifdef debug_surject - cerr << "fwd " << pb2json(surjection_forward) << endl; - cerr << "rev " << pb2json(surjection_reverse) << endl; -#endif + return surjected; + } + + multipath_alignment_t Surjector::surject(const multipath_alignment_t& source, const unordered_set& paths, + string& path_name_out, int64_t& path_pos_out, bool& path_rev_out, + bool allow_negative_scores, bool preserve_deletions) const { + + vector surjected; + vector> position; + surject_internal(nullptr, &source, nullptr, &surjected, paths, position, + false, allow_negative_scores, preserve_deletions); - graph = base_graph; - // We need this for inverting mappings to the correct strand - function node_length = [&graph](id_t node) { - return graph.get_node(node)->sequence().size(); - }; + path_name_out = move(get<0>(position.front())); + path_pos_out = get<1>(position.front()); + path_rev_out = get<2>(position.front()); -#ifdef debug_surject -#pragma omp critical (cerr) - cerr << surjection.name() << " " << surjection_forward.score() << " forward score, " << surjection_reverse.score() << " reverse score" << endl; -#endif + return move(surjected.front()); + } + + vector Surjector::multi_surject(const multipath_alignment_t& source, + const unordered_set& paths, + vector>& positions_out, + bool allow_negative_scores, + bool preserve_deletions) const { + vector surjected; + surject_internal(nullptr, &source, nullptr, &surjected, paths, positions_out, + true, allow_negative_scores, preserve_deletions); - // translate - try { - if (count_forward) surjection_forward = translator.translate(surjection_forward); - if (count_reverse) surjection_reverse = 
translator.translate(surjection_reverse); - } catch (...) { - cerr << "[vg Mapper::surject_alignment] warning: surjection failure with read " << source.name() << endl; - return surjection; - } + return surjected; + } + + void Surjector::surject_internal(const Alignment* source_aln, const multipath_alignment_t* source_mp_aln, + vector* alns_out, vector* mp_alns_out, + const unordered_set& paths, + vector>& positions_out, bool all_paths, + bool allow_negative_scores, bool preserve_deletions) const { + - // reattach soft clips and set original score (score isn't really used through...) - if (count_forward) { - surjection_forward = merge_alignments({start_softclip_fwd, surjection_forward, end_softclip_fwd}); - surjection_forward.set_score(fwd_score); + // we need one and only one data type: Alignment or multipath_alignment_t + assert(!(source_aln && source_mp_aln)); + assert((source_aln && alns_out) || (source_mp_aln && mp_alns_out)); + +#ifdef debug_anchored_surject + cerr << "surjecting alignment "; + if (source_mp_aln) { + cerr << "with " << source_mp_aln->subpath_size() << " subpaths " << debug_string(*source_mp_aln); } - if (count_reverse) { - surjection_reverse = merge_alignments({start_softclip_rev, surjection_reverse, end_softclip_rev}); - surjection_reverse.set_score(rev_score); + else { + cerr << pb2json(*source_aln); } - - // choose - if (count_reverse && count_forward) { - if (surjection_reverse.score() > surjection_forward.score()) { - surjection = reverse_complement_alignment(surjection_reverse, [&](id_t id) { return xindex->node_length(id); }); - } else { - surjection = surjection_forward; - } + cerr << " onto "; + if (paths.size() > 100) { + cerr << paths.size() << " paths"; } else { - if (count_reverse) { - surjection = reverse_complement_alignment(surjection_reverse, [&](id_t id) { return xindex->node_length(id); }); - } else { - surjection = surjection_forward; + cerr << "paths "; + for (const path_handle_t& path : paths) { + cerr << graph->get_path_name(path) << " "; } } - - surjection = simplify(surjection, false); - -#ifdef debug_surject - -#pragma omp critical (cerr) - cerr << surjection.path().mapping_size() << " mappings, " << kept_paths.size() << " paths" << endl; - + cerr << endl; #endif - //assert(check_alignment(surjection)); - if (surjection.path().mapping_size() > 0 && kept_paths.size() == 1) { - // determine the paths of the node we mapped into - // ... get the id of the first node, get the paths of it - assert(kept_paths.size() == 1); - path_name = *kept_paths.begin(); - - int64_t path_id = xindex->path_rank(path_name); - auto& first_pos = surjection.path().mapping(0).position(); - int64_t hit_id = surjection.path().mapping(0).position().node_id(); - bool hit_backward = surjection.path().mapping(0).position().is_reverse(); - // we pick up positional information using the index - - //cerr << "hit id " << hit_id << endl; - auto path_posns = xindex->position_in_path(hit_id, path_name); - if (path_posns.size() > 1) { - cerr << "[vg map] surject_alignment: warning, multiple positions for node " << hit_id << " in " << path_name << " but will use only first: " << path_posns.front() << endl; - } else if (path_posns.size() == 0) { - cerr << "[vg map] surject_alignment: error, no positions for alignment " << source.name() << endl; + + if (source_aln && source_aln->path().mapping_size() != 0) { + // The read is mapped. Check the input alignment for basic + // consistency. 
If the sequence and the graph path don't agree + // about the read length, something is very wrong with the input. + size_t source_to_length = path_to_length(source_aln->path()); + if (source_aln->sequence().size() != source_to_length) { + cerr << "error[Surjector::surject]: read " << source_aln->name() << " has " + << source_aln->sequence().size() << " sequence bases but an input alignment that aligns " + << source_to_length << " bases instead. This is invalid and uninterpretable; check your mapper." << endl; + cerr << "error[Surjector::surject]: offending alignment: " << pb2json(*source_aln) << endl; + exit(1); + } + + // do we need to simplify a complicated multipath alignment? 
+ multipath_alignment_t simplified_source_mp_aln; + // TODO: magic number + if (source_mp_aln && source_mp_aln->subpath_size() > 8 * source_mp_aln->sequence().size()) { + // the multipath alignment seems to have a very complex topology, we'll simplify it before + // generating a whole bunch of low-quality anchors + + // it's hard to know how much we need to prune to tame this thing, so we try tighter and tighter thresholds + int32_t diff = max(optimal_alignment_score(*source_mp_aln), 0); + do { + diff /= 2; + + // copy the const alignment so we can modify it + simplified_source_mp_aln = *source_mp_aln; + remove_low_scoring_sections(simplified_source_mp_aln, diff); + #ifdef debug_anchored_surject - cerr << "surjecting alignment: " << pb2json(source) << " onto paths "; - for (const string& path_name : path_names) { - cerr << path_name << " "; - } - cerr << endl; + cerr << "simplified complicated multipath alignment from " << source_mp_aln->subpath_size() << " to " << simplified_source_mp_aln.subpath_size() << " subpaths using score diff " << diff << endl; #endif - - // translate the path names into ranks for the XG - unordered_map path_rank_to_name; - for (const string& path_name : path_names) { - path_rank_to_name[xindex->path_rank(path_name)] = path_name; + + } while (diff > 1 && simplified_source_mp_aln.subpath_size() > 8 * source_mp_aln->sequence().size()); + + source_mp_aln = &simplified_source_mp_aln; } - - // memos for expensive succinct operations that may be repeated - unordered_map> paths_of_node_memo; - unordered_map, vector>> oriented_occurrences_memo; + // make an overlay that will memoize the results of some expensive XG operations + MemoizingGraph memoizing_graph(graph); // get the chunks of the aligned path that overlap the ref path - auto path_overlapping_anchors = extract_overlapping_paths(source, path_rank_to_name, &paths_of_node_memo, &oriented_occurrences_memo); + unordered_map, vector>> connections; + auto path_overlapping_anchors = source_aln ? extract_overlapping_paths(&memoizing_graph, *source_aln, paths) + : extract_overlapping_paths(&memoizing_graph, *source_mp_aln, + paths, connections); + + if (source_mp_aln) { + // the multipath alignment anchor algorithm can produce redundant paths if + // the alignment's graph is not parsimonious, so we filter the shorter ones out + for (pair, pair, vector>>>& path_chunk_record : path_overlapping_anchors) { + filter_redundant_path_chunks(path_chunk_record.first.second, path_chunk_record.second.first, path_chunk_record.second.second, + connections[path_chunk_record.first]); + } + } #ifdef debug_anchored_surject cerr << "got path overlapping segments" << endl; - for (const auto& path_record : path_overlapping_anchors) { - cerr << "path rank " << path_record.first << endl; - for (auto& anchor : path_record.second) { - cerr << "\t read[" << (anchor.first.first - source.sequence().begin()) << ":" << (anchor.first.second - source.sequence().begin()) << "] : "; + for (const auto& surjection_record : path_overlapping_anchors) { + cerr << "path " << graph->get_path_name(surjection_record.first.first) << ", rev? 
" << surjection_record.first.second << endl; + + for (size_t i = 0; i < surjection_record.second.first.size(); ++i) { + auto& anchor = surjection_record.second.first[i]; + if (source_aln) { + cerr << "\tread[" << (anchor.first.first - source_aln->sequence().begin()) << ":" << (anchor.first.second - source_aln->sequence().begin()) << "] : "; + } + else { + cerr << "\tread[" << (anchor.first.first - source_mp_aln->sequence().begin()) << ":" << (anchor.first.second - source_mp_aln->sequence().begin()) << "] : "; + } for (auto iter = anchor.first.first; iter != anchor.first.second; iter++) { cerr << *iter; } cerr << endl; + cerr << "\tpath interval " << graph->get_position_of_step(surjection_record.second.second[i].first) << " - " << graph->get_position_of_step(surjection_record.second.second[i].second) << endl; cerr << "\t" << pb2json(anchor.second) << endl; } + if (connections.count(surjection_record.first)) { + cerr << "\tconnections" << endl; + for (const auto& connection : connections[surjection_record.first]) { + cerr << "\t\t" << get<0>(connection) << " -> " << get<1>(connection) << " (" << get<2>(connection) << ")" << endl; + } + } } #endif - // the surjected alignment for each path we overlapped - unordered_map path_surjections; - for (pair>& path_record : path_overlapping_anchors) { -#ifdef debug_anchored_surject - cerr << "found overlaps on path " << path_record.first << ", performing surjection" << endl; -#endif + // we want to remove anchors that can be error-prone: short anchors in the tails and anchors in + // low complexity sequences + for (auto it = path_overlapping_anchors.begin(); it != path_overlapping_anchors.end(); ++it) { + auto& path_chunks = it->second.first; + auto& step_ranges = it->second.second; - const xg::XGPath& xpath = xindex->get_path(path_rank_to_name[path_record.first]); + // Compute the lengths of all anchors + std::vector anchor_lengths; + anchor_lengths.reserve(path_chunks.size()); + for (auto& chunk : path_chunks) { + anchor_lengths.push_back(path_from_length(chunk.second)); + } + auto anchors_by_length = sort_permutation(anchor_lengths.begin(), anchor_lengths.end(), [&](const size_t& a, const size_t& b) { + // Return true if the anchor with length a has to come first because it is longer. + return a > b; + }); - // find the interval of the ref path we need to consider - pair ref_path_interval = compute_path_interval(source, path_record.first, xpath, path_record.second, - &oriented_occurrences_memo); + vector keep(path_chunks.size(), true); + if (prune_suspicious_anchors) { + for (int i = 0; i < path_chunks.size(); ++i) { + auto& chunk = path_chunks[i]; + // Mark anchors that are themselves suspicious as not to be kept. 
+ if (((i == 0 || i + 1 == path_chunks.size()) && path_chunks.size() != 1) + && anchor_lengths[i] <= max_tail_anchor_prune && + chunk.first.second - chunk.first.first <= max_tail_anchor_prune) { #ifdef debug_anchored_surject + cerr << "anchor " << i << " pruned for being a short tail" << endl; #endif + // this is a short anchor on one of the tails + keep[i] = false; + continue; + } + SeqComplexity<6> complexity(chunk.first.first, chunk.first.second); + for (int order = 1; order <= 6; ++order) { + if (complexity.p_value(order) < low_complexity_p_value) { #ifdef debug_anchored_surject + cerr << "anchor " << i << " pruned for being low complexity at order " << order << " with p-value " << complexity.p_value(order) << " and repetitive fraction " << complexity.repetitiveness(order) << endl; #endif + // the sequence is repetitive at this order + keep[i] = false; + break; + } + } + } + } + size_t kept_anchors = 0; + for (auto& i : anchors_by_length) { + // For each anchor longest to shortest + if (kept_anchors < max_anchors) { + // If we can keep it + if (keep[i]) { + // And we want to keep it + // Remember we kept one + kept_anchors++; + } + } else { + // After we keep enough, all other anchors can't be kept. 
#ifdef debug_anchored_surject - cerr << "made multipath alignment " << pb2json(mp_aln) << endl; + cerr << "anchor " << i << " pruned because we already have " << max_anchors << " anchors" << endl; #endif - -#ifdef debug_validate_anchored_multipath_alignment - if (!validate_multipath_alignment(mp_aln, *xindex)) { - cerr << "WARNING: multipath alignment for surjection of " << source.name() << " failed to validate" << endl; + keep[i] = false; + } } -#endif - // concatenate the subpaths - Alignment& surjected = path_surjections[path_record.first]; - optimal_alignment(mp_aln, surjected); - - // transfer applicable metadata - surjected.set_mapping_quality(source.mapping_quality()); - if (source.has_fragment_next()) { - *surjected.mutable_fragment_next() = source.fragment_next(); - } - if (source.has_fragment_prev()) { - *surjected.mutable_fragment_prev() = source.fragment_prev(); + // make sure we didn't flag all of the anchors for removal + bool keep_any = false; + for (bool b : keep) { + keep_any = keep_any || b; } - + if (kept_anchors == 0) { + // we filtered out all of the anchors, choose the longest one to keep + // even though it failed the filter + if (!anchors_by_length.empty()) { + auto max_idx = anchors_by_length.at(0); #ifdef debug_anchored_surject - cerr << "concatenated and translated alignment " << pb2json(surjected) << endl; + cerr << "reversing decision to prune " << max_idx << endl; #endif + keep[max_idx] = true; + } + } + // we're keeping at least one anchor, so we should be able to throw away the other ones + int removed_so_far = 0; + for (int i = 0; i < path_chunks.size(); ++i) { + if (!keep[i]) { + ++removed_so_far; + } + else if (removed_so_far) { + path_chunks[i - removed_so_far] = move(path_chunks[i]); + step_ranges[i - removed_so_far] = move(step_ranges[i]); + } + } + if (removed_so_far) { + path_chunks.resize(path_chunks.size() - removed_so_far); + step_ranges.resize(step_ranges.size() - removed_so_far); + } + } + + // the surjected alignment for each path we overlapped + unordered_map, pair>> aln_surjections; + unordered_map, pair>> mp_aln_surjections; + for (pair, pair, vector>>>& surj_record : path_overlapping_anchors) { + // to hold the path interval that corresponds to the path we surject to + pair path_range; + if (!preserve_deletions && source_aln) { + // unspliced GAM -> GAM surjection + auto surjection = realigning_surject(&memoizing_graph, *source_aln, surj_record.first.first, surj_record.first.second, + surj_record.second.first, surj_record.second.second, path_range, allow_negative_scores); + if (surjection.path().mapping_size() != 0) { + aln_surjections[surj_record.first] = make_pair(move(surjection), path_range); + } + } + else if (source_aln) { + // spliced GAM -> GAM surjection + auto surjection = spliced_surject(&memoizing_graph, source_aln->sequence(), source_aln->quality(), + source_aln->mapping_quality(), surj_record.first.first, surj_record.first.second, + surj_record.second.first, surj_record.second.second, + connections[surj_record.first], path_range, + allow_negative_scores, preserve_deletions); + if (surjection.subpath_size() != 0) { + // this internal method is written for multipath alignments, so we need to convert to standard alignments + aln_surjections[surj_record.first] = make_pair(Alignment(), path_range); + auto& surjected_aln = aln_surjections[surj_record.first].first; + optimal_alignment(surjection, surjected_aln, allow_negative_scores); + } + } + else { + // surjecting a multipath alignment (they always use the spliced pathway even if 
not + // doing spliced alignment) + auto surjection = spliced_surject(&memoizing_graph, source_mp_aln->sequence(), + source_mp_aln->quality(), source_mp_aln->mapping_quality(), + surj_record.first.first, surj_record.first.second, + surj_record.second.first, surj_record.second.second, + connections[surj_record.first], path_range, + allow_negative_scores, preserve_deletions); + if (surjection.subpath_size() != 0) { + // the surjection was a success + + // copy over annotations + // TODO: also redundantly copies over sequence and quality + transfer_read_metadata(*source_mp_aln, surjection); + + // record the result for this path + mp_aln_surjections[surj_record.first] = make_pair(move(surjection), path_range); + } + } } // in case we didn't overlap any paths, add a sentinel so the following code still executes correctly - if (path_surjections.empty()) { - path_surjections[0] = make_null_alignment(source); + if (aln_surjections.empty() && mp_aln_surjections.empty()) { + // this surjection didn't get aligned + positions_out.emplace_back("", -1, false); + if (source_mp_aln) { + mp_alns_out->emplace_back(make_null_mp_alignment(source_mp_aln->sequence(), source_mp_aln->quality())); + // copy over annotations + // TODO: also redundantly copies over sequence and quality + transfer_read_metadata(*source_mp_aln, mp_alns_out->back()); + } + else { + alns_out->emplace_back(make_null_alignment(*source_aln)); + } + return; } - // choose which path surjection was best - size_t best_path_rank; - int32_t score = numeric_limits::min(); - for (const auto& surjection : path_surjections) { - if (surjection.second.score() >= score) { - score = surjection.second.score(); - best_path_rank = surjection.first; + string annotation_string; + if (annotate_with_all_path_scores) { + if (source_aln) { + annotation_string = path_score_annotations(aln_surjections); + } + else { + annotation_string = path_score_annotations(mp_aln_surjections); } } - // which path was it? - path_name_out = path_rank_to_name[best_path_rank]; - - Alignment& best_surjection = path_surjections[best_path_rank]; - - // find the position along the path - const xg::XGPath& best_xpath = xindex->get_path(path_name_out); - set_path_position(best_surjection, best_path_rank, best_xpath, path_name_out, path_pos_out, path_rev_out, &oriented_occurrences_memo); - + // choose which path strands we will output + vector> strands_to_output; + if (all_paths) { + vector> path_strands; + if (source_aln) { + for (const auto& surjection : aln_surjections) { + path_strands.emplace_back(surjection.second.first.score(), + surjection.first.first, surjection.first.second); + } + } + else { + for (const auto& surjection : mp_aln_surjections) { + path_strands.emplace_back(optimal_alignment_score(surjection.second.first, allow_negative_scores), + surjection.first.first, surjection.first.second); + } + } + sort(path_strands.begin(), path_strands.end()); + for (const auto& path_strand : path_strands) { + strands_to_output.emplace_back(get<1>(path_strand), get<2>(path_strand)); + } + } + else { + // choose which path surjection was best + pair best_path_strand; + int32_t score = numeric_limits::min(); + for (const auto& surjection : aln_surjections) { + if (surjection.second.first.score() >= score) { #ifdef debug_anchored_surject - cerr << "chose path " << path_name_out << " at position " << path_pos_out << (path_rev_out ? 
"-" : "+") << endl; + cerr << "surjection against path " << graph->get_path_name(surjection.first.first) << " strand " << surjection.first.second << " achieves highest score of " << surjection.second.first.score() << ": " << pb2json(surjection.second.first) << endl; #endif - return move(best_surjection); - } - - unordered_map> - Surjector::extract_overlapping_paths(const Alignment& source, const unordered_map& path_rank_to_name, - unordered_map>* paths_of_node_memo, - unordered_map, vector>>* oriented_occurrences_memo) { - - - unordered_map> to_return; - - const Path& path = source.path(); - - // for each path rank that we're extending, the offset and relative orientation of the previous node - unordered_map>> offset_and_orientations_on_paths; - int64_t through_to_length = 0; + score = surjection.second.first.score(); + best_path_strand = surjection.first; + } + } + for (const auto& surjection : mp_aln_surjections) { + + int32_t surj_score = optimal_alignment_score(surjection.second.first, allow_negative_scores); + if (surj_score >= score) { +#ifdef debug_anchored_surject + cerr << "surjection against path " << graph->get_path_name(surjection.first.first) << " strand " << surjection.first.second << " achieves highest score of " << surj_score << ": " << debug_string(surjection.second.first) << endl; +#endif + score = surj_score; + best_path_strand = surjection.first; + } + } + strands_to_output.emplace_back(best_path_strand); + } - for (size_t i = 0; i < path.mapping_size(); i++) { - - int64_t before_to_length = through_to_length; - through_to_length += mapping_to_length(path.mapping(i)); + for (size_t i = 0; i < strands_to_output.size(); ++i) { + const auto& path_strand = strands_to_output[i]; - const Position& pos = path.mapping(i).position(); - vector paths_of_node = xindex->memoized_paths_of_node(pos.node_id(), paths_of_node_memo); + // find the position along the path - unordered_set paths_here; - for (size_t path_rank : paths_of_node) { - if (path_rank_to_name.count(path_rank)) { - paths_here.insert(path_rank); + // retrieve the first/last positions of the best alignment and the corresponding + // path range + pair path_range; + pos_t initial_pos, final_pos; + if (source_aln) { + auto& surjection = aln_surjections[path_strand]; + initial_pos = initial_position(surjection.first.path()); + final_pos = final_position(surjection.first.path()); + path_range = surjection.second; + alns_out->emplace_back(move(surjection.first)); + + if (i != 0 || source_aln->is_secondary()) { + alns_out->back().set_is_secondary(true); + } + + if (annotate_with_all_path_scores) { + set_annotation(alns_out->back(), "all_scores", annotation_string); } - } - - if (paths_here.empty()) { - // none of the paths that this node is on are in the list we're surjecting onto - offset_and_orientations_on_paths.clear(); } else { - // we're on at least one path that we're surjecting onto + auto& surjection = mp_aln_surjections[path_strand]; + initial_pos = initial_position(surjection.first.subpath().front().path()); + final_pos = final_position(surjection.first.subpath().back().path()); + path_range = surjection.second; + mp_alns_out->emplace_back(move(surjection.first)); - // for each path - for (size_t path_rank : paths_here) { - - // we'll need to know where this node occurs on the path - auto occurrences = xindex->memoized_oriented_occurrences_on_path(pos.node_id(), path_rank, - oriented_occurrences_memo); - - // the chunks of the alignments along this path - vector& path_chunks = to_return[path_rank]; - - // get the 
location(s) we were extending along the path in the previous iteration (if any) - auto& offset_and_orientations_on_path = offset_and_orientations_on_paths[path_rank]; - - if (offset_and_orientations_on_paths.count(path_rank)) { - // we were on the path before too, so we might be extending an existing chunk - - // do any of these locations match up with where we are now? - unordered_set> next_offsets_and_orientations; - for (pair& occurrence : occurrences) { - bool on_reverse = (occurrence.second != pos.is_reverse()); - size_t prev_offset = on_reverse ? occurrence.first + 1 : occurrence.first - 1; - - if (offset_and_orientations_on_path.count(make_pair(prev_offset, on_reverse))) { - next_offsets_and_orientations.emplace(occurrence.first, on_reverse); - } - } - - if (next_offsets_and_orientations.empty()) { - // we're still on the path, but we didn't follow the next edge in the path so this - // is actually the start of a new chunk not a continuation of the current chunk - - // initialize a path chunk - path_chunks.emplace_back(); - path_chunks.back().first.first = source.sequence().begin() + before_to_length; - - // mark the locations of this node along the path - offset_and_orientations_on_path.clear(); - for (pair& occurrence : occurrences) { - offset_and_orientations_on_path.emplace(occurrence.first, occurrence.second != pos.is_reverse()); + if (i != 0) { + mp_alns_out->back().set_annotation("secondary", true); + } + else if (i == 0 && source_mp_aln->has_annotation("secondary")) { + auto annotation = source_mp_aln->get_annotation("secondary"); + assert(annotation.first == multipath_alignment_t::Bool); + mp_alns_out->back().set_annotation("secondary", *((bool*) annotation.second)); + } + + if (annotate_with_all_path_scores) { + mp_alns_out->back().set_annotation("all_scores", annotation_string); + } + } + + // use this info to set the path position + positions_out.emplace_back(); + set_path_position(&memoizing_graph, initial_pos, final_pos, path_range.first, path_range.second, + path_strand.second, get<0>(positions_out.back()), get<1>(positions_out.back()), + get<2>(positions_out.back())); + + +#ifdef debug_anchored_surject + cerr << "chose path " << get<0>(positions_out.back()) << " at position " << get<1>(positions_out.back()) << (get<2>(positions_out.back()) ? 
"-" : "+") << endl; +#endif + } + + } + + vector> Surjector::reverse_adjacencies(const vector>& adj) const { + // make a reverse adjacency list + vector> rev_adj(adj.size()); + for (size_t i = 0; i < adj.size(); ++i) { + for (size_t j : adj[i]) { + rev_adj[j].push_back(i); + } + } + return rev_adj; + } + + vector Surjector::connected_components(const vector>& adj, const vector>& rev_adj, + size_t* num_comps_out) const { + + // DFS to find connected components + vector enqueued(adj.size(), false); + vector comps(adj.size()); + size_t curr_comp = 0; + for (size_t i = 0; i < adj.size(); ++i) { + if (!enqueued[i]) { + vector stack(1, i); + enqueued[i] = true; + while (!stack.empty()) { + size_t here = stack.back(); + stack.pop_back(); + comps[here] = curr_comp; + for (const vector>* adj_list : {&adj, &rev_adj}) { + for (size_t j : (*adj_list)[here]) { + if (!enqueued[j]) { + stack.push_back(j); + enqueued[j] = true; } } - else { - // keep track of where the next positions should come from - offset_and_orientations_on_path = move(next_offsets_and_orientations); - } - } - else { - // this is the start of a new chunk, initialize it with a mpapping - path_chunks.emplace_back(); - path_chunks.back().first.first = source.sequence().begin() + before_to_length; - - // mark the locations of this node along the path - offset_and_orientations_on_path.clear(); - for (pair& occurrence : occurrences) { - offset_and_orientations_on_path.emplace(occurrence.first, occurrence.second != pos.is_reverse()); - } } - // extend the path chunk by this mapping - path_chunks.back().first.second = source.sequence().begin() + through_to_length; - *path_chunks.back().second.add_mapping() = path.mapping(i); } - // if we've left any paths, we need to remove them from the locations index - vector to_erase; - for (const auto& path_record : offset_and_orientations_on_paths) { - if (!paths_here.count(path_record.first)) { - to_erase.push_back(path_record.first); - } - } - for (size_t path_rank : to_erase) { - offset_and_orientations_on_paths.erase(path_rank); - } + curr_comp += 1; } } - return to_return; + if (num_comps_out) { + *num_comps_out = curr_comp; + } + + return comps; } - - pair - Surjector::compute_path_interval(const Alignment& source, size_t path_rank, const xg::XGPath& xpath, const vector& path_chunks, - unordered_map, vector>>* oriented_occurrences_memo) { + + vector> Surjector::transitive_reduction(const vector>& adj) const { - pair interval(numeric_limits::max(), numeric_limits::min()); + // by construction the graph here has edges in topological order + + vector> reduction(adj.size()); - for (const auto& path_chunk : path_chunks) { + for (size_t i = 0; i < adj.size(); ++i) { - string::const_iterator read_pos = path_chunk.first.first; + const vector& edges = adj[i]; - // TODO: do I need to do this at every mapping? 
it might be enough to just look at the first and last - for (size_t i = 0; i < path_chunk.second.mapping_size(); i++) { - const Position& pos = path_chunk.second.mapping(i).position(); - - // the distance the read could align to the left of this mapping (oriented by the read) - int64_t left_overhang = get_aligner()->longest_detectable_gap(source, read_pos) + (read_pos - source.sequence().begin()); - - read_pos += mapping_to_length(path_chunk.second.mapping(i)); - - // the distance the read could align to the right of this mapping (oriented by the read) - int64_t right_overhang = get_aligner()->longest_detectable_gap(source, read_pos) + (source.sequence().end() - read_pos); + if (edges.size() == 1) { + // optimization: a single edge out can never be transitive + reduction[i].push_back(edges[0]); + continue; + } + + vector traversed(adj.size() - i, false); + + for (size_t j = 0; j < edges.size(); j++) { - auto oriented_occurrences = xindex->memoized_oriented_occurrences_on_path(pos.node_id(), path_rank, - oriented_occurrences_memo); + size_t edge = edges[j]; + if (traversed[edge - i]) { + // we can reach the target of this edge by another path, so it is transitive + continue; + } - // the length forward along the path that the end of the mapping is - int64_t mapping_length = mapping_from_length(path_chunk.second.mapping(i)); + // this edge reaches a target we haven't traversed to yet, so it can't be transitive + reduction[i].push_back(edge); - for (const pair& occurrence : oriented_occurrences) { - if (occurrence.second == pos.is_reverse()) { - int64_t path_offset = xpath.positions[occurrence.first]; - - int64_t left_boundary = max(0, path_offset + pos.offset() - left_overhang); - interval.first = min(interval.first, left_boundary); - - int64_t right_boundary = min(path_offset + pos.offset() + mapping_length + right_overhang, xpath.offsets.size() - 1); - interval.second = max(interval.second, right_boundary); - -#ifdef debug_anchored_surject - cerr << "path chunk " << pb2json(path_chunk.second) << " can be aligned to forward strand in interval " << left_boundary << ":" << right_boundary << endl; -#endif + // DFS to mark all reachable nodes from this edge + vector stack(1, edge); + traversed[edge - i] = true; + while (!stack.empty()) { + size_t idx = stack.back(); + stack.pop_back(); + for (size_t k : adj[idx]) { + if (!traversed[k - i]) { + stack.push_back(k); + traversed[k - i] = true; + } } - else { - int64_t path_offset = occurrence.first + 1 < xpath.positions.size() ? 
xpath.positions[occurrence.first + 1] : xpath.offsets.size(); - - int64_t left_boundary = max(0, path_offset - pos.offset() - mapping_length - right_overhang); - interval.first = min(interval.first, left_boundary); - - int64_t right_boundary = min(path_offset - pos.offset() + left_overhang, xpath.offsets.size() - 1); - interval.second = max(interval.second, right_boundary); + } + } + + + } + + return reduction; + } + + vector> Surjector::remove_dominated_chunks(const string& src_sequence, + const vector>& adj, + vector& path_chunks, + vector>& ref_chunks, + vector>& connections) const { + + // this is an easy way to ensure that all adjacency lists are ordered by index + auto rev_adj = reverse_adjacencies(adj); + auto fwd_adj = reverse_adjacencies(rev_adj); + + map, vector>, vector> neighbor_groups; + for (size_t i = 0; i < adj.size(); ++i) { + neighbor_groups[make_pair(rev_adj[i], fwd_adj[i])].push_back(i); + } + +#ifdef debug_spliced_surject + cerr << "neighbor groups:" << endl; + for (const auto& group : neighbor_groups) { + cerr << "("; + for (size_t i = 0; i < group.first.first.size(); ++i) { + if (i != 0){ + cerr << ", "; + } + cerr << group.first.first[i]; + } + cerr << ") ("; + for (size_t i = 0; i < group.first.second.size(); ++i) { + if (i != 0){ + cerr << ", "; + } + cerr << group.first.second[i]; + } + cerr << ")" << endl; + for (auto i : group.second) { + cerr << "\t" << i << endl; + } + } +#endif + + vector to_remove; + for (const auto& group : neighbor_groups) { + // only remove dominated chunks if they have the same, non-empty set of neighbors + if (group.second.size() > 1 && (!group.first.first.empty() || !group.first.second.empty())) { + vector total_lengths(group.second.size()); + int64_t max_total_length = 0; + for (size_t i = 0; i < group.second.size(); ++i) { + auto& chunk = path_chunks[group.second[i]]; + total_lengths[i] = path_from_length(chunk.second) + (chunk.first.second - chunk.first.first); + // don't count softclips + if (chunk.first.first == src_sequence.begin()) { + const auto& first_edit = *chunk.second.mapping().begin()->edit().begin(); + if (first_edit.from_length() == 0) { + total_lengths[i] -= first_edit.to_length(); + } + } + if (chunk.first.second == src_sequence.end()) { + const auto& last_edit = *chunk.second.mapping().rbegin()->edit().rbegin(); + if (last_edit.from_length() == 0) { + total_lengths[i] -= last_edit.to_length(); + } + } + max_total_length = max(total_lengths[i], max_total_length); + } + for (size_t i = 0; i < group.second.size(); ++i) { + if (total_lengths[i] < max_total_length - 2 * dominated_path_chunk_diff) { + to_remove.push_back(group.second[i]); + } + } + } + } + + if (!to_remove.empty()) { +#ifdef debug_spliced_surject + cerr << "marked for removal (unless has a connection):" << endl; + for (auto i : to_remove) { + cerr << "\t" << i << endl; + } +#endif + + unordered_set connected; + for (const auto& connection : connections) { + connected.insert(get<0>(connection)); + connected.insert(get<1>(connection)); + } + + vector do_remove(fwd_adj.size(), false); + for (size_t i : to_remove) { + if (!connected.count(i)) { + do_remove[i] = true; + } + } + + vector removed_before(fwd_adj.size() + 1, 0); + for (size_t i = 0; i < fwd_adj.size(); ++i) { + if (do_remove[i]) { + removed_before[i + 1] = removed_before[i] + 1; + } + else { + if (removed_before[i]) { + fwd_adj[i - removed_before[i]] = move(fwd_adj[i]); + path_chunks[i - removed_before[i]] = move(path_chunks[i]); + ref_chunks[i - removed_before[i]] = move(ref_chunks[i]); + } + 
removed_before[i + 1] = removed_before[i]; + } + } + fwd_adj.resize(fwd_adj.size() - removed_before.back()); + path_chunks.resize(fwd_adj.size()); + ref_chunks.resize(fwd_adj.size()); + + for (auto& adj_list : fwd_adj) { + size_t removed_so_far = 0; + for (size_t i = 0; i < adj_list.size(); ++i) { + if (do_remove[adj_list[i]]) { + ++removed_so_far; + } + else { + adj_list[i - removed_so_far] = adj_list[i] - removed_before[adj_list[i]]; + } + } + adj_list.resize(adj_list.size() - removed_so_far); + } + + for (auto& connection : connections) { + get<0>(connection) -= removed_before[get<0>(connection)]; + get<1>(connection) -= removed_before[get<1>(connection)]; + } + } + return fwd_adj; + } + + void Surjector::prune_unconnectable(vector>& adj, + vector>>& splice_adj, + vector& component, + vector>& comp_groups, + vector& path_chunks, + vector>& ref_chunks) const { + + // record which path chunks and component groups have connection adjacencies + vector has_connection_from(adj.size(), false), has_connection_to(adj.size(), false); + vector comp_has_connection_from(comp_groups.size(), false), comp_has_connection_to(comp_groups.size(), false); + for (size_t i = 0; i < splice_adj.size(); ++i) { + for (auto& edge : splice_adj[i]) { + if (get<2>(edge)) { + has_connection_from[i] = true; + comp_has_connection_from[component[i]] = true; + has_connection_to[get<0>(edge)] = true; + comp_has_connection_to[component[get<0>(edge)]] = true; + } + } + } + + // record which path chunks are reachable from the connection adjacencies + vector connects_forward(adj.size(), false), connects_backward(adj.size(), false); + for (size_t i = 0; i < adj.size(); ++i) { + size_t j = adj.size() - i - 1; + connects_forward[i] = connects_forward[i] || has_connection_to[i]; + connects_backward[j] = connects_backward[j] || has_connection_from[j]; + for (auto k : adj[i]) { + connects_forward[k] = connects_forward[k] || connects_forward[i]; + } + for (auto k : adj[j]) { + connects_backward[j] = connects_backward[j] || connects_backward[k]; + } + } + +#ifdef debug_prune_unconnectable + cerr << "connects forward:" << endl; + for (size_t i = 0; i < adj.size(); ++i) { + cerr << "\t" << i << ": " << connects_forward[i] << endl; + } + cerr << "connects backward:" << endl; + for (size_t i = 0; i < adj.size(); ++i) { + cerr << "\t" << i << ": " << connects_backward[i] << endl; + } +#endif + + // mark path chunks for removal if a component has a connection but the + // chunks don't occur on any interconnection paths + vector> to_remove_by_group(comp_groups.size()); + for (size_t i = 0; i < adj.size(); ++i) { + size_t grp = component[i]; + if ((comp_has_connection_to[grp] && !connects_forward[i]) + || (comp_has_connection_from[grp] && !connects_backward[i])) { + to_remove_by_group[grp].insert(i); +#ifdef debug_prune_unconnectable + cerr << "marking " << i << " for removal" << endl; +#endif + } + } + + // filter out the chunks + vector removed(adj.size() + 1, 0); + for (size_t i = 0; i < adj.size(); ++i) { + size_t grp = component[i]; + // remove if not on an inter-connection path, but don't remove an entire group + // TODO: but how can we be sure to produce sensible results when an entire group + // should be removed?
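Both `remove_dominated_chunks` above and `prune_unconnectable` below lean on the same compaction idiom: count, for every index, how many elements before it were removed, shift the survivors left by that amount in a single pass, and then reuse the same prefix counts to rewrite any stored indices (adjacency lists, connection endpoints). The following is a self-contained sketch of that idiom under assumed names; `compact`, `items`, `adj`, and `do_remove` are illustrative only and not the vg API.

```cpp
#include <cstddef>
#include <utility>
#include <vector>

// Illustrative sketch: drop flagged elements in place and rewire an adjacency
// list that refers to them by index, using a prefix count of removals.
void compact(std::vector<int>& items,
             std::vector<std::vector<size_t>>& adj,
             const std::vector<bool>& do_remove) {
    // removed_before[i] = number of removed elements with index < i
    std::vector<size_t> removed_before(items.size() + 1, 0);
    for (size_t i = 0; i < items.size(); ++i) {
        removed_before[i + 1] = removed_before[i] + (do_remove[i] ? 1 : 0);
        if (!do_remove[i] && removed_before[i] != 0) {
            // shift survivors left over the gaps left by removed elements
            items[i - removed_before[i]] = std::move(items[i]);
            adj[i - removed_before[i]] = std::move(adj[i]);
        }
    }
    items.resize(items.size() - removed_before.back());
    adj.resize(items.size());
    // re-point surviving edges at the new, shifted indices and drop dead edges
    for (auto& edges : adj) {
        size_t kept = 0;
        for (size_t e : edges) {
            if (!do_remove[e]) {
                edges[kept++] = e - removed_before[e];
            }
        }
        edges.resize(kept);
    }
}
```

Keeping the prefix counts around until the index-rewriting step is what lets the adjacency lists and connection endpoints be re-pointed in one pass, without building a separate old-to-new index map.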
+ if (to_remove_by_group[grp].count(i) + && to_remove_by_group[grp].size() < comp_groups[grp].size()) { +#ifdef debug_prune_unconnectable + cerr << "removing " << i << endl; +#endif + removed[i + 1] = removed[i] + 1; + } + else { + size_t removed_so_far = removed[i]; + removed[i + 1] = removed_so_far; + if (removed_so_far) { + adj[i - removed_so_far] = move(adj[i]); + splice_adj[i - removed_so_far] = move(splice_adj[i]); + path_chunks[i - removed_so_far] = move(path_chunks[i]); + ref_chunks[i - removed_so_far] = move(ref_chunks[i]); + component[i - removed_so_far] = component[i]; + } + } + } + + if (removed.back()) { + adj.resize(adj.size() - removed.back()); + splice_adj.resize(adj.size()); + path_chunks.resize(adj.size()); + ref_chunks.resize(adj.size()); + component.resize(adj.size()); + + // rewire the adjacencies to the correct chunks + for (auto& adj_list : adj) { + size_t adj_removed = 0; + for (size_t i = 0; i < adj_list.size(); ++i) { + if (removed[adj_list[i]] != removed[adj_list[i] + 1]) { + ++adj_removed; + } + else { + adj_list[i - adj_removed] = adj_list[i] - removed[adj_list[i]]; + } + } + adj_list.resize(adj_list.size() - adj_removed); + } + +#ifdef debug_prune_unconnectable + cerr << "rewired adj list:" << endl; + for (size_t i = 0; i < adj.size(); ++i) { + cerr << i << ":"; + for (auto j : adj[i]) { + cerr << " " << j; + } + cerr << endl; + } +#endif + + // rewire the splice adjacencies to the correct chunks + for (auto& splice_adj_list : splice_adj) { + size_t adj_removed = 0; + for (size_t i = 0; i < splice_adj_list.size(); ++i) { + auto& edge = splice_adj_list[i]; + if (removed[get<0>(edge)] != removed[get<0>(edge) + 1]) { + ++adj_removed; + } + else { + splice_adj_list[i - adj_removed] = make_tuple(get<0>(edge) - removed[get<0>(edge)], get<1>(edge), get<2>(edge)); + } + } + splice_adj_list.resize(splice_adj_list.size() - adj_removed); + } + +#ifdef debug_prune_unconnectable + cerr << "rewired splice adj list:" << endl; + for (size_t i = 0; i < splice_adj.size(); ++i) { + cerr << i << ":"; + for (auto edge : splice_adj[i]) { + cerr << " (" << get<0>(edge) << " " << get<1>(edge) << " " << get<2>(edge) << ")"; + } + cerr << endl; + } +#endif + + // correct the indexes in the group + for (auto& group : comp_groups) { + size_t grp_removed = 0; + for (size_t i = 0; i < group.size(); ++i) { + if (removed[group[i]] != removed[group[i] + 1]) { + ++grp_removed; + } + else { + group[i - grp_removed] = group[i] - removed[group[i]]; + } + } + group.resize(group.size() - grp_removed); + } + + } + } + + vector, vector>> Surjector::find_constriction_bicliques(const vector>& adj, + const string& src_sequence, + const string& src_quality, + vector& path_chunks, + vector>& ref_chunks, + const vector>& connections) const { + + auto connected_by_edge = [&](size_t i, size_t j) { + const auto& final_mapping = *path_chunks[i].second.mapping().rbegin(); + const auto& final_position = final_mapping.position(); + const auto& initial_position = path_chunks[j].second.mapping().begin()->position(); + handle_t handle = graph->get_handle(final_position.node_id(), + final_position.is_reverse()); + if (initial_position.offset() == 0 + && final_position.offset() + mapping_from_length(final_mapping) == graph->get_length(handle) + && graph->has_edge(handle, graph->get_handle(initial_position.node_id(), initial_position.is_reverse()))) { + return true; + } + else { + return false; + } + }; + + // are we on the reverse or forward strand of the path + const bool path_rev = 
(graph->get_is_reverse(graph->get_handle_of_step(ref_chunks[0].first)) + != path_chunks[0].second.mapping(0).position().is_reverse()); + + auto rev_adj = reverse_adjacencies(adj); + + size_t num_comps = 0; + auto comps = connected_components(adj, rev_adj, &num_comps); + + vector fwd(adj.size(), 0), bwd(adj.size(), 0); + vector total_comp_paths(num_comps, 0); + + for (int64_t i = adj.size() - 1; i >= 0; --i) { + if (adj[i].empty()) { + // sink node + bwd[i] = 1; + } + else { + for (size_t j : adj[i]) { + bwd[i] += bwd[j]; + } + } + if (rev_adj[i].empty()) { + // source node, add paths to total + total_comp_paths[comps[i]] += bwd[i]; + } + } + + + for (int64_t i = 0; i < adj.size(); ++i) { + if (rev_adj[i].empty()) { + // source node + fwd[i] = 1; + } + else { + for (size_t j : rev_adj[i]) { + fwd[i] += fwd[j]; + } + } + } + +#ifdef debug_constrictions + cerr << "forward counts" << endl; + for (size_t i = 0; i < fwd.size(); ++i) { + cerr << "\t" << i << ": " << fwd[i] << endl; + } + cerr << "backward counts" << endl; + for (size_t i = 0; i < bwd.size(); ++i) { + cerr << "\t" << i << ": " << bwd[i] << endl; + } +#endif + + unordered_set> enqueued; + vector>> adjacency_components; + for (size_t i = 0; i < adj.size(); ++i) { + for (bool left : {true, false}) { + if (!enqueued.count(make_pair(i, left))) { + + // start new adjacency component + adjacency_components.emplace_back(); + auto& adj_component = adjacency_components.back(); + + // init queue + vector> queue; + queue.emplace_back(i, left); + enqueued.emplace(i, left); + + // DFS bouncing back and forth across the sides + while (!queue.empty()) { + auto side = queue.back(); + queue.pop_back(); + adj_component.emplace_back(side); + + const auto& edges = side.second ? rev_adj[side.first] : adj[side.first]; + for (size_t j : edges) { + if (!enqueued.count(make_pair(j, !side.second))) { + enqueued.emplace(j, !side.second); + queue.emplace_back(j, !side.second); + } + } + } + + } + } + } + +#ifdef debug_constrictions + cerr << "adjacency components" << endl; + for (size_t i = 0; i < adjacency_components.size(); ++i) { + cerr << "component " << i << ":" << endl; + for (auto side : adjacency_components[i]) { + cerr << "\t" << side.first << " " << "RL"[side.second] << endl; + } + } +#endif + + // reorganize the connections into an adjacency list + unordered_map> connection_adj; + for (const auto& connection : connections) { + connection_adj[get<0>(connection)].emplace(get<1>(connection)); + } + + // init return value + vector, vector>> return_val; + + for (auto& adj_component : adjacency_components) { + if (adj_component.size() == 1) { + // trivial component (probably at start or end) + continue; + } + +#ifdef debug_constrictions + cerr << "checking adjacency component containing" << endl; + for (auto side : adj_component) { + cerr << "\t" << side.first << " " << "RL"[side.second] << endl; + } +#endif + + // record if there are any deletions + vector deletion_chunks; + for (auto chunk_side : adj_component) { + if (path_chunks[chunk_side.first].first.first == path_chunks[chunk_side.first].first.second) { + deletion_chunks.push_back(chunk_side.first); +#ifdef debug_constrictions + cerr << "chunk " << chunk_side.first << " is a deletion" << endl; +#endif + } + } + + // iterate over choices of left/right side for deletion chunks + for (size_t iter = 0, end = (1 << min(deletion_chunks.size(), 16)); iter < end; ++iter) { + +#ifdef debug_constrictions + cerr << "checking left-right combination " << iter << " of " << end << endl; +#endif + + // we will 
fill out the left and right side of this potential splice biclique + unordered_set left_side, right_side; + + size_t deletion_chunk_idx = 0; + for (auto chunk_side : adj_component) { + if (deletion_chunk_idx < deletion_chunks.size() && chunk_side.first == deletion_chunks[deletion_chunk_idx]) { + // deletions can go on either side, + if (iter & (1 << deletion_chunk_idx)) { +#ifdef debug_constrictions + cerr << "deletion chunk " << chunk_side.first << " goes to left side" << endl; +#endif + left_side.insert(chunk_side.first); + } + else { +#ifdef debug_constrictions + cerr << "deletion chunk " << chunk_side.first << " goes to right side" << endl; +#endif + right_side.insert(chunk_side.first); + } + ++deletion_chunk_idx; + } + else if (chunk_side.second) { + right_side.insert(chunk_side.first); + } + else { + left_side.insert(chunk_side.first); + } + } + + // record which pairs have a connection + bool incompatible = false; + unordered_set left_connected, right_connected; + for (auto left_it = left_side.begin(); left_it != left_side.end() && !incompatible; ++left_it) { + auto adj_it = connection_adj.find(*left_it); + if (adj_it != connection_adj.end()) { + for (auto right_it = adj_it->second.begin(); right_it != adj_it->second.end() && !incompatible; ++right_it) { + +#ifdef debug_constrictions + cerr << "looking at connection between " << *left_it << " and " << *right_it << endl; +#endif + if (right_side.count(*right_it)) { + left_connected.insert(*left_it); + right_connected.insert(*right_it); + } + else { + // the direction of this connection are not consistent with the left and right + // side of this iteration +#ifdef debug_constrictions + cerr << "connection is incompatible" << endl; +#endif + incompatible = true; + break; + } + } + } + } + + if (incompatible) { + // the division of deletions to the left and right side is not compatible with the + // connections + continue; + } + + // do the non-connected edges form a biclique? + for (auto left_it = left_side.begin(); left_it != left_side.end() && !incompatible; ++left_it) { + if (left_connected.count(*left_it)) { + continue; + } + size_t num_clique_edges = 0; + for (auto i : adj[*left_it]) { + if (right_connected.count(i)) { + // we don't worry about it if the node has a connection, because it will lose + // all of its edges anyway + continue; + } + if (right_side.count(i)) { + // this looks like it could be a splice junction + ++num_clique_edges; + } + else { +#ifdef debug_constrictions + cerr << "adjacency " << *left_it << " -> " << i << " is " << (right_side.count(i) ? 
"not connected by a graph edge" : "missing") << endl; +#endif + incompatible = true; + break; + } + } +#ifdef debug_constrictions + cerr << "found " << num_clique_edges << " out of expected " << (right_side.size() - right_connected.size()) << " on " << *left_it << "L" << endl; +#endif + incompatible = incompatible || (num_clique_edges != right_side.size() - right_connected.size()); + } + + if (incompatible) { + // we have edges going to outside the biclique, or we have edges missing + // from the biclique +#ifdef debug_constrictions + cerr << "this left-right combination (" << iter << " of " << end << ") is incompatible" << endl; +#endif + continue; + } + + // count up the walks through this biclique + size_t walk_total = 0; + for (auto left_it = left_side.begin(); left_it != left_side.end() && !incompatible; ++left_it) { + for (auto j : adj[*left_it]) { + walk_total += fwd[*left_it] * bwd[j]; + } + } + +#ifdef debug_constrictions + cerr << "biclique has a walk total of " << walk_total << " compared to component total " << total_comp_paths[comps[adj_component.front().first]] << endl; +#endif + + if (walk_total != total_comp_paths[comps[adj_component.front().first]]) { + // not a constriction + continue; + } + + for (auto left_it = left_side.begin(); left_it != left_side.end() && !incompatible; ++left_it) { + if (left_connected.count(*left_it)) { + continue; + } + size_t num_clique_edges = 0; + for (auto i : adj[*left_it]) { + if (right_connected.count(i)) { + // we don't worry about it if the node has a connection, because it will lose + // all of its edges anyway + continue; + } +#ifdef debug_constrictions + const auto& p1 = path_chunks[*left_it].second.mapping(path_chunks[*left_it].second.mapping_size() - 1).position(); + const auto& p2 = path_chunks[i].second.mapping(0).position(); + cerr << "read gap between " << *left_it << " and " << i << " is " << (path_chunks[i].first.first - path_chunks[*left_it].first.second) << ", connected by an edge at " << p1.node_id() << " " << p1.is_reverse() << " -> " << p2.node_id() << " " << p2.is_reverse() << "? " << connected_by_edge(*left_it, i) << endl; +#endif + if (path_chunks[*left_it].first.second != path_chunks[i].first.first || !connected_by_edge(*left_it, i)) { +#ifdef debug_constrictions + cerr << "fail deletion along edge condition in adjacency from " << *left_it << " to " << i << " with read positions " << (path_chunks[*left_it].first.second - src_sequence.begin()) << " and " << (path_chunks[i].first.first - src_sequence.begin()) << ", connected by edge? 
" << connected_by_edge(*left_it, i) << endl; +#endif + incompatible = true; + break; + } + } + } + + if (incompatible) { + +#ifdef debug_constrictions + cerr << "not all adjacencies in constriction are pure deletions on edges, attempting to repair the splice site"<< endl; +#endif + // it's a constriction biclique, but it doesn't have a pure deletion or pure adjacency + // we'll try to see if we can recover a splice junction here by + incompatible = false; + + int64_t max_dist = 0; + for (auto i : left_side) { + if (left_connected.count(i)) { + continue; + } + const auto& final_mapping = *path_chunks[i].second.mapping().rbegin(); + for (auto j : right_side) { + if (right_connected.count(j)) { + continue; + } + const auto& initial_mapping = *path_chunks[j].second.mapping().begin(); + bool path_rev = (graph->get_is_reverse(graph->get_handle_of_step(ref_chunks[j].first)) + != initial_mapping.position().is_reverse()); + int64_t path_dist = 0; + if (path_rev) { + path_dist = (graph->get_position_of_step(ref_chunks[i].second) + + graph->get_length(graph->get_handle_of_step(ref_chunks[i].second)) + - final_mapping.position().offset() + - mapping_from_length(final_mapping) + - graph->get_position_of_step(ref_chunks[j].first) + - graph->get_length(graph->get_handle_of_step(ref_chunks[j].first)) + + initial_mapping.position().offset()); + } + else { + path_dist = (graph->get_position_of_step(ref_chunks[j].first) + + initial_mapping.position().offset() + - graph->get_position_of_step(ref_chunks[i].second) + - final_mapping.position().offset() + - mapping_from_length(final_mapping)); + } + max_dist = max(max_dist, path_dist); + } + } + +#ifdef debug_constrictions + cerr << "max path distance is " << max_dist << ", compared to minimum for repair " << min_splice_repair_length << endl; +#endif + + if (max_dist < min_splice_repair_length) { + // all of the sides are close together on the path, so it's likely to be just variation + // and even if not then it won't be too terrible to just align it + continue; + } + + // make alignmments to the connecting graph across all of these edges + + vector> repair_alns; + repair_alns.reserve(left_side.size()); + + int64_t max_aln_length = (get_aligner()->longest_detectable_gap(src_sequence.size()) + + (path_chunks[*right_side.begin()].first.first + - path_chunks[*left_side.begin()].first.second)); + for (auto left_it = left_side.begin(); left_it != left_side.end() && !incompatible; ++left_it) { + if (left_connected.count(*left_it)) { + continue; + } + repair_alns.emplace_back(); + repair_alns.back().reserve(right_side.size()); + auto left_pos = final_position(path_chunks[*left_it].second); + for (auto i : right_side) { + if (right_connected.count(i)) { + // we don't worry about it if the node has a connection, because it will lose + // all of its edges anyway + continue; + } +#ifdef debug_constrictions + cerr << "attempting to repair splice adjacency from " << *left_it << " to " << i << " with read interval " << (path_chunks[*left_it].first.second - src_sequence.begin()) << ":" << (path_chunks[i].first.first - src_sequence.begin()) << endl; +#endif + + auto right_pos = initial_position(path_chunks[i].second); + + bdsg::HashGraph connecting; + auto id_trans = algorithms::extract_connecting_graph(graph, &connecting, max_aln_length, + left_pos, right_pos, true); + + +#ifdef debug_constrictions + cerr << "connecting graph between " << left_pos << " and " << right_pos << ":" << endl; + connecting.for_each_handle([&](const handle_t& handle) { + cerr << 
connecting.get_id(handle) << " " << connecting.get_sequence(handle) << endl; + connecting.follow_edges(handle, true, [&](const handle_t& prev) { + cerr << "\t" << connecting.get_id(prev) << " <-" << endl; + }); + connecting.follow_edges(handle, false, [&](const handle_t& next) { + cerr << "\t-> " << connecting.get_id(next) << endl; + }); + }); +#endif + + // remove any handles in the connecting graph that aren't on the path + path_handle_t path_handle = graph->get_path_handle_of_step(ref_chunks.front().first); + vector off_path_handles; + connecting.for_each_handle([&](const handle_t& handle) { + bool found = false; + graph->for_each_step_on_handle(graph->get_handle(connecting.get_id(handle)), + [&](const step_handle_t& step) { + found = graph->get_path_handle_of_step(step) == path_handle; + return !found; + }); + if (!found) { + off_path_handles.push_back(handle); + } + }); + for (handle_t handle : off_path_handles) { + connecting.destroy_handle(handle); + } + + // TODO: we could probably dagify, but i don't want to worry about it yet + if (connecting.get_node_count() == 0 || !handlealgs::is_directed_acyclic(&connecting) + || algorithms::num_components(connecting) != 1) { +#ifdef debug_constrictions + cerr << "did not get well-behaved intervening graph: " << connecting.get_node_count() << " nodes, acyclic? " << handlealgs::is_directed_acyclic(&connecting) << ", components " << algorithms::num_components(connecting) << endl; +#endif + incompatible = true; + break; + } + + // make the graph single stranded + auto orientation = handlealgs::single_stranded_orientation(&connecting); + if (orientation.empty()) { +#ifdef debug_constrictions + cerr << "graph does not have a single stranded orientation" << endl; +#endif + incompatible = true; + break; + } + for (auto handle : orientation) { + if (id_trans[connecting.get_id(handle)] == id(left_pos)) { + if (connecting.get_is_reverse(handle) != is_rev(left_pos)) { + // the orientation we got doesn't match our bounding positions, flip it + for (auto& handle : orientation) { + handle = connecting.flip(handle); + } + } + break; + } + } + unordered_map> oriented_trans; + for (auto& handle : orientation) { + oriented_trans[connecting.get_id(handle)] = make_pair(id_trans[connecting.get_id(handle)], + connecting.get_is_reverse(handle)); + handle = connecting.apply_orientation(handle); + } + +#ifdef debug_constrictions + cerr << "connecting graph after pruning to the path and orienting:" << endl; + connecting.for_each_handle([&](const handle_t& handle) { + cerr << connecting.get_id(handle) << " " << connecting.get_sequence(handle) << endl; + connecting.follow_edges(handle, true, [&](const handle_t& prev) { + cerr << "\t" << connecting.get_id(prev) << " <-" << endl; + }); + connecting.follow_edges(handle, false, [&](const handle_t& next) { + cerr << "\t-> " << connecting.get_id(next) << endl; + }); + }); +#endif + + // make sure everything is still reachable after pruning + // TODO: might be cheaper to not do the work for strict max length in extraction since + // we duplicate it here, but also this code path is rare, so whatever + algorithms::prune_to_connecting_graph(connecting, connecting.get_handle(id(left_pos)), + connecting.get_handle(id(right_pos))); + +#ifdef debug_constrictions + cerr << "connecting graph after pruning for reachability:" << endl; + connecting.for_each_handle([&](const handle_t& handle) { + cerr << connecting.get_id(handle) << " " << connecting.get_sequence(handle) << endl; + connecting.follow_edges(handle, true, [&](const handle_t& 
prev) { + cerr << "\t" << connecting.get_id(prev) << " <-" << endl; + }); + connecting.follow_edges(handle, false, [&](const handle_t& next) { + cerr << "\t-> " << connecting.get_id(next) << endl; + }); + }); +#endif + + repair_alns.back().emplace_back(); + auto& aln = repair_alns.back().back(); + aln.set_sequence(string(path_chunks[*left_it].first.second, + path_chunks[i].first.first)); + if (!src_quality.empty()) { + auto qual_begin = src_quality.begin() + (path_chunks[*left_side.begin()].first.second - src_sequence.begin()); + aln.set_quality(string(qual_begin, qual_begin + aln.sequence().size())); + } + + // do the alignment + get_aligner(!src_quality.empty())->align_global_banded(aln, connecting, 1, true); + + auto first_pos = aln.mutable_path()->mutable_mapping(0)->mutable_position(); + first_pos->set_offset(offset(left_pos)); +#ifdef debug_constrictions + cerr << "raw connecting alignment" << endl; + cerr << pb2json(aln) << endl; +#endif + + if (mapping_from_length(aln.path().mapping(aln.path().mapping_size() - 1)) == 0 + && mapping_to_length(aln.path().mapping(aln.path().mapping_size() - 1)) == 0) { + // the last mapping is to an empty node + aln.mutable_path()->mutable_mapping()->DeleteSubrange(aln.path().mapping_size() - 1, 1); + } + + if (aln.path().mapping_size() != 0 && mapping_from_length(aln.path().mapping(0)) == 0 + && mapping_to_length(aln.path().mapping(0)) == 0) { + // the first mapping is to an empty node + aln.mutable_path()->mutable_mapping()->DeleteSubrange(0, 1); + } + + translate_oriented_node_ids(*aln.mutable_path(), oriented_trans); +#ifdef debug_constrictions + cerr << "processed connecting alignment" << endl; + cerr << pb2json(aln) << endl; +#endif + } + } + + if (incompatible) { + // we couldn't make a short connecting graph for at least one of the edges + continue; + } + + // we'll record where along the alignment it gets divided into before/after the splice + // records of (mapping index, final fwd step, final rev step, prefix end, suffix start) + vector>> divisions(repair_alns.size()); + + // TODO: we might not find the same break point if one of the adjacencies is just direct + // across the path + + size_t left_idx = 0; + for (auto left_it = left_side.begin(); left_it != left_side.end() && !incompatible; ++left_it) { + if (left_connected.count(*left_it)) { + continue; + } + + size_t right_idx = 0; + for (auto i : right_side) { + if (right_connected.count(i)) { + // we don't worry about it if the node has a connection, because it will lose + // all of its edges anyway + continue; + } +#ifdef debug_constrictions + cerr << "checking divisibility from " << *left_it << " to " << i << endl; +#endif + + auto& aln = repair_alns[left_idx][right_idx]; + + step_handle_t fwd_step = ref_chunks[*left_it].second; + step_handle_t rev_step = ref_chunks[i].first; + bool shared_fwd_node = true; + if (aln.path().mapping_size() != 0 && aln.path().mapping(0).position().offset() == 0) { + // the start of the alignment is on a new node + shared_fwd_node = false; + fwd_step = path_rev ? graph->get_previous_step(fwd_step) : graph->get_next_step(fwd_step); + } + bool shared_rev_node = true; + if (aln.path().mapping_size() != 0 && path_chunks[i].second.mapping(0).position().offset() == 0) { + // the end of the alignment is on a new node + shared_rev_node = false; + rev_step = path_rev ? 
graph->get_next_step(rev_step) : graph->get_previous_step(rev_step); + } + + // walk out the prefix of the alignment along the reference + int64_t fwd_to_length = 0; + int64_t fwd_idx = 0; + while (fwd_idx < aln.path().mapping_size()) { + handle_t handle = graph->get_handle_of_step(fwd_step); + const auto& pos = aln.path().mapping(fwd_idx).position(); + if (graph->get_id(handle) != pos.node_id() || + graph->get_is_reverse(handle) != (path_rev != pos.is_reverse())) { + break; + } + fwd_to_length += mapping_to_length(aln.path().mapping(fwd_idx)); + fwd_step = path_rev ? graph->get_previous_step(fwd_step) : graph->get_next_step(fwd_step); + ++fwd_idx; + } + + // walk the suffix of the alignment along the reference + int64_t rev_to_length = 0; + int64_t rev_idx = aln.path().mapping_size() - 1; + while (rev_idx >= fwd_idx) { + handle_t handle = graph->get_handle_of_step(rev_step); + const auto& pos = aln.path().mapping(rev_idx).position(); + if (graph->get_id(handle) != pos.node_id() || + graph->get_is_reverse(handle) != (path_rev != pos.is_reverse())) { + break; + } + rev_to_length += mapping_to_length(aln.path().mapping(rev_idx)); + rev_step = path_rev ? graph->get_next_step(rev_step) : graph->get_previous_step(rev_step); + --rev_idx; + } + + if (fwd_idx <= rev_idx) { + // you can't walk out the whole alignment along the path +#ifdef debug_constrictions + cerr << "could not walk out alignment along the path" << endl; +#endif + incompatible = true; + break; + } + + if (fwd_idx != 0 || !shared_fwd_node) { + // nudge back the forward step to the last match + fwd_step = path_rev ? graph->get_next_step(fwd_step) : graph->get_previous_step(fwd_step); + } + if (rev_idx != aln.path().mapping_size() - 1 || !shared_rev_node) { + // nudge back the reverse step to the last match + rev_step = path_rev ? 
graph->get_previous_step(rev_step) : graph->get_next_step(rev_step); + } +#ifdef debug_constrictions + cerr << "divided at mapping index " << fwd_idx << ", steps at " << graph->get_position_of_step(fwd_step) << ", " << graph->get_position_of_step(rev_step) << endl; +#endif + + // record the break in the alignment + divisions[left_idx].emplace_back(fwd_idx, fwd_step, rev_step, + path_chunks[*left_it].first.second + fwd_to_length, + path_chunks[i].first.first - rev_to_length); + + ++right_idx; + } + ++left_idx; + } + + if (incompatible) { + continue; + } + + // now check to make sure that all prefixes are identical + + for (size_t i = 0; i < divisions.size() && !incompatible; ++i) { + for (size_t j = 1, n = get<0>(divisions[i].front()); j < divisions.front().size() && !incompatible; ++j) { + if (get<3>(divisions[i][j]) != get<3>(divisions[i].front())) { + // they don't end at the same read position +#ifdef debug_constrictions + cerr << "after alignment, not all left adjacencies are at same read position" << endl; +#endif + incompatible = true; + break; + } + if (get<0>(divisions[i][j]) != n) { + // they don't have the same number of mappings + incompatible = true; + break; + } + // check for equivalence of the mappings + for (size_t k = 0; k < n && !incompatible; ++k) { + if (!mappings_equivalent(repair_alns[i][j].path().mapping(k), + repair_alns[i][0].path().mapping(k))) { + incompatible = true; + break; + } + } + } + } + + if (incompatible) { +#ifdef debug_constrictions + cerr << "not all alignment prefixes match" << endl; +#endif + continue; + } + + // and also check to make sure that all suffixes are identical + + for (size_t j = 0; j < divisions[0].size() && !incompatible; ++j) { + int64_t n = repair_alns[0][j].path().mapping_size() - get<0>(divisions[0][j]); + for (size_t i = 1; i < divisions.size() && !incompatible; ++i) { + + if (get<4>(divisions[i][j]) != get<4>(divisions[0][j])) { + // they don't end at the same read position +#ifdef debug_constrictions + cerr << "after alignment, not all right adjacencies are at same read position" << endl; +#endif + incompatible = true; + break; + } + if (repair_alns[i][j].path().mapping_size() - get<0>(divisions[i][j]) != n) { + // they don't have the same number of mappings + incompatible = true; + break; + } + for (size_t k = 0; k < n && !incompatible; ++k) { + if (!mappings_equivalent(repair_alns[i][j].path().mapping(get<0>(divisions[i][j]) + k), + repair_alns[0][j].path().mapping(get<0>(divisions[0][j]) + k))) { + // the mappings aren't equivalent + // TODO: a better condition would be equal scoring, same length + incompatible = true; + break; + } + } + } + } + + if (incompatible) { +#ifdef debug_constrictions + cerr << "not all alignment suffixes match" << endl; +#endif + continue; + } + +#ifdef debug_constrictions + cerr << "splice adjacency is repairable, filling in paths" << endl; +#endif + + // we've finally guaranteed that we can repair a missed splice edge alignment + // and can now update the chunks accordingly + + size_t left = 0, right = 0; + for (auto i : left_side) { + if (left_connected.count(i)) { +#ifdef debug_constrictions + cerr << "skipping left side " << i << ", which has a connection" << endl; +#endif + continue; + } + size_t n = get<0>(divisions[left][0]); + if (n == 0) { +#ifdef debug_constrictions + cerr << "left side " << i << " does not need to be extended" << endl; +#endif + ++left; + continue; + } +#ifdef debug_constrictions + cerr << "extend left sequence " << i << " from " << string(path_chunks[i].first.first, 
path_chunks[i].first.second); +#endif + + path_chunks[i].first.second = get<3>(divisions[left][0]); + +#ifdef debug_constrictions + cerr << " to " << string(path_chunks[i].first.first, path_chunks[i].first.second) << endl; + cerr << "move left step from position " << graph->get_position_of_step(ref_chunks[i].second); +#endif + + ref_chunks[i].second = get<1>(divisions[left][0]); +#ifdef debug_constrictions + cerr << " to " << graph->get_position_of_step(ref_chunks[i].second) << endl; +#endif + + // check if we need to merge the first and last mappings + size_t k = 0; + auto final_mapping = path_chunks[i].second.mutable_mapping(path_chunks[i].second.mapping_size() - 1); + const auto& final_position = final_mapping->position(); + const auto& aln = repair_alns[left][0]; + const auto& first_mapping = aln.path().mapping(0); + const auto& first_position = first_mapping.position(); + if (final_position.node_id() == first_position.node_id() && + final_position.is_reverse() == first_position.is_reverse() && + final_position.offset() + mapping_from_length(*final_mapping) == first_position.offset()) { + + for (const auto& edit : first_mapping.edit()) { + *final_mapping->add_edit() = edit; + } + ++k; + } + // copy over the rest of the mappings + for (; k < n; ++k) { + auto mapping = path_chunks[i].second.add_mapping(); + *mapping = aln.path().mapping(k); + mapping->set_rank(path_chunks[i].second.mapping_size()); + } + +#ifdef debug_constrictions + cerr << "extended left path " << i << " to " << pb2json(path_chunks[i].second) << endl; +#endif + ++left; + } + for (auto i : right_side) { + if (right_connected.count(i)) { +#ifdef debug_constrictions + cerr << "skipping right side " << i << ", which has a connection" << endl; +#endif + continue; + } + size_t n = get<0>(divisions[0][right]); + const auto& aln = repair_alns[0][right]; + if (n == aln.path().mapping_size()) { +#ifdef debug_constrictions + cerr << "right side " << i << " does not need to be extended" << endl; +#endif + ++right; + continue; + } +#ifdef debug_constrictions + cerr << "extend right sequence " << i << " from " << string(path_chunks[i].first.first, path_chunks[i].first.second); +#endif + + path_chunks[i].first.first = get<4>(divisions[0][right]); + +#ifdef debug_constrictions + cerr << " to " << string(path_chunks[i].first.first, path_chunks[i].first.second) << endl; + cerr << "move right step from position " << graph->get_position_of_step(ref_chunks[i].first); +#endif + ref_chunks[i].first = get<2>(divisions[0][right]); +#ifdef debug_constrictions + cerr << " to " << graph->get_position_of_step(ref_chunks[i].first) << endl; +#endif + + // copy the repair alignment + Path concat_path; + for (size_t k = n; k < aln.path().mapping_size(); ++k) { + auto mapping = concat_path.add_mapping(); + *mapping = aln.path().mapping(k); + mapping->set_rank(concat_path.mapping_size()); + } + + // check if we need to merge the first and last mappings + auto final_mapping = concat_path.mutable_mapping(concat_path.mapping_size() - 1); + const auto& final_position = final_mapping->position(); + const auto& first_mapping = path_chunks[i].second.mapping(0); + const auto& first_position = first_mapping.position(); + size_t k = 0; + if (final_position.node_id() == first_position.node_id() && + final_position.is_reverse() == first_position.is_reverse() && + final_position.offset() + mapping_from_length(*final_mapping) == first_position.offset()) { + for (const auto& edit : first_mapping.edit()) { + *final_mapping->add_edit() = edit; + } + ++k; + } + + // copy 
over the rest of the original path chunk + for (; k < path_chunks[i].second.mapping_size(); ++k) { + auto mapping = concat_path.add_mapping(); + *mapping = path_chunks[i].second.mapping(k); + mapping->set_rank(concat_path.mapping_size()); + } + + // replace the original path + path_chunks[i].second = concat_path; + +#ifdef debug_constrictions + cerr << "extended right path " << i << " to " << pb2json(path_chunks[i].second) << endl; +#endif + ++right; + } + } + + // we found a pure deletion constriction biclique +#ifdef debug_constrictions + cerr << "recording a constriction biclique" << endl; +#endif + + return_val.emplace_back(vector(left_side.begin(), left_side.end()), + vector(right_side.begin(), right_side.end())); + + auto& biclique = return_val.back(); + sort(biclique.first.begin(), biclique.first.end()); + sort(biclique.second.begin(), biclique.second.end()); + + } + } + + return return_val; + } + + void Surjector::cut_anchors(bool rev_strand, vector& path_chunks, + vector>& ref_chunks, + vector>& connections) const { + + // TODO: this is very repetitive with the similar function in the main spliced surject + + // distance along the path from end of chunk 1 to some mapping on chunk 2 + auto path_distance = [&](size_t chunk_idx_1, + size_t chunk_idx_2, size_t mapping_idx_2) { + + step_handle_t step_1 = ref_chunks[chunk_idx_1].second; + // move right from the beginning of the second chunk if necessary + step_handle_t step_2 = ref_chunks[chunk_idx_2].first; + for (size_t i = 0; i < mapping_idx_2; ++i) { + step_2 = rev_strand ? graph->get_previous_step(step_2) : graph->get_next_step(step_2); + } + + // get the distance component that is on the two mappings + const auto& mapping_1 = *path_chunks[chunk_idx_1].second.mapping().rbegin(); + const auto& mapping_2 = path_chunks[chunk_idx_2].second.mapping(mapping_idx_2); + int64_t dist = mapping_2.position().offset() - mapping_1.position().offset() - mapping_from_length(mapping_1); + + // get the distance component that is along the path + if (rev_strand) { + dist += (graph->get_position_of_step(step_1) + + graph->get_length(graph->get_handle_of_step(step_1)) + - graph->get_position_of_step(step_2) + - graph->get_length(graph->get_handle_of_step(step_2))); + } + else { + dist += (graph->get_position_of_step(step_2) + - graph->get_position_of_step(step_1)); + } + return dist; + }; + + // put the input in lexicographic order by read interval + vector order = range_vector(path_chunks.size()); + stable_sort(order.begin(), order.end(), [&](size_t i, size_t j) { + const auto& interval_1 = path_chunks[i].first; + const auto& interval_2 = path_chunks[j].first; + return (interval_1.first < interval_2.first || + (interval_1.first == interval_2.first && interval_1.second < interval_2.second)); + }); + vector index(order.size()); + for (size_t i = 0; i < order.size(); i++) { + index[order[i]] = i; + } + // update the connection indexes + for (auto& connection : connections) { + get<0>(connection) = index[get<0>(connection)]; + get<1>(connection) = index[get<1>(connection)]; + } + + // swap the order + for (size_t i = 0; i < path_chunks.size(); ++i) { + while (index[i] != i) { +#ifdef debug_spliced_surject + cerr << "reordering chunks, swapping " << i << " and " << index[i] << endl; +#endif + std::swap(path_chunks[index[i]], path_chunks[i]); + std::swap(ref_chunks[index[i]], ref_chunks[i]); + std::swap(index[index[i]], index[i]); + } + } + + // find any overlaps we want to break + vector> overlaps; + for (size_t i = 0; i < path_chunks.size(); ++i) { + auto 
i_end = path_chunks[i].first.second; + for (size_t j = i + 1; j < path_chunks.size() && path_chunks[j].first.first < i_end; ++j) { + if (i_end < path_chunks[j].first.second) { + // the first chunk overlaps the second (note: this check depends on sorting order) + + // figure out how far we have to go down the second chunk to get past the overlap + int64_t to_walk = i_end - path_chunks[j].first.first; + int64_t walked = 0; + size_t k = 0; + for (; k < path_chunks[j].second.mapping_size() && walked < to_walk; ++k) { + walked += mapping_to_length(path_chunks[j].second.mapping(k)); + } + if (k < path_chunks[j].second.mapping_size() && path_distance(i, j, k) >= 0) { + // we didn't walk off the end of the of second chunk and they're colinear along the path + // so we record an overlap + overlaps.emplace_back(j, k); +#ifdef debug_spliced_surject + cerr << "path chunk " << i << " overlaps " << j << " by " << to_walk << " on read, marking an overlap to split before mapping " << k << " at " << pb2json(path_chunks[j].second.mapping(k)) << endl; +#endif + } + } + } + } + + if (!overlaps.empty()) { + + sort(overlaps.begin(), overlaps.end()); + overlaps.resize(unique(overlaps.begin(), overlaps.end()) - overlaps.begin()); + +#ifdef debug_spliced_surject + cerr << "performing overlap splits: " << endl; + for (auto overlap : overlaps) { + cerr << "\t" << overlap.first << ", " << overlap.second << endl; + } +#endif + + vector split_path_chunks; + split_path_chunks.reserve(path_chunks.size() + overlaps.size()); + vector> split_ref_chunks; + split_ref_chunks.reserve(ref_chunks.size() + overlaps.size()); + + vector added_before(path_chunks.size(), 0); + for (size_t i = 0, j = 0; i < path_chunks.size(); ++i) { + if (i > 0) { + added_before[i] = added_before[i - 1]; + } + + if (j < overlaps.size() && overlaps[j].first == i) { + // find out how many overlap splits we need to perform + size_t n = 1; + while (j + n < overlaps.size() && overlaps[j + n].first == i) { + ++n; + } +#ifdef debug_spliced_surject + cerr << "path chunk " << i << " has " << n << " overlap splits" << endl; +#endif + // we'll walk along the ref path and the read intervals as we go + step_handle_t step = ref_chunks[i].first; + auto read_begin = path_chunks[i].first.first; + + for (size_t k = 0; k <= n; ++k) { + // figure out the bounds of mappings we'll move over + size_t begin_idx = (k == 0 ? 0 : overlaps[j + k - 1].second); + size_t end_idx = (k == n ? path_chunks[i].second.mapping_size() : overlaps[j + k].second); + + // add the mappings + split_path_chunks.emplace_back(); + auto& path_chunk = split_path_chunks.back(); + for (size_t l = begin_idx; l < end_idx; ++l) { + auto mapping = path_chunk.second.add_mapping(); + *mapping = path_chunks[i].second.mapping(l); + mapping->set_rank(l - begin_idx + 1); + } + // identify the read interval + path_chunk.first.first = read_begin; + path_chunk.first.second = path_chunk.first.first + path_to_length(path_chunk.second); + read_begin = path_chunk.first.second; + + // walk the reference path steps + split_ref_chunks.emplace_back(); + auto& ref_chunk = split_ref_chunks.back(); + ref_chunk.first = step; + for (size_t l = begin_idx + 1; l < end_idx; ++l) { + step = rev_strand ? graph->get_previous_step(step) : graph->get_next_step(step); + } + ref_chunk.second = step; + // set up the step for the next iteration + step = rev_strand ? 
graph->get_previous_step(step) : graph->get_next_step(step); +#ifdef debug_spliced_surject + cerr << "next split for chunk " << i << " as " << split_path_chunks.size() - 1 << ", consisting of " << endl; + cerr << "\t" << string(path_chunk.first.first, path_chunk.first.second) << endl; + cerr << "\t" << pb2json(path_chunk.second) << endl; + cerr << "\t" << graph->get_position_of_step(ref_chunk.first) << " : " << graph->get_position_of_step(ref_chunk.second) << endl; +#endif + } + j += n; + added_before[i] += n; + } + else { +#ifdef debug_spliced_surject + cerr << "no splits on chunk " << i << ", add as " << split_path_chunks.size() << endl; + cerr << "\t" << string(path_chunks[i].first.first, path_chunks[i].first.second) << endl; + cerr << "\t" << pb2json(path_chunks[i].second) << endl; + cerr << "\t" << graph->get_position_of_step(ref_chunks[i].first) << " : " << graph->get_position_of_step(ref_chunks[i].second) << endl; +#endif + split_path_chunks.emplace_back(move(path_chunks[i])); + split_ref_chunks.emplace_back(move(ref_chunks[i])); + + } + } + + // replace the original path chunks and ref chunks with the split ones + path_chunks = move(split_path_chunks); + ref_chunks = move(split_ref_chunks); + + // and update the indexes of the connections + for (auto& connection : connections) { + // edges out should be updated for the splits added in that iteration because + // the come out of the last split segment + get<0>(connection) += added_before[get<0>(connection)]; + // edges in should only be updated for the splits that happened in earlier + // iterations (also, these should never be in index 0, would violate + // colinearity) + get<1>(connection) += added_before[get<1>(connection) - 1]; + } + } + } + + void Surjector::downsample_chunks(const string& src_sequence, + vector& path_chunks, + vector>& ref_chunks, + vector>& connections) const { + int64_t total_cov = 0; + for (const auto& chunk : path_chunks) { + total_cov += chunk.first.second - chunk.first.first; + } + + if (total_cov < min_fold_coverage_for_downsample * src_sequence.size()) { +#ifdef debug_spliced_surject + cerr << "average chunk coverage of " << double(total_cov) / src_sequence.size() << " is lower than downsample limit " << min_fold_coverage_for_downsample << endl; +#endif + return; + } + +#ifdef debug_spliced_surject + cerr << "attempt to downsample chunks to reduce coverage to " << downsample_coverage << endl; +#endif + + // there might be a cleverer sweep line algorithm for this, but we'd still need + // something like a dynamic range max query for the removal stage... 
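+ // for now, take the simple approach: tally per-base read coverage across all chunks, then
+ // consider chunks from shortest to longest and drop any chunk that is not involved in a
+ // connection, is not a pure deletion, and whose entire read interval would still be covered
+ // at or above the downsample target after removing it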
+ vector coverage(src_sequence.size(), 0); + for (auto& chunk : path_chunks) { + for (int64_t i = chunk.first.first - src_sequence.begin(), n = chunk.first.second - src_sequence.begin(); i < n; ++i) { + ++coverage[i]; + } + } + unordered_set connected; + for (const auto& connection : connections) { + connected.insert(get<0>(connection)); + connected.insert(get<1>(connection)); + } + + // sort so that we remove short anchors first + auto index = range_vector(path_chunks.size()); + stable_sort(index.begin(), index.end(), + [&](size_t i, size_t j) { + const auto& range1 = path_chunks[i].first; + const auto& range2 = path_chunks[j].first; + return range1.second - range1.first < range2.second - range2.first; + }); + + unordered_set to_remove; + for (auto i : index) { + auto& range = path_chunks[i].first; + if (connected.count(i) || range.second == range.first) { + // we want to preserve connections, and pure deletions are sometimes important + // for anchoring + continue; + } + int min_cov = std::numeric_limits::max(); + for (int64_t j = range.first - src_sequence.begin(), n = range.second - src_sequence.begin(); j < n; ++j) { + min_cov = min(min_cov, coverage[j]); + } + if (min_cov > downsample_coverage) { + // we can remove this one without blowing our target coverage + to_remove.insert(i); + for (int64_t j = range.first - src_sequence.begin(), n = range.second - src_sequence.begin(); j < n; ++j) { + --coverage[j]; + } + } + } + if (!to_remove.empty()) { +#ifdef debug_spliced_surject + cerr << "removing " << to_remove.size() << " chunks" << endl; +#endif + + vector removed_so_far(path_chunks.size() + 1, 0); + for (size_t i = 0; i < path_chunks.size(); ++i) { + if (to_remove.count(i)) { +#ifdef debug_spliced_surject + cerr << "removing chunk " << i << ": " << string(path_chunks[i].first.first, path_chunks[i].first.second) << endl; +#endif + removed_so_far[i + 1] = removed_so_far[i] + 1; + } + else { + if (removed_so_far[i]) { + path_chunks[i - removed_so_far[i]] = move(path_chunks[i]); + ref_chunks[i - removed_so_far[i]] = move(ref_chunks[i]); + } + removed_so_far[i + 1] = removed_so_far[i]; + } + } + path_chunks.resize(path_chunks.size() - to_remove.size()); + ref_chunks.resize(ref_chunks.size() - to_remove.size()); + + for (auto& connection : connections) { + get<0>(connection) -= removed_so_far[get<0>(connection)]; + get<1>(connection) -= removed_so_far[get<1>(connection)]; + } + } + } + + multipath_alignment_t Surjector::spliced_surject(const PathPositionHandleGraph* path_position_graph, + const string& src_sequence, const string& src_quality, + const int32_t src_mapping_quality, + const path_handle_t& path_handle, bool rev_strand, + vector& path_chunks, + vector>& ref_chunks, + vector>& connections, + pair& path_range_out, + bool allow_negative_scores, bool deletions_as_splices) const { + +#ifdef debug_spliced_surject + cerr << "doing spliced/multipath surject on path " << graph->get_path_name(path_handle) << endl; +#endif + + assert(path_chunks.size() == ref_chunks.size()); + + function path_distance = [&](size_t i, size_t j) { + const auto& final_mapping = *path_chunks[i].second.mapping().rbegin(); + int64_t dist = (path_chunks[j].second.mapping(0).position().offset() + - final_mapping.position().offset() + - mapping_from_length(final_mapping)); + if (rev_strand) { + dist += (graph->get_position_of_step(ref_chunks[i].second) + + graph->get_length(graph->get_handle_of_step(ref_chunks[i].second)) + - graph->get_position_of_step(ref_chunks[j].first) + - 
graph->get_length(graph->get_handle_of_step(ref_chunks[j].first))); + } + else { + dist += (graph->get_position_of_step(ref_chunks[j].first) + - graph->get_position_of_step(ref_chunks[i].second)); + } + return dist; + }; + + multipath_alignment_t surjected; + + // checks whether the end of i is connected to the beginning of j by an edge + + +#ifdef debug_spliced_surject + cerr << "removing any pure insertion path chunks" << endl; +#endif + + vector insertions_removed(path_chunks.size() + 1, 0); + for (size_t i = 0; i < path_chunks.size(); ++i) { + const auto& chunk = path_chunks[i].second; + bool has_aligned_bases = false; + for (size_t j = 0; j < chunk.mapping_size() && !has_aligned_bases; ++j) { + const auto& mapping = chunk.mapping(j); + for (size_t k = 0; k < mapping.edit_size() && !has_aligned_bases; ++k) { + has_aligned_bases = mapping.edit(k).from_length() != 0; + } + } + if (!has_aligned_bases) { + insertions_removed[i + 1] = insertions_removed[i] + 1; + } + else { + insertions_removed[i + 1] = insertions_removed[i]; + if (insertions_removed[i]) { + path_chunks[i - insertions_removed[i]] = move(path_chunks[i]); + ref_chunks[i - insertions_removed[i]] = move(ref_chunks[i]); + } + } + } + + if (insertions_removed.back()) { + path_chunks.resize(path_chunks.size() - insertions_removed.back()); + ref_chunks.resize(path_chunks.size()); + + // update connections with the new indexes + size_t connections_removed = 0; + for (size_t i = 0; i < connections.size(); ++i) { + auto& connection = connections[i]; + if (insertions_removed[get<0>(connection)] != insertions_removed[get<0>(connection) + 1] + || insertions_removed[get<1>(connection)] != insertions_removed[get<1>(connection) + 1]) { + ++connections_removed; + } + else { + get<0>(connection) -= insertions_removed[get<0>(connection)]; + get<1>(connection) -= insertions_removed[get<1>(connection)]; + connections[i - connections_removed] = connection; + } + } + + connections.resize(connections.size() - connections_removed); + } + +#ifdef debug_spliced_surject + cerr << "removed " << insertions_removed.back() << " chunks" << endl; +#endif + + if (path_chunks.size() == 1 + && path_chunks.front().first.first == src_sequence.begin() + && path_chunks.front().first.second == src_sequence.end()) { + + // this is an unambiguous surjection, we can skip the hole process + + // ugly: we can save a little work by skipping these, since they get copied + // over in the calling environment anyway + //surjected.set_sequence(src_sequence); + //surjected.set_quality(src_quality); + surjected.set_mapping_quality(src_mapping_quality); + + auto surj_subpath = surjected.add_subpath(); + from_proto_path(path_chunks.front().second, *surj_subpath->mutable_path()); + + Alignment aln; + aln.set_sequence(src_sequence); + aln.set_quality(src_quality); + *aln.mutable_path() = move(path_chunks.front().second); + surj_subpath->set_score(get_aligner(!src_quality.empty())->score_contiguous_alignment(aln)); + + surjected.add_start(0); + + path_range_out = ref_chunks.front(); + +#ifdef debug_spliced_surject + cerr << "surjection is unambiguous, skipping algorithm:" << endl; + cerr << debug_string(surjected) << endl; +#endif + + return surjected; + } + +#ifdef debug_spliced_surject + cerr << "checking for need to downsample chunks" << endl; +#endif + + downsample_chunks(src_sequence, path_chunks, ref_chunks, connections); + +#ifdef debug_spliced_surject + cerr << "checking for need to cut anchors" << endl; +#endif + + cut_anchors(rev_strand, path_chunks, ref_chunks, 
connections); + + +#ifdef debug_spliced_surject + cerr << "making colinearity graph for " << path_chunks.size() << " path chunks" << endl; +#endif + vector> colinear_adj(path_chunks.size()); + + for (size_t i = 0; i < path_chunks.size(); ++i) { + for (size_t j = i + 1; j < path_chunks.size(); ++j) { + if (path_chunks[i].first.second <= path_chunks[j].first.first + && path_distance(i, j) >= 0) { + // the second one is further along both the read and the path, so it is colinear + colinear_adj[i].push_back(j); + } + } + } + + // TODO: use tricks from multipath alignment graph to make a smaller initial chunk graph + +#ifdef debug_spliced_surject + cerr << "initial graph:" << endl; + for (size_t i = 0; i < colinear_adj.size(); ++i) { + cerr << i << ":"; + for (auto j : colinear_adj[i]) { + cerr << " " << j; + } + cerr << endl; + } + cerr << "connections:" << endl; + for (const auto& connection : connections) { + cerr << get<0>(connection) << " -> " << get<1>(connection) << ", " << get<2>(connection) << endl; + } + + cerr << "computing transitive reduction" << endl; +#endif + + // remove transitive edges + vector> colinear_adj_red = transitive_reduction(colinear_adj); + +#ifdef debug_spliced_surject + cerr << "reduced graph:" << endl; + for (size_t i = 0; i < colinear_adj_red.size(); ++i) { + cerr << i << ":"; + for (auto j : colinear_adj_red[i]) { + cerr << " " << j; + } + cerr << endl; + } + + cerr << "removing dominated path chunks" << endl; +#endif + + colinear_adj_red = remove_dominated_chunks(src_sequence, colinear_adj_red, path_chunks, ref_chunks, connections); + +#ifdef debug_spliced_surject + cerr << "with dominated chunks removed:" << endl; + for (size_t i = 0; i < colinear_adj_red.size(); ++i) { + cerr << i << ":"; + for (auto j : colinear_adj_red[i]) { + cerr << " " << j; + } + cerr << endl; + } + cerr << "connections:" << endl; + for (const auto& connection : connections) { + cerr << get<0>(connection) << " -> " << get<1>(connection) << ", " << get<2>(connection) << endl; + } +#endif + + // records of (to idx, score, is a connection) + vector>> splice_edges(path_chunks.size()); + + vector has_inward_connection(path_chunks.size(), false); + + if (!connections.empty()) { + + // clear outward edges for chunks that send connections, and record + // the scored edge + +#ifdef debug_spliced_surject + cerr << "handling any connections" << endl; +#endif + + unordered_set> connection_set; + for (const auto& connection : connections) { + connection_set.emplace(get<0>(connection), get<1>(connection)); + } + + for (const auto& connection : connections) { + splice_edges[get<0>(connection)].emplace_back(get<1>(connection), get<2>(connection), true); + has_inward_connection[get<1>(connection)] = true; + // move direct adjacency edges out of this node to the splice edges (unless they correspond to the + // the connection itself). + for (auto target : colinear_adj_red[get<0>(connection)]) { + if (!connection_set.count(make_pair(get<0>(connection), target)) + && path_chunks[get<0>(connection)].first.second == path_chunks[target].first.first + && path_distance(get<0>(connection), target) == 0) { + // TODO: why do we find directly abutting connections in the first place? 
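+ // keep the abutting adjacency as a zero-scored, non-connection splice edge so that the two
+ // chunks can still be joined after this node's colinearity edges are cleared below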

+ splice_edges[get<0>(connection)].emplace_back(target, 0, false);
+ }
+ }
+ colinear_adj_red[get<0>(connection)].clear();
+ }
+ 
+ 
+ // move exactly abutting inward edges for path chunks that receive connections into the splice edges
+ // and clear the rest
+ for (auto& adj : colinear_adj_red) {
+ for (size_t i = 0; i < adj.size();) {
+ if (has_inward_connection[adj[i]]) {
+ if (!connection_set.count(make_pair(i, adj[i]))
+ && path_chunks[i].first.second == path_chunks[adj[i]].first.first
+ && path_distance(i, adj[i]) == 0) {
+ splice_edges[i].emplace_back(adj[i], 0, false);
+ }
+ adj[i] = adj.back();
+ adj.pop_back();
+ }
+ else {
+ ++i;
+ }
+ }
+ }
+ 
+#ifdef debug_spliced_surject
+ cerr << "after removing connections:" << endl;
+ for (size_t i = 0; i < colinear_adj_red.size(); ++i) {
+ cerr << i << ":";
+ for (auto j : colinear_adj_red[i]) {
+ cerr << " " << j;
+ }
+ cerr << endl;
+ }
+ cerr << "splice graph:" << endl;
+ for (size_t i = 0; i < splice_edges.size(); ++i) {
+ cerr << i << ":";
+ for (auto edge : splice_edges[i]) {
+ cerr << " (" << get<0>(edge) << ", " << get<1>(edge) << ", " << get<2>(edge) << ")";
+ }
+ cerr << endl;
+ }
+#endif
+ }
+ 
+ 
+ if (deletions_as_splices) {
+ 
+ // look for constrictions and move them into the splice edges iteratively
+ // (some edges that are not originally constrictions can become constrictions
+ // once other constriction edges are removed, which separates the component)
+ 
+ bool removed_edges = true;
+ while (removed_edges) {
+ removed_edges = false;
+ 
+#ifdef debug_spliced_surject
+ cerr << "finding constrictions" << endl;
+#endif
+ 
+ // find bicliques that constrict the colinearity graph
+ auto constrictions = find_constriction_bicliques(colinear_adj_red, src_sequence,
+ src_quality, path_chunks,
+ ref_chunks, connections);
+ 
+#ifdef debug_spliced_surject
+ cerr << "found " << constrictions.size() << " constriction bicliques:" << endl;
+ for (auto& constriction : constrictions) {
+ cerr << "left:" << endl;
+ for (auto i : constriction.first) {
+ cerr << "\t" << i << endl;
+ }
+ cerr << "right:" << endl;
+ for (auto i : constriction.second) {
+ cerr << "\t" << i << endl;
+ }
+ }
+#endif
+ 
+ // if any constrictions correspond to pure deletions, remove them from the colinearity
+ // graph and record them as splice edges
+ 
+ for (const auto& constriction : constrictions) {
+ 
+ vector<tuple<size_t, size_t, int64_t>> new_edges;
+ bool includes_splice = false;
+ for (auto i : constriction.first) {
+ if (colinear_adj_red[i].empty()) {
+ // the edges have been cleared when incorporating a connection
+ continue;
+ }
+ for (auto j : constriction.second) {
+ if (has_inward_connection[j]) {
+ // backward edges have been removed
+ continue;
+ }
+ int64_t dist = path_distance(i, j);
+ int64_t score;
+ if (dist >= min_splice_length) {
+ includes_splice = true;
+ score = 0;
+ }
+ else {
+ score = get_aligner(!src_quality.empty())->score_gap(dist);
+ }
+ 
+#ifdef debug_spliced_surject
+ cerr << "deletion of length " << dist << " from " << i << " to " << j << " is recorded as part of a splice biclique, and given score " << score << endl;
+#endif
+ 
+ new_edges.emplace_back(i, j, score);
+ }
+ }
+ if (includes_splice) {
+ removed_edges = true;
+ // remove the colinearity edges
+ for (auto i : constriction.first) {
+ colinear_adj_red[i].clear();
+ }
+ // transfer them to splice edges
+ for (const auto& edge : new_edges) {
+ splice_edges[get<0>(edge)].emplace_back(get<1>(edge), get<2>(edge), false);
+ }
+ }
+#ifdef debug_spliced_surject
+ else {
+ cerr << "didn't actually find any splice edges, not 
moving edges to the splice graph" << endl; + } +#endif + } + + +#ifdef debug_spliced_surject + cerr << "after removing long constriction deletions:" << endl; + for (size_t i = 0; i < colinear_adj_red.size(); ++i) { + cerr << i << ":"; + for (auto j : colinear_adj_red[i]) { + cerr << " " << j; + } + cerr << endl; + } + cerr << "splice graph:" << endl; + for (size_t i = 0; i < splice_edges.size(); ++i) { + cerr << i << ":"; + for (auto edge : splice_edges[i]) { + cerr << " (" << get<0>(edge) << ", " << get<1>(edge) << ", " << get<2>(edge) << ")"; + } + cerr << endl; + } +#endif + + } + } + +#ifdef debug_spliced_surject + cerr << "computing constriction components" << endl; +#endif + + // find the connected components in the graph with the splice edges removed + size_t num_comps = 0; + vector constriction_comps = connected_components(colinear_adj_red, + reverse_adjacencies(colinear_adj_red), + &num_comps); + vector> comp_groups(num_comps); + for (size_t i = 0; i < constriction_comps.size(); ++i) { + comp_groups[constriction_comps[i]].push_back(i); + } + +#ifdef debug_spliced_surject + for (size_t i = 0; i < comp_groups.size(); ++i) { + cerr << "group " << i << ":"; + for (auto j : comp_groups[i]) { + cerr << " " << j; + } + cerr << endl; + } +#endif + + prune_unconnectable(colinear_adj_red, splice_edges, constriction_comps, comp_groups, + path_chunks, ref_chunks); + +#ifdef debug_spliced_surject + cerr << "groups after pruning unconnectable" << endl; + for (size_t i = 0; i < comp_groups.size(); ++i) { + cerr << "group " << i << ":"; + for (auto j : comp_groups[i]) { + cerr << " " << j; + } + cerr << endl; + } +#endif + + + // convert the splice edges into edges between the components and identify sources/sinks + vector comp_is_source(comp_groups.size(), true); + vector>> comp_group_edges(comp_groups.size()); + for (size_t i = 0; i < splice_edges.size(); ++i) { + for (auto& edge : splice_edges[i]) { + if (constriction_comps[i] < constriction_comps[get<0>(edge)]) { + comp_group_edges[constriction_comps[i]].emplace_back(constriction_comps[get<0>(edge)], + get<1>(edge), get<2>(edge)); + comp_is_source[constriction_comps[get<0>(edge)]] = false; + } + } + } + + +#ifdef debug_spliced_surject + cerr << "component group edges:" << endl; + for (size_t i = 0; i < comp_group_edges.size(); ++i) { + cerr << i << ":"; + for (auto edge : comp_group_edges[i]) { + cerr << " (" << get<0>(edge) << ", " << get<1>(edge) << ", " << get<2>(edge) << ")"; + } + cerr << endl; + } +#endif + +#ifdef debug_spliced_surject + cerr << "surjecting " << comp_groups.size() << " constriction sections" << endl; +#endif + + vector> section_path_ranges; + vector sections; + + // adjacent values are the range of copies of the section + vector copy_range(comp_groups.size() + 1, 0); + // the index of the component group for each copy + vector original_copy; + original_copy.reserve(comp_groups.size()); + + for (size_t i = 0; i < comp_groups.size(); ++i) { + pair read_range; + vector section_path_chunks; + vector> section_ref_chunks; + + vector& group = comp_groups[i]; + + // the other end points are determine by how the portion of the + for (size_t j = 0; j < group.size(); ++j) { + + section_path_chunks.push_back(path_chunks[group[j]]); + section_ref_chunks.push_back(ref_chunks[group[j]]); + + if (j == 0 || read_range.first > path_chunks[group[j]].first.first) { + read_range.first = path_chunks[group[j]].first.first; + } + if (j == 0 || read_range.second < path_chunks[group[j]].first.second) { + read_range.second = 
path_chunks[group[j]].first.second; + } + } + + // sources/sinks align all the way to the end + if (comp_is_source[i]) { + read_range.first = src_sequence.begin(); + } + if (comp_group_edges[i].empty()) { + read_range.second = src_sequence.end(); + } + + // make a dummy alignment with the relevant portion of the sequence + Alignment section_source; + *section_source.mutable_sequence() = string(read_range.first, read_range.second); + if (!src_quality.empty()) { + *section_source.mutable_quality() = string(src_quality.begin() + (read_range.first - src_sequence.begin()), + src_quality.begin() + (read_range.second - src_sequence.begin())); + } +#if defined(debug_always_warn_on_too_long) || defined(debug_validate_anchored_multipath_alignment) + // give it the full sequence as a name so we can see it later + section_source.set_name(src_sequence); +#endif + + // update the path chunk ranges to point into the dummy section read + for (size_t j = 0; j < section_path_chunks.size(); ++j) { + auto& chunk_range = section_path_chunks[j].first; + chunk_range.first = section_source.sequence().begin() + (chunk_range.first - read_range.first); + chunk_range.second = section_source.sequence().begin() + (chunk_range.second - read_range.first); + } + +#ifdef debug_spliced_surject + cerr << "surjecting section " << i << ": " << pb2json(section_source) << endl; + cerr << "consists of " << section_path_chunks.size() << " path chunks" << endl; +#endif + + // perform a full length surjection within the section + section_path_ranges.emplace_back(); + vector> all_path_ranges; + sections.emplace_back(realigning_surject(graph, + section_source, + path_handle, + rev_strand, + section_path_chunks, + section_ref_chunks, + section_path_ranges.back(), + true, // allow negative scores (global) + true, // preserve N alignments + !comp_group_edges[i].empty(), // sinks are anchors + !comp_is_source[i], // sources are anchors + &all_path_ranges)); + original_copy.push_back(i); + // if there are multiple copies of the section on the path, add those as well + for (size_t j = 1; j < all_path_ranges.size(); ++j) { + sections.emplace_back(sections.back()); + section_path_ranges.emplace_back(all_path_ranges[j]); + original_copy.push_back(i); + } + copy_range[i + 1] = copy_range[i] + max(all_path_ranges.size(), 1); + +#ifdef debug_spliced_surject + cerr << "found " << all_path_ranges.size() << " path locations, recording in section copy interval " << copy_range[i] << ":" << copy_range[i + 1] << endl; +#endif + + // remove any extraneous full length bonuses + // TODO: technically, this can give a non-optimal alignment because it's post hoc to the dynamic programming + const auto& aligner = *get_aligner(!src_quality.empty()); + for (size_t j = copy_range[i], n = copy_range[i + 1]; j < n; ++j) { + if (sections[j].path().mapping_size() != 0) { + if (read_range.first != src_sequence.begin()) { + if (sections[j].path().mapping(0).edit(0).from_length() > 0) { + sections[j].set_score(sections[j].score() + - aligner.score_full_length_bonus(true, section_source)); + } + } + if (read_range.second != src_sequence.end()) { + const Mapping& m = sections[j].path().mapping(0); + if (m.edit(m.edit_size() - 1).from_length() > 0) { + sections[j].set_score(sections[j].score() + - aligner.score_full_length_bonus(false, section_source)); + } + } + } + } + +#ifdef debug_spliced_surject + cerr << "surjected section " << i << " after score adjustment: " << pb2json(sections.back()) << endl; + if (sections.back().path().mapping_size() == 0) { + cerr << "null 
 alignment has no path range" << endl;
+ }
+ else {
+ for (auto path_range : all_path_ranges) {
+ cerr << "path range: " << graph->get_id(graph->get_handle_of_step(path_range.first)) << " " << graph->get_is_reverse(graph->get_handle_of_step(path_range.first)) << " " << graph->get_position_of_step(path_range.first) << " : " << graph->get_id(graph->get_handle_of_step(path_range.second)) << " " << graph->get_is_reverse(graph->get_handle_of_step(path_range.second)) << " " << graph->get_position_of_step(path_range.second) << endl;
+ }
+ }
+#endif
+ }
+ 
+ // distance between the path ranges of two sections
+ // assumes direct adjacency over an edge, but this may not be true in the case of a connection
+ // TODO: repetitive with path_distance
+ auto section_path_dist = [&](size_t i, size_t j) -> int64_t {
+ pos_t pos1 = final_position(sections[i].path());
+ pos_t pos2 = initial_position(sections[j].path());
+ step_handle_t step1 = section_path_ranges[i].second;
+ step_handle_t step2 = section_path_ranges[j].first;
+ if (rev_strand) {
+ return (graph->get_position_of_step(step1)
+ + graph->get_length(graph->get_handle_of_step(step1))
+ - graph->get_position_of_step(step2)
+ - graph->get_length(graph->get_handle_of_step(step2))
+ + offset(pos2)
+ - offset(pos1));
+ }
+ else {
+ return (graph->get_position_of_step(step2)
+ - graph->get_position_of_step(step1)
+ + offset(pos2)
+ - offset(pos1));
+ }
+ };
+ 
+#ifdef debug_spliced_surject
+ cerr << "computing optimal combination of sections over section graph" << endl;
+ 
+ cerr << "copy range array:" << endl;
+ for (auto i : copy_range) {
+ cerr << "\t" << i << endl;
+ }
+ cerr << "original copy array:" << endl;
+ for (auto i : original_copy) {
+ cerr << "\t" << i << endl;
+ }
+ cerr << "graph structure:" << endl;
+ for (size_t i = 0; i < sections.size(); ++i) {
+ cerr << i << " (original " << original_copy[i];
+ if (sections[i].path().mapping_size() != 0) {
+ cerr << " at " << graph->get_position_of_step(section_path_ranges[i].first) << " - " << graph->get_position_of_step(section_path_ranges[i].second);
+ }
+ cerr << "):" << endl;
+ if (sections[i].path().mapping_size() == 0) {
+ continue;
+ }
+ 
+ for (auto& edge : comp_group_edges[original_copy[i]]) {
+ 
+ for (size_t j = copy_range[get<0>(edge)], n = copy_range[get<0>(edge) + 1]; j < n; ++j) {
+ if (sections[j].path().mapping_size() == 0) {
+ cerr << "\tX " << j << " (null section)" << endl;
+ continue;
+ }
+ auto d = section_path_dist(i, j);
+ if (d >= 0) {
+ cerr << "\t-> " << j << ", dist " << d << ", score " << get<1>(edge) << ", connection? " << get<2>(edge) << endl;
+ }
+ else {
+ cerr << "\tX " << j << " (noncolinear, dist " << d << ")" << endl;
+ }
+ }
+ }
+ }
+#endif
+ 
+ // now we use dynamic programming to find the best alignment across chunks
+ 
+ // pairs of (section index, edge index)
+ vector<pair<int64_t, int64_t>> backpointer(sections.size(), pair<int64_t, int64_t>(-1, -1));
+ vector<int32_t> score_dp(sections.size(), numeric_limits<int32_t>::min());
+ 
+ // initialize the scores at sources or at any section if we're doing subpath
+ // local alignments (i.e. not allowing negative scores)
+ for (size_t i = 0; i < sections.size(); ++i) {
+ if (!allow_negative_scores || comp_is_source[i]) {
+ score_dp[i] = sections[i].score();
+ }
+ }
+ 
+ // do the dynamic programming
+ for (size_t i = 0; i < comp_groups.size(); ++i) {
+ 
+ auto& edges = comp_group_edges[i];
+ 
+ for (size_t l = 0; l < edges.size(); ++l) {
+ 
+ auto& edge = edges[l];
+ 
+ for (size_t j = copy_range[i], m = copy_range[i + 1]; j < m; ++j) {
+ 
+ if (sections[j].path().mapping_size() == 0) {
+ // this section failed to project
+ continue;
+ }
+ 
+ for (size_t k = copy_range[get<0>(edge)], n = copy_range[get<0>(edge) + 1]; k < n; ++k) {
+ 
+ if (sections[k].path().mapping_size() == 0) {
+ // this section failed to project
+ continue;
+ }
+ 
+ int32_t extended_score = score_dp[j] + get<1>(edge) + sections[k].score();
+ 
+ int64_t dist = section_path_dist(j, k);
+#ifdef debug_spliced_surject
+ cerr << "extending from component " << i << " section copy index " << j << " (DP score " << score_dp[j] << ") with score of " << extended_score << " to component " << get<0>(edge) << " section copy index " << k << " (DP score " << score_dp[k] << ") dist " << dist << endl;
+#endif
+ if (dist < 0) {
+#ifdef debug_spliced_surject
+ cerr << "the sections are not colinear along the path" << endl;
+#endif
+ continue;
+ }
+ 
+ if (extended_score > score_dp[k]) {
+ score_dp[k] = extended_score;
+ backpointer[k].first = j;
+ backpointer[k].second = l;
+ }
+ else if (extended_score == score_dp[k]
+ && sections[j].path().mapping_size() != 0
+ && sections[k].path().mapping_size() != 0
+ && backpointer[k].first >= 0
+ && dist < section_path_dist(backpointer[k].first, k)) {
+ // break ties in favor of the closer exon
+ backpointer[k].first = j;
+ backpointer[k].second = l;
+ }
+ 
+ }
+ }
+ }
+ }
+ 
+#ifdef debug_spliced_surject
+ cerr << "backpointers after DP:" << endl;
+ for (size_t i = 0; i < backpointer.size(); ++i) {
+ cerr << i << ": " << backpointer[i].first << " " << backpointer[i].second << endl;
+ }
+#endif
+ 
+ // find the maximum, subject to full length requirements
+ vector<int64_t> traceback(1, -1);
+ int32_t max_score = numeric_limits<int32_t>::min();
+ for (size_t i = 0; i < score_dp.size(); ++i) {
+ 
+ if (score_dp[i] > max_score && (!allow_negative_scores || comp_group_edges[original_copy[i]].empty())) {
+ max_score = score_dp[i];
+ traceback[0] = i;
+ }
+ else if (score_dp[i] == max_score
+ && backpointer[i].first != -1
+ && traceback[0] != -1
+ && backpointer[traceback[0]].first != -1
+ && (!allow_negative_scores || comp_group_edges[original_copy[i]].empty())
+ && section_path_dist(backpointer[i].first, i) < section_path_dist(backpointer[traceback[0]].first, traceback[0])) {
+ // break ties in favor of the exon with the closer connection
+ traceback[0] = i;
+ }
+ }
+ 
+ if (traceback.front() != -1) {
+ 
+ // follow the back pointers
+ while (backpointer[traceback.back()].first != -1) {
+ traceback.push_back(backpointer[traceback.back()].first);
+ }
+ 
+#ifdef debug_spliced_surject
+ cerr << "combining " << traceback.size() << " sections into surjected alignment" << endl;
+ for (int64_t i = traceback.size() - 1; i >= 0; --i) {
+ cerr << "\t" << traceback[i] << endl;
+ }
+#endif
+ path_range_out.first = section_path_ranges[traceback.back()].first;
+ path_range_out.second = section_path_ranges[traceback.front()].second;
+ }
+ else {
+#ifdef debug_spliced_surject
+ cerr << "traceback is empty" << endl;
+#endif
+ 
+ // sentinel for unmapped
+ path_range_out.first = path_range_out.second = graph->path_end(path_handle);
+ traceback.clear();
+ 
} + + // make an alignment to build out the path in + surjected.set_sequence(src_sequence); + surjected.set_quality(src_quality); + surjected.set_mapping_quality(src_mapping_quality); + + subpath_t* prev_subpath = nullptr; + for (int64_t i = traceback.size() - 1; i >= 0; --i) { + + size_t section_idx = traceback[i]; + const Path& copy_path = sections[section_idx].path(); + + if (copy_path.mapping_size() == 0) { + // the DP chose a segment that was unsurjectable + path_range_out.first = path_range_out.second = graph->path_end(path_handle); + surjected = make_null_mp_alignment(src_sequence, src_quality); + return surjected; + } +#ifdef debug_spliced_surject + cerr << "appending path section " << pb2json(copy_path) << endl; +#endif + if (copy_path.mapping_size() == 0) { + // this happens if the surjected section is a pure deletion, we can just skip it + continue; + } + + if (i != traceback.size() - 1) { + // make an edge back to the previous section + + // get the edge between these sections that the DP used + size_t prev_idx = traceback[i + 1]; + auto& edge = comp_group_edges[original_copy[prev_idx]][backpointer[section_idx].second]; + + if (get<2>(edge)) { + // this is from a connection + auto connection = prev_subpath->add_connection(); + connection->set_next(surjected.subpath_size()); + connection->set_score(get<1>(edge)); + } + else { + // this is from a constriction or a preserved edge around a connection + prev_subpath->add_next(surjected.subpath_size()); + } + } + // TODO: merge adjacent mappings across subpaths + + // make a new subpath to hold the this path section + auto surj_subpath = surjected.add_subpath(); + surj_subpath->set_score(sections[section_idx].score()); + from_proto_path(copy_path, *surj_subpath->mutable_path()); + + prev_subpath = surj_subpath; + } + + // since the mp aln is a non-branching path, this is always the only start + if (surjected.subpath_size() != 0) { + surjected.add_start(0); + } + +#ifdef debug_spliced_surject + cerr << "final spliced surjection " << debug_string(surjected) << endl; +#endif + + return surjected; + } + + Alignment Surjector::realigning_surject(const PathPositionHandleGraph* path_position_graph, const Alignment& source, + const path_handle_t& path_handle, bool rev_strand, const vector& path_chunks, + const vector>& ref_chunks, + pair& path_range_out, bool allow_negative_scores, + bool preserve_N_alignments, bool sinks_are_anchors, bool sources_are_anchors, + vector>* all_path_ranges_out) const { + +#ifdef debug_anchored_surject + cerr << "using overlap chunks on path " << graph->get_path_name(path_handle) << " strand " << rev_strand << ", performing realigning surjection" << endl; + cerr << "chunks:" << endl; + for (size_t i = 0; i < path_chunks.size(); ++i) { + cerr << "\t" << string(path_chunks[i].first.first, path_chunks[i].first.second) << ", " << pb2json(path_chunks[i].second) << endl; + } +#endif + + // the alignment we will fill out + Alignment surjected; + + // find the end-inclusive interval of the ref path we need to consider + pair ref_path_interval = compute_path_interval(path_position_graph, source, + path_handle, rev_strand, + path_chunks, ref_chunks, + sources_are_anchors, sinks_are_anchors); + if (ref_path_interval.first <= ref_path_interval.second) { + // We actually got a nonempty range, so expand it. 
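+ // note: these bounds are 0-based offsets along the reference path and the interval is
+ // end-inclusive, so the checks below only widen it where a neighboring position exists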
+ + // having a buffer helps ensure that we get the correct anchoring position for some edge cases + // of a full deletion that occurs on a node boundary + if (ref_path_interval.first > 0) { + --ref_path_interval.first; + } + if (ref_path_interval.second + 1 < path_position_graph->get_path_length(path_handle)) { + ++ref_path_interval.second; + } + } + + if (path_chunks.size() == 0) { +#ifdef debug_anchored_surject + cerr << "no path chunks provided, surjecting as unmapped" << endl; +#endif + // Leave surjected path empty + } + else if (path_chunks.size() == 1 + && path_chunks.front().first.first == source.sequence().begin() + && path_chunks.front().first.second == source.sequence().end()) { +#ifdef debug_anchored_surject + cerr << "path chunk already constitutes a full alignment, skipping realignment" << endl; +#endif + + // just copy it over + surjected.set_sequence(source.sequence()); + surjected.set_quality(source.quality()); + *surjected.mutable_path() = path_chunks.front().second; + surjected.set_score(get_aligner(!source.quality().empty())->score_contiguous_alignment(surjected)); + + } + else { + // we're going to have to realign some portions + +#ifdef debug_anchored_surject + cerr << "final path interval is " << ref_path_interval.first << ":" << ref_path_interval.second << " on path of length " << path_position_graph->get_path_length(path_handle) << endl; +#endif + + // If we put in path chunks we need to have ended up with a + // nonempty path interval that they cover. + assert(ref_path_interval.first <= ref_path_interval.second); + + // get the path graph corresponding to this interval + bdsg::HashGraph path_graph; + unordered_map> node_trans = extract_linearized_path_graph(path_position_graph, &path_graph, path_handle, + ref_path_interval.first, ref_path_interval.second); + + // choose an orientation for the path graph + ReverseGraph rev_comp_path_graph(&path_graph, true); + HandleGraph* aln_graph; + if (rev_strand) { + // we align to the reverse strand of the path graph, and the translation chages accordingly + aln_graph = &rev_comp_path_graph; + for (pair>& translation : node_trans) { + translation.second.second = !translation.second.second; + } + } + else { + aln_graph = &path_graph; + } + +#ifdef debug_anchored_surject + cerr << "made split, linearized path graph with " << aln_graph->get_node_count() << " nodes" << endl; +#endif + + size_t subgraph_bases = aln_graph->get_total_length(); + if (subgraph_bases > max_subgraph_bases) { +#ifdef debug_always_warn_on_too_long + cerr << "gave up on too long read " + source.name() + "\n"; +#endif + if (!warned_about_subgraph_size.test_and_set()) { + cerr << "warning[vg::Surjector]: Refusing to perform very large alignment against " + << subgraph_bases << " bp strand split subgraph for read " << source.name() + << "; suppressing further warnings." 
<< endl; + } + return move(make_null_alignment(source)); + } + + // compute the connectivity between the path chunks + // TODO: i'm not sure if we actually need to preserve all indel anchors in either case, but i don't + // want to change too much about the anchoring logic at once while i'm switching from blanket preservation + // to a more targeted method + bool preserve_tail_indel_anchors = (sinks_are_anchors || sources_are_anchors); + MultipathAlignmentGraph mp_aln_graph(*aln_graph, path_chunks, source, node_trans, !preserve_N_alignments, + preserve_tail_indel_anchors); + +#ifdef debug_anchored_surject + size_t total_edges = mp_aln_graph.count_reachability_edges(); + cerr << "constructed reachability graph with " << total_edges << " edges" << endl; +#endif + + // we don't overlap this reference path at all or we filtered out all of the path chunks, so just make a sentinel + if (mp_aln_graph.empty()) { + return move(make_null_alignment(source)); + } + + // TODO: is this necessary in a linear graph? + vector topological_order; + mp_aln_graph.topological_sort(topological_order); + mp_aln_graph.remove_transitive_edges(topological_order); + + if (!sinks_are_anchors && !sources_are_anchors) { + // We are allowed to create new sources and sinks. + + // Drop material that looks implausible. + vector scratch(topological_order.size()); + mp_aln_graph.prune_to_high_scoring_paths(source, get_aligner(), 2.0, topological_order, scratch); + } + +#ifdef debug_anchored_surject + size_t after_prune_edges = mp_aln_graph.count_reachability_edges(); + cerr << "pruning for high scoring paths leaves " << after_prune_edges << " edges after removing " << (total_edges - after_prune_edges) << endl; + total_edges = after_prune_edges; +#endif + + if (allow_negative_scores && mp_aln_graph.max_shift() > min_shift_for_prune) { + // we have at least one or more large implied indels, we will try to prune them away + // while maintaining topological invariants to save compute on low-promise alignments + mp_aln_graph.prune_high_shift_edges(shift_prune_diff, sources_are_anchors, sinks_are_anchors); + +#ifdef debug_anchored_surject + size_t after_shift_edges = mp_aln_graph.count_reachability_edges(); + cerr << "pruning for shift leaves " << after_shift_edges << " edges after removing " << (total_edges - after_shift_edges) << endl; + total_edges = after_shift_edges; +#endif + } + + // align the intervening segments and store the result in a multipath alignment + multipath_alignment_t mp_aln; + mp_aln_graph.align(source, *aln_graph, get_aligner(), + false, // anchors as matches + 1, // max alt alns + false, // dynamic alt alns + numeric_limits::max(), // max gap + 0.0, // pessimistic tail gap multiplier + false, // simplify topologies + 0, // unmergeable len + 1, // band padding + mp_aln, // output + nullptr, // snarl manager + nullptr, // distance index + nullptr, // projector + allow_negative_scores); + + topologically_order_subpaths(mp_aln); + + if (preserve_tail_indel_anchors) { + // this code path sometimes produces subpaths that have no aligned bases, which + // sometimes play poorly with other parts of the code base + remove_empty_alignment_sections(mp_aln); + } + + for (size_t i = 0; i < mp_aln.subpath_size(); i++) { + // translate back into the original ID space + translate_oriented_node_ids(*mp_aln.mutable_subpath(i)->mutable_path(), node_trans); + } + + // identify the source subpaths (necessary for subpath-global optimal alignment algorithm) + identify_start_subpaths(mp_aln); + +#ifdef debug_anchored_surject + 
cerr << "made multipath alignment " << debug_string(mp_aln) << endl; +#endif + +#ifdef debug_validate_anchored_multipath_alignment + if (!validate_multipath_alignment(mp_aln, *graph)) { + cerr << "WARNING: multipath alignment for surjection of " << source.name() << " with sequence " << " failed to validate" << endl; + } +#endif + // concatenate the subpaths either locally or globally, depending on whether we're + // allowing negative scores + optimal_alignment(mp_aln, surjected, allow_negative_scores); + } + + const auto& surj_path = surjected.path(); + if (surj_path.mapping_size() > 0) { + // the surjection is mapped + +#ifdef debug_anchored_surject + cerr << "assigning a path range to surjected path: " << pb2json(surj_path) << endl; +#endif + size_t mappings_matched = 0; + + // look in either the forward or reverse orientation along the path + +#ifdef debug_anchored_surject + cerr << "looking for path range on " << (rev_strand ? "reverse" : "forward") << " strand, for " << surj_path.mapping_size() << " mappings" << endl; +#endif + step_handle_t step = rev_strand ? graph->get_step_at_position(path_handle, ref_path_interval.second) + : graph->get_step_at_position(path_handle, ref_path_interval.first); + step_handle_t end = rev_strand ? graph->get_previous_step(graph->get_step_at_position(path_handle, ref_path_interval.first)) + : graph->get_next_step(graph->get_step_at_position(path_handle, ref_path_interval.second)); + + // walk the identified interval + for (; step != end; step = rev_strand ? graph->get_previous_step(step) : graph->get_next_step(step)) { + const auto& pos = surj_path.mapping(mappings_matched).position(); + handle_t handle = graph->get_handle_of_step(step); + if (graph->get_id(handle) == pos.node_id() && + ((graph->get_is_reverse(handle) != pos.is_reverse()) == rev_strand)) { + // we found the next position we were expecting to + if (mappings_matched == 0) { + path_range_out.first = step; + } + path_range_out.second = step; + ++mappings_matched; +#ifdef debug_anchored_surject + cerr << "\tmatch at node " << graph->get_id(handle) << " " << graph->get_is_reverse(handle) << " at position " << graph->get_position_of_step(step) << endl; +#endif + if (mappings_matched == surj_path.mapping_size()) { +#ifdef debug_anchored_surject + cerr << "\t\tcompleted a match" << endl; +#endif + if (!all_path_ranges_out) { + // we are satisfied with one path range + break; + } + else { + // record it and reset the search + all_path_ranges_out->push_back(path_range_out); + // return as if you hadn't matched at the start of this potential match + mappings_matched = 0; + // and go back to where we started on the path + // TODO: this is potentially quadratic, there are faster algorithms + step = path_range_out.first; + } + } + } + else { + // we mismatched the path + if (mappings_matched) { + // return as if you hadn't matched at the start of this potential match + mappings_matched = 0; + // and go back to where we started on the path + // TODO: this is potentially quadratic, there are faster algorithms + step = path_range_out.first; + } +#ifdef debug_anchored_surject + cerr << "\tmismatch at node " << graph->get_id(handle) << " " << graph->get_is_reverse(handle) << " at position " << graph->get_position_of_step(step) << endl; +#endif + } + } + if (all_path_ranges_out) { + if (all_path_ranges_out->empty()) { + cerr << "error: couldn't identify a path range corresponding to surjected read " << source.name() + << " because there are no path ranges on path " << graph->get_path_name(path_handle) << 
endl; + cerr << "Surjected read dump: " << pb2json(surjected) << endl; + exit(1); + } + path_range_out = all_path_ranges_out->front(); + } + else if (mappings_matched != surj_path.mapping_size()) { + cerr << "error: couldn't identify a path range corresponding to surjected read " << source.name() + << " because " << mappings_matched << "/" << surj_path.mapping_size() << " mappings were matched on path " << graph->get_path_name(path_handle) << endl; + cerr << "Surjected read dump: " << pb2json(surjected) << endl; + exit(1); + } + } + else { + // sentinel to indicate that surjection is unmapped + path_range_out.first = path_range_out.second = graph->path_end(path_handle); + } + + // transfer applicable metadata (including data that doesn't transit through multipath_alignment_t) + surjected.set_name(source.name()); + surjected.set_read_group(source.read_group()); + surjected.set_sample_name(source.sample_name()); + surjected.set_mapping_quality(source.mapping_quality()); + if (source.has_fragment_next()) { + *surjected.mutable_fragment_next() = source.fragment_next(); + } + if (source.has_fragment_prev()) { + *surjected.mutable_fragment_prev() = source.fragment_prev(); + } + if (source.has_annotation()) { + *surjected.mutable_annotation() = source.annotation(); + } + +#ifdef debug_anchored_surject + cerr << "concatenated and translated alignment " << pb2json(surjected) << endl; +#endif + + return surjected; + } + + unordered_map, pair, vector>>> + Surjector::extract_overlapping_paths(const PathPositionHandleGraph* graph, + const multipath_alignment_t& source, + const unordered_set& surjection_paths, + unordered_map, vector>>& connections_out) const { + + unordered_map, pair, vector>>> to_return; + + // reverse the connection edges for easy backwards lookup + vector>> rev_connections(source.subpath_size()); + + // compute the start of the read interval that corresponds to each mapping + vector> mapping_to_lengths(source.subpath_size()); + for (int64_t i = 0; i < source.subpath_size(); ++i) { + mapping_to_lengths[i].resize(source.subpath(i).path().mapping_size(), 0); + for (const auto& connection : source.subpath(i).connection()) { + rev_connections[connection.next()].emplace_back(i, connection.score()); + } + } + for (int64_t i = 0; i < source.subpath_size(); ++i) { + const auto& subpath = source.subpath(i); + const auto& path = subpath.path(); + auto& subpath_to_length = mapping_to_lengths[i]; + int64_t thru_length = subpath_to_length.front(); + for (size_t j = 0; j < path.mapping_size(); ++j) { + thru_length += mapping_to_length(path.mapping(j)); + if (j + 1 < path.mapping_size()) { + subpath_to_length[j + 1] = thru_length; + } + } + for (auto n : subpath.next()) { + mapping_to_lengths[n][0] = thru_length; + } + for (auto c : subpath.connection()) { + mapping_to_lengths[c.next()][0] = thru_length; + } + } + + // map from (path, strand, subpath idx) to indexes among path chunks that have outgoing connections + unordered_map, vector> connection_sources; + + // the mappings (subpath, mapping, step) that have already been associated + unordered_set> associated; + for (int64_t i = 0; i < source.subpath_size(); ++i) { + const auto& path = source.subpath(i).path(); + int64_t prefix_to_length = 0; + for (int64_t j = 0; j < path.mapping_size(); ++j) { + const auto& mapping = path.mapping(j); + const auto& pos = mapping.position(); + handle_t handle = graph->get_handle(pos.node_id(), pos.is_reverse()); + graph->for_each_step_on_handle(handle, [&](const step_handle_t& step) { + + path_handle_t 
path_handle = graph->get_path_handle_of_step(step); + + if (!surjection_paths.count(path_handle) || associated.count(make_tuple(i, j, step))) { + // this is not on a path we're surjecting to, or we've already + // done it + return; + } + +#ifdef debug_multipath_surject + cerr << "starting new path DFS for subpath " << i << ", mapping " << j << ", path " << graph->get_path_name(path_handle) << ", step at " << graph->get_position_of_step(step) << endl; +#endif + + // do DFS starting from this mapping to find maximal path overlapping chunks + + // records of (subpath, mapping, next edge idx, step here) + // internal mappings are treated as having a single edge + vector> stack; + stack.emplace_back(i, j, 0, step); + bool added_new_mappings = true; + while (!stack.empty()) { + + int64_t s_idx, m_idx, n_idx; + step_handle_t step_here; + tie(s_idx, m_idx, n_idx, step_here) = stack.back(); +#ifdef debug_multipath_surject + cerr << "stack frame s " << s_idx << ", m " << m_idx << ", n " << n_idx << ", step at " << graph->get_position_of_step(step_here) << endl; +#endif + + const auto& subpath_here = source.subpath(s_idx); + const auto& path_here = subpath_here.path(); + + if ((m_idx + 1 < path_here.mapping_size() && n_idx != 0) + || (m_idx + 1 == path_here.mapping_size() && (n_idx == subpath_here.next_size() + || !subpath_here.connection().empty()))) { + // we've exhausted all of the outgoing adjacencies from this mapping +#ifdef debug_multipath_surject + cerr << "adjacencies exhausted or hit a connection" << endl; +#endif + if (added_new_mappings) { + + // a DFS traveresal has gone as far as possible, output the stack as a path + auto path_strand = make_pair(path_handle, handle != graph->get_handle_of_step(step)); + auto& section_record = to_return[path_strand]; + + if (m_idx + 1 == path_here.mapping_size() && !subpath_here.connection().empty()) { + // record that connections leave this patch chunk + connection_sources[make_tuple(path_handle, path_strand.second, s_idx)].push_back(section_record.first.size()); + } + + // the interval of steps + section_record.second.emplace_back(get<3>(stack.front()), get<3>(stack.back())); + + section_record.first.emplace_back(); + auto& chunk = section_record.first.back(); + + // the aligned path + auto& path_chunk = chunk.second; + for (const auto& record : stack) { + associated.emplace(get<0>(record), get<1>(record), get<3>(record)); + const auto& next_mapping = source.subpath(get<0>(record)).path().mapping(get<1>(record)); + const auto& next_pos = next_mapping.position(); + bool merged_mapping = false; + if (path_chunk.mapping_size() != 0) { + + auto prev_mapping = path_chunk.mutable_mapping(path_chunk.mapping_size() - 1); + const auto& prev_pos = prev_mapping->position(); + + if (next_pos.node_id() == prev_pos.node_id() && + next_pos.is_reverse() == prev_pos.is_reverse() && + next_pos.offset() == prev_pos.offset() + mapping_from_length(*prev_mapping)) { + // the next mapping is contiguous on a node with the previous one, we can merge + // the two mappings into one + + auto prev_edit = prev_mapping->mutable_edit(prev_mapping->edit_size() - 1); + const auto& next_edit = next_mapping.edit(0); + if ((prev_edit->from_length() != 0) == (next_edit.from_length() != 0) && + (prev_edit->to_length() != 0) == (next_edit.to_length() != 0) && + prev_edit->sequence().empty() == next_edit.sequence().empty()) { + + prev_edit->set_from_length(prev_edit->from_length() + next_edit.from_length()); + prev_edit->set_to_length(prev_edit->to_length() + next_edit.to_length()); + 
prev_edit->set_sequence(prev_edit->sequence() + next_edit.sequence()); + } + else { + to_proto_edit(next_edit, *prev_mapping->add_edit()); + } + for (size_t k = 1; k < next_mapping.edit_size(); ++k) { + to_proto_edit(next_mapping.edit(k), *prev_mapping->add_edit()); + } + + merged_mapping = true; + } + } + if (!merged_mapping) { + // make a new mapping + to_proto_mapping(next_mapping, *path_chunk.add_mapping()); + } + } + + if (j == 0) { + // translate connections into the indexes of their path chunks + // note: if these are on different strands, they'll be ignored + for (const auto& c : rev_connections[i]) { + for (auto source_idx : connection_sources[make_tuple(path_handle, path_strand.second, c.first)]) { + + // compute the distance along the path + pos_t pos1 = final_position(section_record.first[source_idx].second); + pos_t pos2 = initial_position(section_record.first.back().second); + step_handle_t step1 = section_record.second[source_idx].second; + step_handle_t step2 = section_record.second.back().first; + int64_t dist; + if (path_strand.second) { + // reverse strand of path + dist = (graph->get_position_of_step(step1) + + graph->get_length(graph->get_handle_of_step(step1)) + - graph->get_position_of_step(step2) + - graph->get_length(graph->get_handle_of_step(step2)) + + offset(pos2) + - offset(pos1)); + } + else { + // forward strand of path + dist = (graph->get_position_of_step(step2) + - graph->get_position_of_step(step1) + + offset(pos2) + - offset(pos1)); + } + + if (dist >= 0) { + connections_out[path_strand].emplace_back(source_idx, section_record.first.size() - 1, + c.second); + } + } + } + } + + + // the read interval + chunk.first.first = source.sequence().begin() + mapping_to_lengths[i][j]; + chunk.first.second = chunk.first.first + path_to_length(path_chunk); + + // remember that we've already emitted all the mappings currently on the stack + added_new_mappings = false; +#ifdef debug_multipath_surject + cerr << "converted stack into path " << pb2json(path_chunk) << endl; + cerr << "read interval is " << (chunk.first.first - source.sequence().begin()) << ":" << (chunk.first.second - source.sequence().begin()) << " " << string(chunk.first.first, chunk.first.second) << endl; +#endif + } + + stack.pop_back(); + continue; + } -#ifdef debug_anchored_surject - cerr << "path chunk " << pb2json(path_chunk.second) << " can be aligned to reverse strand in interval " << left_boundary << ":" << right_boundary << endl; + // mark that we have used this adjacency up + ++get<2>(stack.back()); + + // get the indexes of the next mapping + const auto& mapping_here = path_here.mapping(m_idx); + int64_t next_s_idx, next_m_idx; + const path_mapping_t* next_mapping; + if (m_idx + 1 == path_here.mapping_size()) { + // mapping is at a subpath boundary + if (n_idx == 0) { + // check whether we branch to different segments of the same path + unordered_map> next_steps; + for (auto n : subpath_here.next()) { + const auto& path = source.subpath(n).path(); + // search through path until we find an aligned base + size_t p_idx = 0; + while (p_idx + 1 < path.mapping_size() && mapping_from_length(path.mapping(p_idx)) == 0) { + ++p_idx; + } + const auto& pos = path.mapping(p_idx).position(); + handle_t h = graph->get_handle(pos.node_id(), pos.is_reverse()); + graph->for_each_step_on_handle(h, [&](const step_handle_t& step) { + next_steps[graph->get_path_handle_of_step(step)].push_back(step); + }); + } + bool branches_along_path = false; + for (pair>& path_steps : next_steps) { + 
sort(path_steps.second.begin(), path_steps.second.end()); + if (unique(path_steps.second.begin(), path_steps.second.end()) - path_steps.second.begin() > 1) { + // the next subpaths along this branch point reach different places on the same + // path, we have to avoid this so that the splicing logic will work + branches_along_path = true; + break; + } + } + + if (branches_along_path) { +#ifdef debug_multipath_surject + cerr << "setting n to end to abort DFS to preserve a blunt end at a possible splice edge" << endl; +#endif + // we'll prematurely end the DFS at this mapping + get<2>(stack.back()) = subpath_here.next_size(); + continue; + } + } + next_s_idx = subpath_here.next(n_idx); + if (!rev_connections[next_s_idx].empty()) { + // we always break a path chunk at a connection + continue; + } + next_m_idx = 0; + next_mapping = &source.subpath(next_s_idx).path().mapping().front(); + } + else { + // mapping is not at a subpath boundary + next_s_idx = s_idx; + next_m_idx = m_idx + 1; + next_mapping = &path_here.mapping(next_m_idx); + } +#ifdef debug_multipath_surject + cerr << "next s " << next_s_idx << ", m " << next_m_idx << endl; +#endif + + // check if the next position is consistent with the path we're walking + const auto& pos_here = mapping_here.position(); + const auto& next_pos = next_mapping->position(); + if (pos_here.node_id() == next_pos.node_id() + && pos_here.is_reverse() == next_pos.is_reverse() + && pos_here.offset() + mapping_from_length(mapping_here) == next_pos.offset()) { + // mappings are abutting within a node, don't leave the current step + stack.emplace_back(next_s_idx, next_m_idx, 0, step_here); + added_new_mappings = true; + } + else { + // mappings cross an edge in the graph + bool strand_rev = pos_here.is_reverse() != graph->get_is_reverse(graph->get_handle_of_step(step_here)); + step_handle_t next_step = strand_rev ? 
graph->get_previous_step(step_here) : graph->get_next_step(step_here); + if (next_step != graph->path_end(path_handle) && next_step != graph->path_front_end(path_handle)) { + + handle_t next_handle = graph->get_handle_of_step(next_step); + if (graph->get_id(next_handle) == next_pos.node_id() && + (graph->get_is_reverse(next_handle) != next_pos.is_reverse()) == strand_rev) { + // the next mapping is along the path how we would expect +#ifdef debug_multipath_surject + cerr << "the next mapping is adjacent in the path" << endl; +#endif + stack.emplace_back(next_s_idx, next_m_idx, 0, next_step); + added_new_mappings = true; + } +#ifdef debug_multipath_surject + else { + cerr << "the next mapping is not adjacent in the path" << endl; + } +#endif + } +#ifdef debug_multipath_surject + else { + cerr << "we have hit the end of the path" << endl; + } #endif + } } - } + }); } } - - return interval; + + return to_return; } - VG Surjector::extract_linearized_path_graph(size_t first, size_t last, const xg::XGPath& xpath, - unordered_map>& node_trans) { + unordered_map, pair, vector>>> + Surjector::extract_overlapping_paths(const PathPositionHandleGraph* graph, const Alignment& source, + const unordered_set& surjection_paths) const { -#ifdef debug_anchored_surject - cerr << "extracting path graph for position interval " << first << ":" << last << " in path of length " << xpath.positions[xpath.positions.size() - 1] + xindex->node_length(xpath.node(xpath.ids.size() - 1)) << endl; -#endif + unordered_map, pair, vector>>> to_return; - VG path_graph; + const Path& path = source.path(); - size_t begin = xpath.offset_at_position(first); - size_t end = min(xpath.positions.size(), xpath.offset_at_position(last) + 1); + // for each path that we're extending, the previous step and the strand we were at on it + // mapped to the index of that path chunk in the path's vector + unordered_map, size_t> extending_steps; + int64_t through_to_length = 0; - Node* prev_node = nullptr; - for (size_t i = begin; i < end; i++) { + for (size_t i = 0; i < path.mapping_size(); i++) { - id_t node_id = xpath.node(i); - string seq = xindex->node_sequence(node_id); - bool rev = xpath.directions[i]; + int64_t before_to_length = through_to_length; + through_to_length += mapping_to_length(path.mapping(i)); - Node* node; - if (rev) { - node = path_graph.create_node(reverse_complement(seq)); - } - else { - node = path_graph.create_node(seq); - } + const Position& pos = path.mapping(i).position(); + handle_t handle = graph->get_handle(pos.node_id(), pos.is_reverse()); + +#ifdef debug_anchored_surject + cerr << "looking for paths on mapping " << i << " at position " << make_pos_t(pos) << endl; +#endif + + unordered_map, size_t> next_extending_steps; - if (prev_node) { - path_graph.create_edge(prev_node, node); + for (const step_handle_t& step : graph->steps_of_handle(handle)) { + +#ifdef debug_anchored_surject + cerr << "found a step on " << graph->get_path_name(graph->get_path_handle_of_step(step)) << endl; +#endif + + path_handle_t path_handle = graph->get_path_handle_of_step(step); + if (!surjection_paths.count(path_handle)) { + // we are not surjecting onto this path +#ifdef debug_anchored_surject + cerr << "not surjecting to this path, skipping" << endl; +#endif + continue; + } + + // We always see paths on the forward strand, so we need to + // work out if the read is running along the path in the path's + // forward (false) or reverse (true) direction. 
+ // + // If the read visits the node in a different orientation than + // the path does, then the read runs along the path in reverse. + bool path_strand = graph->get_is_reverse(handle) != graph->get_is_reverse(graph->get_handle_of_step(step)); + + step_handle_t prev_step = path_strand ? graph->get_next_step(step) : graph->get_previous_step(step); + +#ifdef debug_anchored_surject + cerr << "path strand is " << (path_strand ? "rev" : "fwd") << ", prev step is "; + if (prev_step == graph->path_end(path_handle)) { + cerr << " path end"; + } + else if (prev_step == graph->path_front_end(path_handle)) { + cerr << " path front end"; + } + else { + cerr << graph->get_id(graph->get_handle_of_step(prev_step)) << (graph->get_is_reverse(graph->get_handle_of_step(prev_step)) ? "-" : "+"); + } + cerr << endl; + cerr << "possible extensions from: " << endl; + for (const auto& record : extending_steps) { + cerr << "\t" << "chunk " << record.second << " at " << graph->get_id(graph->get_handle_of_step(record.first.first)) << (graph->get_is_reverse(graph->get_handle_of_step(record.first.first)) ? "-" : "+") << " on " << graph->get_path_name(graph->get_path_handle_of_step(record.first.first)) << " " << (record.first.second ? "rev" : "fwd") << endl; + } +#endif + + auto& path_chunks = to_return[make_pair(path_handle, path_strand)]; + + if (extending_steps.count(make_pair(prev_step, path_strand))) { + // we are extending from the previous step, so we continue with the extension + + size_t chunk_idx = extending_steps[make_pair(prev_step, path_strand)]; + auto& aln_chunk = path_chunks.first[chunk_idx]; + auto& ref_chunk = path_chunks.second[chunk_idx]; + +#ifdef debug_anchored_surject + cerr << "comes after chunk " << chunk_idx << endl; +#endif + + // extend the range of the path on the reference + ref_chunk.second = step; + + // move the end of the sequence out + aln_chunk.first.second = source.sequence().begin() + through_to_length; + Mapping* mapping = aln_chunk.second.add_mapping(); + // add this mapping + *mapping = path.mapping(i); + mapping->set_rank(aln_chunk.second.mapping(aln_chunk.second.mapping_size() - 2).rank() + 1); + + // in the next iteration, this step should point into the chunk it just extended + next_extending_steps[make_pair(step, path_strand)] = extending_steps[make_pair(prev_step, path_strand)]; + } + else { + + // this step does not extend a previous step, so we start a new chunk + path_chunks.first.emplace_back(); + path_chunks.second.emplace_back(); + auto& aln_chunk = path_chunks.first.back(); + auto& ref_chunk = path_chunks.second.back(); + + // init the ref interval with the interval along the embedded path + ref_chunk.first = step; + ref_chunk.second = step; + + // init the new chunk with the sequence interval + aln_chunk.first.first = source.sequence().begin() + before_to_length; + aln_chunk.first.second = source.sequence().begin() + through_to_length; + + // and with the first mapping + Mapping* mapping = aln_chunk.second.add_mapping(); + *mapping = path.mapping(i); + mapping->set_rank(1); + + // keep track of where this chunk is in the vector and which step it came from + // for the next iteration + next_extending_steps[make_pair(step, path_strand)] = path_chunks.first.size() - 1; + +#ifdef debug_anchored_surject + cerr << "no preceeding chunk so start new chunk " << path_chunks.first.size() - 1 << endl; +#endif + } } - prev_node = node; - node_trans[node->id()] = make_pair(node_id, rev); + // we've finished extending the steps from the previous mapping, so we replace them + // 
with the steps we found in this iteration that we want to extend on the next one + extending_steps = next_extending_steps; } - return path_graph; + return to_return; } - - void Surjector::set_path_position(const Alignment& surjected, size_t best_path_rank, const xg::XGPath& xpath, - string& path_name_out, int64_t& path_pos_out, bool& path_rev_out, - unordered_map, vector>>* oriented_occurrences_memo) { + + void Surjector::filter_redundant_path_chunks(bool path_rev, vector& path_chunks, + vector>& ref_chunks, + vector>& connections) const { - const Path& path = surjected.path(); - if (path.mapping_size() == 0){ - // hack: we want a 0 position once we convert this to 1-based indexes - path_pos_out = -1; - path_rev_out = false; - path_name_out = ""; - return; +#ifdef debug_filter_paths + cerr << "filtering redundant path chunks" << endl; +#endif + + assert(path_chunks.size() == ref_chunks.size()); + vector order(path_chunks.size(), 0); + for (size_t i = 1; i < order.size(); ++i) { + order[i] = i; } - const Position& start_pos = path.mapping(0).position(); + // convert connections to adjacency lists + vector>> inward_connections(path_chunks.size()), outward_connections(path_chunks.size()); + for (const auto& connection : connections) { + inward_connections[get<1>(connection)].emplace_back(get<0>(connection), get<2>(connection)); + outward_connections[get<0>(connection)].emplace_back(get<1>(connection), get<2>(connection)); + } - vector> oriented_occurrences = xindex->memoized_oriented_occurrences_on_path(start_pos.node_id(), best_path_rank, - oriented_occurrences_memo); - - for (pair& occurrence : oriented_occurrences) { - if (occurrence.second == start_pos.is_reverse()) { - // the first node in this alignment occurs on the forward strand of the path - - if (occurrence.first + path.mapping_size() > xpath.ids.size()) { - // but it doesn't fit on the path + // sort the adjacency lists for easy determination of subsets + for (auto& adj : inward_connections) { + sort(adj.begin(), adj.end()); + } + for (auto& adj : outward_connections) { + sort(adj.begin(), adj.end()); + } + +#ifdef debug_filter_paths + cerr << "original order for chunks" << endl; + for (size_t i = 0; i < path_chunks.size(); ++i) { + cerr << i << ": " << string(path_chunks[i].first.first, path_chunks[i].first.second) << " " << pb2json(path_chunks[i].second) << endl; + } + cerr << "connections" << endl; + for (size_t i = 0; i < outward_connections.size(); ++i) { + cerr << i << ":"; + for (const auto& c : outward_connections[i]) { + cerr << " (" << c.first << " " << c.second << ")"; + } + cerr << endl; + } +#endif + + // test it one adjacency list entry is a subset of another (assumes sort) + auto is_subset = [](const vector>& sub, const vector>& super) { + size_t i = 0, j = 0; + while (i < sub.size() && j < super.size()) { + if (sub[i] == super[j]) { + ++i; + ++j; + } + else { + ++j; + } + } + return (i == sub.size()); + }; + + // order the path chunks by the left index of their read interval + // and break ties in favor of longer intervals (so that the filteree + // will come later in the vector as we expect) + // among paths that are still tied, prioritize path chunks with more + // connections + stable_sort(order.begin(), order.end(), [&](size_t i, size_t j) { + auto& chunk1 = path_chunks[i]; + auto& chunk2 = path_chunks[j]; + return (chunk1.first.first < chunk2.first.first || + (chunk1.first.first == chunk2.first.first && chunk1.first.second > chunk2.first.second) || + (chunk1.first.first == chunk2.first.first && 
chunk1.first.second == chunk2.first.second && + (inward_connections[i].size() + outward_connections[i].size() + > inward_connections[j].size() + outward_connections[j].size()))); + }); + +#ifdef debug_filter_paths + cerr << "sort order for chunks" << endl; + for (auto i : order) { + cerr << i << ": " << string(path_chunks[i].first.first, path_chunks[i].first.second) << " " << pb2json(path_chunks[i].second) << endl; + } +#endif + + vector redundant(path_chunks.size(), false); + + // a heap where the top always points to the leftmost end of a read interval + auto cmp = [&](int64_t i, int64_t j) { + return path_chunks[i].first.second > path_chunks[j].first.second; + }; + vector curr_chunks; + + for (int64_t i = 0; i < order.size(); ++i) { + auto& chunk_here = path_chunks[order[i]]; +#ifdef debug_filter_paths + cerr << "looking for overlapping chunks for " << order[i] << endl; + cerr << string(chunk_here.first.first, chunk_here.first.second) << " " << pb2json(chunk_here.second) << endl; +#endif + + // remove items from the heap if they are outside the window of this read interval + while (!curr_chunks.empty() && path_chunks[curr_chunks.front()].first.second <= chunk_here.first.first) { + pop_heap(curr_chunks.begin(), curr_chunks.end(), cmp); + curr_chunks.pop_back(); + } + + for (auto j : curr_chunks) { + if (path_chunks[j].first.first > chunk_here.first.first || + path_chunks[j].first.second < chunk_here.first.second) { + // doesn't contain the right read interval continue; } + // check that the reference interval is contained + if (path_rev) { + if (graph->get_position_of_step(ref_chunks[order[i]].first) > graph->get_position_of_step(ref_chunks[j].first) + || graph->get_position_of_step(ref_chunks[order[i]].second) < graph->get_position_of_step(ref_chunks[j].second)) { + continue; + } + } + else { + if (graph->get_position_of_step(ref_chunks[order[i]].first) < graph->get_position_of_step(ref_chunks[j].first) + || graph->get_position_of_step(ref_chunks[order[i]].second) > graph->get_position_of_step(ref_chunks[j].second)) { + continue; + } + } - // does the alignment follow the path here? 
- bool match = true; - for (size_t i = 1; i < path.mapping_size(); i++) { - const Position& pos = path.mapping(i).position(); - if (pos.node_id() != xpath.node(occurrence.first + i) || pos.is_reverse() != xpath.is_reverse(occurrence.first + i)) { - match = false; - break; + auto& chunk_over = path_chunks[j]; + auto remaining = chunk_here.first.first - chunk_over.first.first; +#ifdef debug_filter_paths + cerr << "overlap candidate " << j << endl; + cerr << string(chunk_over.first.first, chunk_over.first.second) << " " << pb2json(chunk_over.second) << endl; + cerr << "at relative read offset " << remaining << endl; +#endif + + // walk the part of the overlapping path that comes before the path here + int64_t m_over_idx = 0, e_over_idx = 0; + while (m_over_idx < chunk_over.second.mapping_size() + && remaining >= chunk_over.second.mapping(m_over_idx).edit(e_over_idx).to_length() + && remaining > 0) { + + remaining -= chunk_over.second.mapping(m_over_idx).edit(e_over_idx).to_length(); + +#ifdef debug_filter_paths + cerr << "walk down overlapper before match at " << m_over_idx << " " << e_over_idx << endl; +#endif + + ++e_over_idx; + if (e_over_idx == chunk_over.second.mapping(m_over_idx).edit_size()) { + ++m_over_idx; + e_over_idx = 0; } } - // we found where the alignment could be from - if (match) { - path_pos_out = xpath.positions[occurrence.first] + start_pos.offset(); - path_rev_out = false; - return; + // we might need to walk another subpath of with to length of 0 to get to the start + // of the short path + while (m_over_idx < chunk_over.second.mapping_size() + && chunk_over.second.mapping(m_over_idx).edit(e_over_idx).to_length() == 0 + && (chunk_over.second.mapping(m_over_idx).position().node_id() + != chunk_here.second.mapping(0).position().node_id()) + && (chunk_over.second.mapping(m_over_idx).position().is_reverse() + != chunk_here.second.mapping(0).position().is_reverse()) + && (chunk_over.second.mapping(m_over_idx).position().offset() + != chunk_here.second.mapping(0).position().offset())) { +#ifdef debug_filter_paths + cerr << "walking forward through a deletion to find match" << endl; +#endif + ++e_over_idx; + if (e_over_idx == chunk_over.second.mapping(m_over_idx).edit_size()) { + ++m_over_idx; + e_over_idx = 0; + } } - } - else { - // the first node in this alignment occurs on the reverse strand of the path - if (occurrence.first + 1 < path.mapping_size()) { - // but it doesn't fit on the path +#ifdef debug_filter_paths + cerr << "search for overlap begins at over idx " << m_over_idx << " " << e_over_idx << endl; +#endif + + // we'll only consider it to match the paths meet at a mapping boundary + bool matches = (remaining == 0 && e_over_idx == 0 + && m_over_idx < chunk_over.second.mapping_size()); + if (!matches) { +#ifdef debug_filter_paths + cerr << "shorter path not at an internal mapping boundary" << endl; +#endif continue; } - // does the alignment follow the path here? 
- bool match = true; - for (size_t i = 1; i < path.mapping_size(); i++) { - const Position& pos = path.mapping(i).position(); - if (pos.node_id() != xpath.node(occurrence.first - i) || pos.is_reverse() == xpath.is_reverse(occurrence.first - i)) { - match = false; + bool shares_start = m_over_idx == 0; + + // try to walk the part of the path where they overlap + int64_t m_here_idx = 0, e_here_idx = 0; + while (m_over_idx < chunk_over.second.mapping_size() && + m_here_idx < chunk_here.second.mapping_size()) { +#ifdef debug_filter_paths + cerr << "looking for match at " << m_over_idx << " " << e_over_idx << ", " << m_here_idx << " " << e_here_idx << endl; +#endif + + if (e_here_idx == 0) { + const auto& pos_over = chunk_over.second.mapping(m_over_idx).position(); + const auto& pos_here = chunk_here.second.mapping(m_here_idx).position(); + if (pos_here.node_id() != pos_over.node_id() + || pos_here.is_reverse() != pos_over.is_reverse() + || pos_here.offset() != pos_over.offset()) { +#ifdef debug_filter_paths + cerr << "mappings not at the same position" << endl; +#endif + matches = false; + break; + } + } + const auto& edit_over = chunk_over.second.mapping(m_over_idx).edit(e_over_idx); + const auto& edit_here = chunk_here.second.mapping(m_here_idx).edit(e_here_idx); + if (edit_here.from_length() != edit_over.from_length() + || edit_here.to_length() != edit_here.to_length()) { + // note: we don't need to worry about edit sequence because we know we're at + // the same read interval + + // the edits don't match +#ifdef debug_filter_paths + cerr << "edits don't match" << endl; +#endif + matches = false; break; } + + ++e_over_idx; + ++e_here_idx; + if (e_over_idx == chunk_over.second.mapping(m_over_idx).edit_size()) { + if (e_here_idx != chunk_here.second.mapping(m_here_idx).edit_size()) { +#ifdef debug_filter_paths + cerr << "mapping boundaries don't occur at the same place" << endl; +#endif + matches = false; + break; + } + ++m_over_idx; + e_over_idx = 0; + } + if (e_here_idx == chunk_here.second.mapping(m_here_idx).edit_size()) { + ++m_here_idx; + e_here_idx = 0; + } + } + + + if (matches && m_here_idx == chunk_here.second.mapping_size()) { + + if (shares_start) { +#ifdef debug_filter_paths + cerr << "checking shared inward connections" << endl; +#endif + if (!is_subset(inward_connections[order[i]], inward_connections[j])) { +#ifdef debug_filter_paths + cerr << "connections are non-redundant" << endl; +#endif + // the chunk has unique inward connections, we can't eliminate it + continue; + } + } + else if (!inward_connections[order[i]].empty()) { +#ifdef debug_filter_paths + cerr << "has internal inward connections" << endl; +#endif + // has inward connections to the interior of the chunk + continue; + } + if (m_over_idx == chunk_over.second.mapping_size()) { +#ifdef debug_filter_paths + cerr << "checking shared outward connections" << endl; +#endif + if (!is_subset(outward_connections[order[i]], outward_connections[j])) { +#ifdef debug_filter_paths + cerr << "connections are non-redundant" << endl; +#endif + // the chunk has unique outward connections, we can't eliminate it + continue; + } + } + else if (!outward_connections[order[i]].empty()) { +#ifdef debug_filter_paths + cerr << "has internal outward connections" << endl; +#endif + // has outward connections to the interior of the chunk + continue; + } + + // the whole path matches an earlier, longer path + redundant[order[i]] = true; +#ifdef debug_filter_paths + cerr << "marking path chunk " << order[i] << " redundant" << endl; +#endif + 
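The connection checks just above rely on the `is_subset` lambda defined earlier in `filter_redundant_path_chunks`: because both adjacency lists are kept sorted, containment can be decided in a single merge-style pass. The sketch below is an editorial aside, not part of this patch; the element type and the `main` driver are illustrative, since the real template arguments are not legible in this hunk.

```
#include <cassert>
#include <cstdint>
#include <cstddef>
#include <utility>
#include <vector>

// Returns true if every element of 'sub' also occurs in 'super'.
// Both vectors must be sorted; the scan always advances through 'super'
// and only advances through 'sub' on a match, mirroring the is_subset
// lambda used for the redundant-chunk connection test.
static bool is_sorted_subset(const std::vector<std::pair<size_t, int32_t>>& sub,
                             const std::vector<std::pair<size_t, int32_t>>& super) {
    size_t i = 0, j = 0;
    while (i < sub.size() && j < super.size()) {
        if (sub[i] == super[j]) {
            ++i;
        }
        ++j;
    }
    return i == sub.size();
}

int main() {
    std::vector<std::pair<size_t, int32_t>> super{{1, 5}, {3, 10}, {7, 2}};
    std::vector<std::pair<size_t, int32_t>> sub{{1, 5}, {7, 2}};
    assert(is_sorted_subset(sub, super));   // contained, so the shorter chunk is redundant
    assert(!is_sorted_subset(super, sub));  // not contained the other way around
    return 0;
}
```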
break; + } + } + + // we only need to look at nonredundant chunks on later iterations + if (!redundant[order[i]]) { + curr_chunks.push_back(order[i]); + push_heap(curr_chunks.begin(), curr_chunks.end()); + } + } + + // filter down to nonredundant paths + vector removed_before(path_chunks.size() + 1, 0); + for (size_t i = 0; i < path_chunks.size(); ++i) { + if (redundant[i]) { +#ifdef debug_filter_paths + cerr << "filtering path chunk " << i << ": " << string(path_chunks[i].first.first, path_chunks[i].first.second) << " " << pb2json(path_chunks[i].second) << endl; +#endif + ++removed_before[i]; + } + else if (removed_before[i]) { + path_chunks[i - removed_before[i]] = move(path_chunks[i]); + ref_chunks[i - removed_before[i]] = move(ref_chunks[i]); + } + removed_before[i + 1] = removed_before[i]; + } + if (removed_before.back() != 0) { + path_chunks.resize(path_chunks.size() - removed_before.back()); + ref_chunks.resize(ref_chunks.size() - removed_before.back()); + } + + // update the indexes on connections and remove redundant ones + size_t removed_so_far = 0; + for (size_t i = 0; i < connections.size(); ++i) { + auto& connection = connections[i]; + if (redundant[get<0>(connection)] || redundant[get<1>(connection)]) { + ++removed_so_far; + } + else { +#ifdef debug_filter_paths + cerr << "updating connection " << get<0>(connection) << " -> " << get<1>(connection) << endl; +#endif + get<0>(connection) -= removed_before[get<0>(connection)]; + get<1>(connection) -= removed_before[get<1>(connection)]; + if (removed_so_far) { + connections[i - removed_so_far] = move(connection); + } + } + } + if (removed_so_far) { + connections.resize(connections.size() - removed_so_far); + } + + // sort the path chunks in lexicographic order like the downstream code expects + + if (!is_sorted(path_chunks.begin(), path_chunks.end(), + [&](path_chunk_t& a, path_chunk_t& b) { return a.first < b.first; })) { + +#ifdef debug_filter_paths + cerr << "putting path chunks in lexicographic order" << endl; +#endif + + // compute which index the chunks should end up in + for (size_t i = 0; i < path_chunks.size(); ++i) { + order[i] = i; + } + order.resize(path_chunks.size()); + stable_sort(order.begin(), order.end(), [&](size_t i, size_t j) { + return path_chunks[i].first < path_chunks[j].first; + }); + vector index(order.size()); + for (size_t i = 0; i < order.size(); ++i) { + index[order[i]] = i; + } + + // update the indexes of the connections + for (auto& connection : connections) { + get<0>(connection) = index[get<0>(connection)]; + get<1>(connection) = index[get<1>(connection)]; + } + + // and co-sort the vectors into the computed indexes + for (size_t i = 0; i < index.size(); ++i) { + while (index[i] != i) { + std::swap(path_chunks[i], path_chunks[index[i]]); + std::swap(ref_chunks[i], ref_chunks[index[i]]); + std::swap(index[i], index[index[i]]); } + } + } + } + + pair + Surjector::compute_path_interval(const PathPositionHandleGraph* graph, const Alignment& source, + path_handle_t path_handle, bool rev_strand, + const vector& path_chunks, + const vector>& ref_chunks, + bool no_left_expansion, bool no_right_expansion) const { + + pair interval(numeric_limits::max(), numeric_limits::min()); + + size_t path_length = graph->get_path_length(path_handle); + + for (size_t i = 0; i < path_chunks.size(); ++i) { + + const auto& path_chunk = path_chunks[i]; + const auto& ref_chunk = ref_chunks[i]; + + size_t left_overhang = no_left_expansion ? 
0 : (get_aligner()->longest_detectable_gap(source, path_chunk.first.first) + + (path_chunk.first.first - source.sequence().begin())); + + size_t right_overhang = no_right_expansion ? 0 : (get_aligner()->longest_detectable_gap(source, path_chunk.first.second) + + (source.sequence().end() - path_chunk.first.second)); + + const Position& first_pos = path_chunk.second.mapping(0).position(); + if (rev_strand) { + size_t path_offset = (graph->get_position_of_step(ref_chunk.first) + + graph->get_length(graph->get_handle_of_step(ref_chunk.first)) + - first_pos.offset()); - // we found where the alignment could be from - if (match) { - const Mapping& last_mapping = path.mapping(path.mapping_size() - 1); - size_t last_offset = occurrence.first + 1 - path.mapping_size(); - int64_t node_start = last_offset + 1 < xpath.positions.size() ? xpath.positions[last_offset + 1] : xpath.offsets.size(); - path_pos_out = node_start - last_mapping.position().offset() - mapping_from_length(last_mapping); - path_rev_out = true; - return; + interval.second = max(interval.second, min(path_offset + left_overhang, path_length - 1)); + } + else { + size_t path_offset = graph->get_position_of_step(ref_chunk.first) + first_pos.offset(); + if (left_overhang > path_offset) { + // avoid underflow + interval.first = 0; } + else { + interval.first = min(min(interval.first, path_offset - left_overhang), path_length - 1); + } + } + + const Mapping& final_mapping = path_chunk.second.mapping(path_chunk.second.mapping_size() - 1); + const Position& final_pos = final_mapping.position(); + if (rev_strand) { + size_t path_offset = (graph->get_position_of_step(ref_chunk.second) + + graph->get_length(graph->get_handle_of_step(ref_chunk.second)) + - final_pos.offset() + - mapping_from_length(final_mapping)); + if (right_overhang > path_offset) { + // avoid underflow + interval.first = 0; + } + else { + interval.first = min(min(interval.first, path_offset - right_overhang), path_length - 1); + } + } + else { + size_t path_offset = (graph->get_position_of_step(ref_chunk.second) + + final_pos.offset() + + mapping_from_length(final_mapping)); + interval.second = max(interval.second, min(path_offset + right_overhang, path_length - 1)); + } + } + + return interval; + } + + unordered_map> + Surjector::extract_linearized_path_graph(const PathPositionHandleGraph* graph, MutableHandleGraph* into, + path_handle_t path_handle, size_t first, size_t last) const { + + // TODO: we need better semantics than an unsigned interval for surjecting to circular paths + +#ifdef debug_anchored_surject + cerr << "extracting path graph for position interval " << first << ":" << last << " in path of length " << graph->get_path_length(path_handle) << endl; +#endif + + unordered_map> node_trans; + + step_handle_t begin = graph->get_step_at_position(path_handle, first); + step_handle_t end = graph->get_step_at_position(path_handle, last); + + if (graph->get_position_of_step(end) <= last && end != graph->path_end(path_handle)) { + // we actually want part of this step too, so we use the next one as the end iterator + end = graph->get_next_step(end); + } + + handle_t prev_node; + for (step_handle_t step = begin; step != end; step = graph->get_next_step(step)) { + // copy the node with the local orientation now forward + handle_t copying = graph->get_handle_of_step(step); + handle_t node_here = into->create_handle(graph->get_sequence(copying)); + + if (step != begin) { + // add an edge from the previous node + into->create_edge(prev_node, node_here); } + + // record the 
translation + node_trans[into->get_id(node_here)] = pair(graph->get_id(copying), + graph->get_is_reverse(copying)); + + prev_node = node_here; } - // we ran through all of the occurrences without finding a full match... + return node_trans; + } + + void Surjector::set_path_position(const PathPositionHandleGraph* graph, const pos_t& init_surj_pos, const pos_t& final_surj_pos, + const step_handle_t& range_begin, const step_handle_t& range_end, + bool rev_strand, string& path_name_out, int64_t& path_pos_out, bool& path_rev_out) const { + + + assert(graph->get_path_handle_of_step(range_begin) == graph->get_path_handle_of_step(range_end)); - cerr << "error:[Surjector] could not identify path position of surjected alignment " << surjected.name() << endl; - exit(1); + if (range_begin == graph->path_end(graph->get_path_handle_of_step(range_begin)) + && range_begin == graph->path_end(graph->get_path_handle_of_step(range_end))) { + // sentinel for unmapped + path_name_out = ""; + path_pos_out = -1; + path_rev_out = false; + } + else { +#if defined(debug_anchored_surject) || defined(debug_spliced_surject) + cerr << "setting position with initial position " << init_surj_pos << " and final position " << final_surj_pos << " based on range:" << endl; + cerr << "\tbegin: id " << graph->get_id(graph->get_handle_of_step(range_begin)) << ", rev " << graph->get_is_reverse(graph->get_handle_of_step(range_begin)) << ", pos " << graph->get_position_of_step(range_begin) << endl; + cerr << "\tend: id " << graph->get_id(graph->get_handle_of_step(range_end)) << ", rev " << graph->get_is_reverse(graph->get_handle_of_step(range_end)) << ", pos " << graph->get_position_of_step(range_end) << endl; +#endif + + // the path name + path_name_out = graph->get_path_name(graph->get_path_handle_of_step(range_begin)); + path_rev_out = rev_strand; + + // are we on the reverse strand? 
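An aside on the arithmetic in the branch that follows: on the forward strand the surjected position is the path offset of the first step plus the offset of the initial position within its node, while on the reverse strand it is measured back from the far end of the final step. This standalone recomputation with plain integers is illustrative only (all values hypothetical), not part of the patch.

```
#include <cassert>
#include <cstdint>

// Mirrors the offset computation in Surjector::set_path_position using plain
// integers instead of graph lookups. Parameter names indicate which graph
// query each value stands in for.
int64_t surjected_path_position(bool rev_strand,
                                int64_t begin_step_path_offset,  // get_position_of_step(range_begin)
                                int64_t end_step_path_offset,    // get_position_of_step(range_end)
                                int64_t end_step_node_length,    // get_length(get_handle_of_step(range_end))
                                int64_t init_pos_node_offset,    // offset(init_surj_pos)
                                int64_t final_pos_node_offset) { // offset(final_surj_pos)
    if (rev_strand) {
        // Walk back from the far end of the final step.
        return end_step_path_offset + end_step_node_length - final_pos_node_offset;
    }
    // Walk forward from the start of the first step.
    return begin_step_path_offset + init_pos_node_offset;
}

int main() {
    // Forward strand: first step starts at path offset 100, alignment begins 5 bp into that node.
    assert(surjected_path_position(false, 100, 180, 20, 5, 12) == 105);
    // Reverse strand: last step spans [180, 200), alignment's final position is 12 bp into that node.
    assert(surjected_path_position(true, 100, 180, 20, 5, 12) == 188);
    return 0;
}
```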
+ size_t path_pos_begin = graph->get_position_of_step(range_begin); + size_t path_pos_end = graph->get_position_of_step(range_end); + + // the path offset + if (rev_strand) { + path_pos_out = (path_pos_end + graph->get_length(graph->get_handle_of_step(range_end)) + - offset(final_surj_pos)); + } + else { + path_pos_out = path_pos_begin + offset(init_surj_pos); + } + } } Alignment Surjector::make_null_alignment(const Alignment& source) { @@ -874,6 +4218,24 @@ using namespace std; } return null; } + + multipath_alignment_t Surjector::make_null_mp_alignment(const string& src_sequence, + const string& src_quality) { + multipath_alignment_t null; + null.set_sequence(src_sequence); + null.set_quality(src_quality); + return null; + } + + + template<> + int32_t Surjector::get_score(const Alignment& aln) { + return aln.score(); + } + template<> + int32_t Surjector::get_score(const multipath_alignment_t& mp_aln) { + return optimal_alignment_score(mp_aln); + } } diff --git a/src/surjector.hpp b/src/surjector.hpp index 935ab1931cf..c1536abb03b 100644 --- a/src/surjector.hpp +++ b/src/surjector.hpp @@ -7,75 +7,295 @@ */ #include +#include +#include +#include +#include -#include "alignment.hpp" -#include "mapper.hpp" -#include "xg.hpp" -#include "vg.hpp" -#include "translator.hpp" -#include "vg.pb.h" -#include "multipath_alignment_graph.hpp" +#include "aligner.hpp" +#include "handle.hpp" +#include +#include "multipath_alignment.hpp" -#include "algorithms/topological_sort.hpp" -#include "algorithms/split_strands.hpp" namespace vg { using namespace std; - class Surjector : Mapper { + class Surjector : public AlignerClient { public: - Surjector(xg::XG* xg_index); - ~Surjector(); + Surjector(const PathPositionHandleGraph* graph); - /// lossily project an alignment into a particular path space of a graph - /// the resulting alignment is equivalent to a SAM record against the chosen path - Alignment surject_classic(const Alignment& source, - const set& path_names, - string& path_name, - int64_t& path_pos, - bool& path_reverse); - - /// extract the portions of an alignment that are on a chosen set of paths and try to + /// Extract the portions of an alignment that are on a chosen set of paths and try to /// align realign the portions that are off of the chosen paths to the intervening - /// path segments to obtain an alignment that is fully restricted to the paths - Alignment path_anchored_surject(const Alignment& source, - const set& path_names, - string& path_name_out, - int64_t& path_pos_out, - bool& path_rev_out); + /// path segments to obtain an alignment that is fully restricted to the paths. + /// + /// Also returns the path name, position, and strand of the new alignment. + /// + /// Optionally either allow softclips so that the alignment has a nonnegative score on + /// the path or require the full-length alignment, possibly creating a negative score. + /// + /// Also optionally leaves deletions against the reference path in the final alignment + /// (useful for splicing). 
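To make the calling convention just described concrete (the declaration itself follows below), here is a hedged usage sketch. Only the `Surjector` constructor and this `surject` overload are taken from the surrounding header; the graph pointer, alignment, and path set are assumed to come from elsewhere, and the include paths are abbreviated to what the header already pulls in.

```
#include <cstdint>
#include <string>
#include <unordered_set>

#include "surjector.hpp"

using namespace vg;

// Surject one alignment onto a chosen set of reference paths and report
// where it landed. Error handling and graph/path setup are omitted.
Alignment surject_one(const PathPositionHandleGraph* graph,
                      const std::unordered_set<path_handle_t>& paths,
                      const Alignment& source,
                      std::string& path_name, int64_t& path_pos, bool& path_rev) {
    Surjector surjector(graph);
    // allow_negative_scores = false: soft-clip rather than force a full-length alignment;
    // preserve_deletions = false: do not keep deletions against the reference path.
    return surjector.surject(source, paths, path_name, path_pos, path_rev,
                             /*allow_negative_scores=*/false,
                             /*preserve_deletions=*/false);
}
```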
+ Alignment surject(const Alignment& source, + const unordered_set& paths, + string& path_name_out, + int64_t& path_pos_out, + bool& path_rev_out, + bool allow_negative_scores = false, + bool preserve_deletions = false) const; + + /// Same as above, but include alignments to all paths instead of only the optimal one + vector multi_surject(const Alignment& source, + const unordered_set& paths, + vector>& positions_out, + bool allow_negative_scores = false, + bool preserve_deletions = false) const; + + /// Extract the portions of an alignment that are on a chosen set of + /// paths and try to align realign the portions that are off of the + /// chosen paths to the intervening path segments to obtain an + /// alignment that is fully restricted to the paths. + /// + /// Replaces the alignment's refpos with the path name, position, and + /// strand the alignment has been surjected to. + /// + /// Optionally either allow softclips so that the alignment has a + /// nonnegative score on the path or require the full-length alignment, + /// possibly creating a negative score. + /// + /// Also optionally leaves deletions against the reference path in the final + /// alignment (useful for splicing). + Alignment surject(const Alignment& source, + const unordered_set& paths, + bool allow_negative_scores = false, + bool preserve_deletions = false) const; + + /// Same as above, but include alignments to all paths instead of only the optimal one + vector multi_surject(const Alignment& source, + const unordered_set& paths, + bool allow_negative_scores = false, + bool preserve_deletions = false) const; + /// Same semantics as with alignments except that connections are always + /// preserved as splices. The output consists of a multipath alignment with + /// a single path, separated by splices (either from large deletions or from + /// connections) + multipath_alignment_t surject(const multipath_alignment_t& source, + const unordered_set& paths, + string& path_name_out, int64_t& path_pos_out, + bool& path_rev_out, + bool allow_negative_scores = false, + bool preserve_deletions = false) const; + + /// Same as above, but include alignments to all paths instead of only the optimal one + vector multi_surject(const multipath_alignment_t& source, + const unordered_set& paths, + vector>& positions_out, + bool allow_negative_scores = false, + bool preserve_deletions = false) const; /// a local type that represents a read interval matched to a portion of the alignment path using path_chunk_t = pair, Path>; - private: + /// the minimum length deletion that the spliced algorithm will interpret as a splice event + int64_t min_splice_length = 20; + + int64_t dominated_path_chunk_diff = 10; + + /// the minimum length apparent intron that we will try to repair + int64_t min_splice_repair_length = 250; + + /// How big of a graph in bp should we ever try to align against for realigning surjection? + size_t max_subgraph_bases = 100 * 1024; + + /// in spliced surject, downsample if the base-wise average coverage by chunks is this high + int64_t min_fold_coverage_for_downsample = 8; + /// while downsampling, try to get down to this coverage on each base + int64_t downsample_coverage = 16; + + int64_t min_shift_for_prune = 32 * 1024; + int64_t shift_prune_diff = 16 * 1024; + + /// And have we complained about hitting it? 
+ mutable atomic_flag warned_about_subgraph_size = ATOMIC_FLAG_INIT; + + bool prune_suspicious_anchors = false; + int64_t max_tail_anchor_prune = 4; + double low_complexity_p_value = .001; + + /// How many anchors (per path) will we use when surjecting using + /// anchors? + /// Excessive anchors will be pruned away. + size_t max_anchors = 200; + + bool annotate_with_all_path_scores = false; + + protected: + + void surject_internal(const Alignment* source_aln, const multipath_alignment_t* source_mp_aln, + vector* alns_out, vector* mp_alns_out, + const unordered_set& paths, + vector>& positions_out, bool all_paths, + bool allow_negative_scores, bool preserve_deletions) const; + + Alignment + realigning_surject(const PathPositionHandleGraph* graph, const Alignment& source, + const path_handle_t& path_handle, bool rev_strand, + const vector& path_chunks, + const vector>& ref_chunks, + pair& path_range_out, + bool allow_negative_scores, + bool preserve_N_alignments = false, + bool sinks_are_anchors = false, + bool sources_are_anchors = false, + vector>* all_path_ranges_out = nullptr) const; + + multipath_alignment_t + spliced_surject(const PathPositionHandleGraph* path_position_graph, + const string& src_sequence, const string& src_quality, + const int32_t src_mapping_quality, + const path_handle_t& path_handle, bool rev_strand, + vector& path_chunks, + vector>& ref_chunks, + vector>& connections, + pair& path_range_out, + bool allow_negative_scores, bool deletions_as_splices) const; + + /////////////////////// + // Support methods for the realigning surject algorithm + /////////////////////// /// get the chunks of the alignment path that follow the given reference paths - unordered_map> - extract_overlapping_paths(const Alignment& source, const unordered_map& path_rank_to_name, - unordered_map>* paths_of_node_memo = nullptr, - unordered_map, vector>>* oriented_occurrences_memo = nullptr); + unordered_map, pair, vector>>> + extract_overlapping_paths(const PathPositionHandleGraph* graph, const Alignment& source, + const unordered_set& surjection_paths) const; + + /// same semantics except for a multipath alignment + unordered_map, pair, vector>>> + extract_overlapping_paths(const PathPositionHandleGraph* graph, + const multipath_alignment_t& source, + const unordered_set& surjection_paths, + unordered_map, vector>>& connections_out) const; - /// compute the widest interval of path positions that the realigned sequence could align to + /// remove any path chunks and corresponding ref chunks that are identical to a longer + /// path chunk over the region where they overlap + void filter_redundant_path_chunks(bool path_rev, vector& path_chunks, + vector>& ref_chunks, + vector>& connections) const; + + /// Compute the widest end-inclusive interval of path positions that + /// the realigned sequence could align to, or an interval where start > + /// end if there are no path chunks. 
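The interval described in the comment above is grown beyond the anchored chunks by the largest gap the aligner could still detect plus the unaligned read tail, then clamped to the path. The sketch below is an illustrative aside covering the forward-strand case only; the gap bound is passed in rather than obtained from an `Aligner`, and all numbers are hypothetical.

```
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <utility>

// Expand a single chunk's path interval on the forward strand, mirroring the
// left/right overhang logic in compute_path_interval. 'longest_gap_*' stands in
// for get_aligner()->longest_detectable_gap(...) at the chunk's read boundaries.
std::pair<size_t, size_t> expand_forward(size_t chunk_path_begin, size_t chunk_path_end,
                                         size_t read_prefix_len, size_t read_suffix_len,
                                         size_t longest_gap_left, size_t longest_gap_right,
                                         size_t path_length) {
    size_t left_overhang = longest_gap_left + read_prefix_len;
    size_t right_overhang = longest_gap_right + read_suffix_len;

    // Clamp at 0 on the left (avoid underflow) and at path_length - 1 on the right.
    size_t start = left_overhang > chunk_path_begin ? 0 : chunk_path_begin - left_overhang;
    size_t end = std::min(chunk_path_end + right_overhang, path_length - 1);
    return {start, end};
}

int main() {
    // Chunk occupies path positions [500, 540); 10 bp of read hang off each side,
    // and the aligner could still place a 30 bp gap at either boundary.
    auto interval = expand_forward(500, 540, 10, 10, 30, 30, 1000);
    assert(interval.first == 460 && interval.second == 580);
    return 0;
}
```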
pair - compute_path_interval(const Alignment& source, size_t path_rank, const xg::XGPath& xpath, const vector& path_chunks, - unordered_map, vector>>* oriented_occurrences_memo = nullptr); + compute_path_interval(const PathPositionHandleGraph* graph, const Alignment& source, path_handle_t path_handle, + bool rev_strand, const vector& path_chunks, + const vector>& ref_chunks, + bool no_left_expansion, bool no_right_expansion) const; /// make a linear graph that corresponds to a path interval, possibly duplicating nodes in case of cycles - VG extract_linearized_path_graph(size_t first, size_t last, const xg::XGPath& xpath, - unordered_map>& node_trans); + unordered_map> + extract_linearized_path_graph(const PathPositionHandleGraph* graph, MutableHandleGraph* into, + path_handle_t path_handle, size_t first, size_t last) const; + + /// use the graph position bounds and the path range bounds to assign a path position to a surjected read + void set_path_position(const PathPositionHandleGraph* graph, const pos_t& init_surj_pos, + const pos_t& final_surj_pos, + const step_handle_t& range_begin, const step_handle_t& range_end, + bool rev_strand, string& path_name_out, int64_t& path_pos_out, bool& path_rev_out) const; + + template + string path_score_annotations(const unordered_map, pair>>& surjections) const; + + /////////////////////// + // Support methods for the spliced surject algorithm + /////////////////////// + + /// reverses an adjacency list + vector> reverse_adjacencies(const vector>& adj) const; + + /// returns a vector assignming each node to a connectd component, requires both the forward and reverse adjacency + /// lists. optionally also returns the total number of components + vector connected_components(const vector>& adj, + const vector>& rev_adj, + size_t* num_comps_out) const; + + /// returns the transitive reduction of a topologically sorted DAG's adjacency list + vector> transitive_reduction(const vector>& adj) const; + + /// eliminate any path chunks that have the exact same colinearities as another but are much shorter + vector> remove_dominated_chunks(const string& src_sequence, + const vector>& adj, + vector& path_chunks, + vector>& ref_chunks, + vector>& connections) const; + + /// if any anchors overlap each other, cut the second at the implied overlap position + void cut_anchors(bool rev_strand, vector& path_chunks, + vector>& ref_chunks, + vector>& connections) const; + + /// if there are too many chunks, downsample to a given level + void downsample_chunks(const string& src_sequence, + vector& path_chunks, + vector>& ref_chunks, + vector>& connections) const; + /// returns all sets of chunks such that 1) all of chunks on the left set abut all of the chunks on the right + /// set on the read, 2) all source-to-sink paths in the connected component go through an edge between + /// the left and right sides, 3) all of the chunks that do not have a connection between them are fully + /// connected (i.e. 
form a biclique) + vector, vector>> find_constriction_bicliques(const vector>& adj, + const string& src_sequence, + const string& src_quality, + vector& path_chunks, + vector>& ref_chunks, + const vector>& connections) const; - /// associate a path position and strand to a surjected alignment against this path - void set_path_position(const Alignment& surjected, size_t best_path_rank, const xg::XGPath& xpath, - string& path_name_out, int64_t& path_pos_out, bool& path_rev_out, - unordered_map, vector>>* oriented_occurrences_memo = nullptr); + void prune_unconnectable(vector>& adj, + vector>>& splice_adj, + vector& component, + vector>& comp_groups, + vector& path_chunks, + vector>& ref_chunks) const; - // make a sentinel meant to indicate an unmapped read - Alignment make_null_alignment(const Alignment& source); + /// make a sentinel meant to indicate an unmapped read + static Alignment make_null_alignment(const Alignment& source); + + static multipath_alignment_t make_null_mp_alignment(const string& src_sequence, + const string& src_quality); + + template + static int32_t get_score(const AlnType& aln); + + /// the graph we're surjecting onto + const PathPositionHandleGraph* graph = nullptr; }; + + + template + string Surjector::path_score_annotations(const unordered_map, pair>>& surjections) const { + + vector> paths; + for (const auto& surjection : surjections) { + paths.emplace_back(get_score(surjection.second.first), graph->get_path_name(surjection.first.first), surjection.first.second); + } + sort(paths.begin(), paths.end(), greater>()); + + stringstream sstrm; + + for (size_t i = 0; i < paths.size(); ++i) { + if (i != 0) { + sstrm << ','; + } + sstrm << get<1>(paths[i]); + sstrm << (get<2>(paths[i]) ? '-' : '+'); + sstrm << get<0>(paths[i]); + } + + return sstrm.str(); + } } #endif diff --git a/src/transcriptome.cpp b/src/transcriptome.cpp new file mode 100644 index 00000000000..cbe587c7721 --- /dev/null +++ b/src/transcriptome.cpp @@ -0,0 +1,2971 @@ + +#include + +#include + +#include "../io/save_handle_graph.hpp" + +#include "transcriptome.hpp" +#include "../augment.hpp" +#include "../utility.hpp" + +namespace vg { + +using namespace vg::io; + +using namespace std; + +// #define transcriptome_debug + +bool operator==(const Exon & lhs, const Exon & rhs) { + + return (lhs.coordinates == rhs.coordinates && + lhs.border_offsets == rhs.border_offsets && + lhs.border_steps == rhs.border_steps); +} + +bool operator!=(const Exon & lhs, const Exon & rhs) { + + return !(lhs == rhs); +} + +bool operator<(const Exon & lhs, const Exon & rhs) { + + if (lhs.coordinates.first != rhs.coordinates.first) { + + return (lhs.coordinates.first < rhs.coordinates.first); + } + + if (lhs.coordinates.second != rhs.coordinates.second) { + + return (lhs.coordinates.second < rhs.coordinates.second); + } + + return false; +} + +bool operator==(const Transcript & lhs, const Transcript & rhs) { + + return (lhs.name == rhs.name && + lhs.is_reverse == rhs.is_reverse && + lhs.chrom == rhs.chrom && + lhs.chrom_length == rhs.chrom_length && + lhs.exons == rhs.exons); +} + +bool operator!=(const Transcript & lhs, const Transcript & rhs) { + + return !(lhs == rhs); +} + +bool operator<(const Transcript & lhs, const Transcript & rhs) { + + if (lhs.chrom != rhs.chrom) { + + return (lhs.chrom < rhs.chrom); + } + + if (!lhs.exons.empty() && !rhs.exons.empty()) { + + if (lhs.exons.front() != rhs.exons.front()) { + + return (lhs.exons.front() < rhs.exons.front()); + } + + } else if (!lhs.exons.empty() || !rhs.exons.empty()) { + 
+ return (lhs.exons.size() < rhs.exons.size()); + } + + if (lhs.is_reverse != rhs.is_reverse) { + + return (lhs.is_reverse < rhs.is_reverse); + } + + return false; +} + +bool operator==(const Mapping & lhs, const Mapping & rhs) { + + return google::protobuf::util::MessageDifferencer::Equals(lhs, rhs); +} + +bool operator!=(const Mapping & lhs, const Mapping & rhs) { + + return !(lhs == rhs); +} + +bool operator==(const Path & lhs, const Path & rhs) { + + return google::protobuf::util::MessageDifferencer::Equals(lhs, rhs); +} + +bool operator!=(const Path & lhs, const Path & rhs) { + + return !(lhs == rhs); +} + +bool sort_pair_by_second(const pair & lhs, const pair & rhs) { + + return (lhs.second < rhs.second); +} + +bool sort_transcript_paths_by_name(const CompletedTranscriptPath & lhs, const CompletedTranscriptPath & rhs) { + + assert(!lhs.transcript_names.empty()); + assert(!lhs.transcript_names.front().empty()); + + assert(!rhs.transcript_names.empty()); + assert(!rhs.transcript_names.front().empty()); + + if (lhs.transcript_names.front() != rhs.transcript_names.front()) { + + return (lhs.transcript_names.front() < rhs.transcript_names.front()); + } + + if (lhs.is_reference != rhs.is_reference) { + + return (lhs.is_reference > rhs.is_reference); + } + + if (!lhs.embedded_path_names.empty() && !rhs.embedded_path_names.empty()) { + + return (lhs.embedded_path_names.front() < rhs.embedded_path_names.front()); + } + + if (!lhs.embedded_path_names.empty() || !rhs.embedded_path_names.empty()) { + + return !lhs.embedded_path_names.empty(); + } + + if (!lhs.haplotype_gbwt_ids.empty() && !rhs.haplotype_gbwt_ids.empty()) { + + return (lhs.haplotype_gbwt_ids.front() < rhs.haplotype_gbwt_ids.front()); + } + + return !lhs.haplotype_gbwt_ids.empty(); +} + +handle_t mapping_to_handle(const Mapping & mapping, const HandleGraph & graph) { + + return (graph.get_handle(mapping.position().node_id(), mapping.position().is_reverse())); +} + +string TranscriptPath::get_name() const { + + assert(!transcript_names.empty()); + assert(!transcript_names.front().empty()); + + assert(is_reference || is_haplotype); + + if (is_reference) { + + return (transcript_names.front() + "_R" + to_string(copy_id)); + + + } else { + + return (transcript_names.front() + "_H" + to_string(copy_id)); + } +} + +handle_t EditedTranscriptPath::get_first_node_handle(const HandleGraph & graph) const { + + assert(path.mapping_size() > 0); + + return graph.get_handle(path.mapping()[0].position().node_id(), path.mapping()[0].position().is_reverse()); +} + +CompletedTranscriptPath::CompletedTranscriptPath(const EditedTranscriptPath & edited_transcript_path_in) { + + transcript_names = edited_transcript_path_in.transcript_names; + embedded_path_names = edited_transcript_path_in.embedded_path_names; + haplotype_gbwt_ids = edited_transcript_path_in.haplotype_gbwt_ids; + + copy_id = edited_transcript_path_in.copy_id; + + is_reference = edited_transcript_path_in.is_reference; + is_haplotype = edited_transcript_path_in.is_haplotype; +} + +CompletedTranscriptPath::CompletedTranscriptPath(const EditedTranscriptPath & edited_transcript_path_in, const HandleGraph & graph) { + + transcript_names = edited_transcript_path_in.transcript_names; + embedded_path_names = edited_transcript_path_in.embedded_path_names; + haplotype_gbwt_ids = edited_transcript_path_in.haplotype_gbwt_ids; + + copy_id = edited_transcript_path_in.copy_id; + + is_reference = edited_transcript_path_in.is_reference; + is_haplotype = edited_transcript_path_in.is_haplotype; + + 
path.reserve(edited_transcript_path_in.path.mapping_size()); + + for (auto mapping: edited_transcript_path_in.path.mapping()) { + + auto handle = mapping_to_handle(mapping, graph); + + // Check that the path only consist of whole nodes (complete). + assert(mapping.edit_size() == 1); + assert(edit_is_match(mapping.edit(0))); + assert(mapping.position().offset() == 0); + assert(mapping.edit(0).from_length() == graph.get_length(handle)); + + path.emplace_back(handle); + } +} + +handle_t CompletedTranscriptPath::get_first_node_handle(const HandleGraph & graph) const { + + return path.front(); +} + +Transcriptome::Transcriptome(unique_ptr&& graph_in) : _graph(move(graph_in)) { + + if (!_graph) { + cerr << "\tERROR: Could not load graph." << endl; + exit(1); + } +} + +int32_t Transcriptome::add_intron_splice_junctions(vector intron_streams, unique_ptr & haplotype_index, const bool update_haplotypes) { + +#ifdef transcriptome_debug + double time_parsing_1 = gcsa::readTimer(); + cerr << "\tDEBUG Parsing start: " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif + + // Create path position overlay of graph + bdsg::PositionOverlay graph_path_pos_overlay(_graph.get()); + + vector introns; + + for (auto & intron_stream: intron_streams) { + + // Parse introns in BED format. + parse_introns(&introns, intron_stream, graph_path_pos_overlay); + } + + if (introns.empty()) { + + cerr << "\tERROR: No intron parsed" << endl; + exit(1); + } + + sort(introns.begin(), introns.end()); + + if (show_progress) { cerr << "\tParsed " << introns.size() << " introns" << endl; }; + +#ifdef transcriptome_debug + cerr << "\tDEBUG: " << gcsa::readTimer() - time_parsing_1 << " seconds, " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif + +#ifdef transcriptome_debug + double time_project_1 = gcsa::readTimer(); + cerr << "\tDEBUG Construction start: " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif + + // Construct edited reference intron paths using embedded graph paths. + auto edited_transcript_paths = construct_reference_transcript_paths_embedded(introns, graph_path_pos_overlay); + + if (show_progress) { cerr << "\tConstructed " << edited_transcript_paths.size() << " intron paths" << endl; }; + +#ifdef transcriptome_debug + cerr << "\tDEBUG: " << gcsa::readTimer() - time_project_1 << " seconds, " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif + +#ifdef transcriptome_debug + double time_augment_1 = gcsa::readTimer(); + cerr << "\tDEBUG Updating start: " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif + + if (has_novel_exon_boundaries(edited_transcript_paths, false)) { + + // Augment graph with new exon boundaries and splice-junction edges. + augment_graph(edited_transcript_paths, true, haplotype_index, update_haplotypes, false); + + } else { + + // Augment graph with new splice-junction edges. 
+ add_splice_junction_edges(edited_transcript_paths); + } + + assert(_transcript_paths.empty()); + + if (show_progress) { cerr << "\tUpdated graph with intron paths" << endl; }; + +#ifdef transcriptome_debug + cerr << "\tDEBUG: " << gcsa::readTimer() - time_augment_1 << " seconds, " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif + + return introns.size(); +} + +int32_t Transcriptome::add_reference_transcripts(vector transcript_streams, unique_ptr & haplotype_index, const bool use_haplotype_paths, const bool update_haplotypes) { + +#ifdef transcriptome_debug + double time_parsing_1 = gcsa::readTimer(); + cerr << "\tDEBUG Parsing start: " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif + + bdsg::PositionOverlay graph_path_pos_overlay; + + if (!use_haplotype_paths) { + + // Create path position overlay of graph if using embedded path references. + graph_path_pos_overlay = bdsg::PositionOverlay(_graph.get()); + } + + vector transcripts; + uint32_t number_of_excluded_transcripts = 0; + + int64_t lines_parsed = 0; + + for (auto & transcript_stream: transcript_streams) { + + // Parse transcripts in gtf/gff3 format. + lines_parsed += parse_transcripts(&transcripts, &number_of_excluded_transcripts, transcript_stream, graph_path_pos_overlay, *haplotype_index, use_haplotype_paths); + } + + if (number_of_excluded_transcripts > 0) { + + cerr << "\tWARNING: Excluded " << number_of_excluded_transcripts << " transcripts with overlapping exons or incorrect exon order." << endl; + } + + if (transcripts.empty() && lines_parsed != 0) { + + cerr << "\tERROR: No transcripts parsed (remember to set feature type \"-y\" in vg rna or \"-f\" in vg autoindex)" << endl; + exit(1); + } + + sort(transcripts.begin(), transcripts.end()); + + if (show_progress) { cerr << "\tParsed " << transcripts.size() << " transcripts" << endl; }; + +#ifdef transcriptome_debug + cerr << "\tDEBUG: " << gcsa::readTimer() - time_parsing_1 << " seconds, " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif + +#ifdef transcriptome_debug + double time_project_1 = gcsa::readTimer(); + cerr << "\tDEBUG Construction start: " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif + + list edited_transcript_paths; + + if (use_haplotype_paths) { + + // Construct edited reference transcript paths using haplotype GBWT paths. + edited_transcript_paths = construct_reference_transcript_paths_gbwt(transcripts, *haplotype_index); + + } else { + + // Construct edited reference transcript paths using embedded graph paths. + edited_transcript_paths = construct_reference_transcript_paths_embedded(transcripts, graph_path_pos_overlay); + } + + if (show_progress) { cerr << "\tConstructed " << edited_transcript_paths.size() << " reference transcript paths" << endl; }; + +#ifdef transcriptome_debug + cerr << "\tDEBUG: " << gcsa::readTimer() - time_project_1 << " seconds, " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif + +#ifdef transcriptome_debug + double time_augment_1 = gcsa::readTimer(); + cerr << "\tDEBUG Updating start: " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif + + if (has_novel_exon_boundaries(edited_transcript_paths, true)) { + + // Augment graph with new exon boundaries and splice-junction edges. + // Adds the edited transcript paths as reference transcript paths. 
+ augment_graph(edited_transcript_paths, false, haplotype_index, update_haplotypes, true); + + } else { + + // Augment graph with new splice-junction edges and add reference transcript paths. + add_edited_transcript_paths(edited_transcript_paths); + } + + // Sort transcript paths and update their copy ids. + sort_transcript_paths_update_copy_id(); + + if (show_progress) { cerr << "\tUpdated graph with reference transcript paths" << endl; }; + +#ifdef transcriptome_debug + cerr << "\tDEBUG: " << gcsa::readTimer() - time_augment_1 << " seconds, " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif + + return transcripts.size(); +} + +int32_t Transcriptome::add_haplotype_transcripts(vector transcript_streams, const gbwt::GBWT & haplotype_index, const bool proj_emded_paths) { + +#ifdef transcriptome_debug + double time_parsing_1 = gcsa::readTimer(); + cerr << "\tDEBUG Parsing start: " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif + + // Create path position overlay of splice graph + bdsg::PositionOverlay graph_path_pos_overlay(_graph.get()); + + vector transcripts; + uint32_t number_of_excluded_transcripts = 0; + + int64_t lines_parsed = 0; + + for (auto & transcript_stream: transcript_streams) { + + // Parse transcripts in gtf/gff3 format. + lines_parsed += parse_transcripts(&transcripts, &number_of_excluded_transcripts, transcript_stream, graph_path_pos_overlay, haplotype_index, false); + } + + if (number_of_excluded_transcripts > 0) { + + cerr << "\tWARNING: Excluded " << number_of_excluded_transcripts << " transcripts with overlapping exons or incorrect exon order." << endl; + } + + if (transcripts.empty() && lines_parsed != 0) { + + cerr << "\tERROR: No transcripts parsed (remember to set feature type \"-y\" in vg rna or \"-f\" in vg autoindex)" << endl; + exit(1); + } + + sort(transcripts.begin(), transcripts.end()); + + if (show_progress) { cerr << "\tParsed " << transcripts.size() << " transcripts" << endl; }; + +#ifdef transcriptome_debug + cerr << "\tDEBUG: " << gcsa::readTimer() - time_parsing_1 << " seconds, " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif + +#ifdef transcriptome_debug + double time_project_1 = gcsa::readTimer(); + cerr << "\tDEBUG Projection start: " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif + + // Save number of transcript paths before adding new. + auto pre_num_transcript_paths = _transcript_paths.size(); + + // Project and add transcripts to transcriptome. + project_haplotype_transcripts(transcripts, haplotype_index, graph_path_pos_overlay, proj_emded_paths, mean_node_length()); + + // Augment splice graph with new splice-junction edges. + add_splice_junction_edges(_transcript_paths); + + // Sort transcript paths and update their copy ids. 
+ sort_transcript_paths_update_copy_id(); + + assert(_transcript_paths.size() >= pre_num_transcript_paths); + + if (show_progress) { cerr << "\tProjected " << _transcript_paths.size() - pre_num_transcript_paths << " haplotype-specific transcript paths" << endl; } + +#ifdef transcriptome_debug + cerr << "\tDEBUG: " << gcsa::readTimer() - time_project_1 << " seconds, " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif + + return (_transcript_paths.size() - pre_num_transcript_paths); +} + +void Transcriptome::parse_introns(vector * introns, istream * intron_stream, const bdsg::PositionOverlay & graph_path_pos_overlay) const { + + int32_t line_number = 0; + + string chrom; + string pos; + string end; + string strand; + + while (intron_stream->good()) { + + line_number += 1; + + string intron_line; + getline(*intron_stream, intron_line); + + // Skip header. + if (intron_line.empty() || intron_line.front() == '#') { + + continue; + } + + stringstream intron_line_ss = stringstream(intron_line); + getline(intron_line_ss, chrom, '\t'); + + assert(_graph->has_path(chrom) == graph_path_pos_overlay.has_path(chrom)); + + if (!_graph->has_path(chrom)) { + + if (error_on_missing_path) { + + cerr << "\tERROR: Chromosome path \"" << chrom << "\" not found in graph (line " << line_number << ")." << endl; + exit(1); + + } else { + + // seek to the end of the line + continue; + } + } + + // Parse start and end intron position and convert end to inclusive. + assert(getline(intron_line_ss, pos, '\t')); + int32_t spos = stoi(pos); + assert(getline(intron_line_ss, pos, '\t')); + int32_t epos = stoi(pos) - 1; + + assert(spos <= epos); + + getline(intron_line_ss, strand, '\t'); + getline(intron_line_ss, strand, '\t'); + + bool is_reverse = false; + + if (getline(intron_line_ss, strand, '\t')) { + + assert(strand == "+" || strand == "-"); + is_reverse = (strand == "-") ? true : false; + } + + // Create "intron" transcript. + introns->emplace_back(Transcript("intron", is_reverse, chrom, graph_path_pos_overlay.get_path_length(_graph->get_path_handle(chrom)))); + + // Add intron boundaries as flanking exons to current "intron" transcript. + add_exon(&(introns->back()), make_pair(spos - 1, spos - 1), graph_path_pos_overlay); + add_exon(&(introns->back()), make_pair(epos + 1, epos + 1), graph_path_pos_overlay); + } +} + +int32_t Transcriptome::parse_transcripts(vector * transcripts, uint32_t * number_of_excluded_transcripts, istream * transcript_stream, const bdsg::PositionOverlay & graph_path_pos_overlay, const gbwt::GBWT & haplotype_index, const bool use_haplotype_paths) const { + + spp::sparse_hash_map chrom_lengths; + + if (use_haplotype_paths) { + + assert(haplotype_index.bidirectional()); + assert(haplotype_index.hasMetadata()); + + assert(haplotype_index.metadata.hasPathNames()); + assert(haplotype_index.metadata.hasContigNames()); + + // Parse reference sample tags. + auto gbwt_reference_samples = gbwtgraph::parse_reference_samples_tag(haplotype_index); + + for (size_t i = 0; i < haplotype_index.sequences(); i++) { + + // Skip reverse threads in bidirectional gbwt index. 
+ if (i % 2 == 1) { + + continue; + } + + auto base_gbwt_path_name = get_base_gbwt_path_name(haplotype_index, gbwt::Path::id(i), gbwt_reference_samples); + chrom_lengths.emplace(base_gbwt_path_name, numeric_limits::max()); + } + + } else { + + assert(_graph->for_each_path_handle([&](const path_handle_t & path_handle) { + + assert(graph_path_pos_overlay.has_path(_graph->get_path_name(path_handle))); + chrom_lengths.emplace(_graph->get_path_name(path_handle), graph_path_pos_overlay.get_path_length(path_handle)); + })); + } + + spp::sparse_hash_map parsed_transcripts; + spp::sparse_hash_set excluded_transcripts; + + int32_t line_number = 0; + int32_t parsed_lines = 0; + + string chrom; + string feature; + string pos; + string strand; + string attributes; + string attribute; + + bool zero_based_exon_number = false; + + while (transcript_stream->good()) { + + line_number += 1; + + string transcript_line; + getline(*transcript_stream, transcript_line); + + // Skip header. + if (transcript_line.empty() || transcript_line.front() == '#') { + + continue; + } + + stringstream transcript_line_ss = stringstream(transcript_line); + getline(transcript_line_ss, chrom, '\t'); + + parsed_lines += 1; + + auto chrom_lengths_it = chrom_lengths.find(chrom); + + if (chrom_lengths_it == chrom_lengths.end()) { + + if (error_on_missing_path) { + + cerr << "\tERROR: Chromosome path \"" << chrom << "\" not found in graph or haplotypes index (line " << line_number << ")." << endl; + exit(1); + + } else { + + // Seek to the end of the line. + continue; + } + } + + transcript_line_ss.ignore(numeric_limits::max(), '\t'); + assert(getline(transcript_line_ss, feature, '\t')); + + // Select only relevant feature types. + if (feature != feature_type && !feature_type.empty()) { + + continue; + } + + // Parse start and end exon position and convert to 0-base. + assert(getline(transcript_line_ss, pos, '\t')); + int32_t spos = stoi(pos) - 1; + assert(getline(transcript_line_ss, pos, '\t')); + int32_t epos = stoi(pos) - 1; + + assert(spos <= epos); + + // Skip score column. + transcript_line_ss.ignore(numeric_limits::max(), '\t'); + + // Parse strand and set whether it is reverse. + assert(getline(transcript_line_ss, strand, '\t')); + assert(strand == "+" || strand == "-"); + bool is_reverse = (strand == "-") ? true : false; + + // Skip frame column. + transcript_line_ss.ignore(numeric_limits::max(), '\t'); + + string transcript_id = ""; + int32_t exon_number = -1; + + while (getline(transcript_line_ss, attribute, ';')) { + + if (attribute.empty()) { + + break; + } + + // Parse transcript ID. + if (transcript_id.empty()) { + + transcript_id = parse_attribute_value(attribute, transcript_tag); + } + + // Parse exon number. + if (exon_number < 0) { + + auto exon_number_str = parse_attribute_value(attribute, "exon_number"); + + if (exon_number_str.empty()) { + + // If not exon_number attribute try ID. + auto exon_id = parse_attribute_value(attribute, "ID"); + + if (count(exon_id.begin(), exon_id.end(), ':') == 2) { + + auto exon_id_ss = stringstream(exon_id); + + string element; + getline(exon_id_ss, element, ':'); + + if (element == "exon") { + + getline(exon_id_ss, element, ':'); + getline(exon_id_ss, element); + + exon_number = stoi(element); + } + } + + } else { + + exon_number = stoi(exon_number_str); + } + } + + if (!transcript_id.empty() && exon_number >= 0) { + + break; + } + } + + if (transcript_id.empty()) { + + cerr << "\tERROR: Tag \"" << transcript_tag << "\" not found in attributes (line " << line_number << ")." 
<< endl; + exit(1); + } + + auto parsed_transcripts_it = parsed_transcripts.emplace(transcript_id, Transcript(transcript_id, is_reverse, chrom, chrom_lengths_it->second)); + + Transcript * transcript = &(parsed_transcripts_it.first->second); + + assert(transcript->name == transcript_id); + assert(transcript->is_reverse == is_reverse); + assert(transcript->chrom == chrom); + assert(transcript->chrom_length == chrom_lengths_it->second); + + if (use_haplotype_paths) { + + // Add exon to current transcript. + add_exon(transcript, make_pair(spos, epos)); + + } else { + + // Add exon to current transcript. + add_exon(transcript, make_pair(spos, epos), graph_path_pos_overlay); + } + + // Check if exons are in correct order in file. + if (exon_number >= 0) { + + // If first transcript and exon, set whether exon numbering is zero-based. + if (parsed_transcripts.size() == 1 && transcript->exons.size() == 1) { + + zero_based_exon_number = (exon_number == 0) ? true : false; + } + + if (transcript->exons.size() - static_cast(zero_based_exon_number) != exon_number) { + + // Exclude transcripts with exons in incorrect order according to attributes. + excluded_transcripts.emplace(transcript_id); + } + } + } + + for (auto & transcript: parsed_transcripts) { + + // Exclude transcripts with overlapping exons. + if (has_overlapping_exons(transcript.second.exons)) { + + excluded_transcripts.emplace(transcript.first); + } + } + + assert(excluded_transcripts.size() <= parsed_transcripts.size()); + + transcripts->reserve(transcripts->size() + parsed_transcripts.size() - excluded_transcripts.size()); + + for (auto & transcript: parsed_transcripts) { + + if (excluded_transcripts.find(transcript.first) == excluded_transcripts.end()) { + + // Reorder reversed order exons. + reorder_exons(&(transcript.second)); + + transcripts->emplace_back(move(transcript.second)); + } + } + + *number_of_excluded_transcripts += excluded_transcripts.size(); + + return parsed_lines; +} + +string Transcriptome::get_base_gbwt_path_name(const gbwt::GBWT & haplotype_index, const size_t path_id, const unordered_set & gbwt_reference_samples) const { + + auto gbwt_path_metadata = haplotype_index.metadata.path(path_id); + PathSense sense = gbwtgraph::get_path_sense(haplotype_index.metadata, gbwt_path_metadata, gbwt_reference_samples); + + string base_gbwt_path_name = ""; + + if (sense == PathSense::HAPLOTYPE) { + + // Create base gbwt path name without phaseblock and subrange. + base_gbwt_path_name = PathMetadata::create_path_name(PathSense::REFERENCE, + gbwtgraph::get_path_sample_name(haplotype_index.metadata, gbwt_path_metadata, sense), + gbwtgraph::get_path_locus_name(haplotype_index.metadata, gbwt_path_metadata, sense), + gbwtgraph::get_path_haplotype(haplotype_index.metadata, gbwt_path_metadata, sense), + PathMetadata::NO_PHASE_BLOCK, + PathMetadata::NO_SUBRANGE); + + } else { + + base_gbwt_path_name = gbwtgraph::compose_path_name(haplotype_index.metadata, gbwt_path_metadata, sense); + } + + return base_gbwt_path_name; +} + +string Transcriptome::parse_attribute_value(const string & attribute, const string & name) const { + + string value = ""; + + const uint32_t attribute_start_pos = (attribute.front() == ' '); + + if (attribute.substr(attribute_start_pos, name.size()) == name) { + + // Is gff3 format. + if (attribute.substr(name.size(), 1) == "=") { + + assert(attribute_start_pos == 0); + value = attribute.substr(name.size() + 1); + + } else { + + // Is value in quotes (""). 
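+ // In GTF the attribute is typically written as: transcript_id "ENST00000000001"
+ // i.e. the value is wrapped in double quotes, while unquoted values
+ // (e.g. exon_number 2) are handled by the else branch below.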
+ if (attribute.substr(attribute_start_pos + name.size() + 1, 1) == "\"") { + + value = attribute.substr(attribute_start_pos + name.size() + 2); + + assert(value.back() == '\"'); + value.pop_back(); + + } else { + + value = attribute.substr(attribute_start_pos + name.size() + 1); + } + } + } + + return value; +} + +float Transcriptome::mean_node_length() const { + + return static_cast(_graph->get_total_length()) / _graph->get_node_count(); +} + +void Transcriptome::add_exon(Transcript * transcript, const pair & exon_pos) const { + + assert(exon_pos.first >= 0); + assert(exon_pos.second < transcript->chrom_length); + + transcript->exons.emplace_back(Exon()); + transcript->exons.back().coordinates = exon_pos; +} + +void Transcriptome::add_exon(Transcript * transcript, const pair & exon_pos, const bdsg::PositionOverlay & graph_path_pos_overlay) const { + + add_exon(transcript, exon_pos); + + // Exon border positions (last position in upstream intron and + // first position in downstream intron). The positions are not + // offset if it is the first or last on the path. + const pair exon_border_pos = make_pair(max(0, exon_pos.first - 1), min(static_cast(transcript->chrom_length) - 1, exon_pos.second + 1)); + + auto path_handle = _graph->get_path_handle(transcript->chrom); + + // Find path positions of exon node borders (start - 1 and end + 1). + auto chrom_path_start_step = graph_path_pos_overlay.get_step_at_position(path_handle, exon_border_pos.first); + auto chrom_path_end_step = graph_path_pos_overlay.get_step_at_position(path_handle, exon_border_pos.second); + + assert(chrom_path_start_step != graph_path_pos_overlay.path_end(path_handle)); + assert(chrom_path_end_step != graph_path_pos_overlay.path_end(path_handle)); + + // Find the start position of the exon border nodes. + auto chrom_path_start_node_pos = graph_path_pos_overlay.get_position_of_step(chrom_path_start_step); + auto chrom_path_end_node_pos = graph_path_pos_overlay.get_position_of_step(chrom_path_end_step); + + assert(chrom_path_start_node_pos <= exon_border_pos.first); + assert(chrom_path_end_node_pos <= exon_border_pos.second); + + // Add node offsets of exon border boundaries. + transcript->exons.back().border_offsets = make_pair(exon_border_pos.first - chrom_path_start_node_pos, exon_border_pos.second - chrom_path_end_node_pos); + + // Add path steps of exon border boundaries. + transcript->exons.back().border_steps = make_pair(chrom_path_start_step, chrom_path_end_step); +} + +void Transcriptome::reorder_exons(Transcript * transcript) const { + + if (transcript->is_reverse) { + + // Is exons in reverse order. + bool is_reverse_order = true; + for (size_t i = 1; i < transcript->exons.size(); i++) { + + if (transcript->exons.at(i).coordinates.second > transcript->exons.at(i - 1).coordinates.first) { + + is_reverse_order = false; + } + } + + // Reverse if exons are in reverse order. + if (is_reverse_order) { + + reverse(transcript->exons.begin(), transcript->exons.end()); + } + } +} + +bool Transcriptome::has_overlapping_exons(const vector & exons) const { + + for (size_t i = 1; i < exons.size(); ++i) { + + // Is exons in reverse order. 
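+ // Exons may be stored in either ascending or descending coordinate order,
+ // so the previous and current exon are compared in both directions below.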
+ if (exons.at(i - 1).coordinates.first <= exons.at(i).coordinates.first) { + + if (exons.at(i - 1).coordinates.second >= exons.at(i).coordinates.first) { + + return true; + } + + } else { + + if (exons.at(i).coordinates.second >= exons.at(i - 1).coordinates.first) { + + return true; + } + } + } + + return false; +} + +list Transcriptome::construct_reference_transcript_paths_embedded(const vector & transcripts, const bdsg::PositionOverlay & graph_path_pos_overlay) const { + + list edited_transcript_paths; + spp::sparse_hash_map > edited_transcript_paths_index; + + mutex edited_transcript_paths_mutex; + + vector construction_threads; + construction_threads.reserve(num_threads); + + // Spawn construction threads. + for (size_t thread_idx = 0; thread_idx < num_threads; thread_idx++) { + + construction_threads.push_back(thread(&Transcriptome::construct_reference_transcript_paths_embedded_callback, this, &edited_transcript_paths, &edited_transcript_paths_index, &edited_transcript_paths_mutex, thread_idx, ref(transcripts), ref(graph_path_pos_overlay))); + } + + // Join construction threads. + for (auto & thread: construction_threads) { + + thread.join(); + } + + return edited_transcript_paths; +} + +void Transcriptome::construct_reference_transcript_paths_embedded_callback(list * edited_transcript_paths, spp::sparse_hash_map > * edited_transcript_paths_index, mutex * edited_transcript_paths_mutex, const int32_t thread_idx, const vector & transcripts, const bdsg::PositionOverlay & graph_path_pos_overlay) const { + + list thread_edited_transcript_paths; + + int32_t transcripts_idx = thread_idx; + + while (transcripts_idx < transcripts.size()) { + + // Get next transcript belonging to current thread. + const Transcript & transcript = transcripts.at(transcripts_idx); + + // Construct edited transcript paths. + auto new_edited_transcript_paths = project_transcript_embedded(transcript, graph_path_pos_overlay, true, false); + + if (!new_edited_transcript_paths.empty()) { + + assert(new_edited_transcript_paths.size() == 1); + thread_edited_transcript_paths.emplace_back(new_edited_transcript_paths.front()); + } + + transcripts_idx += num_threads; + } + + edited_transcript_paths_mutex->lock(); + remove_redundant_transcript_paths(&thread_edited_transcript_paths, edited_transcript_paths_index); + edited_transcript_paths->splice(edited_transcript_paths->end(), thread_edited_transcript_paths); + edited_transcript_paths_mutex->unlock(); +} + +list Transcriptome::project_transcript_embedded(const Transcript & cur_transcript, const bdsg::PositionOverlay & graph_path_pos_overlay, const bool use_reference_paths, const bool use_haplotype_paths) const { + + assert(use_reference_paths != use_haplotype_paths); + + vector > exon_start_node_path_steps; + vector > exon_end_node_path_steps; + + exon_start_node_path_steps.reserve(cur_transcript.exons.size()); + exon_end_node_path_steps.reserve(cur_transcript.exons.size()); + + // Get embedded path ids and node mappings for all exon border nodes in transcript. 
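+ // The border steps stored on each exon point to the last position of the
+ // upstream intron and the first position of the downstream intron (see add_exon).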
+ for (auto & exon: cur_transcript.exons) { + + auto exon_path_handle = _graph->get_path_handle_of_step(exon.border_steps.first); + assert(exon_path_handle == _graph->get_path_handle_of_step(exon.border_steps.second)); + + assert(cur_transcript.chrom == _graph->get_path_name(exon_path_handle)); + + exon_start_node_path_steps.emplace_back(multimap()); + exon_start_node_path_steps.back().emplace(exon_path_handle, exon.border_steps.first); + + exon_end_node_path_steps.emplace_back(multimap()); + exon_end_node_path_steps.back().emplace(exon_path_handle, exon.border_steps.second); + + if (use_haplotype_paths) { + + auto start_border_is_reverse = _graph->get_is_reverse(_graph->get_handle_of_step(exon.border_steps.first)); + + _graph->for_each_step_on_handle(_graph->get_handle_of_step(exon.border_steps.first), [&](const step_handle_t & step) { + + // Do not allow multiple lengths due to cycles for reference exon. + auto step_path_handle = _graph->get_path_handle_of_step(step); + if (step_path_handle != exon_path_handle && _graph->get_is_reverse(_graph->get_handle_of_step(step)) == start_border_is_reverse) { + + exon_start_node_path_steps.back().emplace(step_path_handle, step); + } + }); + + auto end_border_is_reverse = _graph->get_is_reverse(_graph->get_handle_of_step(exon.border_steps.second)); + + _graph->for_each_step_on_handle(_graph->get_handle_of_step(exon.border_steps.second), [&](const step_handle_t & step) { + + // Do not allow multiple lengths due to cycles for reference exon. + auto step_path_handle = _graph->get_path_handle_of_step(step); + if (step_path_handle != exon_path_handle && _graph->get_is_reverse(_graph->get_handle_of_step(step)) == end_border_is_reverse) { + + exon_end_node_path_steps.back().emplace(step_path_handle, step); + } + }); + } + } + + list edited_transcript_paths; + + // Loop over all paths that contain the transcript start node. + for (auto & path_steps_start: exon_start_node_path_steps.front()) { + + // Skip path if transcript end node is not in the current path. + if (exon_end_node_path_steps.back().find(path_steps_start.first) == exon_end_node_path_steps.back().end()) { + + continue; + } + + const auto path_origin_name = _graph->get_path_name(path_steps_start.first); + + // Skip alternative allele paths (_alt). + if (Paths::is_alt(path_origin_name)) { + + continue; + } + + // Do not construct transcript paths originating from a reference chromosome/contig. + if (path_origin_name == cur_transcript.chrom && !use_reference_paths) { + + continue; + } + + // Do not construct transcript paths originating from a haplotype. + if (path_origin_name != cur_transcript.chrom && !use_haplotype_paths) { + + continue; + } + + list cur_edited_transcript_paths; + + // Construct transcript path and set transcript origin name. + cur_edited_transcript_paths.emplace_back(cur_transcript.name, path_origin_name, use_reference_paths, use_haplotype_paths); + + bool is_partial = false; + + for (size_t exon_idx = 0; exon_idx < exon_start_node_path_steps.size(); ++exon_idx) { + + if (is_partial) { break; } + + // Transcripts with cycles at both exon boundaries are currently + // not supported. + // TODO: Add support for this. 
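+ // A per-path count above one means the haplotype path steps on the exon
+ // boundary node more than once (a cycle), so the boundary step is ambiguous.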
+ if (exon_start_node_path_steps.at(exon_idx).count(path_steps_start.first) > 1 && exon_end_node_path_steps.at(exon_idx).count(path_steps_start.first) > 1) { + + assert(use_haplotype_paths); + + is_partial = true; + break; + } + + auto haplotype_path_start_it_range = exon_start_node_path_steps.at(exon_idx).equal_range(path_steps_start.first); + auto haplotype_path_end_it_range = exon_end_node_path_steps.at(exon_idx).equal_range(path_steps_start.first); + + // Transcript paths are partial if either the start or end exon path + // step is empty. Partial transcripts are currently not supported. + // TODO: Add support for partial transcript paths. + if (haplotype_path_start_it_range.first == haplotype_path_start_it_range.second || haplotype_path_end_it_range.first == haplotype_path_end_it_range.second) { + + is_partial = true; + break; + } + + haplotype_path_start_it_range.second--; + haplotype_path_end_it_range.second--; + + auto cur_edited_transcript_paths_base_it = cur_edited_transcript_paths.begin(); + auto cur_edited_transcript_paths_base_eit = cur_edited_transcript_paths.end(); + + assert(cur_edited_transcript_paths_base_it != cur_edited_transcript_paths_base_eit); + cur_edited_transcript_paths_base_eit--; + + while (true) { + + auto border_offsets = cur_transcript.exons.at(exon_idx).border_offsets; + + // Get path step at exon start if exon start node is in the current path. + auto haplotype_path_start_step = haplotype_path_start_it_range.first->second; + + // Get path mapping at exon end if exon end node is in the current path. + auto haplotype_path_end_step = haplotype_path_end_it_range.first->second; + + // Exons with different border node orientations are currently not supported + // for haplotype path projection. + // TODO: Add support for this special case. + if (use_haplotype_paths && (_graph->get_is_reverse(_graph->get_handle_of_step(haplotype_path_start_step)) != _graph->get_is_reverse(_graph->get_handle_of_step(haplotype_path_end_step)))) { + + is_partial = true; + break; + } + + // Swap start and end steps if in reverse order on path + if (graph_path_pos_overlay.get_position_of_step(haplotype_path_start_step) > graph_path_pos_overlay.get_position_of_step(haplotype_path_end_step)) { + + assert(border_offsets.first + 1 == _graph->get_length(_graph->get_handle_of_step(haplotype_path_start_step))); + assert(border_offsets.second == 0); + + swap(haplotype_path_start_step, haplotype_path_end_step); + border_offsets.first = _graph->get_length(_graph->get_handle_of_step(haplotype_path_start_step)) - 1; + } + + Path exon_path; + bool is_first_step = true; + + while (true) { + + auto node_length = _graph->get_length(_graph->get_handle_of_step(haplotype_path_start_step)); + int32_t offset = 0; + + // Adjust start position from exon border (last position in upstream intron) + // to first position in exon. Do not adjust if first position in path. + if ((cur_transcript.exons.at(exon_idx).coordinates.first > 0) && is_first_step) { + + if (border_offsets.first + 1 == node_length) { + + assert(haplotype_path_start_step != haplotype_path_end_step); + haplotype_path_start_step = _graph->get_next_step(haplotype_path_start_step); + + is_first_step = false; + continue; + + } else { + + offset = border_offsets.first + 1; + } + } + + int32_t edit_length = node_length - offset; + + // Adjust end position from exon border (first position in downstream intron) + // to last position in exon. Do not adjust if last position in path. 
+ if ((cur_transcript.exons.at(exon_idx).coordinates.second < cur_transcript.chrom_length - 1) && (haplotype_path_start_step == haplotype_path_end_step)) { + + if (border_offsets.second == 0) { + + break; + + } else { + + edit_length = border_offsets.second - offset; + } + } + + assert(0 <= offset && offset < node_length); + assert(0 < edit_length && edit_length <= node_length); + + // Add new mapping in forward direction. Later the whole path will + // be reverse complemented if transcript is on the '-' strand. + auto new_mapping = exon_path.add_mapping(); + new_mapping->set_rank(exon_path.mapping_size()); + + new_mapping->mutable_position()->set_node_id(_graph->get_id(_graph->get_handle_of_step(haplotype_path_start_step))); + new_mapping->mutable_position()->set_offset(offset); + new_mapping->mutable_position()->set_is_reverse(_graph->get_is_reverse(_graph->get_handle_of_step(haplotype_path_start_step))); + + // Add new edit representing a complete match. + auto new_edit = new_mapping->add_edit(); + new_edit->set_from_length(edit_length); + new_edit->set_to_length(edit_length); + + if (haplotype_path_start_step == haplotype_path_end_step) { break; } + + haplotype_path_start_step = _graph->get_next_step(haplotype_path_start_step); + is_first_step = false; + } + + if (haplotype_path_start_it_range.first == haplotype_path_start_it_range.second && haplotype_path_end_it_range.first == haplotype_path_end_it_range.second) { + + auto exon_cur_edited_transcript_paths_base_it = cur_edited_transcript_paths_base_it; + + while (true) { + + exon_cur_edited_transcript_paths_base_it->path = concat_paths(exon_cur_edited_transcript_paths_base_it->path, exon_path); + + if (exon_cur_edited_transcript_paths_base_it == cur_edited_transcript_paths_base_eit) { break; } + ++exon_cur_edited_transcript_paths_base_it; + } + + break; + + } else { + + auto exon_cur_edited_transcript_paths_base_it = cur_edited_transcript_paths_base_it; + + while (true) { + + // If not last boundary combination copy current base transcipt path. + cur_edited_transcript_paths.emplace_back(*exon_cur_edited_transcript_paths_base_it); + cur_edited_transcript_paths.back().path = concat_paths(cur_edited_transcript_paths.back().path, exon_path); + + if (exon_cur_edited_transcript_paths_base_it == cur_edited_transcript_paths_base_eit) { break; } + ++exon_cur_edited_transcript_paths_base_it; + } + } + + assert(haplotype_path_start_it_range.first == haplotype_path_start_it_range.second || haplotype_path_end_it_range.first == haplotype_path_end_it_range.second); + + if (haplotype_path_start_it_range.first != haplotype_path_start_it_range.second) { + + ++haplotype_path_start_it_range.first; + + } else { + + assert(haplotype_path_end_it_range.first != haplotype_path_end_it_range.second); + ++haplotype_path_end_it_range.first; + } + } + } + + if (!is_partial) { + + auto cur_edited_transcript_paths_it = cur_edited_transcript_paths.begin(); + + while (cur_edited_transcript_paths_it != cur_edited_transcript_paths.end()) { + + if (cur_edited_transcript_paths_it->path.mapping_size() == 0) { + + // Delete empty paths. + cur_edited_transcript_paths_it = cur_edited_transcript_paths.erase(cur_edited_transcript_paths_it); + + } else { + + // Reverse complement transcript paths that are on the '-' strand. 
+ if (cur_transcript.is_reverse) { + + reverse_complement_path_in_place(&(cur_edited_transcript_paths_it->path), [&](vg::id_t node_id) {return _graph->get_length(_graph->get_handle(node_id, false));}); + } + } + + ++cur_edited_transcript_paths_it; + } + + edited_transcript_paths.splice(edited_transcript_paths.end(), cur_edited_transcript_paths); + } + } + + return edited_transcript_paths; +} + +list Transcriptome::construct_reference_transcript_paths_gbwt(const vector & transcripts, const gbwt::GBWT & haplotype_index) const { + + vector > chrom_transcript_sets; + string cur_chrom = ""; + + // Create sets of transcripts on same chromosome/contig. + for (size_t i = 0; i < transcripts.size(); ++i) { + + if (cur_chrom != transcripts.at(i).chrom) { + + if (!chrom_transcript_sets.empty()) { + + // Set size of previous set. + chrom_transcript_sets.back().second = i - chrom_transcript_sets.back().first; + } + + chrom_transcript_sets.emplace_back(i, 0); + cur_chrom = transcripts.at(i).chrom; + } + } + + // Set size of last set. + chrom_transcript_sets.back().second = transcripts.size() - chrom_transcript_sets.back().first; + sort(chrom_transcript_sets.rbegin(), chrom_transcript_sets.rend(), sort_pair_by_second); + + assert(haplotype_index.bidirectional()); + assert(haplotype_index.hasMetadata()); + + assert(haplotype_index.metadata.hasPathNames()); + assert(haplotype_index.metadata.hasContigNames()); + + spp::sparse_hash_map > haplotype_name_index; + + // Parse reference sample tags. + auto gbwt_reference_samples = gbwtgraph::parse_reference_samples_tag(haplotype_index); + + // Create index mapping haplotype contig names to GBWT sequence ids and offsets. + for (size_t i = 0; i < haplotype_index.sequences(); i++) { + + // Skip reverse threads in bidirectional gbwt index. + if (i % 2 == 1) { + + continue; + } + + auto base_gbwt_path_name = get_base_gbwt_path_name(haplotype_index, gbwt::Path::id(i), gbwt_reference_samples); + + auto haplotype_name_index_it = haplotype_name_index.emplace(base_gbwt_path_name, map()); + assert(haplotype_name_index_it.first->second.emplace(haplotype_index.metadata.path(gbwt::Path::id(i)).count, i).second); + } + + list edited_transcript_paths; + spp::sparse_hash_map > edited_transcript_paths_index; + + uint32_t excluded_transcripts = 0; + mutex edited_transcript_paths_mutex; + + vector construction_threads; + construction_threads.reserve(num_threads); + + // Spawn construction threads. + for (size_t thread_idx = 0; thread_idx < num_threads; thread_idx++) { + + construction_threads.push_back(thread(&Transcriptome::construct_reference_transcript_paths_gbwt_callback, this, &edited_transcript_paths, &edited_transcript_paths_index, &excluded_transcripts, &edited_transcript_paths_mutex, thread_idx, ref(chrom_transcript_sets), ref(transcripts), ref(haplotype_index), ref(haplotype_name_index))); + } + + // Join construction threads. + for (auto & thread: construction_threads) { + + thread.join(); + } + + if (excluded_transcripts > 0) { + + cerr << "\tWARNING: Excluded " << excluded_transcripts << " transcripts with exon overlapping a haplotype break." 
<< endl; + + } + + return edited_transcript_paths; +} + +void Transcriptome::construct_reference_transcript_paths_gbwt_callback(list * edited_transcript_paths, spp::sparse_hash_map > * edited_transcript_paths_index, uint32_t * excluded_transcripts, mutex * edited_transcript_paths_mutex, const int32_t thread_idx, const vector > & chrom_transcript_sets, const vector & transcripts, const gbwt::GBWT & haplotype_index, const spp::sparse_hash_map > & haplotype_name_index) const { + + int32_t chrom_transcript_sets_idx = thread_idx; + + while (chrom_transcript_sets_idx < chrom_transcript_sets.size()) { + + uint32_t excluded_transcripts_local = 0; + + list thread_edited_transcript_paths; + + const pair & transcript_set = chrom_transcript_sets.at(chrom_transcript_sets_idx); + + assert(transcript_set.second > 0); + uint32_t transcript_idx = transcript_set.first; + + list > > incomplete_transcript_paths; + + auto haplotype_name_index_it = haplotype_name_index.find(transcripts.at(transcript_idx).chrom); + assert(haplotype_name_index_it != haplotype_name_index.end()); + + for (auto & haplotype_idx: haplotype_name_index_it->second) { + + auto incomplete_transcript_paths_it = incomplete_transcript_paths.begin(); + + while (incomplete_transcript_paths_it != incomplete_transcript_paths.end()) { + + // Delete transcripts with exon overlapping haplotype break. + if (get<2>(incomplete_transcript_paths_it->second)) { + + ++excluded_transcripts_local; + incomplete_transcript_paths_it = incomplete_transcript_paths.erase(incomplete_transcript_paths_it); + + } else { + + ++incomplete_transcript_paths_it; + } + } + + auto node_start_pos = haplotype_idx.first; + const gbwt::vector_type & gbwt_haplotype = haplotype_index.extract(haplotype_idx.second); + + for (auto & gbwt_node: gbwt_haplotype) { + + auto node_handle = gbwt_to_handle(*_graph, gbwt_node); + auto node_length = _graph->get_length(node_handle); + + while (transcript_idx < transcript_set.first + transcript_set.second) { + + const Transcript & cur_transcript = transcripts.at(transcript_idx); + + // Create new transcript path for transcript with first + // exon starting in current node. + if (cur_transcript.exons.front().coordinates.first >= node_start_pos && cur_transcript.exons.front().coordinates.first < node_start_pos + node_length) { + + incomplete_transcript_paths.emplace_back(EditedTranscriptPath(cur_transcript.name, gbwt::Path::id(haplotype_idx.second), true, false), make_tuple(transcript_idx, 0, false)); + + } else if (node_start_pos + node_length <= cur_transcript.exons.front().coordinates.first) { + + break; + + } else { + + ++excluded_transcripts_local; + } + + ++transcript_idx; + } + + incomplete_transcript_paths_it = incomplete_transcript_paths.begin(); + + while (incomplete_transcript_paths_it != incomplete_transcript_paths.end()) { + + const Transcript & cur_transcript = transcripts.at(get<0>(incomplete_transcript_paths_it->second)); + + while (get<1>(incomplete_transcript_paths_it->second) < cur_transcript.exons.size()) { + + const pair & exon_coords = cur_transcript.exons.at(get<1>(incomplete_transcript_paths_it->second)).coordinates; + + // Exon is downstream current node. + if (node_start_pos + node_length <= exon_coords.first) { + + break; + } + + int32_t offset = 0; + int32_t edit_length = 0; + + // Exon is starting in current node. 
+ if (!get<2>(incomplete_transcript_paths_it->second) && node_start_pos <= exon_coords.first) { + + offset = exon_coords.first - node_start_pos; + get<2>(incomplete_transcript_paths_it->second) = true; + } + + if (get<2>(incomplete_transcript_paths_it->second)) { + + edit_length = node_length - offset; + + // Exon is ending in current node. + if (exon_coords.second < node_start_pos + node_length) { + + edit_length = exon_coords.second - node_start_pos - offset + 1; + + get<1>(incomplete_transcript_paths_it->second)++; + get<2>(incomplete_transcript_paths_it->second) = false; + } + + assert(0 <= offset && offset < node_length); + assert(edit_length > 0 && edit_length <= node_length); + + // Add new mapping in forward direction. Later the whole path will + // be reverse complemented if transcript is on the '-' strand. + auto new_mapping = incomplete_transcript_paths_it->first.path.add_mapping(); + new_mapping->set_rank(incomplete_transcript_paths_it->first.path.mapping_size()); + + new_mapping->mutable_position()->set_node_id(_graph->get_id(node_handle)); + new_mapping->mutable_position()->set_offset(offset); + new_mapping->mutable_position()->set_is_reverse(_graph->get_is_reverse(node_handle)); + + // Add new edit representing a complete match. + auto new_edit = new_mapping->add_edit(); + new_edit->set_from_length(edit_length); + new_edit->set_to_length(edit_length); + + if (node_start_pos + node_length <= exon_coords.second) { + + break; + } + + } else { + + break; + } + } + + if (get<1>(incomplete_transcript_paths_it->second) == cur_transcript.exons.size()) { + + // Reverse complement transcript paths that are on the '-' strand. + if (cur_transcript.is_reverse) { + + reverse_complement_path_in_place(&(incomplete_transcript_paths_it->first.path), [&](vg::id_t node_id) {return _graph->get_length(_graph->get_handle(node_id, false));}); + } + + assert(incomplete_transcript_paths_it->first.path.mapping_size() > 0); + thread_edited_transcript_paths.emplace_back(move(incomplete_transcript_paths_it->first)); + + incomplete_transcript_paths_it = incomplete_transcript_paths.erase(incomplete_transcript_paths_it); + + } else { + + ++incomplete_transcript_paths_it; + } + } + + if (transcript_idx == transcript_set.first + transcript_set.second && incomplete_transcript_paths.empty()) { + + break; + } + + node_start_pos += node_length; + } + + if (transcript_idx == transcript_set.first + transcript_set.second && incomplete_transcript_paths.empty()) { + + break; + } + } + + excluded_transcripts_local += incomplete_transcript_paths.size(); + + assert(transcript_idx <= transcript_set.first + transcript_set.second); + excluded_transcripts_local += (transcript_set.first + transcript_set.second - transcript_idx); + + assert(thread_edited_transcript_paths.size() == transcript_set.second - excluded_transcripts_local); + + edited_transcript_paths_mutex->lock(); + + remove_redundant_transcript_paths(&thread_edited_transcript_paths, edited_transcript_paths_index); + edited_transcript_paths->splice(edited_transcript_paths->end(), thread_edited_transcript_paths); + *excluded_transcripts += excluded_transcripts_local; + + edited_transcript_paths_mutex->unlock(); + + chrom_transcript_sets_idx += num_threads; + } +} + +void Transcriptome::project_haplotype_transcripts(const vector & transcripts, const gbwt::GBWT & haplotype_index, const bdsg::PositionOverlay & graph_path_pos_overlay, const bool proj_emded_paths, const float mean_node_length) { + + list completed_transcript_paths; + spp::sparse_hash_map > 
completed_transcript_paths_index; + + for (auto & transcript_path: _transcript_paths) { + + auto completed_transcript_paths_index_it = completed_transcript_paths_index.emplace(transcript_path.get_first_node_handle(*_graph), vector()); + completed_transcript_paths_index_it.first->second.emplace_back(&transcript_path); + } + + mutex completed_transcript_paths_mutex; + + vector projection_threads; + projection_threads.reserve(num_threads); + + // Spawn projection threads. + for (size_t thread_idx = 0; thread_idx < num_threads; thread_idx++) { + + projection_threads.push_back(thread(&Transcriptome::project_haplotype_transcripts_callback, this, &completed_transcript_paths, &completed_transcript_paths_index, &completed_transcript_paths_mutex, thread_idx, ref(transcripts), ref(haplotype_index), ref(graph_path_pos_overlay), proj_emded_paths, mean_node_length)); + } + + // Join projection threads. + for (auto & thread: projection_threads) { + + thread.join(); + } + + _transcript_paths.reserve(_transcript_paths.size() + completed_transcript_paths.size()); + + for (auto & transcript_path: completed_transcript_paths) { + + _transcript_paths.emplace_back(move(transcript_path)); + } +} + +void Transcriptome::project_haplotype_transcripts_callback(list * completed_transcript_paths, spp::sparse_hash_map > * completed_transcript_paths_index, mutex * completed_transcript_paths_mutex, const int32_t thread_idx, const vector & transcripts, const gbwt::GBWT & haplotype_index, const bdsg::PositionOverlay & graph_path_pos_overlay, const bool proj_emded_paths, const float mean_node_length) { + + list thread_completed_transcript_paths; + + int32_t transcripts_idx = thread_idx; + + while (transcripts_idx < transcripts.size()) { + + // Get next transcript belonging to current thread. + const Transcript & transcript = transcripts.at(transcripts_idx); + + list completed_transcript_paths; + + if (!haplotype_index.empty()) { + + // Project transcript onto haplotypes in GBWT index. + thread_completed_transcript_paths.splice(thread_completed_transcript_paths.end(), construct_completed_transcript_paths(project_transcript_gbwt(transcript, haplotype_index, mean_node_length))); + } + + if (proj_emded_paths) { + + // Project transcript onto embedded paths. + thread_completed_transcript_paths.splice(thread_completed_transcript_paths.end(), construct_completed_transcript_paths(project_transcript_embedded(transcript, graph_path_pos_overlay, false, true))); + } + + transcripts_idx += num_threads; + } + + // Add haplotype transcript paths to transcriptome. 
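+ // Redundant paths are collapsed (according to path_collapse_type) while the
+ // mutex is held, so identical projections found by different threads are only kept once.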
+ completed_transcript_paths_mutex->lock(); + remove_redundant_transcript_paths(&thread_completed_transcript_paths, completed_transcript_paths_index); + completed_transcript_paths->splice(completed_transcript_paths->end(), thread_completed_transcript_paths); + completed_transcript_paths_mutex->unlock(); +} + +list Transcriptome::project_transcript_gbwt(const Transcript & cur_transcript, const gbwt::GBWT & haplotype_index, const float mean_node_length) const { + + assert(haplotype_index.bidirectional()); + + list edited_transcript_paths; + + vector > exon_node_ids; + exon_node_ids.reserve(cur_transcript.exons.size()); + + vector, thread_ids_t> > haplotypes; + multimap > haplotype_id_index; + + for (size_t exon_idx = 0; exon_idx < cur_transcript.exons.size(); ++exon_idx) { + + const Exon & cur_exon = cur_transcript.exons.at(exon_idx); + + // Add node exon boundary ids + exon_node_ids.emplace_back(_graph->get_id(_graph->get_handle_of_step(cur_exon.border_steps.first)), _graph->get_id(_graph->get_handle_of_step(cur_exon.border_steps.second))); + + // Calculate expected number of nodes between exon start and end. + const int32_t expected_length = ceil((cur_exon.coordinates.second - cur_exon.coordinates.first + 1) / mean_node_length); + + // Get all haplotypes in GBWT index between exon start and end border nodes (last position in upstream intron and + // first position in downstream intron). + auto exon_haplotypes = get_exon_haplotypes(exon_node_ids.back().first, exon_node_ids.back().second, haplotype_index, expected_length); + + if (haplotypes.empty()) { + + for (auto & exon_haplotype: exon_haplotypes) { + + haplotypes.emplace_back(vector(1, exon_haplotype.first), exon_haplotype.second); + haplotypes.back().first.reserve(cur_transcript.exons.size()); + + for (auto & haplotype_id: exon_haplotype.second) { + + haplotype_id_index.emplace(haplotype_id, make_pair(haplotypes.size() - 1, exon_idx + 1)); + } + } + + } else { + + for (auto & exon_haplotype: exon_haplotypes) { + + assert(!exon_haplotype.first.empty()); + spp::sparse_hash_map extended_haplotypes; + + for (auto & haplotype_id: exon_haplotype.second) { + + auto haplotype_id_index_it_range = haplotype_id_index.equal_range(haplotype_id); + auto haplotype_id_index_it = haplotype_id_index_it_range.first; + + while (haplotype_id_index_it != haplotype_id_index_it_range.second) { + + if (exon_idx != haplotype_id_index_it->second.second) { + + assert(haplotype_id_index_it->second.second < exon_idx); + + haplotype_id_index_it = haplotype_id_index.erase(haplotype_id_index_it); + continue; + } + + haplotype_id_index_it->second.second++; + pair, thread_ids_t> * cur_haplotype = &haplotypes.at(haplotype_id_index_it->second.first); + + if (extended_haplotypes.find(haplotype_id_index_it->second.first) != extended_haplotypes.end()) { + + assert(cur_haplotype->first.size() == exon_idx + 1); + haplotypes.at(extended_haplotypes.at(haplotype_id_index_it->second.first)).second.emplace_back(haplotype_id); + haplotype_id_index_it->second.first = extended_haplotypes.at(haplotype_id_index_it->second.first); + + } else if (cur_haplotype->first.size() == exon_idx) { + + cur_haplotype->first.emplace_back(exon_haplotype.first); + cur_haplotype->second = {haplotype_id}; + assert(extended_haplotypes.emplace(haplotype_id_index_it->second.first, haplotype_id_index_it->second.first).second); + + } else if (cur_haplotype->first.size() == exon_idx + 1) { + + haplotypes.emplace_back(vector(cur_haplotype->first.begin(), cur_haplotype->first.end() - 1), thread_ids_t(1, 
haplotype_id)); + haplotypes.back().first.emplace_back(exon_haplotype.first); + + assert(extended_haplotypes.emplace(haplotype_id_index_it->second.first, haplotypes.size() - 1).second); + haplotype_id_index_it->second.first = haplotypes.size() - 1; + + } else { + + haplotype_id_index_it = haplotype_id_index.erase(haplotype_id_index_it); + continue; + } + + ++haplotype_id_index_it; + } + } + } + } + } + + for (auto & haplotype: haplotypes) { + + // Skip partial transcript paths. + // TODO: Add support for partial transcript paths. + if (haplotype.first.size() != cur_transcript.exons.size()) { + + continue; + } + + auto haplotype_thread_ids_it = haplotype.second.begin(); + assert(haplotype_thread_ids_it != haplotype.second.end()); + + // Construct transcript path and set transcript origin name. + edited_transcript_paths.emplace_back(cur_transcript.name, gbwt::Path::id(*haplotype_thread_ids_it), false, true); + edited_transcript_paths.back().haplotype_gbwt_ids.reserve(haplotype.second.size()); + + ++haplotype_thread_ids_it; + + // Add haplotype names as origins. + while (haplotype_thread_ids_it != haplotype.second.end()) { + + // Convert bidirectional path id before finding name. + edited_transcript_paths.back().haplotype_gbwt_ids.emplace_back(gbwt::Path::id(*haplotype_thread_ids_it), false); + ++haplotype_thread_ids_it; + } + + for (size_t exon_idx = 0; exon_idx < cur_transcript.exons.size(); ++exon_idx) { + + const Exon & cur_exon = cur_transcript.exons.at(exon_idx); + + assert(gbwt::Node::id(haplotype.first.at(exon_idx).front()) == exon_node_ids.at(exon_idx).first); + assert(gbwt::Node::id(haplotype.first.at(exon_idx).back()) == exon_node_ids.at(exon_idx).second); + + for (size_t exon_node_idx = 0; exon_node_idx < haplotype.first.at(exon_idx).size(); ++exon_node_idx) { + + assert(haplotype.first.at(exon_idx).at(exon_node_idx) != gbwt::ENDMARKER); + + auto node_id = gbwt::Node::id(haplotype.first.at(exon_idx).at(exon_node_idx)); + auto node_length = _graph->get_length(_graph->get_handle(node_id, false)); + + int32_t offset = 0; + + // Adjust start position from exon border (last position in upstream intron) + // to first position in exon. Do not adjust if first position in path. + if ((cur_exon.coordinates.first > 0) && (exon_node_idx == 0)) { + + if (cur_exon.border_offsets.first + 1 == node_length) { + + assert(haplotype.first.at(exon_idx).size() > 1); + assert(node_id != exon_node_ids.at(exon_idx).second); + + continue; + + } else { + + offset = cur_exon.border_offsets.first + 1; + } + } + + int32_t edit_length = node_length - offset; + + // Adjust end position from exon border (first position in downstream intron) + // to last position in exon. Do not adjust if last position in path. + if ((cur_exon.coordinates.second < cur_transcript.chrom_length - 1) && (exon_node_idx == haplotype.first.at(exon_idx).size() - 1)) { + + if (cur_exon.border_offsets.second == 0) { + + break; + + } else { + + edit_length = cur_exon.border_offsets.second - offset; + } + } + + assert(0 <= offset && offset < node_length); + assert(0 < edit_length && edit_length <= node_length); + + // Add new mapping in forward direction. Later the whole path will + // be reverse complemented if transcript is on the '-' strand. 
+ auto new_mapping = edited_transcript_paths.back().path.add_mapping(); + new_mapping->set_rank(edited_transcript_paths.back().path.mapping_size()); + + new_mapping->mutable_position()->set_node_id(node_id); + new_mapping->mutable_position()->set_offset(offset); + new_mapping->mutable_position()->set_is_reverse(false); + + // Add new edit representing a complete match. + auto new_edit = new_mapping->add_edit(); + new_edit->set_from_length(edit_length); + new_edit->set_to_length(edit_length); + } + } + + if (edited_transcript_paths.back().path.mapping_size() == 0) { + + // Delete empty paths. + edited_transcript_paths.pop_back(); + + } else { + + if (cur_transcript.is_reverse) { + + // Reverse complement transcript paths that are on the '-' strand. + reverse_complement_path_in_place(&(edited_transcript_paths.back().path), [&](vg::id_t node_id) {return _graph->get_length(_graph->get_handle(node_id, false));}); + } + + // Copy paths if collapse of identical transcript paths is not wanted. + if (path_collapse_type == "no" && edited_transcript_paths.back().haplotype_gbwt_ids.size() > 1) { + + auto all_haplotype_gbwt_ids = edited_transcript_paths.back().haplotype_gbwt_ids; + edited_transcript_paths.back().haplotype_gbwt_ids = {edited_transcript_paths.back().haplotype_gbwt_ids.front()}; + + // Create identical copies of all haplotype origins. + for (size_t i = 1; i < all_haplotype_gbwt_ids.size(); ++i) { + + edited_transcript_paths.emplace_back(edited_transcript_paths.back()); + edited_transcript_paths.back().haplotype_gbwt_ids.front() = {all_haplotype_gbwt_ids.at(i)}; + } + } + } + } + + return edited_transcript_paths; +} + +vector > Transcriptome::get_exon_haplotypes(const vg::id_t start_node, const vg::id_t end_node, const gbwt::GBWT & haplotype_index, const int32_t expected_length) const { + + assert(expected_length > 0); + + // Calculate the expected upperbound of the length between the two + // nodes (number of nodes). + const int32_t expected_length_upperbound = 1.1 * expected_length; + + // Calcuate frequency for how often a check on whether an extension + // should be terminated is performed. + const int32_t termination_frequency = ceil(0.1 * expected_length); + + // Get ids for haplotypes that contain the end node. + spp::sparse_hash_set end_haplotype_ids; + for (auto & haplotype_id: haplotype_index.locate(haplotype_index.find(gbwt::Node::encode(end_node, false)))) { + + end_haplotype_ids.emplace(haplotype_id); + } + + vector > exon_haplotypes; + + // Initialise haplotype extension queue on the start node. + std::queue > exon_haplotype_queue; + exon_haplotype_queue.push(make_pair(exon_nodes_t(1, gbwt::Node::encode(start_node, false)), haplotype_index.find(gbwt::Node::encode(start_node, false)))); + exon_haplotype_queue.front().first.reserve(expected_length_upperbound); + + // Empty queue if no haplotypes containing the start node exist. + if (exon_haplotype_queue.front().second.empty()) { exon_haplotype_queue.pop(); } + + // Perform depth-first haplotype extension. + while (!exon_haplotype_queue.empty()) { + + pair & cur_exon_haplotype = exon_haplotype_queue.front(); + + // Stop current extension if end node is reached. 
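+ // Reaching the end node emits the currently matching haplotypes; the extension
+ // only continues if the search state still holds more occurrences than distinct
+ // haplotype ids (i.e. a haplotype may traverse the exon region more than once).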
+ if (gbwt::Node::id(cur_exon_haplotype.first.back()) == end_node) { + + exon_haplotypes.emplace_back(cur_exon_haplotype.first, haplotype_index.locate(cur_exon_haplotype.second)); + assert(exon_haplotypes.back().second.size() <= cur_exon_haplotype.second.size()); + + if (exon_haplotypes.back().second.size() == cur_exon_haplotype.second.size()) { + + exon_haplotype_queue.pop(); + continue; + } + } + + // Check whether any haplotypes in the current extension contains the + // end node. If not, end current extension. This check is only performed + // after the upperbound on the expected number of nodes is reached. + if (cur_exon_haplotype.first.size() >= expected_length_upperbound && (cur_exon_haplotype.first.size() % termination_frequency) == 0) { + + bool has_relevant_haplotype = false; + + for (auto & haplotype_id: haplotype_index.locate(cur_exon_haplotype.second)) { + + if (end_haplotype_ids.find(haplotype_id) != end_haplotype_ids.end()) { + + has_relevant_haplotype = true; + break; + } + } + + if (!has_relevant_haplotype) { + + exon_haplotype_queue.pop(); + continue; + } + } + + auto out_edges = haplotype_index.edges(cur_exon_haplotype.first.back()); + + // End current extension if no outgoing edges exist. + if (out_edges.empty()) { + + exon_haplotype_queue.pop(); + continue; + } + + auto out_edges_it = out_edges.begin(); + ++out_edges_it; + + while (out_edges_it != out_edges.end()) { + + // Do not extend haplotypes that end within the exon. + if (out_edges_it->first != gbwt::ENDMARKER) { + + auto extended_search = haplotype_index.extend(cur_exon_haplotype.second, out_edges_it->first); + + // Add new extension to queue if not empty (haplotypes found). + if (!extended_search.empty()) { + + exon_haplotype_queue.push(make_pair(cur_exon_haplotype.first, extended_search)); + exon_haplotype_queue.back().first.emplace_back(out_edges_it->first); + } + } + + ++out_edges_it; + } + + // Do not extend haplotypes that end within the exon. + if (out_edges.begin()->first != gbwt::ENDMARKER) { + + cur_exon_haplotype.first.emplace_back(out_edges.begin()->first); + cur_exon_haplotype.second = haplotype_index.extend(cur_exon_haplotype.second, out_edges.begin()->first); + + // End current extension if empty (no haplotypes found). + if (cur_exon_haplotype.second.empty()) { exon_haplotype_queue.pop(); } + + } else { + + exon_haplotype_queue.pop(); + } + } + + return exon_haplotypes; +} + +template +void Transcriptome::remove_redundant_transcript_paths(list * new_transcript_paths, spp::sparse_hash_map > * transcript_paths_index) const { + + auto new_transcript_paths_it = new_transcript_paths->begin(); + + while (new_transcript_paths_it != new_transcript_paths->end()) { + + assert(!new_transcript_paths_it->transcript_names.empty()); + + bool unique_transcript_path = true; + + auto transcript_paths_index_it = transcript_paths_index->emplace(new_transcript_paths_it->get_first_node_handle(*_graph), vector()); + + // Add unique transcript paths only. + if (!transcript_paths_index_it.second && path_collapse_type != "no") { + + assert(!transcript_paths_index_it.first->second.empty()); + + for (auto & transcript_path: transcript_paths_index_it.first->second) { + + assert(!transcript_path->transcript_names.empty()); + + if (path_collapse_type == "all" || transcript_path->transcript_names.front() == new_transcript_paths_it->transcript_names.front()) { + + // Check if two paths are identical. 
+ if (transcript_path->path == new_transcript_paths_it->path) {
+
+ if (path_collapse_type == "all") {
+
+ // Merge unique transcript names.
+ for (auto & new_transcript_name: new_transcript_paths_it->transcript_names) {
+
+ if (find(transcript_path->transcript_names.begin(), transcript_path->transcript_names.end(), new_transcript_name) == transcript_path->transcript_names.end()) {
+
+ transcript_path->transcript_names.emplace_back(new_transcript_name);
+ }
+ }
+
+ } else {
+
+ assert(path_collapse_type == "haplotype");
+ assert(transcript_path->transcript_names.size() == 1);
+ assert(new_transcript_paths_it->transcript_names.size() == 1);
+ }
+
+ // Merge unique embedded path names.
+ for (auto & new_embedded_path_name: new_transcript_paths_it->embedded_path_names) {
+
+ if (find(transcript_path->embedded_path_names.begin(), transcript_path->embedded_path_names.end(), new_embedded_path_name) == transcript_path->embedded_path_names.end()) {
+
+ transcript_path->embedded_path_names.emplace_back(new_embedded_path_name);
+ }
+ }
+
+ // Merge unique haplotype gbwt ids.
+ for (auto & new_haplotype_gbwt_id: new_transcript_paths_it->haplotype_gbwt_ids) {
+
+ if (find(transcript_path->haplotype_gbwt_ids.begin(), transcript_path->haplotype_gbwt_ids.end(), new_haplotype_gbwt_id) == transcript_path->haplotype_gbwt_ids.end()) {
+
+ transcript_path->haplotype_gbwt_ids.emplace_back(new_haplotype_gbwt_id);
+ }
+ }
+
+ transcript_path->is_reference = (transcript_path->is_reference || new_transcript_paths_it->is_reference);
+ transcript_path->is_haplotype = (transcript_path->is_haplotype || new_transcript_paths_it->is_haplotype);
+
+ // Delete non-unique transcript path.
+ new_transcript_paths_it = new_transcript_paths->erase(new_transcript_paths_it);
+
+ unique_transcript_path = false;
+ break;
+ }
+ }
+ }
+ }
+
+ if (unique_transcript_path) {
+
+ transcript_paths_index_it.first->second.emplace_back(&(*new_transcript_paths_it));
+ ++new_transcript_paths_it;
+ }
+ }
+}
+
+list Transcriptome::construct_completed_transcript_paths(const list & edited_transcript_paths) const {
+
+ list completed_transcript_paths;
+
+ for (auto & transcript_path: edited_transcript_paths) {
+
+ completed_transcript_paths.emplace_back(transcript_path, *_graph);
+ }
+
+ return completed_transcript_paths;
+}
+
+void Transcriptome::add_edited_transcript_paths(const list & edited_transcript_paths) {
+
+ add_splice_junction_edges(edited_transcript_paths);
+
+ for (auto & transcript_path: edited_transcript_paths) {
+
+ _transcript_paths.emplace_back(transcript_path, *_graph);
+ }
+}
+
+bool Transcriptome::has_novel_exon_boundaries(const list & edited_transcript_paths, const bool include_transcript_ends) const {
+
+ for (auto & transcript_path: edited_transcript_paths) {
+
+ for (size_t i = 0; i < transcript_path.path.mapping_size(); i++) {
+
+ auto cur_mapping = transcript_path.path.mapping(i);
+ auto cur_handle = mapping_to_handle(cur_mapping, *_graph);
+
+ assert(cur_mapping.edit_size() == 1);
+ assert(edit_is_match(cur_mapping.edit(0)));
+
+ // Do not check if left boundary of start exon is novel.
+ if (!include_transcript_ends && i == 0) {
+
+ if (cur_mapping.position().offset() + cur_mapping.edit(0).from_length() != _graph->get_length(cur_handle)) {
+
+ return true;
+ }
+
+ // Do not check if right boundary of end exon is novel.
+ } else if (!include_transcript_ends && i == transcript_path.path.mapping_size() - 1) {
+
+ if (cur_mapping.position().offset() > 0) {
+
+ return true;
+ }
+
+ // Check if both boundaries are novel.
+ } else if (cur_mapping.position().offset() > 0 || cur_mapping.edit(0).from_length() != _graph->get_length(cur_handle)) { + + return true; + } + } + } + + return false; +} + +void Transcriptome::augment_graph(const list & edited_transcript_paths, const bool is_introns, unique_ptr & haplotype_index, const bool update_haplotypes, const bool add_reference_transcript_paths) { + +#ifdef transcriptome_debug + double time_convert_1 = gcsa::readTimer(); + cerr << "\t\tDEBUG Creation start: " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif + + // Create set of exon boundary paths to augment graph with. + vector exon_boundary_paths; + + if (is_introns) { + + exon_boundary_paths.reserve(edited_transcript_paths.size()); + + for (auto & transcript_path: edited_transcript_paths) { + + exon_boundary_paths.emplace_back(transcript_path.path); + } + + } else { + + spp::sparse_hash_set exon_boundary_mapping_index; + + for (auto & transcript_path: edited_transcript_paths) { + + for (size_t j = 0; j < transcript_path.path.mapping_size(); ++j) { + + const Mapping & mapping = transcript_path.path.mapping(j); + + const auto mapping_length = mapping_to_length(mapping); + assert(mapping_length == mapping_from_length(mapping)); + + // Add exon boundary path. + if (mapping.position().offset() > 0 || mapping.position().offset() + mapping_length < _graph->get_length(_graph->get_handle(mapping.position().node_id(), false))) { + + exon_boundary_paths.emplace_back(Path()); + *(exon_boundary_paths.back().add_mapping()) = mapping; + exon_boundary_paths.back().mutable_mapping(0)->set_rank(1); + + // Remove if already added. + if (!exon_boundary_mapping_index.emplace(exon_boundary_paths.back().mapping(0)).second) { + + exon_boundary_paths.pop_back(); + } + } + } + } + } + +#ifdef transcriptome_debug + cerr << "\t\tDEBUG Created " << exon_boundary_paths.size() << " exon boundary paths: " << gcsa::readTimer() - time_convert_1 << " seconds, " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif + +#ifdef transcriptome_debug + double time_augment_1 = gcsa::readTimer(); + cerr << "\t\tDEBUG Augmention start: " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif + + vector translations; + + // Augment graph with edited paths. 
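+ // The augment() call below cuts existing nodes at the exon boundary positions
+ // and fills 'translations' with the mapping from old node regions to new nodes,
+ // which is then used to update the GBWT index and the transcript paths.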
+ augment(static_cast(_graph.get()), exon_boundary_paths, "GAM", &translations, "", false, !is_introns); + +#ifdef transcriptome_debug + cerr << "\t\tDEBUG Augmented graph with " << translations.size() << " translations: " << gcsa::readTimer() - time_augment_1 << " seconds, " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif + +#ifdef transcriptome_debug + double time_index_1 = gcsa::readTimer(); + cerr << "\t\tDEBUG Indexing start: " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif + + spp::sparse_hash_map > > translation_index; + + #pragma omp parallel num_threads(num_threads) + { + spp::sparse_hash_map > > thread_translation_index; + + // Create translation index + #pragma omp for schedule(static) + for (size_t i = 0; i < translations.size(); ++i) { + + const Translation & translation = translations.at(i); + + assert(translation.from().mapping_size() == 1); + assert(translation.to().mapping_size() == 1); + + auto & from_mapping = translation.from().mapping(0); + auto & to_mapping = translation.to().mapping(0); + + assert(to_mapping.position().offset() == 0); + assert(from_mapping.position().is_reverse() == to_mapping.position().is_reverse()); + + // Only store changes + if (from_mapping != to_mapping) { + + auto thread_translation_index_it = thread_translation_index.emplace(mapping_to_handle(from_mapping, *_graph), vector >()); + thread_translation_index_it.first->second.emplace_back(from_mapping.position().offset(), mapping_to_handle(to_mapping, *_graph)); + } + } + + #pragma omp critical + { + for (auto & translation: thread_translation_index) { + + auto translation_index_it = translation_index.emplace(translation.first, translation.second); + + if (!translation_index_it.second) { + + translation_index_it.first->second.insert(translation_index_it.first->second.end(), translation.second.begin(), translation.second.end()); + } + } + } + } + + // Sort translation index by offset + for (auto & translation: translation_index) { + + sort(translation.second.begin(), translation.second.end()); + } + +#ifdef transcriptome_debug + cerr << "\t\tDEBUG Indexed " << translation_index.size() << " translated nodes: " << gcsa::readTimer() - time_index_1 << " seconds, " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif + + if (!haplotype_index->empty() && update_haplotypes) { + +#ifdef transcriptome_debug + double time_update_1 = gcsa::readTimer(); + cerr << "\t\tDEBUG Updating (GBWT) start: " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif + + // Update haplotypes in gbwt index to match new augmented graph. + update_haplotype_index(haplotype_index, translation_index); + +#ifdef transcriptome_debug + cerr << "\t\tDEBUG Updated " << haplotype_index->sequences() / 2 << " haplotype paths: " << gcsa::readTimer() - time_update_1 << " seconds, " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif + + } + + if (!_transcript_paths.empty()) { + +#ifdef transcriptome_debug + double time_update_2 = gcsa::readTimer(); + cerr << "\t\tDEBUG Updating (transcriptome) start: " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif + + // Update transcript paths in transcriptome to new augmented graph. 
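+        // translation_index maps each original handle to (offset in the old node,
+        // new handle) pairs sorted by offset, so every old step can be rewritten as
+        // the run of new nodes that covers the same interval.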
+ update_transcript_paths(translation_index); + +#ifdef transcriptome_debug + cerr << "\t\tDEBUG Updated " << _transcript_paths.size() << " transcript paths: " << gcsa::readTimer() - time_update_2 << " seconds, " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif + + } + +#ifdef transcriptome_debug + double time_update_3 = gcsa::readTimer(); + cerr << "\t\tDEBUG Updating (paths) start: " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif + + list updated_transcript_paths; + + // Update paths to match new augmented graph and add them + // as reference transcript paths. + for (auto & transcript_path: edited_transcript_paths) { + + updated_transcript_paths.emplace_back(transcript_path); + + for (auto mapping: transcript_path.path.mapping()) { + + auto mapping_handle = mapping_to_handle(mapping, *_graph); + auto mapping_offset = mapping.position().offset(); + auto mapping_length = mapping_to_length(mapping); + + assert(mapping_length > 0); + assert(mapping_length == mapping_from_length(mapping)); + + auto translation_index_it = translation_index.find(mapping_handle); + + if (translation_index_it != translation_index.end()) { + + // First node id is the same (new node offset is larger than 0). + if (mapping_offset == 0 & translation_index_it->second.front().first > 0) { + + updated_transcript_paths.back().path.emplace_back(mapping_handle); + } + + // Add new nodes. + for (auto & new_node: translation_index_it->second) { + + if (new_node.first >= mapping_offset && new_node.first < mapping_offset + mapping_length) { + + updated_transcript_paths.back().path.emplace_back(new_node.second); + } + } + + } else { + + updated_transcript_paths.back().path.emplace_back(mapping_handle); + } + } + } + + add_splice_junction_edges(updated_transcript_paths); + + if (add_reference_transcript_paths) { + + if (!_transcript_paths.empty()) { + + spp::sparse_hash_map > transcript_paths_index; + + for (auto & transcript_path: _transcript_paths) { + + auto transcript_paths_index_it = transcript_paths_index.emplace(transcript_path.get_first_node_handle(*_graph), vector()); + transcript_paths_index_it.first->second.emplace_back(&transcript_path); + } + + remove_redundant_transcript_paths(&updated_transcript_paths, &transcript_paths_index); + } + + _transcript_paths.reserve(_transcript_paths.size() + updated_transcript_paths.size()); + + for (auto & transcript_path: updated_transcript_paths) { + + _transcript_paths.emplace_back(move(transcript_path)); + } + } + +#ifdef transcriptome_debug + cerr << "\t\tDEBUG Updated " << updated_transcript_paths.size() << " transcript paths: " << gcsa::readTimer() - time_update_3 << " seconds, " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif +} + +void Transcriptome::update_haplotype_index(unique_ptr & haplotype_index, const spp::sparse_hash_map > > & update_index) const { + + assert(haplotype_index->bidirectional()); + + // Silence GBWT index construction. + gbwt::Verbosity::set(gbwt::Verbosity::SILENT); + gbwt::GBWTBuilder gbwt_builder(gbwt::bit_length(gbwt::Node::encode(_graph->max_node_id(), true))); + + // Transfer metadata + gbwt_builder.index.addMetadata(); + gbwt_builder.index.metadata = haplotype_index->metadata; + + for (size_t i = 0; i < haplotype_index->sequences(); i++) { + + // Only update forward threads in bidirectional gbwt index. 
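+        // (A bidirectional GBWT stores each thread twice, with the forward
+        // orientation at the even sequence ids; the reverse copy is regenerated
+        // when the rebuilt thread is inserted bidirectionally below.)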
+ if (i % 2 == 1) { + + continue; + } + + auto cur_gbwt_thread = haplotype_index->extract(i); + + gbwt::vector_type new_gbwt_threads; + new_gbwt_threads.reserve(cur_gbwt_thread.size()); + + for (auto & node: cur_gbwt_thread) { + + auto handle = gbwt_to_handle(*_graph, node); + auto update_index_it = update_index.find(handle); + + if (update_index_it != update_index.end()) { + + // First node id is the same (new node offset is + // larger than 0). + if (update_index_it->second.front().first > 0) { + + new_gbwt_threads.emplace_back(node); + } + + // Add new nodes. + for (auto & new_node: update_index_it->second) { + + assert(_graph->get_is_reverse(handle) == _graph->get_is_reverse(new_node.second)); + new_gbwt_threads.emplace_back(handle_to_gbwt(*_graph, new_node.second)); + } + + } else { + + new_gbwt_threads.emplace_back(node); + } + } + + // Insert thread bidirectionally. + gbwt_builder.insert(new_gbwt_threads, true); + } + + // Finish contruction and recode index. + gbwt_builder.finish(); + haplotype_index.reset(new gbwt::GBWT(gbwt_builder.index)); +} + +void Transcriptome::update_transcript_paths(const spp::sparse_hash_map > > & update_index) { + + #pragma omp parallel num_threads(num_threads) + { + // Update transcript paths + #pragma omp for schedule(static) + for (size_t i = 0; i < _transcript_paths.size(); ++i) { + + vector new_transcript_path; + new_transcript_path.reserve(_transcript_paths.at(i).path.size()); + + for (auto & handle: _transcript_paths.at(i).path) { + + auto update_index_it = update_index.find(handle); + + if (update_index_it != update_index.end()) { + + // First handle is the same (new node offset is + // larger than 0). + if (update_index_it->second.front().first > 0) { + + new_transcript_path.emplace_back(handle); + } + + // Add new handles. + for (auto & new_handle: update_index_it->second) { + + new_transcript_path.emplace_back(new_handle.second); + } + + } else { + + update_index_it = update_index.find(_graph->flip(handle)); + + if (update_index_it != update_index.end()) { + + // First handle is the same (new node offset is + // larger than 0). + if (update_index_it->second.front().first > 0) { + + new_transcript_path.emplace_back(handle); + } + + for (auto update_handle_rit = update_index_it->second.rbegin(); update_handle_rit != update_index_it->second.rend(); ++update_handle_rit) { + + new_transcript_path.emplace_back(_graph->flip(update_handle_rit->second)); + } + + } else { + + new_transcript_path.emplace_back(handle); + } + } + } + + _transcript_paths.at(i).path = move(new_transcript_path); + } + } +} + +void Transcriptome::add_splice_junction_edges(const list & edited_transcript_paths) { + + for (auto & transcript_path: edited_transcript_paths) { + + for (size_t i = 1; i < transcript_path.path.mapping_size(); i++) { + + auto & prev_mapping = transcript_path.path.mapping(i - 1); + auto & cur_mapping = transcript_path.path.mapping(i); + + auto prev_handle = mapping_to_handle(prev_mapping, *_graph); + auto cur_handle = mapping_to_handle(cur_mapping, *_graph); + + // Ensure the edge exists. + _graph->create_edge(prev_handle, cur_handle); + } + } +} + +void Transcriptome::add_splice_junction_edges(const list & completed_transcript_paths) { + + for (auto & transcript_path: completed_transcript_paths) { + + for (size_t i = 1; i < transcript_path.path.size(); i++) { + + // Ensure the edge exists. 
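+            // (create_edge has no effect when the edge is already present, so this
+            // simply guarantees that consecutive exon nodes are connected.)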
+ _graph->create_edge(transcript_path.path.at(i - 1), transcript_path.path.at(i)); + } + } +} + +void Transcriptome::add_splice_junction_edges(const vector & completed_transcript_paths) { + + for (auto & transcript_path: completed_transcript_paths) { + + for (size_t i = 1; i < transcript_path.path.size(); i++) { + + // Ensure the edge exists. + _graph->create_edge(transcript_path.path.at(i - 1), transcript_path.path.at(i)); + } + } +} + +void Transcriptome::sort_transcript_paths_update_copy_id() { + + for (auto & transcript_path: _transcript_paths) { + + assert(!transcript_path.transcript_names.empty()); + assert(!transcript_path.transcript_names.front().empty()); + + assert(transcript_path.is_reference || transcript_path.is_haplotype); + + sort(transcript_path.transcript_names.begin(), transcript_path.transcript_names.end()); + sort(transcript_path.embedded_path_names.begin(), transcript_path.embedded_path_names.end()); + sort(transcript_path.haplotype_gbwt_ids.begin(), transcript_path.haplotype_gbwt_ids.end()); + } + + sort(_transcript_paths.begin(), _transcript_paths.end(), sort_transcript_paths_by_name); + + string cur_transcript_name = ""; + bool cur_is_reference = false; + + uint32_t cur_copy_id = 0; + + for (auto & transcript_path: _transcript_paths) { + + if (cur_transcript_name != transcript_path.transcript_names.front() || cur_is_reference != transcript_path.is_reference) { + + cur_transcript_name = transcript_path.transcript_names.front(); + cur_is_reference = transcript_path.is_reference; + + cur_copy_id = 0; + } + + cur_copy_id++; + transcript_path.copy_id = cur_copy_id; + } +} + +const vector & Transcriptome::transcript_paths() const { + + return _transcript_paths; +} + +vector Transcriptome::reference_transcript_paths() const { + + vector reference_transcript_paths; + + for (auto & transcript_path: _transcript_paths) { + + assert(transcript_path.is_reference || transcript_path.is_haplotype); + + if (transcript_path.is_reference) { + + reference_transcript_paths.emplace_back(transcript_path); + } + } + + return reference_transcript_paths; +} + +vector Transcriptome::haplotype_transcript_paths() const { + + vector haplotype_transcript_paths; + + for (auto & transcript_path: _transcript_paths) { + + assert(transcript_path.is_reference || transcript_path.is_haplotype); + + if (transcript_path.is_haplotype) { + + haplotype_transcript_paths.emplace_back(transcript_path); + } + } + + return haplotype_transcript_paths; +} + +const MutablePathDeletableHandleGraph & Transcriptome::graph() const { + + return *_graph; +} + +void Transcriptome::collect_transcribed_nodes(spp::sparse_hash_set * transcribed_nodes) const { + + for (auto & transcript_path: _transcript_paths) { + + assert(transcript_path.path.size() > 0); + for (auto & handle: transcript_path.path) { + + transcribed_nodes->emplace(_graph->get_id(handle)); + } + } +} + +void Transcriptome::remove_non_transcribed_nodes() { + + vector path_handles; + path_handles.reserve(_graph->get_path_count()); + + assert(_graph->for_each_path_handle([&](const path_handle_t & path_handle) { + + path_handles.emplace_back(path_handle); + })); + + // Remove all paths. + for (auto & path_handle: path_handles) { + + _graph->destroy_path(path_handle); + } + + assert(_graph->get_path_count() == 0); + + // Find all nodes that are in a transcript path. 
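+    // (The transcribed node ids are collected first and the remaining handles are
+    // destroyed in a separate pass below, so no node is deleted while the graph is
+    // still being iterated over.)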
+ spp::sparse_hash_set transcribed_nodes; + + collect_transcribed_nodes(&transcribed_nodes); + + vector non_transcribed_handles; + non_transcribed_handles.reserve(_graph->get_node_count() - transcribed_nodes.size()); + + // Collect all nodes that are not in a transcript path. + assert(_graph->for_each_handle([&](const handle_t & handle) { + + if (transcribed_nodes.count(_graph->get_id(handle)) == 0) { + + non_transcribed_handles.emplace_back(handle); + } + })); + + for (auto & handle: non_transcribed_handles) { + + // Delete node and in/out edges. + _graph->destroy_handle(handle); + } + + assert(_graph->get_node_count() == transcribed_nodes.size()); +} + +void Transcriptome::chop_nodes(const uint32_t max_node_length) { + + spp::sparse_hash_map > > split_index; + + assert(_graph->for_each_handle([&](const handle_t & handle) { + + const uint32_t handle_length = _graph->get_length(handle); + + if (handle_length > max_node_length) { + + vector offsets; + offsets.reserve(ceil(handle_length / static_cast(max_node_length))); + + uint32_t offset = max_node_length; + + while (offset < handle_length) { + + offsets.emplace_back(offset); + offset += max_node_length; + } + + auto split_index_it = split_index.emplace(handle, vector >()); + assert(split_index_it.second); + + for (auto & div_handle: _graph->divide_handle(handle, offsets)) { + + split_index_it.first->second.emplace_back(0, div_handle); + } + } + })); + + update_transcript_paths(split_index); + + if (show_progress) { cerr << "\tSplit " << split_index.size() << " nodes" << endl; }; +} + +bool Transcriptome::sort_compact_nodes() { + + if (dynamic_cast(_graph.get()) == nullptr) { + + return false; + } + +#ifdef transcriptome_debug + double time_sort_1 = gcsa::readTimer(); + cerr << "\tDEBUG Sorting start: " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif + + auto new_order = handlealgs::topological_order(_graph.get()); + assert(new_order.size() == _graph->get_node_count()); + +#ifdef transcriptome_debug + cerr << "\tDEBUG Sorted " << new_order.size() << " nodes: " << gcsa::readTimer() - time_sort_1 << " seconds, " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif + +#ifdef transcriptome_debug + double time_index_1 = gcsa::readTimer(); + cerr << "\tDEBUG Indexing start: " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif + + spp::sparse_hash_map > > update_index; + + for (auto & transcript_path: _transcript_paths) { + + for (auto & handle: transcript_path.path) { + + update_index.emplace(handle, vector >()); + } + } + + uint32_t order_idx = 1; + + for (auto handle: new_order) { + + auto update_index_it = update_index.find(handle); + + if (update_index_it != update_index.end()) { + + assert(update_index_it->second.empty()); + update_index_it->second.emplace_back(0, _graph->get_handle(order_idx, _graph->get_is_reverse(handle))); + + } + + auto handle_flip = _graph->flip(handle); + + update_index_it = update_index.find(handle_flip); + + if (update_index_it != update_index.end()) { + + assert(update_index_it->second.empty()); + update_index_it->second.emplace_back(0, _graph->get_handle(order_idx, _graph->get_is_reverse(handle_flip))); + } + + ++order_idx; + } + +#ifdef transcriptome_debug + cerr << "\tDEBUG Indexed " << update_index.size() << " node updates: " << gcsa::readTimer() - time_index_1 << " seconds, " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif + +#ifdef transcriptome_debug + double time_update_1 = gcsa::readTimer(); + cerr << "\tDEBUG Updating (graph) start: 
" << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif + + _graph->apply_ordering(new_order, true); + +#ifdef transcriptome_debug + cerr << "\tDEBUG Updated graph: " << gcsa::readTimer() - time_update_1 << " seconds, " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif + +#ifdef transcriptome_debug + double time_update_2 = gcsa::readTimer(); + cerr << "\tDEBUG Updating (paths) start: " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif + + update_transcript_paths(update_index); + +#ifdef transcriptome_debug + cerr << "\tDEBUG Updated " << _transcript_paths.size() << " transcript paths: " << gcsa::readTimer() - time_update_2 << " seconds, " << gcsa::inGigabytes(gcsa::memoryUsage()) << " GB" << endl; +#endif + + if (show_progress) { cerr << "\tSorted " << new_order.size() << " nodes" << endl; }; + + return true; +} + +void Transcriptome::embed_transcript_paths(const bool add_reference_transcripts, const bool add_haplotype_transcripts) { + + assert(add_reference_transcripts || add_haplotype_transcripts); + + int32_t num_embedded_paths = 0; + + // Add transcript paths to graph + for (auto & transcript_path: _transcript_paths) { + + assert(transcript_path.is_reference || transcript_path.is_haplotype); + + // Exclude reference or haplotype transcripts + if (!((transcript_path.is_reference && add_reference_transcripts) || (transcript_path.is_haplotype && add_haplotype_transcripts))) { + + continue; + } + + ++num_embedded_paths; + + assert(!_graph->has_path(transcript_path.get_name())); + + auto path_handle = _graph->create_path_handle(transcript_path.get_name()); + + for (auto & handle: transcript_path.path) { + + _graph->append_step(path_handle, handle); + } + } + + if (show_progress) { cerr << "\tEmbedded " << num_embedded_paths << " paths in graph" << endl; }; +} + +void Transcriptome::add_transcripts_to_gbwt(gbwt::GBWTBuilder * gbwt_builder, const bool add_bidirectional, const bool exclude_reference_transcripts) const { + + int32_t num_added_threads = 0; + + vector sample_names; + sample_names.reserve(_transcript_paths.size()); + + if (!gbwt_builder->index.hasMetadata()) { + + gbwt_builder->index.addMetadata(); + } + + // Get current number of haplotypes in GBWT index. + auto pre_num_haplotypes = gbwt_builder->index.metadata.haplotypes(); + + for (auto & transcript_path: _transcript_paths) { + + assert(transcript_path.is_reference || transcript_path.is_haplotype); + + // Exclude unique reference transcripts + if (exclude_reference_transcripts && transcript_path.is_reference && !transcript_path.is_haplotype) { + + continue; + } + + ++num_added_threads; + + // Convert transcript path to GBWT thread. + gbwt::vector_type gbwt_thread(transcript_path.path.size()); + for (size_t i = 0; i < transcript_path.path.size(); i++) { + + gbwt_thread[i] = handle_to_gbwt(*_graph, transcript_path.path.at(i)); + } + + // Insert transcript path as thread into GBWT index. + gbwt_builder->insert(gbwt_thread, add_bidirectional); + + // Insert transcript path name into GBWT index. + gbwt_builder->index.metadata.addPath(pre_num_haplotypes + sample_names.size(), 0, 0, 0); + + sample_names.emplace_back(transcript_path.get_name()); + } + + sample_names.shrink_to_fit(); + + // Set number number of haplotypes and transcript path name in metadata. 
+ gbwt_builder->index.metadata.setHaplotypes(pre_num_haplotypes + sample_names.size()); + gbwt_builder->index.metadata.addSamples(sample_names); +} + +void Transcriptome::write_transcript_sequences(ostream * fasta_ostream, const bool exclude_reference_transcripts) const { + + int32_t num_written_sequences = 0; + + for (auto & transcript_path: _transcript_paths) { + + assert(transcript_path.is_reference || transcript_path.is_haplotype); + + // Exclude unique reference transcripts + if (exclude_reference_transcripts && transcript_path.is_reference && !transcript_path.is_haplotype) { + + continue; + } + + ++num_written_sequences; + + // Construct transcript path sequence. + string transcript_path_sequence = ""; + for (auto & handle: transcript_path.path) { + + transcript_path_sequence += _graph->get_sequence(handle); + } + + // Write transcript path name and sequence. + write_fasta_sequence(transcript_path.get_name(), transcript_path_sequence, *fasta_ostream); + } +} + +void Transcriptome::write_transcript_info(ostream * tsv_ostream, const gbwt::GBWT & haplotype_index, const bool exclude_reference_transcripts) const { + + *tsv_ostream << "Name\tLength\tTranscripts\tHaplotypes" << endl; + + // Parse reference sample tags. + auto gbwt_reference_samples = gbwtgraph::parse_reference_samples_tag(haplotype_index); + + int32_t num_written_info = 0; + + for (auto & transcript_path: _transcript_paths) { + + assert(transcript_path.is_reference || transcript_path.is_haplotype); + + // Exclude unique reference transcripts + if (exclude_reference_transcripts && transcript_path.is_reference && !transcript_path.is_haplotype) { + + continue; + } + + ++num_written_info; + + // Get transcript path length. + int32_t transcript_path_length = 0; + + for (auto & handle: transcript_path.path) { + + transcript_path_length += _graph->get_length(handle); + } + + *tsv_ostream << transcript_path.get_name(); + *tsv_ostream << "\t" << transcript_path_length; + *tsv_ostream << "\t"; + + assert(!transcript_path.transcript_names.empty()); + + bool is_first = true; + + for (auto & name: transcript_path.transcript_names) { + + if (!is_first) { + + *tsv_ostream << ","; + } + + is_first = false; + *tsv_ostream << name; + } + + *tsv_ostream << "\t"; + + assert(!transcript_path.embedded_path_names.empty() || !transcript_path.haplotype_gbwt_ids.empty()); + + is_first = true; + + for (auto & name: transcript_path.embedded_path_names) { + + if (exclude_reference_transcripts && name.second) { + + continue; + } + + if (!is_first) { + + *tsv_ostream << ","; + } + + is_first = false; + *tsv_ostream << name.first; + } + + for (auto & id: transcript_path.haplotype_gbwt_ids) { + + if (exclude_reference_transcripts && id.second) { + + continue; + } + + if (!is_first) { + + *tsv_ostream << ","; + } + + is_first = false; + + *tsv_ostream << get_base_gbwt_path_name(haplotype_index, id.first, gbwt_reference_samples); + } + + *tsv_ostream << endl; + } +} + +void Transcriptome::write_graph(ostream * graph_ostream) const { + + vg::io::save_handle_graph(_graph.get(), *graph_ostream); +} + +} + + diff --git a/src/transcriptome.hpp b/src/transcriptome.hpp new file mode 100644 index 00000000000..672cc72f162 --- /dev/null +++ b/src/transcriptome.hpp @@ -0,0 +1,378 @@ + +#ifndef VG_TRANSCRIPTOME_HPP_INCLUDED +#define VG_TRANSCRIPTOME_HPP_INCLUDED + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "../vg.hpp" +#include "../types.hpp" +#include "../gbwt_helper.hpp" + +namespace vg { + +using 
namespace std; + + +typedef vector exon_nodes_t; +typedef vector thread_ids_t; + + +/** + * Data structure that defines a transcript annotation. + */ +struct Exon { + + /// Exon coordinates (start and end) on the chromosome/contig. + pair coordinates; + + /// Exon border node offsets (last position in upstream intron and + /// first position in downstream intron) on a graph. + pair border_offsets; + + /// Exon border reference path steps (last position in upstream intron and + /// first position in downstream intron) on a graph. + pair border_steps; +}; + +/** + * Data structure that defines a transcript annotation. + */ +struct Transcript { + + /// Transcript name. + string name; + + /// Is transcript in reverse direction (strand == '-'). + bool is_reverse; + + /// Name of chromosome/contig where transcript exist. + string chrom; + + /// Length of chromosome/contig where transcript exist. + uint32_t chrom_length; + + /// Transcript exons. + vector exons; + + Transcript(const string & name_in, const bool is_reverse_in, const string & chrom_in, const uint32_t & chrom_length_in) : name(name_in), is_reverse(is_reverse_in), chrom(chrom_in), chrom_length(chrom_length_in) { + + assert(chrom_length > 0); + } +}; + +/** + * Data structure that defines a base transcript path. + */ +struct TranscriptPath { + + /// Transcript names. + vector transcript_names; + + /// Embedded path names and reference path origin. + vector > embedded_path_names; + + /// Haplotype gbwt ids and reference path origin. + vector > haplotype_gbwt_ids; + + /// Copy id of transcript path + uint32_t copy_id; + + /// Is it a reference and/or haplotype-specific transcript path. + bool is_reference; + bool is_haplotype; + + TranscriptPath() {} + + TranscriptPath(const string & transcript_name, const string & embedded_path_name, const bool is_reference_in, const bool is_haplotype_in) : is_reference(is_reference_in), is_haplotype(is_haplotype_in) { + + assert(!transcript_name.empty()); + assert(!embedded_path_name.empty()); + + assert(is_reference != is_haplotype); + + transcript_names.emplace_back(transcript_name); + embedded_path_names.emplace_back(embedded_path_name, is_reference); + + copy_id = 1; + } + + TranscriptPath(const string & transcript_name, const gbwt::size_type & haplotype_gbwt_id, const bool is_reference_in, const bool is_haplotype_in) : is_reference(is_reference_in), is_haplotype(is_haplotype_in) { + + assert(!transcript_name.empty()); + + assert(is_reference != is_haplotype); + + transcript_names.emplace_back(transcript_name); + haplotype_gbwt_ids.emplace_back(haplotype_gbwt_id, is_reference); + + copy_id = 1; + } + + virtual ~TranscriptPath() {}; + + string get_name() const; +}; + +/** + * Data structure that defines an edited transcript path. + */ +struct EditedTranscriptPath : public TranscriptPath { + + /// Transcript path. + Path path; + + EditedTranscriptPath(const string & transcript_name, const string & embedded_path_name, const bool is_reference_in, const bool is_haplotype_in) : TranscriptPath(transcript_name, embedded_path_name, is_reference_in, is_haplotype_in) {} + EditedTranscriptPath(const string & transcript_name, const gbwt::size_type & haplotype_gbwt_id, const bool is_reference_in, const bool is_haplotype_in) : TranscriptPath(transcript_name, haplotype_gbwt_id, is_reference_in, is_haplotype_in) {} + + ~EditedTranscriptPath() {}; + + handle_t get_first_node_handle(const HandleGraph & graph) const; + +}; + +/** + * Data structure that defines a completed transcript path. 
+ */ +struct CompletedTranscriptPath : public TranscriptPath { + + /// Transcript path. + vector path; + + CompletedTranscriptPath(const EditedTranscriptPath & edited_transcript_path); + CompletedTranscriptPath(const EditedTranscriptPath & edited_transcript_path, const HandleGraph & graph); + ~CompletedTranscriptPath() {}; + + handle_t get_first_node_handle(const HandleGraph & graph) const; +}; + +struct MappingHash +{ + size_t operator()(const Mapping & mapping) const + { + size_t seed = 0; + + spp::hash_combine(seed, mapping.position().node_id()); + spp::hash_combine(seed, mapping.position().offset()); + spp::hash_combine(seed, mapping.position().is_reverse()); + + for (auto & edit: mapping.edit()) { + + spp::hash_combine(seed, edit.to_length()); + } + + return seed; + } + }; + +/** + * Class that defines a transcriptome represented by a set of transcript paths. + */ +class Transcriptome { + + public: + + Transcriptome(unique_ptr&& graph_in); + + /// Write progress to stderr. + bool show_progress = false; + + /// Number of threads used for transcript path construction. + int32_t num_threads = 1; + + /// Feature type to parse in the gtf/gff file. Parse all types if empty. + string feature_type = "exon"; + + /// Attribute tag used to parse the transcript id/name in the gtf/gff file. + string transcript_tag = "transcript_id"; + + /// Speicifies which paths should be compared when collapsing identical paths. + /// Can be no, haplotype or all. + string path_collapse_type = "haplotype"; + + /// Treat a missing path in the transcripts/introns as a data error + bool error_on_missing_path = true; + + /// Adds splice-junstions from intron BED files to the graph. + /// Optionally update haplotype GBWT index with new splice-junctions. + /// Returns the number of introns parsed. + int32_t add_intron_splice_junctions(vector intron_streams, unique_ptr & haplotype_index, const bool update_haplotypes); + + /// Adds splice-junstions from transcript gtf/gff3 files to the graph and + /// creates reference transcript paths. Optionally update haplotype GBWT + /// index with new splice-junctions. Returns the number of transcripts parsed. + int32_t add_reference_transcripts(vector transcript_streams, unique_ptr & haplotype_index, const bool use_haplotype_paths, const bool update_haplotypes); + + /// Adds haplotype-specific transcript paths by projecting transcripts in + /// gtf/gff3 files onto either non-reference embedded paths and/or haplotypes + /// in a GBWT index. Returns the number of haplotype transcript paths projected. + int32_t add_haplotype_transcripts(vector transcript_streams, const gbwt::GBWT & haplotype_index, const bool proj_emded_paths); + + /// Returns transcript paths. + const vector & transcript_paths() const; + + /// Returns the reference transcript paths. + vector reference_transcript_paths() const; + + /// Returns the haplotype transcript paths. + vector haplotype_transcript_paths() const; + + /// Returns the graph. + const MutablePathDeletableHandleGraph & graph() const; + + /// Removes non-transcribed (not in transcript paths) nodes. + void remove_non_transcribed_nodes(); + + /// Chop nodes so that they are not longer than the supplied + /// maximum node length. Returns number of chopped nodes. + void chop_nodes(const uint32_t max_node_length); + + /// Topological sorts graph and compacts node ids. Only works for + /// graphs in the PackedGraph format. Return false if not sorted. + bool sort_compact_nodes(); + + /// Embeds transcriptome transcript paths in the graph. 
+ /// Returns the number of paths embedded. + void embed_transcript_paths(const bool add_reference_transcripts, const bool add_haplotype_transcripts); + + /// Adds transcriptome transcript paths as threads to a GBWT index. + /// Returns the number of added threads. + void add_transcripts_to_gbwt(gbwt::GBWTBuilder * gbwt_builder, const bool add_bidirectional, const bool exclude_reference_transcripts) const; + + /// Writes transcriptome transcript path sequences to a fasta file. + /// Returns the number of written sequences. + void write_transcript_sequences(ostream * fasta_ostream, const bool exclude_reference_transcripts) const; + + /// Writes info on transcriptome transcript paths to tsv file. + /// Returns the number of written transcripts. + void write_transcript_info(ostream * tsv_ostream, const gbwt::GBWT & haplotype_index, const bool exclude_reference_transcripts) const; + + /// Writes the graph to a file. + void write_graph(ostream * graph_ostream) const; + + private: + + /// Transcript paths representing the transcriptome. + vector _transcript_paths; + mutex mutex_transcript_paths; + + /// Spliced pangenome graph. + unique_ptr _graph; + mutex mutex_graph; + + /// Parse BED file of introns. + void parse_introns(vector * introns, istream * intron_stream, const bdsg::PositionOverlay & graph_path_pos_overlay) const; + + /// Parse gtf/gff3 file of transcripts. Returns the number of non-header lines in the parsed file. + int32_t parse_transcripts(vector * transcripts, uint32_t * number_of_excluded_transcripts, istream * transcript_stream, const bdsg::PositionOverlay & graph_path_pos_overlay, const gbwt::GBWT & haplotype_index, const bool use_haplotype_paths) const; + + /// Returns gbwt path name without phaseblock and subrange. + string get_base_gbwt_path_name(const gbwt::GBWT & haplotype_index, const size_t path_id, const unordered_set & gbwt_reference_samples) const; + + /// Parse gtf/gff3 attribute value. + string parse_attribute_value(const string & attribute, const string & name) const; + + /// Returns the the mean node length of the graph + float mean_node_length() const; + + /// Adds the exon coordinates to a transcript. + void add_exon(Transcript * transcript, const pair & exon_pos) const; + + /// Adds the exon coordinates to a transcript and finds the + /// position of each end of a exon on the contig path in the graph. + void add_exon(Transcript * transcript, const pair & exon_pos, const bdsg::PositionOverlay & graph_path_pos_overlay) const; + + /// Reverses exon order if the transcript is on the reverse strand and the exons + /// are ordered in reverse. + void reorder_exons(Transcript * transcript) const; + + /// Checks whether any adjacent exons overlap. + bool has_overlapping_exons(const vector & exons) const; + + /// Constructs edited reference transcript paths from a set of + /// transcripts using embedded graph paths. + list construct_reference_transcript_paths_embedded(const vector & transcripts, const bdsg::PositionOverlay & graph_path_pos_overlay) const; + + /// Threaded reference transcript path construction using embedded paths. + void construct_reference_transcript_paths_embedded_callback(list * edited_transcript_paths, spp::sparse_hash_map > * edited_transcript_paths_index, mutex * edited_transcript_paths_mutex, const int32_t thread_idx, const vector & transcripts, const bdsg::PositionOverlay & graph_path_pos_overlay) const; + + /// Projects transcripts onto embedded paths in a graph and returns the resulting transcript paths. 
+ list project_transcript_embedded(const Transcript & cur_transcript, const bdsg::PositionOverlay & graph_path_pos_overlay, const bool use_reference_paths, const bool use_haplotype_paths) const; + + /// Constructs edited reference transcript paths from a set of + /// transcripts using haplotype paths in a GBWT index. + list construct_reference_transcript_paths_gbwt(const vector & transcripts, const gbwt::GBWT & haplotype_index) const; + + /// Threaded reference transcript path construction using GBWT haplotype paths. + void construct_reference_transcript_paths_gbwt_callback(list * edited_transcript_paths, spp::sparse_hash_map > * edited_transcript_paths_index, uint32_t * excluded_transcripts, mutex * edited_transcript_paths_mutex, const int32_t thread_idx, const vector > & chrom_transcript_sets, const vector & transcripts, const gbwt::GBWT & haplotype_index, const spp::sparse_hash_map > & haplotype_name_index) const; + + /// Constructs haplotype transcript paths by projecting transcripts onto + /// embedded paths in a graph and/or haplotypes in a GBWT index. + /// Adds haplotype transcript to transcriptome. + void project_haplotype_transcripts(const vector & transcripts, const gbwt::GBWT & haplotype_index, const bdsg::PositionOverlay & graph_path_pos_overlay, const bool proj_emded_paths, const float mean_node_length); + + /// Threaded haplotype transcript projecting. + void project_haplotype_transcripts_callback(list * completed_transcript_paths, spp::sparse_hash_map > * completed_transcript_paths_index, mutex * completed_transcript_paths_mutex, const int32_t thread_idx, const vector & transcripts, const gbwt::GBWT & haplotype_index, const bdsg::PositionOverlay & graph_path_pos_overlay, const bool proj_emded_paths, const float mean_node_length); + + /// Projects transcripts onto haplotypes in a GBWT index and returns the resulting transcript paths. + list project_transcript_gbwt(const Transcript & cur_transcript, const gbwt::GBWT & haplotype_index, const float mean_node_length) const; + + /// Extracts all unique haplotype paths between two nodes from a GBWT index and returns the + /// resulting paths and the corresponding haplotype ids for each path. + vector > get_exon_haplotypes(const vg::id_t start_node, const vg::id_t end_node, const gbwt::GBWT & haplotype_index, const int32_t expected_length) const; + + /// Remove redundant transcript paths and update index. + template + void remove_redundant_transcript_paths(list * new_transcript_paths, spp::sparse_hash_map > * transcript_paths_index) const; + + /// Constructs completed transcripts paths from + /// edited transcript paths. Checks that the + /// paths contain no edits compared to the graph. + list construct_completed_transcript_paths(const list & edited_transcript_paths) const; + + /// Adds edited transcript paths to transcriptome + /// Checks that the paths contain no edits compared to the graph. + void add_edited_transcript_paths(const list & edited_transcript_paths); + + /// Checks whether transcript path only consist of + /// whole nodes (complete). + bool has_novel_exon_boundaries(const list & edited_transcript_paths, const bool include_transcript_ends) const; + + /// Augments the graph with transcript path exon boundaries and + /// splice-junctions. Updates threads in gbwt index to match the augmented graph. + /// Optinally adds transcript paths to the transcriptome. 
+ void augment_graph(const list & edited_transcript_paths, const bool is_introns, unique_ptr & haplotype_index, const bool update_haplotypes, const bool add_reference_transcript_paths); + + /// Update threads in gbwt index using graph translations. + void update_haplotype_index(unique_ptr & haplotype_index, const spp::sparse_hash_map > > & update_index) const; + + /// Update/split node handles in transcriptome transcript paths according to index. + void update_transcript_paths(const spp::sparse_hash_map > > & update_index); + + /// Adds transcript path splice-junction edges to the graph + void add_splice_junction_edges(const list & edited_transcript_paths); + void add_splice_junction_edges(const list & completed_transcript_paths); + void add_splice_junction_edges(const vector & completed_transcript_paths); + + /// Collects all unique nodes in transcriptome transcript paths. + void collect_transcribed_nodes(spp::sparse_hash_set * transcribed_nodes) const; + + /// Sort transcriptome transcript paths by name and is reference, + /// and update their copy ids. + void sort_transcript_paths_update_copy_id(); + +}; + +} + + +#endif diff --git a/src/translator.cpp b/src/translator.cpp index 32d65c3cec9..8360afc2394 100644 --- a/src/translator.cpp +++ b/src/translator.cpp @@ -1,5 +1,5 @@ #include "translator.hpp" -#include "stream.hpp" +#include namespace vg { @@ -11,7 +11,7 @@ Translator::Translator(istream& in) { function lambda = [&](Translation& trans) { translations.push_back(trans); }; - stream::for_each(in, lambda); + vg::io::for_each(in, lambda); build_position_table(); } diff --git a/src/translator.hpp b/src/translator.hpp index c741c0f61d7..94615ef41c9 100644 --- a/src/translator.hpp +++ b/src/translator.hpp @@ -12,7 +12,7 @@ #include #include #include -#include "vg.pb.h" +#include #include "vg.hpp" #include "hash_map.hpp" #include "utility.hpp" diff --git a/src/traversal_finder.cpp b/src/traversal_finder.cpp index 9205e15946e..04f57c5ab3b 100644 --- a/src/traversal_finder.cpp +++ b/src/traversal_finder.cpp @@ -1,16 +1,16 @@ #include "traversal_finder.hpp" #include "genotypekit.hpp" -#include "algorithms/topological_sort.hpp" -#include "algorithms/is_acyclic.hpp" +#include "algorithms/k_widest_paths.hpp" #include "cactus.hpp" - +#include "gbwt_helper.hpp" +#include "haplotype_extracter.hpp" //#define debug namespace vg { using namespace std; -PathBasedTraversalFinder::PathBasedTraversalFinder(vg::VG& g, SnarlManager& sm) : graph(g), snarlmanager(sm){ +PathBasedTraversalFinder::PathBasedTraversalFinder(const PathHandleGraph& g, SnarlManager& sm) : graph(g), snarlmanager(sm){ } vector PathBasedTraversalFinder::find_traversals(const Snarl& site){ @@ -21,13 +21,13 @@ vector PathBasedTraversalFinder::find_traversals(const Snarl& si vector ret; // If the snarl is not an ultrabubble, just return an empty set of traversals. - if (!site.type() == ULTRABUBBLE){ + if (site.type() != ULTRABUBBLE){ return ret; } // Get the Snarl's nodes unordered_set snarl_node_ids; - pair, unordered_set > contents = snarlmanager.shallow_contents(&site, graph, true); + pair, unordered_set > contents = snarlmanager.shallow_contents(&site, graph, true); // Get the variant paths at the snarl nodes. 
@@ -35,19 +35,23 @@ vector PathBasedTraversalFinder::find_traversals(const Snarl& si regex front ("(_alt_)(.*)"); regex alt_str ("(_alt_)"); regex back ("(_[0-9]*)"); - auto& gpaths = graph.paths._paths; set gpath_names; - for (auto x : gpaths){ - gpath_names.insert(x.first); - } + graph.for_each_path_handle([&](const path_handle_t& path_handle) { + gpath_names.insert(graph.get_path_name(path_handle)); + }); + map > basename_to_pathnames; map path_processed; // Collect our paths which cross our snarl's nodes. - for (auto node : contents.first){ + for (id_t node_id : contents.first){ + handle_t node = graph.get_handle(node_id); //cerr << "Processing node " << id << endl; - set p_of_n = graph.paths.of_node(node->id()); + set p_of_n; + graph.for_each_step_on_handle(node, [&](const step_handle_t& step) { + p_of_n.insert(graph.get_path_name(graph.get_path_handle_of_step(step))); + }); for (auto pn : p_of_n){ if (!std::regex_match(pn, front)){ @@ -98,13 +102,14 @@ vector PathBasedTraversalFinder::find_traversals(const Snarl& si // Add the start node to the traversal *fresh_trav.add_visit() = site.start(); // Fill in our traversal - auto& ms = gpaths[a]; - for (auto m : ms){ - int64_t n_id = m.node_id(); - bool backward = m.is_reverse(); - Visit* v = fresh_trav.add_visit(); - v->set_node_id(n_id); - v->set_backward(backward); + if (graph.has_path(a)) { + for (auto h : graph.scan_path(graph.get_path_handle(a))) { + int64_t n_id = graph.get_id(h); + bool backward = graph.get_is_reverse(h); + Visit* v = fresh_trav.add_visit(); + v->set_node_id(n_id); + v->set_backward(backward); + } } // Add the end node to the traversal *fresh_trav.add_visit() = site.end(); @@ -128,7 +133,7 @@ vector PathBasedTraversalFinder::find_traversals(const Snarl& si return ret; } -ExhaustiveTraversalFinder::ExhaustiveTraversalFinder(VG& graph, SnarlManager& snarl_manager, +ExhaustiveTraversalFinder::ExhaustiveTraversalFinder(const HandleGraph& graph, SnarlManager& snarl_manager, bool include_reversing_traversals) : graph(graph), snarl_manager(snarl_manager), include_reversing_traversals(include_reversing_traversals) { @@ -139,65 +144,51 @@ ExhaustiveTraversalFinder::~ExhaustiveTraversalFinder() { // no heap objects } -void ExhaustiveTraversalFinder::stack_up_valid_walks(NodeTraversal walk_head, vector& stack) { +void ExhaustiveTraversalFinder::stack_up_valid_walks(handle_t walk_head, vector& stack) { - id_t head_id = walk_head.node->id(); + id_t head_id = graph.get_id(walk_head); // get all edges involving this node so we can filter them down to valid walks - for (Edge* edge : graph.edges_of(walk_head.node)) { - Node* next_node = nullptr; - bool next_backward; - bool from_start; - // determine id and orientation of our nodes given that they can - // be either from or to in the edge - if (head_id == edge->from()) { - next_node = graph.get_node(edge->to()); - next_backward = edge->to_end(); - from_start = edge->from_start(); - } else { - next_node = graph.get_node(edge->from()); - next_backward = !edge->from_start(); - from_start = !edge->to_end(); - } - // are we walking the same direction relative to head_id? 
- if (walk_head.backward == from_start && - // derived classes can use this to filter search - visit_next_node(next_node, edge)) { - // add the next traversal in the walk to the stack - stack.push_back(NodeTraversal(next_node, next_backward)); - } - } + graph.follow_edges(walk_head, false, [&](const handle_t next_node) { + if (visit_next_node(next_node)) { + Visit next_visit; + next_visit.set_node_id(graph.get_id(next_node)); + next_visit.set_backward(graph.get_is_reverse(next_node)); + stack.push_back(next_visit); + } + }); } void ExhaustiveTraversalFinder::add_traversals(vector& traversals, - NodeTraversal traversal_start, - set& stop_at, - set& yield_at) { + handle_t traversal_start, + unordered_set& stop_at, + unordered_set& yield_at) { // keeps track of the walk of the DFS traversal list path; // these mark the start of the edges out of the node that is on the head of the path // they can be used to see how many nodes we need to peel off the path when we're // backtracking - NodeTraversal stack_sentinel(nullptr); + Visit stack_sentinel; // initialize stack for DFS traversal of site - vector stack{traversal_start}; + vector stack{to_visit(graph, traversal_start)}; while (stack.size()) { - NodeTraversal node_traversal = stack.back(); + Visit node_visit = stack.back(); stack.pop_back(); // we have traversed all of edges out of the head of the path, so we can pop it off - if (node_traversal == stack_sentinel) { + if (node_visit == stack_sentinel) { path.pop_back(); continue; } // have we finished a traversal through the site? - if (stop_at.count(node_traversal)) { - if (yield_at.count(node_traversal)) { + handle_t node_handle = graph.get_handle(node_visit.node_id(), node_visit.backward()); + if (stop_at.count(node_handle)) { + if (yield_at.count(node_handle)) { // yield path as a snarl traversal traversals.emplace_back(); @@ -206,7 +197,7 @@ void ExhaustiveTraversalFinder::add_traversals(vector& traversal *traversals.back().add_visit() = *iter; } // add the final visit - *traversals.back().add_visit() = to_visit(node_traversal); + *traversals.back().add_visit() = node_visit; } // don't proceed to add more onto the DFS stack @@ -218,15 +209,15 @@ void ExhaustiveTraversalFinder::add_traversals(vector& traversal // make a visit through the node traversal and add it to the path path.emplace_back(); - path.back().set_node_id(node_traversal.node->id()); - path.back().set_backward(node_traversal.backward); + path.back().set_node_id(node_visit.node_id()); + path.back().set_backward(node_visit.backward()); // does this traversal point into a child snarl? 
- const Snarl* into_snarl = snarl_manager.into_which_snarl(node_traversal.node->id(), - node_traversal.backward); + const Snarl* into_snarl = snarl_manager.into_which_snarl(node_visit.node_id(), + node_visit.backward()); #ifdef debug - cerr << "Traversal " << node_traversal.node->id() << " " << node_traversal.backward << " enters"; + cerr << "Traversal " << node_visit.node_id() << " " << node_visit.backward() << " enters"; if (into_snarl != nullptr) { cerr << " " << pb2json(*into_snarl) << endl; } else { @@ -234,7 +225,7 @@ void ExhaustiveTraversalFinder::add_traversals(vector& traversal } #endif - if (into_snarl && !(node_traversal == traversal_start)) { + if (into_snarl && !(node_handle == traversal_start)) { // add a visit for the child snarl path.emplace_back(); *path.back().mutable_snarl()->mutable_start() = into_snarl->start(); @@ -244,26 +235,28 @@ void ExhaustiveTraversalFinder::add_traversals(vector& traversal stack.push_back(stack_sentinel); // which side of the snarl does the traversal point into? - if (into_snarl->start().node_id() == node_traversal.node->id() - && into_snarl->start().backward() == node_traversal.backward) { + if (into_snarl->start().node_id() == node_visit.node_id() + && into_snarl->start().backward() == node_visit.backward()) { // Into the start #ifdef debug cerr << "Entered child through its start" << endl; #endif if (into_snarl->start_end_reachable()) { // skip to the other side and proceed in the orientation that the end node takes. - stack.push_back(to_node_traversal(into_snarl->end(), graph)); + stack.push_back(into_snarl->end()); #ifdef debug - cerr << "Stack up " << stack.back().node->id() << " " << stack.back().backward << endl; + cerr << "Stack up " << stack.back().node_id() << " " << stack.back().backward() << endl; #endif } // if the same side is also reachable, add it to the stack too if (into_snarl->start_self_reachable()) { // Make sure to flip it around so we come out of the snarl instead of going in again, - stack.push_back(to_rev_node_traversal(into_snarl->start(), graph).reverse()); + Visit rev_visit = into_snarl->start(); + rev_visit.set_backward(!rev_visit.backward()); + stack.push_back(rev_visit); #ifdef debug - cerr << "Stack up " << stack.back().node->id() << " " << stack.back().backward << endl; + cerr << "Stack up " << stack.back().node_id() << " " << stack.back().backward() << endl; #endif } @@ -277,24 +270,28 @@ void ExhaustiveTraversalFinder::add_traversals(vector& traversal // skip to the other side and proceed in the orientation // *opposite* what the start node takes (i.e. 
out of the // snarl) - stack.push_back(to_node_traversal(into_snarl->start(), graph).reverse()); + Visit rev_visit = into_snarl->start(); + rev_visit.set_backward(!rev_visit.backward()); + stack.push_back(rev_visit); #ifdef debug - cerr << "Stack up " << stack.back().node->id() << " " << stack.back().backward << endl; + cerr << "Stack up " << stack.back().node_id() << " " << stack.back().backward() << endl; #endif } // if the same side is also reachable, add it to the stack too if (into_snarl->end_self_reachable()) { - stack.push_back(to_rev_node_traversal(into_snarl->end(), graph)); + Visit rev_visit = into_snarl->end(); + rev_visit.set_backward(!rev_visit.backward()); + stack.push_back(rev_visit); #ifdef debug - cerr << "Stack up " << stack.back().node->id() << " " << stack.back().backward << endl; + cerr << "Stack up " << stack.back().node_id() << " " << stack.back().backward() << endl; #endif } } } else { // add all of the node traversals we can reach through valid walks to stack - stack_up_valid_walks(node_traversal, stack); + stack_up_valid_walks(node_handle, stack); } } } @@ -302,18 +299,18 @@ void ExhaustiveTraversalFinder::add_traversals(vector& traversal vector ExhaustiveTraversalFinder::find_traversals(const Snarl& site) { vector to_return; - - NodeTraversal site_end = to_node_traversal(site.end(), graph); - NodeTraversal site_start = to_node_traversal(site.start(), graph); - NodeTraversal site_rev_start = NodeTraversal(site_start.node, !site_start.backward); + + handle_t site_end = graph.get_handle(site.end().node_id(), site.end().backward()); + handle_t site_start = graph.get_handle(site.start().node_id(), site.start().backward()); + handle_t site_rev_start = graph.get_handle(site.start().node_id(), !site.start().backward()); // stop searching when the traversal is leaving the site - set stop_at; + unordered_set stop_at; stop_at.insert(site_end); stop_at.insert(site_rev_start); // choose which side(s) can be the end of the traversal - set yield_at; + unordered_set yield_at; yield_at.insert(site_end); if (include_reversing_traversals) { yield_at.insert(site_rev_start); @@ -326,7 +323,7 @@ vector ExhaustiveTraversalFinder::find_traversals(const Snarl& s // if the end is reachable from itself, also look for traversals that both enter and // leave through the end yield_at.erase(site_rev_start); - add_traversals(to_return, NodeTraversal(site_end.node, !site_end.backward), + add_traversals(to_return, graph.get_handle(graph.get_id(site_end), !graph.get_is_reverse(site_end)), stop_at, yield_at); } @@ -357,12 +354,14 @@ bool SupportRestrictedTraversalFinder::visit_next_node(const Node* node, const E PathRestrictedTraversalFinder::PathRestrictedTraversalFinder(VG& graph, SnarlManager& snarl_manager, map& reads_by_name, - int min_recurrence, int max_path_search_steps) : + int min_recurrence, int max_path_search_steps, + bool allow_duplicates) : graph(graph), snarl_manager(snarl_manager), reads_by_name(reads_by_name), - min_recurrence(min_recurrence), - max_path_search_steps(max_path_search_steps) { + min_recurrence(min_recurrence), + max_path_search_steps(max_path_search_steps), + allow_duplicates(allow_duplicates) { // nothing else to do } @@ -409,7 +408,7 @@ static bool mapping_exits_side(const Mapping& mapping, const handle_t& side, con } // replaces get_paths_through_site from genotyper -vector PathRestrictedTraversalFinder::find_traversals(const Snarl& site) { +pair, vector> PathRestrictedTraversalFinder::find_named_traversals(const Snarl& site) { // We're going to emit traversals 
supported by any paths in the graph. // Put all our subpaths in here to deduplicate them by sequence they spell @@ -417,7 +416,7 @@ vector PathRestrictedTraversalFinder::find_traversals(const Snar // boosted to min_recurrence if a non-read path in the graph supports a // certain traversal string, so we don't end up dropping unsupported ref // alleles. - map> results; + map> results; #ifdef debug #pragma omp critical (cerr) @@ -446,7 +445,6 @@ vector PathRestrictedTraversalFinder::find_traversals(const Snar for(auto* mapping : name_and_mappings.second) { // Start at each mapping in the appropriate orientation - #ifdef debug #pragma omp critical (cerr) cerr << "Trying mapping of read/path " << name_and_mappings.first << " to " << mapping->node_id() << (mapping->is_reverse() ? "-" : "+") << endl; @@ -472,7 +470,8 @@ vector PathRestrictedTraversalFinder::find_traversals(const Snar // We are going left in the read but right in the snarl, so // we want to enter the snarl's start node - bool enter_start = mapping_enters_side(mapping->to_mapping(), graph.get_handle(site.start()), &graph); + bool enter_start = mapping_enters_side(mapping->to_mapping(), + graph.get_handle(site.start().node_id(), site.start().backward()), &graph); #ifdef debug #pragma omp critical (cerr) @@ -487,7 +486,8 @@ vector PathRestrictedTraversalFinder::find_traversals(const Snar } else { // We are going right, so we want to exit the snarl's start // node - bool exit_start = mapping_exits_side(mapping->to_mapping(), graph.get_handle(site.start()), &graph); + bool exit_start = mapping_exits_side(mapping->to_mapping(), + graph.get_handle(site.start().node_id(), site.start().backward()), &graph); #ifdef debug #pragma omp critical (cerr) @@ -529,9 +529,11 @@ vector PathRestrictedTraversalFinder::find_traversals(const Snar // node, depending on which way in the read we read. And // if it doesn't we try again. if (!traversal_direction && - !mapping_enters_side(mapping->to_mapping(), graph.get_handle(site.end()), &graph) || + !mapping_enters_side(mapping->to_mapping(), + graph.get_handle(site.end().node_id(), site.end().backward()), &graph) || traversal_direction && - !mapping_exits_side(mapping->to_mapping(), graph.get_handle(site.end()), &graph)) { + !mapping_exits_side(mapping->to_mapping(), + graph.get_handle(site.end().node_id(), site.end().backward()), &graph)) { break; } @@ -542,6 +544,11 @@ vector PathRestrictedTraversalFinder::find_traversals(const Snar Node* map_node = graph.get_node(path_visit.node_id()); allele_seq += path_visit.backward() ? reverse_complement(map_node->sequence()) : map_node->sequence(); } + + // hack for allow_duplicates toggle + if (!reads_by_name.count(name) && allow_duplicates) { + allele_seq = name; + } // We have stumbled upon the end node in the orientation we wanted it in. if(results.count(allele_seq)) { @@ -553,16 +560,16 @@ vector PathRestrictedTraversalFinder::find_traversals(const Snar if(reads_by_name.count(name)) { // We are a read. Just increment count - results[allele_seq].second++; + get<1>(results[allele_seq])++; } else { // We are a named path (like "ref") - if(results[allele_seq].second < min_recurrence) { + if(get<1>(results[allele_seq]) < min_recurrence) { // Ensure that this allele doesn't get // eliminated, since ref or some other named // path supports it. 
- results[allele_seq].second = min_recurrence; + get<1>(results[allele_seq]) = min_recurrence; } else { - results[allele_seq].second++; + get<1>(results[allele_seq])++; } } } else { @@ -570,7 +577,8 @@ vector PathRestrictedTraversalFinder::find_traversals(const Snar // and a count of min_recurrence (so it doesn't get // filtered later) if we are a named non-read path // (like "ref"). - results[allele_seq] = make_pair(path_traversed, reads_by_name.count(name) ? 1 : min_recurrence); + int trav_occ = reads_by_name.count(name) ? 1 : min_recurrence; + results[allele_seq] = std::tie(path_traversed, trav_occ, name); #ifdef debug #pragma omp critical (cerr) cerr << "\tFinished; got novel sequence " << allele_seq << endl; @@ -585,9 +593,17 @@ vector PathRestrictedTraversalFinder::find_traversals(const Snar if(traversal_direction) { // We're going backwards mapping = graph.paths.traverse_left(mapping); +#ifdef debug +#pragma omp critical (cerr) + cerr << "traversing left to mapping " << *mapping << endl; +#endif } else { // We're going forwards mapping = graph.paths.traverse_right(mapping); +#ifdef debug +#pragma omp critical (cerr) + cerr << "traversing right to mapping " << *mapping << endl; +#endif } // Tick the counter so we don't go really far on long paths. traversal_count++; @@ -601,13 +617,14 @@ vector PathRestrictedTraversalFinder::find_traversals(const Snar } // Now collect the unique results - vector to_return; + pair, vector> to_return; for(auto& result : results) { // Break out each result const string& seq = result.first; - auto& traversals = result.second.first; - auto& count = result.second.second; + auto& traversals = get<0>(result.second); + auto& count = get<1>(result.second); + auto& name = get<2>(result.second); if(count < min_recurrence) { // We don't have enough initial hits for this sequence to justify @@ -618,12 +635,17 @@ vector PathRestrictedTraversalFinder::find_traversals(const Snar } // Send out each list of traversals - to_return.emplace_back(std::move(traversals)); + to_return.first.emplace_back(std::move(traversals)); + to_return.second.push_back(name); } return to_return; } +vector PathRestrictedTraversalFinder::find_traversals(const Snarl& site) { + return find_named_traversals(site).first; +} + ReadRestrictedTraversalFinder::ReadRestrictedTraversalFinder(AugmentedGraph& augmented_graph, SnarlManager& snarl_manager, int min_recurrence, int max_path_search_steps) : @@ -793,12 +815,123 @@ vector ReadRestrictedTraversalFinder::find_traversals(const Snar return to_return; } -TrivialTraversalFinder::TrivialTraversalFinder(VG& graph) : graph(graph) { +PathTraversalFinder::PathTraversalFinder(const PathHandleGraph& graph, SnarlManager& snarl_manager, + const vector& path_names) : + graph(graph), snarl_manager(snarl_manager) { + for (const string& path_name : path_names) { + assert(graph.has_path(path_name)); + paths.insert(graph.get_path_handle(path_name)); + } +} + +vector PathTraversalFinder::find_traversals(const Snarl& site) { + return find_path_traversals(site).first; +} + +pair, vector > > PathTraversalFinder::find_path_traversals(const Snarl& site) { + + handle_t start_handle = graph.get_handle(site.start().node_id(), site.start().backward()); + handle_t end_handle = graph.get_handle(site.end().node_id(), site.end().backward()); + + vector start_steps = graph.steps_of_handle(start_handle); + vector end_steps = graph.steps_of_handle(end_handle); + + pair, unordered_set > snarl_contents = snarl_manager.deep_contents(&site, graph, true); + + // use this to skip paths 
that don't reach the end node + unordered_set end_path_handles; + for (const step_handle_t& step : end_steps) { + end_path_handles.insert(graph.get_path_handle_of_step(step)); + } + +#ifdef debug + cerr << "Finding traversals of " << pb2json(site) << " using PathTraversalFinder" << endl + << " - there are " << start_steps.size() << " start_steps, " << end_steps.size() << " end_steps" + << " and " << end_path_handles.size() << " end_path_handles" << endl; +#endif + + vector out_travs; + vector > out_steps; + + for (const step_handle_t& start_step : start_steps) { + path_handle_t start_path_handle = graph.get_path_handle_of_step(start_step); + // only crawl paths that have a chance of reaching the end + if ((paths.empty() || paths.count(start_path_handle)) && end_path_handles.count(start_path_handle)) { + + handle_t end_check = end_handle; + +#ifdef debug + cerr << " - considering path " << graph.get_path_name(start_path_handle) << endl; +#endif + // try to make a traversal by walking forward + SnarlTraversal trav; + bool can_continue = true; + step_handle_t step = start_step; + while (can_continue) { + handle_t handle = graph.get_handle_of_step(step); + Visit* start_visit = trav.add_visit(); + start_visit->set_node_id(graph.get_id(handle)); + start_visit->set_backward(graph.get_is_reverse(handle)); + + can_continue = false; + if (graph.has_next_step(step) && handle != end_handle) { + step_handle_t next_step = graph.get_next_step(step); + handle_t next_handle = graph.get_handle_of_step(next_step); + if (snarl_contents.first.count(graph.get_id(next_handle)) && + snarl_contents.second.count(graph.edge_handle(handle, next_handle))) { + step = next_step; + can_continue = true; + } + } + } + + if (graph.get_handle_of_step(step) != end_check) { +#ifdef debug + cerr << " - failed to find forward traversal of path " << graph.get_path_name(start_path_handle) << endl; +#endif + // try to make a traversal by walking backward + end_check = graph.flip(end_handle); + + trav.Clear(); + can_continue = true; + step = start_step; + while (can_continue) { + handle_t handle = graph.flip(graph.get_handle_of_step(step)); + + Visit* start_visit = trav.add_visit(); + start_visit->set_node_id(graph.get_id(handle)); + start_visit->set_backward(graph.get_is_reverse(handle)); + + can_continue = false; + if (graph.has_previous_step(step) && handle != end_handle) { + step_handle_t prev_step = graph.get_previous_step(step); + handle_t prev_handle = graph.flip(graph.get_handle_of_step(prev_step)); + + if (snarl_contents.first.count(graph.get_id(prev_handle)) && + snarl_contents.second.count(graph.edge_handle(handle, prev_handle))) { + step = prev_step; + can_continue = true; + } + } + } + } + if (graph.get_handle_of_step(step) == end_check) { + out_travs.push_back(trav); + out_steps.push_back(make_pair(start_step, step)); + } + } + } + + return make_pair(out_travs, out_steps); +} + +TrivialTraversalFinder::TrivialTraversalFinder(const HandleGraph& graph) : graph(graph) { // Nothing to do! } vector TrivialTraversalFinder::find_traversals(const Snarl& site) { - assert(site.type() == ULTRABUBBLE); + assert(site.start_end_reachable()); + assert(site.directed_acyclic_net_graph()); // We'll fill this in and send it back vector to_return; @@ -806,20 +939,20 @@ vector TrivialTraversalFinder::find_traversals(const Snarl& site // We don't want to be duplicating partial paths, so we store for each // NodeTraversal we can reach the previous NodeTraversal we can reach it // from. 
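The forward walk in `PathTraversalFinder::find_path_traversals` above amounts to following a path's steps from the snarl's start handle until either the end handle is reached or the path leaves the allowed region. A minimal sketch of that pattern against the libhandlegraph `PathHandleGraph` API; the helper name `walk_to_target` and the `keep_going` predicate (standing in for the snarl-content checks) are illustrative, not part of this patch:

```cpp
#include <handlegraph/path_handle_graph.hpp>
#include <functional>

using namespace handlegraph;

// Walk forward along the path containing start_step until target is reached,
// calling keep_going(next_handle) to decide whether each next handle stays in
// bounds. Returns the step at which the walk stopped; the caller checks whether
// that step actually sits on the target handle.
step_handle_t walk_to_target(const PathHandleGraph& graph,
                             step_handle_t start_step,
                             handle_t target,
                             const std::function<bool(const handle_t&)>& keep_going) {
    step_handle_t step = start_step;
    while (graph.get_handle_of_step(step) != target && graph.has_next_step(step)) {
        step_handle_t next = graph.get_next_step(step);
        if (!keep_going(graph.get_handle_of_step(next))) {
            // The path wanders out of the region we are allowed to traverse.
            break;
        }
        step = next;
    }
    return step;
}
```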
- map previous; + unordered_map previous; - list stack{to_node_traversal(site.start(), graph)}; + list stack{graph.get_handle(site.start().node_id(), site.start().backward())}; while (!stack.empty()) { // While there's still stuff on the stack // Grab the first thing - NodeTraversal here = stack.front(); + handle_t here = stack.front(); stack.pop_front(); - if (here.node->id() == site.end().node_id()) { + if (graph.get_id(here) == site.end().node_id()) { // Trace back a path - list path; + list path; while (true) { // Until we get to the start of the site @@ -827,7 +960,7 @@ vector TrivialTraversalFinder::find_traversals(const Snarl& site // Put this traversal on the front of the path path.push_front(here); - if (here.node->id() == site.start().node_id()) { + if (graph.get_id(here) == site.start().node_id()) { // Stop when we've reached the start of the site break; } @@ -840,27 +973,24 @@ vector TrivialTraversalFinder::find_traversals(const Snarl& site to_return.emplace_back(); // Translate the path into the traversal - for (NodeTraversal node_traversal : path) { - *(to_return.back().add_visit()) = to_visit(node_traversal); + for (handle_t node_traversal : path) { + *(to_return.back().add_visit()) = to_visit(graph, node_traversal); } // Stop early after having found one path break; } else { // We haven't reached the end of the site - - for (NodeTraversal next : graph.nodes_next(here)) { - // Look at all the places we can go from this node - if (previous.count(next)) { - // We already know how to get there. - continue; - } - - // Remember how we got there - previous[next] = here; - // Explore it, depth first - stack.push_front(next); - } + + graph.follow_edges(here, false, [&] (const handle_t& next) { + // Look at all the places we can go from this node + if (!previous.count(next)) { + // Remember how we got there + previous[next] = here; + // Explore it, depth first + stack.push_front(next); + } + }); } } @@ -868,11 +998,20 @@ vector TrivialTraversalFinder::find_traversals(const Snarl& site return to_return; } - -RepresentativeTraversalFinder::RepresentativeTraversalFinder(AugmentedGraph& augmented, - SnarlManager& snarl_manager, size_t max_depth, size_t max_width, size_t max_bubble_paths, - function get_index) : augmented(augmented), snarl_manager(snarl_manager), - max_depth(max_depth), max_width(max_width), max_bubble_paths(max_bubble_paths), get_index(get_index) { +RepresentativeTraversalFinder::RepresentativeTraversalFinder(const PathHandleGraph& graph, + SnarlManager& snarl_manager, + size_t max_depth, + size_t max_width, + size_t max_bubble_paths, + size_t min_node_support, + size_t min_edge_support, + function get_index, + function get_node_support, + function get_edge_support) : + graph(graph), snarl_manager(snarl_manager), max_depth(max_depth), max_width(max_width), + max_bubble_paths(max_bubble_paths), min_node_support(min_node_support), min_edge_support(min_edge_support), + get_index(get_index), get_node_support(get_node_support), get_edge_support(get_edge_support) { + has_supports = this->get_node_support != nullptr && this->get_edge_support != nullptr; // Nothing to do! @@ -885,7 +1024,7 @@ Path RepresentativeTraversalFinder::find_backbone(const Snarl& site) { // Find a traversal, ignoring the fact that child sites ought to own their // nodes. 
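The handle-based rewrite of `TrivialTraversalFinder::find_traversals` above is a depth-first search that records, for each handle, the handle it was first reached from, and then backtracks through that map to spell out a single path. A standalone sketch of the same idea, assuming the libhandlegraph `HandleGraph` API and its `std::hash` support for `handle_t`; the function name is illustrative:

```cpp
#include <handlegraph/handle_graph.hpp>
#include <unordered_map>
#include <deque>
#include <vector>

using namespace handlegraph;

// Find one directed path from source to sink with a depth-first search,
// remembering how each handle was first reached so the path can be traced back.
std::vector<handle_t> find_one_path(const HandleGraph& graph, handle_t source, handle_t sink) {
    std::unordered_map<handle_t, handle_t> previous;
    std::deque<handle_t> stack{source};
    while (!stack.empty()) {
        handle_t here = stack.front();
        stack.pop_front();
        if (here == sink) {
            // Trace back from the sink to the source.
            std::vector<handle_t> path{here};
            while (here != source) {
                here = previous.at(here);
                path.push_back(here);
            }
            return {path.rbegin(), path.rend()};
        }
        graph.follow_edges(here, false, [&](const handle_t& next) {
            if (!previous.count(next)) {
                // Remember how we got there, then explore it depth first.
                previous[next] = here;
                stack.push_front(next);
            }
        });
    }
    return {};  // no path found
}
```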
- TrivialTraversalFinder finder(augmented.graph); + TrivialTraversalFinder finder(graph); auto traversals = finder.find_traversals(site); assert(!traversals.empty()); auto& traversal = traversals.front(); @@ -893,7 +1032,7 @@ Path RepresentativeTraversalFinder::find_backbone(const Snarl& site) { // Convert it into a path that includes the boundary nodes Path to_return; for (size_t i = 0; i < traversal.visit_size(); i++) { - *to_return.add_mapping() = to_mapping(traversal.visit(i), augmented.graph); + *to_return.add_mapping() = to_mapping(traversal.visit(i), graph); } return to_return; @@ -902,9 +1041,11 @@ Path RepresentativeTraversalFinder::find_backbone(const Snarl& site) { vector RepresentativeTraversalFinder::find_traversals(const Snarl& site) { - // TODO: we can only do ultrabubbles right now. Other snarls may not have - // traversals through from end to end. - assert(site.type() == ULTRABUBBLE); + // We can only do snarls with start-to-end traversals. + assert(site.start_end_reachable()); + // And that aren't themselves directed-cyclic + // TODO: We don't ignore children, but this check does! + assert(site.directed_acyclic_net_graph()); const Snarl* managed_site = snarl_manager.manage(site); @@ -932,16 +1073,19 @@ vector RepresentativeTraversalFinder::find_traversals(const Snar backbone_index = unique_ptr(new PathIndex(backbone)); } - // Determnine what path will be the path we use to scaffold the traversals: + // Determine what path will be the path we use to scaffold the traversals: // the primary path index by default, or the backbone index if we needed one. PathIndex& index = (backbone_index.get() != nullptr ? *backbone_index : *primary_path_index); // Get the site's nodes and edges, including our outer boundary nodes, not used inside children. // TODO: can we not include the child boundaries? Would that make things easier? - pair, unordered_set> contents = snarl_manager.shallow_contents(&site, augmented.graph, true); + pair, unordered_set> contents = snarl_manager.shallow_contents(&site, graph, true); // Copy its node set - unordered_set nodes_left(contents.first); + unordered_set nodes_left; + for (id_t node_id : contents.first) { + nodes_left.insert(node_id); + } // Trace the ref path through the site. vector ref_path_for_site; @@ -951,13 +1095,13 @@ vector RepresentativeTraversalFinder::find_traversals(const Snar size_t site_end = index.by_id.at(site.end().node_id()).first; #ifdef debug - cerr << "Site starts with " << to_node_traversal(site.start(), augmented.graph) + cerr << "Site starts with " << pb2json(site.start()) << " at " << site_start - << " and ends with " << to_node_traversal(site.end(), augmented.graph) + << " and ends with " << pb2json(site.end()) << " at " << site_end << endl; - for (auto* node : nodes_left) { - cerr << "\tContains node " << node->id() << endl; + for (id_t node : nodes_left) { + cerr << "\tContains node " << node << endl; } #endif @@ -991,12 +1135,14 @@ vector RepresentativeTraversalFinder::find_traversals(const Snar Visit found_visit = found->second.to_visit(); // What node did we hit? 
- Node* visited_node = augmented.graph.get_node(found_visit.node_id()); + id_t visited_node = found_visit.node_id(); const Snarl* child = snarl_manager.into_which_snarl(found_visit); - if (child != nullptr && child != managed_site - && snarl_manager.into_which_snarl(reverse(found_visit)) != managed_site) { - // If the node in this orientation enters a child + if (child != nullptr && child != managed_site && + snarl_manager.into_which_snarl(reverse(found_visit)) != managed_site && + !(eat_trivial_children && snarl_manager.is_trivial(child, graph))) { + // If the node in this orientation enters a child, and it's not a + // trivial child we are taking care of ourselves // Visit the child Visit child_visit; @@ -1014,19 +1160,19 @@ vector RepresentativeTraversalFinder::find_traversals(const Snar // And skip to its other end. // TODO: the path is not allowed to end inside the snarl. - Node* here = visited_node; + id_t here = visited_node; do { #ifdef debug - cerr << "at node " << pb2json(*here) << endl; + cerr << "at node " << here << endl; #endif // Advance - ref_node_start = found->first + here->sequence().size(); + ref_node_start = found->first + graph.get_length(graph.get_handle(here)); // And look at what we get found = index.by_start.lower_bound(ref_node_start); assert(found != index.by_start.end()); // And grab out the node found_visit = found->second.to_visit(); - here = augmented.graph.get_node(found_visit.node_id()); + here = found_visit.node_id(); // Until we find something in this parent again that isn't the // closing visit of a child snarl. We'll look at what we find // next. @@ -1037,11 +1183,11 @@ vector RepresentativeTraversalFinder::find_traversals(const Snar if (snarl_manager.into_which_snarl(found_visit) == nullptr) { // We don't have another child snarl immediately. Look at the node after this one. - ref_node_start = found->first + here->sequence().size(); + ref_node_start = found->first + graph.get_length(graph.get_handle(here)); found = index.by_start.lower_bound(ref_node_start); assert(found != index.by_start.end()); found_visit = found->second.to_visit(); - here = augmented.graph.get_node(found_visit.node_id()); + here = found_visit.node_id(); } else { // It's also the start node of another child snarl, so loop // on it again. Do nothing here. @@ -1068,7 +1214,7 @@ vector RepresentativeTraversalFinder::find_traversals(const Snar // Next iteration look where this node ends. - ref_node_start = found->first + visited_node->sequence().size(); + ref_node_start = found->first + graph.get_length(graph.get_handle(visited_node)); } #ifdef debug @@ -1084,16 +1230,17 @@ vector RepresentativeTraversalFinder::find_traversals(const Snar // Make sure none of the nodes in the site that we didn't visit // while tracing along the ref path are on the ref path. - if (snarl_manager.into_which_snarl(node->id(), true) || snarl_manager.into_which_snarl(node->id(), false)) { + if (snarl_manager.into_which_snarl(node, true) || snarl_manager.into_which_snarl(node, false)) { // Skip child boundary nodes. continue; } - if(index.by_id.count(node->id())) { - cerr << "Node " << node->id() << " is on backbone path at " - << index.by_id.at(node->id()).first << " but not traced in site " - << to_node_traversal(site.start(), augmented.graph) << " to " - << to_node_traversal(site.end(), augmented.graph) << " that contains it." 
<< endl; + if(index.by_id.count(node)) { + cerr << "error[RepresentativeTraversalFinder]: Node " << node << " is on backbone path at " + << index.by_id.at(node).first << " but not traced in site " + << pb2json(site) << endl; + cerr << "error[RepresentativeTraversalFinder]: This can happen when the path you are calling " + << "against traverses the same part of your graph twice." << endl; throw runtime_error("Extra ref node found"); } } @@ -1112,11 +1259,38 @@ vector RepresentativeTraversalFinder::find_traversals(const Snar #ifdef debug cerr << "Input path: " << endl; for(auto& visit : path) { - if(visit.node_id() != 0 && index.by_id.count(visit.node_id())) { - cerr << "\tPath member " << visit << " lives on backbone at " - << index.by_id.at(visit.node_id()).first << endl; + if(visit.node_id() != 0) { + auto found = index.find_in_orientation(visit.node_id(), visit.backward()); + if (found != index.end()) { + cerr << "\tPath member " << visit << " lives on backbone at " + << found->first << endl; + } else { + cerr << "\tPath member " << visit << " does not live on backbone" << endl; + } } else { - cerr << "\tPath member " << visit << " does not live on backbone" << endl; + cerr << "\tPath member " << visit << " is to a child snarl" << endl; + + auto found_start = index.find_in_orientation(visit.snarl().start().node_id(), + visit.snarl().start().backward() != visit.backward()); + + if (found_start != index.end()) { + cerr << "\t\tStart lives on backbone at " + << found_start->first << endl; + } else { + cerr << "\t\tStart does not live on backbone" << endl; + } + + auto found_end = index.find_in_orientation(visit.snarl().end().node_id(), + visit.snarl().end().backward() != visit.backward()); + + if (found_end != index.end()) { + cerr << "\t\tEnd lives on backbone at " + << found_end->first << endl; + } else { + cerr << "\t\tEnd does not live on backbone" << endl; + } + + } } #endif @@ -1124,9 +1298,61 @@ vector RepresentativeTraversalFinder::find_traversals(const Snar for(auto& visit : path) { if (visit.node_id() != 0) { // Make sure the site actually has the nodes we're visiting. 
- assert(contents.first.count(augmented.graph.get_node(visit.node_id()))); + if (!contents.first.count(visit.node_id())) { + cerr << "error[RepresentativeTraversalFinder::find_traversals]: Node " + << visit.node_id() << " not in snarl " << pb2json(site) << " contents:" << endl; + + for (auto& node_id : contents.first) { + cerr << "\t" << node_id << "," << endl; + } + + cerr << "children:" << endl; + + for (auto& snarl_ptr : snarl_manager.children_of(&site)) { + cerr << pb2json(*snarl_ptr) << endl; + } + + cerr << "Input path: " << endl; + for(auto& visit : path) { + if(visit.node_id() != 0) { + auto found = index.find_in_orientation(visit.node_id(), visit.backward()); + if (found != index.end()) { + cerr << "\tPath member " << visit << " lives on backbone at " + << found->first << endl; + } else { + cerr << "\tPath member " << visit << " does not live on backbone" << endl; + } + } else { + cerr << "\tPath member " << visit << " is to a child snarl" << endl; + + auto found_start = index.find_in_orientation(visit.snarl().start().node_id(), + visit.snarl().start().backward() != visit.backward()); + + if (found_start != index.end()) { + cerr << "\t\tStart lives on backbone at " + << found_start->first << endl; + } else { + cerr << "\t\tStart does not live on backbone" << endl; + } + + auto found_end = index.find_in_orientation(visit.snarl().end().node_id(), + visit.snarl().end().backward() != visit.backward()); + + if (found_end != index.end()) { + cerr << "\t\tEnd lives on backbone at " + << found_end->first << endl; + } else { + cerr << "\t\tEnd does not live on backbone" << endl; + } + + + } + } + + assert(false); + } } - // Child snarls will have ownership of their end nodes, so they won't be part of our contents. + // Child snarl end nodes will still appear in our contents. } size_t ref_path_index = 0; @@ -1154,27 +1380,53 @@ vector RepresentativeTraversalFinder::find_traversals(const Snar } }; - while(frontier_visit(ref_path_for_site[ref_path_index], false) != frontier_visit(path[bubble_path_index], true) && - !(path[bubble_path_index].node_id() == 0 && - frontier_visit(ref_path_for_site[ref_path_index], false) == frontier_visit(path[bubble_path_index], false))) { +#ifdef debug + cerr << "Ref path length: " << ref_path_for_site.size() << " visits" << endl; + cerr << "Path to be anchored: " << path.size() << " visits" << endl; + cerr << "Looking for " << frontier_visit(path.at(bubble_path_index), true) + << " or " << frontier_visit(path.at(bubble_path_index), false) << " exiting an anchoring snarl" << endl; + + cerr << "Check pos " << ref_path_index << " on ref path and " << bubble_path_index << " on path to be anchored" << endl; +#endif + + + + while(frontier_visit(ref_path_for_site.at(ref_path_index), false) != frontier_visit(path.at(bubble_path_index), true) && + !(path.at(bubble_path_index).node_id() == 0 && + frontier_visit(ref_path_for_site.at(ref_path_index), false) == frontier_visit(path.at(bubble_path_index), false))) { + // The right visit of where we are on the ref path isn't the left + // visit of where we want to start, nor is it the end of a snarl + // and the right visit of where we want to start. + // Collect NodeTraversals from the ref path until we hit the one // at which the bubble path starts. 
#ifdef debug - cerr << "Before path: " << pb2json(ref_path_for_site[ref_path_index]) << endl; + cerr << "Before path: " << pb2json(ref_path_for_site.at(ref_path_index)) << endl; +#endif + extended_path.push_back(ref_path_for_site.at(ref_path_index++)); + +#ifdef debug + cerr << "Check pos " << ref_path_index << " on ref path and " << bubble_path_index << " on path to be anchored" << endl; #endif - extended_path.push_back(ref_path_for_site[ref_path_index++]); + if (ref_path_index >= ref_path_for_site.size()) { + // We hit the end of the reference path. If the path we are + // trying to anchor actually starts and ends along the + // reference in the right orientation, this should never + // happen. + throw runtime_error("Ran out of reference path when looking for start of path to be anchored"); + } } - if (ref_path_for_site[ref_path_index].node_id() == 0) { + if (ref_path_for_site.at(ref_path_index).node_id() == 0) { // The last Visit we traversed from the ref was a Snarl, so it already // includes the first node of the path as one of its boundaries. We need // to add the ref visit and exclude the bubble visit unless it is also of // a child Snarl and that Snarl is different from the ref path #ifdef debug - cerr << "Adding final ref child visit " << pb2json(ref_path_for_site[ref_path_index]) << endl; + cerr << "Adding final ref child visit " << pb2json(ref_path_for_site.at(ref_path_index)) << endl; #endif - extended_path.push_back(ref_path_for_site[ref_path_index]); + extended_path.push_back(ref_path_for_site.at(ref_path_index)); if (path.front().node_id() != 0 || (path.front().snarl().start() == extended_path.back().snarl().start() && path.front().snarl().end() == extended_path.back().snarl().end())) { @@ -1214,7 +1466,7 @@ vector RepresentativeTraversalFinder::find_traversals(const Snar // Otherwise this ref visit isn't the right one to match up with our // bubble's traversal. 
#ifdef debug - cerr << "Skip ref: " << pb2json(ref_path_for_site[ref_path_index]) << endl; + cerr << "Skip ref: " << pb2json(ref_path_for_site.at(ref_path_index)) << endl; cerr << "\tWant: " << pb2json(path.back()) << endl; #endif ref_path_index++; @@ -1242,31 +1494,31 @@ vector RepresentativeTraversalFinder::find_traversals(const Snar } } - if (ref_path_for_site[ref_path_index].node_id() == 0) { + if (ref_path_for_site.at(ref_path_index).node_id() == 0) { // The next Visit on the ref path is to a Snarl, so the final Visit we added // from the bubble will be redundant with its boundary nodes unless that Visit // was also to a Snarl if (extended_path.back().node_id() != 0 || - (ref_path_for_site[ref_path_index].snarl().start() == extended_path.back().snarl().start() - && ref_path_for_site[ref_path_index].snarl().end() == extended_path.back().snarl().end())) { + (ref_path_for_site.at(ref_path_index).snarl().start() == extended_path.back().snarl().start() + && ref_path_for_site.at(ref_path_index).snarl().end() == extended_path.back().snarl().end())) { #ifdef debug cerr << "Removing bubble visit " << pb2json(extended_path.back()) << endl; #endif extended_path.pop_back(); } #ifdef debug - cerr << "Adding adjacent ref child visit" << pb2json(ref_path_for_site[ref_path_index]) << endl; + cerr << "Adding adjacent ref child visit" << pb2json(ref_path_for_site.at(ref_path_index)) << endl; #endif - extended_path.push_back(ref_path_for_site[ref_path_index]); + extended_path.push_back(ref_path_for_site.at(ref_path_index)); } // Skip the matching NodeTraversal ref_path_index++; while(ref_path_index < ref_path_for_site.size()) { // Then take the entier rest of the ref path #ifdef debug - cerr << "After path: " << pb2json(ref_path_for_site[ref_path_index]) << endl; + cerr << "After path: " << pb2json(ref_path_for_site.at(ref_path_index)) << endl; #endif - extended_path.push_back(ref_path_for_site[ref_path_index++]); + extended_path.push_back(ref_path_for_site.at(ref_path_index++)); } #ifdef debug @@ -1285,30 +1537,30 @@ vector RepresentativeTraversalFinder::find_traversals(const Snar cerr << "Explore " << contents.first.size() << " nodes" << endl; #endif - for (Node* node : contents.first) { + for (id_t node_id : contents.first) { // Find the bubble for each node - if (snarl_manager.into_which_snarl(node->id(), true) || snarl_manager.into_which_snarl(node->id(), false)) { + if (snarl_manager.into_which_snarl(node_id, true) || snarl_manager.into_which_snarl(node_id, false)) { // Don't start from nodes that are child boundaries continue; } - if (augmented.has_supports() && total(augmented.get_support(node)) == 0) { + if (has_supports && total(get_node_support(node_id)) < min_node_support) { // Don't bother with unsupported nodes continue; } - if (index.by_id.count(node->id())) { + if (index.by_id.count(node_id)) { // Don't try to pathfind to the backbone for backbone nodes. continue; } #ifdef debug - cerr << "Base path on " << node->id() << endl; + cerr << "Base path on " << node_id << endl; #endif // Find bubbles that backend into the backbone path - pair> sup_path = find_bubble(node, nullptr, nullptr, index, site); + pair> sup_path = find_bubble(node_id, nullptr, nullptr, index, site); vector& path = sup_path.second; @@ -1316,7 +1568,7 @@ vector RepresentativeTraversalFinder::find_traversals(const Snar // We couldn't find a path back to the primary path. Discard // this material. 
if (verbose) { - cerr << "Warning: No path found for node " << node->id() << endl; + cerr << "Warning: No path found for node " << node_id << endl; } // TODO: record the node's bases as lost. @@ -1334,37 +1586,42 @@ vector RepresentativeTraversalFinder::find_traversals(const Snar cerr << "Explore " << contents.second.size() << " edges" << endl; #endif - for(Edge* edge : contents.second) { + for(const edge_t edge : contents.second) { // Go through all the edges - if(augmented.has_supports() && total(augmented.get_support(edge)) == 0) { + if(has_supports && total(get_edge_support(edge)) < min_edge_support) { // Don't bother with unsupported edges #ifdef debug - cerr << "Skip unsupported edge " << edge->from() << " -> " << edge->to() << endl; + cerr << "Skip unsupported edge " << graph.get_id(edge.first) << ":" << graph.get_is_reverse(edge.first) + << " -> " << graph.get_id(edge.second) << ":" << graph.get_is_reverse(edge.second) << endl; #endif continue; } - if(!index.by_id.count(edge->from()) || !index.by_id.count(edge->to())) { + if(!index.by_id.count(graph.get_id(edge.first)) || !index.by_id.count(graph.get_id(edge.second))) { // Edge doesn't touch backbone at both ends. Don't use it // because for some reason it makes performance worse // overall. #ifdef debug - cerr << "Skip off-backbone edge " << edge->from() << " -> " << edge->to() << endl; + cerr << "Skip off-backbone edge " << graph.get_id(edge.first) << ":" << graph.get_is_reverse(edge.first) + << " -> " << graph.get_id(edge.second) << ":" << graph.get_is_reverse(edge.second) << endl; #endif continue; } #ifdef debug - cerr << "Base path on " << edge->from() << " -> " << edge->to() << endl; + cerr << "Base path on " << graph.get_id(edge.first) << ":" << graph.get_is_reverse(edge.first) + << " -> " << graph.get_id(edge.second) << ":" << graph.get_is_reverse(edge.second) << endl; #endif // Find a path based around this edge - pair> sup_path = find_bubble(nullptr, edge, nullptr, index, site); + pair> sup_path = find_bubble(0, &edge, nullptr, index, site); vector& path = sup_path.second; #ifdef debug - cerr << "Edge " << edge->from() << " to " << edge->to() << " yields:" << endl; + cerr << "Edge " << graph.get_id(edge.first) << ":" << graph.get_is_reverse(edge.first) + << " -> " << graph.get_id(edge.second) << ":" << graph.get_is_reverse(edge.second) + << " yields:" << endl; for(auto& visit : path) { cerr << "\t" << visit << endl; } @@ -1374,7 +1631,8 @@ vector RepresentativeTraversalFinder::find_traversals(const Snar // We couldn't find a path back to the primary path. Discard // this material. 
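The node and edge loops above only seed candidate bubbles from elements whose support clears `min_node_support` / `min_edge_support`, using the injected `get_node_support` / `get_edge_support` callbacks. A small illustration of that filtering shape, with a plain numeric callback standing in for vg's `Support`/`total()`; the helper name and signature are illustrative assumptions, not this class's API:

```cpp
#include <handlegraph/handle_graph.hpp>
#include <functional>
#include <vector>

using namespace handlegraph;

// Collect the edges of a graph whose support, as reported by get_edge_support,
// meets a minimum threshold. When no support callback is supplied, everything
// is kept, mirroring the has_supports check in the code above.
std::vector<edge_t> supported_edges(const HandleGraph& graph,
                                    const std::function<double(const edge_t&)>& get_edge_support,
                                    double min_edge_support) {
    std::vector<edge_t> kept;
    graph.for_each_edge([&](const edge_t& edge) {
        if (get_edge_support == nullptr || get_edge_support(edge) >= min_edge_support) {
            kept.push_back(edge);
        }
    });
    return kept;
}
```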
if (verbose) { - cerr << "Warning: No path found for edge " << edge->from() << "," << edge->to() << endl; + cerr << "Warning: No path found for edge " << graph.get_id(edge.first) << ":" << graph.get_is_reverse(edge.first) + << " -> " << graph.get_id(edge.second) << ":" << graph.get_is_reverse(edge.second) << endl; } // TODO: bases lost continue; @@ -1392,13 +1650,18 @@ vector RepresentativeTraversalFinder::find_traversals(const Snar for (const Snarl* child : children) { // Go through all the child snarls + + if (eat_trivial_children && snarl_manager.is_trivial(child, graph)) { + // Skip trivial children + continue; + } #ifdef debug cerr << "Base path on " << *child << endl; #endif // Find a path based around this child snarl - pair> sup_path = find_bubble(nullptr, nullptr, child, index, site); + pair> sup_path = find_bubble(0, nullptr, child, index, site); vector& path = sup_path.second; if(path.empty()) { @@ -1479,7 +1742,7 @@ vector RepresentativeTraversalFinder::find_traversals(const Snar return unique_traversals; } -pair> RepresentativeTraversalFinder::find_bubble(Node* node, Edge* edge, +pair> RepresentativeTraversalFinder::find_bubble(id_t node, const edge_t* edge, const Snarl* snarl, PathIndex& index, const Snarl& site) { // What are we going to find our left and right path halves based on? @@ -1493,15 +1756,17 @@ pair> RepresentativeTraversalFinder::find_bubble(Node* no // Find the nodes at the ends of the edges. Look at them traversed in the // edge's local orientation. - left_visit = to_visit(edge->from(), edge->from_start()); - right_visit = to_visit(edge->to(), edge->to_end()); + left_visit = to_visit(graph, edge->first); + right_visit = to_visit(graph, edge->second); + // Find any child snarls looking out form the edge const Snarl* right_child = snarl_manager.into_which_snarl(right_visit); - const Snarl* left_child = snarl_manager.into_which_snarl(left_visit); + const Snarl* left_child = snarl_manager.into_which_snarl(reverse(left_visit)); if (right_child != nullptr && right_child != managed_site && snarl_manager.into_which_snarl(reverse(right_visit)) != managed_site) { // We're reading into a child snarl on the right. + // And we're not reading out of ourselves. #ifdef debug cerr << "Child to right of edge " << pb2json(*right_child) << endl; #endif @@ -1519,8 +1784,9 @@ pair> RepresentativeTraversalFinder::find_bubble(Node* no } if (left_child != nullptr && left_child != managed_site - && snarl_manager.into_which_snarl(reverse(left_visit)) != managed_site) { + && snarl_manager.into_which_snarl(left_visit) != managed_site) { // We're reading out of a child snarl on the left. + // And we're not reading into ourselves. #ifdef debug cerr << "Child to left of edge " << pb2json(*left_child) << endl; #endif @@ -1540,10 +1806,10 @@ pair> RepresentativeTraversalFinder::find_bubble(Node* no cerr << "Edge becomes " << left_visit << " -> " << right_visit << endl; #endif - } else if (node != nullptr) { + } else if (node != 0) { // Be node-based. TODO: we trust the caller not to feed us nodes that // are part of/boundaries of child snarls. - left_visit = right_visit = to_visit(node->id(), false); + left_visit = right_visit = to_visit(node, false); } else { // Be snarl-based assert(snarl != nullptr); @@ -1554,24 +1820,75 @@ pair> RepresentativeTraversalFinder::find_bubble(Node* no cerr << "Starting from: " << left_visit << ", " << right_visit << endl; #endif - // Find paths on both sides, with nodes or snarls on the primary path at the - // outsides and this visit in the middle. 
Returns path lengths and paths in - // pairs in a set. - auto leftPaths = bfs_left(left_visit, index, false, managed_site); - auto rightPaths = bfs_right(right_visit, index, false, managed_site); + // Find paths on both sides, with nodes or snarls on the primary path at + // the outsides and this visit in the middle. Returns path lengths, + // orientatioins in which the reference path was encountered, and paths in + // tuples in a set. + // Make sure to keep looking for the other orientation of the ref path + // after we find a first one, to handle inversions of up to a certain size. + auto leftPaths = bfs_left(left_visit, index, false, managed_site, other_orientation_timeout); + auto rightPaths = bfs_right(right_visit, index, false, managed_site, other_orientation_timeout); + + // Sort out the paths not just by whether they are left or right from here, + // but also by whether they hit the reference path in forward or reverse + // ref-path-relative orientation. + // TODO: give ImmutableList a .back() so we can avoid converting to real lists here. + list> left_forward; + list> right_forward; + list> left_reverse; + list> right_reverse; + + for (auto& annotatedPath : leftPaths) { + // Break up the paths on the left by orientation + auto& ref_reverse = get<1>(annotatedPath); + auto& path = get<2>(annotatedPath); + // TODO: ImmutableList iterators don't actually satisfy + // https://en.cppreference.com/w/cpp/named_req/Iterator because they + // lack the tyypedefs for std::iterator_traits. So we can't use them to + // construct lists. So we have to build the lists manually. + list converted; + for (auto& item : path) { + converted.push_back(item); + } + (ref_reverse ? left_reverse : left_forward).emplace_back(move(converted)); + } - // Find a combination of two paths which gets us to the reference in a - // consistent orientation (meaning that when you look at the ending nodes' - // Mappings in the reference path, the ones with minimal ranks have the same - // orientations) and which doesn't use the same nodes on both sides. - // Track support of up to max_bubble_paths combinations, and return the - // highest - pair > bestBubblePath; - int bubbleCount = 0; + for (auto& annotatedPath : rightPaths) { + // Break up the paths on the right by orientation + auto& ref_reverse = get<1>(annotatedPath); + auto& path = get<2>(annotatedPath); + // TODO: ImmutableList iterators don't actually satisfy + // https://en.cppreference.com/w/cpp/named_req/Iterator because they + // lack the tyypedefs for std::iterator_traits. So we can't use them to + // construct lists. So we have to build the lists manually. + list converted; + for (auto& item : path) { + converted.push_back(item); + } + (ref_reverse ? right_reverse : right_forward).emplace_back(move(converted)); + } // We need to look in different combinations of lists. auto testCombinations = [&](const list>& leftList, - const list>& rightList) { + const list>& rightList) -> pair> { + + + // Find a combination of two paths which gets us to the reference and + // which doesn't use the same nodes on both sides. Track support of up + // to max_bubble_paths combinations, and return the highest. Always + // returns the combined path in a valid reference-relative-forward + // orientation. + + // Because we do our own identification of the anchoring reverence + // occurrences, we may produce a reference-relative-forward path from + // what was supposed to be a reference-relative-backward pair of + // partial paths. 
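The combination search described above pairs one left half with one right half, rejects pairs that reuse a node on both sides, and keeps the best-scoring survivor. A toy, graph-free illustration of that search; integer node IDs and a scalar score stand in for Visits and Support, and all names here are illustrative:

```cpp
#include <list>
#include <set>
#include <vector>
#include <functional>

// Try every (left, right) pairing; skip pairs that share a node; return the
// highest-scoring concatenation, or an empty path if no pairing is valid.
std::vector<int> best_combination(const std::list<std::vector<int>>& left_halves,
                                  const std::list<std::vector<int>>& right_halves,
                                  const std::function<double(const std::vector<int>&)>& score) {
    std::vector<int> best;
    double best_score = -1;
    for (const auto& left : left_halves) {
        std::set<int> left_nodes(left.begin(), left.end());
        for (const auto& right : right_halves) {
            bool reuse = false;
            for (int node : right) {
                if (left_nodes.count(node)) { reuse = true; break; }
            }
            if (reuse) {
                continue;  // same node on both sides: not a valid combination
            }
            std::vector<int> combined(left);
            combined.insert(combined.end(), right.begin(), right.end());
            double s = score(combined);
            if (s > best_score) {
                best_score = s;
                best = std::move(combined);
            }
        }
    }
    return best;
}
```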
+ + // TODO: Fix that by making the BFS code pass along the particular + // anchoring occurrences it finds + + pair > bestBubblePath; + int bubbleCount = 0; #ifdef debug cerr << "Combine " << leftList.size() << " left sides and " @@ -1582,7 +1899,6 @@ pair> RepresentativeTraversalFinder::find_bubble(Node* no // node visit, if only to the snarl's start or end. for(auto leftPath : leftList) { - // Figure out the relative orientation for the leftmost node. #ifdef debug cerr << "Left path: " << endl; for(auto visit : leftPath ) { @@ -1604,6 +1920,13 @@ pair> RepresentativeTraversalFinder::find_bubble(Node* no // in the reference path. bool leftRelativeOrientation = leftOrientation != leftRefPos.second; + // TODO: We're using the first occurrence, because that's what + // we'll encounter as we scan along the reference path and what + // we'll use to try and build the final traversal. This may NOT be + // the occurence that got this partial path into the collection for + // this particular relative orientation. So we still need to check + // on orientation consistency. + // Make a set of all the nodes in the left path set leftPathNodes; // And one of all the snarls (with bounding visits set) @@ -1701,6 +2024,10 @@ pair> RepresentativeTraversalFinder::find_bubble(Node* no if(leftRelativeOrientation) { // Turns out our anchored path is backwards. +#ifdef debug + cerr << "Anchored to ref path backward! Reverse combination!" << endl; +#endif + // Reorder everything the other way reverse(fullPath.begin(), fullPath.end()); @@ -1741,26 +2068,40 @@ pair> RepresentativeTraversalFinder::find_bubble(Node* no }; - // Convert sets to lists, which requires a copy again... - // TODO: Can we just completely remove the length calculation? - list> leftConverted; - for(auto lengthAndPath : leftPaths) { - leftConverted.emplace_back(move(lengthAndPath.second)); + // Find the best valid combination, if any, in each orientation +#ifdef debug + cerr << "Combine forward paths" << endl; +#endif + pair > best_forward = testCombinations(left_forward, right_forward); +#ifdef debug + cerr << "Combine reverse paths" << endl; +#endif + pair > best_reverse = testCombinations(left_reverse, right_reverse); + +#ifdef debug + cerr << "Best forward path:" << endl; + for (auto& visit : best_forward.second) { + cerr << "\t" << visit << endl; } - list> rightConverted; - for(auto lengthAndPath : rightPaths) { - rightConverted.emplace_back(move(lengthAndPath.second)); + cerr << "Best reverse path (in forward orientation):" << endl; + for (auto& visit : best_reverse.second) { + cerr << "\t" << visit << endl; } +#endif - // Look for a valid combination, or return an empty path if one iesn't - // found. - return testCombinations(leftConverted, rightConverted); - + if (total(best_forward.first) > total(best_reverse.first) || best_reverse.second.empty()) { + // The forward orientation wins + return best_forward; + } else { + // The reverse orientation wins. + // testCombinations already made it be reference-forward. + return best_reverse; + } } Support RepresentativeTraversalFinder::min_support_in_path(const list& path) { - if (path.empty()) { + if (path.empty() || !has_supports) { // No support if we visit nothing! 
return Support(); } @@ -1778,7 +2119,11 @@ Support RepresentativeTraversalFinder::min_support_in_path(const list& pa if (cur->node_id() != 0) { // We're at a node visit, so we have a support to start with - minSupport = augmented.get_support(augmented.graph.get_node(cur->node_id())); + minSupport = get_node_support(cur->node_id()); + if (cur->backward()) { + // Put the support in the path forward direction + minSupport = flip(minSupport); + } supportFound = true; } @@ -1787,7 +2132,12 @@ Support RepresentativeTraversalFinder::min_support_in_path(const list& pa if (next->node_id() != 0) { // The next visit is to a node, so get its support - Support nextSupport = augmented.get_support(augmented.graph.get_node(next->node_id())); + Support nextSupport = get_node_support(next->node_id()); + + if (next->backward()) { + // This occurs backward on the path, so flip its support + nextSupport = flip(nextSupport); + } if (supportFound) { // Min it against existing support @@ -1802,11 +2152,19 @@ Support RepresentativeTraversalFinder::min_support_in_path(const list& pa // TODO: Support for child snarls! // check the edge support - Edge* edge = augmented.graph.get_edge(to_right_side(*cur), to_left_side(*next)); - - if (edge != nullptr) { + NodeSide from_side = to_right_side(*cur); + NodeSide to_side = to_left_side(*next); + edge_t edge = graph.edge_handle(graph.get_handle(from_side.node, !from_side.is_end), + graph.get_handle(to_side.node, to_side.is_end)); + + if (graph.has_edge(edge.first, edge.second)) { // The edge exists (because we aren't back-to-back child snarls) - Support edgeSupport = augmented.get_support(edge); + Support edgeSupport = get_edge_support(edge); + + if (cur->node_id() > next->node_id() || (cur->node_id() == next->node_id() && cur->backward())) { + // We are taking the edge backward, so flip its support + edgeSupport = flip(edgeSupport); + } if (supportFound) { // Min it against existing support @@ -1823,24 +2181,34 @@ Support RepresentativeTraversalFinder::min_support_in_path(const list& pa return minSupport; } -set>> RepresentativeTraversalFinder::bfs_left(Visit visit, - PathIndex& index, bool stopIfVisited, const Snarl* in_snarl) { +set>> +RepresentativeTraversalFinder::bfs_left(Visit visit, + PathIndex& index, bool stop_if_visited, const Snarl* in_snarl, + size_t both_orientations_distance) { - // Holds partial paths we want to return, with their lengths in bp. - set>> toReturn; + // Holds partial paths we want to return, with their lengths in bp and + // target-path-relative orientations. + set>> toReturn; // Do a BFS + // Define a stack frame fro the BFS to track an outstanding path being explored. + // Stores the path, the path length in nodes, the countdown to reach the + // target path in the other orientation (or 0 if we have not yet reached + // the target path), and a flag for if the target path was reached in + // reverse orientation when we started the countdown. + using frame_t = tuple, size_t, size_t, bool>; + // This holds the paths to get to NodeTraversals to visit (all of which will // end with the node we're starting with). - list> toExtend; + list toExtend; // This keeps a set of all the oriented nodes we already got to and don't // need to queue again. set alreadyQueued; - // Start at this node at depth 0 - toExtend.emplace_back(list {visit}); + // Start at this node, with no visits and no countdown running. 
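`min_support_in_path` above looks up the support of the edge between two consecutive visits by converting their adjoining NodeSides into handles and asking the graph for the canonical edge. A minimal sketch of that conversion, assuming the libhandlegraph API and treating the (node id, orientation) pairs as already extracted from the visits; the helper name is illustrative:

```cpp
#include <handlegraph/handle_graph.hpp>
#include <utility>

using namespace handlegraph;

// Build the canonical edge between one oriented node and the next, then check
// that the graph actually contains it.
std::pair<edge_t, bool> edge_between(const HandleGraph& graph,
                                     nid_t from_id, bool from_is_reverse,
                                     nid_t to_id, bool to_is_reverse) {
    handle_t from = graph.get_handle(from_id, from_is_reverse);
    handle_t to = graph.get_handle(to_id, to_is_reverse);
    // edge_handle() canonicalizes the orientation, so the same edge always
    // compares and hashes the same way regardless of traversal direction.
    edge_t edge = graph.edge_handle(from, to);
    return {edge, graph.has_edge(edge.first, edge.second)};
}
```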
+ toExtend.emplace_back(visit, 0, 0, false); // Mark this traversal as already queued alreadyQueued.insert(visit); @@ -1848,7 +2216,12 @@ set>> RepresentativeTraversalFinder::bfs_left(Visit vis size_t searchTicks = 0; #ifdef debug - cerr << "Start BFS" << endl; + cerr << "Start BFS left from " << visit << endl; + + if (in_snarl != nullptr) { + cerr << "Stay inside " << pb2json(*in_snarl) << endl; + } + #endif // Track how many options we have because size may be O(n). @@ -1866,62 +2239,162 @@ set>> RepresentativeTraversalFinder::bfs_left(Visit vis #endif - // Dequeue a path to extend. + // Dequeue a frame to extend. // Make sure to move out of the list to avoid a useless copy. - list path(move(toExtend.front())); + frame_t frame(move(toExtend.front())); toExtend.pop_front(); stillToExtend--; - // We can't just throw out longer paths, because shorter paths may need - // to visit a node twice (in opposite orientations) and thus might get - // rejected later. Or they might overlap with paths on the other side. - - // Look up and see if the front node on the path is on our reference - // path - if (path.front().node_id() != 0 && index.by_id.count(path.front().node_id())) { - // This visit is to a node, which is on the reference path. + // Unpack it + auto& path = get<0>(frame); + auto& path_length = get<1>(frame); + auto& countdown = get<2>(frame); + auto& first_found_reverse = get<3>(frame); + + // Determine an effective node ID and orientation. Either the one we + // actually have, or the one on the other side of the snarl we're + // visiting if we have no node. + size_t node_id; + bool is_reverse; + if (path.front().node_id() != 0) { + // We are visiting a node so just unpack it + node_id = path.front().node_id(); + is_reverse = path.front().backward(); + } else { + // We are visiting a snarl -#ifdef debug - cerr << "Reached anchoring node " << path.front().node_id() << endl; - cerr << "Emit path of length " << path.size() << endl; -#endif + if (path.front().backward()) { + // We are using the snarl in reverse. Since we are searching + // left, we want what's on the left of the reversed snarl, + // which is its end node. + node_id = path.front().snarl().end().node_id(); + // Since we're using the snarl in reverse, we invert that end node's orientation + is_reverse = !path.front().snarl().end().backward(); + } else { + // We're visiting a snarl forward, so we use its start node in the orientation the snarl does. + node_id = path.front().snarl().start().node_id(); + is_reverse = path.front().snarl().start().backward(); + } + } + + // Determine if we connect to the forward orientation and/or reverse orientation of the target path + pair orientations = index.get_contained_orientations(node_id); + if (is_reverse) { + // We actually hit the path in the opposite orientation. + // So if the node we hit is on the path in reverse, we hit the path forward, because we used the node in reverse. + std::swap(orientations.first, orientations.second); + } + + // Now that we know where we are, work out where we want to be + + // Determine if we want forward orientation hits + bool want_forward = countdown == 0 || first_found_reverse; + // And if we want reverse orientation hits + bool want_reverse = countdown == 0 || !first_found_reverse; - // Say we got to the right place - toReturn.emplace(bp_length(path), move(path)); + // This flag will determine if we want to extend from here + bool extend = false; - // Don't bother looking for extensions, we already got there. 
- } else if (path.front().node_id() == 0 && !path.front().backward() && - index.by_id.count(path.front().snarl().start().node_id())) { - // This visit is to a snarl, which is on the reference path on its - // left end. + // We want to lazily compute the path length in bases + size_t length = 0; + if (want_forward) { + if (orientations.first) { + // Process a wanted hit in forward orientation + + length = bp_length(path); + #ifdef debug - cerr << "Reached start of anchoring snarl " << path.front().snarl() << endl; + cerr << "Reached anchoring node " << node_id << " on target path forward" << endl; + cerr << "Emit path of length " << path_length << " visits and " << length << " bp to forward" << endl; #endif - - // Say we got to the right place - toReturn.emplace(bp_length(path), move(path)); - - // Don't bother looking for extensions, we already got there. - } else if (path.front().node_id() == 0 && path.front().backward() && - index.by_id.count(path.front().snarl().end().node_id())) { - // This visit is to a snarl in reverse, which is on the reference - // path on its right end. + + toReturn.emplace(length, false, path); + + if (want_reverse) { + // We found forward first and now need to start the countdown + first_found_reverse = false; + countdown = both_orientations_distance; + } + + } else { + // We still want forward. + if (want_reverse) { + // We also want reverse; No countdown is active. + extend = true; + } else { + // We are doing a countdown because we already found reverse + if (countdown > 1) { + // There is still time, so extend. + countdown--; + extend = true; + } + } + } + } + + if (want_reverse) { + if(orientations.second) { + // Process a wanted hit in reverse orientation + + if (length == 0) { + length = bp_length(path); + } #ifdef debug - cerr << "Reached end of anchoring snarl " << path.front().snarl() << endl; + cerr << "Reached anchoring node " << node_id << " on target path forward" << endl; + cerr << "Emit path of length " << path_length << " visits and " << length << " bp to reverse" << endl; #endif - - // Say we got to the right place - toReturn.emplace(bp_length(path), move(path)); - - // Don't bother looking for extensions, we already got there. - } else if (path.size() <= max_depth) { - // We haven't hit the reference path yet, but we also haven't hit - // the max depth. Extend with all the possible extensions. + + toReturn.emplace(length, true, path); + + if (want_forward) { + // We found reverse first and now need to start the countdown + first_found_reverse = true; + countdown = both_orientations_distance; + } + } else { + // We still want reverse. + if (want_forward) { + // We also want forward; No countdown is active. + extend = true; + } else { + // We are doing a countdown because we already found forward + if (countdown > 1) { + // There is still time, so extend. + countdown--; + extend = true; + } + } + } + } + + + if (path_length >= max_depth) { +#ifdef debug + cerr << "Path has reached max depth! Aborting!" << endl; +#endif + } else if (!extend) { + // We chose not to extend. +#ifdef debug + cerr << "Choosing not to extend" << endl; +#endif + } else if (in_snarl != nullptr && + ((node_id == in_snarl->start().node_id() && is_reverse == in_snarl->start().backward()) || + (node_id == in_snarl->end().node_id() && is_reverse != in_snarl->end().backward()))) { + // We hit a boundary node of the snarl we are working on, and are + // headed out of the snarl (i.e. we're at the start or end in the + // into-snarl orientation). 
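The countdown above lets the search keep running for a bounded number of extra steps after the target path is first reached in one orientation, so a nearby hit in the opposite orientation (for example, across a small inversion) is not missed. A self-contained toy version of that bounded continue-after-first-hit search on an adjacency-list graph; all names here are illustrative and the orientation sets are simply given as inputs:

```cpp
#include <vector>
#include <deque>
#include <set>
#include <cstddef>

// BFS frame: current node, which orientations were already hit on this branch,
// and how many more steps this branch may take after its first hit.
struct Frame {
    int node;
    bool hit_forward;
    bool hit_reverse;
    std::size_t countdown;
};

// Return the nodes at which either target set was reached, continuing for up to
// extra_steps steps after the first hit so the other orientation can also be found.
std::vector<int> bounded_search(const std::vector<std::vector<int>>& adjacency,
                                int start,
                                const std::set<int>& target_forward,
                                const std::set<int>& target_reverse,
                                std::size_t extra_steps) {
    std::vector<int> hits;
    std::deque<Frame> queue{{start, false, false, extra_steps}};
    std::set<int> seen{start};
    while (!queue.empty()) {
        Frame frame = queue.front();
        queue.pop_front();
        bool found_now = false;
        if (!frame.hit_forward && target_forward.count(frame.node)) {
            hits.push_back(frame.node);
            frame.hit_forward = true;
            found_now = true;
        }
        if (!frame.hit_reverse && target_reverse.count(frame.node)) {
            hits.push_back(frame.node);
            frame.hit_reverse = true;
            found_now = true;
        }
        if (frame.hit_forward && frame.hit_reverse) {
            continue;  // both orientations found on this branch; stop extending
        }
        if ((frame.hit_forward || frame.hit_reverse) && !found_now) {
            if (frame.countdown == 0) {
                continue;  // out of budget for finding the other orientation
            }
            --frame.countdown;
        }
        for (int next : adjacency[frame.node]) {
            if (!seen.count(next)) {
                seen.insert(next);
                queue.push_back({next, frame.hit_forward, frame.hit_reverse, frame.countdown});
            }
        }
    }
    return hits;
}
```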
+#ifdef debug + cerr << "Path has reached containing snarl boundary! Aborting!" << endl; +#endif + } else { + // We haven't hit the reference path yet in all orientations, but + // we also haven't hit the max depth or the snarl bounds. Extend + // with all the possible extensions. // Look left, possibly entering child snarls - vector prevVisits = snarl_manager.visits_left(path.front(), augmented.graph, in_snarl); + vector prevVisits = snarl_manager.visits_left(path.front(), graph, in_snarl); #ifdef debug cerr << "Consider " << prevVisits.size() << " prev visits" << endl; @@ -1935,22 +2408,27 @@ set>> RepresentativeTraversalFinder::bfs_left(Visit vis if (prevVisit.node_id() != 0) { // This is a visit to a node - + + NodeSide from_side = to_right_side(prevVisit); + NodeSide to_side = to_left_side(path.front()); + edge_t edge = graph.edge_handle(graph.get_handle(from_side.node, !from_side.is_end), + graph.get_handle(to_side.node, to_side.is_end)); + // Make sure the edge is real, since it can't be a back-to- // back site - Edge* edge = augmented.graph.get_edge(to_right_side(prevVisit), to_left_side(path.front())); - assert(edge != NULL); + assert(graph.has_edge(edge.first, edge.second)); // Fetch the actual node - Node* prevNode = augmented.graph.get_node(prevVisit.node_id()); + id_t prevNode = prevVisit.node_id(); - if (augmented.has_supports() && - (total(augmented.get_support(prevNode)) == 0 || total(augmented.get_support(edge)) == 0)) { + if (has_supports && + (total(get_node_support(prevNode)) < min_node_support || + total(get_edge_support(edge)) < min_edge_support)) { // We have no support at all for visiting this node by this // edge (but we do have some read support data) #ifdef debug - cerr << "Reject " << prevNode->id() << " with no support" << endl; + cerr << "Reject " << prevNode << " with no support" << endl; #endif continue; @@ -1960,9 +2438,9 @@ set>> RepresentativeTraversalFinder::bfs_left(Visit vis // Look at the node we would leave the child snarl on // That node can't be shared with a snarl we are already at. - Node* prevNode = augmented.graph.get_node(to_left_side(prevVisit).node); + id_t prevNode = to_left_side(prevVisit).node; - if (augmented.has_supports() && total(augmented.get_support(prevNode)) == 0) { + if (has_supports && total(get_node_support(prevNode)) < min_node_support) { // We have no support at all for visiting the far node of this snarl #ifdef debug @@ -1976,7 +2454,7 @@ set>> RepresentativeTraversalFinder::bfs_left(Visit vis } - if (stopIfVisited && alreadyQueued.count(prevVisit)) { + if (stop_if_visited && alreadyQueued.count(prevVisit)) { // We already have a way to get here. #ifdef debug @@ -2007,58 +2485,59 @@ set>> RepresentativeTraversalFinder::bfs_left(Visit vis #endif // Make a new path extended left with the node - list extended(path); - extended.push_front(prevVisit); - toExtend.emplace_back(move(extended)); + toExtend.emplace_back(path.push_front(prevVisit), path_length + 1, countdown, first_found_reverse); stillToExtend++; // Remember we found a way to this node, so we don't try and // visit it other ways. alreadyQueued.insert(prevVisit); } - } else if (path.size() >= max_depth) { -#ifdef debug - cerr << "Path has reached max depth! Aborting!" << endl; -#endif - } else { - // We should have handled all the possibilities. 
- assert(false); - } - + } } return toReturn; } -set>> RepresentativeTraversalFinder::bfs_right(Visit visit, PathIndex& index, bool stopIfVisited, - const Snarl* in_snarl) { +set>> +RepresentativeTraversalFinder::bfs_right(Visit visit, PathIndex& index, bool stop_if_visited, + const Snarl* in_snarl, size_t both_orientations_distance) { // Look left from the backward version of the visit. - auto toConvert = bfs_left(reverse(visit), index, stopIfVisited, in_snarl); + auto toConvert = bfs_left(reverse(visit), index, stop_if_visited, in_snarl, both_orientations_distance); // Since we can't modify set records in place, we need to do a copy - set>> toReturn; + set>> toReturn; - for(auto lengthAndPath : toConvert) { + for(auto lengthOrientationAndPath : toConvert) { + // Unpack + auto& length = get<0>(lengthOrientationAndPath); + auto& orientation = get<1>(lengthOrientationAndPath); + auto& path = get<2>(lengthOrientationAndPath); + // Flip every path to run the other way - lengthAndPath.second.reverse(); - for(auto& v : lengthAndPath.second) { - // And invert the orientation of every visit in the path in place. - v = reverse(v); + // TODO: this duplicates previously shared nodes... + structures::ImmutableList reverse_path; + for (auto& item : path) { + // While we're at it, reverse each visit + reverse_path = reverse_path.push_front(reverse(item)); } - // Stick it in the new set - toReturn.emplace(move(lengthAndPath)); + + // Stick it in the new set. + // Also flip the orientation flag, since if we encountered the forward + // version of a path searchilg left, we should hit the reverse version + // searching right, and visa versa. + toReturn.emplace(length, !orientation, reverse_path); } return toReturn; } -size_t RepresentativeTraversalFinder::bp_length(const list& path) { +size_t RepresentativeTraversalFinder::bp_length(const structures::ImmutableList& path) { size_t length = 0; for(auto& visit : path) { // Sum up length of each node's sequence if (visit.node_id() != 0) { - length += augmented.graph.get_node(visit.node_id())->sequence().size(); + length += graph.get_length(graph.get_handle(visit.node_id())); } // TODO: handle nested sites } @@ -2066,4 +2545,944 @@ size_t RepresentativeTraversalFinder::bp_length(const list& path) { } +VCFTraversalFinder::VCFTraversalFinder(const PathHandleGraph& graph, SnarlManager& snarl_manager, + vcflib::VariantCallFile& vcf, + const vector& ref_path_names, + FastaReference* ref_fasta, + FastaReference* ins_fasta, + function skip_alt, + size_t max_traversal_cutoff) : + graph(graph), + snarl_manager(snarl_manager), + skip_alt(skip_alt), + max_traversal_cutoff(max_traversal_cutoff), + path_finder(graph, snarl_manager, ref_path_names) { + + create_variant_index(vcf, ref_fasta, ins_fasta); +} + +VCFTraversalFinder::~VCFTraversalFinder() { + delete_variant_index(); +} + +void VCFTraversalFinder::create_variant_index(vcflib::VariantCallFile& vcf, FastaReference* ref_fasta, + FastaReference* ins_fasta) { + + vcflib::Variant var; +#ifdef debug + cerr << "indexing vcf using alt-path information from graph" << endl; +#endif + vector insertion_fastas; + if (ins_fasta != nullptr) { + insertion_fastas.push_back(ins_fasta); + } + + while (vcf.getNextVariant(var)) { + bool path_found = false; + path_handle_t path_handle; + + // we need to run this in order for symbolic alleles to get the same hashes as in construct + if (var.isSymbolicSV()) { + if (ref_fasta == nullptr) { + cerr << "[VCFTraversalFinder] Warning: Unable to canonicalize symbolic variant because no reference 
fasta" + << " was given:\n" << var << endl; + continue; + } + bool could_canonicalize = var.canonicalize(*ref_fasta, insertion_fastas, true); + if (!could_canonicalize) { + cerr << "[VCFTraversalFinder] Warning: Failed to canonicalize symbolic variant:\n" << var << endl; + continue; + } + } + + // scan paths in the graph for any alt path that could have come from this variant + // then add any node id from the path to our index + // we add the first id we find under the assumption that alt paths are entirely contained within sites + for (int allele = 0; !path_found && allele < var.alleles.size(); ++allele) { + string alt_path_name = "_alt_" + make_variant_id(var) + "_" + to_string(allele); + if (graph.has_path(alt_path_name)) { + path_handle_t path_handle = graph.get_path_handle(alt_path_name); + if (!graph.is_empty(path_handle)) { + path_found = true; + step_handle_t step_handle = graph.path_begin(path_handle); + handle_t handle = graph.get_handle_of_step(step_handle); + id_t node_id = graph.get_id(handle); + // copy our variant just this once, and add its new pointer to our map + if (node_to_variant.count(node_id)) { + node_to_variant[node_id].push_back(new vcflib::Variant(var)); + } else { + node_to_variant[node_id] = list({new vcflib::Variant(var)}); + } + } + } + } + if (!path_found) { + cerr << "[VCFTraversalFinder] Warning: No alt path (prefix=" + << ("_alt_" + make_variant_id(var) + "_") << ") found in graph for variant. It will be ignored:\n" + << var << endl; + } + } +#ifdef debug + cerr << "Indexed " << node_to_variant.size() << " nodes" << endl; +#endif +} + +void VCFTraversalFinder::delete_variant_index() { + for (auto nv : node_to_variant) { + for (auto var : nv.second) { + delete var; + } + } + node_to_variant.clear(); +} + +vector VCFTraversalFinder::get_variants_in_site(const Snarl& site) { + vector site_variants; + + pair, unordered_set > contents = snarl_manager.deep_contents(&site, graph, false); + + for (auto node_id : contents.first) { + auto map_it = node_to_variant.find(node_id); + if (map_it != node_to_variant.end()) { + for (auto var : map_it->second) { + site_variants.push_back(var); + } + } + } + + return site_variants; +} + +pair>>, vector> +VCFTraversalFinder::find_allele_traversals(Snarl site) { + + vector>> output_traversals; + + // This traversal finder is pretty simple-minded. It's expecting forward-oriented variation relative + // to the reference. We flip our snarl to canonicalize it if possible. 
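`create_variant_index` above relies on the convention, used elsewhere in vg, of embedding each VCF alt allele as a path named `_alt_<variant id>_<allele number>`, and it records the first node of each such path under the assumption that alt paths are contained in their sites. A condensed sketch of that lookup shape, assuming the libhandlegraph path API; the function name and the simplified variant-ID formatting are illustrative:

```cpp
#include <handlegraph/path_handle_graph.hpp>
#include <string>
#include <unordered_map>
#include <vector>

using namespace handlegraph;

// Map the first node of every "_alt_<id>_<allele>" path back to its variant id.
std::unordered_map<nid_t, std::string> index_alt_paths(const PathHandleGraph& graph,
                                                       const std::vector<std::string>& variant_ids,
                                                       int max_alleles) {
    std::unordered_map<nid_t, std::string> node_to_variant;
    for (const std::string& var_id : variant_ids) {
        for (int allele = 0; allele < max_alleles; ++allele) {
            std::string alt_path_name = "_alt_" + var_id + "_" + std::to_string(allele);
            if (!graph.has_path(alt_path_name)) {
                continue;
            }
            path_handle_t path = graph.get_path_handle(alt_path_name);
            if (graph.is_empty(path)) {
                continue;  // deletion alleles can leave an empty alt path
            }
            handle_t first = graph.get_handle_of_step(graph.path_begin(path));
            node_to_variant.emplace(graph.get_id(first), var_id);
        }
    }
    return node_to_variant;
}
```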
+ if (site.start().backward() && site.end().backward()) { + Visit start = site.start(); + *site.mutable_start() = site.end(); + *site.mutable_end() = start; + site.mutable_start()->set_backward(false); + site.mutable_end()->set_backward(false); + } + + vector site_variants = get_variants_in_site(site); + + if (site_variants.empty()) { + return make_pair(output_traversals, site_variants); + } + + pair, vector > > trav_steps = path_finder.find_path_traversals(site); + + if (trav_steps.first.empty()) { + return make_pair(output_traversals, site_variants); + } + + // we can certainly relax this if needed + assert(trav_steps.first.size() == 1); + + step_handle_t start_step = trav_steps.second[0].first; + step_handle_t end_step = trav_steps.second[0].second; + path_handle_t ref_path = graph.get_path_handle_of_step(start_step); + +#ifdef debug + cerr << "Computing alt traversals for site " << pb2json(site) << " that spans the following " + << site_variants.size() << " variants:\n"; + for (auto site_var : site_variants) { + cerr << " ->" << *site_var << endl; + } +#endif + + // fill in the alt traversals + brute_force_alt_traversals(site, site_variants, ref_path, start_step, end_step, output_traversals); + + return make_pair(output_traversals, site_variants); +} + +vector VCFTraversalFinder::find_traversals(const Snarl& site) { + pair>>, vector> allele_travs = find_allele_traversals(site); + vector traversals; + traversals.reserve(allele_travs.first.size()); + for (auto& trav : allele_travs.first) { + traversals.push_back(trav.first); + } + return traversals; +} + +void VCFTraversalFinder::brute_force_alt_traversals( + const Snarl& site, + const vector& site_variants, + path_handle_t ref_path, + step_handle_t start_step, + step_handle_t end_step, + vector > >& output_traversals) { + + // the haplotype we're going to look for a traversal for + // it's in terms of alt_alleles below (and not the VCF), so needs + // to be converted back + vector haplotype(site_variants.size(), 0); + + // use our skip_alt() method (if defined) to prune the search space + vector> alt_alleles = get_pruned_alt_alleles(site, site_variants, ref_path); + assert(alt_alleles.size() == haplotype.size()); + + // if we failed to prune enough, we print a message here: + // todo: we can move to a ranking (eg by support), where instead of filtering, we just + // take the K most supported traversals. this would avoid ever skipping a site + if (!check_max_trav_cutoff(alt_alleles)) { + cerr << "[VCFTraversalFinder] Warning: Site " << pb2json(site) << " with " << site_variants.size() + << " variants contains too many traversals (>" << max_traversal_cutoff + << ") to enumerate so it will be skipped:\n"; + for (auto site_var : site_variants) { + cerr << " " << *site_var << endl; + } + output_traversals.clear(); + return; + } + + // increment the haplotype. we can use this to loop over every possible haplotype + auto next_haplotype = [&] () -> bool { + // do this by "adding" 1 to our haplotype. each digit is in base-|alleles| + for (int i = alt_alleles.size() - 1; i >= 0; --i) { + if (haplotype[i] < alt_alleles[i].size() - 1) { + // add to column + ++haplotype[i]; + return true; + } else if (i > 0) { + // carry 1 to left + haplotype[i] = 0; + } + } + return false; + }; + + do { + // convert back to vcf allele offsets + // todo: can we change the enumeration to avoid this? 
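The `next_haplotype` lambda above is an odometer: each site is a digit whose base is the number of alleles kept for that site, and incrementing with carry enumerates every allele combination exactly once. A standalone version of that increment, with illustrative names:

```cpp
#include <vector>
#include <cstddef>

// Advance haplotype to the next allele combination, where haplotype[i] ranges
// over [0, radix[i]). Returns false once every combination has been produced.
bool next_combination(std::vector<int>& haplotype, const std::vector<std::size_t>& radix) {
    for (std::size_t i = haplotype.size(); i > 0; --i) {
        std::size_t col = i - 1;
        if (haplotype[col] + 1 < static_cast<int>(radix[col])) {
            ++haplotype[col];   // add one to this column
            return true;
        }
        haplotype[col] = 0;     // carry into the column to the left
    }
    return false;               // wrapped around: enumeration finished
}

// Typical use: start from the all-reference combination {0, 0, ..., 0} and loop
// with do { ... } while (next_combination(haplotype, radix));
```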
+ vector vcf_haplotype(haplotype.size()); + for (int i = 0; i < site_variants.size(); ++i) { + vcf_haplotype[i] = (alt_alleles[i][haplotype[i]]); + assert(skip_alt != nullptr || vcf_haplotype[i] == haplotype[i]); + } + // make sure we don't forget the reference. I'm sure there's a more elegant way to + // do this, but it's fussy in light of pruning logic + if (output_traversals.empty() && + !std::all_of(vcf_haplotype.begin(), vcf_haplotype.end(), [](int i) {return i == 0;})) { + vector ref_haplotype(vcf_haplotype.size(), 0); + pair alt_traversal = get_alt_traversal( + site, site_variants, ref_path, start_step, end_step, ref_haplotype); + assert(alt_traversal.second == true); + output_traversals.push_back(make_pair(alt_traversal.first, ref_haplotype)); + } + + pair alt_traversal = get_alt_traversal(site, site_variants, ref_path, + start_step, end_step, vcf_haplotype); +#ifdef debug + cerr << "bf haplotype <"; + for (auto allele : vcf_haplotype) { + cerr << allele << ","; + } + cerr << "> gives " << (alt_traversal.second ? "valid" : "invalid") << " trav: " + << pb2json(alt_traversal.first) << endl; +#endif + if (alt_traversal.second) { + output_traversals.push_back(make_pair(alt_traversal.first, vcf_haplotype)); + } + } while (next_haplotype()); +} + +pair VCFTraversalFinder::get_alt_traversal(const Snarl& site, + const vector& site_variants, + path_handle_t ref_path, + step_handle_t start_step, + step_handle_t end_step, + const vector& haplotype) { + + // Find the alt paths that we must cover if we traverse this haplotype + pair, unordered_set>> alt_contents = + get_haplotype_alt_contents(site_variants, haplotype, ref_path); + unordered_set& alt_nodes = alt_contents.first; + unordered_set>& alt_edges = alt_contents.second; + + // the edges of our reference path. we must follow these in our traversal + // unless we're going to an alt + unordered_set > ref_edges; + // nodes of the reference path. we use these to see if we can go from + // an alt path back to the reference + unordered_set ref_nodes; + for (auto step = start_step; step != end_step; step = graph.get_next_step(step)) { + auto next = graph.get_next_step(step); + + // todo: assuming forward ref path + ref_edges.insert(graph.edge_handle(graph.get_handle_of_step(step), + graph.get_handle_of_step(next))); + + ref_nodes.insert(graph.get_handle_of_step(step)); + } + ref_nodes.insert(graph.get_handle_of_step(end_step)); + +#ifdef debug + cerr << " alt nodes: "; + for (auto alt_node : alt_nodes) { + cerr << graph.get_id(alt_node) << ":" << graph.get_is_reverse(alt_node) << ","; + } + cerr << endl << " alt edges: "; + for (auto alt_edge : alt_edges) { + cerr << graph.get_id(alt_edge.first) << ":" << graph.get_is_reverse(alt_edge.first) << "-" + << graph.get_id(alt_edge.second) << ":" << graph.get_is_reverse(alt_edge.second) << ","; + } + cerr << endl << " ref edges: "; + for (auto ref_edge : ref_edges) { + cerr << graph.get_id(ref_edge.first) << ":" << graph.get_is_reverse(ref_edge.first) << "-" + << graph.get_id(ref_edge.second) << ":" << graph.get_is_reverse(ref_edge.second) << ","; + } +#endif + + // we walk by always following reference edges unless we can step into an alt + // there are some simplifying assumptions about alt paths at play here, like + // how there are unique hooks between them and the reference. 
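// ------------------------------------------------------------------
// A toy, self-contained version of the greedy rule described above:
// step onto an alt node/edge whenever one is available (consuming it so
// it is only used once), otherwise follow the reference, and hop back
// onto the reference after finishing an alt path. The graph, node ids
// and sets below are made up purely for illustration.

#include <iostream>
#include <map>
#include <set>
#include <utility>
#include <vector>

using ToyEdge = std::pair<int, int>;

int main() {
    // Reference path 1 -> 2 -> 3; alt path 1 -> 4 -> 3 (node 4 carries the alt allele).
    std::map<int, std::vector<int>> succ{{1, {2, 4}}, {2, {3}}, {3, {}}, {4, {3}}};
    std::set<int>     alt_nodes{4};          // nodes this haplotype must visit
    std::set<ToyEdge> alt_edges;             // deletion alleles would populate this instead
    std::set<ToyEdge> ref_edges{{1, 2}, {2, 3}};
    std::set<int>     ref_nodes{1, 2, 3};

    int  at = 1;
    bool in_alt = false;
    std::cout << at;
    while (at != 3) {
        int chosen = -1;
        for (int next : succ[at]) {
            if (alt_edges.count({at, next}) || alt_nodes.count(next)) {
                alt_edges.erase({at, next}); // consume so it is never reused
                alt_nodes.erase(next);
                in_alt = true;
                chosen = next;
                break;
            }
        }
        if (chosen < 0) {
            for (int next : succ[at]) {
                // follow the reference, or rejoin it after an alt path
                if (ref_edges.count({at, next}) || (in_alt && ref_nodes.count(next))) {
                    in_alt = false;
                    chosen = next;
                    break;
                }
            }
        }
        if (chosen < 0) break;               // dead end: haplotype not realizable
        at = chosen;
        std::cout << " -> " << at;
    }
    std::cout << '\n';                       // prints: 1 -> 4 -> 3
}
// ------------------------------------------------------------------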
+ bool in_alt_path = false; + auto walk_forward = [&] (Visit& visit) { + handle_t handle = graph.get_handle(visit.node_id(), visit.backward()); + // take an alt edge if we find one + bool found_edge = !graph.follow_edges(handle, false, [&] (const handle_t& next) { + auto edge = graph.edge_handle(handle, next); + bool ret = true; + if (alt_edges.count(edge)) { + ret = false; // stop, we found deletion edge + } else if (alt_nodes.count(next)) { + in_alt_path = true; + ret = false; // stop, we found an edge to an alt path node + } + if (ret == false) { + // make sure we never cross this node/edge again in our traversal + alt_edges.erase(edge); + alt_nodes.erase(graph.get_handle(graph.get_id(next), false)); + alt_nodes.erase(graph.get_handle(graph.get_id(next), true)); + visit.set_node_id(graph.get_id(next)); + visit.set_backward(graph.get_is_reverse(next)); + } + return ret; + }); + if (!found_edge) { + // no alt edge found, take a reference edge + found_edge = !graph.follow_edges(handle, false, [&] (const handle_t& next) { + auto edge = graph.edge_handle(handle, next); + bool ret = true; + if (ref_edges.count(edge) && ref_nodes.count(next)) { + ret = false; // stop, we found a reference edge + } else if (in_alt_path && ref_nodes.count(next)) { + in_alt_path = false; + ret = false; // stop, we found a reference node after our alt path + } + if (ret == false) { + // make sure we never cross this node/edge again in our traversal + ref_edges.erase(edge); + ref_nodes.erase(graph.get_handle(graph.get_id(next), false)); + ref_nodes.erase(graph.get_handle(graph.get_id(next), true)); + visit.set_node_id(graph.get_id(next)); + visit.set_backward(graph.get_is_reverse(next)); + } + return ret; + }); + } + return found_edge; + }; + + + Visit visit; + SnarlTraversal traversal; + + // start at the start + // todo: should make sure this works if our snarl is backward on reference + visit.set_node_id(graph.get_id(graph.get_handle_of_step(start_step))); + visit.set_backward(graph.get_is_reverse(graph.get_handle_of_step(start_step))); + ref_nodes.erase(graph.get_handle(visit.node_id(), false)); + ref_nodes.erase(graph.get_handle(visit.node_id(), true)); + + + if (include_endpoints) { + *traversal.add_visit() = visit; + } + +#ifdef debug + cerr << " start walk: " << pb2json(visit) << endl; +#endif + + // walk our traversal + bool found_end; + while (walk_forward(visit)) { +#ifdef debug + cerr << " visit: " << pb2json(visit) << endl; +#endif + if (visit.node_id() != graph.get_id(graph.get_handle_of_step(end_step))) { + *traversal.add_visit() = visit; + } else { + found_end = true; + break; + } + } + + if (include_endpoints) { + Visit* visit = traversal.add_visit(); + handle_t end_handle = graph.get_handle_of_step(end_step); + visit->set_node_id(graph.get_id(end_handle)); + visit->set_backward(graph.get_is_reverse(end_handle)); + } + + // sanity check: we compare the output to something gotten directly from the + // path index when doing the reference haplotype. 
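// ------------------------------------------------------------------
// walk_forward() above computes found_edge as !graph.follow_edges(...),
// relying on the convention (visible in the code above) that a
// bool-returning iteratee stops the iteration by returning false and
// that follow_edges() then reports whether it ran to completion. A
// plain-STL imitation of that contract (nothing below is vg or
// libhandlegraph API):

#include <functional>
#include <iostream>
#include <vector>

// Call the iteratee on each neighbour; stop early if it returns false.
// Return true only if every neighbour was visited.
bool for_each_neighbor(const std::vector<int>& neighbors,
                       const std::function<bool(int)>& iteratee) {
    for (int n : neighbors) {
        if (!iteratee(n)) {
            return false;  // stopped early
        }
    }
    return true;
}

int main() {
    std::vector<int> neighbors{7, 9, 12};
    int wanted = 9, found_node = -1;
    // Negating the result turns "stopped early" into "found something to take",
    // which is how found_edge is used in the walk above.
    bool found = !for_each_neighbor(neighbors, [&](int n) {
        if (n == wanted) { found_node = n; return false; }
        return true;
    });
    std::cout << (found ? "found " : "not found ") << found_node << '\n';  // prints: found 9
}
// ------------------------------------------------------------------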
+ if (all_of(haplotype.begin(), haplotype.end(), [] (int i) {return i == 0;})) { + SnarlTraversal ref_trav; + step_handle_t step = graph.get_next_step(start_step); + if (include_endpoints) { + Visit* visit = ref_trav.add_visit(); + visit->set_node_id(graph.get_id(graph.get_handle_of_step(start_step))); + visit->set_backward(graph.get_is_reverse(graph.get_handle_of_step(start_step))); + } + for (; step != end_step; step = graph.get_next_step(step)) { + Visit* visit = ref_trav.add_visit(); + visit->set_node_id(graph.get_id(graph.get_handle_of_step(step))); + // todo: do we get an orientation out of the path index? + visit->set_backward(graph.get_is_reverse(graph.get_handle_of_step(step))); + } + if (include_endpoints) { + Visit* visit = ref_trav.add_visit(); + visit->set_node_id(graph.get_id(graph.get_handle_of_step(end_step))); + visit->set_backward(graph.get_is_reverse(graph.get_handle_of_step(end_step))); + } + assert(found_end && ref_trav == traversal); + } + + return make_pair(traversal, found_end && alt_nodes.empty() && alt_edges.empty()); +} + +pair, unordered_set > > +VCFTraversalFinder::get_haplotype_alt_contents( + const vector& site_variants, + const vector& haplotype, + path_handle_t ref_path) { + + assert(haplotype.size() == site_variants.size()); + + unordered_set alt_nodes; + unordered_set > alt_deletion_edges; + + for (size_t allele = 0; allele < haplotype.size(); ++allele) { + // ignore reference alleles + if (haplotype[allele] == 0) { + continue; + } + vcflib::Variant* var = site_variants[allele]; + + // get the alt path information out of the graph + pair> alt_path_info = get_alt_path(var, haplotype[allele], ref_path); + if (alt_path_info.first.visit_size() == 0) { + // skip deletion alt path where we can't find the deletion edge in the graph + continue; + } + SnarlTraversal& alt_traversal = alt_path_info.first; + bool is_deletion = !alt_path_info.second.empty(); + + if (!is_deletion) { + // fill in the nodes from the path + for (size_t i = 0; i < alt_traversal.visit_size(); ++i) { + alt_nodes.insert(graph.get_handle(alt_traversal.visit(i).node_id(), + alt_traversal.visit(i).backward())); + } + } else { + // add the deletion edges from the path + for (auto deletion_edge : alt_path_info.second) { + alt_deletion_edges.insert(deletion_edge); + } + } + } + + return make_pair(alt_nodes, alt_deletion_edges); +} + +pair> VCFTraversalFinder::get_alt_path(vcflib::Variant* var, int allele, + path_handle_t ref_path) { + + SnarlTraversal alt_path; + vector deletion_edges; + + string alt_path_name = "_alt_" + make_variant_id(*var) + "_" + to_string(allele); + if (graph.has_path(alt_path_name) && !graph.is_empty(graph.get_path_handle(alt_path_name))) { + // if there's an alt path, then we're dealing with a snp or insertion. + // we take the edges from the path, as well as those back to the reference + for (handle_t handle : graph.scan_path(graph.get_path_handle(alt_path_name))) { + // fill in the nodes from the path + Visit* visit = alt_path.add_visit(); + visit->set_node_id(graph.get_id(handle)); + visit->set_backward(graph.get_is_reverse(handle)); + } + } else { + // there's no alt path, it must be a deletion (if our input allele != 0) + // in this case we use the reference allele path, and try to find an edge that spans + // it. this will be our alt edge + // todo: put an alt path name maker into utility.hpp + bool is_deletion = allele != 0; + alt_path_name = "_alt_" + make_variant_id(*var) + "_0"; + // allele 0 can be empty for an insertion. 
we don't complain if it's not in the graph + assert(allele == 0 || graph.has_path(alt_path_name)); + + if (graph.has_path(alt_path_name)) { + path_handle_t path_handle = graph.get_path_handle(alt_path_name); + if (!graph.is_empty(path_handle)) { + // find where this path begins and ends in the reference path index + auto first_step_found = step_in_path(graph.get_handle_of_step(graph.path_begin(path_handle)), ref_path); + assert(first_step_found.second); + step_handle_t first_step = first_step_found.first; + auto last_step_found = step_in_path(graph.get_handle_of_step(graph.path_back(path_handle)), ref_path); + assert(last_step_found.second); + step_handle_t last_step = last_step_found.first; + + // todo: logic needed here if want to support non-forward reference paths. + first_step = graph.get_previous_step(first_step); + last_step = graph.get_next_step(last_step); + if (allele == 0) { + handle_t left = graph.get_handle_of_step(first_step); + handle_t right = graph.get_handle_of_step(last_step); + + Visit* visit = alt_path.add_visit(); + visit->set_node_id(graph.get_id(left)); + visit->set_backward(graph.get_is_reverse(left)); + visit = alt_path.add_visit(); + visit->set_node_id(graph.get_id(right)); + visit->set_backward(graph.get_is_reverse(right)); + } else { + // alt paths don't always line up with deletion edges, so we hunt for + // our deletion edge using the path_index here. + pair> scan_deletion = scan_for_deletion(var, allele, ref_path, + first_step, last_step); + if (scan_deletion.first.visit_size() == 0) { + cerr << "[VCFTraversalFinder] Warning: Could not find deletion edge that matches allele " + << allele << " of\n" << *var << "\naround alt path" << alt_path_name << ":"; + } + alt_path = std::move(scan_deletion.first); + deletion_edges = std::move(scan_deletion.second); + } + } + } + } + + return make_pair(alt_path, deletion_edges); +} + +pair> VCFTraversalFinder::scan_for_deletion(vcflib::Variant* var, int allele, path_handle_t ref_path, + step_handle_t first_path_step, step_handle_t last_path_step) { + assert(allele > 0); + + // if our path matches an edge, we don't need to do anything + edge_t spanning_edge = graph.edge_handle(graph.get_handle_of_step(first_path_step), + graph.get_handle_of_step(last_path_step)); + if (graph.has_edge(spanning_edge)) { + SnarlTraversal traversal; + Visit* visit = traversal.add_visit(); + visit->set_node_id(graph.get_id(graph.get_handle_of_step(first_path_step))); + visit->set_backward(graph.get_is_reverse(graph.get_handle_of_step(first_path_step))); + visit = traversal.add_visit(); + visit->set_node_id(graph.get_id(graph.get_handle_of_step(last_path_step))); + visit->set_backward(graph.get_is_reverse(graph.get_handle_of_step(last_path_step))); + return make_pair(traversal, vector(1, spanning_edge)); + } + + // we're doing everything via length comparison, so keep track of the length we're + // looking for (vcf_deletion_length) and the length we have (path_deletion_length) + int vcf_deletion_length = var->alleles[0].length() - var->alleles[allele].length(); + + // make our search window by scanning out the ends of our path + step_handle_t first_window_step = first_path_step; + for (int i = 0; i < max_deletion_scan_nodes && graph.has_previous_step(first_window_step); ++i) { + first_window_step = graph.get_previous_step(first_window_step); + } + step_handle_t last_window_step = last_path_step; + for (int i = 0; i < max_deletion_scan_nodes && graph.has_next_step(last_window_step); ++i) { + last_window_step = 
graph.get_next_step(last_window_step); + } + + // index our reference offsets (assuming forward reference path with no cycles) + // needing this logic doesn't happen very often, otherwise would consider + // requiring path position interface + unordered_map ref_offsets; + unordered_map node_to_step; + int offset = 0; + int first_offset = 0; + int last_offset = 0; + step_handle_t cur_step = first_window_step; + while (true) { // want to iterate last-inclusive + handle_t cur_handle = graph.get_handle_of_step(cur_step); + assert(graph.get_is_reverse(cur_handle) == false); + ref_offsets[graph.get_id(cur_handle)] = offset; + node_to_step[graph.get_id(cur_handle)] = cur_step; + if (cur_step == first_path_step) { + first_offset = offset + graph.get_length(cur_handle); + } + if (cur_step == last_path_step) { + last_offset = offset; + } + offset += graph.get_length(cur_handle); + if (cur_step == last_window_step) { + break; + } else { + cur_step = graph.get_next_step(cur_step); + } + } + + // find our deletions, and index them by how close they match our given alt path + // the delta is the min length of the deletion's two endpoints to the paths enpoints. + multimap delta_to_deletion; // index deleions by their distance from ref path ends + unordered_map deletion_to_length; // lookup deletion sizes + for (cur_step = first_window_step; cur_step != last_window_step; cur_step = graph.get_next_step(cur_step)) { + handle_t cur_handle = graph.get_handle_of_step(cur_step); + handle_t next_handle = graph.get_handle_of_step(graph.get_next_step(cur_step)); + graph.follow_edges(cur_handle, false, [&] (const handle_t& edge_next_handle) { + if (!graph.get_is_reverse(edge_next_handle) && // ignore inversions + graph.get_id(next_handle) != graph.get_id(edge_next_handle) && + ref_offsets.count(graph.get_id(edge_next_handle))) { + // we are in a deletion that's contained in the window + int deletion_start_offset = ref_offsets[graph.get_id(cur_handle)] + graph.get_length(cur_handle); + int deletion_end_offset = ref_offsets[graph.get_id(edge_next_handle)]; + int delta = std::min(std::abs(deletion_start_offset - first_offset), std::abs(deletion_end_offset - last_offset)); + delta_to_deletion.insert(make_pair(delta, make_pair(cur_handle, edge_next_handle))); + deletion_to_length[make_pair(cur_handle, edge_next_handle)] = deletion_end_offset - deletion_start_offset; + } + }); + } + + // our goal is to find a traversal that threads the deletions we find. in order to do this, our deltions + // can't overlap + function&)> doesnt_intersect = [&](edge_t edge, const vector& edge_set) { + int edge_start = ref_offsets[graph.get_id(edge.first)] + graph.get_length(edge.first); + int edge_end = ref_offsets[graph.get_id(edge.second)]; + // because of previous assumptins, are edges shoudl always be forward alogn the path. 
+ // note they are not made wtih graph.edge_handle, and are just oriented in the scan order + assert(edge_start <= edge_end); + for (edge_t other_edge : edge_set) { + int other_start = ref_offsets[graph.get_id(other_edge.first)] + graph.get_length(other_edge.first); + int other_end = ref_offsets[graph.get_id(other_edge.second)]; + if ((other_start >= edge_start && other_start < edge_end) || (other_end > edge_start && other_end <= edge_end) || + (edge_start >= other_start && edge_start < other_end) || (edge_end > other_start && edge_end >= other_end)) { + return false; + } + } + return true; + }; + + // greedily try to find some deletions that add up to the desired length, and are close to spanning + // the alt path + int best_delta = numeric_limits::max(); + vector best_set; + for (auto delta_edge : delta_to_deletion) { + // can do better than quadratic here, but the sizes should be small enough not to matter + vector candidate_set = {delta_edge.second}; + size_t total_size = deletion_to_length[delta_edge.second]; + int total_delta = delta_edge.first; + for (auto delta_edge2 : delta_to_deletion) { + if (delta_edge2 != delta_edge) { + if (total_size + deletion_to_length[delta_edge2.second] <= vcf_deletion_length && + // make that cubic... + doesnt_intersect(delta_edge2.second, candidate_set)) { + total_size += deletion_to_length[delta_edge2.second]; + total_delta += delta_edge2.first; + candidate_set.push_back(delta_edge2.second); + } + } + if (total_size == vcf_deletion_length) { + break; + } + } + if (total_delta < best_delta) { + best_set = candidate_set; + best_delta = total_delta; + } + } + + // sort the edges along the path + std::sort(best_set.begin(), best_set.end(), [&](edge_t e1, edge_t e2) { + return ref_offsets[graph.get_id(e1.first)] < ref_offsets[graph.get_id(e2.first)]; }); + + SnarlTraversal traversal; + Visit* visit; + // fill out the traversal + for (int i = 0; i < best_set.size(); ++i) { + // add a visit for each edge endpoint + if (i == 0 || best_set[i].first != best_set[i-1].second) { + visit = traversal.add_visit(); + visit->set_node_id(graph.get_id(best_set[i].first)); + visit->set_backward(graph.get_is_reverse(best_set[i].first)); + } + visit = traversal.add_visit(); + visit->set_node_id(graph.get_id(best_set[i].second)); + visit->set_backward(graph.get_is_reverse(best_set[i].second)); + // the fill in the reference path to the next edge + if (i < best_set.size() - 1) { + step_handle_t next_step = node_to_step[graph.get_id(best_set[i + 1].first)]; + step_handle_t cur_step = node_to_step[graph.get_id(best_set[i].second)]; + if (cur_step != next_step) { + for (cur_step = graph.get_next_step(cur_step); cur_step != next_step; cur_step = graph.get_next_step(cur_step)) { + visit = traversal.add_visit(); + visit->set_node_id(graph.get_id(graph.get_handle_of_step(cur_step))); + visit->set_backward(graph.get_is_reverse(graph.get_handle_of_step(cur_step))); + } + } + } + } + + return make_pair(traversal, best_set); +} + + +vector> VCFTraversalFinder::get_pruned_alt_alleles( + const Snarl& site, + const vector& site_variants, + path_handle_t ref_path) { + + vector > alt_alleles(site_variants.size()); + + for (int var_i = 0; var_i < site_variants.size(); ++var_i) { + for (int allele = 0; allele < site_variants[var_i]->alleles.size(); ++allele) { + alt_alleles[var_i].push_back(allele); + } + } + + // only invoke pruning if we exceed our cutoff. 
fairly rare on most graphs + for (int prune_it = 0; prune_it < max_prune_iterations && !check_max_trav_cutoff(alt_alleles); ++prune_it) { + for (auto& alleles : alt_alleles) { + alleles.clear(); + } + + for (int var_i = 0; var_i < site_variants.size(); ++var_i) { + for (int allele = 0; allele < site_variants[var_i]->alleles.size(); ++allele) { + if (skip_alt == nullptr || + skip_alt(get_alt_path(site_variants[var_i], allele, ref_path).first, prune_it) == false) { + alt_alleles[var_i].push_back(allele); + } +#ifdef debug + else { + cerr << "Pruning allele " << allele << " from variant " << site_variants[var_i]->id << endl; + } +#endif + } + // always leave at least one path through the site, even if that means + // going through a reference allele that fails the skip_alt check. + if (alt_alleles[var_i].empty()) { + alt_alleles[var_i].push_back(0); + } + } + } + + return alt_alleles; +} + +bool VCFTraversalFinder::check_max_trav_cutoff(const vector >& alleles) { + if (alleles.empty()) { + return true; + } + size_t count = 1; + + for (int i = 0; i < alleles.size(); ++i) { + count *= alleles[i].size(); + if (count > max_traversal_cutoff) { + return false; + } + } + + return true; +} + +pair VCFTraversalFinder::step_in_path(handle_t handle, path_handle_t path_handle) const { + vector steps = graph.steps_of_handle(handle); + // must be a cyclic! + for (auto step : steps) { + if (graph.get_path_handle_of_step(step) == path_handle) { + return make_pair(step, true); + } + } + return make_pair(step_handle_t(), false); +} + + +FlowTraversalFinder::FlowTraversalFinder(const HandleGraph& graph, SnarlManager& snarl_manager, + size_t K, + function node_weight_callback, + function edge_weight_callback) : + graph(graph), + snarl_manager(snarl_manager), + K(K), + node_weight_callback(node_weight_callback), + edge_weight_callback(edge_weight_callback) { + +} + +void FlowTraversalFinder::setK(size_t k) { + K = k; +} + +vector FlowTraversalFinder::find_traversals(const Snarl& site) { + return find_weighted_traversals(site).first; +} + +pair, vector> FlowTraversalFinder::find_weighted_traversals(const Snarl& site, bool greedy_avg, + const HandleGraph* overlay) { + + // option to use the overlay graph for the search + const HandleGraph* use_graph = overlay != nullptr ? 
overlay : & graph; + + handle_t start_handle = use_graph->get_handle(site.start().node_id(), site.start().backward()); + handle_t end_handle = use_graph->get_handle(site.end().node_id(), site.end().backward()); + + vector>> widest_paths = algorithms::yens_k_widest_paths(use_graph, start_handle, end_handle, K, + node_weight_callback, + edge_weight_callback, + greedy_avg); + + vector travs; + travs.reserve(widest_paths.size()); + vector weights; + weights.reserve(widest_paths.size()); + + for (const auto& wp : widest_paths) { + weights.push_back(wp.first); + travs.emplace_back(); + for (const auto& h : wp.second) { + Visit* visit = travs.back().add_visit(); + visit->set_node_id(use_graph->get_id(h)); + visit->set_backward(use_graph->get_is_reverse(h)); + } + } + + return make_pair(travs, weights); +} + +GBWTTraversalFinder::GBWTTraversalFinder(const HandleGraph& graph, const gbwt::GBWT& gbwt) : + graph(graph), + gbwt(gbwt) { + +} + +GBWTTraversalFinder::~GBWTTraversalFinder() { + +} + +pair, vector>> +GBWTTraversalFinder::find_gbwt_traversals(const Snarl& site, bool return_paths) { + + // follow all gbwt threads from start to end + vector, gbwt::SearchState> > forward_traversals = list_haplotypes( + graph, + gbwt, + graph.get_handle(site.start().node_id(), site.start().backward()), + [&] (const vector& new_thread) { + return gbwt::Node::id(new_thread.back()) == site.end().node_id() && + gbwt::Node::is_reverse(new_thread.back()) == site.end().backward(); + }); + + // follow all gbwt threads from end to start + vector, gbwt::SearchState> > backward_traversals; + if (!gbwt.bidirectional()) { + backward_traversals = list_haplotypes( + graph, + gbwt, + graph.get_handle(site.end().node_id(), !site.end().backward()), + [&] (const vector& new_thread) { + return gbwt::Node::id(new_thread.back()) == site.start().node_id() && + gbwt::Node::is_reverse(new_thread.back()) == !site.start().backward(); + }); + } + + // store them all as snarltraversals + vector traversals; + vector> gbwt_paths; + traversals.reserve(forward_traversals.size() + backward_traversals.size()); + + // copy the forward traversals from gbwt vectors to snarl traversals + for (int i = 0; i < forward_traversals.size(); ++i) { + traversals.emplace_back(); + for (auto j = forward_traversals[i].first.begin(); j != forward_traversals[i].first.end(); ++j) { + Visit* visit = traversals.back().add_visit(); + *visit = to_visit(gbwt::Node::id(*j), gbwt::Node::is_reverse(*j)); + } + if (return_paths) { + gbwt_paths.push_back(gbwt.locate(forward_traversals[i].second)); + } + } + + if (!backward_traversals.empty()) { + + // want to check we don't have the same element twice + std::sort(forward_traversals.begin(), forward_traversals.end(), + [&](const pair, gbwt::SearchState>& t1, + const pair, gbwt::SearchState>& t2) { + return t1.first < t2.first; }); + + // copy and reverse the backward traversals into the snarl traversals + for (int i = 0; i < backward_traversals.size(); ++i) { + + vector gbwt_path; + if (return_paths) { + gbwt_path = gbwt.locate(backward_traversals[i].second); + } + + // orient along the snarl + std::reverse(backward_traversals[i].first.begin(), backward_traversals[i].first.end()); + for (auto& gnode : backward_traversals[i].first) { + gnode = gbwt::Node::encode(gbwt::Node::id(gnode), !gbwt::Node::is_reverse(gnode)); + } + + // search in the forward traversals + auto si = std::lower_bound(forward_traversals.begin(), forward_traversals.end(), backward_traversals[i], + [&](const pair, gbwt::SearchState>& t1, + const pair, 
gbwt::SearchState>& t2) { + return t1.first < t2.first; }); + if (si != forward_traversals.end() && si->first == backward_traversals[i].first) { + // we found and exact forward match, just add in the paths + if (return_paths) { + size_t idx = si - forward_traversals.begin(); + gbwt_paths[idx].insert(gbwt_paths[idx].end(), gbwt_path.begin(), gbwt_path.end()); + } + } else { + // insert if not duplicate of existing forward traversal + traversals.emplace_back(); + for (auto j = backward_traversals[i].first.begin(); j != backward_traversals[i].first.end(); ++j) { + Visit* visit = traversals.back().add_visit(); + *visit = to_visit(gbwt::Node::id(*j), gbwt::Node::is_reverse(*j)); + } + if (return_paths) { + gbwt_paths.push_back(gbwt.locate(backward_traversals[i].second)); + } + } + } + } + return make_pair(traversals, gbwt_paths); +} + +vector GBWTTraversalFinder::find_traversals(const Snarl& site) { + return find_gbwt_traversals(site, false).first; +} + +pair, vector> GBWTTraversalFinder::find_path_traversals(const Snarl& site) { + // get the unique traversals + pair, vector>> gbwt_traversals = find_gbwt_traversals(site, true); + + // expand them out to one per path (this is to be consistent with PathTraversalFinder as used in deconstruct) + pair, vector> path_traversals; + for (size_t i = 0; i < gbwt_traversals.first.size(); ++i) { + SnarlTraversal& trav = gbwt_traversals.first[i]; + vector& paths = gbwt_traversals.second[i]; + for (size_t j = 0; j < paths.size(); ++j) { + path_traversals.first.push_back(trav); + path_traversals.second.push_back(paths[j]); + } + } + + return path_traversals; +} + } + + + diff --git a/src/traversal_finder.hpp b/src/traversal_finder.hpp index ffa1b534dce..190ff4547e0 100644 --- a/src/traversal_finder.hpp +++ b/src/traversal_finder.hpp @@ -15,7 +15,10 @@ #include #include #include -#include "vg.pb.h" + +#include + +#include #include "vg.hpp" #include "translator.hpp" #include "hash_map.hpp" @@ -24,6 +27,7 @@ #include "snarls.hpp" #include "path_index.hpp" #include "genotypekit.hpp" +#include "gbwt_helper.hpp" namespace vg { @@ -39,17 +43,17 @@ class TraversalFinder { public: virtual ~TraversalFinder() = default; - virtual vector find_traversals(const Snarl& site) = 0; + virtual vector find_traversals(const Snarl& site) = 0; }; class ExhaustiveTraversalFinder : public TraversalFinder { - VG& graph; + const HandleGraph& graph; SnarlManager& snarl_manager; bool include_reversing_traversals; public: - ExhaustiveTraversalFinder(VG& graph, SnarlManager& snarl_manager, + ExhaustiveTraversalFinder(const HandleGraph& graph, SnarlManager& snarl_manager, bool include_reversing_traversals = false); virtual ~ExhaustiveTraversalFinder(); @@ -61,10 +65,10 @@ class ExhaustiveTraversalFinder : public TraversalFinder { virtual vector find_traversals(const Snarl& site); protected: - void stack_up_valid_walks(NodeTraversal walk_head, vector& stack); - virtual bool visit_next_node(const Node*, const Edge*) { return true; } - void add_traversals(vector& traversals, NodeTraversal traversal_start, - set& stop_at, set& yield_at); + void stack_up_valid_walks(handle_t walk_head, vector& stack); + virtual bool visit_next_node(handle_t handle) { return true; } + void add_traversals(vector& traversals, handle_t traversal_start, + unordered_set& stop_at, unordered_set& yield_at); }; /** Does exhaustive traversal, but restricting to nodes and edges that meet @@ -126,6 +130,8 @@ class ReadRestrictedTraversalFinder : public TraversalFinder { * I'm not sure what PathBasedTraversalFinder (see 
below) does, but it does not work * as a drop-in replacement for this class, so keep the two implementations at least * for now. + * + * DEPRECATED: Use PathTraversalFinder instead */ class PathRestrictedTraversalFinder : public TraversalFinder { @@ -144,12 +150,16 @@ class PathRestrictedTraversalFinder : public TraversalFinder { // How many nodes max should we walk when checking if a path runs through a superbubble/site int max_path_search_steps; + + // Allow multiple traversals with the same sequence + bool allow_duplicates; public: PathRestrictedTraversalFinder(VG& graph, SnarlManager& snarl_manager, map& reads_by_name, int min_recurrence = 2, - int max_path_search_steps = 100); + int max_path_search_steps = 100, + bool allow_duplicates = false); virtual ~PathRestrictedTraversalFinder(); @@ -161,31 +171,71 @@ class PathRestrictedTraversalFinder : public TraversalFinder { * while those supported by actual embedded named paths are not. */ virtual vector find_traversals(const Snarl& site); + + /** + * Like above, but return the path name corresponding to each traversal + */ + virtual pair, vector > find_named_traversals(const Snarl& site); }; class PathBasedTraversalFinder : public TraversalFinder{ - vg::VG& graph; + const PathHandleGraph& graph; SnarlManager& snarlmanager; public: - PathBasedTraversalFinder(vg::VG& graph, SnarlManager& sm); + PathBasedTraversalFinder(const PathHandleGraph& graph, SnarlManager& sm); virtual ~PathBasedTraversalFinder() = default; virtual vector find_traversals(const Snarl& site); }; +/** This is a Handle Graph replacement for PathRestrictedTraversalFinder + * that uses the PathHandleGraph interface instead of the VG-based + * path index. It returns all traversals through a snarl that are contained + * within paths in the graph. It can also return a mapping from the traversals + * to their paths*/ +class PathTraversalFinder : public TraversalFinder { + +protected: + // our graph with indexed path positions + const PathHandleGraph& graph; + + SnarlManager& snarl_manager; + + // restrict to these paths + unordered_set paths; + +public: + // if path_names not empty, only those paths will be considered + PathTraversalFinder(const PathHandleGraph& graph, SnarlManager& snarl_manager, + const vector& path_names = {}); + + /** + * Return all traversals through the site that are sub-paths of embedded paths in the graph + */ + virtual vector find_traversals(const Snarl& site); + + /** + * Like above, but return the path steps for the for the traversal endpoints + */ + virtual pair, vector > > find_path_traversals(const Snarl& site); + +}; + + /** - * This traversal finder finds one or more traversals through leaf sites with no - * children. It uses a depth-first search. It doesn't work on non-leaf sites, - * and is not guaranteed to find all traversals. Only works on ultrabubbles. + * This traversal finder finds one or more traversals through leaf sites with + * no children. It uses a depth-first search. It doesn't work on non-leaf + * sites, and is not guaranteed to find all traversals. Only works on acyclic + * sites that are start-end-reachable. */ class TrivialTraversalFinder : public TraversalFinder { // Holds the vg graph we are looking for traversals in. 
- VG& graph; + const HandleGraph& graph; public: - TrivialTraversalFinder(VG& graph); + TrivialTraversalFinder(const HandleGraph& graph); virtual ~TrivialTraversalFinder() = default; @@ -205,7 +255,8 @@ class RepresentativeTraversalFinder : public TraversalFinder { protected: /// The annotated, augmented graph we're finding traversals in - AugmentedGraph& augmented; + const PathHandleGraph& graph; + /// The SnarlManager managiung the snarls we use SnarlManager& snarl_manager; @@ -220,6 +271,10 @@ class RepresentativeTraversalFinder : public TraversalFinder { size_t max_width; /// How many search intermediates can we allow? size_t max_bubble_paths; + /// Minimum support for a node to consider travnersal through it + size_t min_node_support; + /// Minimum support for a edge to consider travnersal through it + size_t min_edge_support; /** * Find a Path that runs from the start of the given snarl to the end, which @@ -245,37 +300,58 @@ class RepresentativeTraversalFinder : public TraversalFinder { * (including the reference node endpoints and their edges which aren't * stored in the path). */ - pair> find_bubble(Node* node, Edge* edge, const Snarl* snarl, PathIndex& index, + pair> find_bubble(id_t node, const edge_t* edge, const Snarl* snarl, PathIndex& index, const Snarl& site); /** - * Get the minimum support of all nodes and edges in path + * Get the minimum support of all nodes and edges in path, in the path's forward orientation. */ Support min_support_in_path(const list& path); /** * Do a breadth-first search left from the given node traversal, and return - * lengths and paths starting at the given node and ending on the given - * indexed path. Refuses to visit nodes with no support, if support data is - * available in the augmented graph. + * lengths, target-path-relative orientations, and paths starting at the + * given node and ending on the given indexed path. Refuses to visit nodes + * with no support, if support data is available in the augmented graph. + * + * If in_snarl is not null, restricts the found paths to stay within the + * given snarl. + * + * If both_orientations_distance is not zero, keeps searching up to that + * many steps after finding the target path to see if it can find a node on + * the target path in the opposite orientation. This is useful for + * inversions. */ - set>> bfs_left(Visit visit, PathIndex& index, bool stopIfVisited = false, - const Snarl* in_snarl = nullptr); + set>> bfs_left(Visit visit, PathIndex& index, + bool stop_if_visited = false, + const Snarl* in_snarl = nullptr, + size_t both_orientations_distance = 0); /** * Do a breadth-first search right from the given node traversal, and return - * lengths and paths starting at the given node and ending on the given - * indexed path. Refuses to visit nodes with no support, if support data is - * available in the augmented graph. + * lengths, target-path-relative orientations, and paths starting at the + * given node and ending on the given indexed path. Refuses to visit nodes + * with no support, if support data is available in the augmented graph. + * + * API is similar to bfs_left(). */ - set>> bfs_right(Visit visit, PathIndex& index, bool stopIfVisited = false, - const Snarl* in_snarl = nullptr); + set>> bfs_right(Visit visit, PathIndex& index, + bool stop_if_visited = false, + const Snarl* in_snarl = nullptr, + size_t both_orientations_distance = 0); /** * Get the length of a path through nodes, in base pairs. 
*/ - size_t bp_length(const list& path); - + size_t bp_length(const structures::ImmutableList& path); + + /** + * Get support + */ + bool has_supports = false; + function get_node_support; + function get_edge_support; + public: /** @@ -290,12 +366,22 @@ class RepresentativeTraversalFinder : public TraversalFinder { * Uses the given get_index function to try and find a PathIndex for a * reference path traversing a child snarl. */ - RepresentativeTraversalFinder(AugmentedGraph& augmented, SnarlManager& snarl_manager, - size_t max_depth, size_t max_width, size_t max_bubble_paths, - function get_index = [](const Snarl& s) { return nullptr; }); + RepresentativeTraversalFinder(const PathHandleGraph& graph, SnarlManager& snarl_manager, + size_t max_depth, size_t max_width, size_t max_bubble_paths, + size_t min_node_support = 1, size_t min_edge_support = 1, + function get_index = [](const Snarl& s) { return nullptr; }, + function get_node_support = nullptr, + function get_edge_support = nullptr); /// Should we emit verbose debugging info? bool verbose = false; + + /// Should trivial child snarls have their traversals glommed into ours? + bool eat_trivial_children = false; + + /// What timeout/step limit should we use for finding other orientations of + /// the reference path after we find one? + size_t other_orientation_timeout = 10; virtual ~RepresentativeTraversalFinder() = default; @@ -307,6 +393,267 @@ class RepresentativeTraversalFinder : public TraversalFinder { }; + +/** + * This TraversalFinder returns a traversals and their corresponding genotypes + * from an input vcf. It relies on alt-paths in the graph (via construct -a) + * to map between the vcf and the graph. + */ +class VCFTraversalFinder : public TraversalFinder { + +protected: + const PathHandleGraph& graph; + + /// Use this to check if our snarl runs through a reference path + /// (may be overkill, but can be used for sanity checking) + PathTraversalFinder path_finder; + + /// The SnarlManager managiung the snarls we use + SnarlManager& snarl_manager; + + /// Store variants indexed by an arbitrary node in one of their associated + /// alt paths. We can then use this to find all variants in a top-level snarl + unordered_map> node_to_variant; + + /// Use this method to prune the search space by selecting alt-alleles + /// to skip by considering their paths (in SnarlTraversal) format + /// It will try again and again until enough traversals are pruned, + /// with iteration keeping track of how many tries (so it should become stricter + /// as iteration increases) + function skip_alt; + + /// If a snarl has more than this many traversals, return nothing and print + /// a warning. Dense and large deletions will make this happen from time + /// to time. In practice, skip_alt (above) can be used to prune down + /// the search space by selecting alleles to ignore. + size_t max_traversal_cutoff; + + /// Maximum number of pruning iterations + size_t max_prune_iterations = 2; + + /// Include snarl endpoints in traversals + bool include_endpoints = true; + + /// How far to scan when looking for deletions + size_t max_deletion_scan_nodes = 50; + +public: + + /** + * Make a new VCFTraversalFinder. Builds the indexes needed to find all the + * variants in a site. + * + * The skip_alt() method is defined, it is run on the alt-path of each variant + * allele in the snarl. If it returns true, that alt-path will never be included + * in any traversals returned in find_traversals(). 
+ * This is used to, for example, use read support to prune the number of traversals + * that are enumerated. + */ + VCFTraversalFinder(const PathHandleGraph& graph, SnarlManager& snarl_manager, vcflib::VariantCallFile& vcf, + const vector& ref_path_names = {}, + FastaReference* fasta_ref = nullptr, + FastaReference* ins_ref = nullptr, + function skip_alt = nullptr, + size_t max_traversal_cutoff = 50000); + + virtual ~VCFTraversalFinder(); + + /** + * Find traversals for the site. Each traversa is returned in a pair with + * its haplotype. The haplotype refers to the list of variants (also returned) + */ + pair>>, vector> find_allele_traversals(Snarl site); + + /** + * Return a list of traversals for the site. The same traversals as above, only the + * haplotype information not included + */ + virtual vector find_traversals(const Snarl& site); + + /** + * Get all the variants that are contained in a site */ + vector get_variants_in_site(const Snarl& site); + + +protected: + + /** Load up all the variants into our node index + */ + void create_variant_index(vcflib::VariantCallFile& vcf, FastaReference* ref_fasta = nullptr, + FastaReference* ins_fasta = nullptr); + void delete_variant_index(); + + /** Get a traversal for every possible haplotype (but reference) + * in the most naive way possibe. This will blow up terribly for sites that contain more than a few + * variants. There's an obvious dynamic programming speedup, but the main issue is that + * the output size is exponential in the number of variants. + */ + void brute_force_alt_traversals(const Snarl& site, + const vector& site_variants, + path_handle_t ref_path, + step_handle_t start_step, + step_handle_t end_step, + vector > >& output_traversals); + + /** Get a traversal for a given haplotype. It gets all the nodes and edges from the alt + * paths, and greedily walks over them whenever possible (traversing the reference otherwise). + * if there is no traversal that can satisfy the haplotype, then the returned bool is set to false + */ + pair get_alt_traversal(const Snarl& site, + const vector& site_variants, + path_handle_t ref_path, + step_handle_t start_step, + step_handle_t end_step, + const vector& haplotype); + + /** Get a set of all alt path nodes and deletion edges for a halptype. + */ + pair, unordered_set >> + get_haplotype_alt_contents(const vector& site_variants, + const vector& haplotype, + path_handle_t ref_path); + + /** Get one alt-path out of the graph in the form of a snarl traversal. if the path is a deletion, + * the edges corresponding to the deletion are also returned. note that it is indeed possible + * for one alt path (and therefore one vcf alleles) to correspond to several deletion edges in the + * graph due to normalization during construction. + */ + pair> get_alt_path(vcflib::Variant* site_variant, int allele, path_handle_t ref_path); + + /** + * An alt path for a deletion is the deleted reference path. But sometimes vg construct doesn't + * write a deletion edge that exactly jumps over the alt path. In these cases, we need to + * search the graph for one. This does a brute-force check of all deletion edges in the vicinity + * for one that's the same size as the one we're looking for. + * It tries to find a set of nearyby deletions that match the desired length. + * Todo: check the sequence as well + * Also todo: It'd be really nice if construct -fa would make the deletion-edge easily inferrable + * from the alt path. It really shouldn't be necessary to hunt around. 
+ * Returns: + */ + pair> scan_for_deletion(vcflib::Variant* var, int allele, path_handle_t ref_path, + step_handle_t first_path_step, step_handle_t last_path_step); + + /** + * Prune our search space using the skip_alt method. Will return a list of pruned VCF alleles/ + * + * ex, if the input has A --> T + * G --> C,A + * there input alleles are <0,1>, <0,1,2>. If there's no support for the G->C on the second one, + * the output would be <0,1>, <0,2>. + * + */ + vector> get_pruned_alt_alleles(const Snarl& site, + const vector& site_variants, + path_handle_t ref_path); + + /** + * Count the possible traversal paths. Return false if we ever get beyond our cutoff + */ + bool check_max_trav_cutoff(const vector >& alleles); + + /** + * Lookup a node in the reference path (mimics old PathIndex) + */ + pair step_in_path(handle_t handle, path_handle_t path_handle) const; + +}; + +/** Finds traversals with the most flow. Node and edge weights are specified + * using the callbacks and can be used, ex, to yield read supports. + * If one traversal is requested, then the path with the highest flow (whose + * node or edge with the minimum weight is maximum) is returned. If K + * traversals are specified, then the K highest flow traversals are returned. + * This is designed to be a replacement for RepresentativeTraversalFinder. + * It should do a better job of enumerating off-reference traversals, and will + * of course guarantee to return all the optimal traversals (in the context of max flow). + * Unlike RepresentativeTraversalFinder, it does not currently support nested + * snarls, so all traversals returned are explicit. + * It is possible that it will blow up on massive snarls, espeically for large Ks. + */ +class FlowTraversalFinder : public TraversalFinder { + +protected: + const HandleGraph& graph; + + SnarlManager& snarl_manager; + + /// The K-best traversals are returned + size_t K; + + /// Callbacks to get supports + function node_weight_callback; + function edge_weight_callback; + +public: + + // if path_names not empty, only those paths will be considered + FlowTraversalFinder(const HandleGraph& graph, SnarlManager& snarl_manager, + size_t K, + function node_weight_callback, + function edge_weight_callback); + + /** + * Return the K widest (most flow) traversals through the site + * The reference traversal will be returned first (regardless of its flow). + * After, the traversals are listed in decreasing order + */ + virtual vector find_traversals(const Snarl& site); + + /** + * Return the K widest traversals, along with their flows + */ + virtual pair, vector> find_weighted_traversals(const Snarl& site, + bool greedy_avg = false, + const HandleGraph* overlay = nullptr); + + /// Set K + void setK(size_t k); + +}; + +/** Rerturn all traversals of a snarl that correspond to haplotypes stored in a GBWT + */ +class GBWTTraversalFinder : public TraversalFinder { + +protected: + + const HandleGraph& graph; + const gbwt::GBWT& gbwt; + +public: + + GBWTTraversalFinder(const HandleGraph& graph, const gbwt::GBWT& gbwt); + + virtual ~GBWTTraversalFinder(); + + /* Return a traversal for every gbwt thread through the snarl + */ + virtual vector find_traversals(const Snarl& site); + + /** Return the traversals, paired with their path identifiers in the gbwt. 
The traversals are + * unique, but there can be more than one path along each one (hence the vector) + */ + virtual pair, vector>> + find_gbwt_traversals(const Snarl& site, bool return_paths = true); + + /** Return traversals paired with path identifiers from the GBWT. The traversals are *not* unique + * (which is consistent with PathTraversalFinder) + * To get the sample name from the path identifier id, use gbwtgraph::get_path_sample_name(); + */ + virtual pair, vector> find_path_traversals(const Snarl& site); + + const gbwt::GBWT& get_gbwt() { return gbwt; } + +protected: + + /** + * Breadth first search from the start to the end, only branching if there's a haplotype + * in the GBWT, and returning all unique haplotypes found. + */ + vector, gbwt::SearchState> > get_spanning_haplotypes(handle_t start, handle_t end); +}; + } #endif diff --git a/src/traversal_support.cpp b/src/traversal_support.cpp new file mode 100644 index 00000000000..e0b6b372de1 --- /dev/null +++ b/src/traversal_support.cpp @@ -0,0 +1,628 @@ +#include "traversal_support.hpp" +#include "genotypekit.hpp" + +//#define debug + +namespace vg { + +TraversalSupportFinder::TraversalSupportFinder(const HandleGraph& graph, SnarlManager& snarl_manager) : + graph(graph), + snarl_manager(snarl_manager) { +} + +TraversalSupportFinder::~TraversalSupportFinder() { + +} + +int64_t TraversalSupportFinder::get_edge_length(const edge_t& edge, const unordered_map& ref_offsets) const { + int len = -1; + // use our reference traversal to try to come up with a deletion length for our edge + // idea: if our edge corresponds to a huge deltion, it should be weighted accordingly + auto s_it = ref_offsets.find(graph.get_id(edge.first)); + auto e_it = ref_offsets.find(graph.get_id(edge.second)); + if (s_it != ref_offsets.end() && e_it != ref_offsets.end()) { + size_t start_offset = s_it->second; + if (!graph.get_is_reverse(edge.first)) { + start_offset += graph.get_length(edge.first); + } + size_t end_offset = e_it->second; + if (graph.get_is_reverse(edge.second)) { + end_offset += graph.get_length(edge.second); + } + if (start_offset > end_offset) { + std::swap(start_offset, end_offset); + } + len = end_offset - start_offset; + } + return std::max(len, 1); +} + +tuple TraversalSupportFinder::get_child_support(const Snarl& snarl) const { + // port over old functionality from support caller + // todo: do we need to flag nodes as covered like it does? + pair, unordered_set > contents = snarl_manager.deep_contents(&snarl, graph, true); + Support child_max_support; + Support child_total_support; + size_t child_size = 0; + for (id_t node_id : contents.first) { + Support child_support = get_avg_node_support(node_id); + child_max_support = support_max(child_max_support, child_support); + child_size += graph.get_length(graph.get_handle(node_id)); + child_total_support += child_support; + } + Support child_avg_support = child_total_support / child_size; + // we always use child_max like the old support_caller. + // this is the only way to get top-down recursion to work in many cases + // todo: fix to use bottom up, get get support from actual traversals + // every time!! 
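// ------------------------------------------------------------------
// In plain numbers, get_child_support() above reduces a child snarl to a
// single value two ways: the maximum per-node support, and an "average"
// formed by summing the per-node supports and dividing by the child's
// total length in bases. Toy values only, no vg types:

#include <algorithm>
#include <iostream>
#include <vector>

struct NodeInfo { double avg_support; int length_bp; };

int main() {
    // three nodes inside a hypothetical child snarl
    std::vector<NodeInfo> child_nodes{{10.0, 5}, {4.0, 20}, {8.0, 2}};
    double max_support = 0.0, total_support = 0.0;
    long child_size = 0;
    for (const auto& n : child_nodes) {
        max_support   = std::max(max_support, n.avg_support);
        total_support += n.avg_support;   // summed per-node support...
        child_size    += n.length_bp;     // ...later divided by total bases
    }
    double avg_support = total_support / child_size;
    std::cout << "max="  << max_support    // 10
              << " avg=" << avg_support    // 22 / 27, roughly 0.81
              << " size=" << child_size << '\n';  // 27
}
// ------------------------------------------------------------------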
+ return std::tie(child_max_support, child_max_support, child_size); +} + + +Support TraversalSupportFinder::get_traversal_support(const SnarlTraversal& traversal) const { + return get_traversal_set_support({traversal}, {}, {}, {}, false, {}, {}).at(0); +} + +vector TraversalSupportFinder::get_traversal_genotype_support(const vector& traversals, + const vector& genotype, + const set& other_trav_subset, + int ref_trav_idx, + int* max_trav_size) { + set tgt_trav_set(genotype.begin(), genotype.end()); + vector tgt_travs(tgt_trav_set.begin(), tgt_trav_set.end()); + vector other_travs; + for (int i = 0; i < traversals.size(); ++i) { + if (!tgt_trav_set.count(i) && (other_trav_subset.empty() || other_trav_subset.count(i))) { + other_travs.push_back(i); + } + } + int max_trav_size_internal = 0; // size of longest traversal + if (max_trav_size == nullptr) { + max_trav_size = &max_trav_size_internal; + } + + // compute independent support of allele in the genotype + // todo: pass this in instead of recomputing. as there's no reason to compute this more than once + vector ind_allele_support = get_traversal_set_support(traversals, {}, {}, tgt_trav_set, false, {}, {}, ref_trav_idx, max_trav_size); + + // get the support of just the alleles in the genotype, but splitting support of nodes/edges they share + // the split is weighted by the total support of the alleles compute above. for node N and genotype A,B: + // so support(node N in allele A) = support(node N) * support(allele A) / (support(allele A + allele B)) + vector allele_support = get_traversal_set_support(traversals, genotype, ind_allele_support, tgt_trav_set, false, {}, {}, ref_trav_idx, max_trav_size); + + // get the support of everythin else, subtracting genotype supports, and splitting mutual supports + vector other_support = get_traversal_set_support(traversals, other_travs, {}, other_trav_subset, false, genotype, allele_support, ref_trav_idx, max_trav_size); + + // combine the above two vectors + for (int allele : tgt_travs) { + other_support[allele] = allele_support[allele]; + } + return other_support; +} + +vector TraversalSupportFinder::get_traversal_set_support(const vector& traversals, + const vector& shared_travs, + const vector& shared_support, + const set& tgt_travs, + bool exclusive_only, + const vector& exclusive_count_travs, + const vector& exclusive_count_support, + int ref_trav_idx, + int* max_trav_size) const { + + // exclusive_count_support corresponds to traversals + assert(exclusive_count_support.empty() || exclusive_count_support.size() == traversals.size()); + // shared support same as shared size + assert(shared_support.empty() || shared_support.size() == traversals.size()); + + // pass 1: how many times have we seen a node or edge + // share count (first): number of times a node is touched in {shared_travs} or. 
if shared_supports is given + // then these supports are used for weighting instead of just a count + // exclusive count (second): how much exclusive support from {exclusive_count_travs} do we subtract + unordered_map> node_counts; + unordered_map> edge_counts; + map> child_counts; + + // all the traversals we need for pass 1 {shared_travs U exclusive_count_travs} + set share_set(shared_travs.begin(), shared_travs.end()); + set exclu_set(exclusive_count_travs.begin(), exclusive_count_travs.end()); + set pre_set; + std::set_union(share_set.begin(), share_set.end(), exclu_set.begin(), exclu_set.end(), std::inserter(pre_set, pre_set.begin())); + + for (auto trav_idx : pre_set) { + const SnarlTraversal& trav = traversals[trav_idx]; + for (int i = 0; i < trav.visit_size(); ++i) { + const Visit& visit = trav.visit(i); + // keep track of exclusive support count for the node so we can subtract it + double evalue = 0.; + if (!exclusive_count_support.empty() && exclu_set.count(trav_idx)) { + evalue = support_val(exclusive_count_support[trav_idx]); + } + // keep track of shared support count for scaling + // each visit gets a count of 1 unless shared_support is given, in that case the value is take from + // that. the total value will be used to normalize while counting + double svalue = 0.; + if (share_set.count(trav_idx)) { + svalue = shared_support.empty() ? 1. : support_val(shared_support[trav_idx]); + } + if (visit.node_id() != 0) { + // Count the node once + if (node_counts.count(visit.node_id())) { + pair& counts = node_counts[visit.node_id()]; + counts.first += svalue; + counts.second += evalue; + } else { + node_counts[visit.node_id()] = make_pair(svalue, evalue); + } + } else { + // Count the child once + if (child_counts.count(visit.snarl())) { + pair& counts = child_counts[visit.snarl()]; + counts.first += svalue; + counts.second += evalue; + } else { + child_counts[visit.snarl()] = make_pair(svalue, evalue); + } + } + // note: there is no edge between adjacent snarls as they overlap + // on their endpoints. 
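// ------------------------------------------------------------------
// The sharing rule stated above, support(node N in allele A) =
// support(N) * support(A) / (support(A) + support(B)), is what this
// svalue bookkeeping feeds: pass 1 sums the alleles' supports over each
// shared node or edge, and pass 2 scales the node support by each
// allele's share. With toy numbers (not from any real data set):

#include <iostream>

int main() {
    double node_support = 30.0;  // support on a node shared by both alleles
    double support_A    = 20.0;  // independently computed support of allele A
    double support_B    = 10.0;  // independently computed support of allele B

    double share_A = node_support * support_A / (support_A + support_B);  // 20
    double share_B = node_support * support_B / (support_A + support_B);  // 10
    std::cout << "allele A gets " << share_A << ", allele B gets " << share_B << '\n';
}
// ------------------------------------------------------------------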
+ if (i > 0 && (trav.visit(i - 1).node_id() != 0 || trav.visit(i).node_id() != 0)) { + edge_t edge = to_edge(graph, trav.visit(i - 1), visit); + // Count the edge once + if (edge_counts.count(edge)) { + pair& counts = edge_counts[edge]; + counts.first += svalue; + counts.second += evalue; + } else { + edge_counts[edge] = make_pair(svalue, evalue); + } + } + } + } + + // pass 1.5: get index for looking up deletion edge lengths (so far we aren't dependent + // on having anything but a path handle graph, so we index on the fly) + unordered_map ref_offsets; + if (ref_trav_idx >= 0) { + ref_offsets = get_ref_offsets(traversals[ref_trav_idx]); + } + + // pass 2: get the supports + // we compute the various combinations of min/avg node/trav supports as we don't know which + // we will need until all the sizes are known + Support max_support; + max_support.set_forward(numeric_limits::max()); + vector min_supports_min(traversals.size(), max_support); // use min node support + vector min_supports_avg(traversals.size(), max_support); // use avg node support + vector has_support(traversals.size(), false); + vector tot_supports_min(traversals.size()); // weighted by lengths, using min node support + vector tot_supports_avg(traversals.size()); // weighted by lengths, using avg node support + vector tot_sizes(traversals.size(), 0); // to compute average from to_supports; + vector tot_sizes_all(traversals.size(), 0); // as above, but includes excluded lengths + int max_trav_size_internal = 0; // size of longest traversal + if (max_trav_size == nullptr) { + max_trav_size = &max_trav_size_internal; + } + + bool count_end_nodes = false; // toggle to include snarl ends + + auto update_support = [&] (int trav_idx, const Support& min_support, + const Support& avg_support, int length, pair share_count) { + // keep track of overall size of longest traversal + tot_sizes_all[trav_idx] += length; + *max_trav_size = std::max(tot_sizes_all[trav_idx], *max_trav_size); + + // apply the scaling + double scale_factor = 1.; + // scale zero when excluding + if (exclusive_only && share_count.first > 0) { + scale_factor = 0.; + } else if (share_count.first > 0.) { + if (shared_support.empty()) { + // our counts are just increments so we can divide by them; + scale_factor = 1. / share_count.first; + } else if (share_count.first > 0.) 
{ + // our counts are supports, so we need to normalize by the support + // scale factor is the support of the traversal over the total support of the node +#ifdef debug + cerr << " doing a scale factor of " << support_val(shared_support[trav_idx]) << " / " << share_count.first + << " where the min in is " << support_val(min_support) << endl; +#endif + scale_factor = support_val(shared_support[trav_idx]) / share_count.first; + } + } + + // when looking at exclusive support, we don't normalize by skipped lengths + if (scale_factor != 0 || !exclusive_only || !exclusive_count_support.empty()) { + has_support[trav_idx] = true; + Support scaled_support_min; + Support scaled_support_avg; + // apply the subtraction of the exclusive count supports + if (support_val(min_support) >= share_count.second) { + scaled_support_min.set_forward(support_val(min_support) - share_count.second); + } + if (support_val(avg_support) >= share_count.second) { + scaled_support_avg.set_forward(support_val(avg_support) - share_count.second); + } + // apply scaling from the shared counts + scaled_support_min *= scale_factor; + scaled_support_avg *= scale_factor; + + tot_supports_min[trav_idx] += scaled_support_min * length; + tot_supports_avg[trav_idx] += scaled_support_avg * length; + tot_sizes[trav_idx] += length; + min_supports_min[trav_idx] = support_min(min_supports_min[trav_idx], scaled_support_min); + min_supports_avg[trav_idx] = support_min(min_supports_avg[trav_idx], scaled_support_avg); + +#ifdef debug + cerr << "updating min support to " << pb2json(min_supports_min[trav_idx]) << endl; + cerr << "updating min avg support to " << pb2json(min_supports_avg[trav_idx]) << endl; +#endif + } + }; + + for (int trav_idx = 0; trav_idx < traversals.size(); ++trav_idx) { + // target_set filter here + if (!tgt_travs.empty() && !tgt_travs.count(trav_idx)) { + continue; + } +#ifdef debug + cerr << "Doing Trav " << trav_idx << endl; +#endif + const SnarlTraversal& trav = traversals[trav_idx]; + for (int visit_idx = 0; visit_idx < trav.visit_size(); ++visit_idx) { + const Visit& visit = trav.visit(visit_idx); + Support min_support; + Support avg_support; + int64_t length; + pair share_count = make_pair(0., 0.); + + if (visit.node_id() != 0) { + // get the node support + min_support = get_min_node_support(visit.node_id()); + avg_support = get_avg_node_support(visit.node_id()); + length = graph.get_length(graph.get_handle(visit.node_id())); + if (node_counts.count(visit.node_id())) { + share_count = node_counts[visit.node_id()]; + } + } else { + // get the child support + tie(min_support, avg_support, length) = get_child_support(visit.snarl()); + if (child_counts.count(visit.snarl())) { + share_count = child_counts[visit.snarl()]; + } + } + if (count_end_nodes || (visit_idx > 0 && visit_idx < trav.visit_size() - 1)) { + update_support(trav_idx, min_support, avg_support, length, share_count); + } + share_count = make_pair(0., 0.); + + if (visit_idx > 0 && (trav.visit(visit_idx - 1).node_id() != 0 || trav.visit(visit_idx).node_id() != 0)) { + // get the edge support + edge_t edge = to_edge(graph, trav.visit(visit_idx - 1), visit); + min_support = get_edge_support(edge); + length = get_edge_length(edge, ref_offsets); + if (edge_counts.count(edge)) { + share_count = edge_counts[edge]; + } + update_support(trav_idx, min_support, min_support, length, share_count); + } + } + } + + // correct for case where no exclusive support found + // or we're ignoring some traversals vg tgt_set interface + for (int i = 0; i < 
min_supports_min.size(); ++i) { + if (!has_support[i]) { + min_supports_min[i] = Support(); + min_supports_avg[i] = Support(); + } + } + + bool use_avg_trav_support = *max_trav_size >= average_traversal_support_switch_threshold; + bool use_avg_node_support = *max_trav_size >= average_node_support_switch_threshold; + + if (use_avg_trav_support) { + vector& tot_supports = use_avg_node_support ? tot_supports_avg : tot_supports_min; + for (int i = 0; i < tot_supports.size(); ++i) { + if (tot_sizes[i] > 0) { + tot_supports[i] /= (double)tot_sizes[i]; + } else { + tot_supports[i] = Support(); + } + } + if (min_bp_edge_override && ref_trav_idx >= 0) { + apply_min_bp_edge_override(traversals, tgt_travs, tot_supports, ref_trav_idx); + } + return tot_supports; + } else { + return use_avg_node_support ? min_supports_avg : min_supports_min; + } +} + +vector TraversalSupportFinder::get_traversal_sizes(const vector& traversals) const { + vector sizes(traversals.size(), 0); + for (int i = 0; i < traversals.size(); ++i) { + for (int j = 0; j < traversals[i].visit_size(); ++j) { + if (traversals[i].visit(j).node_id() != 0) { + sizes[i] += graph.get_length(graph.get_handle(traversals[i].visit(j).node_id())); + } else { + // just summing up the snarl contents, which isn't a great heuristic but will + // help in some cases + pair, unordered_set > contents = snarl_manager.deep_contents( + snarl_manager.into_which_snarl(traversals[i].visit(j)), graph, true); + for (id_t node_id : contents.first) { + sizes[i] += graph.get_length(graph.get_handle(node_id)); + } + } + } + } + return sizes; + +} + +vector TraversalSupportFinder::get_traversal_mapqs(const vector& traversals) const { + vector mapqs; + mapqs.reserve(traversals.size()); + for (int i = 0; i < traversals.size(); ++i) { + double total_mapq = 0; + double total_denominator = 0; + for (int j = 0; j < traversals[i].visit_size(); ++j) { + if (traversals[i].visit(j).node_id() != 0) { + double len = graph.get_length(graph.get_handle(traversals[i].visit(j).node_id())); + double sup = support_val(get_avg_node_support(traversals[i].visit(j).node_id())); + double mapq = get_avg_node_mapq(traversals[i].visit(j).node_id()); + total_mapq += mapq * len * sup; + total_denominator += len * sup; + } else { + // just summing up the snarl contents, which isn't a great heuristic but will + // help in some cases + pair, unordered_set > contents = snarl_manager.deep_contents( + snarl_manager.into_which_snarl(traversals[i].visit(j)), graph, true); + for (id_t node_id : contents.first) { + double len = graph.get_length(graph.get_handle(node_id)); + double sup = support_val(get_avg_node_support(node_id)); + double mapq = get_avg_node_mapq(node_id); + total_mapq += mapq * len * sup; + total_denominator += len * sup; + } + } + } + mapqs.push_back(total_denominator > 0 ? 
total_mapq / total_denominator : 0); + } + return mapqs; +} + +size_t TraversalSupportFinder::get_average_traversal_support_switch_threshold() const { + return average_traversal_support_switch_threshold; +} + +unordered_map TraversalSupportFinder::get_ref_offsets(const SnarlTraversal& ref_trav) const { + unordered_map ref_offsets; + size_t offset = 0; + for (int i = 0; i < ref_trav.visit_size(); ++i) { + const Visit& visit = ref_trav.visit(i); + if (visit.node_id() != 0) { + if (visit.backward()) { + offset += graph.get_length(graph.get_handle(visit.node_id())); + ref_offsets[visit.node_id()] = offset; + } else { + ref_offsets[visit.node_id()] = offset; + offset += graph.get_length(graph.get_handle(visit.node_id())); + } + } + } + return ref_offsets; +} + +void TraversalSupportFinder::set_support_switch_threshold(size_t trav_thresh, size_t node_thresh) { + average_traversal_support_switch_threshold = trav_thresh; + average_node_support_switch_threshold = node_thresh; +} + +void TraversalSupportFinder::set_min_bp_edge_override(bool bp_override) { + min_bp_edge_override = bp_override; +} + +void TraversalSupportFinder::apply_min_bp_edge_override(const vector& traversals, + const set& tgt_travs, + vector& supports, int ref_trav_idx) const { + assert(ref_trav_idx >=0 && ref_trav_idx < supports.size()); + + // define a breakpoint edge as one that joins a ref and a non-ref node + // to find them, we index the reference nodes + unordered_set ref_nodes; + const SnarlTraversal& ref_trav = traversals[ref_trav_idx]; + for (size_t i = 0; i < ref_trav.visit_size(); ++i) { + ref_nodes.insert(ref_trav.visit(i).node_id()); + } + + for (size_t i = 0; i < traversals.size(); ++i) { + if (tgt_travs.empty() || tgt_travs.count(i)) { + Support bp_edge_support; + bp_edge_support.set_forward(numeric_limits::max()); + const SnarlTraversal& trav = traversals[i]; + const Visit* prev_visit = nullptr; + for (size_t j = 0; j < trav.visit_size(); ++j) { + const Visit& visit = trav.visit(j); + if (j > 0) { + if (ref_nodes.count(visit.node_id()) != ref_nodes.count(prev_visit->node_id())) { + Support edge_support = get_edge_support(prev_visit->node_id(), prev_visit->backward(), + visit.node_id(), visit.backward()); + bp_edge_support = support_min(bp_edge_support, edge_support); + } + } + prev_visit = &visit; + } + // todo: parameterize + // in practice, just takingthe min makes things worse. 
so we hardcode a conservative cutoff + // here to prevent unsupported edges from leaking through + if (support_val(bp_edge_support) < 1) { + supports[i] = support_min(bp_edge_support, supports[i]); + } + } + } +} + +PackedTraversalSupportFinder::PackedTraversalSupportFinder(const Packer& packer, SnarlManager& snarl_manager) : + TraversalSupportFinder(*dynamic_cast(packer.get_graph()), snarl_manager), + packer(packer) { +} + +PackedTraversalSupportFinder::~PackedTraversalSupportFinder() { +} + +Support PackedTraversalSupportFinder::get_edge_support(const edge_t& edge) const { + return get_edge_support(graph.get_id(edge.first), graph.get_is_reverse(edge.first), + graph.get_id(edge.second), graph.get_is_reverse(edge.second)); +} + +Support PackedTraversalSupportFinder::get_edge_support(id_t from, bool from_reverse, + id_t to, bool to_reverse) const { + Edge proto_edge; + proto_edge.set_from(from); + proto_edge.set_from_start(from_reverse); + proto_edge.set_to(to); + proto_edge.set_to_end(to_reverse); + Support support; + support.set_forward(packer.edge_coverage(proto_edge)); + return support; +} + +Support PackedTraversalSupportFinder::get_min_node_support(id_t node) const { + Position pos; + pos.set_node_id(node); + size_t offset = packer.position_in_basis(pos); + size_t coverage = packer.coverage_at_position(offset); + size_t end_offset = offset + graph.get_length(graph.get_handle(node)); + for (int i = offset + 1; i < end_offset; ++i) { + coverage = min(coverage, packer.coverage_at_position(i)); + } + Support support; + support.set_forward(coverage); + return support; +} + +Support PackedTraversalSupportFinder::get_avg_node_support(id_t node) const { + Position pos; + pos.set_node_id(node); + size_t offset = packer.position_in_basis(pos); + size_t coverage = 0; + size_t length = graph.get_length(graph.get_handle(node)); + for (int i = 0; i < length; ++i) { + coverage += packer.coverage_at_position(offset + i); + } + Support support; + support.set_forward((double)coverage / (double)length); + return support; +} + +size_t PackedTraversalSupportFinder::get_avg_node_mapq(id_t node) const { + size_t offset = packer.node_index(node); + return packer.average_node_quality(offset); +} + + +CachedPackedTraversalSupportFinder::CachedPackedTraversalSupportFinder(const Packer& packer, SnarlManager& snarl_manager, size_t cache_size) : + PackedTraversalSupportFinder(packer, snarl_manager) { + size_t num_threads = get_thread_count(); + min_node_support_cache.resize(num_threads); + avg_node_support_cache.resize(num_threads); + edge_support_cache.resize(num_threads); + avg_node_mapq_cache.resize(num_threads); + for (size_t i = 0; i < num_threads; ++i) { + min_node_support_cache[i] = new LRUCache(cache_size); + avg_node_support_cache[i] = new LRUCache(cache_size); + edge_support_cache[i] = new LRUCache(cache_size); + avg_node_mapq_cache[i] = new LRUCache(cache_size); + } +} + +CachedPackedTraversalSupportFinder::~CachedPackedTraversalSupportFinder() { + for (size_t i = 0; i < min_node_support_cache.size(); ++i) { + delete min_node_support_cache[i]; + delete avg_node_support_cache[i]; + delete edge_support_cache[i]; + delete avg_node_mapq_cache[i]; + } +} + +Support CachedPackedTraversalSupportFinder::get_edge_support(id_t from, bool from_reverse, + id_t to, bool to_reverse) const { + const HandleGraph* graph = packer.get_graph(); + edge_t edge = graph->edge_handle(graph->get_handle(from, from_reverse), + graph->get_handle(to, to_reverse)); + + auto& support_cache = *edge_support_cache[omp_get_thread_num()]; 
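+ // Each OMP thread owns its own LRU cache (indexed by omp_get_thread_num()), so no locking is needed here.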
+ pair cached = support_cache.retrieve(edge); + if (cached.second == true) { + return cached.first; + } else { + Support support = PackedTraversalSupportFinder::get_edge_support(from, from_reverse, to, to_reverse); + support_cache.put(edge, support); + return support; + } +} + +Support CachedPackedTraversalSupportFinder::get_min_node_support(id_t node) const { + auto& support_cache = *min_node_support_cache[omp_get_thread_num()]; + pair cached = support_cache.retrieve(node); + if (cached.second == true) { + return cached.first; + } else { + Support support = PackedTraversalSupportFinder::get_min_node_support(node); + support_cache.put(node, support); + return support; + } +} + +Support CachedPackedTraversalSupportFinder::get_avg_node_support(id_t node) const { + auto& support_cache = *avg_node_support_cache[omp_get_thread_num()]; + pair cached = support_cache.retrieve(node); + if (cached.second == true) { + return cached.first; + } else { + Support support = PackedTraversalSupportFinder::get_avg_node_support(node); + support_cache.put(node, support); + return support; + } +} + +size_t CachedPackedTraversalSupportFinder::get_avg_node_mapq(id_t node) const { + auto& mapq_cache = *avg_node_mapq_cache[omp_get_thread_num()]; + pair cached = mapq_cache.retrieve(node); + if (cached.second == true) { + return cached.first; + } else { + size_t mapq = PackedTraversalSupportFinder::get_avg_node_mapq(node); + mapq_cache.put(node, mapq); + return mapq; + } + +} + +NestedCachedPackedTraversalSupportFinder::NestedCachedPackedTraversalSupportFinder(const Packer& packer, SnarlManager& snarl_manager, size_t cache_size) : + CachedPackedTraversalSupportFinder(packer, snarl_manager, cache_size) { + + snarl_manager.for_each_snarl_preorder([&](const Snarl* snarl) { + Support s; + child_support_map[*snarl] = make_tuple(s, s, 0); + }); +} + +NestedCachedPackedTraversalSupportFinder::~NestedCachedPackedTraversalSupportFinder() { +} + +tuple NestedCachedPackedTraversalSupportFinder::get_child_support(const Snarl& snarl) const { +#ifdef debug + cerr << "looking up support for " << pb2json(snarl) << " in " << &child_support_map << endl; + cerr << "in map = " << child_support_map.count(snarl) << endl; + cerr << "internal this is the state of our support map " << &child_support_map << endl; + for (auto xxx : child_support_map) { + cerr << pb2json(xxx.first) << endl; + } +#endif + return child_support_map.at(snarl); +} + +} diff --git a/src/traversal_support.hpp b/src/traversal_support.hpp new file mode 100644 index 00000000000..c9aa3373f04 --- /dev/null +++ b/src/traversal_support.hpp @@ -0,0 +1,212 @@ +#ifndef VG_TRAVERSAL_SUPPORT_HPP_INCLUDED +#define VG_TRAVERSAL_SUPPORT_HPP_INCLUDED + +#include +#include +#include +#include +#include +#include +#include +#include "handle.hpp" +#include "snarls.hpp" +#include "genotypekit.hpp" +#include "packer.hpp" + +namespace vg { + +using namespace std; + + +/** + * Get the read support of snarl traversals or sets of snarl traversals + */ +class TraversalSupportFinder { +public: + TraversalSupportFinder(const HandleGraph& graph, SnarlManager& snarl_manager); + virtual ~TraversalSupportFinder(); + + /// Support of an edge + virtual Support get_edge_support(const edge_t& edge) const = 0; + virtual Support get_edge_support(id_t from, bool from_reverse, id_t to, bool to_reverse) const = 0; + + /// Effective length of an edge + virtual int64_t get_edge_length(const edge_t& edge, const unordered_map& ref_offsets) const; + + /// Minimum support of a node + virtual Support 
get_min_node_support(id_t node) const = 0; + + /// Average support of a node + virtual Support get_avg_node_support(id_t node) const = 0; + + /// Average MAPQ of reads that map to a node + virtual size_t get_avg_node_mapq(id_t node) const = 0; + + /// Use node or edge support as proxy for child support (as was done in original calling code) + virtual tuple get_child_support(const Snarl& snarl) const; + + /// Get the support of a traversal + /// Child snarls are handled as in the old call code: their maximum support is used + virtual Support get_traversal_support(const SnarlTraversal& traversal) const; + + /// wrapper for using get_traversal_set_support to get the support for + /// some alleles in a genotype, where everything is split evently among them + /// anything not in the genotype gets a support using "exclusive_count" + /// where nodes taken by the genotype are counted as 0 + /// stuff not in the genotype is limited to other_trav_subset (or all if empty) + virtual vector get_traversal_genotype_support(const vector& traversals, + const vector& genotype, + const set& other_trav_subset, + int ref_trav_idx = -1, + int* max_trav_size = nullptr); + + /// traversals: get support for each traversal in this set + /// shared_travs: if a node appears N times in shared_travs, then it will count as 1 / (N+1) support + /// shared_support: optional supports for shared_travs. used to weight support split by traversal support. + /// tgt_travs: if not empty, only compute support for these traversals (remaining slots in output vector left 0) + /// eclusive_only: shared_travs are completely ignored + /// exclusive_count_travs: these traversals get subtracted from supports in the target traversals + /// exclusive_count_support: used with above, to determine amount of support to subtract + /// ref_trav_idx: index of reference traversal if known + /// max_trav_size: optional input of max trav size. 
useful when longest traversral is outside target set + virtual vector get_traversal_set_support(const vector& traversals, + const vector& shared_travs, + const vector& shared_support, + const set& tgt_travs, + bool exclusive_only, + const vector& exclusive_count_travs, + const vector& exclusive_count_support, + int ref_trav_idx = -1, + int* max_trav_size = nullptr) const; + + /// Get the total length of all nodes in the traversal + virtual vector get_traversal_sizes(const vector& traversals) const; + + /// Get the average MAPQ in each traversal + /// Only consider nodes + /// Normalize by base coverage (ie avg coverage / node by node length) + virtual vector get_traversal_mapqs(const vector& traversals) const; + + /// Get the average traversal support thresholdek + virtual size_t get_average_traversal_support_switch_threshold() const; + + /// Relic from old code + static double support_val(const Support& support) { return total(support); }; + + /// get a map of the beginning of a node (in forward orientation) on a traversal + /// used for up-weighting large deletion edges in complex snarls with average support + unordered_map get_ref_offsets(const SnarlTraversal& ref_trav) const; + + /// set the threshold + virtual void set_support_switch_threshold(size_t trav_thresh, size_t node_thresh); + + /// set the breakpoint stricter upper override + virtual void set_min_bp_edge_override(bool bp_override); + + /// apply the override to a set of traversals + virtual void apply_min_bp_edge_override(const vector& traversals, + const set& tgt_travs, + vector& supports, int ref_trav_idx) const; + +protected: + + size_t average_traversal_support_switch_threshold = 50; + /// Use average instead of minimum support when determining a node's support + /// its position supports. + size_t average_node_support_switch_threshold = 50; + + /// If on, always apply minimum edge support for breakpoint (ref->offref) edges + bool min_bp_edge_override = false; + + const HandleGraph& graph; + + SnarlManager& snarl_manager; + +}; + +/** + * Get the read support from a Packer object + */ +class PackedTraversalSupportFinder : public TraversalSupportFinder { +public: + PackedTraversalSupportFinder(const Packer& packer, SnarlManager& snarl_manager); + virtual ~PackedTraversalSupportFinder(); + + /// Support of an edge + virtual Support get_edge_support(const edge_t& edge) const; + virtual Support get_edge_support(id_t from, bool from_reverse, id_t to, bool to_reverse) const; + + /// Minimum support of a node + virtual Support get_min_node_support(id_t node) const; + + /// Average support of a node + virtual Support get_avg_node_support(id_t node) const; + + /// Average MAPQ of reads that map to a node + virtual size_t get_avg_node_mapq(id_t node) const; + +protected: + + /// Derive supports from this pack index + const Packer& packer; +}; + +/** + * Add a caching overlay to the PackedTravesalSupportFinder to avoid frequent + * base queries which can become expensive. 
Even caching the edges seems + * to have an impact + */ +class CachedPackedTraversalSupportFinder : public PackedTraversalSupportFinder { +public: + // good if cache_size lines up with FlowCaller::max_snarl_edges in graph_caller.hpp + CachedPackedTraversalSupportFinder(const Packer& packer, SnarlManager& snarl_manager, size_t cache_size = 500000); + virtual ~CachedPackedTraversalSupportFinder(); + + /// Support of an edge + virtual Support get_edge_support(id_t from, bool from_reverse, id_t to, bool to_reverse) const; + + /// Minimum support of a node + virtual Support get_min_node_support(id_t node) const; + + /// Average support of a node + virtual Support get_avg_node_support(id_t node) const; + + /// Average MAPQ of reads that map to a node + virtual size_t get_avg_node_mapq(id_t node) const; + +protected: + + /// One node cache per threade + mutable vector*> edge_support_cache; + mutable vector*> min_node_support_cache; + mutable vector*> avg_node_support_cache; + mutable vector*> avg_node_mapq_cache; +}; + +/** + * Add table to keep track of child snarl support that can be maintained by outside logic + */ +class NestedCachedPackedTraversalSupportFinder : public CachedPackedTraversalSupportFinder { +public: + NestedCachedPackedTraversalSupportFinder(const Packer& packer, SnarlManager& snarl_manager, size_t cache_size = 500000); + virtual ~NestedCachedPackedTraversalSupportFinder(); + + virtual tuple get_child_support(const Snarl& snarl) const; + + /** + * map used for get_child_support(). It's intialized for every snarl so that the values can be + * updated from different threads. + */ + // todo: why can't we use unordered_map -- there's a hash function in snarls.hpp + // perhaps we can switch to pointers but not so sure at moment + struct snarl_less { + inline bool operator()(const Snarl& s1, const Snarl& s2) const { + return s1.start() < s2.start() || (s1.start() == s2.start() && s1.end() < s2.end()); + } + }; + typedef map, snarl_less> SupportMap; + SupportMap child_support_map; +}; +} + +#endif diff --git a/src/tree_subgraph.cpp b/src/tree_subgraph.cpp new file mode 100644 index 00000000000..53d00435307 --- /dev/null +++ b/src/tree_subgraph.cpp @@ -0,0 +1,198 @@ +/** + * \file tree_subgraph.cpp + * Contains the implementation of the TreeSubgraph. + */ + + +#include "tree_subgraph.hpp" +#include +#include + +namespace vg { + +using namespace std; + + +TreeSubgraph::TreeSubgraph(const HandleGraph* super, vector>&& tree, size_t root_trim) : super(super), + tree(tree), root_trim(root_trim), children(tree.size()) { + + for (size_t i = 1; i < tree.size(); i++) { + // Populate children by scanning the tree vector. + // Don't scan the root because it can't be anyone's child. + + // Tell the parent of this tree node that it has us for a child. + children.at(tree[i].first).push_back(i); + } +} + +vector TreeSubgraph::get_topological_order() const { + vector to_return; + + for (size_t i = 0; i < tree.size(); i++) { + // The tree is already in topological order, so we just use that order. + to_return.push_back(handlegraph::number_bool_packing::pack(i, false)); + } + + return to_return; +} + +handle_t TreeSubgraph::get_root() const { + if (tree.empty()) { + throw runtime_error("Tree is empty and has no root"); + } + + // Return the handle for the 0th element in the tree. 
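+ // Packed index 0 corresponds to node ID 1 in this subgraph's local ID space.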
+ return handlegraph::number_bool_packing::pack(0, false); +} + +bool TreeSubgraph::has_node(id_t node_id) const { + return node_id > 0 && node_id <= tree.size(); +} + +handle_t TreeSubgraph::get_handle(const id_t& node_id, bool is_reverse) const { + return handlegraph::number_bool_packing::pack(node_id - 1, is_reverse); +} + +id_t TreeSubgraph::get_id(const handle_t& handle) const { + return handlegraph::number_bool_packing::unpack_number(handle) + 1; +} + +bool TreeSubgraph::get_is_reverse(const handle_t& handle) const { + return handlegraph::number_bool_packing::unpack_bit(handle); +} + +handle_t TreeSubgraph::flip(const handle_t& handle) const { + return handlegraph::number_bool_packing::toggle_bit(handle); +} + +size_t TreeSubgraph::get_length(const handle_t& handle) const { + // Get the length in the backing graph + size_t length = super->get_length(get_underlying_handle(handle)); + + if (get_id(handle) == 1 && root_trim != 0) { + // Trim the root as necessary + length -= root_trim; + } + + return length; +} + +string TreeSubgraph::get_sequence(const handle_t& handle) const { + // TODO: use get_subsequence to efficiently trim the root + + // Get the full backing sequence in the correct orientation to return + string sequence = super->get_sequence(get_underlying_handle(handle)); + + if (get_id(handle) == 1 && root_trim != 0) { + // Trim the root + + if (get_is_reverse(handle)) { + // We need to cut off the end + sequence = sequence.substr(0, sequence.size() - root_trim); + } else { + // We need to cut off the start + sequence = sequence.substr(root_trim); + } + } + + return sequence; +} + +bool TreeSubgraph::follow_edges_impl(const handle_t& handle, bool go_left, const function& iteratee) const { + // Work out our index in the tree vector + size_t index = get_id(handle) - 1; + // Work out if we want the parent or the child + bool find_parent = (go_left != get_is_reverse(handle)); + // Work out if we need to flip the result when we get there + bool flip_result = get_is_reverse(handle); + + if (find_parent) { + if (index == 0) { + // No parent of the root + return true; + } + + // Otherwise, just visit the one parent. + size_t result_index = tree.at(index).first; + return iteratee(get_handle(result_index + 1, flip_result)); + } else { + bool keep_going = true; + for (const size_t& result_index : children.at(index)) { + // Go through all the children + + keep_going &= iteratee(get_handle(result_index + 1, flip_result)); + + if (!keep_going) { + break; + } + } + return keep_going; + } + +} + +bool TreeSubgraph::for_each_handle_impl(const function& iteratee, bool parallel) const { + for (size_t i = 0; i < tree.size(); i++) { + // For each index, turn it into the corresponding ID and get the handle and run on it. + if (!iteratee(get_handle(i + 1, false))) { + // Asked to stop early. 
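+ // Propagate the early stop to the caller by reporting that iteration did not finish.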
+ return false; + } + } + return true; +} + +size_t TreeSubgraph::get_node_count() const { + return tree.size(); +} + +id_t TreeSubgraph::min_node_id() const { + return 1; +} + +id_t TreeSubgraph::max_node_id() const { + return tree.size(); +} + +handle_t TreeSubgraph::get_underlying_handle(const handle_t& handle) const { + size_t index = get_id(handle) - 1; + bool flip = get_is_reverse(handle); + + // Find the backing graph handle + handle_t to_return = tree.at(index).second; + + if (flip) { + // Flip it if necessary + to_return = super->flip(to_return); + } + + return to_return; +} + +Path TreeSubgraph::translate_down(const Path& path_against_subgraph) const { + // Copy the whole path + Path translated = path_against_subgraph; + + for (size_t i = 0; i < translated.mapping_size(); i++) { + // Get the handle in ourselves + handle_t visited = get_handle(translated.mapping(i).position().node_id(), translated.mapping(i).position().is_reverse()); + + // Translate it down + handle_t underlying = get_underlying_handle(visited); + + // Put its ID and orientation in. + translated.mutable_mapping(i)->mutable_position()->set_node_id(super->get_id(underlying)); + translated.mutable_mapping(i)->mutable_position()->set_is_reverse(super->get_is_reverse(underlying)); + + if (get_id(visited) == 1 && !get_is_reverse(visited) && root_trim != 0) { + // We're on the forward strand of the root, and the root needs trimming, so adjust the offset. + // If we are at 0 on the trimmed node, we are at root_trim on the untrimmed node. + translated.mutable_mapping(i)->mutable_position()->set_offset(translated.mapping(i).position().offset() + root_trim); + } + } + + return translated; +} + +} + diff --git a/src/tree_subgraph.hpp b/src/tree_subgraph.hpp new file mode 100644 index 00000000000..2fc1dce25e2 --- /dev/null +++ b/src/tree_subgraph.hpp @@ -0,0 +1,140 @@ +#ifndef VG_TREE_SUBGRAPH_HPP_INCLUDED +#define VG_TREE_SUBGRAPH_HPP_INCLUDED + +/** \file tree_subgraph.hpp + * Represents a subgraph of another graph with an internal tree topology. + */ + +#include "handle.hpp" +#include +#include +#include + +namespace vg { + +using namespace std; + + /** + * A HandleGraph implementation that represents a subgraph of another + * HandleGraph, defined by an internal tree whose nodes and edges are + * embedded in the nodes and edges of the backing graph. + * + * Useful for describing the haplotype tree embedded in a graph, radiating + * from a certain point. + * + * The tree is always exposed as rooted at the left, with child nodes + * radiating out on the right. The user has to re-orient the handles fed in + * to match that topology. + * + * The root handle can be trimmed on its left side. + * + * Supports translation of other Paths from this graph into the base graph. + */ + class TreeSubgraph : public handlegraph::ExpandingOverlayGraph { + public: + + /// Create a TreeSubgraph describing the subgraph of the given graph + /// defined by the given tree. The tree is stored as a vector of pairs + /// of (previous item number, base graph handle). + /// + /// The tree handles must be given reading from the root end of the + /// tree towards the branches. + /// + /// The tree must be topologically sorted, with the root at 0. The root + /// must point to -1. + /// + /// If given, root_trim specifies a number of bases to cut off of the + /// left side of the root handle. + /// + TreeSubgraph(const HandleGraph* super, vector>&& tree, size_t root_trim = 0); + + /// Get a topological order very easily, since the tree defines one. 
+ vector get_topological_order() const; + + /// Get a handle to the root of the tree, oriented towards the side with edges, if any. + /// + /// Throws an exception if the tree is empty and there is no root. + handle_t get_root() const; + + ////////////////////////// + /// HandleGraph interface + ////////////////////////// + + /// Method to check if a node exists by ID + virtual bool has_node(id_t node_id) const; + + /// Look up the handle for the node with the given ID in the given orientation + virtual handle_t get_handle(const id_t& node_id, bool is_reverse = false) const; + + /// Get the ID from a handle + virtual id_t get_id(const handle_t& handle) const; + + /// Get the orientation of a handle + virtual bool get_is_reverse(const handle_t& handle) const; + + /// Invert the orientation of a handle (potentially without getting its ID) + virtual handle_t flip(const handle_t& handle) const; + + /// Get the length of a node + virtual size_t get_length(const handle_t& handle) const; + + /// Get the sequence of a node, presented in the handle's local forward + /// orientation. + virtual string get_sequence(const handle_t& handle) const; + + protected: + /// Loop over all the handles to next/previous (right/left) nodes. Passes + /// them to a callback which returns false to stop iterating and true to + /// continue. Returns true if we finished and false if we stopped early. + virtual bool follow_edges_impl(const handle_t& handle, bool go_left, const function& iteratee) const; + + /// Loop over all the nodes in the graph in their local forward + /// orientations, in their internal stored order. Stop if the iteratee + /// returns false. Can be told to run in parallel, in which case stopping + /// after a false return value is on a best-effort basis and iteration + /// order is not defined. + virtual bool for_each_handle_impl(const function& iteratee, bool parallel = false) const; + + public: + /// Return the number of nodes in the graph + virtual size_t get_node_count() const; + + /// Return the smallest ID in the graph, or some smaller number if the + /// smallest ID is unavailable. Return value is unspecified if the graph is empty. + virtual id_t min_node_id() const; + + /// Return the largest ID in the graph, or some larger number if the + /// largest ID is unavailable. Return value is unspecified if the graph is empty. + virtual id_t max_node_id() const; + + ////////////////////////// + /// ExpandingOverlayGraph interface + ////////////////////////// + + /// Get the handle in the backing graph that the given handle in this graph represents. + virtual handle_t get_underlying_handle(const handle_t& handle) const; + + ////////////////////////// + /// Additional Interface + ////////////////////////// + + /// Translate a Path against us to a Path against the base graph + Path translate_down(const Path& path_against_subgraph) const; + + protected: + /// What graph are we based on? + const HandleGraph* super; + + /// What tree are we using in the backing graph? + /// Index in this vector corresponds to node ID in the projected graph. + vector> tree; + + /// How much of the root do we trim off? + size_t root_trim; + + /// For each node, what child indexes does it have? 
+ vector> children; + }; +} + +#endif diff --git a/src/types.hpp b/src/types.hpp index 40f8f876b70..1bf7ac2e246 100644 --- a/src/types.hpp +++ b/src/types.hpp @@ -1,26 +1,115 @@ #ifndef VG_TYPES_HPP_INCLUDED #define VG_TYPES_HPP_INCLUDED +#include +#include +#include #include +#include + /** \file - * Contains typedefs for basic types useful for talking about graphs. + * Contains typedefs for basic types useful for talking about graphs and + * basic operations using them. */ namespace vg { /// Represents a Node ID. /// ID type is a 64-bit signed int. -typedef int64_t id_t; +typedef handlegraph::nid_t id_t; /// Represents an offset along the sequence of a Node. /// Offsets are size_t. -typedef size_t off_t; +typedef size_t offset_t; /// Represents an oriented position on a Node. /// Position type: id, direction, offset. -typedef std::tuple pos_t; +/// Offset is counted as for as prorobuf Position, from the node's first base +/// on the forward strand, and from its last base on the reverse strand. +typedef std::tuple pos_t; + +/// Create a pos_t from a Node ID, an orientation flag, and an offset along that strand of the node. +inline pos_t make_pos_t(id_t id, bool is_rev, offset_t off) { + return std::make_tuple(id, is_rev, off); +} + +/// Extract the id of the node a pos_t is on. +inline id_t id(const pos_t& pos) { + return std::get<0>(pos); +} + +/// Return true if a pos_t is on the reverse strand of its node. +inline bool is_rev(const pos_t& pos) { + return std::get<1>(pos); +} + +/// Get the offset along the selected strand of the node from a pos_t. +inline offset_t offset(const pos_t& pos) { + return std::get<2>(pos); +} + +/// Get a reference to the Node ID of a pos_t. +inline id_t& get_id(pos_t& pos) { + return std::get<0>(pos); +} + +/// Get a reference to the reverse flag of a pos_t. +inline bool& get_is_rev(pos_t& pos) { + return std::get<1>(pos); +} + +/// Get a reference to the offset field of a pos_t, which counts along the selected strand of the node. +inline offset_t& get_offset(pos_t& pos) { + return std::get<2>(pos); +} + +/// Return true if a pos_t is unset. +inline bool is_empty(const pos_t& pos) { + return (id(pos) == 0); +} + +/// Get an unset pos_t +inline pos_t empty_pos_t() { + return {0, false, 0}; +} + +/// Reverse a pos_t and get a pos_t at the same **point between bases**, going the other direction. +/// To get a pos_t to the same *base*, subtract 1 from the resulting offset or call reverse_base_pos(). +inline pos_t reverse(const pos_t& pos, size_t node_length) { + pos_t rev = pos; + // swap the offset onto the other strand + get_offset(rev) = node_length - offset(rev); + // invert the position + get_is_rev(rev) = !is_rev(rev); + return rev; +} +/// Reverse a pos_t and get a pos_t at the same **base**, going the other direction. +inline pos_t reverse_base_pos(const pos_t& pos, size_t node_length) { + pos_t rev = pos; + // swap the offset onto the other strand + get_offset(rev) = (node_length - 1) - offset(rev); + // invert the position + get_is_rev(rev) = !is_rev(rev); + return rev; +} + +/// Print a pos_t to a stream. +inline std::ostream& operator<<(std::ostream& out, const pos_t& pos) { + return out << id(pos) << (is_rev(pos) ? 
"-" : "+") << offset(pos); +} + +} // namespace vg + +namespace std { + +inline string to_string(const vg::pos_t& pos) { + stringstream ss; + vg::operator<<(ss, pos); + return ss.str(); +} + } #endif diff --git a/src/unittest/aligner.cpp b/src/unittest/aligner.cpp index 76a5efcf2c4..6b0e5d7e5ef 100644 --- a/src/unittest/aligner.cpp +++ b/src/unittest/aligner.cpp @@ -1,4 +1,4 @@ -/// \file aligner.cpp +/// \file unittest/aligner.cpp /// /// Unit tests for the basic methods of the Aligner class. See also: /// pinned_alignment.cpp. @@ -6,11 +6,16 @@ #include #include -#include "../json2pb.h" -#include "../vg.pb.h" -#include "../gssw_aligner.hpp" + +#include "vg/io/json2pb.h" +#include +#include "vg.hpp" +#include "path.hpp" +#include "test_aligner.hpp" #include "catch.hpp" +#include + namespace vg { namespace unittest { using namespace std; @@ -19,9 +24,14 @@ TEST_CASE("Aligner respects the full length bonus at both ends", "[aligner][alig VG graph; - Aligner aligner_1(1, 4, 6, 1, 0); - Aligner aligner_2(1, 4, 6, 1, 10); + TestAligner aligner_source_1; + aligner_source_1.set_alignment_scores(1, 4, 6, 1, 0); + const Aligner& aligner_1 = *aligner_source_1.get_regular_aligner(); + TestAligner aligner_source_2; + aligner_source_2.set_alignment_scores(1, 4, 6, 1, 10); + const Aligner& aligner_2 = *aligner_source_2.get_regular_aligner(); + Node* n0 = graph.create_node("AGTG"); Node* n1 = graph.create_node("C"); Node* n2 = graph.create_node("A"); @@ -37,9 +47,9 @@ TEST_CASE("Aligner respects the full length bonus at both ends", "[aligner][alig aln1.set_sequence(read); aln2.set_sequence(read); - aligner_1.align(aln1, graph.graph, true, false); - aligner_2.align(aln2, graph.graph, true, false); - + aligner_1.align(aln1, graph, true); + aligner_2.align(aln2, graph, true); + SECTION("bonus is collected at both ends") { REQUIRE(aln2.score() == aln1.score() + 20); } @@ -50,8 +60,14 @@ TEST_CASE("Aligner respects the full length bonus for a single base read", "[ali VG graph; - Aligner aligner_1(1, 4, 6, 1, 0); - Aligner aligner_2(1, 4, 6, 1, 10); + TestAligner aligner_source_1; + aligner_source_1.set_alignment_scores(1, 4, 6, 1, 0); + const Aligner& aligner_1 = *aligner_source_1.get_regular_aligner(); + + TestAligner aligner_source_2; + aligner_source_2.set_alignment_scores(1, 4, 6, 1, 10); + const Aligner& aligner_2 = *aligner_source_2.get_regular_aligner(); + Node* n0 = graph.create_node("AGTG"); Node* n1 = graph.create_node("C"); @@ -68,8 +84,8 @@ TEST_CASE("Aligner respects the full length bonus for a single base read", "[ali aln1.set_sequence(read); aln2.set_sequence(read); - aligner_1.align(aln1, graph.graph, true, false); - aligner_2.align(aln2, graph.graph, true, false); + aligner_1.align(aln1, graph, true); + aligner_2.align(aln2, graph, true); SECTION("bonus is collected twice even though both ends are one match") { REQUIRE(aln2.score() == aln1.score() + 20); @@ -80,8 +96,14 @@ TEST_CASE("Aligner works when end bonus is granted to a match at the start of a VG graph; - Aligner aligner_1(1, 4, 6, 1, 0); - Aligner aligner_2(1, 4, 6, 1, 10); + TestAligner aligner_source_1; + aligner_source_1.set_alignment_scores(1, 4, 6, 1, 0); + const Aligner& aligner_1 = *aligner_source_1.get_regular_aligner(); + + TestAligner aligner_source_2; + aligner_source_2.set_alignment_scores(1, 4, 6, 1, 10); + const Aligner& aligner_2 = *aligner_source_2.get_regular_aligner(); + Node* n0 = graph.create_node("AGTG"); Node* n1 = graph.create_node("C"); @@ -99,8 +121,8 @@ TEST_CASE("Aligner works when end bonus is granted to a 
match at the start of a aln2.set_sequence(read); // Make sure aligner runs - aligner_1.align(aln1, graph.graph, true, false); - aligner_2.align(aln2, graph.graph, true, false); + aligner_1.align(aln1, graph, true); + aligner_2.align(aln2, graph, true); SECTION("bonus is collected twice") { REQUIRE(aln2.score() == aln1.score() + 20); @@ -110,8 +132,14 @@ TEST_CASE("Aligner works when end bonus is granted to a match at the start of a TEST_CASE("Full-length bonus can hold down the left end", "[aligner][alignment][mapping]") { VG graph; - Aligner aligner_1(1, 4, 6, 1, 0); - Aligner aligner_2(1, 4, 6, 1, 10); + + TestAligner aligner_source_1; + aligner_source_1.set_alignment_scores(1, 4, 6, 1, 0); + const Aligner& aligner_1 = *aligner_source_1.get_regular_aligner(); + + TestAligner aligner_source_2; + aligner_source_2.set_alignment_scores(1, 4, 6, 1, 10); + const Aligner& aligner_2 = *aligner_source_2.get_regular_aligner(); Node* n0 = graph.create_node("AGTGCTGAAGT"); @@ -120,10 +148,10 @@ TEST_CASE("Full-length bonus can hold down the left end", "[aligner][alignment][ aln1.set_sequence(read); aln2.set_sequence(read); - aligner_1.align(aln1, graph.graph, true, false); - aligner_2.align(aln2, graph.graph, true, false); + aligner_1.align(aln1, graph, true); + aligner_2.align(aln2, graph, true); - SECTION("left end is detatched without bonus") { + SECTION("left end is detached without bonus") { REQUIRE(aln1.path().mapping_size() == 1); REQUIRE(aln1.path().mapping(0).position().node_id() == n0->id()); REQUIRE(aln1.path().mapping(0).position().offset() == 2); @@ -145,8 +173,14 @@ TEST_CASE("Full-length bonus can hold down the left end", "[aligner][alignment][ TEST_CASE("Full-length bonus can hold down the right end", "[aligner][alignment][mapping]") { VG graph; - Aligner aligner_1(1, 4, 6, 1, 0); - Aligner aligner_2(1, 4, 6, 1, 10); + + TestAligner aligner_source_1; + aligner_source_1.set_alignment_scores(1, 4, 6, 1, 0); + const Aligner& aligner_1 = *aligner_source_1.get_regular_aligner(); + + TestAligner aligner_source_2; + aligner_source_2.set_alignment_scores(1, 4, 6, 1, 10); + const Aligner& aligner_2 = *aligner_source_2.get_regular_aligner(); Node* n0 = graph.create_node("AGTGCTGAAGT"); @@ -155,10 +189,10 @@ TEST_CASE("Full-length bonus can hold down the right end", "[aligner][alignment] aln1.set_sequence(read); aln2.set_sequence(read); - aligner_1.align(aln1, graph.graph, true, false); - aligner_2.align(aln2, graph.graph, true, false); + aligner_1.align(aln1, graph, true); + aligner_2.align(aln2, graph, true); - SECTION("right end is detatched without bonus") { + SECTION("right end is detached without bonus") { REQUIRE(aln1.path().mapping_size() == 1); REQUIRE(aln1.path().mapping(0).position().node_id() == n0->id()); REQUIRE(aln1.path().mapping(0).position().offset() == 0); @@ -182,8 +216,13 @@ TEST_CASE("Full-length bonus can attach Ns", "[aligner][alignment][mapping]") { VG graph; - Aligner aligner_1(1, 4, 6, 1, 0); - Aligner aligner_2(1, 4, 6, 1, 10); + TestAligner aligner_source_1; + aligner_source_1.set_alignment_scores(1, 4, 6, 1, 0); + const Aligner& aligner_1 = *aligner_source_1.get_regular_aligner(); + + TestAligner aligner_source_2; + aligner_source_2.set_alignment_scores(1, 4, 6, 1, 10); + const Aligner& aligner_2 = *aligner_source_2.get_regular_aligner(); Node* n0 = graph.create_node("AGTG"); Node* n1 = graph.create_node("C"); @@ -200,8 +239,8 @@ TEST_CASE("Full-length bonus can attach Ns", "[aligner][alignment][mapping]") { aln1.set_sequence(read); aln2.set_sequence(read); - 
aligner_1.align(aln1, graph.graph, true, false); - aligner_2.align(aln2, graph.graph, true, false); + aligner_1.align(aln1, graph, true); + aligner_2.align(aln2, graph, true); SECTION("bonused alignment ends in full-length match/mismatches") { REQUIRE(aln2.path().mapping_size() == 3); @@ -221,8 +260,13 @@ TEST_CASE("Full-length bonus can attach to Ns", "[aligner][alignment][mapping]") VG graph; - Aligner aligner_1(1, 4, 6, 1, 0); - Aligner aligner_2(1, 4, 6, 1, 10); + TestAligner aligner_source_1; + aligner_source_1.set_alignment_scores(1, 4, 6, 1, 0); + const Aligner& aligner_1 = *aligner_source_1.get_regular_aligner(); + + TestAligner aligner_source_2; + aligner_source_2.set_alignment_scores(1, 4, 6, 1, 10); + const Aligner& aligner_2 = *aligner_source_2.get_regular_aligner(); Node* n0 = graph.create_node("NNNG"); Node* n1 = graph.create_node("C"); @@ -239,8 +283,8 @@ TEST_CASE("Full-length bonus can attach to Ns", "[aligner][alignment][mapping]") aln1.set_sequence(read); aln2.set_sequence(read); - aligner_1.align(aln1, graph.graph, true, false); - aligner_2.align(aln2, graph.graph, true, false); + aligner_1.align(aln1, graph, true); + aligner_2.align(aln2, graph, true); SECTION("bonused alignment ends in full-length match/mismatches") { REQUIRE(aln2.path().mapping_size() == 3); @@ -260,8 +304,13 @@ TEST_CASE("Full-length bonus can attach Ns to Ns", "[aligner][alignment][mapping VG graph; - Aligner aligner_1(1, 4, 6, 1, 0); - Aligner aligner_2(1, 4, 6, 1, 10); + TestAligner aligner_source_1; + aligner_source_1.set_alignment_scores(1, 4, 6, 1, 0); + const Aligner& aligner_1 = *aligner_source_1.get_regular_aligner(); + + TestAligner aligner_source_2; + aligner_source_2.set_alignment_scores(1, 4, 6, 1, 10); + const Aligner& aligner_2 = *aligner_source_2.get_regular_aligner(); Node* n0 = graph.create_node("NNNG"); Node* n1 = graph.create_node("C"); @@ -278,8 +327,8 @@ TEST_CASE("Full-length bonus can attach Ns to Ns", "[aligner][alignment][mapping aln1.set_sequence(read); aln2.set_sequence(read); - aligner_1.align(aln1, graph.graph, true, false); - aligner_2.align(aln2, graph.graph, true, false); + aligner_1.align(aln1, graph, true); + aligner_2.align(aln2, graph, true); SECTION("bonused alignment ends in full-length match/mismatches") { REQUIRE(aln2.path().mapping_size() == 3); @@ -302,21 +351,24 @@ TEST_CASE("Full-length bonus is applied to both ends by rescoring", "[aligner][a Alignment aln; json2pb(aln, aln_str.c_str(), aln_str.size()); - // Make an aligner with a full lenth bonus of 5 - Aligner aligner1(1, 4, 6, 1, 5); - // And one with no bonus - Aligner aligner2(1, 4, 6, 1, 0); + TestAligner aligner_source_1; + aligner_source_1.set_alignment_scores(1, 4, 6, 1, 5); + const Aligner& aligner1 = *aligner_source_1.get_regular_aligner(); + + TestAligner aligner_source_2; + aligner_source_2.set_alignment_scores(1, 4, 6, 1, 0); + const Aligner& aligner2 = *aligner_source_2.get_regular_aligner(); REQUIRE(!softclip_start(aln)); REQUIRE(!softclip_end(aln)); // Normal score would be 129 - REQUIRE(aligner2.score_ungapped_alignment(aln) == 129); + REQUIRE(aligner2.score_contiguous_alignment(aln) == 129); // And with a full length bonus at each end it's 139. 
- REQUIRE(aligner1.score_ungapped_alignment(aln) == 139); + REQUIRE(aligner1.score_contiguous_alignment(aln) == 139); } -TEST_CASE("BaseAligner mapping quality estimation is robust", "[aligner][alignment][mapping][mapq]") { +TEST_CASE("GSSWAligner mapping quality estimation is robust", "[aligner][alignment][mapping][mapq]") { vector scaled_scores; size_t max_idx; @@ -327,25 +379,25 @@ TEST_CASE("BaseAligner mapping quality estimation is robust", "[aligner][alignme SECTION("a 1-element positive vector has its element chosen") { scaled_scores = {10}; - BaseAligner::maximum_mapping_quality_exact(scaled_scores, &max_idx); + GSSWAligner::maximum_mapping_quality_exact(scaled_scores, &max_idx); REQUIRE(max_idx == 0); } SECTION("a 1-element zero vector has its element chosen") { scaled_scores = {0}; - BaseAligner::maximum_mapping_quality_exact(scaled_scores, &max_idx); + GSSWAligner::maximum_mapping_quality_exact(scaled_scores, &max_idx); REQUIRE(max_idx == 0); } SECTION("a 1-element negative vector has its element chosen") { scaled_scores = {-10}; - BaseAligner::maximum_mapping_quality_exact(scaled_scores, &max_idx); + GSSWAligner::maximum_mapping_quality_exact(scaled_scores, &max_idx); REQUIRE(max_idx == 0); } SECTION("a multi-element vector has a maximal element chosen") { scaled_scores = {1, 5, 2, 5, 4}; - BaseAligner::maximum_mapping_quality_exact(scaled_scores, &max_idx); + GSSWAligner::maximum_mapping_quality_exact(scaled_scores, &max_idx); REQUIRE(max_idx >= 1); REQUIRE(max_idx != 2); REQUIRE(max_idx <= 3); @@ -358,34 +410,95 @@ TEST_CASE("BaseAligner mapping quality estimation is robust", "[aligner][alignme SECTION("a 1-element positive vector has its element chosen") { scaled_scores = {10}; - BaseAligner::maximum_mapping_quality_approx(scaled_scores, &max_idx); + GSSWAligner::maximum_mapping_quality_approx(scaled_scores, &max_idx); REQUIRE(max_idx == 0); } SECTION("a 1-element zero vector has its element chosen") { scaled_scores = {0}; - BaseAligner::maximum_mapping_quality_approx(scaled_scores, &max_idx); + GSSWAligner::maximum_mapping_quality_approx(scaled_scores, &max_idx); REQUIRE(max_idx == 0); } SECTION("a 1-element negative vector has its element chosen") { scaled_scores = {-10}; - BaseAligner::maximum_mapping_quality_approx(scaled_scores, &max_idx); + GSSWAligner::maximum_mapping_quality_approx(scaled_scores, &max_idx); REQUIRE(max_idx == 0); } SECTION("a multi-element vector has a maximal element chosen") { scaled_scores = {1, 5, 2, 5, 4}; - BaseAligner::maximum_mapping_quality_approx(scaled_scores, &max_idx); + GSSWAligner::maximum_mapping_quality_approx(scaled_scores, &max_idx); REQUIRE(max_idx >= 1); REQUIRE(max_idx != 2); REQUIRE(max_idx <= 3); } } - } - + +void check_mapping(const HandleGraph& graph, const Mapping& mapping, const handle_t& handle, size_t offset, size_t length) { + REQUIRE(mapping.position().node_id() == graph.get_id(handle)); + REQUIRE(mapping.position().is_reverse() == graph.get_is_reverse(handle)); + REQUIRE(mapping.position().offset() == offset); + REQUIRE(mapping.edit_size() == 1); + REQUIRE(mapping.edit(0).from_length() == length); + REQUIRE(mapping.edit(0).to_length() == length); + REQUIRE(mapping.edit(0).sequence().empty()); +} + +TEST_CASE("Aligner can align to a subgraph", "[aligner][alignment][mapping]") { + + // Create a graph with four nodes. 
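+ // They are joined into a cycle below; the two middle nodes spell GATTACAT, which contains the test read ATTACA (and TGTAAT on the reverse strand).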
+ bdsg::HashGraph graph; + std::vector handles; + handles.push_back(graph.create_handle("AAAA")); + handles.push_back(graph.create_handle("GATT")); + handles.push_back(graph.create_handle("ACAT")); + handles.push_back(graph.create_handle("AAAA")); + + // Make the graph a cycle. + for (size_t i = 0; i < handles.size(); i++) { + graph.create_edge(handles[i], handles[(i + 1) % handles.size()]); + } + + // We want to align to the two nodes in the middle. + std::unordered_set subgraph = { + static_cast(graph.get_id(handles[1])), + static_cast(graph.get_id(handles[2])) + }; + std::vector topological_order = { + handles[1], handles[2], graph.flip(handles[2]), graph.flip(handles[1]) + }; + + // Get an Aligner. + TestAligner aligner_source; + aligner_source.set_alignment_scores(1, 4, 6, 1, 0); + const Aligner& aligner = *(aligner_source.get_regular_aligner()); + + SECTION("Align to forward strand") { + Alignment alignment; + alignment.set_sequence("ATTACA"); + aligner.align(alignment, graph, topological_order); + + const Path& path = alignment.path(); + REQUIRE(path.mapping_size() == 2); + check_mapping(graph, path.mapping(0), topological_order[0], 1, 3); + check_mapping(graph, path.mapping(1), topological_order[1], 0, 3); + } + + SECTION("Align to reverse strand") { + Alignment alignment; + alignment.set_sequence("TGTAAT"); + aligner.align(alignment, graph, topological_order); + + const Path& path = alignment.path(); + REQUIRE(path.mapping_size() == 2); + check_mapping(graph, path.mapping(0), topological_order[2], 1, 3); + check_mapping(graph, path.mapping(1), topological_order[3], 0, 3); + } +} + } } diff --git a/src/unittest/alignment.cpp b/src/unittest/alignment.cpp index 62ab7e916e7..e45a5d4658a 100644 --- a/src/unittest/alignment.cpp +++ b/src/unittest/alignment.cpp @@ -5,9 +5,11 @@ #include #include -#include "../json2pb.h" -#include "../vg.pb.h" +#include "vg/io/json2pb.h" +#include #include "../alignment.hpp" +#include "../vg.hpp" +#include "../xg.hpp" #include "catch.hpp" namespace vg { @@ -109,6 +111,227 @@ TEST_CASE("Alignment trimming works even on unaligned reads", "[alignment]") { REQUIRE(a.sequence().size() - 100 == aln.sequence().size()); } + +TEST_CASE("Alignment normalization behaves as expected","[alignment]") { + string alignment_string = R"( + {"sequence":"ATNNNNANCT","path":{"mapping":[{"position":{"node_id":"1"},"edit":[{"from_length":"1","to_length":"1"},{"from_length":"1","to_length":"1"},{"from_length":"1","to_length":"1","sequence":"N"},{"from_length":"1","to_length":"1"},{"from_length":"2","to_length":"2","sequence":"NN"},{"from_length":"2","to_length":"2","sequence":"AN"},{"to_length":"1","sequence":"C"},{"to_length":"1","sequence":"T"},{"from_length":"1"},{"from_length":"1"}]}]}} + )"; + + string normalized_string = R"( + {"sequence":"ATNNNNANCT","path":{"mapping":[{"position":{"node_id":"1"},"edit":[{"from_length":"2","to_length":"2"},{"from_length":"4","to_length":"4","sequence":"NNNN"},{"from_length":"1","to_length":"1","sequence":"A"},{"from_length":"1","to_length":"1","sequence":"N"},{"to_length":"2","sequence":"CT"},{"from_length":"2"}]}]}} + )"; + + Alignment aln; + json2pb(aln, alignment_string.c_str(), alignment_string.size()); + Alignment target; + json2pb(target, normalized_string.c_str(), normalized_string.size()); + + normalize_alignment(aln); + + REQUIRE(aln.path().mapping_size() == target.path().mapping_size()); + for (size_t i = 0; i < aln.path().mapping_size(); i++) { + REQUIRE(aln.path().mapping(i).edit_size() == 
target.path().mapping(i).edit_size()); + for (size_t j = 0; j < target.path().mapping(i).edit_size(); j++) { + REQUIRE(aln.path().mapping(i).edit(j).from_length() == target.path().mapping(i).edit(j).from_length()); + REQUIRE(aln.path().mapping(i).edit(j).to_length() == target.path().mapping(i).edit(j).to_length()); + REQUIRE(aln.path().mapping(i).edit(j).sequence() == target.path().mapping(i).edit(j).sequence()); + } + } +} + + +TEST_CASE("Target to alignment extraction", "[target-to-aln]") { + + VG vg; + + Node* n0 = vg.create_node("CGA"); + Node* n1 = vg.create_node("TTGG"); + Node* n2 = vg.create_node("CCGT"); + Node* n3 = vg.create_node("C"); + Node* n4 = vg.create_node("GT"); + Node* n5 = vg.create_node("GATAA"); + Node* n6 = vg.create_node("CGG"); + Node* n7 = vg.create_node("ACA"); + Node* n8 = vg.create_node("GCCG"); + Node* n9 = vg.create_node("A"); + Node* n10 = vg.create_node("C"); + Node* n11 = vg.create_node("G"); + Node* n12 = vg.create_node("T"); + Node* n13 = vg.create_node("A"); + Node* n14 = vg.create_node("C"); + Node* n15 = vg.create_node("C"); + + vg.create_edge(n0, n1); + vg.create_edge(n2, n0, true, true); + vg.create_edge(n1, n3); + vg.create_edge(n2, n3); + vg.create_edge(n3, n4, false, true); + vg.create_edge(n4, n5, true, false); + vg.create_edge(n5, n6); + vg.create_edge(n8, n6, false, true); + vg.create_edge(n6, n7, false, true); + vg.create_edge(n7, n9, true, true); + vg.create_edge(n9, n10, true, false); + vg.create_edge(n10, n11, false, false); + vg.create_edge(n12, n11, false, true); + vg.create_edge(n13, n12, false, false); + vg.create_edge(n14, n13, true, false); + vg.create_edge(n15, n14, true, true); + + Graph graph = vg.graph; + + Path* path = graph.add_path(); + path->set_name("path"); + Mapping* mapping = path->add_mapping(); + mapping->mutable_position()->set_node_id(n0->id()); + mapping->set_rank(1); + mapping = path->add_mapping(); + mapping->mutable_position()->set_node_id(n2->id()); + mapping->set_rank(2); + mapping = path->add_mapping(); + mapping->mutable_position()->set_node_id(n3->id()); + mapping->set_rank(3); + mapping = path->add_mapping(); + mapping->mutable_position()->set_node_id(n4->id()); + mapping->mutable_position()->set_is_reverse(true); + mapping->set_rank(4); + mapping = path->add_mapping(); + mapping->mutable_position()->set_node_id(n5->id()); + mapping->set_rank(5); + mapping = path->add_mapping(); + mapping->mutable_position()->set_node_id(n6->id()); + mapping->set_rank(6); + mapping = path->add_mapping(); + mapping->mutable_position()->set_node_id(n8->id()); + mapping->mutable_position()->set_is_reverse(true); + mapping->set_rank(7); + + xg::XG xg_index; + xg_index.from_path_handle_graph(VG(graph)); + path_handle_t path_handle = xg_index.get_path_handle("path"); + + SECTION("Subpath getting gives us the expected 1bp alignment") { + Alignment target = target_alignment(&xg_index, path_handle, 1, 2, "feature", false); + REQUIRE(alignment_from_length(target) == 2 - 1); + } + + SECTION("Subpath getting gives us the expected 10bp alignment") { + Alignment target = target_alignment(&xg_index, path_handle, 10, 20, "feature", false); + REQUIRE(alignment_from_length(target) == 20 - 10); + } + + SECTION("Subpath getting gives us the expected 14bp alignment") { + Alignment target = target_alignment(&xg_index, path_handle, 0, 14, "feature", false); + REQUIRE(alignment_from_length(target) == 14); + } + + SECTION("Subpath getting gives us the expected 21bp alignment") { + Alignment target = target_alignment(&xg_index, path_handle, 0, 21, 
"feature", false); + REQUIRE(alignment_from_length(target) == 21); + } + + SECTION("Subpath getting gives us the expected inverted 7bp alignment") { + Alignment target = target_alignment(&xg_index, path_handle, 0, 7, "feature", true); + REQUIRE(alignment_from_length(target) == 7); + REQUIRE(target.path().mapping(0).position().node_id() == n2->id()); + REQUIRE(target.path().mapping(1).position().node_id() == n0->id()); + REQUIRE(target.path().mapping(0).position().is_reverse() == true); + REQUIRE(target.path().mapping(1).position().is_reverse() == true); + } + +} + +TEST_CASE("simplify_cigar merges runs of adjacent I's and D's in cigars", "[alignment][surject]") { + + vector> cigar{ + make_pair(2, 'D'), + make_pair(1, 'I'), + make_pair(4, 'D'), + make_pair(1, 'M'), + make_pair(3, 'I'), + make_pair(5, 'D'), + make_pair(1, 'I') + }; + + simplify_cigar(cigar); + REQUIRE(cigar.size() == 5); + bool consolidated_1 = ((cigar[0] == make_pair(6, 'D') && cigar[1] == make_pair(1, 'I')) + || (cigar[0] == make_pair(1, 'I') && cigar[1] == make_pair(6, 'D'))); + bool consolidated_2 = ((cigar[3] == make_pair(5, 'D') && cigar[4] == make_pair(4, 'I')) + || (cigar[3] == make_pair(4, 'I') && cigar[4] == make_pair(5, 'D'))); + + REQUIRE(consolidated_1); + REQUIRE(consolidated_2); +} + +TEST_CASE("simplify_cigar merges runs of adjacent operations and removes empty operations", "[alignment][surject]") { + + vector> cigar{ + make_pair(2, 'S'), + make_pair(1, 'M'), + make_pair(1, 'M'), + make_pair(0, 'D'), + make_pair(1, 'I'), + make_pair(1, 'M') + }; + + simplify_cigar(cigar); + REQUIRE(cigar.size() == 4); + REQUIRE(cigar[0] == make_pair(2, 'S')); + REQUIRE(cigar[1] == make_pair(2, 'M')); + REQUIRE(cigar[2] == make_pair(1, 'I')); + REQUIRE(cigar[3] == make_pair(1, 'M')); +} + +TEST_CASE("Inter-alignment distance computation for HTS output formats matches BWA", "[alignment]") { + // See https://github.com/vgteam/vg/issues/3078. We want to match BWA on + // these straightforward, fully-matching reads. 
+ auto lengths = compute_template_lengths(10206220, {{151, 'M'}}, 10206662, {{151, 'M'}}); + REQUIRE(lengths.first == 593); + REQUIRE(lengths.second == -593); +} + +TEST_CASE("CIGAR generation forces adjacent insertions and deletions to obey GATK's constraints", "[alignment]") { + // See https://github.com/vgteam/vg/issues/3080 + vector> cigar; + + SECTION("DID becomes DI") { + append_cigar_operation(1, 'D', cigar); + append_cigar_operation(5, 'I', cigar); + append_cigar_operation(2, 'D', cigar); + + REQUIRE(cigar.size() == 2); + REQUIRE(cigar[0].first == 3); + REQUIRE(cigar[0].second == 'D'); + REQUIRE(cigar[1].first == 5); + REQUIRE(cigar[1].second == 'I'); + } + + SECTION("MMIDIIDDIMM becomes MDIM") { + append_cigar_operation(1, 'M', cigar); + append_cigar_operation(1, 'M', cigar); + append_cigar_operation(1, 'I', cigar); + append_cigar_operation(1, 'D', cigar); + append_cigar_operation(1, 'I', cigar); + append_cigar_operation(1, 'I', cigar); + append_cigar_operation(1, 'D', cigar); + append_cigar_operation(1, 'D', cigar); + append_cigar_operation(1, 'I', cigar); + append_cigar_operation(1, 'M', cigar); + append_cigar_operation(1, 'M', cigar); + + REQUIRE(cigar.size() == 4); + REQUIRE(cigar[0].first == 2); + REQUIRE(cigar[0].second == 'M'); + REQUIRE(cigar[1].first == 3); + REQUIRE(cigar[1].second == 'D'); + REQUIRE(cigar[2].first == 4); + REQUIRE(cigar[2].second == 'I'); + REQUIRE(cigar[3].first == 2); + REQUIRE(cigar[3].second == 'M'); + } + +} } } diff --git a/src/unittest/annotation.cpp b/src/unittest/annotation.cpp index fe01c6bfcb9..de4d31d0dcb 100644 --- a/src/unittest/annotation.cpp +++ b/src/unittest/annotation.cpp @@ -5,8 +5,8 @@ #include #include -#include "../json2pb.h" -#include "../vg.pb.h" +#include "vg/io/json2pb.h" +#include #include "../annotation.hpp" #include "catch.hpp" @@ -76,6 +76,16 @@ TEST_CASE("Multi-value annotations can be set and gotten and cleared", "[alignme REQUIRE(recovered.size() == 0); } + +TEST_CASE("Annotations convert to JSON sensibly", "[alignment][annotation]") { + + Alignment aln; + set_annotation(&aln, "snake_case_number", 1.5); + + string json = pb2json(aln); + + REQUIRE(json == R"({"annotation": {"snake_case_number": 1.5}})"); +} } } diff --git a/src/unittest/back_translate.cpp b/src/unittest/back_translate.cpp new file mode 100644 index 00000000000..01929b6c6d4 --- /dev/null +++ b/src/unittest/back_translate.cpp @@ -0,0 +1,374 @@ +/// \file alignment.cpp +/// +/// unit tests for Alignments and their utility functions +/// + +#include +#include +#include +#include "vg/io/json2pb.h" +#include +#include +#include +#include "../algorithms/back_translate.hpp" +#include "catch.hpp" +#include "alignment.hpp" + +namespace vg { +namespace unittest { +using namespace std; + +class MockBackTranslation : public NamedNodeBackTranslation { +public: + + unordered_map> translation; + unordered_map node_names; + + /** + * Translate the given range of bases on the given orientation of the given + * node in the current graph, to zero or more ranges on orientations of + * nodes in some prior graph. + */ + std::vector translate_back(const oriented_node_range_t& range) const { + return translation.at(range); + } + + /** + * Get the name of a node in the graph that translate_back() translates + * into, given its number. 
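+ * The tests below use these names in place of numeric node IDs in the translated paths.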
+ */ + std::string get_back_graph_node_name(const nid_t& back_node_id) const { + return node_names.at(back_node_id); + } + + +}; + +TEST_CASE("A Path can be back-translated", "[algorithms][back_translate]") { + + // Make a path + string path_string = R"( + { + "mapping": [ + { + "position": {"node_id": 1}, + "edit": [ + {"from_length": 1, "to_length": 1}, + {"from_length": 1} + ] + }, + { + "position": {"node_id": 2}, + "edit": [ + {"from_length": 1}, + {"from_length": 1, "to_length": 1} + ] + } + ] + } + )"; + Path p; + json2pb(p, path_string.c_str(), path_string.size()); + + // Define a translation back to a named node space + MockBackTranslation trans; + trans.translation[oriented_node_range_t(1, false, 0, 2)] = {oriented_node_range_t(1, false, 5, 2)}; + trans.translation[oriented_node_range_t(2, false, 0, 2)] = {oriented_node_range_t(1, false, 7, 2)}; + trans.node_names[1] = "TheNode"; + + // Translate the path + vg::algorithms::back_translate_in_place(&trans, p); + + // Check the result + REQUIRE(p.mapping_size() == 1); + REQUIRE(p.mapping(0).edit_size() == 3); + REQUIRE(p.mapping(0).position().node_id() == 0); + REQUIRE(p.mapping(0).position().name() == "TheNode"); + REQUIRE(p.mapping(0).position().offset() == 5); +} + +TEST_CASE("A Path can be back-translated properly when it goes around cycles", "[algorithms][back_translate]") { + + // Make a path + string path_string = R"( + { + "mapping": [ + { + "position": {"node_id": 1}, + "edit": [ + {"from_length": 10, "to_length": 10} + ] + }, + { + "position": {"node_id": 1}, + "edit": [ + {"from_length": 10, "to_length": 10} + ] + }, + { + "position": {"node_id": 1}, + "edit": [ + {"from_length": 10, "to_length": 10} + ] + }, + { + "position": {"node_id": 1, "is_reverse": true}, + "edit": [ + {"from_length": 10, "to_length": 10} + ] + }, + { + "position": {"node_id": 2}, + "edit": [ + {"from_length": 5, "to_length": 5} + ] + }, + { + "position": {"node_id": 3}, + "edit": [ + {"from_length": 5, "to_length": 5} + ] + }, + { + "position": {"node_id": 2}, + "edit": [ + {"from_length": 5, "to_length": 5} + ] + }, + { + "position": {"node_id": 3}, + "edit": [ + {"from_length": 5, "to_length": 5} + ] + }, + { + "position": {"node_id": 2}, + "edit": [ + {"from_length": 5, "to_length": 5} + ] + } + ] + } + )"; + Path p; + json2pb(p, path_string.c_str(), path_string.size()); + + // Define a translation back to a named node space. 
+ // We have node 1 just be as is and nodes 2 and 3 come from two halves of segment 2 + MockBackTranslation trans; + trans.translation[oriented_node_range_t(1, false, 0, 10)] = {oriented_node_range_t(1, false, 0, 10)}; + trans.translation[oriented_node_range_t(1, true, 0, 10)] = {oriented_node_range_t(1, true, 0, 10)}; + trans.translation[oriented_node_range_t(2, false, 0, 5)] = {oriented_node_range_t(2, false, 0, 5)}; + trans.translation[oriented_node_range_t(3, false, 0, 5)] = {oriented_node_range_t(2, false, 5, 5)}; + trans.node_names[1] = "A"; + trans.node_names[2] = "B"; + + // Translate the path + vg::algorithms::back_translate_in_place(&trans, p); + + // Check the result + REQUIRE(p.mapping_size() == 7); + REQUIRE(p.mapping(0).edit_size() == 1); + REQUIRE(p.mapping(0).edit(0).from_length() == 10); + REQUIRE(p.mapping(0).position().name() == "A"); + REQUIRE(p.mapping(0).position().offset() == 0); + REQUIRE(p.mapping(0).position().is_reverse() == false); + + REQUIRE(p.mapping(1).edit_size() == 1); + REQUIRE(p.mapping(1).edit(0).from_length() == 10); + REQUIRE(p.mapping(1).position().name() == "A"); + REQUIRE(p.mapping(1).position().offset() == 0); + REQUIRE(p.mapping(1).position().is_reverse() == false); + + REQUIRE(p.mapping(2).edit_size() == 1); + REQUIRE(p.mapping(2).edit(0).from_length() == 10); + REQUIRE(p.mapping(2).position().name() == "A"); + REQUIRE(p.mapping(2).position().offset() == 0); + REQUIRE(p.mapping(2).position().is_reverse() == false); + + REQUIRE(p.mapping(3).edit_size() == 1); + REQUIRE(p.mapping(3).edit(0).from_length() == 10); + REQUIRE(p.mapping(3).position().name() == "A"); + REQUIRE(p.mapping(3).position().offset() == 0); + REQUIRE(p.mapping(3).position().is_reverse() == true); + + REQUIRE(p.mapping(4).edit_size() == 1); + REQUIRE(p.mapping(4).edit(0).from_length() == 10); + REQUIRE(p.mapping(4).position().name() == "B"); + REQUIRE(p.mapping(4).position().offset() == 0); + REQUIRE(p.mapping(4).position().is_reverse() == false); + + REQUIRE(p.mapping(5).edit_size() == 1); + REQUIRE(p.mapping(5).edit(0).from_length() == 10); + REQUIRE(p.mapping(5).position().name() == "B"); + REQUIRE(p.mapping(5).position().offset() == 0); + REQUIRE(p.mapping(5).position().is_reverse() == false); + + REQUIRE(p.mapping(6).edit_size() == 1); + REQUIRE(p.mapping(6).edit(0).from_length() == 5); + REQUIRE(p.mapping(6).position().name() == "B"); + REQUIRE(p.mapping(6).position().offset() == 0); + REQUIRE(p.mapping(6).position().is_reverse() == false); +} + +TEST_CASE("An Alignment can be back-translated while converting to GAF", "[algorithms][back_translate]") { + + bdsg::HashGraph g; + + handle_t h1 = g.create_handle("G"); + handle_t h2 = g.create_handle("GGGG"); + handle_t h3 = g.create_handle("AT"); + handle_t h4 = g.create_handle("ACACAAA"); + handle_t h5 = g.create_handle("A"); + + g.create_edge(h1, h2); + g.create_edge(h2, h3); + g.create_edge(h3, h4); + g.create_edge(h4, h5); + + // Define a translation back to a named node space + MockBackTranslation trans; + trans.translation[oriented_node_range_t(2, false, 2, 2)] = {oriented_node_range_t(1, false, 3, 2)}; + trans.translation[oriented_node_range_t(3, false, 0, 2)] = {oriented_node_range_t(1, false, 5, 2)}; + trans.translation[oriented_node_range_t(4, false, 0, 5)] = {oriented_node_range_t(2, false, 0, 5)}; + // Make sure to include reverse strand translation info for segment length sniffing. 
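+    // (These zero-length reverse-strand entries are presumably what the GAF
+    // writer queries in order to work out the length of each back-translated
+    // segment.)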
+ trans.translation[oriented_node_range_t(2, true, 0, 0)] = {oriented_node_range_t(1, true, 2, 0)}; + trans.translation[oriented_node_range_t(3, true, 0, 0)] = {oriented_node_range_t(1, true, 0, 0)}; + trans.translation[oriented_node_range_t(4, true, 0, 0)] = {oriented_node_range_t(2, true, 1, 0)}; + trans.node_names[1] = "FirstSegment"; + trans.node_names[2] = "SecondSegment"; + + + string alignment_string = R"( + { + "name": "francine", + "mapping_quality": 30, + "sequence": "GATTACA", + "path": {"mapping": [ + { + "position": {"node_id": 2, "offset": 2}, + "edit": [ + {"from_length": 1, "to_length": 1}, + {"from_length": 1} + ] + }, + { + "position": {"node_id": 3}, + "edit": [ + {"from_length": 1, "to_length": 1}, + {"to_length": 1, "sequence": "T"}, + {"from_length": 1, "to_length": 1} + ] + }, + { + "position": {"node_id": 4}, + "edit": [ + {"from_length": 1, "to_length": 1}, + {"from_length": 2}, + {"from_length": 2, "to_length": 2} + ] + } + ]} + } + )"; + + Alignment a; + json2pb(a, alignment_string.c_str(), alignment_string.size()); + + REQUIRE(alignment_from_length(a) == 9); + REQUIRE(alignment_to_length(a) == 7); + size_t block_length = std::max(alignment_from_length(a), alignment_to_length(a)); + + SECTION("Translating GAF generation produces the right GAF") { + auto node_space = vg::io::alignment_to_gaf(g, a); + stringstream s; + s << node_space; + + // See column definitions at https://github.com/lh3/gfatools/blob/master/doc/rGFA.md#the-graph-alignment-format-gaf + // Alignment block length is longest involved sequence. + // Note that we combine adjacent duplicate operations in cs across node boundaries. + // Note that end position is 0-based inclusive + // Path length is 13 bp, and we run [2 to 11) (9 bases) + REQUIRE(s.str() == "francine\t7\t0\t7\t+\t>2>3>4\t13\t2\t11\t6\t" + std::to_string(block_length) + "\t30\tcs:Z::1-G:1+T:2-CA:2"); + } + + SECTION("Translating GAF generation with a translation produces the right GAF") { + auto back_translated = vg::io::alignment_to_gaf(g, a, &trans); + stringstream s; + s << back_translated; + + REQUIRE(s.str() == "francine\t7\t0\t7\t+\t>FirstSegment>SecondSegment\t15\t3\t12\t6\t" + std::to_string(block_length) + "\t30\tcs:Z::1-G:1+T:2-CA:2"); + } +} + +TEST_CASE("A reverse-strand Alignment can be back-translated while converting to GAF", "[algorithms][back_translate]") { + + bdsg::HashGraph g; + + handle_t h1 = g.create_handle("G"); + handle_t h2 = g.create_handle("GGGG"); + handle_t h3 = g.create_handle("AT"); + handle_t h4 = g.create_handle("ACACAAA"); + handle_t h5 = g.create_handle("A"); + + g.create_edge(h1, h2); + g.create_edge(h2, h3); + g.create_edge(h3, h4); + g.create_edge(h4, h5); + + // Define a translation back to a named node space + MockBackTranslation trans; + trans.translation[oriented_node_range_t(3, true, 1, 1)] = {oriented_node_range_t(1, true, 1, 1)}; + trans.translation[oriented_node_range_t(2, true, 0, 2)] = {oriented_node_range_t(1, true, 2, 2)}; + // Make sure to include reverse strand translation info for segment length sniffing. 
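+    // Both graph nodes used by this read come from the 7 bp FirstSegment, so
+    // the whole alignment should back-translate onto that single segment.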
+ trans.translation[oriented_node_range_t(2, false, 0, 0)] = {oriented_node_range_t(1, false, 1, 0)}; + trans.translation[oriented_node_range_t(3, false, 0, 0)] = {oriented_node_range_t(1, false, 5, 0)}; + trans.node_names[1] = "FirstSegment"; + trans.node_names[2] = "SecondSegment"; + + + string alignment_string = R"( + { + "name": "steve", + "mapping_quality": 30, + "sequence": "TCC", + "path": {"mapping": [ + { + "position": {"node_id": 3, "offset": 1, "is_reverse": true}, + "edit": [ + {"from_length": 1, "to_length": 1} + ] + }, + { + "position": {"node_id": 2, "is_reverse": true}, + "edit": [ + {"from_length": 2, "to_length": 2} + ] + } + ]} + } + )"; + + // We take up 3 bases. + // We leave 1 base of node 3 on the left, and 2 bases of node 4 on the right. + // We leave 1 base of segment 1 on the left and 3 bases of segment 1 on the right. + + Alignment a; + json2pb(a, alignment_string.c_str(), alignment_string.size()); + + REQUIRE(alignment_from_length(a) == 3); + REQUIRE(alignment_to_length(a) == 3); + size_t block_length = std::max(alignment_from_length(a), alignment_to_length(a)); + + auto back_translated = vg::io::alignment_to_gaf(g, a, &trans); + stringstream s; + s << back_translated; + + // Should be mapped to a length-7 path, at offset 1 to 4, 3 matches in a length 3 block. + REQUIRE(s.str() == "steve\t3\t0\t3\t+\t #include "catch.hpp" -#include "gssw_aligner.hpp" +#include "test_aligner.hpp" #include "vg.hpp" #include "path.hpp" #include "banded_global_aligner.hpp" -#include "json2pb.h" +#include "vg/io/json2pb.h" +#include "bdsg/hash_graph.hpp" using namespace google::protobuf; - +using namespace vg::io; namespace vg { namespace unittest { @@ -24,7 +25,8 @@ namespace vg { VG graph; - Aligner aligner; + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("AGTG"); Node* n1 = graph.create_node("C"); @@ -42,7 +44,7 @@ namespace vg { int band_width = 1; bool permissive_banding = false; - aligner.align_global_banded(aln, graph.graph, band_width, permissive_banding); + aligner.align_global_banded(aln, graph, band_width, permissive_banding); const Path& path = aln.path(); @@ -73,7 +75,8 @@ namespace vg { VG graph; - Aligner aligner; + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("AGTG"); Node* n1 = graph.create_node("C"); @@ -91,7 +94,7 @@ namespace vg { int band_width = 1; bool permissive_banding = false; - aligner.align_global_banded(aln, graph.graph, band_width, permissive_banding); + aligner.align_global_banded(aln, graph, band_width, permissive_banding); const Path& path = aln.path(); @@ -122,7 +125,8 @@ namespace vg { VG graph; - Aligner aligner; + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("CCCAGTT"); Node* n1 = graph.create_node("C"); @@ -140,7 +144,7 @@ namespace vg { int band_width = 1; bool permissive_banding = true; - aligner.align_global_banded(aln, graph.graph, band_width, permissive_banding); + aligner.align_global_banded(aln, graph, band_width, permissive_banding); const Path& path = aln.path(); @@ -175,7 +179,8 @@ namespace vg { VG graph; - Aligner aligner; + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("CCCAGATG"); Node* n1 = graph.create_node("C"); @@ -193,7 +198,7 @@ namespace vg { int band_width = 2; bool permissive_banding = false; - 
aligner.align_global_banded(aln, graph.graph, band_width, permissive_banding); + aligner.align_global_banded(aln, graph, band_width, permissive_banding); const Path& path = aln.path(); @@ -232,7 +237,8 @@ namespace vg { VG graph; - Aligner aligner; + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("AACCCAGATG"); Node* n1 = graph.create_node("C"); @@ -250,7 +256,7 @@ namespace vg { int band_width = 2; bool permissive_banding = false; - aligner.align_global_banded(aln, graph.graph, band_width, permissive_banding);; + aligner.align_global_banded(aln, graph, band_width, permissive_banding);; const Path& path = aln.path(); @@ -289,7 +295,8 @@ namespace vg { VG graph; - Aligner aligner; + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("AACCCAGG"); Node* n1 = graph.create_node("C"); @@ -307,7 +314,7 @@ namespace vg { int band_width = 2; bool permissive_banding = false; - aligner.align_global_banded(aln, graph.graph, band_width, permissive_banding); + aligner.align_global_banded(aln, graph, band_width, permissive_banding); const Path& path = aln.path(); @@ -346,7 +353,8 @@ namespace vg { VG graph; - Aligner aligner; + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("AACCCAGG"); Node* n1 = graph.create_node("C"); @@ -364,7 +372,7 @@ namespace vg { int band_width = 2; bool permissive_banding = false; - aligner.align_global_banded(aln, graph.graph, band_width, permissive_banding); + aligner.align_global_banded(aln, graph, band_width, permissive_banding); const Path& path = aln.path(); @@ -403,7 +411,8 @@ namespace vg { VG graph; - Aligner aligner; + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("AACCCAGG"); Node* n1 = graph.create_node("CA"); @@ -421,7 +430,7 @@ namespace vg { int band_width = 2; bool permissive_banding = false; - aligner.align_global_banded(aln, graph.graph, band_width, permissive_banding); + aligner.align_global_banded(aln, graph, band_width, permissive_banding); const Path& path = aln.path(); @@ -461,7 +470,8 @@ namespace vg { VG graph; - Aligner aligner; + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("AACCCAGG"); Node* n1 = graph.create_node("CA"); @@ -479,7 +489,7 @@ namespace vg { int band_width = 2; bool permissive_banding = false; - aligner.align_global_banded(aln, graph.graph, band_width, permissive_banding); + aligner.align_global_banded(aln, graph, band_width, permissive_banding); const Path& path = aln.path(); @@ -516,7 +526,8 @@ namespace vg { VG graph; - Aligner aligner; + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("AACCCAGG"); Node* n1 = graph.create_node("CA"); @@ -534,7 +545,7 @@ namespace vg { int band_width = 2; bool permissive_banding = false; - aligner.align_global_banded(aln, graph.graph, band_width, permissive_banding); + aligner.align_global_banded(aln, graph, band_width, permissive_banding); const Path& path = aln.path(); @@ -569,7 +580,8 @@ namespace vg { VG graph; - Aligner aligner; + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("AACCCAGG"); Node* n1 = graph.create_node("CA"); @@ -587,7 +599,7 @@ namespace vg { int band_width = 
2; bool permissive_banding = false; - aligner.align_global_banded(aln, graph.graph, band_width, permissive_banding);; + aligner.align_global_banded(aln, graph, band_width, permissive_banding);; const Path& path = aln.path(); @@ -622,7 +634,8 @@ namespace vg { VG graph; - Aligner aligner; + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("AACCCAGG"); Node* n1 = graph.create_node("CA"); @@ -640,7 +653,7 @@ namespace vg { int band_width = 2; bool permissive_banding = false; - aligner.align_global_banded(aln, graph.graph, band_width, permissive_banding);; + aligner.align_global_banded(aln, graph, band_width, permissive_banding);; const Path& path = aln.path(); @@ -675,7 +688,8 @@ namespace vg { VG graph; - Aligner aligner; + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("AA"); Node* n1 = graph.create_node("CCCAGGCA"); @@ -693,7 +707,7 @@ namespace vg { int band_width = 3; bool permissive_banding = false; - aligner.align_global_banded(aln, graph.graph, band_width, permissive_banding); + aligner.align_global_banded(aln, graph, band_width, permissive_banding); const Path& path = aln.path(); @@ -729,7 +743,8 @@ namespace vg { VG graph; - Aligner aligner; + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("A"); Node* n1 = graph.create_node("C"); @@ -742,7 +757,7 @@ namespace vg { int band_padding = 1; - aligner.align_global_banded(aln, graph.graph, band_padding); + aligner.align_global_banded(aln, graph, band_padding); const Path& path = aln.path(); @@ -770,7 +785,8 @@ namespace vg { VG graph; - Aligner aligner; + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("AATG"); Node* n1 = graph.create_node("C"); @@ -783,7 +799,7 @@ namespace vg { int band_padding = 1; - aligner.align_global_banded(aln, graph.graph, band_padding); + aligner.align_global_banded(aln, graph, band_padding); const Path& path = aln.path(); @@ -814,7 +830,8 @@ namespace vg { VG graph; - Aligner aligner; + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("TG"); Node* n1 = graph.create_node("TGGC"); @@ -832,7 +849,7 @@ namespace vg { int band_width = 1; bool permissive_banding = false; - aligner.align_global_banded(aln, graph.graph, band_width, permissive_banding); + aligner.align_global_banded(aln, graph, band_width, permissive_banding); const Path& path = aln.path(); @@ -863,7 +880,8 @@ namespace vg { VG graph; - Aligner aligner; + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("C"); @@ -873,7 +891,7 @@ namespace vg { int band_padding = 1; - aligner.align_global_banded(aln, graph.graph, band_padding); + aligner.align_global_banded(aln, graph, band_padding); const Path& path = aln.path(); @@ -895,7 +913,8 @@ namespace vg { VG graph; - Aligner aligner; + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("CTAG"); Node* n1 = graph.create_node("T"); @@ -914,7 +933,7 @@ namespace vg { int band_padding = 1; - aligner.align_global_banded(aln, graph.graph, band_padding); + aligner.align_global_banded(aln, graph, band_padding); const Path& path = aln.path(); @@ -945,7 +964,8 @@ namespace vg { VG graph; - Aligner aligner; + 
TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("TG"); Node* n1 = graph.create_node("TGGC"); @@ -963,7 +983,7 @@ namespace vg { int band_width = 1; bool permissive_banding = false; - aligner.align_global_banded(aln, graph.graph, band_width, permissive_banding); + aligner.align_global_banded(aln, graph, band_width, permissive_banding); const Path& path = aln.path(); @@ -994,7 +1014,8 @@ namespace vg { VG graph; - Aligner aligner; + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("AGTG"); Node* n1 = graph.create_node("CGCC"); @@ -1009,7 +1030,7 @@ namespace vg { int band_width = 1; bool permissive_banding = false; - aligner.align_global_banded(aln, graph.graph, band_width, permissive_banding); + aligner.align_global_banded(aln, graph, band_width, permissive_banding); const Path& path = aln.path(); @@ -1035,7 +1056,8 @@ namespace vg { VG graph; - Aligner aligner; + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("AGTG"); Node* n1 = graph.create_node("C"); @@ -1050,7 +1072,7 @@ namespace vg { int band_width = 1; bool permissive_banding = false; - aligner.align_global_banded(aln, graph.graph, band_width, permissive_banding); + aligner.align_global_banded(aln, graph, band_width, permissive_banding); const Path& path = aln.path(); @@ -1076,7 +1098,8 @@ namespace vg { VG graph; - Aligner aligner; + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("AGTG"); Node* n1 = graph.create_node("CGCC"); @@ -1095,7 +1118,7 @@ namespace vg { int band_width = 1; bool permissive_banding = false; - aligner.align_global_banded(aln, graph.graph, band_width, permissive_banding); + aligner.align_global_banded(aln, graph, band_width, permissive_banding); const Path& path = aln.path(); @@ -1126,7 +1149,8 @@ namespace vg { VG graph; - Aligner aligner; + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("AGTG"); Node* n1 = graph.create_node("CGCC"); @@ -1142,7 +1166,7 @@ namespace vg { int band_width = 1; bool permissive_banding = false; - aligner.align_global_banded(aln, graph.graph, band_width, permissive_banding); + aligner.align_global_banded(aln, graph, band_width, permissive_banding); const Path& path = aln.path(); @@ -1173,7 +1197,8 @@ namespace vg { VG graph; - Aligner aligner; + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("CAGGA"); Node* n1 = graph.create_node("AA"); @@ -1191,7 +1216,7 @@ namespace vg { int band_width = 20; bool permissive_banding = false; - aligner.align_global_banded(aln, graph.graph, band_width, permissive_banding); + aligner.align_global_banded(aln, graph, band_width, permissive_banding); const Path& path = aln.path(); @@ -1222,7 +1247,10 @@ namespace vg { VG graph; - Aligner aligner; + + + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("AGTG"); Node* n1 = graph.create_node("C"); @@ -1240,7 +1268,7 @@ namespace vg { int band_width = 0; bool permissive_banding = false; - aligner.align_global_banded(aln, graph.graph, band_width, permissive_banding); + aligner.align_global_banded(aln, graph, band_width, permissive_banding); const Path& path = aln.path(); @@ -1285,14 +1313,17 @@ 
namespace vg { graph.create_edge(n3, n4); graph.create_edge(n4, n5); - Aligner aligner; + + + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); string read = "C"; Alignment aln; aln.set_sequence(read); int band_padding = 1; - aligner.align_global_banded(aln, graph.graph, band_padding); + aligner.align_global_banded(aln, graph, band_padding); const Path& path = aln.path(); @@ -1314,7 +1345,8 @@ namespace vg { VG graph; - Aligner aligner = Aligner(); + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("AGTG"); Node* n1 = graph.create_node("C"); @@ -1332,7 +1364,7 @@ namespace vg { int band_width = 1; bool permissive_banding = true; - aligner.align_global_banded(aln, graph.graph, band_width, permissive_banding); + aligner.align_global_banded(aln, graph, band_width, permissive_banding); const Path& path = aln.path(); @@ -1363,7 +1395,10 @@ namespace vg { VG graph; - Aligner aligner; + + + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("AGTG"); Node* n1 = graph.create_node("CTGGTGTAGTA"); @@ -1381,7 +1416,7 @@ namespace vg { int band_width = 0; bool permissive_banding = true; - aligner.align_global_banded(aln, graph.graph, band_width, permissive_banding); + aligner.align_global_banded(aln, graph, band_width, permissive_banding); const Path& path = aln.path(); @@ -1445,7 +1480,7 @@ namespace vg { int band_width = 1; bool permissive_banding = true; - aligner.align_global_banded(aln, graph.graph, band_width, permissive_banding); + aligner.align_global_banded(aln, graph, band_width, permissive_banding); const Path& path = aln.path(); @@ -1497,7 +1532,7 @@ namespace vg { int band_width = 1; bool permissive_banding = true; - aligner.align_global_banded(aln, graph.graph, band_width, permissive_banding); + aligner.align_global_banded(aln, graph, band_width, permissive_banding); const Path& path = aln.path(); @@ -1548,8 +1583,8 @@ namespace vg { int band_width = 1; bool permissive_banding = true; - aligner.align_global_banded(aln_full, graph.graph, band_width, permissive_banding); - aligner.align_global_banded(aln_reduced, graph.graph, band_width, permissive_banding); + aligner.align_global_banded(aln_full, graph, band_width, permissive_banding); + aligner.align_global_banded(aln_reduced, graph, band_width, permissive_banding); const Path& path_full = aln_full.path(); const Path& path_reduced = aln_reduced.path(); @@ -1604,7 +1639,8 @@ namespace vg { VG graph; - Aligner aligner = Aligner(); + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("ACGTAGTCTGAA"); Node* n1 = graph.create_node("CA"); @@ -1626,7 +1662,7 @@ namespace vg { bool permissive_banding = true; vector multi_alns; - aligner.align_global_banded_multi(aln, multi_alns, graph.graph, max_multi_alns, + aligner.align_global_banded_multi(aln, multi_alns, graph, max_multi_alns, band_padding, permissive_banding); const Path& path = aln.path(); @@ -1650,7 +1686,8 @@ namespace vg { VG graph; - Aligner aligner; + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("ACGTAGTCTGAA"); Node* n1 = graph.create_node("CA"); @@ -1671,7 +1708,7 @@ namespace vg { bool permissive_banding = true; vector multi_alns; - aligner.align_global_banded_multi(aln, multi_alns, graph.graph, max_multi_alns, + 
aligner.align_global_banded_multi(aln, multi_alns, graph, max_multi_alns, band_padding, permissive_banding); REQUIRE(aln.sequence() == multi_alns[0].sequence()); @@ -1694,7 +1731,8 @@ namespace vg { VG graph; - Aligner aligner; + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("ACGTAGTCTGAA"); Node* n1 = graph.create_node("CA"); @@ -1715,7 +1753,7 @@ namespace vg { bool permissive_banding = true; vector multi_alns; - aligner.align_global_banded_multi(aln, multi_alns, graph.graph, max_multi_alns, + aligner.align_global_banded_multi(aln, multi_alns, graph, max_multi_alns, band_padding, permissive_banding); bool found_first_opt = false; @@ -1790,7 +1828,8 @@ namespace vg { VG graph; - Aligner aligner; + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("ACGTAGTCTGAA"); Node* n1 = graph.create_node("C"); @@ -1811,7 +1850,7 @@ namespace vg { bool permissive_banding = true; vector multi_alns; - aligner.align_global_banded_multi(aln, multi_alns, graph.graph, max_multi_alns, + aligner.align_global_banded_multi(aln, multi_alns, graph, max_multi_alns, band_padding, permissive_banding); bool took_alternate_path = false; @@ -1835,7 +1874,8 @@ namespace vg { VG graph; - Aligner aligner; + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("AAAAAAAA"); Node* n1 = graph.create_node("GGG"); @@ -1863,7 +1903,7 @@ namespace vg { bool permissive_banding = true; vector multi_alns; - aligner.align_global_banded_multi(aln, multi_alns, graph.graph, max_multi_alns, + aligner.align_global_banded_multi(aln, multi_alns, graph, max_multi_alns, band_padding, permissive_banding); bool found_first_opt = false; @@ -1949,7 +1989,8 @@ namespace vg { VG graph; - Aligner aligner(1, 4, 6, 1); + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("A"); Node* n1 = graph.create_node("T"); @@ -1970,7 +2011,7 @@ namespace vg { bool permissive_banding = true; vector multi_alns; - aligner.align_global_banded_multi(aln, multi_alns, graph.graph, max_multi_alns, + aligner.align_global_banded_multi(aln, multi_alns, graph, max_multi_alns, band_padding, permissive_banding); bool found_first_opt = false; @@ -2052,7 +2093,8 @@ namespace vg { VG graph; - Aligner aligner; + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("AAAAAAAAAA"); Node* n1 = graph.create_node("CGGC"); @@ -2073,7 +2115,7 @@ namespace vg { bool permissive_banding = true; vector multi_alns; - aligner.align_global_banded_multi(aln, multi_alns, graph.graph, max_multi_alns, + aligner.align_global_banded_multi(aln, multi_alns, graph, max_multi_alns, band_padding, permissive_banding); bool found_first_opt = false; @@ -2173,7 +2215,8 @@ namespace vg { VG graph; - Aligner aligner; + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("C"); Node* n1 = graph.create_node("T"); @@ -2191,7 +2234,7 @@ namespace vg { bool permissive_banding = true; vector multi_alns; - aligner.align_global_banded_multi(aln, multi_alns, graph.graph, max_multi_alns, + aligner.align_global_banded_multi(aln, multi_alns, graph, max_multi_alns, band_padding, permissive_banding); bool found_first_opt = false; @@ -2252,7 +2295,8 @@ namespace vg { VG graph; - Aligner aligner; + 
TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); // low complexity sequences to ensure many alternate alignments Node* n0 = graph.create_node("CCCCCCCCCTCCCCCCCCCCTCCCCCCCCCCGACCCCCCCCCCC"); @@ -2274,7 +2318,7 @@ namespace vg { bool permissive_banding = true; vector multi_alns; - aligner.align_global_banded_multi(aln, multi_alns, graph.graph, max_multi_alns, + aligner.align_global_banded_multi(aln, multi_alns, graph, max_multi_alns, band_padding, permissive_banding); unordered_set alns_seen; @@ -2293,7 +2337,8 @@ namespace vg { SECTION( "Banded global aligner can align Ns to letters" ) { VG graph; - Aligner aligner; + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("AGTG"); Node* n1 = graph.create_node("C"); @@ -2311,7 +2356,7 @@ namespace vg { int band_width = 1; bool permissive_banding = true; - aligner.align_global_banded(aln, graph.graph, band_width, permissive_banding); + aligner.align_global_banded(aln, graph, band_width, permissive_banding); SECTION("alignment ends in full-length matches/mismatches") { REQUIRE(aln.path().mapping_size() == 3); @@ -2327,7 +2372,8 @@ namespace vg { SECTION( "Banded global aligner can align letters to Ns" ) { VG graph; - Aligner aligner; + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("AGTG"); Node* n1 = graph.create_node("C"); @@ -2345,7 +2391,7 @@ namespace vg { int band_width = 1; bool permissive_banding = true; - aligner.align_global_banded(aln, graph.graph, band_width, permissive_banding); + aligner.align_global_banded(aln, graph, band_width, permissive_banding); SECTION("alignment ends in full-length matches/mismatches") { REQUIRE(aln.path().mapping_size() == 3); @@ -2361,7 +2407,8 @@ namespace vg { SECTION( "Banded global aligner can align Ns to Ns" ) { VG graph; - Aligner aligner; + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("NNNG"); Node* n1 = graph.create_node("C"); @@ -2379,7 +2426,7 @@ namespace vg { int band_width = 1; bool permissive_banding = true; - aligner.align_global_banded(aln, graph.graph, band_width, permissive_banding); + aligner.align_global_banded(aln, graph, band_width, permissive_banding); SECTION("alignment ends in full-length matches/mismatches") { REQUIRE(aln.path().mapping_size() == 3); @@ -2399,7 +2446,8 @@ namespace vg { VG graph; - QualAdjAligner aligner(1, 4, 6, 1, 5, 6); + TestAligner aligner_source; + const QualAdjAligner& aligner = *aligner_source.get_qual_adj_aligner(); Node* n0 = graph.create_node(""); Node* n1 = graph.create_node("CT"); @@ -2422,7 +2470,7 @@ namespace vg { aln.set_quality(qual); alignment_quality_char_to_short(aln); - aligner.align_global_banded(aln, graph.graph, band_padding, permissive_banding); + aligner.align_global_banded(aln, graph, band_padding, permissive_banding); // is a global alignment REQUIRE(aln.path().mapping(0).position().offset() == 0); @@ -2447,14 +2495,14 @@ namespace vg { REQUIRE(aln.path().mapping(2).edit(0).sequence().empty()); aln.Clear(); - read = "TGA"; + read = "AGA"; qual = "HHH"; aln.set_sequence(read); aln.set_quality(qual); alignment_quality_char_to_short(aln); - aligner.align_global_banded(aln, graph.graph, band_padding, permissive_banding); + aligner.align_global_banded(aln, graph, band_padding, permissive_banding); // is a global alignment REQUIRE(aln.path().mapping(0).position().offset() == 
0); @@ -2462,7 +2510,7 @@ namespace vg { // follows correct path REQUIRE(aln.path().mapping(0).position().node_id() == n0->id()); - REQUIRE(aln.path().mapping(1).position().node_id() == n1->id()); + REQUIRE(aln.path().mapping(1).position().node_id() == n2->id()); REQUIRE(aln.path().mapping(2).position().node_id() == n3->id()); // has corrects edits @@ -2487,7 +2535,8 @@ namespace vg { SECTION( "Banded global aligner can align to a graph with an empty sink node") { VG graph; - QualAdjAligner aligner(1, 4, 6, 1, 5, 6); + TestAligner aligner_source; + const QualAdjAligner& aligner = *aligner_source.get_qual_adj_aligner(); Node* n0 = graph.create_node("GA"); Node* n1 = graph.create_node("CT"); @@ -2511,7 +2560,7 @@ namespace vg { aln.set_quality(qual); alignment_quality_char_to_short(aln); - aligner.align_global_banded(aln, graph.graph, band_padding, permissive_banding); + aligner.align_global_banded(aln, graph, band_padding, permissive_banding); // is a global alignment REQUIRE(aln.path().mapping(0).position().offset() == 0); @@ -2539,7 +2588,8 @@ namespace vg { SECTION( "Banded global aligner can align to a graph with an empty source and sink node") { VG graph; - QualAdjAligner aligner(1, 4, 6, 1, 5, 6); + TestAligner aligner_source; + const QualAdjAligner& aligner = *aligner_source.get_qual_adj_aligner(); Node* n0 = graph.create_node(""); Node* n1 = graph.create_node("CT"); @@ -2562,7 +2612,7 @@ namespace vg { aln.set_quality(qual); alignment_quality_char_to_short(aln); - aligner.align_global_banded(aln, graph.graph, band_padding, permissive_banding); + aligner.align_global_banded(aln, graph, band_padding, permissive_banding); // is a global alignment REQUIRE(aln.path().mapping(0).position().offset() == 0); @@ -2590,7 +2640,8 @@ namespace vg { SECTION( "Banded global aligner can align to a graph with a chained empty source and sink nodes") { VG graph; - QualAdjAligner aligner(1, 4, 6, 1, 5, 6); + TestAligner aligner_source; + const QualAdjAligner& aligner = *aligner_source.get_qual_adj_aligner(); Node* n0 = graph.create_node(""); Node* n1 = graph.create_node(""); @@ -2617,7 +2668,7 @@ namespace vg { aln.set_quality(qual); alignment_quality_char_to_short(aln); - aligner.align_global_banded(aln, graph.graph, band_padding, permissive_banding); + aligner.align_global_banded(aln, graph, band_padding, permissive_banding); // is a global alignment REQUIRE(aln.path().mapping(0).position().offset() == 0); @@ -2655,7 +2706,8 @@ namespace vg { SECTION( "Banded global aligner can align to a graph with an empty nodes that is both a source and sink") { VG graph; - QualAdjAligner aligner(1, 4, 6, 1, 5, 6); + TestAligner aligner_source; + const QualAdjAligner& aligner = *aligner_source.get_qual_adj_aligner(); Node* n0 = graph.create_node(""); @@ -2670,7 +2722,7 @@ namespace vg { aln.set_quality(qual); alignment_quality_char_to_short(aln); - aligner.align_global_banded(aln, graph.graph, band_padding, permissive_banding); + aligner.align_global_banded(aln, graph, band_padding, permissive_banding); // is a global alignment REQUIRE(aln.path().mapping(0).position().offset() == 0); @@ -2688,7 +2740,8 @@ namespace vg { SECTION( "Banded global aligner can align to a graph with both empty and non-empty sources and sinks") { VG graph; - QualAdjAligner aligner(1, 4, 6, 1, 5, 6); + TestAligner aligner_source; + const QualAdjAligner& aligner = *aligner_source.get_qual_adj_aligner(); Node* n0 = graph.create_node(""); Node* n1 = graph.create_node("GA"); @@ -2712,7 +2765,7 @@ namespace vg { aln.set_quality(qual); 
alignment_quality_char_to_short(aln); - aligner.align_global_banded(aln, graph.graph, band_padding, permissive_banding); + aligner.align_global_banded(aln, graph, band_padding, permissive_banding); // is a global alignment REQUIRE(aln.path().mapping(0).position().offset() == 0); @@ -2745,7 +2798,7 @@ namespace vg { aln.set_quality(qual); alignment_quality_char_to_short(aln); - aligner.align_global_banded(aln, graph.graph, band_padding, permissive_banding); + aligner.align_global_banded(aln, graph, band_padding, permissive_banding); // is a global alignment REQUIRE(aln.path().mapping(0).position().offset() == 0); @@ -2773,7 +2826,8 @@ namespace vg { SECTION( "Banded global aligner can align to a graph with empty interior nodes") { VG graph; - QualAdjAligner aligner(1, 4, 6, 1, 5, 6); + TestAligner aligner_source; + const QualAdjAligner& aligner = *aligner_source.get_qual_adj_aligner(); Node* n0 = graph.create_node("GA"); Node* n1 = graph.create_node(""); @@ -2798,7 +2852,7 @@ namespace vg { aln.set_quality(qual); alignment_quality_char_to_short(aln); - aligner.align_global_banded(aln, graph.graph, band_padding, permissive_banding); + aligner.align_global_banded(aln, graph, band_padding, permissive_banding); // is a global alignment REQUIRE(aln.path().mapping(0).position().offset() == 0); @@ -2831,7 +2885,8 @@ namespace vg { SECTION( "Banded global aligner can align to an empty graph" ) { VG graph; - QualAdjAligner aligner(1, 4, 6, 1, 5, 6); + TestAligner aligner_source; + const QualAdjAligner& aligner = *aligner_source.get_qual_adj_aligner(); Node* n0 = graph.create_node(""); @@ -2846,7 +2901,7 @@ namespace vg { aln.set_quality(qual); alignment_quality_char_to_short(aln); - aligner.align_global_banded(aln, graph.graph, band_padding, permissive_banding); + aligner.align_global_banded(aln, graph, band_padding, permissive_banding); // is a global alignment REQUIRE(aln.path().mapping(0).position().offset() == 0); @@ -2865,7 +2920,8 @@ namespace vg { SECTION( "Banded global aligner can align to an empty graph of more than one node" ) { VG graph; - QualAdjAligner aligner(1, 4, 6, 1, 5, 6); + TestAligner aligner_source; + const QualAdjAligner& aligner = *aligner_source.get_qual_adj_aligner(); Node* n0 = graph.create_node(""); Node* n1 = graph.create_node(""); @@ -2883,7 +2939,7 @@ namespace vg { aln.set_quality(qual); alignment_quality_char_to_short(aln); - aligner.align_global_banded(aln, graph.graph, band_padding, permissive_banding); + aligner.align_global_banded(aln, graph, band_padding, permissive_banding); // is a global alignment REQUIRE(aln.path().mapping(0).position().offset() == 0); @@ -2907,7 +2963,8 @@ namespace vg { SECTION( "Banded global aligner can align to a graph with empty and non-empty paths" ) { VG graph; - QualAdjAligner aligner(1, 4, 6, 1, 5, 6); + TestAligner aligner_source; + const QualAdjAligner& aligner = *aligner_source.get_qual_adj_aligner(); Node* n0 = graph.create_node(""); Node* n1 = graph.create_node(""); @@ -2926,7 +2983,7 @@ namespace vg { aln.set_quality(qual); alignment_quality_char_to_short(aln); - aligner.align_global_banded(aln, graph.graph, band_padding, permissive_banding); + aligner.align_global_banded(aln, graph, band_padding, permissive_banding); // is a global alignment REQUIRE(aln.path().mapping(0).position().offset() == 0); @@ -2955,7 +3012,7 @@ namespace vg { aln.set_quality(qual); alignment_quality_char_to_short(aln); - aligner.align_global_banded(aln, graph.graph, band_padding, permissive_banding); + aligner.align_global_banded(aln, graph, 
band_padding, permissive_banding); // is a global alignment REQUIRE(aln.path().mapping(0).position().offset() == 0); @@ -2974,7 +3031,8 @@ namespace vg { VG graph; - QualAdjAligner aligner(1, 4, 6, 1, 5, 6); + TestAligner aligner_source; + const QualAdjAligner& aligner = *aligner_source.get_qual_adj_aligner(); Node* n0 = graph.create_node("GA"); Node* n1 = graph.create_node(""); @@ -2996,7 +3054,7 @@ namespace vg { aln.set_quality(qual); alignment_quality_char_to_short(aln); - aligner.align_global_banded_multi(aln, multi_alns, graph.graph, max_multi_alns, + aligner.align_global_banded_multi(aln, multi_alns, graph, max_multi_alns, band_padding, permissive_banding); bool found_first_opt = false; @@ -3053,7 +3111,8 @@ namespace vg { VG graph; - QualAdjAligner aligner(1, 4, 6, 1, 5, 6); + TestAligner aligner_source; + const QualAdjAligner& aligner = *aligner_source.get_qual_adj_aligner(); Node* n0 = graph.create_node("C"); Node* n1 = graph.create_node("TT"); @@ -3072,7 +3131,7 @@ namespace vg { aln.set_quality(qual); alignment_quality_char_to_short(aln); - aligner.align_global_banded_multi(aln, multi_alns, graph.graph, max_multi_alns, + aligner.align_global_banded_multi(aln, multi_alns, graph, max_multi_alns, band_padding, permissive_banding); REQUIRE(multi_alns.size() <= 3); @@ -3136,7 +3195,8 @@ namespace vg { VG graph; - QualAdjAligner aligner(1, 4, 6, 1, 5, 6); + TestAligner aligner_source; + const QualAdjAligner& aligner = *aligner_source.get_qual_adj_aligner(); Node* n0 = graph.create_node("A"); Node* n1 = graph.create_node("G"); @@ -3156,7 +3216,7 @@ namespace vg { aln.set_quality(qual); alignment_quality_char_to_short(aln); - aligner.align_global_banded(aln, graph.graph, band_padding, permissive_banding); + aligner.align_global_banded(aln, graph, band_padding, permissive_banding); // is a global alignment REQUIRE(aln.path().mapping(0).position().offset() == 0); @@ -3186,7 +3246,8 @@ namespace vg { VG graph; - Aligner aligner(1, 4, 6, 1, 5); + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("A"); Node* n1 = graph.create_node("G"); @@ -3209,7 +3270,7 @@ namespace vg { aln.set_quality(qual); alignment_quality_char_to_short(aln); - aligner.align_global_banded(aln, graph.graph, band_padding, permissive_banding); + aligner.align_global_banded(aln, graph, band_padding, permissive_banding); // is a global alignment REQUIRE(aln.path().mapping(0).position().offset() == 0); @@ -3240,7 +3301,8 @@ namespace vg { VG graph; - Aligner aligner(1, 4, 6, 1, 5); + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node(""); Node* n1 = graph.create_node("G"); @@ -3267,7 +3329,7 @@ namespace vg { aln.set_quality(qual); alignment_quality_char_to_short(aln); - aligner.align_global_banded(aln, graph.graph, band_padding, permissive_banding); + aligner.align_global_banded(aln, graph, band_padding, permissive_banding); // is a global alignment REQUIRE(aln.path().mapping(0).position().offset() == 0); @@ -3308,7 +3370,8 @@ namespace vg { VG graph; - Aligner aligner(1, 4, 6, 1, 5); + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node(""); @@ -3323,7 +3386,7 @@ namespace vg { aln.set_quality(qual); alignment_quality_char_to_short(aln); - aligner.align_global_banded(aln, graph.graph, band_padding, permissive_banding); + aligner.align_global_banded(aln, graph, band_padding, permissive_banding); // is a global 
alignment REQUIRE(aln.path().mapping(0).position().offset() == 0); @@ -3344,7 +3407,8 @@ namespace vg { VG graph; - Aligner aligner(1, 4, 6, 1, 5); + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); Node* n0 = graph.create_node("A"); Node* n1 = graph.create_node("GG"); @@ -3369,7 +3433,7 @@ namespace vg { alignment_quality_char_to_short(aln); vector multi_alns; - aligner.align_global_banded_multi(aln, multi_alns, graph.graph, max_multi_alns, + aligner.align_global_banded_multi(aln, multi_alns, graph, max_multi_alns, band_padding, permissive_banding); bool found_first_opt = false, found_second_opt = false; @@ -3416,8 +3480,10 @@ namespace vg { Graph graph; json2pb(graph, graph_json.c_str(), graph_json.size()); + VG vg_graph(graph); - Aligner aligner; + TestAligner aligner_source; + const Aligner& aligner = *aligner_source.get_regular_aligner(); string read = "TTTTGATGGAGGCC"; Alignment aln; @@ -3425,7 +3491,7 @@ namespace vg { int band_width = 1; bool permissive_banding = true; - aligner.align_global_banded(aln, graph, band_width, permissive_banding); + aligner.align_global_banded(aln, vg_graph, band_width, permissive_banding); const Path& p = aln.path(); for (int i = 0; i < p.mapping_size();i++) { @@ -3438,6 +3504,64 @@ namespace vg { } } } + + TEST_CASE( "Banded global aligner doesn't crash when the worst possible score is just on the edge of needing a larger int size", + "[alignment][banded][mapping]" ) { + + bdsg::HashGraph graph; + + handle_t h0 = graph.create_handle("", 68181350); + handle_t h1 = graph.create_handle("G", 68181343); + handle_t h2 = graph.create_handle("TGAGTGG", 68181344); + handle_t h3 = graph.create_handle("CTTTGGTTCCCGGCTGAGGTGGAGTGGGCTGA", 68181345); + handle_t h4 = graph.create_handle("GGACTAGACTGAGCCCTCGGACATGGAGGTGG", 68181346); + handle_t h5 = graph.create_handle("GGATGGGGCAGACTCATCCCATTCTTGACCAA", 68181347); + handle_t h6 = graph.create_handle("GCCCTTGTTCTGCTCCCTTCCCAG", 68181348); + handle_t h7 = graph.create_handle("", 68181349); + + graph.create_edge(h0, h1); + graph.create_edge(h0, h7); + graph.create_edge(h1, h2); + graph.create_edge(h2, h3); + graph.create_edge(h3, h4); + graph.create_edge(h4, h5); + graph.create_edge(h5, h6); + graph.create_edge(h6, h7); + + string sequence = "AA"; + Alignment aln; + aln.set_sequence(sequence); + + TestAligner aligner_source; + aligner_source.set_alignment_scores(1, 1, 1, 1, 0); + const Aligner& aligner = *aligner_source.get_regular_aligner(); + + aligner.align_global_banded(aln, graph, 2, true); + + REQUIRE(aln.path().mapping_size() == 2); + REQUIRE(aln.path().mapping(0).position().node_id() == graph.get_id(h0)); + REQUIRE(aln.path().mapping(1).position().node_id() == graph.get_id(h7)); + } + + TEST_CASE("Try to recreate a memory access bug", "[alignment][banded][mapping][memory]") { + + // note: this never crashed, but the bug shows up on valgrind + + bdsg::HashGraph graph; + + handle_t h0 = graph.create_handle("T"); + + string sequence = "CTCATTCCCGGAACCTTGAAATGGAGCT"; + string qual = "DCDD=2DECBEC=F@E?BEFEEFECED<"; + Alignment aln; + aln.set_sequence(sequence); + aln.set_quality(string_quality_short_to_char(qual)); + + TestAligner aligner_source; + const QualAdjAligner& aligner = *aligner_source.get_qual_adj_aligner(); + + aligner.align_global_banded(aln, graph, 1, true); + } } } diff --git a/src/unittest/blocked_gzip_input_stream.cpp b/src/unittest/blocked_gzip_input_stream.cpp index 74b6bb6f199..89d230cb5f8 100644 --- a/src/unittest/blocked_gzip_input_stream.cpp +++ 
b/src/unittest/blocked_gzip_input_stream.cpp @@ -2,9 +2,9 @@ /// /// Unit tests for BlockedGzipInputStream -#include "../blocked_gzip_input_stream.hpp" -#include "../blocked_gzip_output_stream.hpp" -#include "../hfile_cppstream.hpp" +#include +#include +#include #include "catch.hpp" #include @@ -23,7 +23,7 @@ namespace vg { namespace unittest { using namespace std; -using namespace vg::stream; +using namespace vg::io; // We have a tiny function to get virtual offsets, based on the block's start // offset in the file, and the offset in the block diff --git a/src/unittest/blocked_gzip_output_stream.cpp b/src/unittest/blocked_gzip_output_stream.cpp index 114ffb73106..1402ced8e98 100644 --- a/src/unittest/blocked_gzip_output_stream.cpp +++ b/src/unittest/blocked_gzip_output_stream.cpp @@ -2,8 +2,8 @@ /// /// Unit tests for BlockedGzipOutputStream -#include "../blocked_gzip_output_stream.hpp" -#include "../hfile_cppstream.hpp" +#include +#include #include "catch.hpp" #include @@ -13,7 +13,7 @@ namespace vg { namespace unittest { using namespace std; -using namespace vg::stream; +using namespace vg::io; // We have a tiny function to get virtual offsets, based on the block's start // offset in the file, and the offset in the block diff --git a/src/unittest/cactus.cpp b/src/unittest/cactus.cpp index e4535fd17d9..7447ee247dd 100644 --- a/src/unittest/cactus.cpp +++ b/src/unittest/cactus.cpp @@ -5,7 +5,7 @@ #include #include -#include "../json2pb.h" +#include "vg/io/json2pb.h" #include "../cactus.hpp" #include "catch.hpp" diff --git a/src/unittest/catch.hpp b/src/unittest/catch.hpp index 79036a40438..db1fed3b981 100644 --- a/src/unittest/catch.hpp +++ b/src/unittest/catch.hpp @@ -1,17 +1,21 @@ /* - * Catch v1.8.2 - * Generated: 2017-03-13 21:18:33.619572 + * Catch v2.13.8 + * Generated: 2022-01-03 21:20:09.589503 * ---------------------------------------------------------- * This file has been merged from multiple headers. Please don't edit it directly - * Copyright (c) 2012 Two Blue Cubes Ltd. All rights reserved. + * Copyright (c) 2022 Two Blue Cubes Ltd. All rights reserved. * * Distributed under the Boost Software License, Version 1.0. 
(See accompanying * file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) */ #ifndef TWOBLUECUBES_SINGLE_INCLUDE_CATCH_HPP_INCLUDED #define TWOBLUECUBES_SINGLE_INCLUDE_CATCH_HPP_INCLUDED +// start catch.hpp -#define TWOBLUECUBES_CATCH_HPP_INCLUDED + +#define CATCH_VERSION_MAJOR 2 +#define CATCH_VERSION_MINOR 13 +#define CATCH_VERSION_PATCH 8 #ifdef __clang__ # pragma clang system_header @@ -19,40 +23,69 @@ # pragma GCC system_header #endif -// #included from: internal/catch_suppress_warnings.h +// start catch_suppress_warnings.h #ifdef __clang__ # ifdef __ICC // icpc defines the __clang__ macro # pragma warning(push) # pragma warning(disable: 161 1682) # else // __ICC -# pragma clang diagnostic ignored "-Wglobal-constructors" -# pragma clang diagnostic ignored "-Wvariadic-macros" -# pragma clang diagnostic ignored "-Wc99-extensions" -# pragma clang diagnostic ignored "-Wunused-variable" # pragma clang diagnostic push # pragma clang diagnostic ignored "-Wpadded" -# pragma clang diagnostic ignored "-Wc++98-compat" -# pragma clang diagnostic ignored "-Wc++98-compat-pedantic" # pragma clang diagnostic ignored "-Wswitch-enum" # pragma clang diagnostic ignored "-Wcovered-switch-default" # endif #elif defined __GNUC__ -# pragma GCC diagnostic ignored "-Wvariadic-macros" -# pragma GCC diagnostic ignored "-Wunused-variable" - - // For newer version we can use __Pragma to disable the warnings locally -# if __GNUC__ == 4 && __GNUC_MINOR__ >= 4 && __GNUC_MINOR__ <= 7 -# pragma GCC diagnostic ignored "-Wparentheses" -# endif + // Because REQUIREs trigger GCC's -Wparentheses, and because still + // supported version of g++ have only buggy support for _Pragmas, + // Wparentheses have to be suppressed globally. +# pragma GCC diagnostic ignored "-Wparentheses" // See #674 for details # pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wunused-variable" # pragma GCC diagnostic ignored "-Wpadded" #endif +// end catch_suppress_warnings.h #if defined(CATCH_CONFIG_MAIN) || defined(CATCH_CONFIG_RUNNER) # define CATCH_IMPL +# define CATCH_CONFIG_ALL_PARTS +#endif + +// In the impl file, we want to have access to all parts of the headers +// Can also be used to sanely support PCHs +#if defined(CATCH_CONFIG_ALL_PARTS) +# define CATCH_CONFIG_EXTERNAL_INTERFACES +# if defined(CATCH_CONFIG_DISABLE_MATCHERS) +# undef CATCH_CONFIG_DISABLE_MATCHERS +# endif +# if !defined(CATCH_CONFIG_ENABLE_CHRONO_STRINGMAKER) +# define CATCH_CONFIG_ENABLE_CHRONO_STRINGMAKER +# endif +#endif + +#if !defined(CATCH_CONFIG_IMPL_ONLY) +// start catch_platform.h + +// See e.g.: +// https://opensource.apple.com/source/CarbonHeaders/CarbonHeaders-18.1/TargetConditionals.h.auto.html +#ifdef __APPLE__ +# include +# if (defined(TARGET_OS_OSX) && TARGET_OS_OSX == 1) || \ + (defined(TARGET_OS_MAC) && TARGET_OS_MAC == 1) +# define CATCH_PLATFORM_MAC +# elif (defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE == 1) +# define CATCH_PLATFORM_IPHONE +# endif + +#elif defined(linux) || defined(__linux) || defined(__linux__) +# define CATCH_PLATFORM_LINUX + +#elif defined(WIN32) || defined(__WIN32__) || defined(_WIN32) || defined(_MSC_VER) || defined(__MINGW32__) +# define CATCH_PLATFORM_WINDOWS #endif +// end catch_platform.h + #ifdef CATCH_IMPL # ifndef CLARA_CONFIG_MAIN # define CLARA_CONFIG_MAIN_NOT_DEFINED @@ -60,309 +93,378 @@ # endif #endif -// #included from: internal/catch_notimplemented_exception.h -#define TWOBLUECUBES_CATCH_NOTIMPLEMENTED_EXCEPTION_H_INCLUDED +// start catch_user_interfaces.h + +namespace Catch { + 
unsigned int rngSeed(); +} + +// end catch_user_interfaces.h +// start catch_tag_alias_autoregistrar.h -// #included from: catch_common.h -#define TWOBLUECUBES_CATCH_COMMON_H_INCLUDED +// start catch_common.h -// #included from: catch_compiler_capabilities.h -#define TWOBLUECUBES_CATCH_COMPILER_CAPABILITIES_HPP_INCLUDED +// start catch_compiler_capabilities.h -// Detect a number of compiler features - mostly C++11/14 conformance - by compiler +// Detect a number of compiler features - by compiler // The following features are defined: // -// CATCH_CONFIG_CPP11_NULLPTR : is nullptr supported? -// CATCH_CONFIG_CPP11_NOEXCEPT : is noexcept supported? -// CATCH_CONFIG_CPP11_GENERATED_METHODS : The delete and default keywords for compiler generated methods -// CATCH_CONFIG_CPP11_IS_ENUM : std::is_enum is supported? -// CATCH_CONFIG_CPP11_TUPLE : std::tuple is supported -// CATCH_CONFIG_CPP11_LONG_LONG : is long long supported? -// CATCH_CONFIG_CPP11_OVERRIDE : is override supported? -// CATCH_CONFIG_CPP11_UNIQUE_PTR : is unique_ptr supported (otherwise use auto_ptr) -// CATCH_CONFIG_CPP11_SHUFFLE : is std::shuffle supported? -// CATCH_CONFIG_CPP11_TYPE_TRAITS : are type_traits and enable_if supported? - -// CATCH_CONFIG_CPP11_OR_GREATER : Is C++11 supported? - -// CATCH_CONFIG_VARIADIC_MACROS : are variadic macros supported? // CATCH_CONFIG_COUNTER : is the __COUNTER__ macro supported? // CATCH_CONFIG_WINDOWS_SEH : is Windows SEH supported? // CATCH_CONFIG_POSIX_SIGNALS : are POSIX signals supported? +// CATCH_CONFIG_DISABLE_EXCEPTIONS : Are exceptions enabled? // **************** // Note to maintainers: if new toggles are added please document them // in configuration.md, too // **************** // In general each macro has a _NO_ form -// (e.g. CATCH_CONFIG_CPP11_NO_NULLPTR) which disables the feature. +// (e.g. CATCH_CONFIG_NO_POSIX_SIGNALS) which disables the feature. // Many features, at point of detection, define an _INTERNAL_ macro, so they // can be combined, en-mass, with the _NO_ forms later. -// All the C++11 features can be disabled with CATCH_CONFIG_NO_CPP11 - #ifdef __cplusplus -# if __cplusplus >= 201103L -# define CATCH_CPP11_OR_GREATER +# if (__cplusplus >= 201402L) || (defined(_MSVC_LANG) && _MSVC_LANG >= 201402L) +# define CATCH_CPP14_OR_GREATER # endif -# if __cplusplus >= 201402L -# define CATCH_CPP14_OR_GREATER +# if (__cplusplus >= 201703L) || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) +# define CATCH_CPP17_OR_GREATER # endif #endif -#ifdef __clang__ +// Only GCC compiler should be used in this block, so other compilers trying to +// mask themselves as GCC should be ignored. +#if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC) && !defined(__CUDACC__) && !defined(__LCC__) +# define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION _Pragma( "GCC diagnostic push" ) +# define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION _Pragma( "GCC diagnostic pop" ) -# if __has_feature(cxx_nullptr) -# define CATCH_INTERNAL_CONFIG_CPP11_NULLPTR -# endif +# define CATCH_INTERNAL_IGNORE_BUT_WARN(...) 
(void)__builtin_constant_p(__VA_ARGS__) + +#endif + +#if defined(__clang__) + +# define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION _Pragma( "clang diagnostic push" ) +# define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION _Pragma( "clang diagnostic pop" ) -# if __has_feature(cxx_noexcept) -# define CATCH_INTERNAL_CONFIG_CPP11_NOEXCEPT +// As of this writing, IBM XL's implementation of __builtin_constant_p has a bug +// which results in calls to destructors being emitted for each temporary, +// without a matching initialization. In practice, this can result in something +// like `std::string::~string` being called on an uninitialized value. +// +// For example, this code will likely segfault under IBM XL: +// ``` +// REQUIRE(std::string("12") + "34" == "1234") +// ``` +// +// Therefore, `CATCH_INTERNAL_IGNORE_BUT_WARN` is not implemented. +# if !defined(__ibmxl__) && !defined(__CUDACC__) +# define CATCH_INTERNAL_IGNORE_BUT_WARN(...) (void)__builtin_constant_p(__VA_ARGS__) /* NOLINT(cppcoreguidelines-pro-type-vararg, hicpp-vararg) */ # endif -# if defined(CATCH_CPP11_OR_GREATER) -# define CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS \ - _Pragma( "clang diagnostic push" ) \ - _Pragma( "clang diagnostic ignored \"-Wparentheses\"" ) -# define CATCH_INTERNAL_UNSUPPRESS_PARENTHESES_WARNINGS \ - _Pragma( "clang diagnostic pop" ) -# endif +# define CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ + _Pragma( "clang diagnostic ignored \"-Wexit-time-destructors\"" ) \ + _Pragma( "clang diagnostic ignored \"-Wglobal-constructors\"") -#endif // __clang__ +# define CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS \ + _Pragma( "clang diagnostic ignored \"-Wparentheses\"" ) -//////////////////////////////////////////////////////////////////////////////// -// Cygwin -#ifdef __CYGWIN__ +# define CATCH_INTERNAL_SUPPRESS_UNUSED_WARNINGS \ + _Pragma( "clang diagnostic ignored \"-Wunused-variable\"" ) -# if !defined(CATCH_CONFIG_POSIX_SIGNALS) -# define CATCH_INTERNAL_CONFIG_NO_POSIX_SIGNALS -# endif +# define CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS \ + _Pragma( "clang diagnostic ignored \"-Wgnu-zero-variadic-macro-arguments\"" ) -// Required for some versions of Cygwin to declare gettimeofday -// see: http://stackoverflow.com/questions/36901803/gettimeofday-not-declared-in-this-scope-cygwin -# define _BSD_SOURCE +# define CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS \ + _Pragma( "clang diagnostic ignored \"-Wunused-template\"" ) -#endif // __CYGWIN__ +#endif // __clang__ //////////////////////////////////////////////////////////////////////////////// -// Borland -#ifdef __BORLANDC__ - -#endif // __BORLANDC__ +// Assume that non-Windows platforms support posix signals by default +#if !defined(CATCH_PLATFORM_WINDOWS) + #define CATCH_INTERNAL_CONFIG_POSIX_SIGNALS +#endif //////////////////////////////////////////////////////////////////////////////// -// EDG -#ifdef __EDG_VERSION__ +// We know some environments not to support full POSIX signals +#if defined(__CYGWIN__) || defined(__QNX__) || defined(__EMSCRIPTEN__) || defined(__DJGPP__) + #define CATCH_INTERNAL_CONFIG_NO_POSIX_SIGNALS +#endif -#endif // __EDG_VERSION__ +#ifdef __OS400__ +# define CATCH_INTERNAL_CONFIG_NO_POSIX_SIGNALS +# define CATCH_CONFIG_COLOUR_NONE +#endif //////////////////////////////////////////////////////////////////////////////// -// Digital Mars -#ifdef __DMC__ - -#endif // __DMC__ +// Android somehow still does not support std::to_string +#if defined(__ANDROID__) +# define CATCH_INTERNAL_CONFIG_NO_CPP11_TO_STRING +# define 
CATCH_INTERNAL_CONFIG_ANDROID_LOGWRITE +#endif //////////////////////////////////////////////////////////////////////////////// -// GCC -#ifdef __GNUC__ +// Not all Windows environments support SEH properly +#if defined(__MINGW32__) +# define CATCH_INTERNAL_CONFIG_NO_WINDOWS_SEH +#endif -# if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) -# define CATCH_GCC_HAS_NEW_PRAGMA -# endif +//////////////////////////////////////////////////////////////////////////////// +// PS4 +#if defined(__ORBIS__) +# define CATCH_INTERNAL_CONFIG_NO_NEW_CAPTURE +#endif -# if __GNUC__ == 4 && __GNUC_MINOR__ >= 6 && defined(__GXX_EXPERIMENTAL_CXX0X__) -# define CATCH_INTERNAL_CONFIG_CPP11_NULLPTR -# endif +//////////////////////////////////////////////////////////////////////////////// +// Cygwin +#ifdef __CYGWIN__ -# if !defined(CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS) && defined(CATCH_GCC_HAS_NEW_PRAGMA) -# define CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS \ - _Pragma( "GCC diagnostic push" ) \ - _Pragma( "GCC diagnostic ignored \"-Wparentheses\"" ) -# define CATCH_INTERNAL_UNSUPPRESS_PARENTHESES_WARNINGS \ - _Pragma( "GCC diagnostic pop" ) -# endif +// Required for some versions of Cygwin to declare gettimeofday +// see: http://stackoverflow.com/questions/36901803/gettimeofday-not-declared-in-this-scope-cygwin +# define _BSD_SOURCE +// some versions of cygwin (most) do not support std::to_string. Use the libstd check. +// https://gcc.gnu.org/onlinedocs/gcc-4.8.2/libstdc++/api/a01053_source.html line 2812-2813 +# if !((__cplusplus >= 201103L) && defined(_GLIBCXX_USE_C99) \ + && !defined(_GLIBCXX_HAVE_BROKEN_VSWPRINTF)) -// - otherwise more recent versions define __cplusplus >= 201103L -// and will get picked up below +# define CATCH_INTERNAL_CONFIG_NO_CPP11_TO_STRING -#endif // __GNUC__ +# endif +#endif // __CYGWIN__ //////////////////////////////////////////////////////////////////////////////// // Visual C++ -#ifdef _MSC_VER +#if defined(_MSC_VER) -#define CATCH_INTERNAL_CONFIG_WINDOWS_SEH - -#if (_MSC_VER >= 1600) -# define CATCH_INTERNAL_CONFIG_CPP11_NULLPTR -# define CATCH_INTERNAL_CONFIG_CPP11_UNIQUE_PTR -#endif +// Universal Windows platform does not support SEH +// Or console colours (or console at all...) 
+# if defined(WINAPI_FAMILY) && (WINAPI_FAMILY == WINAPI_FAMILY_APP) +# define CATCH_CONFIG_COLOUR_NONE +# else +# define CATCH_INTERNAL_CONFIG_WINDOWS_SEH +# endif -#if (_MSC_VER >= 1900 ) // (VC++ 13 (VS2015)) -#define CATCH_INTERNAL_CONFIG_CPP11_NOEXCEPT -#define CATCH_INTERNAL_CONFIG_CPP11_GENERATED_METHODS -#define CATCH_INTERNAL_CONFIG_CPP11_SHUFFLE -#define CATCH_INTERNAL_CONFIG_CPP11_TYPE_TRAITS -#endif +# if !defined(__clang__) // Handle Clang masquerading for msvc -#endif // _MSC_VER +// MSVC traditional preprocessor needs some workaround for __VA_ARGS__ +// _MSVC_TRADITIONAL == 0 means new conformant preprocessor +// _MSVC_TRADITIONAL == 1 means old traditional non-conformant preprocessor +# if !defined(_MSVC_TRADITIONAL) || (defined(_MSVC_TRADITIONAL) && _MSVC_TRADITIONAL) +# define CATCH_INTERNAL_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR +# endif // MSVC_TRADITIONAL -//////////////////////////////////////////////////////////////////////////////// +// Only do this if we're not using clang on Windows, which uses `diagnostic push` & `diagnostic pop` +# define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION __pragma( warning(push) ) +# define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION __pragma( warning(pop) ) +# endif // __clang__ -// Use variadic macros if the compiler supports them -#if ( defined _MSC_VER && _MSC_VER > 1400 && !defined __EDGE__) || \ - ( defined __WAVE__ && __WAVE_HAS_VARIADICS ) || \ - ( defined __GNUC__ && __GNUC__ >= 3 ) || \ - ( !defined __cplusplus && __STDC_VERSION__ >= 199901L || __cplusplus >= 201103L ) +#endif // _MSC_VER -#define CATCH_INTERNAL_CONFIG_VARIADIC_MACROS +#if defined(_REENTRANT) || defined(_MSC_VER) +// Enable async processing, as -pthread is specified or no additional linking is required +# define CATCH_INTERNAL_CONFIG_USE_ASYNC +#endif // _MSC_VER +//////////////////////////////////////////////////////////////////////////////// +// Check if we are compiled with -fno-exceptions or equivalent +#if defined(__EXCEPTIONS) || defined(__cpp_exceptions) || defined(_CPPUNWIND) +# define CATCH_INTERNAL_CONFIG_EXCEPTIONS_ENABLED #endif -// Use __COUNTER__ if the compiler supports it -#if ( defined _MSC_VER && _MSC_VER >= 1300 ) || \ - ( defined __GNUC__ && __GNUC__ >= 4 && __GNUC_MINOR__ >= 3 ) || \ - ( defined __clang__ && __clang_major__ >= 3 ) - -#define CATCH_INTERNAL_CONFIG_COUNTER +//////////////////////////////////////////////////////////////////////////////// +// DJGPP +#ifdef __DJGPP__ +# define CATCH_INTERNAL_CONFIG_NO_WCHAR +#endif // __DJGPP__ +//////////////////////////////////////////////////////////////////////////////// +// Embarcadero C++Build +#if defined(__BORLANDC__) + #define CATCH_INTERNAL_CONFIG_POLYFILL_ISNAN #endif //////////////////////////////////////////////////////////////////////////////// -// C++ language feature support -// catch all support for C++11 -#if defined(CATCH_CPP11_OR_GREATER) +// Use of __COUNTER__ is suppressed during code analysis in +// CLion/AppCode 2017.2.x and former, because __COUNTER__ is not properly +// handled by it. 
+// Otherwise all supported compilers support COUNTER macro, +// but user still might want to turn it off +#if ( !defined(__JETBRAINS_IDE__) || __JETBRAINS_IDE__ >= 20170300L ) + #define CATCH_INTERNAL_CONFIG_COUNTER +#endif -# if !defined(CATCH_INTERNAL_CONFIG_CPP11_NULLPTR) -# define CATCH_INTERNAL_CONFIG_CPP11_NULLPTR -# endif +//////////////////////////////////////////////////////////////////////////////// -# ifndef CATCH_INTERNAL_CONFIG_CPP11_NOEXCEPT -# define CATCH_INTERNAL_CONFIG_CPP11_NOEXCEPT -# endif +// RTX is a special version of Windows that is real time. +// This means that it is detected as Windows, but does not provide +// the same set of capabilities as real Windows does. +#if defined(UNDER_RTSS) || defined(RTX64_BUILD) + #define CATCH_INTERNAL_CONFIG_NO_WINDOWS_SEH + #define CATCH_INTERNAL_CONFIG_NO_ASYNC + #define CATCH_CONFIG_COLOUR_NONE +#endif -# ifndef CATCH_INTERNAL_CONFIG_CPP11_GENERATED_METHODS -# define CATCH_INTERNAL_CONFIG_CPP11_GENERATED_METHODS -# endif +#if !defined(_GLIBCXX_USE_C99_MATH_TR1) +#define CATCH_INTERNAL_CONFIG_GLOBAL_NEXTAFTER +#endif -# ifndef CATCH_INTERNAL_CONFIG_CPP11_IS_ENUM -# define CATCH_INTERNAL_CONFIG_CPP11_IS_ENUM -# endif +// Various stdlib support checks that require __has_include +#if defined(__has_include) + // Check if string_view is available and usable + #if __has_include() && defined(CATCH_CPP17_OR_GREATER) + # define CATCH_INTERNAL_CONFIG_CPP17_STRING_VIEW + #endif + + // Check if optional is available and usable + # if __has_include() && defined(CATCH_CPP17_OR_GREATER) + # define CATCH_INTERNAL_CONFIG_CPP17_OPTIONAL + # endif // __has_include() && defined(CATCH_CPP17_OR_GREATER) + + // Check if byte is available and usable + # if __has_include() && defined(CATCH_CPP17_OR_GREATER) + # include + # if defined(__cpp_lib_byte) && (__cpp_lib_byte > 0) + # define CATCH_INTERNAL_CONFIG_CPP17_BYTE + # endif + # endif // __has_include() && defined(CATCH_CPP17_OR_GREATER) + + // Check if variant is available and usable + # if __has_include() && defined(CATCH_CPP17_OR_GREATER) + # if defined(__clang__) && (__clang_major__ < 8) + // work around clang bug with libstdc++ https://bugs.llvm.org/show_bug.cgi?id=31852 + // fix should be in clang 8, workaround in libstdc++ 8.2 + # include + # if defined(__GLIBCXX__) && defined(_GLIBCXX_RELEASE) && (_GLIBCXX_RELEASE < 9) + # define CATCH_CONFIG_NO_CPP17_VARIANT + # else + # define CATCH_INTERNAL_CONFIG_CPP17_VARIANT + # endif // defined(__GLIBCXX__) && defined(_GLIBCXX_RELEASE) && (_GLIBCXX_RELEASE < 9) + # else + # define CATCH_INTERNAL_CONFIG_CPP17_VARIANT + # endif // defined(__clang__) && (__clang_major__ < 8) + # endif // __has_include() && defined(CATCH_CPP17_OR_GREATER) +#endif // defined(__has_include) + +#if defined(CATCH_INTERNAL_CONFIG_COUNTER) && !defined(CATCH_CONFIG_NO_COUNTER) && !defined(CATCH_CONFIG_COUNTER) +# define CATCH_CONFIG_COUNTER +#endif +#if defined(CATCH_INTERNAL_CONFIG_WINDOWS_SEH) && !defined(CATCH_CONFIG_NO_WINDOWS_SEH) && !defined(CATCH_CONFIG_WINDOWS_SEH) && !defined(CATCH_INTERNAL_CONFIG_NO_WINDOWS_SEH) +# define CATCH_CONFIG_WINDOWS_SEH +#endif +// This is set by default, because we assume that unix compilers are posix-signal-compatible by default. 
+#if defined(CATCH_INTERNAL_CONFIG_POSIX_SIGNALS) && !defined(CATCH_INTERNAL_CONFIG_NO_POSIX_SIGNALS) && !defined(CATCH_CONFIG_NO_POSIX_SIGNALS) && !defined(CATCH_CONFIG_POSIX_SIGNALS) +# define CATCH_CONFIG_POSIX_SIGNALS +#endif +// This is set by default, because we assume that compilers with no wchar_t support are just rare exceptions. +#if !defined(CATCH_INTERNAL_CONFIG_NO_WCHAR) && !defined(CATCH_CONFIG_NO_WCHAR) && !defined(CATCH_CONFIG_WCHAR) +# define CATCH_CONFIG_WCHAR +#endif -# ifndef CATCH_INTERNAL_CONFIG_CPP11_TUPLE -# define CATCH_INTERNAL_CONFIG_CPP11_TUPLE -# endif +#if !defined(CATCH_INTERNAL_CONFIG_NO_CPP11_TO_STRING) && !defined(CATCH_CONFIG_NO_CPP11_TO_STRING) && !defined(CATCH_CONFIG_CPP11_TO_STRING) +# define CATCH_CONFIG_CPP11_TO_STRING +#endif -# ifndef CATCH_INTERNAL_CONFIG_VARIADIC_MACROS -# define CATCH_INTERNAL_CONFIG_VARIADIC_MACROS -# endif +#if defined(CATCH_INTERNAL_CONFIG_CPP17_OPTIONAL) && !defined(CATCH_CONFIG_NO_CPP17_OPTIONAL) && !defined(CATCH_CONFIG_CPP17_OPTIONAL) +# define CATCH_CONFIG_CPP17_OPTIONAL +#endif -# if !defined(CATCH_INTERNAL_CONFIG_CPP11_LONG_LONG) -# define CATCH_INTERNAL_CONFIG_CPP11_LONG_LONG -# endif +#if defined(CATCH_INTERNAL_CONFIG_CPP17_STRING_VIEW) && !defined(CATCH_CONFIG_NO_CPP17_STRING_VIEW) && !defined(CATCH_CONFIG_CPP17_STRING_VIEW) +# define CATCH_CONFIG_CPP17_STRING_VIEW +#endif -# if !defined(CATCH_INTERNAL_CONFIG_CPP11_OVERRIDE) -# define CATCH_INTERNAL_CONFIG_CPP11_OVERRIDE -# endif -# if !defined(CATCH_INTERNAL_CONFIG_CPP11_UNIQUE_PTR) -# define CATCH_INTERNAL_CONFIG_CPP11_UNIQUE_PTR -# endif -# if !defined(CATCH_INTERNAL_CONFIG_CPP11_SHUFFLE) -# define CATCH_INTERNAL_CONFIG_CPP11_SHUFFLE -# endif -# if !defined(CATCH_INTERNAL_CONFIG_CPP11_TYPE_TRAITS) -# define CATCH_INTERNAL_CONFIG_CPP11_TYPE_TRAITS -# endif +#if defined(CATCH_INTERNAL_CONFIG_CPP17_VARIANT) && !defined(CATCH_CONFIG_NO_CPP17_VARIANT) && !defined(CATCH_CONFIG_CPP17_VARIANT) +# define CATCH_CONFIG_CPP17_VARIANT +#endif -#endif // __cplusplus >= 201103L +#if defined(CATCH_INTERNAL_CONFIG_CPP17_BYTE) && !defined(CATCH_CONFIG_NO_CPP17_BYTE) && !defined(CATCH_CONFIG_CPP17_BYTE) +# define CATCH_CONFIG_CPP17_BYTE +#endif -// Now set the actual defines based on the above + anything the user has configured -#if defined(CATCH_INTERNAL_CONFIG_CPP11_NULLPTR) && !defined(CATCH_CONFIG_CPP11_NO_NULLPTR) && !defined(CATCH_CONFIG_CPP11_NULLPTR) && !defined(CATCH_CONFIG_NO_CPP11) -# define CATCH_CONFIG_CPP11_NULLPTR +#if defined(CATCH_CONFIG_EXPERIMENTAL_REDIRECT) +# define CATCH_INTERNAL_CONFIG_NEW_CAPTURE #endif -#if defined(CATCH_INTERNAL_CONFIG_CPP11_NOEXCEPT) && !defined(CATCH_CONFIG_CPP11_NO_NOEXCEPT) && !defined(CATCH_CONFIG_CPP11_NOEXCEPT) && !defined(CATCH_CONFIG_NO_CPP11) -# define CATCH_CONFIG_CPP11_NOEXCEPT + +#if defined(CATCH_INTERNAL_CONFIG_NEW_CAPTURE) && !defined(CATCH_INTERNAL_CONFIG_NO_NEW_CAPTURE) && !defined(CATCH_CONFIG_NO_NEW_CAPTURE) && !defined(CATCH_CONFIG_NEW_CAPTURE) +# define CATCH_CONFIG_NEW_CAPTURE #endif -#if defined(CATCH_INTERNAL_CONFIG_CPP11_GENERATED_METHODS) && !defined(CATCH_CONFIG_CPP11_NO_GENERATED_METHODS) && !defined(CATCH_CONFIG_CPP11_GENERATED_METHODS) && !defined(CATCH_CONFIG_NO_CPP11) -# define CATCH_CONFIG_CPP11_GENERATED_METHODS + +#if !defined(CATCH_INTERNAL_CONFIG_EXCEPTIONS_ENABLED) && !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS) +# define CATCH_CONFIG_DISABLE_EXCEPTIONS #endif -#if defined(CATCH_INTERNAL_CONFIG_CPP11_IS_ENUM) && !defined(CATCH_CONFIG_CPP11_NO_IS_ENUM) && !defined(CATCH_CONFIG_CPP11_IS_ENUM) && 
!defined(CATCH_CONFIG_NO_CPP11) -# define CATCH_CONFIG_CPP11_IS_ENUM + +#if defined(CATCH_INTERNAL_CONFIG_POLYFILL_ISNAN) && !defined(CATCH_CONFIG_NO_POLYFILL_ISNAN) && !defined(CATCH_CONFIG_POLYFILL_ISNAN) +# define CATCH_CONFIG_POLYFILL_ISNAN #endif -#if defined(CATCH_INTERNAL_CONFIG_CPP11_TUPLE) && !defined(CATCH_CONFIG_CPP11_NO_TUPLE) && !defined(CATCH_CONFIG_CPP11_TUPLE) && !defined(CATCH_CONFIG_NO_CPP11) -# define CATCH_CONFIG_CPP11_TUPLE + +#if defined(CATCH_INTERNAL_CONFIG_USE_ASYNC) && !defined(CATCH_INTERNAL_CONFIG_NO_ASYNC) && !defined(CATCH_CONFIG_NO_USE_ASYNC) && !defined(CATCH_CONFIG_USE_ASYNC) +# define CATCH_CONFIG_USE_ASYNC #endif -#if defined(CATCH_INTERNAL_CONFIG_VARIADIC_MACROS) && !defined(CATCH_CONFIG_NO_VARIADIC_MACROS) && !defined(CATCH_CONFIG_VARIADIC_MACROS) -# define CATCH_CONFIG_VARIADIC_MACROS + +#if defined(CATCH_INTERNAL_CONFIG_ANDROID_LOGWRITE) && !defined(CATCH_CONFIG_NO_ANDROID_LOGWRITE) && !defined(CATCH_CONFIG_ANDROID_LOGWRITE) +# define CATCH_CONFIG_ANDROID_LOGWRITE #endif -#if defined(CATCH_INTERNAL_CONFIG_CPP11_LONG_LONG) && !defined(CATCH_CONFIG_CPP11_NO_LONG_LONG) && !defined(CATCH_CONFIG_CPP11_LONG_LONG) && !defined(CATCH_CONFIG_NO_CPP11) -# define CATCH_CONFIG_CPP11_LONG_LONG + +#if defined(CATCH_INTERNAL_CONFIG_GLOBAL_NEXTAFTER) && !defined(CATCH_CONFIG_NO_GLOBAL_NEXTAFTER) && !defined(CATCH_CONFIG_GLOBAL_NEXTAFTER) +# define CATCH_CONFIG_GLOBAL_NEXTAFTER #endif -#if defined(CATCH_INTERNAL_CONFIG_CPP11_OVERRIDE) && !defined(CATCH_CONFIG_CPP11_NO_OVERRIDE) && !defined(CATCH_CONFIG_CPP11_OVERRIDE) && !defined(CATCH_CONFIG_NO_CPP11) -# define CATCH_CONFIG_CPP11_OVERRIDE + +// Even if we do not think the compiler has that warning, we still have +// to provide a macro that can be used by the code. +#if !defined(CATCH_INTERNAL_START_WARNINGS_SUPPRESSION) +# define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION #endif -#if defined(CATCH_INTERNAL_CONFIG_CPP11_UNIQUE_PTR) && !defined(CATCH_CONFIG_CPP11_NO_UNIQUE_PTR) && !defined(CATCH_CONFIG_CPP11_UNIQUE_PTR) && !defined(CATCH_CONFIG_NO_CPP11) -# define CATCH_CONFIG_CPP11_UNIQUE_PTR +#if !defined(CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION) +# define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION #endif -// Use of __COUNTER__ is suppressed if __JETBRAINS_IDE__ is #defined (meaning we're being parsed by a JetBrains IDE for -// analytics) because, at time of writing, __COUNTER__ is not properly handled by it. 
-// This does not affect compilation -#if defined(CATCH_INTERNAL_CONFIG_COUNTER) && !defined(CATCH_CONFIG_NO_COUNTER) && !defined(CATCH_CONFIG_COUNTER) && !defined(__JETBRAINS_IDE__) -# define CATCH_CONFIG_COUNTER +#if !defined(CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS) +# define CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS #endif -#if defined(CATCH_INTERNAL_CONFIG_CPP11_SHUFFLE) && !defined(CATCH_CONFIG_CPP11_NO_SHUFFLE) && !defined(CATCH_CONFIG_CPP11_SHUFFLE) && !defined(CATCH_CONFIG_NO_CPP11) -# define CATCH_CONFIG_CPP11_SHUFFLE +#if !defined(CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS) +# define CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS #endif -# if defined(CATCH_INTERNAL_CONFIG_CPP11_TYPE_TRAITS) && !defined(CATCH_CONFIG_CPP11_NO_TYPE_TRAITS) && !defined(CATCH_CONFIG_CPP11_TYPE_TRAITS) && !defined(CATCH_CONFIG_NO_CPP11) -# define CATCH_CONFIG_CPP11_TYPE_TRAITS -# endif -#if defined(CATCH_INTERNAL_CONFIG_WINDOWS_SEH) && !defined(CATCH_CONFIG_NO_WINDOWS_SEH) && !defined(CATCH_CONFIG_WINDOWS_SEH) -# define CATCH_CONFIG_WINDOWS_SEH +#if !defined(CATCH_INTERNAL_SUPPRESS_UNUSED_WARNINGS) +# define CATCH_INTERNAL_SUPPRESS_UNUSED_WARNINGS #endif -// This is set by default, because we assume that unix compilers are posix-signal-compatible by default. -#if !defined(CATCH_INTERNAL_CONFIG_NO_POSIX_SIGNALS) && !defined(CATCH_CONFIG_NO_POSIX_SIGNALS) && !defined(CATCH_CONFIG_POSIX_SIGNALS) -# define CATCH_CONFIG_POSIX_SIGNALS +#if !defined(CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS) +# define CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS #endif -#if !defined(CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS) -# define CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS -# define CATCH_INTERNAL_UNSUPPRESS_PARENTHESES_WARNINGS +// The goal of this macro is to avoid evaluation of the arguments, but +// still have the compiler warn on problems inside... +#if !defined(CATCH_INTERNAL_IGNORE_BUT_WARN) +# define CATCH_INTERNAL_IGNORE_BUT_WARN(...) #endif -// noexcept support: -#if defined(CATCH_CONFIG_CPP11_NOEXCEPT) && !defined(CATCH_NOEXCEPT) -# define CATCH_NOEXCEPT noexcept -# define CATCH_NOEXCEPT_IS(x) noexcept(x) -#else -# define CATCH_NOEXCEPT throw() -# define CATCH_NOEXCEPT_IS(x) +#if defined(__APPLE__) && defined(__apple_build_version__) && (__clang_major__ < 10) +# undef CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS +#elif defined(__clang__) && (__clang_major__ < 5) +# undef CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS #endif -// nullptr support -#ifdef CATCH_CONFIG_CPP11_NULLPTR -# define CATCH_NULL nullptr -#else -# define CATCH_NULL NULL +#if !defined(CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS) +# define CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS #endif -// override support -#ifdef CATCH_CONFIG_CPP11_OVERRIDE -# define CATCH_OVERRIDE override +#if defined(CATCH_CONFIG_DISABLE_EXCEPTIONS) +#define CATCH_TRY if ((true)) +#define CATCH_CATCH_ALL if ((false)) +#define CATCH_CATCH_ANON(type) if ((false)) #else -# define CATCH_OVERRIDE +#define CATCH_TRY try +#define CATCH_CATCH_ALL catch (...) 
+#define CATCH_CATCH_ANON(type) catch (type) #endif -// unique_ptr support -#ifdef CATCH_CONFIG_CPP11_UNIQUE_PTR -# define CATCH_AUTO_PTR( T ) std::unique_ptr -#else -# define CATCH_AUTO_PTR( T ) std::auto_ptr +#if defined(CATCH_INTERNAL_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR) && !defined(CATCH_CONFIG_NO_TRADITIONAL_MSVC_PREPROCESSOR) && !defined(CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR) +#define CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #endif +// end catch_compiler_capabilities.h #define INTERNAL_CATCH_UNIQUE_NAME_LINE2( name, line ) name##line #define INTERNAL_CATCH_UNIQUE_NAME_LINE( name, line ) INTERNAL_CATCH_UNIQUE_NAME_LINE2( name, line ) #ifdef CATCH_CONFIG_COUNTER @@ -371,95 +473,48 @@ # define INTERNAL_CATCH_UNIQUE_NAME( name ) INTERNAL_CATCH_UNIQUE_NAME_LINE( name, __LINE__ ) #endif -#define INTERNAL_CATCH_STRINGIFY2( expr ) #expr -#define INTERNAL_CATCH_STRINGIFY( expr ) INTERNAL_CATCH_STRINGIFY2( expr ) +#include +#include +#include -#include -#include +// We need a dummy global operator<< so we can bring it into Catch namespace later +struct Catch_global_namespace_dummy {}; +std::ostream& operator<<(std::ostream&, Catch_global_namespace_dummy); namespace Catch { - struct IConfig; - struct CaseSensitive { enum Choice { Yes, No }; }; class NonCopyable { -#ifdef CATCH_CONFIG_CPP11_GENERATED_METHODS NonCopyable( NonCopyable const& ) = delete; NonCopyable( NonCopyable && ) = delete; NonCopyable& operator = ( NonCopyable const& ) = delete; NonCopyable& operator = ( NonCopyable && ) = delete; -#else - NonCopyable( NonCopyable const& info ); - NonCopyable& operator = ( NonCopyable const& ); -#endif protected: - NonCopyable() {} + NonCopyable(); virtual ~NonCopyable(); }; - class SafeBool { - public: - typedef void (SafeBool::*type)() const; - - static type makeSafe( bool value ) { - return value ? 
&SafeBool::trueValue : 0; - } - private: - void trueValue() const {} - }; - - template - inline void deleteAll( ContainerT& container ) { - typename ContainerT::const_iterator it = container.begin(); - typename ContainerT::const_iterator itEnd = container.end(); - for(; it != itEnd; ++it ) - delete *it; - } - template - inline void deleteAllValues( AssociativeContainerT& container ) { - typename AssociativeContainerT::const_iterator it = container.begin(); - typename AssociativeContainerT::const_iterator itEnd = container.end(); - for(; it != itEnd; ++it ) - delete it->second; - } - - bool startsWith( std::string const& s, std::string const& prefix ); - bool startsWith( std::string const& s, char prefix ); - bool endsWith( std::string const& s, std::string const& suffix ); - bool endsWith( std::string const& s, char suffix ); - bool contains( std::string const& s, std::string const& infix ); - void toLowerInPlace( std::string& s ); - std::string toLower( std::string const& s ); - std::string trim( std::string const& str ); - bool replaceInPlace( std::string& str, std::string const& replaceThis, std::string const& withThis ); - - struct pluralise { - pluralise( std::size_t count, std::string const& label ); - - friend std::ostream& operator << ( std::ostream& os, pluralise const& pluraliser ); + struct SourceLineInfo { - std::size_t m_count; - std::string m_label; - }; + SourceLineInfo() = delete; + SourceLineInfo( char const* _file, std::size_t _line ) noexcept + : file( _file ), + line( _line ) + {} - struct SourceLineInfo { + SourceLineInfo( SourceLineInfo const& other ) = default; + SourceLineInfo& operator = ( SourceLineInfo const& ) = default; + SourceLineInfo( SourceLineInfo&& ) noexcept = default; + SourceLineInfo& operator = ( SourceLineInfo&& ) noexcept = default; - SourceLineInfo(); - SourceLineInfo( char const* _file, std::size_t _line ); -# ifdef CATCH_CONFIG_CPP11_GENERATED_METHODS - SourceLineInfo(SourceLineInfo const& other) = default; - SourceLineInfo( SourceLineInfo && ) = default; - SourceLineInfo& operator = ( SourceLineInfo const& ) = default; - SourceLineInfo& operator = ( SourceLineInfo && ) = default; -# endif - bool empty() const; - bool operator == ( SourceLineInfo const& other ) const; - bool operator < ( SourceLineInfo const& other ) const; + bool empty() const noexcept { return file[0] == '\0'; } + bool operator == ( SourceLineInfo const& other ) const noexcept; + bool operator < ( SourceLineInfo const& other ) const noexcept; char const* file; std::size_t line; @@ -467,24 +522,17 @@ namespace Catch { std::ostream& operator << ( std::ostream& os, SourceLineInfo const& info ); - // This is just here to avoid compiler warnings with macro constants and boolean literals - inline bool isTrue( bool value ){ return value; } - inline bool alwaysTrue() { return true; } - inline bool alwaysFalse() { return false; } - - void throwLogicError( std::string const& message, SourceLineInfo const& locationInfo ); - - void seedRng( IConfig const& config ); - unsigned int rngSeed(); + // Bring in operator<< from global namespace into Catch namespace + // This is necessary because the overload of operator<< above makes + // lookup stop at namespace Catch + using ::operator<<; // Use this in variadic streaming macros to allow // >> +StreamEndStop // as well as // >> stuff +StreamEndStop struct StreamEndStop { - std::string operator+() { - return std::string(); - } + std::string operator+() const; }; template T const& operator + ( T const& value, StreamEndStop ) { @@ -492,347 +540,812 
@@ namespace Catch { } } -#define CATCH_INTERNAL_LINEINFO ::Catch::SourceLineInfo( __FILE__, static_cast( __LINE__ ) ) -#define CATCH_INTERNAL_ERROR( msg ) ::Catch::throwLogicError( msg, CATCH_INTERNAL_LINEINFO ); +#define CATCH_INTERNAL_LINEINFO \ + ::Catch::SourceLineInfo( __FILE__, static_cast( __LINE__ ) ) +// end catch_common.h namespace Catch { - class NotImplementedException : public std::exception - { - public: - NotImplementedException( SourceLineInfo const& lineInfo ); - NotImplementedException( NotImplementedException const& ) {} - - virtual ~NotImplementedException() CATCH_NOEXCEPT {} - - virtual const char* what() const CATCH_NOEXCEPT; - - private: - std::string m_what; - SourceLineInfo m_lineInfo; + struct RegistrarForTagAliases { + RegistrarForTagAliases( char const* alias, char const* tag, SourceLineInfo const& lineInfo ); }; } // end namespace Catch -/////////////////////////////////////////////////////////////////////////////// -#define CATCH_NOT_IMPLEMENTED throw Catch::NotImplementedException( CATCH_INTERNAL_LINEINFO ) +#define CATCH_REGISTER_TAG_ALIAS( alias, spec ) \ + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ + CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ + namespace{ Catch::RegistrarForTagAliases INTERNAL_CATCH_UNIQUE_NAME( AutoRegisterTagAlias )( alias, spec, CATCH_INTERNAL_LINEINFO ); } \ + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION -// #included from: internal/catch_context.h -#define TWOBLUECUBES_CATCH_CONTEXT_H_INCLUDED +// end catch_tag_alias_autoregistrar.h +// start catch_test_registry.h -// #included from: catch_interfaces_generators.h -#define TWOBLUECUBES_CATCH_INTERFACES_GENERATORS_H_INCLUDED +// start catch_interfaces_testcase.h -#include +#include namespace Catch { - struct IGeneratorInfo { - virtual ~IGeneratorInfo(); - virtual bool moveNext() = 0; - virtual std::size_t getCurrentIndex() const = 0; + class TestSpec; + + struct ITestInvoker { + virtual void invoke () const = 0; + virtual ~ITestInvoker(); }; - struct IGeneratorsForTest { - virtual ~IGeneratorsForTest(); + class TestCase; + struct IConfig; - virtual IGeneratorInfo& getGeneratorInfo( std::string const& fileInfo, std::size_t size ) = 0; - virtual bool moveNext() = 0; + struct ITestCaseRegistry { + virtual ~ITestCaseRegistry(); + virtual std::vector const& getAllTests() const = 0; + virtual std::vector const& getAllTestsSorted( IConfig const& config ) const = 0; }; - IGeneratorsForTest* createGeneratorsForTest(); + bool isThrowSafe( TestCase const& testCase, IConfig const& config ); + bool matchTest( TestCase const& testCase, TestSpec const& testSpec, IConfig const& config ); + std::vector filterTests( std::vector const& testCases, TestSpec const& testSpec, IConfig const& config ); + std::vector const& getAllTestCasesSorted( IConfig const& config ); -} // end namespace Catch +} -// #included from: catch_ptr.hpp -#define TWOBLUECUBES_CATCH_PTR_HPP_INCLUDED +// end catch_interfaces_testcase.h +// start catch_stringref.h -#ifdef __clang__ -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wpadded" -#endif +#include +#include +#include +#include namespace Catch { - // An intrusive reference counting smart pointer. - // T must implement addRef() and release() methods - // typically implementing the IShared interface - template - class Ptr { + /// A non-owning string class (similar to the forthcoming std::string_view) + /// Note that, because a StringRef may be a substring of another string, + /// it may not be null terminated. 
+ class StringRef { public: - Ptr() : m_p( CATCH_NULL ){} - Ptr( T* p ) : m_p( p ){ - if( m_p ) - m_p->addRef(); - } - Ptr( Ptr const& other ) : m_p( other.m_p ){ - if( m_p ) - m_p->addRef(); - } - ~Ptr(){ - if( m_p ) - m_p->release(); - } - void reset() { - if( m_p ) - m_p->release(); - m_p = CATCH_NULL; - } - Ptr& operator = ( T* p ){ - Ptr temp( p ); - swap( temp ); - return *this; - } - Ptr& operator = ( Ptr const& other ){ - Ptr temp( other ); - swap( temp ); - return *this; - } - void swap( Ptr& other ) { std::swap( m_p, other.m_p ); } - T* get() const{ return m_p; } - T& operator*() const { return *m_p; } - T* operator->() const { return m_p; } - bool operator !() const { return m_p == CATCH_NULL; } - operator SafeBool::type() const { return SafeBool::makeSafe( m_p != CATCH_NULL ); } + using size_type = std::size_t; + using const_iterator = const char*; private: - T* m_p; - }; + static constexpr char const* const s_empty = ""; - struct IShared : NonCopyable { - virtual ~IShared(); - virtual void addRef() const = 0; - virtual void release() const = 0; - }; + char const* m_start = s_empty; + size_type m_size = 0; + + public: // construction + constexpr StringRef() noexcept = default; - template - struct SharedImpl : T { + StringRef( char const* rawChars ) noexcept; - SharedImpl() : m_rc( 0 ){} + constexpr StringRef( char const* rawChars, size_type size ) noexcept + : m_start( rawChars ), + m_size( size ) + {} + + StringRef( std::string const& stdString ) noexcept + : m_start( stdString.c_str() ), + m_size( stdString.size() ) + {} - virtual void addRef() const { - ++m_rc; + explicit operator std::string() const { + return std::string(m_start, m_size); } - virtual void release() const { - if( --m_rc == 0 ) - delete this; + + public: // operators + auto operator == ( StringRef const& other ) const noexcept -> bool; + auto operator != (StringRef const& other) const noexcept -> bool { + return !(*this == other); } - mutable unsigned int m_rc; - }; + auto operator[] ( size_type index ) const noexcept -> char { + assert(index < m_size); + return m_start[index]; + } -} // end namespace Catch + public: // named queries + constexpr auto empty() const noexcept -> bool { + return m_size == 0; + } + constexpr auto size() const noexcept -> size_type { + return m_size; + } -#ifdef __clang__ -#pragma clang diagnostic pop -#endif + // Returns the current start pointer. If the StringRef is not + // null-terminated, throws std::domain_exception + auto c_str() const -> char const*; -namespace Catch { + public: // substrings and searches + // Returns a substring of [start, start + length). + // If start + length > size(), then the substring is [start, size()). + // If start > size(), then the substring is empty. + auto substr( size_type start, size_type length ) const noexcept -> StringRef; - class TestCase; - class Stream; - struct IResultCapture; - struct IRunner; - struct IGeneratorsForTest; - struct IConfig; + // Returns the current start pointer. May not be null-terminated. 
+ auto data() const noexcept -> char const*; - struct IContext - { - virtual ~IContext(); + constexpr auto isNullTerminated() const noexcept -> bool { + return m_start[m_size] == '\0'; + } - virtual IResultCapture* getResultCapture() = 0; - virtual IRunner* getRunner() = 0; - virtual size_t getGeneratorIndex( std::string const& fileInfo, size_t totalSize ) = 0; - virtual bool advanceGeneratorsForCurrentTest() = 0; - virtual Ptr getConfig() const = 0; + public: // iterators + constexpr const_iterator begin() const { return m_start; } + constexpr const_iterator end() const { return m_start + m_size; } }; - struct IMutableContext : IContext - { - virtual ~IMutableContext(); - virtual void setResultCapture( IResultCapture* resultCapture ) = 0; - virtual void setRunner( IRunner* runner ) = 0; - virtual void setConfig( Ptr const& config ) = 0; - }; + auto operator += ( std::string& lhs, StringRef const& sr ) -> std::string&; + auto operator << ( std::ostream& os, StringRef const& sr ) -> std::ostream&; - IContext& getCurrentContext(); - IMutableContext& getCurrentMutableContext(); - void cleanUpContext(); - Stream createStream( std::string const& streamName ); + constexpr auto operator "" _sr( char const* rawChars, std::size_t size ) noexcept -> StringRef { + return StringRef( rawChars, size ); + } +} // namespace Catch +constexpr auto operator "" _catch_sr( char const* rawChars, std::size_t size ) noexcept -> Catch::StringRef { + return Catch::StringRef( rawChars, size ); } -// #included from: internal/catch_test_registry.hpp -#define TWOBLUECUBES_CATCH_TEST_REGISTRY_HPP_INCLUDED +// end catch_stringref.h +// start catch_preprocessor.hpp -// #included from: catch_interfaces_testcase.h -#define TWOBLUECUBES_CATCH_INTERFACES_TESTCASE_H_INCLUDED -#include +#define CATCH_RECURSION_LEVEL0(...) __VA_ARGS__ +#define CATCH_RECURSION_LEVEL1(...) CATCH_RECURSION_LEVEL0(CATCH_RECURSION_LEVEL0(CATCH_RECURSION_LEVEL0(__VA_ARGS__))) +#define CATCH_RECURSION_LEVEL2(...) CATCH_RECURSION_LEVEL1(CATCH_RECURSION_LEVEL1(CATCH_RECURSION_LEVEL1(__VA_ARGS__))) +#define CATCH_RECURSION_LEVEL3(...) CATCH_RECURSION_LEVEL2(CATCH_RECURSION_LEVEL2(CATCH_RECURSION_LEVEL2(__VA_ARGS__))) +#define CATCH_RECURSION_LEVEL4(...) CATCH_RECURSION_LEVEL3(CATCH_RECURSION_LEVEL3(CATCH_RECURSION_LEVEL3(__VA_ARGS__))) +#define CATCH_RECURSION_LEVEL5(...) CATCH_RECURSION_LEVEL4(CATCH_RECURSION_LEVEL4(CATCH_RECURSION_LEVEL4(__VA_ARGS__))) -namespace Catch { +#ifdef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR +#define INTERNAL_CATCH_EXPAND_VARGS(...) __VA_ARGS__ +// MSVC needs more evaluations +#define CATCH_RECURSION_LEVEL6(...) CATCH_RECURSION_LEVEL5(CATCH_RECURSION_LEVEL5(CATCH_RECURSION_LEVEL5(__VA_ARGS__))) +#define CATCH_RECURSE(...) CATCH_RECURSION_LEVEL6(CATCH_RECURSION_LEVEL6(__VA_ARGS__)) +#else +#define CATCH_RECURSE(...) CATCH_RECURSION_LEVEL5(__VA_ARGS__) +#endif - class TestSpec; +#define CATCH_REC_END(...) +#define CATCH_REC_OUT + +#define CATCH_EMPTY() +#define CATCH_DEFER(id) id CATCH_EMPTY() + +#define CATCH_REC_GET_END2() 0, CATCH_REC_END +#define CATCH_REC_GET_END1(...) CATCH_REC_GET_END2 +#define CATCH_REC_GET_END(...) CATCH_REC_GET_END1 +#define CATCH_REC_NEXT0(test, next, ...) next CATCH_REC_OUT +#define CATCH_REC_NEXT1(test, next) CATCH_DEFER ( CATCH_REC_NEXT0 ) ( test, next, 0) +#define CATCH_REC_NEXT(test, next) CATCH_REC_NEXT1(CATCH_REC_GET_END test, next) + +#define CATCH_REC_LIST0(f, x, peek, ...) 
, f(x) CATCH_DEFER ( CATCH_REC_NEXT(peek, CATCH_REC_LIST1) ) ( f, peek, __VA_ARGS__ ) +#define CATCH_REC_LIST1(f, x, peek, ...) , f(x) CATCH_DEFER ( CATCH_REC_NEXT(peek, CATCH_REC_LIST0) ) ( f, peek, __VA_ARGS__ ) +#define CATCH_REC_LIST2(f, x, peek, ...) f(x) CATCH_DEFER ( CATCH_REC_NEXT(peek, CATCH_REC_LIST1) ) ( f, peek, __VA_ARGS__ ) + +#define CATCH_REC_LIST0_UD(f, userdata, x, peek, ...) , f(userdata, x) CATCH_DEFER ( CATCH_REC_NEXT(peek, CATCH_REC_LIST1_UD) ) ( f, userdata, peek, __VA_ARGS__ ) +#define CATCH_REC_LIST1_UD(f, userdata, x, peek, ...) , f(userdata, x) CATCH_DEFER ( CATCH_REC_NEXT(peek, CATCH_REC_LIST0_UD) ) ( f, userdata, peek, __VA_ARGS__ ) +#define CATCH_REC_LIST2_UD(f, userdata, x, peek, ...) f(userdata, x) CATCH_DEFER ( CATCH_REC_NEXT(peek, CATCH_REC_LIST1_UD) ) ( f, userdata, peek, __VA_ARGS__ ) + +// Applies the function macro `f` to each of the remaining parameters, inserts commas between the results, +// and passes userdata as the first parameter to each invocation, +// e.g. CATCH_REC_LIST_UD(f, x, a, b, c) evaluates to f(x, a), f(x, b), f(x, c) +#define CATCH_REC_LIST_UD(f, userdata, ...) CATCH_RECURSE(CATCH_REC_LIST2_UD(f, userdata, __VA_ARGS__, ()()(), ()()(), ()()(), 0)) + +#define CATCH_REC_LIST(f, ...) CATCH_RECURSE(CATCH_REC_LIST2(f, __VA_ARGS__, ()()(), ()()(), ()()(), 0)) + +#define INTERNAL_CATCH_EXPAND1(param) INTERNAL_CATCH_EXPAND2(param) +#define INTERNAL_CATCH_EXPAND2(...) INTERNAL_CATCH_NO## __VA_ARGS__ +#define INTERNAL_CATCH_DEF(...) INTERNAL_CATCH_DEF __VA_ARGS__ +#define INTERNAL_CATCH_NOINTERNAL_CATCH_DEF +#define INTERNAL_CATCH_STRINGIZE(...) INTERNAL_CATCH_STRINGIZE2(__VA_ARGS__) +#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR +#define INTERNAL_CATCH_STRINGIZE2(...) #__VA_ARGS__ +#define INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS(param) INTERNAL_CATCH_STRINGIZE(INTERNAL_CATCH_REMOVE_PARENS(param)) +#else +// MSVC is adding extra space and needs another indirection to expand INTERNAL_CATCH_NOINTERNAL_CATCH_DEF +#define INTERNAL_CATCH_STRINGIZE2(...) INTERNAL_CATCH_STRINGIZE3(__VA_ARGS__) +#define INTERNAL_CATCH_STRINGIZE3(...) #__VA_ARGS__ +#define INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS(param) (INTERNAL_CATCH_STRINGIZE(INTERNAL_CATCH_REMOVE_PARENS(param)) + 1) +#endif - struct ITestCase : IShared { - virtual void invoke () const = 0; - protected: - virtual ~ITestCase(); - }; +#define INTERNAL_CATCH_MAKE_NAMESPACE2(...) ns_##__VA_ARGS__ +#define INTERNAL_CATCH_MAKE_NAMESPACE(name) INTERNAL_CATCH_MAKE_NAMESPACE2(name) - class TestCase; - struct IConfig; +#define INTERNAL_CATCH_REMOVE_PARENS(...) INTERNAL_CATCH_EXPAND1(INTERNAL_CATCH_DEF __VA_ARGS__) - struct ITestCaseRegistry { - virtual ~ITestCaseRegistry(); - virtual std::vector const& getAllTests() const = 0; - virtual std::vector const& getAllTestsSorted( IConfig const& config ) const = 0; +#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR +#define INTERNAL_CATCH_MAKE_TYPE_LIST2(...) decltype(get_wrapper()) +#define INTERNAL_CATCH_MAKE_TYPE_LIST(...) INTERNAL_CATCH_MAKE_TYPE_LIST2(INTERNAL_CATCH_REMOVE_PARENS(__VA_ARGS__)) +#else +#define INTERNAL_CATCH_MAKE_TYPE_LIST2(...) INTERNAL_CATCH_EXPAND_VARGS(decltype(get_wrapper())) +#define INTERNAL_CATCH_MAKE_TYPE_LIST(...) 
INTERNAL_CATCH_EXPAND_VARGS(INTERNAL_CATCH_MAKE_TYPE_LIST2(INTERNAL_CATCH_REMOVE_PARENS(__VA_ARGS__))) +#endif + +#define INTERNAL_CATCH_MAKE_TYPE_LISTS_FROM_TYPES(...)\ + CATCH_REC_LIST(INTERNAL_CATCH_MAKE_TYPE_LIST,__VA_ARGS__) + +#define INTERNAL_CATCH_REMOVE_PARENS_1_ARG(_0) INTERNAL_CATCH_REMOVE_PARENS(_0) +#define INTERNAL_CATCH_REMOVE_PARENS_2_ARG(_0, _1) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_1_ARG(_1) +#define INTERNAL_CATCH_REMOVE_PARENS_3_ARG(_0, _1, _2) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_2_ARG(_1, _2) +#define INTERNAL_CATCH_REMOVE_PARENS_4_ARG(_0, _1, _2, _3) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_3_ARG(_1, _2, _3) +#define INTERNAL_CATCH_REMOVE_PARENS_5_ARG(_0, _1, _2, _3, _4) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_4_ARG(_1, _2, _3, _4) +#define INTERNAL_CATCH_REMOVE_PARENS_6_ARG(_0, _1, _2, _3, _4, _5) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_5_ARG(_1, _2, _3, _4, _5) +#define INTERNAL_CATCH_REMOVE_PARENS_7_ARG(_0, _1, _2, _3, _4, _5, _6) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_6_ARG(_1, _2, _3, _4, _5, _6) +#define INTERNAL_CATCH_REMOVE_PARENS_8_ARG(_0, _1, _2, _3, _4, _5, _6, _7) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_7_ARG(_1, _2, _3, _4, _5, _6, _7) +#define INTERNAL_CATCH_REMOVE_PARENS_9_ARG(_0, _1, _2, _3, _4, _5, _6, _7, _8) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_8_ARG(_1, _2, _3, _4, _5, _6, _7, _8) +#define INTERNAL_CATCH_REMOVE_PARENS_10_ARG(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_9_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9) +#define INTERNAL_CATCH_REMOVE_PARENS_11_ARG(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_10_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10) + +#define INTERNAL_CATCH_VA_NARGS_IMPL(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, N, ...) N + +#define INTERNAL_CATCH_TYPE_GEN\ + template struct TypeList {};\ + template\ + constexpr auto get_wrapper() noexcept -> TypeList { return {}; }\ + template class...> struct TemplateTypeList{};\ + template class...Cs>\ + constexpr auto get_wrapper() noexcept -> TemplateTypeList { return {}; }\ + template\ + struct append;\ + template\ + struct rewrap;\ + template class, typename...>\ + struct create;\ + template class, typename>\ + struct convert;\ + \ + template \ + struct append { using type = T; };\ + template< template class L1, typename...E1, template class L2, typename...E2, typename...Rest>\ + struct append, L2, Rest...> { using type = typename append, Rest...>::type; };\ + template< template class L1, typename...E1, typename...Rest>\ + struct append, TypeList, Rest...> { using type = L1; };\ + \ + template< template class Container, template class List, typename...elems>\ + struct rewrap, List> { using type = TypeList>; };\ + template< template class Container, template class List, class...Elems, typename...Elements>\ + struct rewrap, List, Elements...> { using type = typename append>, typename rewrap, Elements...>::type>::type; };\ + \ + template
+    vg:zoomFactor  (rdf:type  owl:DatatypeProperty)
+        rdfs:comment  "The zoom factor of pangenome, which is defined as bin width, generally."  (xsd:string)
+        rdfs:domain   vg:ZoomLevel
+        rdfs:label    "zoomFactor"  (xsd:string)
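The entry above documents the new vg:zoomFactor datatype property, whose domain is vg:ZoomLevel and whose comment defines it as the bin width of a pangenome zoom level. As a purely illustrative sketch (not part of this diff), a zoom level annotated with this property might look as follows in Turtle; the instance IRI, the numeric value, and its xsd:integer datatype are assumptions made for the example, and the vg: namespace IRI is the one conventionally used by vg's RDF output:

```
@prefix vg:  <http://biohackathon.org/resource/vg#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

# Hypothetical zoom level resource; only vg:zoomFactor and its domain
# vg:ZoomLevel come from the ontology entry above. The IRI, value, and
# datatype below are illustrative assumptions.
<http://example.org/pangenome/zoom/1000> a vg:ZoomLevel ;
    vg:zoomFactor "1000"^^xsd:integer .
```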